Example #1
    def __init__(
        self,
        sequences: List[List[int]],
        num_words: int,
        window_size: int = 5,
        batch_size: int = 32,
    ):
        super().__init__(sequences, window_size, batch_size)

        self._num_words = num_words
        self._sampling_table = sequence.make_sampling_table(size=num_words)
Example #2
def create_dataset(text, vocab, num_words, window_size, negative_samples):
    data = vocab.texts_to_sequences([text]).pop()
    sampling_table = make_sampling_table(num_words)
    couples, labels = skipgrams(data, num_words,
                                window_size=window_size,
                                negative_samples=negative_samples,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.reshape(word_target, (-1, 1))
    word_context = np.reshape(word_context, (-1, 1))
    labels = np.asfarray(labels)
    return [word_target, word_context], labels
Example #3
def training_data_generator(text_encoded,
                            window_size=4,
                            negative_samples=1.0,
                            batch_docs=50):
    """
    For given encoded text, return 3 np.array:
    words, contexts, labels
    Do not pair the w and its context cross different documents.

    input:
        text_encoded: list of list of int, each list of int is the numerical encoding of the doc
        window_size: int, define the context
        negative_samples: float, how much negative sampling you need, normally 1.0
        batch_docs: int, number of docs for which it generates one return

    return:
        words: list of int, the numerical encoding of the central words
        contexts: list of int, the numerical encoding of the context words
        labels: list of int, 1 or 0

    hint:
    1. You can use skipgrams method from keras
    2. For training purpose, words and contexts needs to be 2D array, with shape (N, 1),
       but labels is 1D array, with shape (N, )
    3. The output can be very big, you SHOULD using generator
    """
    """
    Write your code here
    """
    sampling_table = make_sampling_table(VOCAB_SIZE)
    loc = list(range(len(text_encoded)))
    random.shuffle(loc)

    for j in loc[:batch_docs]:
        couples, label = skipgrams(text_encoded[j],
                                   VOCAB_SIZE,
                                   window_size=window_size,
                                   sampling_table=sampling_table,
                                   negative_samples=negative_samples,
                                   shuffle=True)

        if len(couples) > 0:
            target, context_ = zip(*couples)
            target = np.array(target, dtype="int32")
            context_ = np.array(context_, dtype="int32")

            yield target.tolist(), context_.tolist(), label

        else:
            continue
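A minimal sketch of how this generator could be consumed for training, assuming a compiled two-input skip-gram model named `model` (a hypothetical name, not part of the example) and the same `text_encoded` input; the reshapes follow hint 2 in the docstring:

import numpy as np

# Hypothetical training loop; `model` is assumed to be a compiled two-input
# Keras skip-gram model like the ones built in the later examples.
for words, contexts, labels in training_data_generator(text_encoded,
                                                       window_size=4,
                                                       negative_samples=1.0,
                                                       batch_docs=50):
    word_target = np.reshape(words, (-1, 1))      # shape (N, 1), per hint 2
    word_context = np.reshape(contexts, (-1, 1))  # shape (N, 1)
    y = np.asarray(labels, dtype="float32")       # shape (N,)
    model.train_on_batch([word_target, word_context], y)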
Example #4
File: w2v_tf.py  Project: shindohikaru87/k9
def downsample_skipgrams(ids, vocab_size, subsample=1e-3, window=2, neg=2):
    w = []
    y = []
    # word2vec-style subsampling probabilities, indexed by word rank
    sampling_table = make_sampling_table(vocab_size, sampling_factor=subsample)
    # keep only targets that sit at the centre of consecutive,
    # non-overlapping windows of length 2 * window + 1
    span = 2 * window + 1
    targets = ids[window::span]
    # draw a random window size in [1, window - 1] for this call
    pairs, labels = skipgrams(ids,
                              vocabulary_size=vocab_size,
                              window_size=np.random.randint(window - 1) + 1,
                              negative_samples=neg,
                              sampling_table=sampling_table,
                              shuffle=True)
    # keep a pair only when its target word is one of the span centres
    for (t, c), l in zip(pairs, labels):
        if t in targets:
            w.append([t, c])
            y.append(l)
    return w, y
Example #5
class Word2VecDataGenerator(DataGenerator):
    def __init__(self, config, language):
        self.language = language
        super(Word2VecDataGenerator, self).__init__(config)

    def _generate(self, case=None):

        vocabulary_size = self.config["params"]["vocab_size"]
        window_size = self.config["params"]["word2vec"]["window_size"]
        batch_size = self.config["hyper_params"]["word2vec"]["batch_size"]
        sampling_table = sequence.make_sampling_table(vocabulary_size, 0.1)
        while True:
            pairs = []
            labels = []
            indexes = np.arange(len(self.sentences))
            np.random.shuffle(indexes)
            for i in indexes:
                p, l = sequence.skipgrams(self.sentences[i],
                                          vocabulary_size,
                                          window_size=window_size,
                                          sampling_table=sampling_table)
                pairs.extend(p)
                labels.extend(l)
                if len(pairs) >= batch_size:

                    output_pairs = pairs[:batch_size]
                    output_labels = labels[:batch_size]

                    pairs = pairs[batch_size:]
                    labels = labels[batch_size:]
                    output_targets, output_contexts = zip(*output_pairs)

                    output_targets = np.array(output_targets, dtype=np.int32)
                    output_contexts = np.array(output_contexts, dtype=np.int32)

                    output_labels = np.array(output_labels, dtype=np.float32)

                    yield (output_targets, output_contexts), output_labels
Example #6
    def create_skipgrams(cls,
                         normalized_doc,
                         vocabulary_size=10000,
                         ratio=3.0):
        '''
        Create the skipgrams to be trained on the model

        normalized_doc: Normalized document nested array of mapped sentences.
        vocabulary_size: Size of the given vocabulary data has been compiled against.
        ratio: Negative to Positive sampling ratio.
        '''
        # Used for generating the sampling_table argument for skipgrams. sampling_table[i] is the
        # probability of sampling the i-th most common word in the dataset (more common words
        # should be sampled less frequently, for balance).
        sampling_table = sequence.make_sampling_table(vocabulary_size)

        # Create Skipgrams with Keras
        # This function transforms a sequence of word indexes (list of integers) into tuples of
        # words of the form:
        #   * (word, word in the same window), with label 1 (positive samples).
        #   * (word, random word from the vocabulary), with label 0 (negative samples).
        # Flatten normalized document
        data = list(itertools.chain.from_iterable(normalized_doc))
        couples, labels = skipgrams(data,
                                    vocabulary_size,
                                    negative_samples=ratio,
                                    sampling_table=sampling_table)

        # Split couples into target and context
        word_target, word_context = zip(*couples)

        # Convert to NumPy arrays (note: these are rank-1 arrays)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        return word_target, word_context, labels
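The comments above spell out the (word, context) / label format that skipgrams returns: pairs from the same window get label 1, pairs with a random vocabulary word get label 0. A minimal, self-contained sketch of that output (the toy sequence and vocabulary size are invented for illustration; the subsampling table is deliberately left out, see the comment):

from tensorflow.keras.preprocessing.sequence import skipgrams

data = [1, 2, 3, 4, 5]    # toy encoded "document"
vocabulary_size = 10      # toy value for illustration only

# sampling_table is omitted here on purpose: for low word indexes (the most
# frequent ranks) the word2vec subsampling probabilities are tiny, so a toy
# sequence like this one would yield almost no pairs at all.
couples, labels = skipgrams(data, vocabulary_size,
                            window_size=2, negative_samples=1.0)

for (word, context), label in zip(couples, labels):
    # label == 1: context occurred within the window around word
    # label == 0: context is a random index drawn from the vocabulary
    print(word, context, label)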
Example #7
vector_dim = 128    # dimensionality of the embedding vector
epochs = 10 ** 6    # one million epochs
negative_samples = 5 

#validation constants
valid_size = 16
valid_window = 100
#valid_examples = np.random.choice(valid_window, valid_size, replace=False)
#['WAT', 'SWKS', 'PAYX', 'CSCO', 'RF', 'NKE', 'INTC', 'ALL', 'GS', 'AVGO', 'HUM', 'NEE', 'T', 'CL', 'DVA', 'AMGN']
custom_examples = ['CSCO', 'NKE', 'INTC', 'GS', 'T', 'TSLA', 'AAPL', 'PAYX']
valid_examples = [tick_to_idx[tick] for tick in custom_examples]
#picks 16 of the first 100 words for validation
#may need to replace this with some pure play companies like TSLA compared to Ford

#sampling table for negative examples
sampling_table = sequence.make_sampling_table(vocab_size + 1)  #+1 due to index zero being skipped

#create skipgrams for each ETF
targets, contexts, labels = [], [], []
for etf in etf_names:
    tokens = np.array([tick_to_idx[tick] for tick in df.loc[df['ETF'] == etf, 'Ticker'].values]) 
    etf_couples, etf_labels = skipgrams(
                        tokens,
                        vocab_size,
                        window_size = window_size,
                        negative_samples = negative_samples,
                        sampling_table = sampling_table)
    etf_targets, etf_contexts = zip(*etf_couples)   #separate into target and contexts by etf
    targets.append(np.array(etf_targets))
    contexts.append(np.array(etf_contexts))
    labels.append(np.array(etf_labels))
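The lists above hold one array per ETF, so pairs never cross ETF boundaries. If the goal is to train on all ETFs at once, one possible next step (a sketch, not necessarily what the original script does) is to stack them into the (N, 1) target/context and (N,) label shapes used elsewhere in these examples:

import numpy as np

targets_arr = np.concatenate(targets).reshape(-1, 1).astype("int32")
contexts_arr = np.concatenate(contexts).reshape(-1, 1).astype("int32")
labels_arr = np.concatenate(labels).astype("float32")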
Example #8
    def word2vec(self, WINDOW_SZ, EMBEDDING_DIM, W2V_EPOCHS):
        print("WORD2VEC...")

        valid_size = 20  #random word set to evaluate similarity
        valid_window = 500  #pick samples in 500 most common words
        valid_examples = np.random.choice(valid_window,
                                          valid_size,
                                          replace=False)

        vocab_size = self.vocab_size

        ## skipgram set up
        sampling_table = sequence.make_sampling_table(vocab_size,
                                                      sampling_factor=0.01)

        skipgrams = [
            sequence.skipgrams(tweet,
                               vocab_size,
                               window_size=WINDOW_SZ,
                               sampling_table=sampling_table)
            for tweet in self.x_train
        ]

        couples, labels = skipgrams[0][0], skipgrams[0][1]
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        ## Functional API model

        #input layers take in target and context word as ints
        input_target = layers.Input((1, ))
        input_context = layers.Input((1, ))

        #embedding layer then transpose vectors to take dot prod
        embedding = Embedding(vocab_size,
                              EMBEDDING_DIM,
                              input_length=1,
                              name='embedding')

        target = embedding(input_target)
        target = Reshape((EMBEDDING_DIM, 1))(target)
        context = embedding(input_context)
        context = Reshape((EMBEDDING_DIM, 1))(context)

        #dot product layers to measure similarity
        dot_product = Dot(axes=1)([target, context])
        dot_product = Reshape((1, ))(dot_product)
        #sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)

        model = Model(inputs=[input_target, input_context], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

        #cosine similarity to be used in validation model
        similarity = Dot(axes=1, normalize=True)([target, context])
        validation_model = Model(inputs=[input_target, input_context],
                                 outputs=similarity)

        reversed_word_index = self.reversed_word_index

        ## Helper class for validating Word2Vec while training
        class SimilarityCallback:
            def run_sim(self):
                for i in range(valid_size):
                    valid_word = reversed_word_index[valid_examples[i]]
                    top_k = 8  #num of nearest neighbors
                    sim = self._get_sim(valid_examples[i])
                    nearest = (-sim).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reversed_word_index[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

            @staticmethod
            def _get_sim(valid_word_idx):
                sim = np.zeros((vocab_size, ))
                in_arr1 = np.zeros((1, ))
                in_arr2 = np.zeros((1, ))
                for i in range(vocab_size):
                    in_arr1[0, ] = valid_word_idx
                    in_arr2[0, ] = i
                    out = validation_model.predict_on_batch([in_arr1, in_arr2])
                    sim[i] = out
                return sim

        sim_cb = SimilarityCallback()

        arr_1 = np.zeros((1, ))
        arr_2 = np.zeros((1, ))
        arr_3 = np.zeros((1, ))
        ## Train network
        for cnt in range(W2V_EPOCHS):
            idx = np.random.randint(0, len(labels) - 1)
            arr_1[0, ] = word_target[idx]
            arr_2[0, ] = word_context[idx]
            arr_3[0, ] = labels[idx]
            loss = model.train_on_batch([arr_1, arr_2], arr_3)
            # Every 100 epochs print loss
            if cnt % 100 == 0:
                print("Iteration {}, loss={}".format(cnt, loss))
            # Every 500 epochs run similarity test on validation data
            if cnt % 500 == 0:
                sim_cb.run_sim()
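A side note on the design: _get_sim above calls predict_on_batch once per vocabulary word, which becomes slow for larger vocabularies. A hedged alternative sketch (reusing the validation_model and vocab_size from the example, not a change the original author made) scores the whole vocabulary in a single batch:

import numpy as np

def get_sim_batched(valid_word_idx, validation_model, vocab_size):
    # Compare the validation word against every vocabulary index at once.
    target_ids = np.full((vocab_size, 1), valid_word_idx, dtype="int32")
    context_ids = np.arange(vocab_size, dtype="int32").reshape(-1, 1)
    sim = validation_model.predict([target_ids, context_ids], verbose=0)
    # The cosine-similarity output holds one value per vocabulary index.
    return sim.reshape(-1)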
Example #9
tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(num_words=vocab_size,
                                                          oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
words = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = tensorflow.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=120, truncating="post")

word_target_final = []
word_context_final = []
couples_final = []
labels_final = []

sampling_table = sequence.make_sampling_table(vocab_size)

for i in range(1, int(len(padded) / 100)):
    couples, labels = sequence.skipgrams(padded[i], vocab_size, window_size=2,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    labels_final.append(labels)
    word_target_final.append(word_target)
    word_context_final.append(word_context)

input_target = tensorflow.keras.layers.Input((1, ))
input_context = tensorflow.keras.layers.Input((1, ))

embedding = tensorflow.keras.layers.Embedding(vocab_size,
                                              vector_dim,
                                              input_length=1,
                                              name='embedding')