def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    word_vocabs = [
        model.vocab[w] for w in sentence
        if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32
    ]
    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indices = [
            word2.index for pos2, word2 in window_pos
            if (word2 is not None and pos2 != pos)
        ]
        l1 = word2vec.np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        word2vec.train_cbow_pair(model, word, word2_indices, l1, alpha)
    return len(word_vocabs)

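# Illustrative sketch (not part of the routines in this file; the helper name below is
# hypothetical): the `sample_int > model.random.rand() * 2**32` test used throughout
# implements frequent-word downsampling. `sample_int` holds a word's keep-probability
# scaled to the 32-bit integer range, so comparing it against a uniform draw from
# [0, 2**32) keeps the word with exactly that probability.
import numpy as np

def keep_word(sample_int, rng=np.random):
    """Return True if a word with this precomputed `sample_int` survives downsampling."""
    return sample_int > rng.rand() * 2**32
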
def train_batch_labeled_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        document, target = sentence
        word_vocabs = [
            model.wv.vocab[w] for w in document
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        target_vocabs = [model.lvocab[t] for t in target if t in model.lvocab]
        for target in target_vocabs:
            word2_indices = [w.index for w in word_vocabs]
            l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            if model.softmax:
                train_cbow_pair_softmax(model, target, word2_indices, l1, alpha)
            else:
                train_cbow_pair(model, target, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result

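# Illustrative sketch (an assumption about the expected input format, not taken from the
# code above): each item consumed by train_batch_labeled_cbow is a (document, target)
# pair, where `document` is a list of word tokens looked up in `model.wv.vocab` and
# `target` is a list of label tokens looked up in `model.lvocab`.
labeled_batch = [
    (["the", "movie", "was", "great"], ["positive"]),
    (["terrible", "plot", "and", "acting"], ["negative"]),
]
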
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
                       and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)  # train on the sliding window for target word
        result += len(word_vocabs)
    return result

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update CBOW model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

    Notes
    -----
    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from :mod:`gensim.models.fasttext_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        Model instance.
    sentences : iterable of list of str
        Iterable of the sentences.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`, optional
        UNUSED.
    neu1 : :class:`numpy.ndarray`, optional
        UNUSED.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
                       and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                ngrams_subwords_indices.extend(model.wv.buckets_word[index])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result

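# Illustrative sketch (not gensim code; array names are hypothetical stand-ins): how the
# FastText CBOW projection layer above is formed. Each context word contributes its
# whole-word vector plus the vectors of all its hashed character-ngram buckets; with
# `cbow_mean` set, the sum is divided by the total number of contributing vectors.
import numpy as np

vector_size = 4
syn0_vocab = np.random.rand(10, vector_size)   # one row per vocabulary word
syn0_ngrams = np.random.rand(20, vector_size)  # one row per ngram bucket

vocab_indices = [2, 5]          # indices of the context words
ngram_indices = [1, 3, 7, 11]   # buckets of those words' character ngrams
l1 = syn0_vocab[vocab_indices].sum(axis=0) + syn0_ngrams[ngram_indices].sum(axis=0)
l1 /= len(vocab_indices) + len(ngram_indices)  # the cbow_mean == 1 case
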
def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
    """
    Update distributed memory model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
    lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
    lbl_len = len(lbl_indices)
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start:pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + lbl_len)
        neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
        if train_lbls:
            model.syn0[lbl_indices] += neu1e

    return len([word for word in sentence if word is not None])

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [
                word2.index for pos2, word2 in window_pos
                if (word2 is not None and pos2 != pos)
            ]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)  # train on the sliding window for target word
        result += len(word_vocabs)
    return result

def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field. See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab
                   and model.vocab[w].sample_int > model.random.randint(2**32)]
    doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
    doctag_len = len(doctag_indexes)

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum  # 1 x layer1_size
        if word2_indexes and model.cbow_mean:
            l1 /= (len(word2_indexes) + doctag_len)
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if word2_indexes and not model.cbow_mean:
            neu1e /= (len(word2_indexes) + doctag_len)
        if learn_doctags:
            doctag_vectors[doctag_indexes] += neu1e * \
                np_repeat(doctag_locks[doctag_indexes], model.vector_size).reshape(-1, model.vector_size)
        if learn_words:
            word_vectors[word2_indexes] += neu1e * \
                np_repeat(word_locks[word2_indexes], model.vector_size).reshape(-1, model.vector_size)

    return len(word_vocabs)

def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field. See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab
                   and model.vocab[w].sample_int > model.random.rand() * 2**32]

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]

        l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
        count = len(word2_indexes) + len(doctag_indexes)
        if model.cbow_mean and count > 1:
            l1 /= count

        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if not model.cbow_mean and count > 1:
            neu1e /= count
        if learn_doctags:
            for i in doctag_indexes:
                doctag_vectors[i] += neu1e * doctag_locks[i]
        if learn_words:
            for i in word2_indexes:
                word_vectors[i] += neu1e * word_locks[i]

    return len(word_vocabs)

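# Illustrative sketch (not gensim code; all names below are hypothetical stand-ins): the
# PV-DM projection layer built above is the (optionally averaged) sum of the context-word
# vectors and the document-tag vectors, and the error vector returned by train_cbow_pair
# is added back to every contributing row, scaled by that row's "lock" factor
# (0.0 freezes a vector, 1.0 lets it learn).
import numpy as np

vector_size = 8
word_vectors = np.random.rand(100, vector_size)
doctag_vectors = np.random.rand(10, vector_size)
doctag_locks = np.ones(10)

word2_indexes, doctag_indexes = [3, 7, 42], [0]
l1 = word_vectors[word2_indexes].sum(axis=0) + doctag_vectors[doctag_indexes].sum(axis=0)
l1 /= len(word2_indexes) + len(doctag_indexes)  # the cbow_mean case

neu1e = np.random.rand(vector_size) * 1e-3      # stand-in for the backpropagated error
for i in doctag_indexes:
    doctag_vectors[i] += neu1e * doctag_locks[i]
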
def train_batch_labeled_cbow(model, tagged_docs, alpha, work=None, neu1=None):
    result = 0
    for tagged_doc in tagged_docs:
        document, target = tagged_doc
        word_vocabs = [model.wv.vocab[w] for w in document if w in model.wv.vocab
                       and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        target_vocabs = [model.lvocab[t] for t in target if t in model.lvocab]
        for target in target_vocabs:
            word2_indices = [w.index for w in word_vocabs]
            l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            if model.softmax:
                train_cbow_pair_softmax(model, target, word2_indices, l1, alpha)
            else:
                train_cbow_pair(model, target, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result

def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
    """
    Update distributed memory model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
    lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
    lbl_len = len(lbl_indices)
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start:pos + model.window + 1 - reduced_window], start)
        word2_indices = [
            word2.index for pos2, word2 in window_pos
            if (word2 is not None and pos2 != pos)
        ]
        l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + lbl_len)
        neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
        if train_lbls:
            model.syn0[lbl_indices] += neu1e

    return len([word for word in sentence if word is not None])

def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                             learn_doctags=True, learn_words=True, learn_hidden=True,
                             word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document,
    using a concatenation of the context window word vectors (rather than a sum or average).

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab
                   and model.vocab[w].sample_int > model.random.rand() * 2**32]
    doctag_len = len(doctag_indexes)
    if doctag_len != model.dm_tag_count:
        return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

    null_word = model.vocab['\0']
    pre_pad_count = model.window
    post_pad_count = model.window
    padded_document_indexes = (
        (pre_pad_count * [null_word.index])  # pre-padding
        + [word.index for word in word_vocabs if word is not None]  # elide out-of-Vocabulary words
        + (post_pad_count * [null_word.index])  # post-padding
    )

    for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
        word_context_indexes = (
            padded_document_indexes[(pos - pre_pad_count):pos]  # preceding words
            + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
        )
        word_context_len = len(word_context_indexes)
        predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
        # numpy advanced-indexing copies; concatenate, flatten to 1d
        l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
        neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                learn_hidden=learn_hidden, learn_vectors=False)

        # filter by locks and shape for addition to source vectors
        e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
        neu1e_r = (neu1e.reshape(-1, model.vector_size)
                   * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

        if learn_doctags:
            np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
        if learn_words:
            np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

    return len(padded_document_indexes) - pre_pad_count - post_pad_count

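# Illustrative sketch (not gensim code): in the concatenated PV-DM variant above, the
# input layer is the doctag vector(s) followed by exactly 2 * window context-word
# vectors, flattened into one long row, so its length is
# (dm_tag_count + 2 * window) * vector_size.
import numpy as np

vector_size, window, dm_tag_count = 50, 3, 1
doctag_vectors = np.random.rand(dm_tag_count, vector_size)
context_vectors = np.random.rand(2 * window, vector_size)

l1 = np.concatenate((doctag_vectors, context_vectors)).ravel()
assert l1.shape == ((dm_tag_count + 2 * window) * vector_size,)
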
def train_batch(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
    """Update the model by training on a sequence of sentences, combining a CBOW update
    that predicts each sentence's first token from the remaining tokens with skip-gram
    updates between that first token and each remaining token.

    Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler, Gensim
    will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.word2vec.Word2Vec`
        The Word2Vec model instance to train.
    sentences : iterable of list of str
        The corpus used to train the model.
    alpha : float
        The learning rate.
    work : object, optional
        Unused.
    neu1 : object, optional
        Unused.
    compute_loss : bool, optional
        Whether or not the training loss should be computed in this batch.

    Returns
    -------
    int
        Number of words in the vocabulary actually used for training (that already existed
        in the vocabulary and were not discarded by downsampling).

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        if not word_vocabs:
            continue  # nothing left after vocabulary lookup and downsampling
        word = word_vocabs[0]
        start = 1
        window_pos = enumerate(word_vocabs[start:], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None)]
        l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
        for word2idx in word2_indices:
            train_sg_pair(model, model.wv.index2word[word.index], word2idx, alpha, compute_loss=compute_loss)
            train_sg_pair(model, model.wv.index2word[word2idx], word.index, alpha, compute_loss=compute_loss)
        result += len(word_vocabs)
    return result

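# Minimal sketch (an assumption, not gensim's actual implementation) of what the
# `train_cbow_pair` helper called throughout this file roughly does in the
# negative-sampling case: score the projection `l1` against the true target word and a
# few noise words, turn the prediction errors into gradients, update the output weights,
# and return the error vector so the caller can push it back into the input vectors.
import numpy as np

def expit(x):
    """Logistic sigmoid."""
    return 1.0 / (1.0 + np.exp(-x))

def cbow_pair_neg_sketch(l1, target_index, noise_indices, syn1neg, alpha):
    indices = [target_index] + list(noise_indices)
    labels = np.zeros(len(indices))
    labels[0] = 1.0                         # true word = 1, noise words = 0
    l2 = syn1neg[indices]                   # (k+1) x vector_size output rows
    f = expit(l2.dot(l1))                   # predicted probabilities
    g = (labels - f) * alpha                # error scaled by the learning rate
    syn1neg[indices] += np.outer(g, l1)     # learn hidden -> output weights
    return g.dot(l2)                        # error to propagate back to the input vectors
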