def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Train the skip-gram model on a batch of sentences.

    Every sentence is a list of string tokens that are looked up in
    ``model.wv.vocab``. Called internally from
    :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker. Not used by this Python version.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker. Not used by this Python version.

    Returns
    -------
    int
        Effective number of words trained, i.e. the number of in-vocabulary
        tokens that survived down-sampling, summed over all sentences.

    """
    trained_words = 0
    for sentence in sentences:
        # Keep only in-vocabulary tokens that pass the random down-sampling draw.
        # A random number is drawn only for tokens that are in the vocabulary.
        kept = []
        for token in sentence:
            if token not in model.wv.vocab:
                continue
            entry = model.wv.vocab[token]
            if entry.sample_int > model.random.rand() * 2**32:
                kept.append(entry)

        for center_pos, center in enumerate(kept):
            shrink = model.random.randint(model.window)  # `b` in the original word2vec code
            first = max(0, center_pos - model.window + shrink)

            # Input side of every pair: the word's own index followed by the
            # indices of all of its character n-grams.
            center_token = model.wv.index2word[center.index]
            input_indices = [center.index]
            input_indices.extend(model.wv.ngrams[ngram] for ngram in model.wv.ngrams_word[center_token])

            # Walk the (reduced) window and predict each neighbour in turn.
            last = center_pos + model.window + 1 - shrink
            for ctx_pos, ctx in enumerate(kept[first:last], first):
                if ctx_pos == center_pos:
                    continue  # don't train on the centre word itself
                train_sg_pair(model, model.wv.index2word[ctx.index], input_indices, alpha, is_ft=True)

        trained_words += len(kept)
    return trained_words
def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
                        train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
                        word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """Update distributed bag of words model ("PV-DBOW") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is given as `doc_words`, a list of word tokens, and
    `doctag_indexes`, which provide indexes into the doctag_vectors array.

    When both `train_words` and `learn_words` are true, word-to-word skip-gram
    examples are trained first by passing the document to `train_batch_sg`.
    Afterwards every (doctag, word) combination is trained via `train_sg_pair`.

    `learn_doctags` and `learn_hidden` are forwarded to `train_sg_pair` as
    `learn_vectors` and `learn_hidden`; setting them to False is meant to
    prevent learning-updates to the respective weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    `doctag_vectors` and `doctag_locks` default to `model.docvecs.doctag_syn0`
    and `model.docvecs.doctag_syn0_lockf`. `word_vectors` and `word_locks` are
    accepted but not used by this implementation.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from doc2vec_inner instead.

    Returns the number of tokens in `doc_words`.

    """
    # Fall back to the model's own doctag arrays only when the caller gave none.
    doctag_vectors = model.docvecs.doctag_syn0 if doctag_vectors is None else doctag_vectors
    doctag_locks = model.docvecs.doctag_syn0_lockf if doctag_locks is None else doctag_locks

    if train_words and learn_words:
        train_batch_sg(model, [doc_words], alpha, work)

    # Options shared by every doctag/word pair update.
    pair_options = {
        'learn_vectors': learn_doctags,
        'learn_hidden': learn_hidden,
        'context_vectors': doctag_vectors,
        'context_locks': doctag_locks,
    }
    for tag_index in doctag_indexes:
        for token in doc_words:
            train_sg_pair(model, token, tag_index, alpha, **pair_options)

    return len(doc_words)
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update the skip-gram model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the
    model's vocab dictionary. Called internally from
    :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker. Ignored by this Python version.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker. Ignored by this Python version.

    Returns
    -------
    int
        Effective number of words trained: the count of tokens, over all
        sentences, that are in the vocabulary and were not down-sampled away.

    """
    def survives_sampling(token):
        # Out-of-vocabulary tokens are rejected before any random number is drawn.
        return token in model.wv.vocab and model.wv.vocab[token].sample_int > model.random.rand() * 2**32

    total = 0
    for sentence in sentences:
        sampled = [model.wv.vocab[token] for token in sentence if survives_sampling(token)]

        for pos, target in enumerate(sampled):
            # `b` in the original word2vec code: shrinks the window on both sides.
            reduction = model.random.randint(model.window)
            lo = max(0, pos - model.window + reduction)

            # The target word's index plus the index of each of its n-grams.
            target_word = model.wv.index2word[target.index]
            subword_ids = [target.index] + [model.wv.ngrams[gram] for gram in model.wv.ngrams_word[target_word]]

            hi = pos + model.window + 1 - reduction
            for offset, neighbour in enumerate(sampled[lo:hi]):
                if lo + offset != pos:  # the target word is never paired with itself
                    train_sg_pair(model, model.wv.index2word[neighbour.index], subword_ids, alpha, is_ft=True)

        total += len(sampled)
    return total
def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
                        train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
                        word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """Train the distributed bag of words model ("PV-DBOW") on one document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    `doc_words` is the document's list of word tokens and `doctag_indexes`
    holds indexes into the doctag_vectors array.

    If `train_words` and `learn_words` are both true, the document is first
    handed to `train_batch_sg` for word-to-word skip-gram training. Each word
    is then paired with each doctag index and trained through `train_sg_pair`.

    `learn_doctags` (passed on as `learn_vectors`) and `learn_hidden` may be
    set False with the intent of preventing learning-updates to those
    weights, as if using the (partially-)frozen model to infer other
    compatible vectors.

    When `doctag_vectors` or `doctag_locks` is None, the corresponding array
    is taken from `model.docvecs` (`doctag_syn0` / `doctag_syn0_lockf`).
    `word_vectors` and `word_locks` are part of the signature but are not
    read here.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from doc2vec_inner instead.

    Returns `len(doc_words)`.

    """
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0

    def update_pair(token, tag_index):
        # One doc-to-word update against the doctag vectors.
        train_sg_pair(
            model, token, tag_index, alpha,
            learn_vectors=learn_doctags, learn_hidden=learn_hidden,
            context_vectors=doctag_vectors, context_locks=doctag_locks,
        )

    also_train_words = train_words and learn_words
    if also_train_words:
        train_batch_sg(model, [doc_words], alpha, work)

    for tag_index in doctag_indexes:
        for token in doc_words:
            update_pair(token, tag_index)

    return len(doc_words)