def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
    """Infer a vector for a document not seen during bulk training.

    Parameters
    ----------
    doc_words : list of str
        The document, given as a list of word tokens.
    alpha : float
        Initial learning rate.
    min_alpha : float
        Learning rate shrinks toward this value as the steps progress.
    steps : int
        Number of training passes over the document.

    Returns
    -------
    numpy.ndarray
        The inferred document vector.
    """
    # One fresh doctag slot, deterministically seeded from the token stream.
    doctag_vectors = empty((1, self.vector_size), dtype=REAL)
    doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
    doctag_locks = ones(1, dtype=REAL)
    doctag_indexes = [0]

    work = zeros(self.layer1_size, dtype=REAL)
    if not self.sg:
        # Only the DM modes need the aligned accumulation buffer.
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    for step in range(steps):
        # Dispatch to the training routine matching the model's mode;
        # word vectors and hidden weights stay frozen during inference.
        if self.sg:
            train_document_dbow(
                self, doc_words, doctag_indexes, alpha, work,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        elif self.dm_concat:
            train_document_dm_concat(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        else:
            train_document_dm(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        # Decay the learning rate toward min_alpha for the next pass.
        alpha = ((alpha - min_alpha) / (steps - step)) + min_alpha

    return doctag_vectors[0]
def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
    """Infer a vector for a document not seen during bulk training.

    Parameters
    ----------
    doc_words : :obj: `list` of :obj: `str`
        The document, given as a list of word tokens.
    alpha : float
        Initial learning rate.
    min_alpha : float
        Learning rate drops linearly toward this value over the steps.
    steps : int
        Number of training passes over the document.

    Returns
    -------
    :obj: `numpy.ndarray`
        The inferred document vector.
    """
    # Fresh, single-slot doctag state prepared by the trainables helper.
    doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(
        doc_words, self.docvecs.vector_size)
    doctag_indexes = [0]

    work = zeros(self.trainables.layer1_size, dtype=REAL)
    if not self.sg:
        # Only the DM modes need the aligned accumulation buffer.
        neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)

    for step in range(steps):
        # Pick the training routine for the model's mode; word vectors
        # and hidden weights stay frozen during inference.
        if self.sg:
            train_document_dbow(
                self, doc_words, doctag_indexes, alpha, work,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        elif self.dm_concat:
            train_document_dm_concat(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        else:
            train_document_dm(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        # Decay the learning rate toward min_alpha for the next pass.
        alpha = ((alpha - min_alpha) / (steps - step)) + min_alpha

    return doctag_vectors[0]
def _do_train_job(self, job, alpha, inits):
    """Train on one batch (`job`) of documents at learning rate `alpha`.

    `inits` supplies the pre-allocated (work, neu1) scratch buffers.
    Returns a tuple: (tally of effective trained words, raw word count
    of the whole job).
    """
    work, neu1 = inits
    tally = 0
    for document in job:
        # Resolve this document's tags to trainable doctag state.
        indexed_doctags = self.docvecs.indexed_doctags(document.tags)
        doctag_indexes, doctag_vectors, doctag_locks, _ = indexed_doctags

        # Train with the routine matching the configured model mode.
        if self.sg:
            tally += train_document_dbow(
                self, document.words, doctag_indexes, alpha, work,
                train_words=self.dbow_words,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        elif self.dm_concat:
            tally += train_document_dm_concat(
                self, document.words, doctag_indexes, alpha, work, neu1,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        else:
            tally += train_document_dm(
                self, document.words, doctag_indexes, alpha, work, neu1,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )

        # Let the docvecs bookkeeping record the completed item.
        self.docvecs.trained_item(indexed_doctags)

    return tally, self._raw_word_count(job)
def _do_train_job(self, job, alpha, inits):
    """Train on one batch (`job`) of documents at learning rate `alpha`.

    `inits` supplies the pre-allocated (work, neu1) scratch buffers.
    Returns a tuple: (tally of effective trained words, raw word count
    of the whole job).
    """
    work, neu1 = inits
    tally = 0
    for document in job:
        # Resolve tags to indexes; vectors and locks are the shared
        # model-wide arrays rather than per-document copies.
        doctag_indexes = self.vocabulary.indexed_doctags(document.tags, self.docvecs)
        doctag_vectors = self.docvecs.vectors_docs
        doctag_locks = self.trainables.vectors_docs_lockf

        # Train with the routine matching the configured model mode.
        if self.sg:
            tally += train_document_dbow(
                self, document.words, doctag_indexes, alpha, work,
                train_words=self.dbow_words,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        elif self.dm_concat:
            tally += train_document_dm_concat(
                self, document.words, doctag_indexes, alpha, work, neu1,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        else:
            tally += train_document_dm(
                self, document.words, doctag_indexes, alpha, work, neu1,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )

    return tally, self._raw_word_count(job)