def fit(self, X, y=None):
    """Fit the wrapped :class:`gensim.models.AuthorTopicModel` on `X`.

    Builds a fresh model from the hyper-parameters stored on this
    transformer; any previously fitted model is discarded.
    `y` is ignored (present for scikit-learn API compatibility).
    """
    # Collect every configured hyper-parameter in one place so the
    # constructor call below stays readable.
    model_kwargs = dict(
        corpus=X,
        num_topics=self.num_topics,
        id2word=self.id2word,
        author2doc=self.author2doc,
        doc2author=self.doc2author,
        chunksize=self.chunksize,
        passes=self.passes,
        iterations=self.iterations,
        decay=self.decay,
        offset=self.offset,
        alpha=self.alpha,
        eta=self.eta,
        update_every=self.update_every,
        eval_every=self.eval_every,
        gamma_threshold=self.gamma_threshold,
        serialized=self.serialized,
        serialization_path=self.serialization_path,
        minimum_probability=self.minimum_probability,
        random_state=self.random_state,
    )
    self.gensim_model = models.AuthorTopicModel(**model_kwargs)
    return self
def partial_fit(self, X, author2doc=None, doc2author=None):
    """Train the model incrementally on a batch of documents `X`.

    If no model has been fitted yet, a new one is initialized from the
    stored hyper-parameters; in either case `X` is then folded in via a
    single `update` call.
    """
    if self.gensim_model is None:
        # Initialize WITHOUT a corpus: passing corpus=X here would train on
        # X during construction and then again in update() below, running
        # the first batch through training twice.
        self.gensim_model = models.AuthorTopicModel(
            num_topics=self.num_topics, id2word=self.id2word,
            author2doc=self.author2doc, doc2author=self.doc2author,
            chunksize=self.chunksize, passes=self.passes,
            iterations=self.iterations, decay=self.decay, offset=self.offset,
            alpha=self.alpha, eta=self.eta, update_every=self.update_every,
            eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
            serialized=self.serialized,
            serialization_path=self.serialization_path,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state,
        )

    self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
    return self
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : iterable of list of (int, number)
        Sequence of documents in BoW format.

    Returns
    -------
    :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
        The trained model.
    """
    # Every constructor argument mirrors an attribute of the same name,
    # so gather them generically rather than spelling each one out.
    param_names = (
        'num_topics', 'id2word', 'author2doc', 'doc2author', 'chunksize',
        'passes', 'iterations', 'decay', 'offset', 'alpha', 'eta',
        'update_every', 'eval_every', 'gamma_threshold', 'serialized',
        'serialization_path', 'minimum_probability', 'random_state',
    )
    params = {name: getattr(self, name) for name in param_names}
    self.gensim_model = models.AuthorTopicModel(corpus=X, **params)
    return self
def partial_fit(self, X, author2doc=None, doc2author=None):
    """Train model over a potentially incomplete set of documents.

    This method can be used in two ways:
        * On an unfitted model in which case the model is initialized and trained on `X`.
        * On an already fitted model in which case the model is **updated** by `X`.

    Parameters
    ----------
    X : iterable of list of (int, number)
        Sequence of documents in BoW format.
    author2doc : dict of (str, list of int), optional
        Maps an author's name to a list of document IDs where he has contributed.
        Either `author2doc` or `doc2author` **must be supplied**.
    doc2author : dict of (int, list of str)
        Maps a document (using its ID) to a list of author names that contributed to it.
        Either `author2doc` or `doc2author` **must be supplied**.

    Returns
    -------
    :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
        The trained model.
    """
    if self.gensim_model is None:
        # Initialize WITHOUT a corpus: supplying corpus=X to the constructor
        # would already train on X, and the update() call below would then
        # train on the same batch a second time.
        self.gensim_model = models.AuthorTopicModel(
            num_topics=self.num_topics, id2word=self.id2word,
            author2doc=self.author2doc, doc2author=self.doc2author,
            chunksize=self.chunksize, passes=self.passes,
            iterations=self.iterations, decay=self.decay, offset=self.offset,
            alpha=self.alpha, eta=self.eta, update_every=self.update_every,
            eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
            serialized=self.serialized,
            serialization_path=self.serialization_path,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state,
        )

    self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
    return self
db = [(k, v) for k, v in db if len(v['authors']) >= 1] return db train_db = filter_db(train_db, vocab, users) valid_db = filter_db(valid_db, vocab, users) test_db = filter_db(test_db, vocab, users) def db_to_corpus(db): author2doc = {} for i, (k, v) in enumerate(db): for a in v['authors']: if not a in author2doc: author2doc[a] = [] author2doc[a].append(i) docs = [v['cleaned'] for k, v in db] dic = corpora.Dictionary(docs) _ = dic[0] # ugly corpus = [dic.doc2bow(doc) for doc in docs] return corpus, author2doc, dic.id2token train_corpus, author2doc, id2word = db_to_corpus(train_db) lda = models.AuthorTopicModel( train_corpus, num_topics=N_TOPICS, author2doc=author2doc, id2word=id2word, )