def fit(self, X, y=None):
    """
    Train a fresh author-topic model on X and keep it on the instance.

    A new ``models.AuthorTopicModel`` is constructed from the corpus X and
    the hyperparameters stored on ``self``; it replaces whatever was
    previously held in ``self.gensim_model``. The ``y`` argument is accepted
    but never used by this method.

    Returns ``self``.
    """
    # Every setting is forwarded unchanged from the attribute of the same name.
    settings = dict(
        num_topics=self.num_topics,
        id2word=self.id2word,
        author2doc=self.author2doc,
        doc2author=self.doc2author,
        chunksize=self.chunksize,
        passes=self.passes,
        iterations=self.iterations,
        decay=self.decay,
        offset=self.offset,
        alpha=self.alpha,
        eta=self.eta,
        update_every=self.update_every,
        eval_every=self.eval_every,
        gamma_threshold=self.gamma_threshold,
        serialized=self.serialized,
        serialization_path=self.serialization_path,
        minimum_probability=self.minimum_probability,
        random_state=self.random_state,
    )
    self.gensim_model = models.AuthorTopicModel(corpus=X, **settings)
    return self
    def partial_fit(self, X, author2doc=None, doc2author=None):
        """
        Incrementally train the wrapped model on X.

        When no model exists yet, one is first constructed from X using the
        author mappings and hyperparameters stored on the instance. In every
        case the model's ``update`` is then called with X and the
        ``author2doc`` / ``doc2author`` mappings passed to this method.

        Returns ``self``.
        """
        model = self.gensim_model
        if model is None:
            # First call: build the model from the instance-level configuration.
            settings = dict(
                num_topics=self.num_topics,
                id2word=self.id2word,
                author2doc=self.author2doc,
                doc2author=self.doc2author,
                chunksize=self.chunksize,
                passes=self.passes,
                iterations=self.iterations,
                decay=self.decay,
                offset=self.offset,
                alpha=self.alpha,
                eta=self.eta,
                update_every=self.update_every,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                serialized=self.serialized,
                serialization_path=self.serialization_path,
                minimum_probability=self.minimum_probability,
                random_state=self.random_state,
            )
            model = models.AuthorTopicModel(corpus=X, **settings)
            self.gensim_model = model

        # The update uses the mappings given to this call, not the ones on self.
        model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
        return self
Пример #3
0
    def fit(self, X, y=None):
        """Train a new author-topic model on the given corpus.

        Any model previously stored in ``self.gensim_model`` is replaced.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        y : object, optional
            Accepted but not used by this method.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model (``self``).

        """
        # Every setting is forwarded unchanged from the attribute of the same name.
        settings = dict(
            num_topics=self.num_topics,
            id2word=self.id2word,
            author2doc=self.author2doc,
            doc2author=self.doc2author,
            chunksize=self.chunksize,
            passes=self.passes,
            iterations=self.iterations,
            decay=self.decay,
            offset=self.offset,
            alpha=self.alpha,
            eta=self.eta,
            update_every=self.update_every,
            eval_every=self.eval_every,
            gamma_threshold=self.gamma_threshold,
            serialized=self.serialized,
            serialization_path=self.serialization_path,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state,
        )
        self.gensim_model = models.AuthorTopicModel(corpus=X, **settings)
        return self
Пример #4
0
    def partial_fit(self, X, author2doc=None, doc2author=None):
        """Train the model on a (possibly partial) batch of documents.

        Two situations are handled:

        * No model exists yet: one is constructed from `X` with the author
          mappings and hyperparameters stored on the instance, and is then
          updated with `X` as described below.
        * A model already exists: it is **updated** with `X`.

        In both situations the update receives the `author2doc` and
        `doc2author` arguments of this call.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        author2doc : dict of (str, list of int), optional
            Maps an author's name to the list of document IDs the author has
            contributed to.
        doc2author : dict of (int, list of str), optional
            Maps a document (using its ID) to the list of author names that
            contributed to it.

        Notes
        -----
        Either `author2doc` or `doc2author` is expected to be supplied; this
        method does not check that itself and simply forwards both values.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model (``self``).

        """
        model = self.gensim_model
        if model is None:
            # First call: build the model from the instance-level configuration.
            settings = dict(
                num_topics=self.num_topics,
                id2word=self.id2word,
                author2doc=self.author2doc,
                doc2author=self.doc2author,
                chunksize=self.chunksize,
                passes=self.passes,
                iterations=self.iterations,
                decay=self.decay,
                offset=self.offset,
                alpha=self.alpha,
                eta=self.eta,
                update_every=self.update_every,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                serialized=self.serialized,
                serialization_path=self.serialization_path,
                minimum_probability=self.minimum_probability,
                random_state=self.random_state,
            )
            model = models.AuthorTopicModel(corpus=X, **settings)
            self.gensim_model = model

        # The update uses the mappings given to this call, not the ones on self.
        model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
        return self
Пример #5
0
    db = [(k, v) for k, v in db if len(v['authors']) >= 1]
    return db

# Run every split through filter_db with the same vocab and users, rebinding
# each name to the filtered result.
train_db = filter_db(train_db, vocab, users)
valid_db = filter_db(valid_db, vocab, users)
test_db = filter_db(test_db, vocab, users)

def db_to_corpus(db):
    """Convert a document database into inputs for an author-topic model.

    Parameters
    ----------
    db : iterable of (key, record)
        Each ``record`` must provide ``record['authors']`` (an iterable of
        hashable author identifiers) and ``record['cleaned']`` (one document,
        handed to ``corpora.Dictionary`` -- presumably a list of tokens;
        confirm against the code that builds `db`). The key is ignored.
        `db` is traversed exactly once, so a one-shot iterable is accepted.

    Returns
    -------
    corpus : list
        ``doc2bow`` output for every document, in `db` order.
    author2doc : dict
        Maps each author to the list of positions (0-based, in `db` order)
        of the documents that list that author.
    id2word : dict
        Maps token id to token for the dictionary built from the documents.
        Empty when the documents contain no tokens.
    """
    author2doc = {}
    docs = []
    for doc_idx, (_, record) in enumerate(db):
        docs.append(record['cleaned'])
        for author in record['authors']:
            author2doc.setdefault(author, []).append(doc_idx)

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Invert token2id explicitly instead of touching dictionary[0] to trigger
    # the lazy id2token build: same mapping, but no KeyError when the
    # vocabulary is empty and no reliance on a lookup side effect.
    id2word = {token_id: token for token, token_id in dictionary.token2id.items()}

    return corpus, author2doc, id2word

# Turn the training split into a BoW corpus plus the author -> document-index
# and token-id -> token mappings.
train_corpus, author2doc, id2word = db_to_corpus(train_db)

# Build the author-topic model from the training split only.
lda = models.AuthorTopicModel(
    corpus=train_corpus,
    num_topics=N_TOPICS,
    author2doc=author2doc,
    id2word=id2word,
)