def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #2
    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, \
                    'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
            # Make a temporary copy of the file where the corpus is serialized.
            copyfile(self.serialization_path, self.serialization_path + '.tmp')
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            # Re-serialize the old corpus, and extend it with the new corpus.
            MmCorpus.serialize(self.serialization_path, corpus_chain)
            self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)
    def test_apply(self):

        transformed_vtcorp = self.transformer._apply(self.vtcorp)

        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
        text_data_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[0])
        text_obj_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[2])

        MmCorpus.serialize(text_data_name, transformed_vtcorp)
        transformed_vtcorp.save(text_obj_name)

        self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

        self.temporary_files.extend([ os.path.join(self.data_root,
                                                   self.loader.layout.corpus_dir,
                                                   transformed_name)
                                      for transformed_name in transformed_names])

        transformed_vtcorp = TransformedCorpus.load(text_obj_name)

        self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
        self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        print('Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary))
        self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
def createcorpus(bg_corpus, output_dictionary, output_serialize):
    # Generate a training/background corpus from your own source of documents,
    # saving the dictionary and corpus in Matrix Market format
    print("Creating corpus and dictionary")
    background_corpus = TextCorpus(input=bg_corpus)
    background_corpus.dictionary.save(output_dictionary)
    MmCorpus.serialize(output_serialize, background_corpus)
    return background_corpus, background_corpus.dictionary
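
# A minimal usage sketch for the helper above, with hypothetical file names;
# TextCorpus accepts a path to a plain-text file (or a file-like object).
corpus, dictionary = createcorpus('background_docs.txt',
                                  'background.dict',
                                  'background_corpus.mm')
print(len(dictionary))  # vocabulary size collected while streaming the documents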
Example #5
 def _create_bow_representation(self):
     """Create bag-of-words representation of collection, and save it 
        in Matrix Matrix format to disk."""
     
     print('Create bag-of-words matrix representation.')
     self.bow_corpus = [self.dictionary.doc2bow(article) 
                        for article in self.articles]
     MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)
Example #6
def load_experts():
    """
    load expert data and save to file
    """
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    """
    save expert-to-document map to pickle
    """
    pickle.dump(expert2doc, open('expert2doc_new_test.p', 'wb'))
Example #7
 def _create_tfidf_matrix(self):
     """Create TF-IDF matrix and save it in Matrix Matrix format to 
        disk"""
     
     print('Create TF-IDF matrix of collection.')
     tfidf = TfidfModel(self.bow_corpus, 
                        id2word=self.dictionary, 
                        normalize=True)
     MmCorpus.serialize(self.tfidf_filepath, 
                        tfidf[self.bow_corpus])
     print('Number of documents:', tfidf.num_docs)
Example #8
    def init_empty_corpus(self):
        """
        Initialize an empty corpus. If the corpora are to be treated as lists, simply
        initialize an empty list. If serialization is used, initialize an empty corpus
        of the class `gensim.corpora.MmCorpus`.

        """
        if self.serialized:
            # Initialize the corpus as a serialized empty list.
            # This corpus will be extended in self.update.
            MmCorpus.serialize(self.serialization_path, [])  # Serialize empty corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store serialized corpus object in self.corpus.
        else:
            # All input corpora are assumed to just be lists.
            self.corpus = []
Example #9
def pretrain():
    """pre train the text corpus and build the dictionary"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    mm = MmCorpus(mm_corpus_file)
    print(mm)
def build_pyLDAvis_output(corp_loc, dict_loc, lda_loc):
    if 'model' not in lda_loc:
        lda_loc += '.model'

    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)
    lda = models.LdaModel.load(lda_loc)
    
    vis_data = gensim_vis.prepare(lda, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(vis_data, lda_loc.split('.')[0] + '.html')
Example #11
    def _run_model(self):
        id2word_wiki = Dictionary.load(self.wiki_dict_file)
        mm_corpus = MmCorpus(self.mm_corpus_file)

        #to be removed
        #mm_corpus = ClippedCorpus(mm_corpus, 4000)

        tfidf_model = TfidfModel(mm_corpus, id2word=id2word_wiki)

        corpus = tfidf_model[mm_corpus]
        MmCorpus.serialize(self.wiki_tfidf_file, corpus)

        self.model = LsiModel(corpus,
                              num_topics=self.config.num_topics,
                              id2word=id2word_wiki,
                              chunksize=self.config.chunksize)

        MmCorpus.serialize(self.wiki_lsi_file, self.model[corpus])
        self.model.save(self.model_file)
Example #12
    def save_corpus(self):
        assert self.corpus, 'corpus is not in memory'
        assert self.run, 'run id is missing'
        self.dictionary.save(self.prefix +
                             self.corpus.dictionary.__class__.__name__)

        MmCorpus.serialize(fname=self.prefix + 'corpus', corpus=self.corpus)

        self.tfidf_vectorizer.save(self.prefix +
                                   self.tfidf_vectorizer.__class__.__name__)
        self.corpus_tfidf = self.tfidf_vectorizer[self.corpus]
        MmCorpus.serialize(fname=self.prefix + 'corpus_tfidf',
                           corpus=self.corpus_tfidf)

        with open(self.prefix + 'db_index', 'wb') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['db_index'])
            for i in self.index:
                writer.writerow([i])
Example #13
    def __init__(self, textcolname):

        self._flogger()
        self.corpus = MmCorpus(PConstant.CORPUS_DIR_PATH.value + textcolname +
                               '_corpus.mm')
        self.dictionary = Dictionary.load(PConstant.DICTIONARY_DIR_PATH.value +
                                          textcolname + '_dictionary.dict')
        self.lda = models.LdaModel.load(PConstant.LDA_DIR_PATH.value +
                                        textcolname + '_lda.model')
        self.stopwords = StopWord.EnglishStopWord().stopwords()
Example #14
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a particular topic modeling task
    it is a lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
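
# A hypothetical usage sketch for the function above; it assumes a writable
# resources/ directory, since the helper caches its dictionary and corpus there.
preprocessed_tokens = [['pizza', 'deal', 'coupon'],
                       ['pizza', 'coupon', 'discount'],
                       ['spa', 'deal', 'discount']]
topics = get_topics_lda(preprocessed_tokens, n_topics=2)
for topic_no, topic in enumerate(topics):
    print(topic_no, topic[:3])  # a few (word, score) pairs per topic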
Example #15
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Read in the corpus from within the archive file
    fin = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(fin)

    # filter out some of the more common words,
    # and some of the less-common ones as well
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus
    fout = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(fout, rc)

    # Save the dictionary to file as text
    fout = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(fout)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.5,
                                    keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #17
def train_lda_model(token_tweets):
    print('Start LDA model training ...\n')    
    # Build dictionary
    tweets_dict = corpora.Dictionary(token_tweets)
    # Remove words that occur less than 10 documents, 
    # or more than 50% of the doc
    tweets_dict.filter_extremes(no_below=10, no_above=0.5)
    # Transform doc to a vectorized form by computing frequency of each word
    bow_corpus = [tweets_dict.doc2bow(doc) for doc in token_tweets]
    # Save corpus and dictionary to file
    MmCorpus.serialize(CORPUS_FILE, bow_corpus)
    tweets_dict.save(DICT_FILE)
    
    # Create tf-idf model and then apply transformation to the entire corpus
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]
    
    # Train LDA model
    lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, 
                                         num_topics=NUM_TOPICS, 
                                         id2word=tweets_dict, 
                                         passes=NUM_PASSES, 
                                         alpha=ALPHA, 
                                         eta=ETA,
                                         random_state=49)
    # Save LDA model to file
    lda_model.save(LDA_MODEL_FILE)
    print('LDA model saved\n')
    
    # Save all generated topics to a file
    msg = ''
    for idx, topic in lda_model.print_topics(-1):
        msg += 'Topic: {} \nWords: {}\n'.format(idx, topic)    
    save_print_to_file(LDA_TOPICS_FILE, msg)
    
    # Evaluate LDA model performance
    eval_lda(lda_model, tfidf_corpus, tweets_dict, token_tweets)
    # Visualize topics
    vis_topics(lda_model, tfidf_corpus, tweets_dict)
        
    return lda_model
def build_LDA_model(corp_loc, dict_loc, num_topics, num_pass, lda_loc):
    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)

    lda = gensim.models.LdaMulticore(corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=int(num_topics),
                                     alpha='asymmetric',
                                     passes=int(num_pass))
    lda.save(lda_loc + '.model')

    build_pyLDAvis_output(corp_loc, dict_loc, lda_loc)
Example #19
    def __init__(self,
                 docs,
                 model,
                 persist=False,
                 path="./saved_models/lsi_corpus.crp"):

        self.path = path
        self.persist = persist
        self.model = model

        if not os.path.exists(self.path) and self.persist:

            print("creating model repr.")
            corpus = model[docs]
            print("saving model repr to disk.")
            MmCorpus.serialize(self.path, corpus)

        if not self.persist:
            self.corpus = docs
        else:
            self.corpus = MmCorpus(self.path)
 def mapper_init(self):
     '''
     Load required files and models here.
     '''
     # load prerequisite document vectors and paired dataset
     self.dictionary = Dictionary.load("reviews_dictionary.dict")
     self.corpus = MmCorpus("reviews_corpus.mm")
     self.df = pd.read_csv("user_rest_pair.csv", sep="|")
     # initialize lsi space
     self.lsi = models.LsiModel(self.corpus,
                                id2word=self.dictionary,
                                num_topics=15)
Example #21
 def display_data(self):
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     # Persist the prepared visualisation data, then display the in-memory object
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
         # json.dump(LDAvis_prepared.to_json(), f)
     pyLDAvis.display(LDAvis_prepared)
Example #22
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
Example #23
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #24
def create_tfidf_corpus(corpus_file, dict_file, outputs_dir):

    # Load back the id->word mapping directly from file
    # This seems to save more memory, compared to keeping the
    # wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(dict_file)

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(corpus_file)

    tfidf_model_file = os.path.join(outputs_dir, "wikipedia.tfidf_model")
    tfidf_corpus_file = os.path.join(outputs_dir, "wikipedia_tfidf.mm")

    # build TF-IDF, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(tfidf_model_file)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(tfidf_corpus_file, tfidf[mm], progress_cnt=10000)

    return tfidf_model_file, tfidf_corpus_file
Example #25
def create_corpus(dump_file, outputs_dir, max_batch=None):

    # Takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(dump_file, max_batch=max_batch)

    # Only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=3,
                                    no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)

    # Save dictionary and bag-of-words (term-document frequency matrix).
    # Another ~9h

    corpus_file = os.path.join(outputs_dir, "wikipedia_bow.mm")
    dict_file = os.path.join(outputs_dir, "wikipedia_wordids.txt.bz2")
    titles_files = os.path.join(outputs_dir, "wikipedia_titles")

    MmCorpus.serialize(corpus_file, corpus=wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(dict_file)
    wiki.save_titles(titles_files)

    return corpus_file, dict_file, titles_files
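
# A hedged sketch of how the two helpers above might be chained into one
# pipeline; the dump path and output directory are hypothetical.
dump_path = 'enwiki-latest-pages-articles.xml.bz2'  # hypothetical local dump file
outputs_dir = './outputs'  # hypothetical, assumed to exist
corpus_file, dict_file, titles_files = create_corpus(dump_path, outputs_dir)
tfidf_model_file, tfidf_corpus_file = create_tfidf_corpus(corpus_file, dict_file,
                                                          outputs_dir)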
Example #26
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #28
def visualise(model_file, corpus_file, dictionary_file):
    # use Notebook version if not working

    print('Loading corpus from ' + corpus_file)
    corpus = MmCorpus(corpus_file)
    print('Loading dictionary from ' + dictionary_file)
    dictionary = Dictionary.load(dictionary_file)
    print('Loading model from ' + model_file)
    model = models.ldamulticore.LdaMulticore.load(model_file)

    vis_data = gensimvis.prepare(model, corpus, dictionary)
    pyLDAvis.display(vis_data)
    print('Please use Jupyter notebook visualise.ipynb if not working')
    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus,
                                        coherence="u_mass",
                                        logger="visdom",
                                        title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary,
                              num_topics=2,
                              passes=10,
                              callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading  corpus and dictionary files from %s and %s" %
          (data_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    #mm = gensim.corpora.MmCorpus(corpus_path)
    corpus = MmCorpus(
        bz2.BZ2File(corpus_path)
    )  # use this if you compressed the TFIDF output (recommended)

    return corpus, dictionary
 def loadModel(self, filename):
     self.util.logDebug('LDA', 'Loading model from ' + filename)
     self.model = LdaMulticore.load(fname=filename)
     self.dictionary = Dictionary.load(fname=filename + '.dict')
     self.corpus = MmCorpus(filename + '.corpus')
     print(self.dictionary)
     print(self.model.print_topic(0, topn=5))
     print(self.model.print_topic(1, topn=5))
     print(self.model.print_topic(2, topn=5))
     print(self.model.print_topic(3, topn=5))
     self.loaded = True
     self.util.logDebug('LDA',
                        'Model loaded in ' + self.util.stopTimeTrack())
     self.labelTopics(filename)
def create_similarity_matrix(doc_term_matrix, dictionary):
    model_tfidf = TfidfModel(doc_term_matrix,
                             id2word=dictionary,
                             normalize=False)
    MmCorpus.serialize('./corpus_tfidf.mm',
                       model_tfidf[doc_term_matrix],
                       progress_cnt=100)
    corpus_tfidf = MmCorpus(
        './corpus_tfidf.mm'
    )  # Loading back the corpus file after applying tf-idf
    model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)

    # Creating the similarity matrix with simple bag-of-words model
    # index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))

    # Creating the similarity matrix with LSI model
    index = similarities.MatrixSimilarity(
        model_lsi[corpus_tfidf],
        num_features=len(dictionary))  # Applying LSI model to all vectors

    # index.save('./similarity_matrix_' + fileName + '.mm')

    return index
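
# To query the returned index, a new document must pass through the same TF-IDF
# and LSI transformations. A hedged sketch, assuming model_tfidf and model_lsi
# are kept (or returned) alongside index, and dictionary is the one passed in above.
query_bow = dictionary.doc2bow('some new document text'.lower().split())
query_lsi = model_lsi[model_tfidf[query_bow]]  # BOW -> TF-IDF -> LSI space
sims = index[query_lsi]  # cosine similarity against every indexed document
top_hits = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]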
Example #33
def main(args):

    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))

    dloader = MultimodalDatasetLoader(args.root, args.name)

    icorp = dloader.load_image_corpus(args.img_label)

    transformer = NormalizationTransform()

    normalized_icorp = transformer._apply(icorp)

    corpus_names = dloader.layout.required_img_corpus_names(args.transformation_label)
    corpus_full_path = os.path.join(args.root, corpus_names[0])

    logging.info('Serializing to file %s' % corpus_full_path)

    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)

    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)
Example #34
    def _build_model(self, all_documents, remove_once=False):
        '''
        Builds the lsa model

        Returns:
            dictionary, corpus
        '''
        doc_hash = hash_obj(all_documents)
        corp_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_corp_' + str(int(remove_once))
        dic_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_dic_' + str(int(remove_once))
        lsi_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_lsi_' + str(int(remove_once))
        if os.path.exists(corp_cache_path) \
                and os.path.exists(dic_cache_path)\
                and os.path.exists(lsi_cache_path):
            lsi = models.LsiModel.load(lsi_cache_path)
            corp = MmCorpus(corp_cache_path)
            dic = Dictionary.load(dic_cache_path)
        else:
            texts = [self.tokenize(doc) for doc in all_documents]
            all_tokens = sum(texts, [])
            if remove_once:
                tokens_once = set(word for word in set(all_tokens)
                                  if all_tokens.count(word) == 1)
                texts = [[word for word in text if word not in tokens_once]
                         for text in texts]
            dic = Dictionary(texts)
            corp = [dic.doc2bow(text) for text in texts]

            MmCorpus.serialize(corp_cache_path, corp)
            dic.save(dic_cache_path)
            lsi = models.LsiModel(
                corp, id2word=dic, num_topics=20)
            lsi.save(lsi_cache_path)
        return dic, corp, lsi
    def generate_lda_topics(self):
        from gensim.corpora import Dictionary, MmCorpus
        from gensim.models.ldamulticore import LdaMulticore
        import pyLDAvis
        import pyLDAvis.gensim
        import warnings
        import _pickle as pickle

        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        trigram_dictionary = Dictionary(trigram_sentences)
        # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()
        trigram_dictionary.save(self.trigram_dictionary_filepath)

        def trigram_bow_generator(filepath):
            for sentence in LineSentence(filepath):
                yield trigram_dictionary.doc2bow(sentence)

        MmCorpus.serialize(
            self.trigram_bow_filepath,
            trigram_bow_generator(self.trigram_sentences_filepath))
        trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=3,
                               id2word=trigram_dictionary,
                               workers=3)
            lda.save(self.lda_model_filepath)
        lda = LdaMulticore.load(self.lda_model_filepath)
        lda.show_topic(0)
        lda.show_topic(1)
        lda.show_topic(2)
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                                  trigram_dictionary)
        pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
Example #36
    def get_trigram_bow_corpus(self,
                               trigram_dictionary,
                               recalculate=False,
                               from_scratch=True):

        if not os.path.isfile(self.paths.trigram_bow_filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No BOW corpus file exists but from_scratch is False')

            print('Building bow corpus...')
            trigram_corpus = LineSentence(self.paths.trigram_corpus_filepath)
            # generate bag-of-words representation
            trigram_bow_generator = (trigram_dictionary.doc2bow(doc)
                                     for doc in trigram_corpus)
            # MmCorpus.serialize writes to disk; load the result back afterwards
            MmCorpus.serialize(self.paths.trigram_bow_filepath,
                               trigram_bow_generator)
            mm_corpus = MmCorpus(self.paths.trigram_bow_filepath)
            print('Done!')
        else:
            print('Loading bow corpus...')
            mm_corpus = MmCorpus(self.paths.trigram_bow_filepath)

        return mm_corpus
Example #37
    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
            copyfile(self.serialization_path, self.serialization_path + '.tmp')  # Make a temporary copy of the file where the corpus is serialized.
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            MmCorpus.serialize(self.serialization_path, corpus_chain)  # Re-serialize the old corpus, and extend it with the new corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)
Example #38
    def _build_model(self, all_documents, remove_once=False):
        '''
        Builds the lsa model

        Returns:
            dictionary, corpus
        '''
        doc_hash = hash_obj(all_documents)
        corp_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_corp_' + str(int(remove_once))
        dic_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_dic_' + str(int(remove_once))
        lsi_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_lsi_' + str(int(remove_once))
        if os.path.exists(corp_cache_path) \
                and os.path.exists(dic_cache_path)\
                and os.path.exists(lsi_cache_path):
            lsi = models.LsiModel.load(lsi_cache_path)
            corp = MmCorpus(corp_cache_path)
            dic = Dictionary.load(dic_cache_path)
        else:
            texts = [self.tokenize(doc) for doc in all_documents]
            all_tokens = sum(texts, [])
            if remove_once:
                tokens_once = set(word for word in set(all_tokens)
                                  if all_tokens.count(word) == 1)
                texts = [[word for word in text if word not in tokens_once]
                         for text in texts]
            dic = Dictionary(texts)
            corp = [dic.doc2bow(text) for text in texts]

            MmCorpus.serialize(corp_cache_path, corp)
            dic.save(dic_cache_path)
            lsi = models.LsiModel(corp, id2word=dic, num_topics=20)
            lsi.save(lsi_cache_path)
        return dic, corp, lsi
Example #39
    def _run_model(self):
        id2word_wiki = Dictionary.load(self.wiki_dict_file)
        mm_corpus = MmCorpus(self.mm_corpus_file)

        #mm_corpus = ClippedCorpus(mm_corpus, 4000)

        self.model = LdaModel(mm_corpus,
                              num_topics=self.config.num_topics,
                              id2word=id2word_wiki,
                              alpha=self.config.alpha,
                              chunksize=self.config.chunksize,
                              iterations=self.config.iterations,
                              passes=self.config.passes)

        self.model.save(self.model_file)
Example #40
def read_matrix_market_file(filepath):
    """Reads a Matrix Market file for Gensim.

    With this function you can read a Matrix Market file to process it with \
    `Gensim <https://radimrehurek.com/gensim/>`_.

    Args:
        filepath (str): Path to Matrix Market file.

    Returns:
        Matrix Market model for Gensim.
    """
    if os.path.splitext(filepath)[1] != '.mm':
        raise ValueError(
            "The file {} is not a Matrix Market file.".format(filepath))
    return MmCorpus(filepath)
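
# A brief usage example with a hypothetical file name.
corpus = read_matrix_market_file('corpus.mm')
print(corpus)  # summary: number of documents, features and non-zero entries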
    def visualizeLDA(self, filename):

        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA',
                           'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
 def get_corpus(self,
                lang,
                data_version,
                dictionary_version,
                language_processed_data: list = None):
     logging.info("--- Getting corpus")
     if self.corpus is None:
         corpus_file_path = Advisor.get_dictionary_version_folder_file_path(
             lang, data_version, dictionary_version, self.file_types[1][0],
             self.file_types[1][1])
         if path.exists(corpus_file_path):
             logging.info("---- Corpus was created before")
             self.corpus = list(MmCorpus(corpus_file_path))
         else:
             self.set_corpus(language_processed_data, corpus_file_path)
     logging.info("--- Corpus captured")
     return
Example #43
def train(corpus_file, dictionary_file, model_file, no_topic, no_iteration,
          no_worker):
    print('Loading corpus from ' + corpus_file)
    corpus = MmCorpus(corpus_file)
    print('Loading dictionary from ' + dictionary_file)
    dictionary = Dictionary.load(dictionary_file)

    print('Training model %d topics in %d iterations with %d workers' %
          (no_topic, no_iteration, no_worker))
    lda = models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=no_topic,
                                           iterations=no_iteration,
                                           workers=no_worker)

    print('Writing model to ' + model_file)
    lda.save(model_file)
def predict_tfid(pred_data, data):
    path = input("Enter path to LDA model: ")
    tfid = gensim.models.TfidfModel.load(path + "tfid_model")
    corpus = MmCorpus(path + "tfid_corpus.mm")
    tfid_corpus = tfid[corpus]
    new_dictionary = Dictionary(data['tokens'])
    new_corpus = [new_dictionary.doc2bow(doc) for doc in data['tokens']]
    index_sparse = SparseMatrixSimilarity(tfid_corpus,
                                          num_features=corpus.num_terms)
    index_sparse.num_best = 500
    idx = (index_sparse[new_corpus])
    print("Most Similar users are as follows: ")
    print("Name\t\t\tscore ")
    m = 1
    for i in idx[0]:
        display("{}. {}     {}".format(m, data.iloc[i[0]]['handles'], i[1]))
        m += 1
    return
Example #45
def load_imdb(root=DATA_DIR, train=True, download=False):
    data_folder = os.path.join(root, "datasets/imdb")
    processed_folder = os.path.join(data_folder, "processed")
    subfolder = "train" if train else "test"
    if download:
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)
        pull_from_url(data_folder)
        docs = []
        ratings = []
        labels = []
        for cls in ["pos", "neg"]:
            work_dir = os.path.join(data_folder, IMDB_DECOMPRESS_FOLDER,
                                    subfolder, cls)
            filenames = os.listdir(work_dir)
            for file in filenames:
                with open(os.path.join(work_dir, file), "r",
                          encoding="utf8") as f:
                    docs.append(word_tokenize(f.read().strip()))
                    ratings.append(PATTERN.search(file).group(1))
                    labels.append(1 if cls == "pos" else 0)
        docs = [[LEMMATIZER.lemmatize(token) for token in doc] for doc in docs]
        if not os.path.exists(processed_folder):
            os.mkdir(processed_folder)
        if train:
            logging.info(
                "Building dictionary and BOW corpus for training data...")
            corpus, dictionary = build_corpus(docs)
            MmCorpus.serialize(
                os.path.join(processed_folder, "train_corpus.mm"), corpus)
            dictionary.save(os.path.join(processed_folder, "dictionary"))
        else:
            logging.info("Building BOW corpus for testing data...")
            dictionary = Dictionary.load(
                os.path.join(processed_folder, "dictionary"))
            corpus, _ = build_corpus(docs, dictionary)
            MmCorpus.serialize(
                os.path.join(processed_folder, "test_corpus.mm"), corpus)
        return corpus, dictionary
    else:
        try:
            if train:
                return MmCorpus(os.path.join(processed_folder, "train_corpus.mm")), \
                       Dictionary.load(os.path.join(processed_folder, "dictionary")),
            else:
                return MmCorpus(os.path.join(processed_folder, "test_corpus.mm")), \
                       Dictionary.load(os.path.join(processed_folder, "dictionary"))
        except FileNotFoundError:
            logging.warning(
                "The dataset does not exist, please set download to True!")
            return None, None
Example #46
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))
    
    corpus = MmCorpus(config.resultFile('bow.mm'))

    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
    else:
        raise ValueError('unknown topic extraction method: %s' % repr(method))
    
    MmCorpus.saveCorpus(config.resultFile('corpus_%s.mm' % method), model[corpus])
            
    logging.info("finished running %s" % program)

Example #47

logging.basicConfig(stream=sys.stdout, level=logging.INFO)


### Generating a training/background corpus
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Provide a filename or a file-like object as input and TextCorpus will be automatically initialized with a
# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
# need to override `get_texts` and provide your own implementation (see the sketch below).
background_corpus = TextCorpus(input=YOUR_CORPUS)

background_corpus.dictionary.save("my_dict.dict")  # Important -- save the dictionary generated by the corpus, or future operations will not be able to map results back to original words.

MmCorpus.serialize("background_corpus.mm", background_corpus)  #  Uses numpy to persist wiki corpus in Matrix Market format. File will be several GBs.


### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

wiki_corpus = WikiCorpus(articles)  # This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.


### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
    input, output = sys.argv[1:3]
    
    # build dictionary. only keep 200k most frequent words (out of total ~7m unique tokens)
    # takes about 8h on a macbook pro
    wiki = gensim.corpora.WikiCorpus('/Users/kofola/gensim/results/enwiki-20100622-pages-articles.xml.bz2',
                                     keep_words = 200000)
    
    # save dictionary and bag-of-words
    # another ~8h
    wiki.saveAsText(output)
    del wiki
    
    # initialize corpus reader and word->id mapping
    from gensim.corpora import MmCorpus
    id2token = WikiCorpus.loadDictionary(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')
    
    # build tfidf
    # ~20min
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word = id2token, normalize = True)
    
    # save tfidf vectors in matrix market format
    # ~1.5h; result file is 14GB! bzip2'ed down to 4.5GB
    MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt = 10000)
    
    logging.info("finished running %s" % program)
    
    # running lsi (chunks=20000, numTopics=400) on wiki_tfidf then takes about 14h.

Example #49
File: GenSim.py Project: rsteckel/EDA
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 2]
dictionary.filter_tokens(once_ids)


dictionary.save(os.environ['NM_HOME'] + '/Data/product_text.dict')


corpus = [dictionary.doc2bow(text) for text in texts]
#corpus = TextCorpus(input=texts)

# Important -- save the dictionary generated by the corpus, or future operations will not be able to map results
# back to original words.
#corpus.dictionary.save("/Users/rsteckel/Workspace/NM/product_text.dict")
#dictionary = corpus.dictionary

MmCorpus.serialize(os.environ['NM_HOME'] + "/Data/product_corpus.mm", corpus)

documents.close()


#-------------LDA-------------
lda = LdaModel(corpus, num_topics=10, id2word=dictionary)

#lda.show_topics()
for i in np.arange(10):
    print(lda.print_topic(i), '\n')



#--------------LSI----------------
tfidf = models.TfidfModel(corpus)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=min_threshold, no_above=max_threshold, keep_n=keep_words)

    # Remove stop words (additional removal of common words used in spoken language)
    stop_ids = []
    with open(stop_words_file, 'r') as infile:
        for line in infile:
            try:
                stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
            except KeyError:
                continue
    wiki.dictionary.filter_tokens(bad_ids=stop_ids)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
Example #51
def make_corpus(path):
    wiki = WikiCorpus(path)
    MmCorpus.serialize('/mnt/ebs/wikidata/wiki_jp_vocab.mm', wiki)
Example #52
class AuthorTopicModel(LdaModel):
    """
    The constructor estimates the author-topic model parameters based
    on a training corpus:

    >>> model = AuthorTopicModel(corpus, num_topics=10, author2doc=author2doc, id2word=id2word)

    The model can be updated (trained) with new documents via

    >>> model.update(other_corpus, other_author2doc)

    Model persistency is achieved through its `load`/`save` methods.
    """

    def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                 chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                 alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
                 gamma_threshold=0.001, serialized=False, serialization_path=None,
                 minimum_probability=0.01, random_state=None):
        """
        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
        start training straight away. If not given, the model is left untrained
        (presumably because you want to call the `update` method manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `author2doc` is a dictionary where the keys are the names of authors, and the
        values are lists of documents that the author contributes to.

        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
        and the values are lists of author names. I.e. this is the reverse mapping of
        `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be
        supplied.

        `passes` is the number of times the model makes a pass over the entire training
        data.

        `iterations` is the maximum number of times the model loops over each document
        (M-step). The iterations stop when convergence is reached.

        `chunksize` controls the size of the mini-batches.

        `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        support special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates. Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively. `decay` controls how quickly old documents are
        forgotten, while `offset` down-weights early iterations.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be an integer or a numpy.random.RandomState object. Set the
        state of the random number generator inside the author-topic model, to ensure
        reproducibility of your experiments, for example.

        `serialized` indicates whether the input corpora to the model are simple
        in-memory lists (`serialized = False`) or saved to the hard-drive
        (`serialized = True`). Note that this behaviour is quite different from
        other Gensim models. If your data is too large to fit in to memory, use
        this functionality. Note that calling `AuthorTopicModel.update` with new
        data may be cumbersome as it requires all the existing data to be
        re-serialized.

        `serialization_path` must be set to a filepath, if `serialized = True` is
        used. Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your
        working directory by setting `serialization_path = serialized_model.mm`. An existing
        file *cannot* be overwritten; either delete the old file or choose a different
        name.

        Example:

        >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
        >>> model.update(corpus2)  # update the author-topic model with additional documents

        >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
        # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
        distributed = False
        self.dispatcher = None
        self.numworkers = 1

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.author2doc = {}
        self.doc2author = {}

        self.distributed = distributed
        self.num_topics = num_topics
        self.num_authors = 0
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0
        self.total_docs = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.author2id = {}
        self.id2author = {}

        self.serialized = serialized
        if serialized and not serialization_path:
            raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).")
        if serialized and serialization_path:
            assert not isfile(serialization_path), \
                "A file already exists at the serialization_path path; " \
                "choose a different serialization_path, or delete the file."
        self.serialization_path = serialization_path

        # Initialize an empty self.corpus.
        self.init_empty_corpus()

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (self.num_topics,), \
            "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError("The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
                "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
                (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
        )

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
        self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None and (author2doc is not None or doc2author is not None):
            use_numpy = self.dispatcher is not None
            self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)

    def __str__(self):
        return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
            (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)

    def init_empty_corpus(self):
        """
        Initialize an empty corpus. If the corpora are to be treated as lists, simply
        initialize an empty list. If serialization is used, initialize an empty corpus
        of the class `gensim.corpora.MmCorpus`.

        """
        if self.serialized:
            # Initialize the corpus as a serialized empty list.
            # This corpus will be extended in self.update.
            MmCorpus.serialize(self.serialization_path, [])  # Serialize empty corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store serialized corpus object in self.corpus.
        else:
            # All input corpora are assumed to just be lists.
            self.corpus = []

    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.
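
        Example (a minimal sketch; `new_corpus` is a hypothetical list of documents in BoW format):

        >>> model.extend_corpus(new_corpus)  # appends to self.corpus, re-serializing if self.serialized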

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
            copyfile(self.serialization_path, self.serialization_path + '.tmp')  # Make a temporary copy of the file where the corpus is serialized.
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            MmCorpus.serialize(self.serialization_path, corpus_chain)  # Re-serialize the old corpus, and extend it with the new corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)

    def compute_phinorm(self, expElogthetad, expElogbetad):
        """Efficiently computes the normalizing factor in phi."""
        expElogtheta_sum = expElogthetad.sum(axis=0)
        phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100

        return phinorm

    def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
        """
        Given a chunk of sparse document vectors, update gamma (parameters
        controlling the topic weights) for each author corresponding to the
        documents in the chunk.

        The whole input chunk of documents is assumed to fit in RAM; chunking of
        a large corpus must be done earlier in the pipeline.

        If `collect_sstats` is True, also collect sufficient statistics needed
        to update the model's topic-word distributions, and return a 2-tuple
        `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`.
        `gamma_chunk` has shape `len(chunk_authors) x self.num_topics`, where
        `chunk_authors` are the authors of the documents in the
        current chunk.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

        """
        try:
            len(chunk)
        except TypeError:
            # convert iterators/generators to plain list, so we have len() etc.
            chunk = list(chunk)
        if len(chunk) > 1:
            logger.debug("performing inference on a chunk of %i documents", len(chunk))

        # Initialize the variational distribution q(theta|gamma) for the chunk
        if collect_sstats:
            sstats = np.zeros_like(self.expElogbeta)
        else:
            sstats = None
        converged = 0

        # Stack all the computed gammas into this output array.
        gamma_chunk = np.zeros((0, self.num_topics))

        # Now, for each document d update gamma and phi w.r.t. all authors in those documents.
        for d, doc in enumerate(chunk):
            if chunk_doc_idx is not None:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get the IDs and counts of all the words in the current document.
            # TODO: this is duplication of code in LdaModel. Refactor.
            if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
                # make sure the term IDs are ints, otherwise np will get upset
                ids = [int(idx) for idx, _ in doc]
            else:
                ids = [idx for idx, _ in doc]
            cts = np.array([cnt for _, cnt in doc])

            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]

            gammad = self.state.gamma[authors_d, :]  # gamma of document d before update.
            tilde_gamma = gammad.copy()  # gamma that will be updated.

            # Compute the expectation of the log of the Dirichlet parameters theta and beta.
            Elogthetad = dirichlet_expectation(tilde_gamma)
            expElogthetad = np.exp(Elogthetad)
            expElogbetad = self.expElogbeta[:, ids]

            # Compute the normalizing constant of phi for the current document.
            phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

            # Iterate between gamma and phi until convergence
            for _ in xrange(self.iterations):
                lastgamma = tilde_gamma.copy()

                # Update gamma.
                # phi is computed implicitly below,
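                # so that, per author a: tilde_gamma[a] = alpha + |docs(a)| * expElogtheta[a] * ((cts / phinorm) . expElogbeta.T)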
                for ai, a in enumerate(authors_d):
                    tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)

                # Update gamma.
                # Interpolation between document d's "local" gamma (tilde_gamma),
                # and "global" gamma (gammad).
                tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma

                # Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
                Elogthetad = dirichlet_expectation(tilde_gamma)
                expElogthetad = np.exp(Elogthetad)

                # Update the normalizing constant in phi.
                phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

                # Check for convergence.
                # Criterion is mean change in "local" gamma.
                meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
                gamma_condition = meanchange_gamma < self.gamma_threshold
                if gamma_condition:
                    converged += 1
                    break
            # End of iterations loop.

            # Store the updated gammas in the model state.
            self.state.gamma[authors_d, :] = tilde_gamma

            # Stack the new gammas into the output array.
            gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])

            if collect_sstats:
                # Contribution of document d to the expected sufficient
                # statistics for the M step.
                expElogtheta_sum_a = expElogthetad.sum(axis=0)
                sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)

        if len(chunk) > 1:
            logger.debug(
                "%i/%i documents converged within %i iterations",
                converged, len(chunk), self.iterations
            )

        if collect_sstats:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
            # = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
            sstats *= self.expElogbeta
        return gamma_chunk, sstats

    def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None):
        """
        Perform inference on a chunk of documents, and accumulate the collected
        sufficient statistics in `state` (or `self.state` if None).

        """

        # TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible.
        if state is None:
            state = self.state
        gamma, sstats = self.inference(
            chunk, author2doc, doc2author, rhot,
            collect_sstats=True, chunk_doc_idx=chunk_doc_idx
        )
        state.sstats += sstats
        state.numdocs += len(chunk)
        return gamma

    def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None):
        """
        Calculate and return per-word likelihood bound, using the `chunk` of
        documents as evaluation corpus. Also output the calculated statistics, including
        perplexity=2^(-bound), to the log at INFO level.
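
        Example (a hedged sketch; `chunk` is a list of BoW documents from the training
        corpus and `idx` their indices in `self.corpus`):

        >>> model.log_perplexity(chunk, chunk_doc_idx=idx, total_docs=model.total_docs)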

        """

        # TODO: This method is very similar to the one in LdaModel. Refactor.
        if total_docs is None:
            total_docs = len(chunk)
        corpus_words = sum(cnt for document in chunk for _, cnt in document)
        subsample_ratio = 1.0 * total_docs / len(chunk)
        perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
        logger.info(
            "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words",
            perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
        )
        return perwordbound

    def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None,
               update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (a repeatable stream of documents).

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0]. Additionally, for smaller
        `corpus` sizes, an increasing `offset` may be beneficial (see
        Table 1 in Hoffman et al.)

        If update is called with authors that already exist in the model, it will
        resume training not only on the new documents for those authors, but also on
        the previously seen documents. This is necessary for those authors' topic
        distributions to converge.

        Every time `update(corpus, author2doc)` is called, the new documents are
        appended to all the previously seen documents, and author2doc is
        combined with the previously seen authors.

        To resume training on all the data seen by the model, simply call
        `update()`.

        It is not possible to add new authors to existing documents, as all
        documents in `corpus` are assumed to be new documents.

        Args:
            corpus (gensim corpus): The corpus with which the author-topic model should be updated.

            author2doc (dictionary): author to document mapping corresponding to indexes in input
                corpus.

            doc2author (dictionary): document to author mapping corresponding to indexes in input
                corpus.

            chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np
                array or not. np can in some settings turn the term IDs
                into floats; these will be converted back into integers in
                inference, which incurs a performance hit. For distributed
                computing it may be desirable to keep the chunks as np
                arrays.

        For other parameter settings, see :class:`AuthorTopicModel` constructor.
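
        Example (a minimal sketch; `corpus2` and `author2doc2` are hypothetical new documents and
        their author mapping):

        >>> model.update(corpus2, author2doc2)  # extend the model with the new documents and resume training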

        """

        # use parameters given in constructor, unless user explicitly overrode them
        if decay is None:
            decay = self.decay
        if offset is None:
            offset = self.offset
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if eval_every is None:
            eval_every = self.eval_every
        if iterations is None:
            iterations = self.iterations
        if gamma_threshold is None:
            gamma_threshold = self.gamma_threshold

        # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"),
        # the process simply gets killed.
        author2doc = deepcopy(author2doc)
        doc2author = deepcopy(doc2author)

        # TODO: it is not possible to add new authors to an existing document (all input documents are treated
        # as completely new documents). Perhaps this functionality could be implemented.
        # If it's absolutely necessary, the user can delete the documents that have new authors, and call update
        # on them with the new and old authors.

        if corpus is None:
            # Just keep training on the already available data.
            # Assumes self.update() has been called before with input documents and corresponding authors.
            assert self.total_docs > 0, 'update() was called with no documents to train on.'
            train_corpus_idx = [d for d in xrange(self.total_docs)]
            num_input_authors = len(self.author2doc)
        else:
            if doc2author is None and author2doc is None:
                raise ValueError('at least one of author2doc/doc2author must be specified, to establish input space dimensionality')

            # If either doc2author or author2doc is missing, construct them from the other.
            if doc2author is None:
                doc2author = construct_doc2author(corpus, author2doc)
            elif author2doc is None:
                author2doc = construct_author2doc(doc2author)

            # Number of authors that need to be updated.
            num_input_authors = len(author2doc)

            try:
                len_input_corpus = len(corpus)
            except TypeError:
                logger.warning("input corpus stream has no len(); counting documents")
                len_input_corpus = sum(1 for _ in corpus)
            if len_input_corpus == 0:
                logger.warning("AuthorTopicModel.update() called with an empty corpus")
                return

            self.total_docs += len_input_corpus

            # Add new documents in corpus to self.corpus.
            self.extend_corpus(corpus)

            # Obtain a list of new authors.
            new_authors = []
            # Sorting the author names makes the model more reproducible.
            for a in sorted(author2doc.keys()):
                if not self.author2doc.get(a):
                    new_authors.append(a)

            num_new_authors = len(new_authors)

            # Add new authors to the author2id/id2author dictionaries.
            for a_id, a_name in enumerate(new_authors):
                self.author2id[a_name] = a_id + self.num_authors
                self.id2author[a_id + self.num_authors] = a_name

            # Increment the number of total authors seen.
            self.num_authors += num_new_authors

            # Initialize the variational distributions q(theta|gamma)
            gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
            self.state.gamma = np.vstack([self.state.gamma, gamma_new])

            # Combine author2doc with self.author2doc.
            # First, increment the document IDs by the number of previously seen documents.
            for a, doc_ids in author2doc.items():
                author2doc[a] = [d + self.total_docs - len_input_corpus for d in doc_ids]

            # For all authors in the input corpus, add the new documents.
            for a, doc_ids in author2doc.items():
                if self.author2doc.get(a):
                    # This is not a new author, append new documents.
                    self.author2doc[a].extend(doc_ids)
                else:
                    # This is a new author, create index.
                    self.author2doc[a] = doc_ids

            # Add all new documents to self.doc2author, shifting their IDs in the same way.
            for d, a_list in doc2author.items():
                self.doc2author[d + self.total_docs - len_input_corpus] = a_list

            # Train on all documents of all authors seen so far (including the authors in the input corpus).
            train_corpus_idx = []
            for doc_ids in self.author2doc.values():  # For all documents in the total corpus.
                train_corpus_idx.extend(doc_ids)

            # Make the list of training documents unique.
            train_corpus_idx = list(set(train_corpus_idx))

        # train_corpus_idx is only a list of indexes, so "len" is valid.
        lencorpus = len(train_corpus_idx)

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        self.state.numdocs += lencorpus

        if update_every:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once "
            "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
            evalafter, iterations, gamma_threshold
        )

        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy")

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
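        # i.e. rho_t = (offset + pass_ + num_updates / chunksize) ** (-decay), the step size from Hoffman et al.'s online LDA.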
        def rho():
            return pow(offset + pass_ + (self.num_updates / chunksize), -decay)

        for pass_ in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers', self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                # gamma is not needed in "other", thus its shape is (0, 0).
                other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
            dirty = False

            reallen = 0
            for chunk_no, chunk_doc_idx in enumerate(utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)):
                chunk = [self.corpus[d] for d in chunk_doc_idx]
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                    # log_perplexity requires the indexes of the documents being evaluated, to know what authors
                    # correspond to the documents.
                    self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus)

                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info(
                        "PROGRESS: pass %i, dispatching documents up to #%i/%i",
                        pass_, chunk_no * chunksize + len(chunk), lencorpus
                    )
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info(
                        "PROGRESS: pass %i, at document #%i/%i",
                        pass_, chunk_no * chunksize + len(chunk), lencorpus
                    )
                    # do_estep requires the indexes of the documents being trained on, to know what authors
                    # correspond to the documents.
                    gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)

                    if self.optimize_alpha:
                        self.update_alpha(gammat, rho())

                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other, pass_ > 0)
                    del other  # frees up memory

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
                    dirty = False
            # endfor single corpus iteration
            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other, pass_ > 0)
                del other

    def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
        """
        Estimate the variational bound of documents from `corpus`:
        E_q[log p(corpus)] - E_q[log q(corpus)]

        There are basically two use cases of this method:
        1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
        indicating the indexes of the documents in the training corpus.
        2. `chunk` is a test set (held-out data), and author2doc and doc2author
        corresponding to this test set are provided. There must not be any new authors
        passed to this method. `chunk_doc_idx` is not needed in this case.

        To obtain the per-word bound, compute:

        >>> corpus_words = sum(cnt for document in corpus for _, cnt in document)
        >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words
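
        Evaluating a held-out set is a similar hedged sketch (`test_corpus`, `test_author2doc` and
        `test_doc2author` are hypothetical, and must only contain authors seen during training):

        >>> model.bound(test_corpus, author2doc=test_author2doc, doc2author=test_doc2author)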

        """

        # TODO: enable evaluation of documents with new authors. One could, for example, make it
        # possible to pass a list of documents to self.inference with no author dictionaries,
        # assuming all the documents correspond to one (unseen) author, learn the author's
        # gamma, and return gamma (without adding it to self.state.gamma). Of course,
        # collect_sstats should be set to false, so that the model is not updated w.r.t. these
        # new documents.

        _lambda = self.state.get_lambda()
        Elogbeta = dirichlet_expectation(_lambda)
        expElogbeta = np.exp(Elogbeta)

        gamma = self.state.gamma

        if author2doc is None and doc2author is None:
            # Evaluating on training documents (chunk of self.corpus).
            author2doc = self.author2doc
            doc2author = self.doc2author

            if not chunk_doc_idx:
                # If author2doc and doc2author are not provided, chunk is assumed to be a subset of
                # self.corpus, and chunk_doc_idx is thus required.
                raise ValueError('Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.')
        elif author2doc is not None and doc2author is not None:
            # Evaluating held-out documents (documents not seen during training).
            # All authors in dictionaries must still be seen during training.
            for a in author2doc.keys():
                if not self.author2doc.get(a):
                    raise ValueError('bound cannot be called with authors not seen during training.')

            if chunk_doc_idx:
                raise ValueError('Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.')
        else:
            raise ValueError('Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.')

        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)

        word_score = 0.0
        theta_score = 0.0
        for d, doc in enumerate(chunk):
            if chunk_doc_idx:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
            ids = np.array([id for id, _ in doc])  # Word IDs in doc.
            cts = np.array([cnt for _, cnt in doc])  # Word counts.

            if d % self.chunksize == 0:
                logger.debug("bound: at document #%i in chunk", d)

            # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
            # is the same computation as in normalizing phi.
            phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
            word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))

        # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
        # that the likelihood is always roughly on the same scale.
        word_score *= subsample_ratio

        # E[log p(theta | alpha) - log q(theta | gamma)]
        for a in author2doc.keys():
            a = self.author2id[a]
            theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
            theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
            theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))

        # theta_score is rescaled in a similar fashion.
        # TODO: treat this in a more general way, similar to how it is done with word_score.
        theta_score *= self.num_authors / len(author2doc)

        # E[log p(beta | eta) - log q (beta | lambda)]
        beta_score = 0.0
        beta_score += np.sum((self.eta - _lambda) * Elogbeta)
        beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
        sum_eta = np.sum(self.eta)
        beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

        total_score = word_score + theta_score + beta_score

        return total_score

    def get_document_topics(self, word_id, minimum_probability=None):
        """
        This method overwrites `LdaModel.get_document_topics` and simply raises an
        exception. `get_document_topics` is not valid for the author-topic model,
        use `get_author_topics` instead.

        """

        raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.')

    def get_author_topics(self, author_name, minimum_probability=None):
        """
        Return the topic distribution for the given author, as a list of
        (topic_id, topic_probability) 2-tuples.
        Ignore topics with very low probability (below `minimum_probability`).

        Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`),
        is not supported.
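
        Example (a hedged sketch; 'firstauthor' is a hypothetical author name present in the model's `author2doc`):

        >>> model.get_author_topics('firstauthor', minimum_probability=0.05)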

        """

        author_id = self.author2id[author_name]

        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :])

        author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= minimum_probability]

        return author_topics

    def __getitem__(self, author_names, eps=None):
        """
        Return the topic distribution for the input author as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignores topics with probability less than `eps`.

        Do not call this method directly, instead use `model[author_names]`.
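
        Example (hedged; 'firstauthor' and 'secondauthor' are hypothetical author names):

        >>> model['firstauthor']                      # single author: list of (topic_id, probability)
        >>> model[['firstauthor', 'secondauthor']]    # list of authors: one such list per author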

        """
        if isinstance(author_names, list):
            items = []
            for a in author_names:
                items.append(self.get_author_topics(a, minimum_probability=eps))
        else:
            items = self.get_author_topics(author_names, minimum_probability=eps)

        return items
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

dictionary, corpus = prep_corpus(docs['tokens'])

MmCorpus.serialize('data/model/newsgroups.mm', corpus)
dictionary.save('data/model/newsgroups.dict')

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10)
lda.save('data/model/newsgroups_50.model')
Example #54
def setup():
    documents = []
    import glob
    import os
    import re
    directoryNames = list(set(glob.glob(os.path.join("Data", "*"))).difference(set(glob.glob(os.path.join("Data","*.*")))))
    numberOfDocuments = 0

    for folder in directoryNames:
        for fileNameDir in os.walk(folder):
            for fileName in fileNameDir[2]:
                if fileName[-4:] != ".txt":
                    continue
                nameFileDocument = "{0}{1}{2}".format(fileNameDir[0], os.sep, fileName)
                with open(nameFileDocument, 'r') as doc:
                    doc_text = doc.read().replace('\n', '')
                processed_doc_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', doc_text)
                documents.append(processed_doc_text)
                numberOfDocuments += 1
                break

    print(numberOfDocuments)

    # remove common words and tokenize

    #from gensim.utils import lemmatize
    #lemmatized_docs = [lemmatize(document) for document in documents]

    from stop_words import get_stop_words
    stop_words = get_stop_words('english')
    # split each document string into word tokens before filtering stopwords
    texts = [[word for word in document.split() if word not in stop_words]
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    from gensim import corpora
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    dictionary.save('files/pmc-data.dict') # store the dictionary, for future reference

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('files/pmc-data.mm', corpus) # store to disk, for later use


    from gensim.corpora import MmCorpus

    mm = MmCorpus('files/pmc-data.mm')

    from gensim.models import TfidfModel

    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    MmCorpus.serialize('files/pmc-data-tfidf.mm', tfidf[mm], progress_cnt=10000)
Example #55
File: lda2.py  Project: pielstroem/Topics

class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus = MyCorpus()

#create output folder
if not os.path.exists("out"): os.makedirs("out")

corpusPath = os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'mm']))

MmCorpus.serialize(corpusPath, corpus)

mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
Example #56
File: demo.py  Project: pielstroem/Topics
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile(r'\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s")) # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # use the pipeline's ParagraphId to split the text into documents
    # (overrides doc_size); 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda training passes - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None
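    # a hedged alternative (hypothetical value): a scalar gives a symmetric prior over words,
    # e.g. eta = 0.01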

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))
            # construct documents
            if doc_split:  # size according to paragraph id
                doc = doc.groupby('ParagraphId')
                for para_id, para in doc:
                    docs.append(para['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(para_id)]))
            else:  # size according to doc_size
                doc = doc.sort_values(by='TokenId')
                i = 1
                while(doc_size < doc.shape[0]):
                    docs.append(
                        doc[:doc_size]['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(i)]))
                    doc = doc.drop(doc.index[:doc_size])
                    i += 1
                docs.append(doc['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))

            if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
                os.makedirs(os.path.join(os.getcwd(), "swcorp"))

            swpath = os.path.join('swcorp', "".join(file_label))

            with open(swpath + ".txt", 'w', encoding="utf-8") as text:
                text.write(" ".join(
                    word for word in doc['Lemma'].values.astype(str)
                    if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #   [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for textpath in folder:
            with open(textpath, 'r', encoding="utf-8") as text:
                textline = [re.sub(
                    r'\\n\\r', '', document) for document in ' '.join(
                        text.read().split())]
                if textpath != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(
            mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)

    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for i, item in enumerate(topics):
        print("topic #" + str(i) + ": " + str(item) + "\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for i, item in enumerate(topics):
            f.write(
                "".join(["topic #", str(i), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example #57
print 'Building dictionary of terms ...'
dictionary = corpora.Dictionary(texts)
print '%d word types' % len(dictionary)

print 'Filtering infrequent and frequent terms ...'
dictionary.filter_extremes(no_below=5, no_above=0.5)
print '%d word types, after filtering' % len(dictionary)

print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
Example #58
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
Example #59
if(argc>2):
    topicnum=int(sys.argv[2])

if(argc>3):
    conference=sys.argv[3]

relpath= conference+str(year)
rname= relpath+'/papers'
print conference, year, topicnum

if(not os.path.exists(rname+'.mm')):
    with open(relpath+'/allpapers.txt') as fp:
        d=fp.readlines()
        docs=[i.split(" ") for i in d]

        dictionary, corpus = prep_corpus(docs)

        MmCorpus.serialize(rname+'.mm',
                           corpus)
        dictionary.save(rname+'.dict')
else:
    # the serialized corpus already exists; load it and the dictionary so both names
    # are defined below (assumes `corpora` is imported from gensim above)
    corpus = MmCorpus(rname+'.mm')
    dictionary = corpora.Dictionary.load(rname+'.dict')

t0=time.clock()
lda = models.ldamodel.LdaModel(corpus=corpus, 
                               id2word=dictionary,
                               num_topics=topicnum,
                               passes=10)
print time.clock()-t0
                                      
lda.save(relpath+'/papers_%d.model'%(topicnum))