def main(argv=None):
    """Build the Simple Wikipedia corpus files, then the TF-IDF/LSI artifacts.

    Downloads the raw dump when ``WIKIFILE`` is missing, writes the article
    lookup table (``ARTICLEDICT``), the bag-of-words corpus (``MMFILE``) and
    the dictionary (``DICTFILE``), and finally trains and saves the TF-IDF
    model, the LSI model and the similarity index.

    :param argv: command-line arguments; defaults to ``sys.argv``. It is not
        used further inside this function.
    """
    argv = sys.argv if argv is None else argv

    print('Creating simple wiki serialized corpus')

    # Download the raw file only if we do not have it already.
    dump_present = os.path.isfile(WIKIFILE)
    if not dump_present:
        wget.download(WIKIURL)

    wiki = WikiCorpus(WIKIFILE, lemmatize=False)

    # Running document number -> (article URL built from text[0], text[1]).
    url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
    article_dict = {
        doc_no: (url_string.format(text[0]), text[1])
        for doc_no, text in enumerate(wiki.get_texts(meta=True))
    }
    with open(ARTICLEDICT, 'w') as handle:
        json.dump(article_dict, handle)

    wiki.dictionary.filter_extremes(
        no_below=20,
        no_above=0.1,
        keep_n=DEFAULT_DICT_SIZE,
    )
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI on top of the TF-IDF weighted corpus.
    dictionary = Dictionary.load_from_text(DICTFILE)
    bow = MmCorpus(MMFILE)
    tfidf = TfidfModel(bow, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[bow], progress_cnt=10000)

    weighted = MmCorpus(TDIFFILE)
    lsi = LsiModel(weighted, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[weighted])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def extend_corpus(self, corpus):
    """Append the documents in `corpus` to `self.corpus`.

    Without serialization both corpora must be lists and `self.corpus` is
    extended in place. With serialization the whole corpus is written again
    to `self.serialization_path`, old documents first, new documents after.
    """
    if not self.serialized:
        # self.corpus and corpus are plain lists: check, then extend.
        assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
        self.corpus.extend(corpus)
        return

    if isinstance(corpus, MmCorpus):
        # Refuse to read from the very file we are about to overwrite.
        assert self.corpus.input != corpus.input, \
            'Input corpus cannot have the same file path as the model corpus (serialization_path).'

    backup_path = self.serialization_path + '.tmp'

    # Lazy stream of the old documents followed by the new ones.
    old_then_new = chain(self.corpus, corpus)

    # Read the old documents from a temporary copy while the real file is rewritten.
    copyfile(self.serialization_path, backup_path)
    self.corpus.input = backup_path

    MmCorpus.serialize(self.serialization_path, old_then_new)
    self.corpus = MmCorpus(self.serialization_path)

    # The temporary copy is no longer needed.
    remove(backup_path)
def test_apply(self):
    """Apply the transformer, persist the result and check it loads back.

    The transformed corpus is serialized under the first required corpus
    name and saved as an object under the third one; the loader must then
    report the corpora as present, and the re-loaded object must still wrap
    a ``VTextCorpus`` that carries a dictionary.
    """
    transformed_vtcorp = self.transformer._apply(self.vtcorp)

    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
    text_data_name = os.path.join(self.data_root,
                                  self.loader.layout.corpus_dir,
                                  transformed_names[0])
    text_obj_name = os.path.join(self.data_root,
                                 self.loader.layout.corpus_dir,
                                 transformed_names[2])

    MmCorpus.serialize(text_data_name, transformed_vtcorp)
    transformed_vtcorp.save(text_obj_name)

    self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

    # Register every file that may have been created so that it gets cleaned up.
    self.temporary_files.extend([
        os.path.join(self.data_root,
                     self.loader.layout.corpus_dir,
                     transformed_name)
        for transformed_name in transformed_names])

    transformed_vtcorp = TransformedCorpus.load(text_obj_name)

    self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
    self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    # Parenthesised single-argument print: same output on Python 2 and 3.
    print('Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary))
    self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
def createcorpus(bg_corpus, output_dictionary, output_serialize):
    """Create a training/background corpus from your own document source.

    The dictionary is saved to `output_dictionary` and the corpus is
    serialized in Matrix Market form to `output_serialize`.

    :param bg_corpus: input handed to ``TextCorpus(input=...)``
    :param output_dictionary: destination of the saved dictionary
    :param output_serialize: destination of the serialized corpus
    :return: tuple ``(corpus, corpus.dictionary)``
    """
    print("Creating corpus and dictionary")

    corpus = TextCorpus(input=bg_corpus)
    corpus.dictionary.save(output_dictionary)
    MmCorpus.serialize(output_serialize, corpus)

    result = (corpus, corpus.dictionary)
    return result
def _create_bow_representation(self):
    """Create the bag-of-words representation of the collection.

    Each entry of ``self.articles`` is converted with
    ``self.dictionary.doc2bow``; the resulting list is kept in
    ``self.bow_corpus`` and saved in Matrix Market format at
    ``self.bowmm_filepath``.
    """
    print('Create bag-of-words matrix representation.')

    bow_documents = []
    for article in self.articles:
        bow_documents.append(self.dictionary.doc2bow(article))
    self.bow_corpus = bow_documents

    MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)
def load_experts():
    """Serialize the expert corpus and pickle the expert-to-document map.

    Writes ``expert_corpus_new_test.mm`` and ``expert2doc_new_test.p`` into
    the current working directory.
    """
    # Load expert data and save it to file.
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    # Save the expert-to-document map to pickle. The context manager makes
    # sure the file is flushed and closed even if pickling fails.
    # NOTE(review): `expert2doc` is not defined in this function; presumably
    # it is a module-level name filled while the corpus is read -- confirm.
    with open('expert2doc_new_test.p', 'wb') as handle:
        pickle.dump(expert2doc, handle)
def _create_tfidf_matrix(self):
    """Create the TF-IDF matrix and save it in Matrix Market format.

    The model is fitted on ``self.bow_corpus`` and the transformed corpus is
    written to ``self.tfidf_filepath``.
    """
    print('Create TF-IDF matrix of collection.')

    bow = self.bow_corpus
    model = TfidfModel(bow, id2word=self.dictionary, normalize=True)
    weighted = model[self.bow_corpus]
    MmCorpus.serialize(self.tfidf_filepath, weighted)

    print('Number of documents:', model.num_docs)
def init_empty_corpus(self):
    """Set `self.corpus` to an empty corpus.

    Without serialization this is just an empty list. With serialization an
    empty corpus is written to `self.serialization_path` and re-opened as a
    `gensim.corpora.MmCorpus`, to be extended later.
    """
    if not self.serialized:
        # All input corpora are assumed to just be lists.
        self.corpus = []
        return

    # Write an empty corpus to disk, then keep the on-disk corpus object.
    MmCorpus.serialize(self.serialization_path, [])
    self.corpus = MmCorpus(self.serialization_path)
def pretrain():
    """Pre-train on the text corpus and build the dictionary.

    Saves the dictionary in binary (``dict_file``) and text
    (``dic_txt_file``) form, serializes the corpus to ``mm_corpus_file`` and
    prints the corpus that was written.
    """
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)

    # serialize() only writes the file; it does not hand back a corpus, so
    # the file is opened again to have something meaningful to print.
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    mm = MmCorpus(mm_corpus_file)
    print(mm)
def build_pyLDAvis_output(corp_loc, dict_loc, lda_loc):
    """Render a saved LDA model as a pyLDAvis HTML page.

    :param corp_loc: path of the Matrix Market corpus
    :param dict_loc: path of the saved dictionary
    :param lda_loc: path of the saved LDA model; ``.model`` is appended when
        the file name does not contain ``model``
    The HTML file is written next to the model, named after the part of the
    model's file name that precedes its first dot.
    """
    import os

    # Only the file name is inspected: a directory such as "models/" must not
    # suppress the suffix, and dots in directories (e.g. "./out/lda") must
    # not truncate the output path.
    if 'model' not in os.path.basename(lda_loc):
        lda_loc += '.model'

    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)
    lda = models.LdaModel.load(lda_loc)
    vis_data = gensim_vis.prepare(lda, corpus, dictionary, sort_topics=False)

    stem = os.path.basename(lda_loc).split('.')[0]
    html_path = os.path.join(os.path.dirname(lda_loc), stem + '.html')
    pyLDAvis.save_html(vis_data, html_path)
def _run_model(self):
    """Train the LSI model on the TF-IDF weighted wiki corpus.

    Writes the TF-IDF corpus to ``self.wiki_tfidf_file``, the LSI-projected
    corpus to ``self.wiki_lsi_file`` and the model to ``self.model_file``.
    """
    id2word_wiki = Dictionary.load(self.wiki_dict_file)
    bow = MmCorpus(self.mm_corpus_file)
    # to be removed
    # bow = ClippedCorpus(bow, 4000)

    weighting = TfidfModel(bow, id2word=id2word_wiki)
    weighted = weighting[bow]
    MmCorpus.serialize(self.wiki_tfidf_file, weighted)

    settings = self.config
    self.model = LsiModel(
        weighted,
        num_topics=settings.num_topics,
        id2word=id2word_wiki,
        chunksize=settings.chunksize,
    )
    MmCorpus.serialize(self.wiki_lsi_file, self.model[weighted])
    self.model.save(self.model_file)
def save_corpus(self):
    """Persist dictionary, corpus, TF-IDF vectorizer/corpus and the db index.

    Every output path is ``self.prefix`` plus a fixed suffix or a class name.
    Requires a non-empty ``self.corpus`` and a truthy ``self.run``.
    """
    assert self.corpus, 'corpus is not in memory'
    assert self.run, 'run id is missing'

    base = self.prefix

    self.dictionary.save(base + self.corpus.dictionary.__class__.__name__)
    MmCorpus.serialize(fname=base + 'corpus', corpus=self.corpus)

    vectorizer = self.tfidf_vectorizer
    vectorizer.save(base + vectorizer.__class__.__name__)
    self.corpus_tfidf = vectorizer[self.corpus]
    MmCorpus.serialize(fname=base + 'corpus_tfidf', corpus=self.corpus_tfidf)

    # NOTE(review): binary mode matches the Python 2 csv module; on Python 3
    # csv.writer needs a text-mode file -- confirm the target interpreter.
    with open(base + 'db_index', 'wb') as out:
        rows = csv.writer(out, delimiter=',')
        rows.writerow(['db_index'])
        rows.writerows([entry] for entry in self.index)
def __init__(self, textcolname):
    """Load corpus, dictionary and LDA model belonging to `textcolname`.

    File names are built from the ``PConstant`` directory values, the column
    name and a fixed suffix. English stop words are loaded as well.
    """
    self._flogger()

    corpus_file = PConstant.CORPUS_DIR_PATH.value + textcolname + '_corpus.mm'
    self.corpus = MmCorpus(corpus_file)

    dictionary_file = PConstant.DICTIONARY_DIR_PATH.value + textcolname + '_dictionary.dict'
    self.dictionary = Dictionary.load(dictionary_file)

    model_file = PConstant.LDA_DIR_PATH.value + textcolname + '_lda.model'
    self.lda = models.LdaModel.load(model_file)

    self.stopwords = StopWord.EnglishStopWord().stopwords()
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of
    topics (Wikipedia). `gensim` is a package for topic modeling only, so for
    a particular topic modeling task it is a lighter option to install and
    run. Also it can be run distributed and updated over an existing model.

    The dictionary (``resources/deals.dict``) and corpus
    (``resources/deals.mm``) are created on first use and re-used afterwards.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics; each topic is a list of ``(word, score)``
        pairs taken verbatim from the pieces of ``lda.print_topic``
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        # Drop tokens that occur in exactly one document. `.items()` works on
        # both Python 2 and 3, unlike `.iteritems()`.
        freq1 = [token_id for token_id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)

    topics = []
    for i in range(n_topics):
        topic = []
        # Each piece of the printed topic looks like "<score>*<word>".
        for word in lda.print_topic(i).split('+'):
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
def main():
    """Serialize the Reuters-21578 corpus and save its dictionary as text.

    Input and outputs live in the ``data`` directory below the current
    working directory.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Read in the corpus from within the archive file.
    archive = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(archive)

    # Filter out some of the more common words, and some of the
    # less-common ones as well.
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus.
    corpus_out = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(corpus_out, rc)

    # Save the dictionary to file as text.
    dictionary_out = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(dictionary_out)
def main():
    """Build the artist-page corpus: doc index, bag-of-words and TF-IDF.

    Outputs are written next to ``OUT_PREFIX``: ``_docindex.p``,
    ``_bow.mm``, ``_wordids.txt.bz2``, ``.tfidf_model`` and ``_tfidf.mm``.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # Close the pickle file deterministically instead of leaking the handle.
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5,
                                    keep_n=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def train_lda_model(token_tweets):
    """Train an LDA model on tokenized tweets, save and evaluate it.

    The bag-of-words corpus, dictionary, model and printed topics are written
    to ``CORPUS_FILE``, ``DICT_FILE``, ``LDA_MODEL_FILE`` and
    ``LDA_TOPICS_FILE``.

    :param token_tweets: tokenized documents
    :return: the trained LDA model
    """
    print('Start LDA model training ...\n')

    # Build dictionary, then remove words that occur in less than 10
    # documents or in more than 50% of the documents.
    tweets_dict = corpora.Dictionary(token_tweets)
    tweets_dict.filter_extremes(no_below=10, no_above=0.5)

    # Vectorize every document by counting the frequency of each word.
    bow_corpus = []
    for doc in token_tweets:
        bow_corpus.append(tweets_dict.doc2bow(doc))

    # Save corpus and dictionary to file.
    MmCorpus.serialize(CORPUS_FILE, bow_corpus)
    tweets_dict.save(DICT_FILE)

    # Create the tf-idf model and apply it to the entire corpus.
    tfidf_corpus = models.TfidfModel(bow_corpus)[bow_corpus]

    # Train the LDA model and save it to file.
    lda_model = models.ldamodel.LdaModel(
        corpus=tfidf_corpus,
        num_topics=NUM_TOPICS,
        id2word=tweets_dict,
        passes=NUM_PASSES,
        alpha=ALPHA,
        eta=ETA,
        random_state=49,
    )
    lda_model.save(LDA_MODEL_FILE)
    print('LDA model saved\n')

    # Save all generated topics to a file.
    msg = ''.join(
        'Topic: {} \nWords: {}\n'.format(idx, topic)
        for idx, topic in lda_model.print_topics(-1)
    )
    save_print_to_file(LDA_TOPICS_FILE, msg)

    # Evaluate LDA model performance, then visualize the topics.
    eval_lda(lda_model, tfidf_corpus, tweets_dict, token_tweets)
    vis_topics(lda_model, tfidf_corpus, tweets_dict)

    return lda_model
def build_LDA_model(corp_loc, dict_loc, num_topics, num_pass, lda_loc):
    """Train a multicore LDA model, save it and build its pyLDAvis output.

    :param corp_loc: path of the Matrix Market corpus
    :param dict_loc: path of the saved dictionary
    :param num_topics: number of topics (converted with ``int``)
    :param num_pass: number of passes (converted with ``int``)
    :param lda_loc: model path without the ``.model`` suffix
    """
    bow = MmCorpus(corp_loc)
    id2word = Dictionary.load(dict_loc)

    settings = dict(
        corpus=bow,
        id2word=id2word,
        num_topics=int(num_topics),
        alpha='asymmetric',
        passes=int(num_pass),
    )
    lda = gensim.models.LdaMulticore(**settings)
    lda.save(lda_loc + '.model')

    build_pyLDAvis_output(corp_loc, dict_loc, lda_loc)
def __init__(self, docs, model, persist=False, path="./saved_models/lsi_corpus.crp"):
    """Keep `docs` as the corpus, or a persisted `model[docs]` when asked.

    With ``persist`` the transformed corpus is written to `path` unless that
    file already exists, and ``self.corpus`` reads from that file. Without
    ``persist`` ``self.corpus`` is `docs` itself.
    """
    self.path = path
    self.persist = persist
    self.model = model

    must_build = (not os.path.exists(self.path)) and self.persist
    if must_build:
        print("creating model repr.")
        transformed = model[docs]
        print("saving model repr to disk.")
        MmCorpus.serialize(self.path, transformed)

    if self.persist:
        self.corpus = MmCorpus(self.path)
    else:
        self.corpus = docs
def mapper_init(self):
    '''
    Load the files and models the mapper needs: review dictionary and
    corpus, the "|"-separated pairs table, and a 15-topic LSI model built
    from the corpus.
    '''
    # prerequisite document vectors
    dictionary = Dictionary.load("reviews_dictionary.dict")
    self.dictionary = dictionary
    corpus = MmCorpus("reviews_corpus.mm")
    self.corpus = corpus

    # paired dataset
    self.df = pd.read_csv("user_rest_pair.csv", sep="|")

    # initialize lsi space
    self.lsi = models.LsiModel(self.corpus,
                               id2word=self.dictionary,
                               num_topics=15)
def display_data(self):
    """Prepare the pyLDAvis data for the saved LDA model and display it.

    The string form of the prepared data is still written to
    ``self.LDAvis_data_filepath``.

    :return: whatever ``pyLDAvis.display`` returns, so that a notebook cell
        ending in this call actually renders the visualisation
    """
    lda = LdaMulticore.load(self.lda_model_filepath)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    # NOTE(review): this expects the dictionary in text format
    # (save_as_text); confirm it matches how the file was written.
    trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(self.LDAvis_data_filepath, 'w') as f:
        f.write(str(LDAvis_prepared))
        # json.dump(LDAvis_prepared.to_json(), f)

    # Display the prepared data itself. The previous code re-opened the file
    # and passed the file handle to display(), which is not prepared data.
    return pyLDAvis.display(LDAvis_prepared)
def main():
    """Build the TF-IDF model for Reuters-21578 and save model and vectors.

    All files are read from and written to the ``data`` directory below the
    current working directory.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    def in_datadir(name):
        return path.join(datadir, name)

    # load back the id->word mapping directly from file
    vocabulary = Dictionary.load_from_text(in_datadir("reuters21578.dict.txt"))

    # load the corpus
    mm = MmCorpus(in_datadir("reuters21578.mm"))

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    tfidf.save(in_datadir("reuters21578.tfidf.model"))

    # save TF-IDF vectors in matrix market format
    MmCorpus.serialize(in_datadir("reuters21578.tfidf.mm"), tfidf[mm],
                       progress_cnt=10000)
def load_corpus(self, corpus_name):
    '''
    Load the corpus files that share the prefix `corpus_name`: the Matrix
    Market corpus, the word-id dictionary, the LSI model and the LSI index.

    This needs to be moved to a more general class initialization.
    (FIXME Freija)
    '''
    self.corpus_name = corpus_name

    self.corpus_mm = MmCorpus(corpus_name + '.mm')
    self.corpus_dict = Dictionary.load_from_text(corpus_name + '_wordids.txt')
    self.model = LsiModel.load(corpus_name + '.lsi_model')
    self.index = similarities.MatrixSimilarity.load(corpus_name + '-lsi.index')
def create_tfidf_corpus(corpus_file, dict_file, outputs_dir):
    """Fit a TF-IDF model on a serialized corpus and save model and vectors.

    :param corpus_file: Matrix Market bag-of-words corpus
    :param dict_file: dictionary in text format
    :param outputs_dir: directory receiving ``wikipedia.tfidf_model`` and
        ``wikipedia_tfidf.mm``
    :return: tuple ``(tfidf_model_file, tfidf_corpus_file)``
    """
    tfidf_model_file = os.path.join(outputs_dir, "wikipedia.tfidf_model")
    tfidf_corpus_file = os.path.join(outputs_dir, "wikipedia_tfidf.mm")

    # Load back the id->word mapping directly from file. This seems to save
    # more memory, compared to keeping the wiki.dictionary object around.
    id2word = Dictionary.load_from_text(dict_file)

    # initialize corpus reader
    bow = MmCorpus(corpus_file)

    # build TF-IDF, ~50min
    model = TfidfModel(bow, id2word=id2word, normalize=True)
    model.save(tfidf_model_file)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(tfidf_corpus_file, model[bow], progress_cnt=10000)

    return tfidf_model_file, tfidf_corpus_file
def create_corpus(dump_file, outputs_dir, max_batch=None):
    """Turn a Wikipedia dump into bag-of-words corpus, dictionary and titles.

    :param dump_file: the Wikipedia dump to read
    :param outputs_dir: directory receiving the three output files
    :param max_batch: forwarded to ``WikiCorpus``
    :return: tuple ``(corpus_file, dict_file, titles_files)``
    """
    def output(name):
        return os.path.join(outputs_dir, name)

    # Takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(dump_file, max_batch=max_batch)

    # Only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(
        no_below=3, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)

    corpus_file = output("wikipedia_bow.mm")
    dict_file = output("wikipedia_wordids.txt.bz2")
    titles_files = output("wikipedia_titles")

    # Save dictionary and bag-of-words (term-document frequency matrix).
    # Another ~9h
    MmCorpus.serialize(corpus_file, corpus=wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(dict_file)
    wiki.save_titles(titles_files)

    return corpus_file, dict_file, titles_files
def main(argv=None):
    """Build the speech corpus files, then the TF-IDF/LSI artifacts.

    Reads the speeches from ``RAWFILE`` and their metadata from ``RAWIDS``,
    writes the article lookup table, dictionary and bag-of-words corpus, and
    finally trains and saves the TF-IDF model, LSI model and similarity index.

    :param argv: command-line arguments; defaults to ``sys.argv``. It is not
        used further inside this function.
    """
    argv = sys.argv if argv is None else argv

    print('Creating speech serialized corpus')

    # The speech corpus is inside the raw file in a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as raw:
        speech_dict = json.load(raw)
    with open(RAWIDS, 'r') as raw_ids:
        id_dict = json.load(raw_ids)

    # The article ids must be saved in the format the gensimple engine
    # understands: "int": ["url", "title"],
    texts = []
    article_dict = {}
    for counter, (key, value) in enumerate(speech_dict.items()):
        texts.append(list(value['text']))
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]

    with open(ARTICLEDICT, 'w') as out:
        json.dump(article_dict, out)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    bow_documents = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, bow_documents)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    bow = MmCorpus(MMFILE)
    tfidf = TfidfModel(bow, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[bow], progress_cnt=10000)

    weighted = MmCorpus(TDIFFILE)
    lsi = LsiModel(weighted, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[weighted])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def main():
    """Build the artist-page corpus: doc index, bag-of-words and TF-IDF.

    Outputs are written next to ``OUT_PREFIX``: ``_docindex.p``,
    ``_bow.mm``, ``_wordids.txt.bz2``, ``.tfidf_model`` and ``_tfidf.mm``.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # Close the pickle file deterministically instead of leaking the handle.
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5,
                                    keep_n=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def visualise(model_file, corpus_file, dictionary_file):
    """Load corpus, dictionary and LDA model and display them with pyLDAvis.

    Use the Notebook version if this is not working.
    """
    print('Loading corpus from ' + corpus_file)
    bow = MmCorpus(corpus_file)

    print('Loading dictionary from ' + dictionary_file)
    id2word = Dictionary.load(dictionary_file)

    print('Loading model from ' + model_file)
    lda = models.ldamulticore.LdaMulticore.load(model_file)

    prepared = gensimvis.prepare(lda, bow, id2word)
    pyLDAvis.display(prepared)

    print('Please use Jupyter notebook visualise.ipynb if not working')
def setUp(self):
    """Create the test corpus, a u_mass coherence callback and an LDA model.

    Also records the host and port used for the "visdom" logger.
    """
    self.host = "http://localhost"
    self.port = 8097

    self.corpus = MmCorpus(datapath('testcorpus.mm'))

    metric_settings = dict(
        corpus=self.corpus,
        coherence="u_mass",
        logger="visdom",
        title="Coherence",
    )
    self.ch_umass = CoherenceMetric(**metric_settings)
    self.callback = [self.ch_umass]

    self.model = LdaModel(
        id2word=common_dictionary,
        num_topics=2,
        passes=10,
        callbacks=self.callback,
    )
def load_corpus_and_dict(corpus_path, id2word_path):
    """Load a bz2-compressed Matrix Market corpus and its text dictionary.

    :param corpus_path: path of the bz2-compressed corpus file
    :param id2word_path: path of the dictionary in text format
    :return: tuple ``(corpus, dictionary)``
    """
    # Report the path that is actually loaded: the message previously
    # referenced `data_path`, a name that is not defined in this function.
    print("[BLOCK] Loading corpus and dictionary files from %s and %s" % (corpus_path, id2word_path))
    sys.stdout.flush()

    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()

    # For an uncompressed file use: gensim.corpora.MmCorpus(corpus_path)
    # use this if you compressed the TFIDF output (recommended)
    corpus = MmCorpus(bz2.BZ2File(corpus_path))

    return corpus, dictionary
def loadModel(self, filename):
    """Load model, dictionary and corpus stored under the `filename` prefix.

    The dictionary is read from ``filename + '.dict'`` and the corpus from
    ``filename + '.corpus'``. The dictionary and the first four topics are
    printed, then the topics are labelled.
    """
    self.util.logDebug('LDA', 'Loading model from ' + filename)

    self.model = LdaMulticore.load(fname=filename)
    self.dictionary = Dictionary.load(fname=filename + '.dict')
    self.corpus = MmCorpus(filename + '.corpus')

    print(self.dictionary)
    for topic_no in range(4):
        print(self.model.print_topic(topic_no, topn=5))

    self.loaded = True
    self.util.logDebug('LDA', 'Model loaded in ' + self.util.stopTimeTrack())
    self.labelTopics(filename)
def create_similarity_matrix(doc_term_matrix, dictionary):
    """Build a similarity index over the LSI projection of the corpus.

    The bag-of-words corpus is weighted with TF-IDF (written to and read
    back from ``./corpus_tfidf.mm``), projected into a 15-topic LSI space,
    and indexed with ``MatrixSimilarity``.

    :param doc_term_matrix: bag-of-words corpus
    :param dictionary: id -> word mapping for the corpus
    :return: the similarity index over the LSI vectors
    """
    model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)
    MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

    # Load back the corpus file after applying tf-idf.
    corpus_tfidf = MmCorpus('./corpus_tfidf.mm')
    model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)

    # Creating the similarity matrix with simple bag-of-words model
    # index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))

    # Creating the similarity matrix with LSI model. The indexed vectors
    # live in topic space, so the index width is the number of topics, not
    # the vocabulary size; the latter only pads every row with unused columns.
    index = similarities.MatrixSimilarity(
        model_lsi[corpus_tfidf],
        num_features=model_lsi.num_topics)

    # index.save('./similarity_matrix_' + fileName + '.mm')
    return index
def main(args):
    """Normalize an image corpus, serialize it and re-save the corpus object.

    :param args: parsed arguments providing ``root``, ``name``,
        ``img_label`` and ``transformation_label``
    """
    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))
    dloader = MultimodalDatasetLoader(args.root, args.name)

    source_corpus = dloader.load_image_corpus(args.img_label)
    normalized_icorp = NormalizationTransform()._apply(source_corpus)

    label = args.transformation_label
    first_name = dloader.layout.required_img_corpus_names(label)[0]
    corpus_full_path = os.path.join(args.root, first_name)

    logging.info('Serializing to file %s' % corpus_full_path)
    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)
    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)
def _build_model(self, all_documents, remove_once=False):
    '''
    Builds the lsa model, re-using cached files when all three exist.

    all_documents: the documents to tokenize and model
    remove_once: drop words that occur exactly once in the whole collection

    Returns:
        dictionary, corpus, lsi model. The corpus is an MmCorpus when it
        comes from the cache and a list of bag-of-words documents otherwise.
    '''
    from collections import Counter

    doc_hash = hash_obj(all_documents)
    flag = str(int(remove_once))
    corp_cache_path = CACHE_DIR + '/' + doc_hash + '_corp_' + flag
    dic_cache_path = CACHE_DIR + '/' + doc_hash + '_dic_' + flag
    lsi_cache_path = CACHE_DIR + '/' + doc_hash + '_lsi_' + flag

    if os.path.exists(corp_cache_path) \
            and os.path.exists(dic_cache_path) \
            and os.path.exists(lsi_cache_path):
        lsi = models.LsiModel.load(lsi_cache_path)
        corp = MmCorpus(corp_cache_path)
        dic = Dictionary.load(dic_cache_path)
    else:
        texts = [self.tokenize(doc) for doc in all_documents]

        if remove_once:
            # One counting pass instead of list.count() per distinct word.
            frequencies = Counter(word for text in texts for word in text)
            tokens_once = set(word for word, seen in frequencies.items()
                              if seen == 1)
            texts = [[word for word in text if word not in tokens_once]
                     for text in texts]

        dic = Dictionary(texts)
        corp = [dic.doc2bow(text) for text in texts]

        MmCorpus.serialize(corp_cache_path, corp)
        dic.save(dic_cache_path)

        lsi = models.LsiModel(
            corp, id2word=dic, num_topics=20)
        lsi.save(lsi_cache_path)

    return dic, corp, lsi
def generate_lda_topics(self):
    """Build dictionary, bag-of-words corpus and a 3-topic LDA model.

    Reads the sentences at ``self.trigram_sentences_filepath``; saves the
    dictionary, the serialized corpus, the LDA model and a pyLDAvis HTML
    page at the corresponding ``self.*_filepath`` locations.
    """
    from gensim.corpora import Dictionary, MmCorpus
    from gensim.models.ldamulticore import LdaMulticore

    import pyLDAvis
    import pyLDAvis.gensim
    import warnings
    import _pickle as pickle

    sentences = LineSentence(self.trigram_sentences_filepath)
    trigram_dictionary = Dictionary(sentences)

    # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(self.trigram_dictionary_filepath)

    # Stream the sentences once more, converting each to bag-of-words.
    bow_stream = (
        trigram_dictionary.doc2bow(sentence)
        for sentence in LineSentence(self.trigram_sentences_filepath)
    )
    MmCorpus.serialize(self.trigram_bow_filepath, bow_stream)

    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=3,
                           id2word=trigram_dictionary,
                           workers=3)
        lda.save(self.lda_model_filepath)

    lda = LdaMulticore.load(self.lda_model_filepath)
    for topic_no in range(3):
        lda.show_topic(topic_no)

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)
    pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
def get_trigram_bow_corpus(self, trigram_dictionary, recalculate=False, from_scratch=True):
    """Return the bag-of-words corpus, building it first when needed.

    :param trigram_dictionary: dictionary used to convert documents
    :param recalculate: rebuild even if the corpus file already exists
    :param from_scratch: allow building; when False a rebuild is refused
    :return: the ``MmCorpus`` stored at ``self.paths.trigram_bow_filepath``
    :raises ValueError: a build is required but ``from_scratch`` is False
    """
    if not os.path.isfile(self.paths.trigram_bow_filepath) or recalculate:
        if not from_scratch:
            raise ValueError(
                'No BOW corpus file exists but from_scratch is False')

        print('Building bow corpus...')
        trigram_corpus = LineSentence(self.paths.trigram_corpus_filepath)

        # generate bag-of-words representation
        trigram_bow_generator = (trigram_dictionary.doc2bow(doc)
                                 for doc in trigram_corpus)
        MmCorpus.serialize(self.paths.trigram_bow_filepath,
                           trigram_bow_generator)

        # serialize() only writes the file and does not return a corpus, so
        # open what was just written; otherwise the caller would get None.
        mm_corpus = MmCorpus(self.paths.trigram_bow_filepath)
        print('Done!')
    else:
        print('Loading bow corpus...')
        mm_corpus = MmCorpus(self.paths.trigram_bow_filepath)

    return mm_corpus
def extend_corpus(self, corpus):
    """Append the documents in `corpus` to `self.corpus`.

    Without serialization both corpora must be lists and `self.corpus` is
    extended in place. With serialization the whole corpus is written again
    to `self.serialization_path`, old documents first, new documents after.
    """
    if not self.serialized:
        # self.corpus and corpus are plain lists: check, then extend.
        assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
        self.corpus.extend(corpus)
        return

    if isinstance(corpus, MmCorpus):
        # Refuse to read from the very file we are about to overwrite.
        assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).'

    backup_path = self.serialization_path + '.tmp'

    # Lazy stream of the old documents followed by the new ones.
    old_then_new = chain(self.corpus, corpus)

    # Read the old documents from a temporary copy while the real file is rewritten.
    copyfile(self.serialization_path, backup_path)
    self.corpus.input = backup_path

    MmCorpus.serialize(self.serialization_path, old_then_new)
    self.corpus = MmCorpus(self.serialization_path)

    # The temporary copy is no longer needed.
    remove(backup_path)
def _build_model(self, all_documents, remove_once=False):
    '''
    Builds the lsa model, re-using cached files when all three exist.

    all_documents: the documents to tokenize and model
    remove_once: drop words that occur exactly once in the whole collection

    Returns:
        dictionary, corpus, lsi model. The corpus is an MmCorpus when it
        comes from the cache and a list of bag-of-words documents otherwise.
    '''
    from collections import Counter

    doc_hash = hash_obj(all_documents)
    flag = str(int(remove_once))
    corp_cache_path = CACHE_DIR + '/' + doc_hash + '_corp_' + flag
    dic_cache_path = CACHE_DIR + '/' + doc_hash + '_dic_' + flag
    lsi_cache_path = CACHE_DIR + '/' + doc_hash + '_lsi_' + flag

    if os.path.exists(corp_cache_path) \
            and os.path.exists(dic_cache_path) \
            and os.path.exists(lsi_cache_path):
        lsi = models.LsiModel.load(lsi_cache_path)
        corp = MmCorpus(corp_cache_path)
        dic = Dictionary.load(dic_cache_path)
    else:
        texts = [self.tokenize(doc) for doc in all_documents]

        if remove_once:
            # One counting pass instead of list.count() per distinct word.
            frequencies = Counter(word for text in texts for word in text)
            tokens_once = set(word for word, seen in frequencies.items()
                              if seen == 1)
            texts = [[word for word in text if word not in tokens_once]
                     for text in texts]

        dic = Dictionary(texts)
        corp = [dic.doc2bow(text) for text in texts]

        MmCorpus.serialize(corp_cache_path, corp)
        dic.save(dic_cache_path)

        lsi = models.LsiModel(corp, id2word=dic, num_topics=20)
        lsi.save(lsi_cache_path)

    return dic, corp, lsi
def _run_model(self):
    """Train an LDA model on the serialized wiki corpus and save it.

    Hyper-parameters come from ``self.config``; the model is kept in
    ``self.model`` and written to ``self.model_file``.
    """
    id2word_wiki = Dictionary.load(self.wiki_dict_file)
    bow = MmCorpus(self.mm_corpus_file)
    # bow = ClippedCorpus(bow, 4000)

    settings = self.config
    self.model = LdaModel(
        bow,
        num_topics=settings.num_topics,
        id2word=id2word_wiki,
        alpha=settings.alpha,
        chunksize=settings.chunksize,
        iterations=settings.iterations,
        passes=settings.passes,
    )
    self.model.save(self.model_file)
def read_matrix_market_file(filepath):
    """Reads a Matrix Market file for Gensim.

    With this function you can read a Matrix Market file to process it with \
    `Gensim <https://radimrehurek.com/gensim/>`_.

    Args:
        filepath (str): Path to Matrix Market file.

    Returns:
        Matrix Market model for Gensim.

    Raises:
        ValueError: If the file extension is not exactly ``.mm``.

    """
    _, extension = os.path.splitext(filepath)
    if extension == '.mm':
        return MmCorpus(filepath)

    raise ValueError(
        "The file {} is not a Matrix Market file.".format(filepath))
def visualizeLDA(self, filename):
    """Write a pyLDAvis HTML page for the model stored under `filename`.

    Reads ``filename + '.dict'``, ``filename + '.corpus'`` and the model at
    `filename`; writes ``filename + '.html'``.
    """
    log = self.util.logDebug

    id2word = Dictionary.load(filename + '.dict')
    bow = MmCorpus(filename + '.corpus')
    model = LdaMulticore.load(filename)

    log('LDA', 'Preparing HTML ')
    prepared = pyLDAvis.gensim.prepare(model, bow, id2word)
    log('LDA', 'HTML prepared in ' + self.util.stopTimeTrack())

    pyLDAvis.save_html(prepared, filename + '.html')
    log('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
def get_corpus(self, lang, data_version, dictionary_version, language_processed_data: list = None):
    """Make sure ``self.corpus`` is populated.

    Nothing is loaded when ``self.corpus`` is already set. Otherwise the
    corpus is read from its file when that file exists, or created through
    ``self.set_corpus`` from `language_processed_data`.
    """
    logging.info("--- Getting corpus")

    needs_loading = self.corpus is None
    if needs_loading:
        file_kind = self.file_types[1]
        corpus_file_path = Advisor.get_dictionary_version_folder_file_path(
            lang, data_version, dictionary_version, file_kind[0], file_kind[1])

        if not path.exists(corpus_file_path):
            self.set_corpus(language_processed_data, corpus_file_path)
        else:
            logging.info("---- Corpus was created before")
            self.corpus = list(MmCorpus(corpus_file_path))

    logging.info("--- Corpus captured")
    return
def train(corpus_file, dictionary_file, model_file, no_topic, no_iteration, no_worker):
    """Train a multicore LDA model and write it to `model_file`.

    :param corpus_file: Matrix Market corpus to train on
    :param dictionary_file: saved dictionary (id -> word)
    :param model_file: destination of the trained model
    :param no_topic: number of topics
    :param no_iteration: number of iterations
    :param no_worker: number of worker processes
    """
    print('Loading corpus from ' + corpus_file)
    bow = MmCorpus(corpus_file)

    print('Loading dictionary from ' + dictionary_file)
    id2word = Dictionary.load(dictionary_file)

    print('Training model %d topics in %d interations with %d workers' % (no_topic, no_iteration, no_worker))
    settings = dict(
        corpus=bow,
        id2word=id2word,
        num_topics=no_topic,
        iterations=no_iteration,
        workers=no_worker,
    )
    lda = models.ldamulticore.LdaMulticore(**settings)

    print('Writing model to ' + model_file)
    lda.save(model_file)
def predict_tfid(pred_data, data):
    """Display the stored users most similar to the first row of `data`.

    Prompts for the directory prefix holding ``tfid_model`` and
    ``tfid_corpus.mm``, indexes the TF-IDF corpus and queries it with the
    bag-of-words of ``data['tokens']``.

    :param pred_data: not used inside this function
    :param data: table with ``tokens`` and ``handles`` columns
    """
    path = input("Enter path to LDA model: ")

    model = gensim.models.TfidfModel.load(path + "tfid_model")
    stored = MmCorpus(path + "tfid_corpus.mm")
    weighted = model[stored]

    # NOTE(review): the query documents get their own, freshly built
    # dictionary; confirm its ids line up with the stored corpus.
    query_dictionary = Dictionary(data['tokens'])
    queries = [query_dictionary.doc2bow(doc) for doc in data['tokens']]

    index_sparse = SparseMatrixSimilarity(weighted, num_features=stored.num_terms)
    index_sparse.num_best = 500
    matches = index_sparse[queries]

    print("Most Similar users are as follows: ")
    print("Name\t\t\tscore ")
    for rank, hit in enumerate(matches[0], 1):
        display("{}. {} {}".format(rank, data.iloc[hit[0]]['handles'], hit[1]))
    return
def load_imdb(root=DATA_DIR, train=True, download=False):
    """Return the IMDB bag-of-words corpus and its dictionary.

    Args:
        root: Base directory; data lives under ``<root>/datasets/imdb``.
        train: Select the ``train`` split when true, otherwise ``test``.
        download: When true, fetch the raw data with ``pull_from_url``,
            tokenize and lemmatize every review of the split, build the
            corpus with ``build_corpus`` and serialize it under
            ``<root>/datasets/imdb/processed``. When false, only the
            previously serialized files are read.

    Returns:
        A ``(corpus, dictionary)`` pair. With ``download=False`` and missing
        files a warning is logged and ``(None, None)`` is returned.

    The test split reuses the dictionary saved by an earlier training run,
    so the training split has to be processed first.
    """
    data_folder = os.path.join(root, "datasets/imdb")
    processed_folder = os.path.join(data_folder, "processed")
    dictionary_path = os.path.join(processed_folder, "dictionary")
    corpus_path = os.path.join(
        processed_folder, "train_corpus.mm" if train else "test_corpus.mm")

    if not download:
        try:
            return MmCorpus(corpus_path), Dictionary.load(dictionary_path)
        except FileNotFoundError:
            logging.warning(
                "The dataset does not exist, please set download to True!")
            return None, None

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_folder, exist_ok=True)
    pull_from_url(data_folder)

    subfolder = "train" if train else "test"
    docs = []
    for cls in ["pos", "neg"]:
        work_dir = os.path.join(data_folder, IMDB_DECOMPRESS_FOLDER, subfolder, cls)
        for file_name in os.listdir(work_dir):
            with open(os.path.join(work_dir, file_name), "r", encoding="utf8") as f:
                tokens = word_tokenize(f.read().strip())
            docs.append([LEMMATIZER.lemmatize(token) for token in tokens])

    os.makedirs(processed_folder, exist_ok=True)

    if train:
        logging.info(
            "Building dictionary and BOW corpus for training data...")
        corpus, dictionary = build_corpus(docs)
        MmCorpus.serialize(corpus_path, corpus)
        dictionary.save(dictionary_path)
    else:
        logging.info("Building BOW corpus for testing data...")
        dictionary = Dictionary.load(dictionary_path)
        corpus, _ = build_corpus(docs, dictionary)
        MmCorpus.serialize(corpus_path, corpus)
    return corpus, dictionary
# Load the word-id mapping and the bag-of-words corpus from the result files.
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids" % len(id2word))

corpus = MmCorpus(config.resultFile('bow.mm'))

# Build and save the model selected by `method`.
if method == 'tfidf':
    model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    model.save(config.resultFile('model_tfidf.pkl'))
elif method == 'lda':
    model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
    model.save(config.resultFile('model_lda.pkl'))
elif method in ('lsi', 'rp'):
    # Both latent-space methods start by turning word counts into
    # tf-idf weights ...
    tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    # ... and then learn the transformation from tf-idf to latent space.
    if method == 'lsi':
        model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    else:
        model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
else:
    raise ValueError('unknown topic extraction method: %s' % repr(method))

# Store the corpus as transformed by the chosen model.
MmCorpus.saveCorpus(config.resultFile('corpus_%s.mm' % method), model[corpus])

logging.info("finished running %s" % program)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

### Generating a training/background corpus
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# TextCorpus takes a filename or a file-like object as input. According to
# the original notes it then builds a dictionary in `self.dictionary` and
# supports iteration; for other kinds of corpora, override `get_texts`.
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or later results
# cannot be mapped back to the original words.
background_corpus.dictionary.save("my_dict.dict")

# Persist the corpus in Matrix Market format. The original notes warn that
# the file will be several GBs.
MmCorpus.serialize("background_corpus.mm", background_corpus)

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

# Dump available from http://en.wikipedia.org/wiki/Wikipedia:Database_download
articles = "enwiki-latest-pages-articles.xml.bz2"

# This will take many hours! Output is Wikipedia as a bag-of-words (BOW)
# sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")
MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.

### Working with persisted corpus and dictionary
# Revive the serialized corpus from disk.
bow_corpus = MmCorpus("wiki_corpus.mm")
# Command line arguments: path of the wiki dump, then the output file prefix.
# (`input` shadows the builtin; the name is kept in case later code uses it.)
input, output = sys.argv[1:3]

# build dictionary. only keep 200k most frequent words (out of total ~7m unique tokens)
# takes about 8h on a macbook pro
# The dump path comes from the command line; it used to be hard-coded to a
# developer machine path, which silently ignored the `input` argument.
wiki = gensim.corpora.WikiCorpus(input, keep_words=200000)

# save dictionary and bag-of-words
# another ~8h
wiki.saveAsText(output)
del wiki

# initialize corpus reader and word->id mapping
from gensim.corpora import MmCorpus
id2token = WikiCorpus.loadDictionary(output + '_wordids.txt')
mm = MmCorpus(output + '_bow.mm')

# build tfidf
# ~20min
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~1.5h; result file is 14GB! bzip2'ed down to 4.5GB
MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000)

logging.info("finished running %s" % program)

# running lsi (chunks=20000, numTopics=400) on wiki_tfidf then takes about 14h.
# Remove a set of tokens from the dictionary based on document frequency.
# NOTE(review): the list is named `once_ids` but selects tokens whose document
# frequency is exactly 2, not 1 -- confirm which threshold is intended.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 2]
dictionary.filter_tokens(once_ids)

# Important -- save the dictionary, or future operations will not be able to
# map results back to original words.
# (The closing quote of this path did not match its opening quote.)
dictionary.save(os.environ['NM_HOME'] + "/Data/product_text.dict")

corpus = [dictionary.doc2bow(text) for text in texts]

# Disabled alternative that builds the corpus with TextCorpus instead:
#corpus = TextCorpus(input=texts)
#corpus.dictionary.save("/Users/rsteckel/Workspace/NM/product_text.dict")
#dictionary = corpus.dictionary

MmCorpus.serialize(os.environ['NM_HOME'] + "/Data/product_corpus.mm", corpus)

documents.close()

#-------------LDA-------------
lda = LdaModel(corpus, num_topics=10, id2word=dictionary)

#lda.show_topics()
for i in range(10):
    # Single-argument print() gives the same output on Python 2 and 3:
    # the topic, a space, then a blank line.
    print("%s \n" % lda.print_topic(i))

#--------------LSI----------------
tfidf = models.TfidfModel(corpus)
wiki = WikiCorpus(inp, lemmatize=lemmatize)

# only keep the most frequent words
wiki.dictionary.filter_extremes(
    no_below=min_threshold,
    no_above=max_threshold,
    keep_n=keep_words,
)

# Remove stop words (additional removal of common words used in spoken
# language). Stop words that are not in the dictionary are skipped.
with open(stop_words_file, 'r') as infile:
    stop_ids = [
        wiki.dictionary.token2id[word]
        for word in (line.lower().strip() for line in infile)
        if word in wiki.dictionary.token2id
    ]
wiki.dictionary.filter_tokens(bad_ids=stop_ids)

# save dictionary and bag-of-words (term-document frequency matrix)
bow_path = outp + '_bow.mm'
wordids_path = outp + '_wordids.txt.bz2'
MmCorpus.serialize(bow_path, wiki, progress_cnt=10000)
wiki.dictionary.save_as_text(wordids_path)

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(wordids_path)
del wiki

# initialize corpus reader and word->id mapping
mm = MmCorpus(bow_path)

# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '.tfidf_model')

# save tfidf vectors in matrix market format
def make_corpus(path, output_path='/mnt/ebs/wikidata/wiki_jp_vocab.mm'):
    """Serialize the wiki dump at ``path`` in Matrix Market format.

    Args:
        path: Location of the dump handed to ``WikiCorpus``.
        output_path: Destination file for ``MmCorpus.serialize``. The default
            is the location that used to be hard-coded, so existing
            single-argument calls behave as before.
    """
    wiki = WikiCorpus(path)
    MmCorpus.serialize(output_path, wiki)
class AuthorTopicModel(LdaModel): """ The constructor estimates the author-topic model parameters based on a training corpus: >>> model = AuthorTopicModel(corpus, num_topics=10, author2doc=author2doc, id2word=id2word) The model can be updated (trained) with new documents via >>> model.update(other_corpus, other_author2doc) Model persistency is achieved through its `load`/`save` methods. """ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None, chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): """ If the iterable corpus and one of author2doc/doc2author dictionaries are given, start training straight away. If not given, the model is left untrained (presumably because you want to call the `update` method manually). `num_topics` is the number of requested latent topics to be extracted from the training corpus. `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. `author2doc` is a dictionary where the keys are the names of authors, and the values are lists of documents that the author contributes to. `doc2author` is a dictionary where the keys are document IDs (indexes to corpus) and the values are lists of author names. I.e. this is the reverse mapping of `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be supplied. `passes` is the number of times the model makes a pass over the entire trianing data. `iterations` is the maximum number of times the model loops over each document (M-step). The iterations stop when convergence is reached. `chunksize` controls the size of the mini-batches. `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic (theta) and topic-word (lambda) distributions. 
Both default to a symmetric 1.0/num_topics prior. `alpha` can be set to an explicit array = prior of your choice. It also support special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value 'auto', which learns an asymmetric prior over words directly from your data. `eta` can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Calculate and log perplexity estimate from the latest mini-batch every `eval_every` model updates. Set to None to disable perplexity estimation. `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. `decay` controls how quickly old documents are forgotten, while `offset` down-weights early iterations. `minimum_probability` controls filtering the topics returned for a document (bow). `random_state` can be an integer or a numpy.random.RandomState object. Set the state of the random number generator inside the author-topic model, to ensure reproducibility of your experiments, for example. `serialized` indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`) or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from other Gensim models. If your data is too large to fit in to memory, use this functionality. Note that calling `AuthorTopicModel.update` with new data may be cumbersome as it requires all the existing data to be re-serialized. `serialization_path` must be set to a filepath, if `serialized = True` is used. 
Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your working directory by setting `serialization_path = serialized_model.mm`. An existing file *cannot* be overwritten; either delete the old file or choose a different name. Example: >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model >>> model.update(corpus2) # update the author-topic model with additional documents >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data """ # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState. distributed = False self.dispatcher = None self.numworkers = 1 self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError( "at least one of corpus/id2word must be specified, to establish input space dimensionality" ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: self.num_terms = 1 + max(self.id2word.keys()) else: self.num_terms = 0 if self.num_terms == 0: raise ValueError("cannot compute the author-topic model over an empty collection (no terms)") logger.info('Vocabulary consists of %d words.', self.num_terms) self.author2doc = {} self.doc2author = {} self.distributed = distributed self.num_topics = num_topics self.num_authors = 0 self.chunksize = chunksize self.decay = decay self.offset = offset self.minimum_probability = minimum_probability self.num_updates = 0 self.total_docs = 0 self.passes = passes self.update_every = update_every self.eval_every = eval_every self.author2id = {} self.id2author = {} self.serialized = serialized if serialized and 
not serialization_path: raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).") if serialized and serialization_path: assert not isfile(serialization_path), \ "A file already exists at the serialization_path path; " \ "choose a different serialization_path, or delete the file." self.serialization_path = serialization_path # Initialize an empty self.corpus. self.init_empty_corpus() self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': raise ValueError("The 'asymmetric' option cannot be used for eta") self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') self.random_state = utils.get_random_state(random_state) assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms) ) # VB constants self.iterations = iterations self.gamma_threshold = gamma_threshold # Initialize the variational distributions q(beta|lambda) and q(theta|gamma) self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics)) self.state.sstats = self.random_state.gamma(100., 1. 
/ 100., (self.num_topics, self.num_terms)) self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) # if a training corpus was provided, start estimating the model right away if corpus is not None and (author2doc is not None or doc2author is not None): use_numpy = self.dispatcher is not None self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy) def __str__(self): return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \ (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) def init_empty_corpus(self): """ Initialize an empty corpus. If the corpora are to be treated as lists, simply initialize an empty list. If serialization is used, initialize an empty corpus of the class `gensim.corpora.MmCorpus`. """ if self.serialized: # Initialize the corpus as a serialized empty list. # This corpus will be extended in self.update. MmCorpus.serialize(self.serialization_path, []) # Serialize empty corpus. self.corpus = MmCorpus(self.serialization_path) # Store serialized corpus object in self.corpus. else: # All input corpora are assumed to just be lists. self.corpus = [] def extend_corpus(self, corpus): """ Add new documents in `corpus` to `self.corpus`. If serialization is used, then the entire corpus (`self.corpus`) is re-serialized and the new documents are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended. """ if self.serialized: # Re-serialize the entire corpus while appending the new documents. if isinstance(corpus, MmCorpus): # Check that we are not attempting to overwrite the serialized corpus. assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).' corpus_chain = chain(self.corpus, corpus) # A generator with the old and new documents. 
copyfile(self.serialization_path, self.serialization_path + '.tmp') # Make a temporary copy of the file where the corpus is serialized. self.corpus.input = self.serialization_path + '.tmp' # Point the old corpus at this temporary file. MmCorpus.serialize(self.serialization_path, corpus_chain) # Re-serialize the old corpus, and extend it with the new corpus. self.corpus = MmCorpus(self.serialization_path) # Store the new serialized corpus object in self.corpus. remove(self.serialization_path + '.tmp') # Remove the temporary file again. else: # self.corpus and corpus are just lists, just extend the list. # First check that corpus is actually a list. assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists." self.corpus.extend(corpus) def compute_phinorm(self, expElogthetad, expElogbetad): """Efficiently computes the normalizing factor in phi.""" expElogtheta_sum = expElogthetad.sum(axis=0) phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100 return phinorm def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None): """ Given a chunk of sparse document vectors, update gamma (parameters controlling the topic weights) for each author corresponding to the documents in the chunk. The whole input chunk of document is assumed to fit in RAM; chunking of a large corpus must be done earlier in the pipeline. If `collect_sstats` is True, also collect sufficient statistics needed to update the model's topic-word distributions, and return a 2-tuple `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. `gamma_cunk` is of shape `len(chunk_authors) x self.num_topics`, where `chunk_authors` is the number of authors in the documents in the current chunk. Avoids computing the `phi` variational parameter directly using the optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. 
""" try: len(chunk) except TypeError: # convert iterators/generators to plain list, so we have len() etc. chunk = list(chunk) if len(chunk) > 1: logger.debug("performing inference on a chunk of %i documents", len(chunk)) # Initialize the variational distribution q(theta|gamma) for the chunk if collect_sstats: sstats = np.zeros_like(self.expElogbeta) else: sstats = None converged = 0 # Stack all the computed gammas into this output array. gamma_chunk = np.zeros((0, self.num_topics)) # Now, for each document d update gamma and phi w.r.t. all authors in those documents. for d, doc in enumerate(chunk): if chunk_doc_idx is not None: doc_no = chunk_doc_idx[d] else: doc_no = d # Get the IDs and counts of all the words in the current document. # TODO: this is duplication of code in LdaModel. Refactor. if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)): # make sure the term IDs are ints, otherwise np will get upset ids = [int(idx) for idx, _ in doc] else: ids = [idx for idx, _ in doc] cts = np.array([cnt for _, cnt in doc]) # Get all authors in current document, and convert the author names to integer IDs. authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] gammad = self.state.gamma[authors_d, :] # gamma of document d before update. tilde_gamma = gammad.copy() # gamma that will be updated. # Compute the expectation of the log of the Dirichlet parameters theta and beta. Elogthetad = dirichlet_expectation(tilde_gamma) expElogthetad = np.exp(Elogthetad) expElogbetad = self.expElogbeta[:, ids] # Compute the normalizing constant of phi for the current document. phinorm = self.compute_phinorm(expElogthetad, expElogbetad) # Iterate between gamma and phi until convergence for _ in xrange(self.iterations): lastgamma = tilde_gamma.copy() # Update gamma. 
# phi is computed implicitly below, for ai, a in enumerate(authors_d): tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T) # Update gamma. # Interpolation between document d's "local" gamma (tilde_gamma), # and "global" gamma (gammad). tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma # Update Elogtheta and Elogbeta, since gamma and lambda have been updated. Elogthetad = dirichlet_expectation(tilde_gamma) expElogthetad = np.exp(Elogthetad) # Update the normalizing constant in phi. phinorm = self.compute_phinorm(expElogthetad, expElogbetad) # Check for convergence. # Criterion is mean change in "local" gamma. meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma)) gamma_condition = meanchange_gamma < self.gamma_threshold if gamma_condition: converged += 1 break # End of iterations loop. # Store the updated gammas in the model state. self.state.gamma[authors_d, :] = tilde_gamma # Stack the new gammas into the output array. gamma_chunk = np.vstack([gamma_chunk, tilde_gamma]) if collect_sstats: # Contribution of document d to the expected sufficient # statistics for the M step. expElogtheta_sum_a = expElogthetad.sum(axis=0) sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm) if len(chunk) > 1: logger.debug( "%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations ) if collect_sstats: # This step finishes computing the sufficient statistics for the # M step, so that # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak} # = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}. sstats *= self.expElogbeta return gamma_chunk, sstats def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None): """ Perform inference on a chunk of documents, and accumulate the collected sufficient statistics in `state` (or `self.state` if None). """ # TODO: this method is somewhat similar to the one in LdaModel. 
Refactor if possible. if state is None: state = self.state gamma, sstats = self.inference( chunk, author2doc, doc2author, rhot, collect_sstats=True, chunk_doc_idx=chunk_doc_idx ) state.sstats += sstats state.numdocs += len(chunk) return gamma def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): """ Calculate and return per-word likelihood bound, using the `chunk` of documents as evaluation corpus. Also output the calculated statistics. incl. perplexity=2^(-bound), to log at INFO level. """ # TODO: This method is very similar to the one in LdaModel. Refactor. if total_docs is None: total_docs = len(chunk) corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) logger.info( "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words", perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words ) return perwordbound def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable (repeatable stream of documents), This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. 
Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see Table 1 in Hoffman et al.) If update is called with authors that already exist in the model, it will resume training on not only new documents for that author, but also the previously seen documents. This is necessary for those authors' topic distributions to converge. Every time `update(corpus, author2doc)` is called, the new documents are to appended to all the previously seen documents, and author2doc is combined with the previously seen authors. To resume training on all the data seen by the model, simply call `update()`. It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be new documents. Args: corpus (gensim corpus): The corpus with which the author-topic model should be updated. author2doc (dictionary): author to document mapping corresponding to indexes in input corpus. doc2author (dictionary): document to author mapping corresponding to indexes in input corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np array of not. np can in some settings turn the term IDs into floats, these will be converted back into integers in inference, which incurs a performance hit. For distributed computing it may be desirable to keep the chunks as np arrays. For other parameter settings, see :class:`AuthorTopicModel` constructor. """ # use parameters given in constructor, unless user explicitly overrode them if decay is None: decay = self.decay if offset is None: offset = self.offset if passes is None: passes = self.passes if update_every is None: update_every = self.update_every if eval_every is None: eval_every = self.eval_every if iterations is None: iterations = self.iterations if gamma_threshold is None: gamma_threshold = self.gamma_threshold # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"), # the process simply gets killed. 
author2doc = deepcopy(author2doc) doc2author = deepcopy(doc2author) # TODO: it is not possible to add new authors to an existing document (all input documents are treated # as completely new documents). Perhaps this functionality could be implemented. # If it's absolutely necessary, the user can delete the documents that have new authors, and call update # on them with the new and old authors. if corpus is None: # Just keep training on the already available data. # Assumes self.update() has been called before with input documents and corresponding authors. assert self.total_docs > 0, 'update() was called with no documents to train on.' train_corpus_idx = [d for d in xrange(self.total_docs)] num_input_authors = len(self.author2doc) else: if doc2author is None and author2doc is None: raise ValueError('at least one of author2doc/doc2author must be specified, to establish input space dimensionality') # If either doc2author or author2doc is missing, construct them from the other. if doc2author is None: doc2author = construct_doc2author(corpus, author2doc) elif author2doc is None: author2doc = construct_author2doc(doc2author) # Number of authors that need to be updated. num_input_authors = len(author2doc) try: len_input_corpus = len(corpus) except TypeError: logger.warning("input corpus stream has no len(); counting documents") len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: logger.warning("AuthorTopicModel.update() called with an empty corpus") return self.total_docs += len_input_corpus # Add new documents in corpus to self.corpus. self.extend_corpus(corpus) # Obtain a list of new authors. new_authors = [] # Sorting the author names makes the model more reproducible. for a in sorted(author2doc.keys()): if not self.author2doc.get(a): new_authors.append(a) num_new_authors = len(new_authors) # Add new authors do author2id/id2author dictionaries. 
for a_id, a_name in enumerate(new_authors): self.author2id[a_name] = a_id + self.num_authors self.id2author[a_id + self.num_authors] = a_name # Increment the number of total authors seen. self.num_authors += num_new_authors # Initialize the variational distributions q(theta|gamma) gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics)) self.state.gamma = np.vstack([self.state.gamma, gamma_new]) # Combine author2doc with self.author2doc. # First, increment the document IDs by the number of previously seen documents. for a, doc_ids in author2doc.items(): doc_ids = [d + self.total_docs - len_input_corpus for d in doc_ids] # For all authors in the input corpus, add the new documents. for a, doc_ids in author2doc.items(): if self.author2doc.get(a): # This is not a new author, append new documents. self.author2doc[a].extend(doc_ids) else: # This is a new author, create index. self.author2doc[a] = doc_ids # Add all new documents to self.doc2author. for d, a_list in doc2author.items(): self.doc2author[d] = a_list # Train on all documents of authors in input_corpus. train_corpus_idx = [] for _ in author2doc.keys(): # For all authors in input corpus. for doc_ids in self.author2doc.values(): # For all documents in total corpus. train_corpus_idx.extend(doc_ids) # Make the list of training documents unique. train_corpus_idx = list(set(train_corpus_idx)) # train_corpus_idx is only a list of indexes, so "len" is valid. 
lencorpus = len(train_corpus_idx) if chunksize is None: chunksize = min(lencorpus, self.chunksize) self.state.numdocs += lencorpus if update_every: updatetype = "online" updateafter = min(lencorpus, update_every * self.numworkers * chunksize) else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) updates_per_pass = max(1, lencorpus / updateafter) logger.info( "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once " "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold ) if updates_per_pass * passes < 10: logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy") # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, # while allowing it to "reset" on the first pass of each update def rho(): return pow(offset + pass_ + (self.num_updates / chunksize), -decay) for pass_ in xrange(passes): if self.dispatcher: logger.info('initializing %s workers', self.numworkers) self.dispatcher.reset(self.state) else: # gamma is not needed in "other", thus its shape is (0, 0). 
other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0)) dirty = False reallen = 0 for chunk_no, chunk_doc_idx in enumerate(utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)): chunk = [self.corpus[d] for d in chunk_doc_idx] reallen += len(chunk) # keep track of how many documents we've processed so far if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): # log_perplexity requires the indexes of the documents being evaluated, to know what authors # correspond to the documents. self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus) if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info( "PROGRESS: pass %i, dispatching documents up to #%i/%i", pass_, chunk_no * chunksize + len(chunk), lencorpus ) # this will eventually block until some jobs finish, because the queue has a small finite length self.dispatcher.putjob(chunk) else: logger.info( "PROGRESS: pass %i, at document #%i/%i", pass_, chunk_no * chunksize + len(chunk), lencorpus ) # do_estep requires the indexes of the documents being trained on, to know what authors # correspond to the documents. gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx) if self.optimize_alpha: self.update_alpha(gammat, rho()) dirty = True del chunk # perform an M step. 
determine when based on update_every, don't do this after every chunk if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other # frees up memory if self.dispatcher: logger.info('initializing workers') self.dispatcher.reset(self.state) else: other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0)) dirty = False # endfor single corpus iteration if reallen != lencorpus: raise RuntimeError("input corpus size changed during training (don't use generators as input)") if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None): """ Estimate the variational bound of documents from `corpus`: E_q[log p(corpus)] - E_q[log q(corpus)] There are basically two use cases of this method: 1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided, indicating the indexes of the documents in the training corpus. 2. `chunk` is a test set (held-out data), and author2doc and doc2author corrsponding to this test set are provided. There must not be any new authors passed to this method. `chunk_doc_idx` is not needed in this case. To obtain the per-word bound, compute: >>> corpus_words = sum(cnt for document in corpus for _, cnt in document) >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words """ # TODO: enable evaluation of documents with new authors. 
One could, for example, make it # possible to pass a list of documents to self.inference with no author dictionaries, # assuming all the documents correspond to one (unseen) author, learn the author's # gamma, and return gamma (without adding it to self.state.gamma). Of course, # collect_sstats should be set to false, so that the model is not updated w.r.t. these # new documents. _lambda = self.state.get_lambda() Elogbeta = dirichlet_expectation(_lambda) expElogbeta = np.exp(Elogbeta) gamma = self.state.gamma if author2doc is None and doc2author is None: # Evaluating on training documents (chunk of self.corpus). author2doc = self.author2doc doc2author = self.doc2author if not chunk_doc_idx: # If author2doc and doc2author are not provided, chunk is assumed to be a subset of # self.corpus, and chunk_doc_idx is thus required. raise ValueError('Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.') elif author2doc is not None and doc2author is not None: # Training on held-out documents (documents not seen during training). # All authors in dictionaries must still be seen during training. for a in author2doc.keys(): if not self.author2doc.get(a): raise ValueError('bound cannot be called with authors not seen during training.') if chunk_doc_idx: raise ValueError('Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.') else: raise ValueError('Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.') Elogtheta = dirichlet_expectation(gamma) expElogtheta = np.exp(Elogtheta) word_score = 0.0 theta_score = 0.0 for d, doc in enumerate(chunk): if chunk_doc_idx: doc_no = chunk_doc_idx[d] else: doc_no = d # Get all authors in current document, and convert the author names to integer IDs. authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] ids = np.array([id for id, _ in doc]) # Word IDs in doc. 
cts = np.array([cnt for _, cnt in doc]) # Word counts. if d % self.chunksize == 0: logger.debug("bound: at document #%i in chunk", d) # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which # is the same computation as in normalizing phi. phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids]) word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm)) # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures # that the likelihood is always roughly on the same scale. word_score *= subsample_ratio # E[log p(theta | alpha) - log q(theta | gamma)] for a in author2doc.keys(): a = self.author2id[a] theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :]) theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha)) theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :])) # theta_score is rescaled in a similar fashion. # TODO: treat this in a more general way, similar to how it is done with word_score. theta_score *= self.num_authors / len(author2doc) # E[log p(beta | eta) - log q (beta | lambda)] beta_score = 0.0 beta_score += np.sum((self.eta - _lambda) * Elogbeta) beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta)) sum_eta = np.sum(self.eta) beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1))) total_score = word_score + theta_score + beta_score return total_score def get_document_topics(self, word_id, minimum_probability=None): """ This method overwrites `LdaModel.get_document_topics` and simply raises an exception. `get_document_topics` is not valid for the author-topic model, use `get_author_topics` instead. """ raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. 
Use the "get_author_topics" method.') def get_author_topics(self, author_name, minimum_probability=None): """ Return topic distribution the given author, as a list of (topic_id, topic_probability) 2-tuples. Ignore topics with very low probability (below `minimum_probability`). Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`), is not supported. """ author_id = self.author2id[author_name] if minimum_probability is None: minimum_probability = self.minimum_probability minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :]) author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= minimum_probability] return author_topics def __getitem__(self, author_names, eps=None): """ Return topic distribution for input author as a list of (topic_id, topic_probabiity) 2-tuples. Ingores topics with probaility less than `eps`. Do not call this method directly, instead use `model[author_names]`. """ if isinstance(author_names, list): items = [] for a in author_names: items.append(self.get_author_topics(a, minimum_probability=eps)) else: items = self.get_author_topics(author_names, minimum_probability=eps) return items
# remove stopwords stopwords = nltk_stopwords().union(additional_stopwords) stopword_ids = map(dictionary.token2id.get, stopwords) # get ids for short words len(word)<=3 shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3] dictionary.filter_tokens(stopword_ids) dictionary.compactify() # get ids for short words len(word)<=3 shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3] dictionary.filter_tokens(shortword_ids) dictionary.compactify() # remove words that appear only once once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()if docfreq == 1] dictionary.filter_tokens(once_ids) dictionary.compactify() # filter extreme values dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None) dictionary.compactify() print('Building corpus...') corpus = [dictionary.doc2bow(doc) for doc in docs] return dictionary, corpus dictionary ,corpus = prep_corpus(docs['tokens']) MmCorpus.serialize('data/model/newsgroups.mm', corpus) dictionary.save('data/model/newsgroups.dict') lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10) lda.save('data/model/newsgroups_50.model')
def setup():
    """Build and store the dictionary, bag-of-words corpus and TF-IDF corpus.

    Reads every ``.txt`` file found directly inside the sub-folders of ``Data``
    (entries of ``Data`` whose name contains no dot), replaces every
    non-alphanumeric character with a space, lower-cases and tokenizes the text
    on whitespace, drops English stop words and tokens that occur only once
    overall, and then writes:

    * ``files/pmc-data.dict``      -- the filtered gensim dictionary
    * ``files/pmc-data.mm``        -- the bag-of-words corpus
    * ``files/pmc-data-tfidf.mm``  -- the TF-IDF weighted corpus

    The number of documents read is printed. The ``files`` directory is not
    created here.
    """
    import glob
    import os
    import re
    from collections import Counter

    from gensim import corpora
    from gensim.corpora import MmCorpus
    from gensim.models import TfidfModel
    from stop_words import get_stop_words

    # Entries of Data/ without a dot in their name are treated as document folders.
    directoryNames = list(set(glob.glob(os.path.join("Data", "*"))).difference(set(glob.glob(os.path.join("Data", "*.*")))))

    # Newlines are turned into spaces as well, so that the last word of a line
    # is not glued to the first word of the next one.
    non_alnum = re.compile('[^a-zA-Z0-9]')

    documents = []
    for folder in directoryNames:
        for dirPath, _, fileNames in os.walk(folder):
            for fileName in fileNames:
                if not fileName.endswith(".txt"):
                    continue
                with open(os.path.join(dirPath, fileName), 'r') as doc:
                    documents.append(non_alnum.sub(' ', doc.read()))
            break  # only the top level of each folder is read

    print(len(documents))

    # remove common words and tokenize
    # (each document is split into words first; iterating the raw string would
    # yield single characters instead of tokens)
    stop_words = set(get_stop_words('english'))
    texts = [[word for word in document.lower().split() if word not in stop_words]
             for document in documents]

    # remove words that appear only once
    frequency = Counter(token for text in texts for token in text)
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    dictionary.save('files/pmc-data.dict')  # store the dictionary, for future reference

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('files/pmc-data.mm', corpus)  # store to disk, for later use

    # Reload the serialized corpus and derive the TF-IDF weighted version from it.
    mm = MmCorpus('files/pmc-data.mm')
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('files/pmc-data-tfidf.mm', tfidf[mm], progress_cnt=10000)
class MyCorpus(object):
    """Stream 'mycorpus.txt' as bag-of-words vectors, one document per line."""

    def __iter__(self):
        # The file is opened per iteration and closed deterministically, also when
        # the consumer stops iterating early.
        with open('mycorpus.txt') as lines:
            for line in lines:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())


corpus = MyCorpus()

# create output folder
if not os.path.exists("out"):
    os.makedirs("out")

# Serialize the streamed corpus to out/<foldername>.mm and reload it from disk.
corpusPath = os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'mm']))
MmCorpus.serialize(corpusPath, corpus)
mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
def upload_file():
    """
    Upload csv files and create:
    * ~/out/corpus.dict
    * ~/out/corpus.lda
    * ~/out/corpus.lda.state
    * ~/out/corpus.mm
    * ~/out/corpus.mm.index
    * ~/out/corpus_doclabels.txt
    * ~/out/corpus_topics.txt
    * ~/mycorpus.txt

    As well as (for example):
    * ~/swcorp/Doyle_AStudyinScarlet.txt
    * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
    * etc.

    The uploaded stop list is read from the request field 'stoplist' (decoded as
    UTF-8), the tab-separated input files from the field 'files'. A document/topic
    heatmap is saved to ./static/corpus_heatmap.svg and the rendered
    'success.html' template is returned.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    # The upload is decoded before tokenizing; tokenizing the repr of a list of
    # bytes lines would add spurious tokens and mangle non-ASCII stop words.
    regex = re.compile(r'\w+')
    raw_stoplist = request.files['stoplist'].read()
    if isinstance(raw_stoplist, bytes):
        raw_stoplist = raw_stoplist.decode('utf-8')
    stopwords = regex.findall(raw_stoplist)
    stopwords.extend(("'", "'d", "'s"))  # temporary solution
    print(stopwords)
    stopword_set = set(stopwords)  # constant-time membership tests below

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval_every = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability
    # alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for upload in files:
        file_label = secure_filename(upload.filename).split('.')[0]

        df = pd.read_csv(upload, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        # collect only the specified parts-of-speech; tags that do not occur in
        # this file are skipped instead of raising KeyError
        selected = [df.get_group(p) for p in pos_tags if p in df.groups]
        doc = pd.concat(selected) if selected else pd.DataFrame(columns=columns)

        # construct documents
        if doc_split:  # size according to paragraph id
            doc = doc.groupby('ParagraphId')
            for para_id, para in doc:
                docs.append(para['Lemma'].values.astype(str))
                doc_labels.append(
                    ''.join([file_label, " #", str(para_id)]))
        else:  # size according to doc_size
            doc = doc.sort_values(by='TokenId')
            i = 1
            while doc_size < doc.shape[0]:
                docs.append(
                    doc[:doc_size]['Lemma'].values.astype(str))
                doc_labels.append(
                    ''.join([file_label, " #", str(i)]))
                doc = doc.drop(doc.index[:doc_size])
                i += 1
            docs.append(doc['Lemma'].values.astype(str))
            doc_labels.append(''.join([file_label, " #", str(i)]))

        if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
            os.makedirs(os.path.join(os.getcwd(), "swcorp"))

        swpath = os.path.join('swcorp', "".join(file_label))

        # NOTE(review): nesting reconstructed from a whitespace-collapsed source. At this
        # point `doc` holds only what is left after the chunking above (and is a groupby
        # object when doc_split is on) -- confirm that is what should be written.
        with open(swpath + ".txt", 'w', encoding="utf-8") as text:
            text.write(" ".join(
                word for word in doc['Lemma'].values.astype(str)
                if word not in stopword_set))

    print("\n normalizing and vectorizing ...\n")

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    # One line per file in swcorp/, whitespace-normalized, each followed by a newline.
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')
    with open(mastercorpus, 'w', encoding="utf-8") as data:
        for text_path in glob.glob("swcorp/*"):
            with open(text_path, 'r', encoding="utf-8") as text:
                data.write(' '.join(text.read().split()) + "\n")

    # MAIN PART
    with open(mastercorpus, encoding="utf-8") as lines:
        dictionary = corpora.Dictionary(line.lower().split() for line in lines)

    class MyCorpus(object):
        def __iter__(self):
            # read the file with the encoding it was written in, and close it afterwards
            with open(mastercorpus, encoding="utf-8") as lines:
                for line in lines:
                    # assume there's one document per line, tokens
                    # separated by whitespace
                    yield dictionary.doc2bow(line.lower().split())

    corpus = MyCorpus()

    out_dir = os.path.join(os.getcwd(), "out")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    mm_path = os.path.join(out_dir, 'corpus.mm')
    MmCorpus.serialize(mm_path, corpus)
    mm = MmCorpus(mm_path)

    print(mm)

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval_every, chunksize=chunk,
        alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for i, item in enumerate(topics):
        print("topic #"+str(i)+": "+str(item)+"\n")

    print("saving ...\n")

    with open(
        os.path.join(out_dir, "corpus_doclabels.txt"),
            "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
        os.path.join(out_dir, "corpus_topics.txt"),
            "w", encoding="utf-8") as f:
        for i, item in enumerate(topics):
            f.write(
                "".join(["topic #", str(i), ": ", str(item), "\n"]))

    dictionary.save(os.path.join(out_dir, 'corpus.dict'))
    model.save(os.path.join(out_dir, 'corpus.lda'))

    print("\n ta-daaaa ...\n")

    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))

    # NOTE(review): `corpus` yields one document per file in swcorp/ while doc_labels
    # has one entry per chunk; zip stops at the shorter of the two -- confirm intent.
    for doc, i in zip(corpus, range(no_of_docs)):
        # model[doc] is a list of tuples (topic_id, topic_prob)
        for topic_id, topic_prob in model[doc]:
            doc_topic[i][topic_id] = topic_prob

    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # the first element of each show_topic() entry is used as the label term
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))

    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20))    # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation=90)
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
print 'Building dictionary of terms ...' dictionary = corpora.Dictionary(texts) print '%d word types' % len(dictionary) print 'Filtering infrequent and frequent terms ...' dictionary.filter_extremes(no_below=5, no_above=0.5) print '%d word types, after filtering' % len(dictionary) print 'Saving dictionary (%s)...' % DICT dictionary.save(DICT) print 'Building bag-of-words corpus ...' bow_corpus = [ dictionary.doc2bow(t) for t in texts ] print 'Serializing corpus (%s) ...' % BOW MmCorpus.serialize(BOW, bow_corpus) size = len(bow_corpus) * 4 / 5 training = bow_corpus[:size] testing = bow_corpus[size:] print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)) lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations = 1000) print 'Saving LDA model (%s) ...' % NSFLDA lda.save(NSFLDA) print 'Random subset of topics:' print '\n'.join(lda.print_topics()) print 'Computing perplexity on %d held-out documents ...' % len(testing)
# NOTE(review): excerpt is cut at both ends. The first two statements (usage message, exit)
# presumably close an argument-count guard whose header precedes this excerpt -- confirm.
    print(globals()['__doc__'] % locals())
    sys.exit(1)
# input path and output prefix come from the first two command-line arguments
inp, outp = sys.argv[1:3]
# optional third argument: dictionary size
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
# processing options are switched by substrings of `program` (defined out of view)
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    # NOTE(review): keep_n uses DEFAULT_DICT_SIZE here, not the keep_words parsed above -- confirm intent.
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # second corpus over the same input, built with myWikiCorpus (defined out of view)
    mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    # NOTE(review): only wiki.dictionary is filtered; mywiki's dictionary is left untouched here.
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
    MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
# load back the id->word mapping directly from file
# Optional command-line overrides: argument 2 is the number of topics, argument 3 the
# conference name. `argc`, `year` and the defaults of `topicnum`/`conference` are set
# before this excerpt.
if(argc>2):
    topicnum=int(sys.argv[2])
if(argc>3):
    conference=sys.argv[3]
# all inputs and outputs live in the directory <conference><year>
relpath= conference+str(year)
rname= relpath+'/papers'
print conference,year, topicnum
# NOTE(review): nesting reconstructed from a whitespace-collapsed source. Training is placed
# inside the guard because `corpus` and `dictionary` are only bound there -- confirm.
if(not os.path.exists(rname+'.mm')):
    # one document per line of allpapers.txt, tokens separated by single spaces
    with open(relpath+'/allpapers.txt') as fp:
        d=fp.readlines()
    docs=[i.split(" ") for i in d]
    dictionary, corpus = prep_corpus(docs)

    # persist the corpus and the dictionary next to the input
    MmCorpus.serialize(rname+'.mm', corpus)
    dictionary.save(rname+'.dict')
    # train the model and print the time.clock() difference around the training call
    t0=time.clock()
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topicnum, passes=10)
    print time.clock()-t0
    lda.save(relpath+'/papers_%d.model'%(topicnum))