def train_tfidf(inpath=config.path_train_cut):
    train_df = pd.read_csv(inpath, sep="\t", header=None,
                           names=["id", "s1", "s2", "label"], encoding="utf-8")
    tfidf_txt = train_df["s1"].tolist() + train_df["s2"].tolist()
    texts = [tokenize(text) for text in tfidf_txt]

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    documents = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = Dictionary(documents)
    dictionary.save_as_text("./model/words.dic")
    # dictionary = Dictionary.load_from_text("./model/words.dic")

    class MyCorpus(object):
        def __iter__(self):
            for doc in documents:
                yield dictionary.doc2bow(doc)

    corpus = MyCorpus()
    MmCorpus.serialize("./model/corpus.mm", corpus)
    # corpus = MmCorpus("./model/corpus.mm")
    tfidf = TfidfModel(corpus)
    tfidf.save("./model/tf_idf.model")
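# A minimal sketch (not part of the original snippet) of how the artifacts written by
# train_tfidf() above could be loaded back to vectorize a new sentence. It assumes the
# same tokenize() helper is importable; the function name tfidf_vector is hypothetical.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

dictionary = Dictionary.load_from_text("./model/words.dic")
tfidf = TfidfModel.load("./model/tf_idf.model")

def tfidf_vector(sentence):
    # bag-of-words first, then the saved TF-IDF weighting
    return tfidf[dictionary.doc2bow(tokenize(sentence))]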
def load_20newsgroup(root=DATA_DIR, download=False):
    data_folder = os.path.join(root, "datasets/20newsgroup")
    processed_folder = os.path.join(data_folder, "processed")
    if download:
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)
        pull_from_url(data_folder, dataset="20newsgroup")
        docs = []
        groups = []
        subfolders = os.listdir(os.path.join(data_folder, NEWSGROUP_DECOMPRESS_FOLDER))
        for subf in subfolders:
            work_dir = os.path.join(data_folder, NEWSGROUP_DECOMPRESS_FOLDER, subf)
            filenames = os.listdir(work_dir)
            for file in filenames:
                with open(os.path.join(work_dir, file), "r") as f:
                    docs.append(word_tokenize(f.read().strip()))
                    groups.append(subf)
        docs = [[LEMMATIZER.lemmatize(token) for token in doc] for doc in docs]
        if not os.path.exists(processed_folder):
            os.mkdir(processed_folder)
        logging.info("Building dictionary and BOW corpus for the dataset...")
        corpus, dictionary = build_corpus(docs)
        MmCorpus.serialize(os.path.join(processed_folder, "corpus.mm"), corpus)
        dictionary.save(os.path.join(processed_folder, "dictionary"))
        return corpus, dictionary
    else:
        try:
            return MmCorpus(os.path.join(processed_folder, "corpus.mm")), \
                Dictionary.load(os.path.join(processed_folder, "dictionary"))
        except FileNotFoundError:
            logging.warning("The dataset does not exist, please set download to True!")
            return None, None
def __convertToCorpus(self, documents):
    """
    Steps to make the documents compatible with gensim.

    Changelog
    - 15/3 KS First commit
    :param documents:
    :return:
    """
    # Preprocessing the text
    dp = DataPreprocessing()
    text = dp.getBagOfWords(documentDF=documents, return_type='document_tokens')

    # Create a Gensim text dictionary based on the documents
    print("Creating a text dictionary")
    self.dictionary = Dictionary(line.lower().split() for line in documents)
    print(self.dictionary)
    print("Saving text dictionary to file")
    self.dictionary.save('../data.prune/producttext.dict')

    # Create a Gensim document corpus based on the dictionary and each document
    print("Creating a Gensim document corpus")
    self.corpus = [self.dictionary.doc2bow(line) for line in text]
    print("Saving corpus to file")
    MmCorpus.serialize('../data.prune/productcorpus.mm', self.corpus)
    self.corpus = MmCorpus('../data.prune/productcorpus.mm')
    print(self.corpus)
def create_topic_model(application_id: str) -> None:
    """Creates the topic model from all completely fetched accounts"""
    logging.info('Starting to create topic model for application id: %s' % application_id)
    with engine.begin() as connection:
        logging.info('Requesting complete accounts')
        accounts = list(models.account.select_multiple_complete(application_id, SOURCES['TWITTER'], connection))
        logging.info('Loading documents')
        documents = load_documents(accounts, connection)

        topic_model_path = get_topic_model_path(application_id)
        create_folder_if_not_exists(topic_model_path)

        logging.info('Creating dictionary')
        dictionary = create_dictionary(documents)
        dictionary.save(os.path.join(topic_model_path, 'dictionary'))

        logging.info('Creating corpus')
        MmCorpus.serialize(os.path.join(topic_model_path, 'corpus.mm'), MyCorpus(dictionary, documents))
        corpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))

        logging.info('Creating LDA Model')
        lda_model = create_lda_model(corpus, dictionary)
        lda_model.save(os.path.join(topic_model_path, 'ldamodel'))

        topics_words = get_topic_words(lda_model, NUM_TOPIC_WORDS)
        models.topic_model.insert_one(application_id, SOURCES['TWITTER'], topics_words, connection)
def create_LDA_dict():
    # ONE TIME USE, to create and save the LDA dictionary
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence('../Dataset/trigram_transformed_reviews_all.txt')

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)

    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')

    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator('../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

    lda_model_filepath = '../Models/lda_model_all'  # lda_model_all_30, lda_model_10topic
    # created LDA model with 10, 30, 50 topics; found 30 gave the best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=30,  # 10, 30, 50
                           id2word=trigram_dictionary,
                           workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm', tfidf[mm],
                       progress_cnt=10000)
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)
    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})
    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]
    # print(corpus)
    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)
    dictionary.save(dictionary_file)
def test_apply(self):
    transformed_vtcorp = self.transformer._apply(self.vtcorp)
    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
    text_data_name = os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_names[0])
    text_obj_name = os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_names[2])

    MmCorpus.serialize(text_data_name, transformed_vtcorp)
    transformed_vtcorp.save(text_obj_name)

    self.assertTrue(self.loader.has_text_corpora(self.transformation_label))
    self.temporary_files.extend([
        os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_name)
        for transformed_name in transformed_names])

    transformed_vtcorp = TransformedCorpus.load(text_obj_name)
    self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
    self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    print 'Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary)

    self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
def extend_corpus(self, corpus):
    """
    Add new documents in `corpus` to `self.corpus`. If serialization is used,
    then the entire corpus (`self.corpus`) is re-serialized and the new documents
    are added in the process. If serialization is not used, the corpus, as a list
    of documents, is simply extended.

    """
    if self.serialized:
        # Re-serialize the entire corpus while appending the new documents.
        if isinstance(corpus, MmCorpus):
            # Check that we are not attempting to overwrite the serialized corpus.
            assert self.corpus.input != corpus.input, \
                'Input corpus cannot have the same file path as the model corpus (serialization_path).'
        corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
        # Make a temporary copy of the file where the corpus is serialized.
        copyfile(self.serialization_path, self.serialization_path + '.tmp')
        self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
        # Re-serialize the old corpus, and extend it with the new corpus.
        MmCorpus.serialize(self.serialization_path, corpus_chain)
        self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
        remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
    else:
        # self.corpus and corpus are just lists, just extend the list.
        # First check that corpus is actually a list.
        assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
        self.corpus.extend(corpus)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None
    # load corpus from disk
    if ARGS.load_corpus:
        corpus = MmCorpus(ARGS.path_corpus)
    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')
        if ARGS.corpus_type == "TFIDF":
            tfidf_model = TfidfModel(corpus)
            tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
            corpus = tfidf_model[corpus]
            # serialize corpus to disk to prevent memory problems if corpus gets too large
            MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)
            corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')
    return corpus, tfidf_model
def save(self, dictionary_file="corpus.dict", corpus_file="corpus.mm", sup_file=None):
    if dictionary_file:
        Dictionary.save(self.dictionary, dictionary_file)
    if corpus_file:
        MmCorpus.serialize(corpus_file, self)
    if sup_file and type(self.docs) is PaperCorpus:
        self.docs.save(sup_file)
def lda(clean_docs, model_name, topics):
    # turn all data into a dictionary mapping of normalized words and their integer ids
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document into its bag-of-words representation, i.e. a list of
    # (token_id, token_count) tuples: count how often each word occurs in each doc
    # and save that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialized version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LDA model
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')

    topics = ldamodel.print_topics(num_words=5)
    for topic in topics:
        print(topic)
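# Hypothetical usage of the lda() helper above (not part of the original snippet):
# `toy_docs` stands in for whatever tokenized, cleaned documents the caller has prepared.
toy_docs = [["cat", "dog", "pet"], ["dog", "bone", "park"], ["cat", "milk", "pet"]]
lda(toy_docs, model_name="toy", topics=2)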
def _create_corpus(self, data_path, stopwords_path, corpus_path, data_ready_path, save=True):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    with open(stopwords_path, 'r', encoding='utf8') as f:
        stopwords = f.read().split()
    with open(data_path, 'r', encoding='utf8') as f:
        data = f.readlines()

    texts = []
    doc = []
    for row in map(lambda r: json.loads(r), data):
        tmp = [word for word in tokenize(row) if word not in stopwords]
        texts.append(tmp)
        doc.append(" ".join(tmp))

    # Create Dictionary
    without_stopwords = data_path + 'processed'
    with open(without_stopwords, 'w', encoding='utf8') as f:
        for raw in doc:
            f.write(raw)
            f.write('\n')
    corpus = gensim.corpora.textcorpus.TextCorpus(without_stopwords)

    if save:
        tmp_file = get_tmpfile(corpus_path)
        MmCorpus.serialize(tmp_file, corpus)
        with open(data_ready_path, 'wb') as f:
            pkl.dump(texts, f)
    return corpus, texts
def guidedLDA_Model(topics, cores=11):
    """
    `topics` is the desired number of LDA topics; `cores` should be the number of
    physical cores minus one. Both should be integers.
    """
    # load finished dictionary from disk
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize('./models2/trigram_bow_corpus.nm',
                       trigram_bow_generator('./models2/trigram_transformed_reviews.txt'))

    # load finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')

    # Pass the bag-of-words matrix and Dictionary from previous steps to LdaMulticore
    # as inputs, along with the number of topics the model should learn.
    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)
    lda.save('./models2/lda_model')
    # load the finished LDA model from disk
    # lda = LdaMulticore.load('./models/lda_model_neg')
    return trigram_bow_corpus, lda
def pylda_visualize(csv_chemin, ecriture_chemin, tfidf_visualization=False, num_topic=3, filter_by_cluster=None):
    '''
    Reads the clustering result from csv_chemin and writes the LDA visualisation
    as an html file into ecriture_chemin.

    csv_chemin points to a dataframe with two columns: one corresponding to the cluster,
    the other containing the text.
    num_topic is the number of topics we want to extract from the texts.
    filter_by_cluster is the cluster index, if we want to extract topics from one cluster only.
    '''
    # df = pd.read_csv('df_brown.csv')
    clustering_result_df = pd.read_csv(csv_chemin)
    if filter_by_cluster:
        clustering_result_df = clustering_result_df[clustering_result_df['pred_cluster'] == filter_by_cluster]
    text = clustering_result_df['text'].values
    # text = ' '.join(text)
    docs = pd.DataFrame(list(map(load_doc, enumerate(list(clustering_result_df['text'].apply(clean))))))
    docs.head()

    dictionary, corpus = prep_corpus(docs['tokens'])
    # dictionary: keys = word_id; values = word
    # corpus[i] = list of tuples (word_id, count) where count is the number of occurrences
    # of the word in the text corpus[i]

    if tfidf_visualization:
        # Instead of representing each text as tuples (word_idx, term_frequency),
        # represent it as (word_idx, word_tfidf_weight)
        model = TfidfModel(corpus)
        new_corpus = []
        for i in range(len(corpus)):
            element = corpus[i]
            tfidf_vector = model[element]
            new_element = []
            for j in range(len(element)):
                pair = element[j]
                # pair = (word_id, term_frequency); tfidf_vector[j] = (word_id, tfidf_weight)
                word_tfidf_weight = tfidf_vector[j][1]
                new_element.append((pair[0], word_tfidf_weight))
            new_corpus.append(new_element)
        MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
        dictionary.save(ecriture_chemin + '.dict')
        lda = models.ldamodel.LdaModel(corpus=new_corpus, id2word=dictionary, num_topics=num_topic, passes=10)
        lda.save(ecriture_chemin + '.model')
        vis_data = gensimvis.prepare(lda, new_corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
    else:
        MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
        dictionary.save(ecriture_chemin + '.dict')
        lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic, passes=10)
        lda.save(ecriture_chemin + '.model')
        vis_data = gensimvis.prepare(lda, corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
def createcorpus(bg_corpus, output_dictionary, output_serialize):
    # Generate a training/background corpus from your own source of documents,
    # saving the dictionary and the corpus in Matrix Market format
    print("Creating corpus and dictionary")
    background_corpus = TextCorpus(input=bg_corpus)
    background_corpus.dictionary.save(output_dictionary)
    MmCorpus.serialize(output_serialize, background_corpus)
    return background_corpus, background_corpus.dictionary
def _create_bow_representation(self):
    """Create bag-of-words representation of the collection, and save it
    in Matrix Market format to disk."""
    print('Create bag-of-words matrix representation.')
    self.bow_corpus = [self.dictionary.doc2bow(article) for article in self.articles]
    MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)
def _getCorpus():
    try:
        wc = MmCorpus(corpusPath)
    except FileNotFoundError:
        wc = WikiCorpus(getInputPath(), tokenizer_func=tokenize)
        wc.dictionary.save(dictPath)
        MmCorpus.serialize(corpusPath, wc)
    return wc
def model_all():
    dictionary, corpus = prep_corpus(df_text['text_tokens'])
    MmCorpus.serialize('wiki_articles.mm', corpus)
    dictionary.save('wiki_articles_new.dict')
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=15, passes=50)
    return lda, dictionary, corpus
def main():
    # a command line interface for running Gensim operations
    # can create a corpus from a directory of texts or from a wikipedia dump
    # options to lemmatize words, build a model and/or a pyLDAvis graph output
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    subparsers = parser.add_subparsers(dest='mode')

    text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc',
                                    help='Directory where tweet documents stored')
    text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc',
                                    help='Location and name to save corpus')
    text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma',
                                    help='Use this option to lemmatize words')

    wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
    wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc',
                                    help='Location of compressed Wikipedia dump')
    wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc',
                                    help='Location and name to save corpus')
    wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma',
                                    help='Use this option to lemmatize words')

    lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
    lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc',
                                  help='Location of corpus')
    lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc',
                                  help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', dest='num_topics',
                                  help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', dest='num_pass',
                                  help='Number of passes through corpus when training the LDA model')
    lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc',
                                  help='Location and name to save LDA model')

    lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
    lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc',
                                help='Location of corpus')
    lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc',
                                help='Location of dictionary')
    lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc',
                                help='Location of LDA model')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if args.mode == 'text':
        doc_corpus = DocCorpus(args.docs_loc, args.lemma)
        doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
        doc_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'wiki':
        if args.lemma:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=True, tokenizer_func=wiki_tokenizer,
                                     article_min_tokens=100, token_min_len=3, token_max_len=15)
        else:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=False, tokenizer_func=wiki_tokenizer,
                                     article_min_tokens=100, token_min_len=3, token_max_len=15)
        wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
        wiki_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'lda':
        build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)

    if args.mode == 'ldavis':
        build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
def generate_matrix_market(
        dictionary,
        save=False,
        file=os.path.join(config.map("Storage")['storage_dir'] + 'corpus.mm')):
    corpus = iter_docs(dictionary)
    if save:
        MmCorpus.serialize(file, corpus)
    return corpus
def load_experts():
    """
    load expert data and save to file
    """
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    """
    save expert-to-document map to pickle
    """
    pickle.dump(expert2doc, open('expert2doc_new_test.p', 'wb'))
def create_bow(trigram_reviews_filepath, trigram_bow_filepath, trigram_dictionary):
    """
    Generate bag-of-words representations for all reviews
    and save them as a matrix.
    """
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath, trigram_dictionary))
def _create_files(self):
    wikidata = WikiData(self.config.dataset_dir, self.tokenizer)
    doc_stream = (tokens for tokens in wikidata)

    dictionary_wiki = Dictionary(doc_stream)
    dictionary_wiki.filter_extremes(no_below=20, no_above=0.1)
    dictionary_wiki.save(self.wiki_dict_file)

    wiki_corpus = WikiCorpus(wikidata, dictionary_wiki)
    MmCorpus.serialize(self.mm_corpus_file, wiki_corpus)
def create_bow_corpus(self, persist=True):
    self.log('Making Bag-of-Words corpus...')
    self.bow_corpus = list(map(self.dict.doc2bow, self.preprocessed_article_texts))
    sleep(2)
    if persist:
        self.log('Saving Bag-of-Words corpus to disk for future use...')
        MmCorpus.serialize(self.bow_corpus_file, self.bow_corpus)
        sleep(2)
def _serialize_corpus_(fpath, dic, outfpath=fpathroot + fpathappend + '_serialized.mm', returncorp=True):
    """
    Create a serialized corpus.
    """
    MmCorpus.serialize(outfpath, _bow_generator_(fpath, dic))
    if returncorp:
        return MmCorpus(outfpath)
def set_corpus(self, language_processed_data: list, corpus_file_path: str):
    logging.info("---- Creating corpus from processed data")
    corpus = [
        self.dictionary.doc2bow(list_of_words_of_doc)
        for list_of_words_of_doc in language_processed_data
    ]
    MmCorpus.serialize(corpus_file_path, corpus)
    self.corpus = corpus
    logging.info("---- Corpus is created")
    return
def _create_tfidf_matrix(self):
    """Create the TF-IDF matrix and save it in Matrix Market format to disk."""
    print('Create TF-IDF matrix of collection.')
    tfidf = TfidfModel(self.bow_corpus, id2word=self.dictionary, normalize=True)
    MmCorpus.serialize(self.tfidf_filepath, tfidf[self.bow_corpus])
    print('Number of documents:', tfidf.num_docs)
def persist_corpus(self, corpus: Corpus, key: str = "corpus") -> Corpus:
    """Takes a transient corpus generator and persists it on disk.
    Only necessary when using a corpus more than once."""
    with Message(f"Storing {key} corpus"):
        f = get_tmpfile(f"irsel_{key}")
        # By serializing a corpus to disk, we can read it multiple times
        # (which is impossible with a generator) without having to load it
        # into RAM as a whole at any time.
        MmCorpus.serialize(f, corpus)
        corpus = MmCorpus(f)  # this instance can be consumed as often as we want
    printq(corpus)
    return corpus
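# A small standalone illustration (not part of the original snippet) of why persist_corpus()
# serializes: a generator can be consumed only once, while a serialized MmCorpus can be
# iterated repeatedly. The file name prefix is hypothetical.
from gensim.corpora import MmCorpus
from gensim.test.utils import get_tmpfile

docs = ([(0, 1.0)], [(1, 2.0)])
gen = (doc for doc in docs)            # transient: one pass only
path = get_tmpfile("persist_demo")
MmCorpus.serialize(path, gen)          # consume the generator once, write Matrix Market file
corpus = MmCorpus(path)
print(sum(1 for _ in corpus), sum(1 for _ in corpus))  # both passes see 2 documents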
def create_corpus(input_file, corpus_filepath, dictionary, run_or_load_flag):
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    if run_or_load_flag:
        MmCorpus.serialize(corpus_filepath,
                           trigram_bow_generator(input_file, dictionary))
        corpus = MmCorpus(corpus_filepath)
    else:
        corpus = MmCorpus(corpus_filepath)
    return corpus
def _train_lda(vectorizer, corpora_path, id2word_path, model_dir, model_fname=model_fname, num_topics=10):
    """Train and save a bow/tfidf-based LDA model.

    Train an LDA model from the corpus stored in {corpora_path} and the gensim
    dictionary stored in {id2word_path}, then save the model under {model_dir}.

    Args:
        vectorizer(str)    :- vectorization method, choices=["bow", "tfidf"]
        corpora_path(path) :- .txt file holding the corpus
        id2word_path(path) :- file holding the gensim dictionary
        model_dir(path)    :- folder in which to save the gensim LDA model
        model_fname(path)  :- model file name
        num_topics(int)    :- LDA hyperparameter, number of topics
    """
    try:
        assert vectorizer in ["bow", "tfidf"]
    except AssertionError:
        raise AssertionError("vectorizer must be bow or tfidf")
    if not os.path.isdir(model_dir):
        raise OSError(model_dir, "doesn't exist")

    corpora = []
    with open(corpora_path, 'r', encoding="utf8") as fp:
        lines = fp.readlines()
        for line in lines:
            corpora.append(line.strip())

    id2word = gensim.corpora.Dictionary.load(id2word_path)
    corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora]

    # for tfidf we also need to compute the idf weights
    if vectorizer == "tfidf":
        MmCorpus.serialize(corpus_tfidf_mm, corpus)
        corpus = MmCorpus(corpus_tfidf_mm)

    model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

    model_path = os.path.join(model_dir, vectorizer)
    make_dir(model_path)
    model_path = os.path.join(model_path, model_fname)
    if not os.path.isfile(model_path):
        model.save(model_path)
        print('model saved')
    else:
        print(f"{model_path} already exists")
    return model
def _create_files1(self):
    dir = "/media/rohola/data/dialog_systems/alexa_prize_topical_chat_dataset/reading_sets/"
    wikidata = TopicalDataset(dir, self.tokenizer)
    doc_stream = (tokens for tokens in wikidata)

    id2word_wiki = Dictionary(doc_stream)
    id2word_wiki.filter_extremes(no_below=20, no_above=0.2)
    id2word_wiki.save(self.wiki_dict_file)

    wiki_corpus = WikiCorpus(wikidata, id2word_wiki)
    MmCorpus.serialize(self.mm_corpus_file, wiki_corpus)
def main():
    dataset, version, nbfiles, pos_tags, tfidf, args = parse_args()
    corpus_type = "tfidf" if tfidf else "bow"

    logger = init_logging(name=f'MM_{dataset}_{corpus_type}', basic=False, to_stdout=True, to_file=True)
    logg = logger.info if logger else print
    log_args(logger, args)

    texts, stats, nbfiles = make_texts(dataset, nbfiles, pos_tags, logg=logg)
    gc.collect()

    file_name = f'{dataset}{nbfiles if nbfiles else ""}_{version}'
    directory = join(LDA_PATH, version)
    if not exists(directory):
        makedirs(directory)

    # --- saving texts ---
    file_path = join(directory, f'{file_name}_texts.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(texts, fp, ensure_ascii=False)

    # --- saving stats ---
    file_path = join(directory, f'{file_name}_stats.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(stats, fp)

    # generate and save the dataset as bow or tfidf corpus in Matrix Market format,
    # including dictionary, texts (json) and some stats about corpus size (json)
    corpus, dictionary = texts2corpus(texts, tfidf=tfidf, filter_below=5, filter_above=0.5, logg=logg)

    file_name += f'_{corpus_type}'
    directory = join(directory, corpus_type)

    # --- saving corpus ---
    file_path = join(directory, f'{file_name}.mm')
    logg(f'Saving {file_path}')
    MmCorpus.serialize(file_path, corpus)

    # --- saving dictionary ---
    file_path = join(directory, f'{file_name}.dict')
    logg(f'Saving {file_path}')
    dictionary.save(file_path)
def discover(self, textcolname, num_topics, passes):
    self.num_topics = num_topics
    self.passes = passes

    self._logger.info("cleansing data for '%s'", textcolname)
    self.__cleanze(textcolname)

    self._logger.info("creating corpus and dictionary for '%s'", textcolname)
    self.dictionary, self.corpus = self.__corpus()

    self._logger.info("applying lda model '%s'", textcolname)
    self.lda = self.__model()

    self._logger.info("saving models for '%s'", textcolname)
    MmCorpus.serialize(PConstant.CORPUS_DIR_PATH.value + textcolname + '_corpus.mm', self.corpus)
    self.dictionary.save(PConstant.DICTIONARY_DIR_PATH.value + textcolname + '_dictionary.dict')
    self.lda.save(PConstant.LDA_DIR_PATH.value + textcolname + '_lda.model')
def init_empty_corpus(self):
    """
    Initialize an empty corpus. If the corpora are to be treated as lists, simply
    initialize an empty list. If serialization is used, initialize an empty corpus
    of the class `gensim.corpora.MmCorpus`.

    """
    if self.serialized:
        # Initialize the corpus as a serialized empty list.
        # This corpus will be extended in self.update.
        MmCorpus.serialize(self.serialization_path, [])  # Serialize empty corpus.
        self.corpus = MmCorpus(self.serialization_path)  # Store serialized corpus object in self.corpus.
    else:
        # All input corpora are assumed to just be lists.
        self.corpus = []
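# A minimal standalone sketch (not from the original source) of the serialize-empty /
# re-serialize-and-extend pattern used by init_empty_corpus() and extend_corpus() above,
# written against plain MmCorpus calls. The file name "model_corpus.mm" is hypothetical.
from itertools import chain
from shutil import copyfile
from os import remove
from gensim.corpora import MmCorpus

path = "model_corpus.mm"
MmCorpus.serialize(path, [])                 # start with an empty serialized corpus
corpus = MmCorpus(path)

new_docs = [[(0, 1.0), (2, 2.0)], [(1, 1.0)]]  # new documents as bag-of-words vectors
copyfile(path, path + ".tmp")                # keep the old data readable while overwriting
corpus.input = path + ".tmp"                 # point the existing reader at the copy
MmCorpus.serialize(path, chain(corpus, new_docs))  # old documents followed by new ones
corpus = MmCorpus(path)
remove(path + ".tmp")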
def pretrain():
    """pre-train the text corpus and build the dictionary"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    mm = MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    print mm
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Read in the corpus from within the archive file
    fin = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(fin)

    # filter out some of the more common words,
    # and some of the less-common ones as well
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus
    fout = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(fout, rc)

    # Save the dictionary to file as text
    fout = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(fout)
def get_topics_lda(tokens, n_topics=10):
    """
    Uses the `gensim` package for LDA. LDA is a little better than LSA as it provides a
    reasonable mixture of topics (Wikipedia). `gensim` is a package for topic modeling only,
    so for a particular topic modeling task it is a lighter option to install and run. It can
    also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm

    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1,
                   chunksize=1000, passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TFIDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format; another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def main(args):
    logging.info('Initializing loaders with root %s, name %s' % (args.root, args.name))
    dloader = MultimodalDatasetLoader(args.root, args.name)

    icorp = dloader.load_image_corpus(args.img_label)

    transformer = NormalizationTransform()
    normalized_icorp = transformer._apply(icorp)

    corpus_names = dloader.layout.required_img_corpus_names(args.transformation_label)
    corpus_full_path = os.path.join(args.root, corpus_names[0])

    logging.info('Serializing to file %s' % corpus_full_path)
    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)
    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)
def _build_model(self, all_documents, remove_once=False):
    '''
    Builds the LSA model.

    Returns:
        dictionary, corpus, LSI model
    '''
    doc_hash = hash_obj(all_documents)
    corp_cache_path = CACHE_DIR + '/' + doc_hash + '_corp_' + str(int(remove_once))
    dic_cache_path = CACHE_DIR + '/' + doc_hash + '_dic_' + str(int(remove_once))
    lsi_cache_path = CACHE_DIR + '/' + doc_hash + '_lsi_' + str(int(remove_once))

    if os.path.exists(corp_cache_path) \
            and os.path.exists(dic_cache_path) \
            and os.path.exists(lsi_cache_path):
        lsi = models.LsiModel.load(lsi_cache_path)
        corp = MmCorpus(corp_cache_path)
        dic = Dictionary.load(dic_cache_path)
    else:
        texts = [self.tokenize(doc) for doc in all_documents]
        all_tokens = sum(texts, [])
        if remove_once:
            tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
            texts = [[word for word in text if word not in tokens_once] for text in texts]
        dic = Dictionary(texts)
        corp = [dic.doc2bow(text) for text in texts]
        MmCorpus.serialize(corp_cache_path, corp)
        dic.save(dic_cache_path)
        lsi = models.LsiModel(corp, id2word=dic, num_topics=20)
        lsi.save(lsi_cache_path)
    return dic, corp, lsi
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """
    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile('\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s"))  # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability
    alpha = np.array([
        0.02, 0.02,  # 0.02,
        0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,  # 0.04,
        0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))

        # construct documents
        if doc_split:  # size according to paragraph id
            doc = doc.groupby('ParagraphId')
            for para_id, para in doc:
                docs.append(para['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(para_id)]))
        else:  # size according to doc_size
            doc = doc.sort_values(by='TokenId')
            i = 1
            while (doc_size < doc.shape[0]):
                docs.append(doc[:doc_size]['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))
                doc = doc.drop(doc.index[:doc_size])
                i += 1
            docs.append(doc['Lemma'].values.astype(str))
            doc_labels.append(''.join([file_label, " #", str(i)]))

        if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
            os.makedirs(os.path.join(os.getcwd(), "swcorp"))

        swpath = os.path.join('swcorp', "".join(file_label))

        with open(swpath + ".txt", 'w', encoding="utf-8") as text:
            text.write(" ".join(
                word for word in doc['Lemma'].values.astype(str)
                if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #     [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for text in folder:
            with open(text, 'r', encoding="utf-8") as text:
                textline = [re.sub(r'\\n\\r', '', document)
                            for document in ' '.join(text.read().split())]
                if text != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)
    corpus = MyCorpus()
    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), 'out'), foldername)):
    #     os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)
    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)
    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    #     num_topics=no_of_topics, passes=no_of_passes,
    #     eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for item, i in zip(topics, enumerate(topics)):
        print("topic #" + str(i[0]) + ": " + str(item) + "\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), 'out'), foldername)):
    #     os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'), foldername))

    with open(
            os.path.join(os.path.join(os.getcwd(), "out"),
                         ''.join(["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
            os.path.join(os.path.join(os.getcwd(), "out"),
                         ''.join(["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write("".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus', 'dict'])))
    # MmCorpus.serialize(
    #     os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")

    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))

    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]

    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))

    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html
    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20))  # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0]) + 1.0, doc_labels)
    plt.xticks(np.arange(doc_topic.shape[1]) + 0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

### Generating a training/background corpus
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Provide a filename or a file-like object as input and TextCorpus will be automatically
# initialized with a dictionary in `self.dictionary` and will support the `iter` corpus method.
# For other kinds of corpora, you only need to override `get_texts` and provide your own implementation.
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or future operations will not be able
# to map results back to original words.
background_corpus.dictionary.save("my_dict.dict")

# Uses numpy to persist the corpus in Matrix Market format. File will be several GBs.
MmCorpus.serialize("background_corpus.mm", background_corpus)

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

# available from http://en.wikipedia.org/wiki/Wikipedia:Database_download
articles = "enwiki-latest-pages-articles.xml.bz2"

# This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
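# A possible continuation of the snippet above (not in the original): revive the saved
# dictionary as well, weight the persisted BOW corpus with TF-IDF, and persist the result.
# The output file name "wiki_tfidf.mm" is illustrative.
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

dictionary = Dictionary.load("wiki_dict.dict")   # revive the dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")          # revive the corpus
tfidf = TfidfModel(bow_corpus, id2word=dictionary, normalize=True)
MmCorpus.serialize("wiki_tfidf.mm", tfidf[bow_corpus])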
def make_corpus(path):
    wiki = WikiCorpus(path)
    MmCorpus.serialize('/mnt/ebs/wikidata/wiki_jp_vocab.mm', wiki)
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())


corpus = MyCorpus()

# create output folder
if not os.path.exists("out"):
    os.makedirs("out")

corpusPath = os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'mm']))
MmCorpus.serialize(corpusPath, corpus)
mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 2]
dictionary.filter_tokens(once_ids)
dictionary.save(os.environ['NM_HOME'] + '/Data/product_text.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
# corpus = TextCorpus(input=texts)

# Important -- save the dictionary generated by the corpus, or future operations will not be able
# to map results back to original words.
# corpus.dictionary.save("/Users/rsteckel/Workspace/NM/product_text.dict")
# dictionary = corpus.dictionary

MmCorpus.serialize(os.environ['NM_HOME'] + "/Data/product_corpus.mm", corpus)

documents.close()

# -------------LDA-------------
lda = LdaModel(corpus, num_topics=10, id2word=dictionary)
# lda.show_topics()
for i in np.arange(10):
    print lda.print_topic(i), '\n'

# --------------LSI----------------
tfidf = models.TfidfModel(corpus)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # get ids for short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus


dictionary, corpus = prep_corpus(docs['tokens'])

MmCorpus.serialize('data/model/newsgroups.mm', corpus)
dictionary.save('data/model/newsgroups.dict')

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10)
lda.save('data/model/newsgroups_50.model')
wiki = WikiCorpus(inp, lemmatize=lemmatize)

# only keep the most frequent words
wiki.dictionary.filter_extremes(no_below=min_threshold, no_above=max_threshold, keep_n=keep_words)

# Remove stop words (additional removal of common words used in spoken language)
stop_ids = []
with open(stop_words_file, 'r') as infile:
    for line in infile:
        try:
            stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
        except KeyError:
            continue
wiki.dictionary.filter_tokens(bad_ids=stop_ids)

# save dictionary and bag-of-words (term-document frequency matrix)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
del wiki

# initialize corpus reader and word->id mapping
mm = MmCorpus(outp + '_bow.mm')

# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '.tfidf_model')

# save tfidf vectors in matrix market format
argc = len(sys.argv)
if argc > 1:
    year = int(sys.argv[1])
if argc > 2:
    topicnum = int(sys.argv[2])

print year, topicnum

if not os.path.exists('%d/CVPRpapers%d.mm' % (year, year)):
    with open('%d/allpapers%d.txt' % (year, year)) as fp:
        d = fp.readlines()
    docs = [i.split(" ") for i in d]
    dictionary, corpus = prep_corpus(docs)
    MmCorpus.serialize('%d/CVPRpapers%d.mm' % (year, year), corpus)
    dictionary.save('%d/CVPRpapers%d.dict' % (year, year))

t0 = time.clock()
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topicnum, passes=10)
print time.clock() - t0

lda.save('%d/CVPRpapers%d_%d.model' % (year, year, topicnum))
def saveCorpus(corpus, corpusfile):
    from gensim.corpora.dictionary import Dictionary
    from gensim.corpora import MmCorpus
    MmCorpus.serialize(corpusfile, corpus)
if argc > 2:
    topicnum = int(sys.argv[2])
if argc > 3:
    conference = sys.argv[3]

relpath = conference + str(year)
rname = relpath + '/papers'

print conference, year, topicnum

if not os.path.exists(rname + '.mm'):
    with open(relpath + '/allpapers.txt') as fp:
        d = fp.readlines()
    docs = [i.split(" ") for i in d]
    dictionary, corpus = prep_corpus(docs)
    MmCorpus.serialize(rname + '.mm', corpus)
    dictionary.save(rname + '.dict')

t0 = time.clock()
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topicnum, passes=10)
print time.clock() - t0

lda.save(relpath + '/papers_%d.model' % (topicnum))
print 'Building dictionary of terms ...'
dictionary = corpora.Dictionary(texts)
print '%d word types' % len(dictionary)

print 'Filtering infrequent and frequent terms ...'
dictionary.filter_extremes(no_below=5, no_above=0.5)
print '%d word types, after filtering' % len(dictionary)

print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
processed = []
tokenizer = RegexpTokenizer(r'\w+')
stopWords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')

try:
    for userQueries in usersQueries:
        tokenized = tokenizer.tokenize(userQueries[0])
        cleaned = [token for token in tokenized if token not in stopWords]
        stemmed = [stemmer.stem(i) for i in cleaned]
        processed.append(stemmed)
finally:
    srcFile.close()

dictionary = Dictionary(processed)
corpus = [dictionary.doc2bow(userQueries) for userQueries in processed]
MmCorpus.serialize('data/corpus_all.mat', corpus)
# corpus = MmCorpus('data/corpus_10k')


def trainLda(corpus, dictionary):
    ldaModel = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=40)
    ldaModel.save('data/model/ldamodel_all.mod')
    ldaUsers = ldaModel[corpus]
    ldaUsers.save('data/ldausers_all.mat')


def trainHdp(corpus, dictionary):
    hdpModel = hdpmodel.HdpModel(corpus, id2word=dictionary)
    hdpModel.save('data/model/hdpmodel_all.mod')
    hdpUsers = hdpModel[corpus]
    hdpUsers.save('data/hdpusers_all.mat')


trainLda(corpus, dictionary)
    print(globals()['__doc__'] % locals())
    sys.exit(1)

inp, outp = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
def setup():
    documents = []
    import glob
    import os
    directoryNames = list(set(glob.glob(os.path.join("Data", "*"))).difference(
        set(glob.glob(os.path.join("Data", "*.*")))))

    numberOfDocuments = 0
    for folder in directoryNames:
        for fileNameDir in os.walk(folder):
            for fileName in fileNameDir[2]:
                if fileName[-4:] != ".txt":
                    continue
                nameFileDocument = "{0}{1}{2}".format(fileNameDir[0], os.sep, fileName)
                with open(nameFileDocument, 'r') as doc:
                    doc_text = doc.read().replace('\n', '')
                import re
                processed_doc_text = re.sub('[^a-zA-Z0-9\n]', ' ', doc_text)
                documents.append(processed_doc_text)
                numberOfDocuments += 1
            break
    print(numberOfDocuments)

    # remove common words and tokenize
    # from gensim.utils import lemmatize
    # lemmatized_docs = [lemmatize(document) for document in documents]
    from stop_words import get_stop_words
    stop_words = get_stop_words('english')
    texts = [[word for word in document if word not in stop_words]
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    from gensim import corpora
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    dictionary.save('files/pmc-data.dict')  # store the dictionary, for future reference

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('files/pmc-data.mm', corpus)  # store to disk, for later use

    from gensim.corpora import MmCorpus
    mm = MmCorpus('files/pmc-data.mm')

    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('files/pmc-data-tfidf.mm', tfidf[mm], progress_cnt=10000)