# Imports assumed from gensim (not shown in the original snippet).
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile


def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    """Build a sharded Similarity index over `corpus` and persist it to `file_name`."""
    index_temp = get_tmpfile("index")  # shard files are written under a temporary prefix
    indexer = Similarity(output_prefix=index_temp, corpus=corpus,
                         num_features=len(dct), num_best=6)
    indexer.save(file_name)
    return indexer
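# A minimal usage sketch (assumed setup, not from the source): build a toy
# dictionary and corpus, train the indexer, and query it. With num_best=6 the
# index returns at most the six best (doc_id, cosine_similarity) pairs.
from gensim import corpora

texts = [['human', 'computer', 'interaction'], ['graph', 'minors', 'survey']]
toy_dct = corpora.Dictionary(texts)
toy_corpus = [toy_dct.doc2bow(text) for text in texts]
indexer = train_and_save_indexer(toy_corpus, toy_dct)
print(indexer[toy_dct.doc2bow(['graph', 'survey'])])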
class LSM:
    def __init__(self, model_name, index):
        self.model_name = model_name
        if self.model_name == 'LSI':
            self.model_file = lsi_model_file
            self.corpora_file = lsi_corpora_file
            self.similarity_file = lsi_sim_file
            self.num_topics = LSI_TOPICS
        elif self.model_name == 'LDA':
            self.model_file = lda_model_file
            self.corpora_file = lda_corpora_file
            self.similarity_file = lda_sim_file
            self.num_topics = LDA_TOPICS
        # Build the bag-of-words corpus and dictionary once, then reuse the cached copies.
        if not os.path.isfile(mm_corpus_file) or not os.path.isfile(dict_file):
            self.corpus = CorpusConnector(index)
            corpora.MmCorpus.serialize(mm_corpus_file, self.corpus)
            self.corpus.save_dict()
            self.dictionary = self.corpus.dictionary
        else:
            self.dictionary = corpora.Dictionary.load(dict_file)
            self.corpus = corpora.MmCorpus(mm_corpus_file)
        self.model = None
        self.corpora = None
        self.similarity_index = None

    def create_model(self):
        if not os.path.isfile(self.model_file):
            # Train from scratch, then cache both the model and the transformed corpus.
            if self.model_name == 'LSI':
                self.model = lsimodel.LsiModel(corpus=self.corpus,
                                               id2word=self.dictionary,
                                               num_topics=self.num_topics)
            else:
                self.model = ldamodel.LdaModel(corpus=self.corpus,
                                               num_topics=self.num_topics,
                                               id2word=self.dictionary)
            self.model.save(self.model_file)
            self.corpora = self.model[self.corpus]
            corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
        else:
            self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
            if self.model_name == 'LSI':
                self.model = gensim.models.LsiModel.load(self.model_file)
            else:
                self.model = gensim.models.LdaModel.load(self.model_file)

    def create_similarity_index(self):
        if not os.path.isfile(self.similarity_file):
            self.similarity_index = Similarity('./LSM/', self.corpora, self.num_topics)
            self.similarity_index.save(self.similarity_file)
        else:
            self.similarity_index = Similarity.load(self.similarity_file)
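# A hedged usage sketch (assumed caller code; module-level names such as
# lsi_model_file and LSI_TOPICS are defined elsewhere in the source):
lsm = LSM('LSI', index)        # 'index' is whatever the caller's CorpusConnector expects
lsm.create_model()             # trains the LSI model or loads the cached one
lsm.create_similarity_index()  # builds or loads the Similarity shards
query = lsm.model[lsm.dictionary.doc2bow(['example', 'tokens'])]
print(lsm.similarity_index[query])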
def main():
    orig_qns = [qn.strip() for qn in open('data/questions.txt')]
    aug = [qn.strip() for qn in open('data/augmented.txt')]
    all_qns = []
    for idx, qn in tqdm(enumerate(orig_qns)):
        all_qns.append(qn)
        if aug[idx] != qn:  # keep the augmented variant only when it actually differs
            all_qns.append(aug[idx])
    print("Combined original questions and augmented questions")
    pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))

    qns = pickle.load(open("precompute/questions.pkl", 'rb'))
    documents = []
    for qn in tqdm(qns):
        document = get_similar.preprocess_text(qn)
        if len(document) < 1:  # guard against questions that tokenize to nothing
            document = ['UNK']
        documents.append(document)
    print(f"Finished preprocessing {len(documents)} questions")
    pickle.dump(documents, open("precompute/documents.pkl", "wb"))
    print("Saved tokens to documents.pkl")

    documents = pickle.load(open("precompute/documents.pkl", "rb"))
    dct = corpora.Dictionary(documents)
    pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
    dct.save('precompute/dct.dict')
    dct = corpora.Dictionary.load('precompute/dct.dict')
    corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
    pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
    print("Corpus generated")

    tfidf = models.TfidfModel(corpus, smartirs='bfn')
    pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
    corpus_tfidf = tfidf[corpus]
    pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
    print("tfidf generated")

    index_temp = get_tmpfile("index")
    index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
    index.save("precompute/similarities.pkl")
    print("Similarity index saved")

    PIPE = subprocess.PIPE
    # NLU = subprocess.Popen(['rasa', 'train', '--data', ' nlu-train-data',
    #                         '--fixed-model-name', 'model', '-vv', 'nlu'],
    #                        stdout=PIPE, stderr=PIPE)
    NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data',
                            '--config', 'config.yml', '--fixed-model-name', 'model'])
    NLU.wait()
    print("Rasa NLU trained")
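# Hedged sketch of how the saved artifacts could be queried later (an
# assumption, not part of the source; the question string is illustrative):
dct = corpora.Dictionary.load('precompute/dct.dict')
tfidf = pickle.load(open("precompute/tfidf_model.pkl", 'rb'))
index = Similarity.load("precompute/similarities.pkl")
query = tfidf[dct.doc2bow(get_similar.preprocess_text("How do I reset my password?"))]
print(index[query][:5])  # top matches as (question_id, score) pairs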
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus], len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
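# Assumed follow-up (not part of the source): query the master index and map
# hits back to the paragraph IDs written above. Paths mirror those in main().
dct = Dictionary.load('../data/retriever/dct.pkl')
model = TfidfModel.load('../data/retriever/tfidf.pkl')
index = Similarity.load('../data/retriever/indexes/master-index')
index.num_best = 5  # keep only the five closest paragraphs per query
with open('../data/retriever/paragraph-ids.txt') as f:
    paragraph_ids = [line.strip() for line in f]
hits = index[model[dct.doc2bow('example query tokens'.split())]]
print([(paragraph_ids[doc_id], score) for doc_id, score in hits])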
def create_sim_matrix(tfidf, corpus, dictionary, outputDir):
    """
    Creates a Gensim similarity matrix for document similarity comparison and saves it.

    tfidf (Gensim tfidf model): Gensim tfidf model
    corpus (Gensim corpus object): Gensim corpus
    dictionary (Gensim dictionary object): Gensim dictionary
    outputDir (string): Location to save matrix
    """
    indicesFile = outputDir + 'indices'  # prefix for the on-disk shard files
    simFile = outputDir + 'Index'
    sims = Similarity(indicesFile, tfidf[corpus], num_features=len(dictionary))
    sims.close_shard()  # flush the last in-progress shard to disk
    sims.save(simFile)
    print('Similarity matrix created and stored at: ' + simFile)
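# A small reload sketch (assumption, not from the source). Similarity.save()
# records the shard prefix, so the 'indices*' shard files written above must
# still exist at the same path when the index is loaded and queried.
sims = Similarity.load(outputDir + 'Index')
query = tfidf[dictionary.doc2bow(['example', 'tokens'])]
print(sims[query])  # cosine similarity against every indexed document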
def fit_model(data, n_topics, iterations, passes, min_prob, eval_every,
              n_best, min_df, max_df, preserved_words):
    dt = cur_date()
    output_folder = "lda_%stopics_%s" % (n_topics, dt)
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs("%s/separate" % output_folder, exist_ok=True)

    logging.info("creating corpus...")
    dictionary, corpus = make_corpus(list(data.values()), min_df, max_df,
                                     preserved_words, output_folder)

    # generate LDA model
    logging.info("training model...")
    lda = LdaModel(corpus, num_topics=n_topics, id2word=dictionary,
                   iterations=iterations, passes=passes,
                   minimum_probability=min_prob, eval_every=eval_every)
    logging.info("saving model...")
    lda.save('saved/lda_%s_%s.serialized' % (n_topics, dt))
    # print(lda.print_topics(num_topics=n_topics, num_words=4))

    # save all-vs-all pairwise similarities
    logging.info("creating index...")
    index = Similarity('./sim_index', lda[corpus], num_features=n_topics,
                       num_best=n_best + 1)  # +1 because each document also matches itself
    paths = list(data.keys())

    logging.info("write all similarities to result file")
    with open('%s/similarities.txt' % output_folder, 'w') as res_file:
        with open('%s/similarities_summary.txt' % output_folder, 'w',
                  encoding='utf-8') as res_file_sum:
            for i, similarities in enumerate(index):
                cur_fname = get_filename(paths[i])
                top_similar = [(paths[s[0]], s[1]) for s in similarities if s[0] != i]
                res_file.write('%s: %s\n' % (cur_fname,
                                             [(get_filename(p), c) for (p, c) in top_similar]))
                res_file_sum.write('%s: %s\n' % (cur_fname, get_title(paths[i])))
                for sim in top_similar:
                    res_file_sum.write('%s: %s\n' % (get_filename(sim[0]), get_title(sim[0])))
                res_file_sum.write('-' * 100 + '\n')
                # for each doc we make a separate file which contains the list of similar docs
                with open('%s/separate/%s.txt' % (output_folder, cur_fname.split('.')[0]),
                          'w') as sep_res:
                    for sim in top_similar:
                        sep_res.write('%s\n' % get_filename(sim[0]))

    logging.info("save index")
    index.save('saved/lda_index_%s.index' % dt)

    # save topic - words matrix
    with open("%s/topic_words.txt" % output_folder, 'w', encoding='utf-8') as f:
        for topic_words in lda.print_topics(lda.num_topics):
            f.write("#%s: %s\n" % (topic_words[0], topic_words[1]))

    # save document - topics matrix
    with open("%s/document_topics.txt" % output_folder, 'w') as f:
        for i, topics in enumerate(lda[corpus]):
            f.write("#%s: %s\n" % (get_filename(paths[i]), topics))

    # save dictionary
    dictionary.save_as_text("%s/dictionary.txt" % output_folder)
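# Hypothetical invocation (all argument values are assumptions, not from the
# source); `data` maps document paths to preprocessed token lists.
fit_model(data, n_topics=50, iterations=400, passes=10, min_prob=0.01,
          eval_every=None, n_best=10, min_df=5, max_df=0.5, preserved_words=set())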
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')

# initialize corpus reader and word->id mapping
mm = MmCorpus(output_prefix + '_bow.mm')

# build tfidf, ~50min
logger.info(">>> Building TF-IDF model ...")
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(output_prefix + '.tfidf_model')

"""
# save tfidf vectors in matrix market format
# ~4h; result file is 15GB! bzip2'ed down to 4.5GB
logger.info(">>> Serializing TF-IDF vectors ...")
MmCorpus.serialize(output_prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
del tfidf
tfidf_corpus = MmCorpus(output_prefix + '_tfidf.mm')
"""

logger.info(">>> Generating similarity index ...")
# similarity_index = Similarity(output_prefix + "_similarity_index", tfidf_corpus, len(dictionary))
# NOTE: use_reverse_index is not an argument of stock gensim's Similarity;
# this call appears to rely on a patched gensim build.
similarity_index = Similarity(output_prefix + "_similarity_index", tfidf[mm],
                              len(dictionary), use_reverse_index=True)
del tfidf

logger.info(">>> Serializing similarity index ...")
similarity_index.save(output_prefix + '_similarity.index')

logger.info(">>> Finished running %s" % program)
import logging

import gensim
from gensim.similarities import Similarity, MatrixSimilarity
# from pgfin_timing import Timer
from pgfin_helpers import tokenize

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# load the corpora
print("\n Loading corpora.\n")
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print("\n Start similarity index.\n")
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print(index)

index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print(index_dense)
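# Hedged illustration (assumed, not from the source): both indexes answer the
# same query. The sharded Similarity streams shards from disk and so scales
# past RAM; MatrixSimilarity holds one dense matrix in memory, which is faster
# but only suits corpora that fit in RAM.
query = next(iter(lsi_corpus))  # reuse the first LSI document as a probe
print(index[query][:5])         # scores from the disk-backed index
print(index_dense[query][:5])   # scores from the in-memory index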
def initialise_query_structures(corpus, dictionary):
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile, corpus=corpus, num_features=len(dictionary))
    index.save('../NLP_DocumentSimilarity/corpus/patent_query_index.index')
    return index
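# A minimal query sketch (assumed caller code; the token list is illustrative):
index = initialise_query_structures(corpus, dictionary)
scores = index[dictionary.doc2bow(['wireless', 'antenna'])]
print(scores.argsort()[::-1][:10])  # ids of the ten most similar patent documents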
# tfidf = TfidfModel(BOW_corpus)
# tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix', tfidf[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity', logent_corpus,
                   num_features=num_feat)
index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
bz2_wiki = bz2.BZ2File(wiki_file, "r")
extract = corpora.wikicorpus.extract_pages(bz2_wiki)
i = 0
matches = open('../data/title_matches.txt', 'a')
for title, doc, z in extract:
    wiki_filt = corpora.wikicorpus.filter_wiki(doc)
    doc_token = corpora.wikicorpus.tokenize(wiki_filt)
    bowbow = diction.doc2bow(doc_token)
    if bowbow == BOW_corpus[i]:
        i += 1
        print(unidecode(title), file=matches)
        if i % 100000 == 0: