def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp, corpus=corpus,
                         num_features=len(dct), num_best=6)
    indexer.save(file_name)
    return indexer
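# Hedged usage sketch (added; not part of the original sources). A minimal,
# self-contained version of the pattern above: build a Dictionary and bag-of-words
# corpus, create a shard-backed Similarity index with num_best set, save it, reload
# it, and query it with a new document. All names and file names here are illustrative.
from gensim.corpora import Dictionary
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

docs = [["machine", "learning", "with", "gensim"],
        ["topic", "modelling", "and", "similarity"],
        ["document", "similarity", "queries"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

indexer = Similarity(get_tmpfile("demo_index"), corpus, num_features=len(dct), num_best=2)
indexer.save("demo_indexer.model")

indexer = Similarity.load("demo_indexer.model")
query = dct.doc2bow(["similarity", "queries"])
print(indexer[query])  # [(doc_id, cosine_score), ...] for the 2 best matches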
class LSM:
    def __init__(self, model_name, index):
        self.model_name = model_name
        if self.model_name == 'LSI':
            self.model_file = lsi_model_file
            self.corpora_file = lsi_corpora_file
            self.similarity_file = lsi_sim_file
            self.num_topics = LSI_TOPICS
        elif self.model_name == 'LDA':
            self.model_file = lda_model_file
            self.corpora_file = lda_corpora_file
            self.similarity_file = lda_sim_file
            self.num_topics = LDA_TOPICS
        if not os.path.isfile(mm_corpus_file) or not os.path.isfile(dict_file):
            self.corpus = CorpusConnector(index)
            corpora.MmCorpus.serialize(mm_corpus_file, self.corpus)
            self.corpus.save_dict()
            self.dictionary = self.corpus.dictionary
        else:
            self.dictionary = corpora.Dictionary.load(dict_file)
            self.corpus = corpora.MmCorpus(mm_corpus_file)
        self.model = None
        self.corpora = None
        self.similarity_index = None

    def create_model(self):
        if not os.path.isfile(self.model_file):
            if self.model_name == 'LSI':
                self.model = lsimodel.LsiModel(corpus=self.corpus,
                                               id2word=self.dictionary,
                                               num_topics=self.num_topics)
            else:
                self.model = ldamodel.LdaModel(corpus=self.corpus,
                                               num_topics=self.num_topics,
                                               id2word=self.dictionary)
            self.model.save(self.model_file)
            self.corpora = self.model[self.corpus]
            corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
        else:
            self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
            if self.model_name == 'LSI':
                self.model = gensim.models.LsiModel.load(self.model_file)
            else:
                self.model = gensim.models.LdaModel.load(self.model_file)

    def create_similarity_index(self):
        if not os.path.isfile(self.similarity_file):
            self.similarity_index = Similarity('./LSM/', self.corpora, self.num_topics)
            self.similarity_index.save(self.similarity_file)
        else:
            self.similarity_index = Similarity.load(self.similarity_file)
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def vars():
    mydct = load('mydct.joblib')
    noval_corp = load('noval_corp.joblib')
    noval_ind = get_tmpfile('index')
    noval_index = Similarity(noval_ind, noval_corp, len(mydct))
    val_corp = load('val_corp.joblib')
    val_ind = get_tmpfile('index')
    val_index = Similarity(val_ind, val_corp, len(mydct))
    pca8 = load('pca8.joblib')
    nlp = yelp_tool.spacy.load('en_core_web_md', disable=['tagger', 'ner'])
    read = yelp_tool.Readability()
    nlp.add_pipe(read, last=True)
    return mydct, noval_index, val_index, pca8, nlp
def __init__(self, initializer):
    preprocessed_documents = initializer.getPreprocessedDocuments()
    dictionary = initializer.getDictionary()
    corpus = [dictionary.doc2bow(text) for text in preprocessed_documents]
    tf_idf = initializer.getTfIdf()
    query_doc_tf_idf = tf_idf[dictionary.doc2bow(preprocessed_documents[0])]
    similarity_object = Similarity('tfidf', tf_idf[corpus],
                                   num_features=len(dictionary))
    similarities = similarity_object[query_doc_tf_idf]
    similarity_object.destroy()
    self.scores = similarities[1:len(similarities)]
def getSimilarity(df_content_o):
    logging.debug('preparing docSim')
    raw_documents = list(df_content_o['content'])
    corpora_documents = []
    for item_text in raw_documents:
        item_str = item_text.split(' ')
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    nf = len(set(itertools.chain.from_iterable(corpora_documents))) + 1
    similarity = Similarity('-Similarity-index', corpus, num_features=nf)
    similarity.num_best = max_similar_num
    return similarity, dictionary
def main():
    orig_qns = [qn.strip() for qn in open('data/questions.txt')]
    aug = [qn.strip() for qn in open('data/augmented.txt')]
    all_qns = []
    for idx, qn in tqdm(enumerate(orig_qns)):
        all_qns.append(qn)
        if aug[idx] != qn:
            all_qns.append(aug[idx])
    print("Combined original questions and augmented questions")
    pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))

    qns = pickle.load(open("precompute/questions.pkl", 'rb'))
    documents = []
    for qn in tqdm(qns):
        document = get_similar.preprocess_text(qn)
        if len(document) < 1:
            document = ['UNK']
        documents.append(document)
    print(f"Finished preprocessing {len(documents)} questions")
    pickle.dump(documents, open("precompute/documents.pkl", "wb"))
    print("Saved tokens to documents.pkl")

    documents = pickle.load(open("precompute/documents.pkl", "rb"))
    dct = corpora.Dictionary(documents)
    pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
    dct.save('precompute/dct.dict')
    dct = corpora.Dictionary.load('precompute/dct.dict')

    corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
    pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
    print("Corpus generated")

    tfidf = models.TfidfModel(corpus, smartirs='bfn')
    pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
    corpus_tfidf = tfidf[corpus]
    pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
    print("tfidf generated")

    index_temp = get_tmpfile("index")
    index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
    index.save("precompute/similarities.pkl")
    print("Similarity index saved")

    PIPE = subprocess.PIPE
    # NLU = subprocess.Popen(['rasa', 'train', '--data', ' nlu-train-data',
    #                         '--fixed-model-name', 'model', '-vv', 'nlu'],
    #                        stdout=PIPE, stderr=PIPE)
    NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data',
                            '--config', 'config.yml', '--fixed-model-name', 'model'])
    NLU.wait()
    print("Rasa NLU trained")
def initiate_recommender():
    # Retrieve all the necessary files for the recommender system
    baseDir = settings.BASE_DIR

    # Load dictionary and corpus
    dictFile = baseDir + "/static/data/DBLP_Dictionary.dict"
    corpusFile = baseDir + "/static/data/DBLP_Corpus.mm"
    dictionary = corpora.Dictionary.load(dictFile)
    corpus = corpora.MmCorpus(corpusFile)

    # Load the TF-IDF model
    tfidfFile = baseDir + "/static/data/TF-IDF"
    tfidf = models.TfidfModel().load(tfidfFile)

    # Load the Gensim similarity index
    indexFile = baseDir + "/static/data/Index"
    sims = Similarity.load(indexFile)
    # If the matrix fits in memory, use this instead and comment out the previous two lines:
    # sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

    # Point to the text csv file
    textFile = baseDir + "/static/data/Text.csv"

    # Load ID dataframe from recommender
    paperIDs = baseDir + "/static/data/AbsID.csv"
    cols = ["paperID"]
    dfIDs = pd.read_csv(paperIDs, names=cols, header=None)

    return dictionary, corpus, tfidf, sims, textFile, dfIDs
def build_index(
    self, premises: Iterable[Sentence]
) -> Tuple[Similarity, Callable[[TokenList], Vector], Iterable[Sentence]]:
    """Builds an index from given premises that can be used to answer similarity queries."""
    if Irsel.index_cache:
        # if an index has already been built for these TF-IDF parameters, reuse it
        cached_smart, cached_dimensions, cached_index, cached_query_transformer, cached_premises = Irsel.index_cache
        if cached_smart == self.smart and cached_dimensions == self.dimensions and cached_premises is premises:
            printq("Hitting index cache.")
            return cached_index, cached_query_transformer, cached_premises
        else:
            printq("Skipping index cache.")
    dictionary, corpus = self.build_corpus(premises)  # create a term-document matrix
    corpus, query_transformer = self.transform_corpus(dictionary, corpus)  # apply TF-IDF and LSI models
    with Message("Storing index"):
        # Builds an index which we can compare queries against.
        index = Similarity(get_tmpfile("irsel_index"), corpus, num_features=len(dictionary))
        printq(index)
    # allows us to reuse this index for later proof attempts with the same parameters
    Irsel.index_cache = self.smart, self.dimensions, index, query_transformer, premises
    return index, query_transformer, premises
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")
    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')
    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')
    logger.info("ESA: Loading similarity index...")
    self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()
    logger.info("ESA: Finished loading model files.")
def cosine_similarity_only_syn(self):
    print("Cosine Similarity with only synsets")
    cos_sim = []
    for data in self.data:
        sent1 = [word[0] for word in data[1]]
        sent2 = [word[0] for word in data[2]]
        sent3, sent4 = [], []
        for word in sent1:
            if self.preprocessdata_o.synsets.get(word):
                sent3.append(list(self.preprocessdata_o.synsets.get(word))[0])
        sent1 += sent3
        for word in sent2:
            if self.preprocessdata_o.synsets.get(word):
                sent4.append(list(self.preprocessdata_o.synsets.get(word))[0])
        sent2 += sent4
        text = [sent3] + [sent4]
        sent_dict = corpora.Dictionary(text)
        corpus = [sent_dict.doc2bow(t) for t in text]
        sim = Similarity('-Similarity-index', corpus, num_features=len(sent_dict))
        test_corpus_1 = sent_dict.doc2bow(sent1)
        cos_sim_each = sim[test_corpus_1][1]
        cos_sim.append(cos_sim_each)
    self.feature['cos_sim_only_syn'] = cos_sim
def create_document_similarity_model(alternate_path=False) -> dict:
    DATA_MODEL_NAME = "data_model.pickle"
    DICT_MODEL_NAME = "dictSim.pickle"
    INDEX_NAME = "gensim_index.pickle"

    # initial word tokenization
    if not os.path.exists(DATA_MODEL_NAME):
        print("loading data files from scratch")
        train_X, train_Y = load_robots_txt_files(alternate_path)
        save_model((train_X, train_Y), DATA_MODEL_NAME)
    else:
        print("loading data files by pickle")
        train_X, train_Y = load_model(DATA_MODEL_NAME)

    # create gensim dictionary
    if not os.path.exists(DICT_MODEL_NAME):
        print("loading gensim dict from scratch")
        gensim_dict = Dictionary(train_X)
        save_model(gensim_dict, DICT_MODEL_NAME)
    else:
        print("loading gensim dict with pickle")
        gensim_dict = load_model(DICT_MODEL_NAME)

    # create searchable index
    if not os.path.exists(INDEX_NAME):
        print("building index from scratch")
        iterator = tqdm(map(lambda x: gensim_dict.doc2bow(x), train_X))
        index = Similarity("gensim_index.models", corpus=iterator,
                           num_features=len(gensim_dict) + 1, num_best=100)
        save_model(index, INDEX_NAME)
    else:
        print("loading index with pickle")
        index = load_model(INDEX_NAME)
def check(docs, target):
    """
    Calculate the similarity between target and docs.

    Parameters
    ----------
    docs: list
        A list of strings to be compared against
    target: string
        The target string to be compared

    Returns
    -------
    float
        The percentage similarity
    """
    stemmer = PorterStemmer()
    tok_docs = [tokenize(text) for text in docs]
    stem_docs = [[stemmer.stem(word) for word in doc] for doc in tok_docs]
    dictionary = Dictionary(stem_docs)
    corpus = [dictionary.doc2bow(doc) for doc in stem_docs]
    tfidf = TfidfModel(corpus)
    sims = Similarity('/tmp/sims.index', tfidf[corpus], num_features=len(dictionary))
    query = [stemmer.stem(word) for word in tokenize(target)]
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]
    return sum(sims[query_tfidf]) / len(sims[query_tfidf])
def get_bow(graph, with_children=False):
    docs = []
    for vertex in graph.vertices():
        articles_text = ""
        for article in graph.vp.articles[vertex]:
            articles_text = articles_text + article
        docs.append(articles_text.split())

    # create & save a dictionary
    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document if word not in stoplist]
             for document in docs]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    for vertex in graph.vertices():
        category_articles = string.join(graph.vp.articles[vertex]).split()
        graph.vp.bow[vertex] = dictionary.doc2bow(category_articles)
    dictionary.save('/tmp/bag_of_words.dict')

    corpus = [graph.vp.bow[vertex] for vertex in graph.vertices()]
    index = Similarity('/tmp/tst', corpus=corpus, num_features=len(dictionary))
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph IDs to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus], len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
def find_answer(question):
    # Tokenize the input question
    question = question.replace('\t', '').replace(' ', '')  # .replace('\n', '')
    question_gen = jieba.cut(question)
    questionList = list(question_gen)
    question_seg = " ".join(questionList)
    print(question_seg)
    print(question, question_gen, questionList, question_seg)
    answerList = []
    # Check whether the question can be answered from the knowledge base
    if is_KB_QA(question_seg):
        print("Is KB QA:")
        info_list = KB_answer(questionList)
        for answer in info_list:
            answerDic = {}
            answerDic["answer"] = answer[2] + " is " + answer[3]
            answerDic["percentage"] = int(answer[0] * 100)
            answerList.append(answerDic)
    # If the answer list is empty, look for a similar answer among previously answered questions
    if not answerList:
        print("Is not KB QA:")
        # Build the question-to-answer dictionary
        dic = {}
        question, answer = getSellerQA(item_id)
        # with open(SENTENCE_PATH, "r", encoding="utf-8") as question:
        #     with open(ANSWER_PATH, "r", encoding="utf-8") as answer:
        for q, a in zip(question, answer):
            dic[q] = a
        # Read the already tokenized corpus
        sentences = []
        for line in question:
            line = line.replace('\t', '').replace(' ', '')  # .replace('\n', '')
            seg_list = jieba.cut(line)
            sentences.append(list(seg_list))
        print('input done')
        # Build the dictionary and the vector corpus
        # pprint(sentences)
        dictionary = corpora.Dictionary(sentences)
        corpus = [dictionary.doc2bow(text) for text in sentences]
        index = Similarity('-Similarity-index', corpus, num_features=400)
        print("training done:", list(question_gen))
        # Find the existing questions most similar to the one asked
        resultList = find_simillar(questionList, dictionary, index)
        # Collect the answers into a list and return it
        for answer in resultList:
            answerDic = {}
            # answerList.append(''.join(sentences[answer[0]]))
            answerDic["answer"] = dic[''.join(sentences[answer[0]])]
            answerDic["percentage"] = int(answer[1] * 100)
            answerList.append(answerDic)
            # answerList.append(dic[''.join(sentences[answer[0]])])
            # print(dic[''.join(sentences[answer[0]])])
        print(resultList)
    reDic = {}
    reDic["answer"] = answerList
    reDic["cnt"] = len(answerList)
    print(reDic)
    return reDic
def get_docsim_feature(contents, remarks=""):
    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks
    corpora_documents = []
    tokenizer = Tokenizer()
    for item_text in contents:
        item_str = tokenizer(item_text)
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    similarity = Similarity('-Similarity-index', corpus, num_features=300)
    similarity.num_best = 3
    pickle.dump(dictionary, open(dictionary_path, "wb"), protocol=4)
    pickle.dump(corpus, open(corpus_path, "wb"), protocol=4)
    return similarity, corpus
def similarity(self, sent1, sent2):
    text1 = self.wordTokenize(sent1)
    text2 = self.wordTokenize(sent2)
    texts = [text1, text2]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
    return similarity[dictionary.doc2bow(text1)][1]
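# Hedged usage sketch (added; not from the original source). The method above indexes
# both sentences, queries the index with the first one, and reads element [1]: the
# cosine similarity between sentence 1 and sentence 2 (element [0] is sentence 1
# against itself). A standalone equivalent, assuming a plain whitespace tokenizer
# in place of the class's wordTokenize:
from gensim import corpora
from gensim.similarities import Similarity

sent1, sent2 = "the cat sat on the mat", "a cat lay on a mat"
texts = [sent1.split(), sent2.split()]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
index = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
scores = index[dictionary.doc2bow(texts[0])]
print(scores[1])  # cosine similarity of sent1 vs sent2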
def similarity_matrix(self, corpus, dictionary):
    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory."""
    # index = MatrixSimilarity(corpus, num_features=len(dictionary))
    index_temp = get_tmpfile("index")
    index = Similarity(index_temp, corpus, num_features=len(dictionary))  # create index
    for sims in index[corpus]:
        pprint(sims)
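# Hedged illustration (added; not from the original source). Iterating index[corpus]
# as above yields one row of cosine similarities per indexed document, i.e. the full
# pairwise similarity matrix computed chunk by chunk. Minimal self-contained version
# with illustrative data:
from gensim.corpora import Dictionary
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["human", "machine", "interface"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
index = Similarity(get_tmpfile("index"), corpus, num_features=len(dictionary))
for row in index[corpus]:
    print(row)  # one numpy array of similarities against all three documents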
def get_sim(f1, f2):
    c1 = open(f1, encoding='utf8').read()
    c1 = removePunctuation(c1)
    print(c1)
    # tokenize with jieba
    data1 = jieba.cut(c1)
    data11 = ""
    # collect the tokenized content
    for i in data1:
        data11 += i + " "
    doc1 = [data11]
    # inspect the tokenization; can be removed once the program works
    print("Tokenized content:\n")
    print(doc1)
    t1 = [[word for word in doc.split()] for doc in doc1]
    # print(t1)
    # token frequencies
    freq = defaultdict(int)
    for i in t1:
        for j in i:
            freq[j] += 1
    # print(freq)
    # optionally filter by frequency
    # t2 = [[token for token in k if freq[token] >= 3] for k in t1]
    # build a dictionary from the corpus
    dic1 = corpora.Dictionary(t1)

    # the file to compare against
    c2 = open(f2, encoding='utf8').read()
    c2 = removePunctuation(c2)
    # tokenize with jieba
    data2 = jieba.cut(c2)
    data21 = ""
    for i in data2:
        data21 += i + " "
    new_doc = data21
    # print(new_doc)
    # doc2bow turns the document into a sparse vector
    new_vec = dic1.doc2bow(new_doc.split())
    # apply doc2bow to the dictionary to get the reference corpus
    new_corpor = [dic1.doc2bow(t3) for t3 in t1]
    # number of features
    featurenum = len(dic1.token2id)
    # build the similarity index over the reference corpus
    # (SparseMatrixSimilarity would compute sparse-matrix similarity in memory)
    idx = Similarity('-Similarity-index', new_corpor, featurenum)
    # query with the second document's vector; the index holds a single document,
    # so sims[0] is the similarity between the two files
    sims = idx[new_vec]
    f = open(r'/output.txt', 'w')
    print('%.2f' % sims[0], file=f)
    f.close()
    print('%.2f' % sims[0])
def main():
    stopword = open('D:\code/test\哈工大停用词表.txt', encoding='utf8')
    # build the stopword list
    stopwordlist = list(jieba.cut(stopword.read()))
    print(stopwordlist)
    try:
        orig_path, add_path, save_path = sys.argv[1:4]
    except Exception as e:
        print(sys.argv)
        print(e)
    # save_path = 'D:\code/test/out.txt'

    # preprocess the source text
    # orig_path = 'D:\code/test/orig.txt'
    orig_file = open(orig_path, 'r', encoding="utf-8")
    text = orig_file.read()
    text = remove_punctuation(text)
    text = list(text)
    afterswlis = []
    for each in text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    text = afterswlis
    text = "".join(text)
    orig_file.close()

    # preprocess the text being checked
    # add_path = 'D:\code/test/orig_0.8_dis_15.txt'
    add_file = open(add_path, 'r', encoding="utf-8")
    add_text = add_file.read()
    add_file.close()
    add_text = remove_punctuation(add_text)
    add_text = list(add_text)
    afterswlis = []
    for each in add_text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    add_text = afterswlis
    add_text = "".join(add_text)

    # turn the texts into vectors
    texts = [jieba.lcut(text)]
    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    add_vec = dictionary.doc2bow(jieba.lcut(add_text))

    # compute the similarity between the vectors
    similarity = Similarity('-Similarity-index', corpus, num_features)

    # convert the type and keep two decimal places
    a = similarity[add_vec]
    b = a[0]
    b = str(b).split('.')[0] + '.' + str(b).split('.')[1][:2]
    print("Similarity result: %s" % b)

    # write the result to the given output file
    f = open(save_path, 'w', encoding="utf-8")
    f.write("Similarity result: %s" % b)
    f.close()
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    # Store the similarities in a file with a simple format
    # NOTE: THE INDEX ALREADY STORES THE SIMILARITIES. THERE IS NO NEED TO COMPUTE THEM AGAIN
    for idioma, salida in self.output()['langs'].iteritems():
        file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
        for n_topics, o in salida.iteritems():
            index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)

            # JSON
            sims = index2dict(index, file_list, num_sims=self.num_similar_docs)
            with o['json'].open('w') as f:
                json.dump(sims, f)

            # HTML + CSV
            s = u''
            net = pd.DataFrame(columns=['from_name', 'to_name', 'sim'])
            for book, v in sims.iteritems():
                s += u'-------------------------------------------\n'
                s += u'### %s\n\n' % (book)
                s += u'| Ranking | Book | Similarity |\n|:--------:|:-------|-------------:|\n'
                for rank, attrs in v.iteritems():
                    s += u'| %d | %s | %f |\n' % (rank, attrs['name'], round(attrs['similarity'], 3))
                    net = net.append(pd.DataFrame({'from_name': [book],
                                                   'to_name': [attrs['name']],
                                                   'sim': [attrs['similarity']]}))
                s += u'\n\n'
            md = markdown.markdown(s, extensions=['markdown.extensions.tables'])
            books = sorted(list(set(net['from_name']).union(net['to_name'])))
            ids = {v: i for i, v in enumerate(books)}
            net['from'] = [ids[k] for k in net['from_name']]
            net['to'] = [ids[k] for k in net['to_name']]
            with o['html'].open('w') as f:
                f.write(md)
            with o['csv'].open('w') as f:
                net.to_csv(f, index=False)

            # Network (in R)
            tempname = 'net_temp0.html'
            i = 1
            while os.path.exists(tempname):
                tempname = 'net_temp%d.html' % i
                i += 1
                if i >= 100:
                    print 'ERROR: Cannot create the temporary network... Check that there is no file named %s in this folder and that you have write permission...' % tempname
                    break
            subprocess.call(['itam-d3-network.R', '--input', o['csv'].path,
                             '--output', tempname,
                             '--max_links', str(self.num_similar_docs),
                             '--min_sim', str(self.min_similarity)])
            print 'USER INFO: Creating temporary file: ' + tempname
            shutil.move(tempname, o['net'].path)
            print 'USER INFO: Move done, %s --> %s' % (tempname, o['net'].path)
            if os.path.exists(tempname):
                os.remove(tempname)
def tf_text2vector(self):
    try:
        dct = self.tf_parameters["tf_dictionary"]
        rules, corpus = zip(*self.tf_parameters["tf_rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        index = Similarity(None, corpus, num_features=len(dct))
        rules_similarity = list(zip(rules, index[txt_corp]))
        return rules_similarity
    except:
        return None
def GetLsm(self, dictionary, corpus):
    lsi = models.lsimodel.LsiModel(corpus, id2word=dictionary)  # num_topics=len(corpus)/2
    vec_lsi = lsi[corpus[0]]
    index = Similarity('l_index', corpus, len(dictionary))
    cnt = 0
    for similarities in index:
        if cnt == 1:
            return list(enumerate(similarities))
        cnt += 1
def GetTfidf(self, dictionary, corpus):
    tfidf = models.TfidfModel(corpus)
    vec_lsi = tfidf[corpus[0]]
    index = Similarity('t_index', corpus, len(dictionary))
    # tsims = index[vec_lsi]
    cnt = 0
    for similarities in index:
        if cnt == 1:
            return list(enumerate(similarities))
        cnt += 1
def get_sim(model, corps):
    """Get a Similarity index for a corpus and model.

    Args:
        model (TfidfModel): TF-IDF model used to transform the corpus
        corps (list): bag-of-words corpus to index

    Returns:
        Similarity: shard-backed similarity index over the transformed corpus
    """
    return Similarity(None, model[corps], num_features=400)
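# Hedged usage sketch (added; not from the original source). One way get_sim above
# could be driven, with illustrative data only; it assumes the dictionary has at most
# 400 features, matching the hard-coded num_features. Passing None as the output
# prefix lets gensim place the index shards in a temporary location.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["alpha", "beta"], ["beta", "gamma"]]
dct = Dictionary(docs)
corps = [dct.doc2bow(d) for d in docs]
model = TfidfModel(corps)
index = get_sim(model, corps)  # Similarity over the TF-IDF-transformed corpus
print(index[model[dct.doc2bow(["beta"])]])  # similarities of the query to both docs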
def __init__(self, loader_obj):
    self.model_types = [("lsi", None)]
    self.model = loader_obj
    self.tknz = TokenizerApply(self.model)
    self.tkz_model = self.tknz.model_tokenize()
    self.et_vectors = self.tkz_model.application_field["texts"]
    self.coeffs = self.tkz_model.application_field["coeff"]
    self.tags = self.tkz_model.application_field["tags"]
    self.index = Similarity(None, self.et_vectors,
                            num_features=self.model.texts_algorithms["num_topics"])
def main(path="train.json"): #get a random question quest = select_question(path) print("Random question : ") print(quest) #Tokenize and create gensim dictionnary dictionary, corpus_quest = process_question(quest) tfidf = gensim.models.TfidfModel(corpus_quest) #corpus of contexts processing ctx = import_context(path) corpus = process_contexts(ctx) #Global corpus dictionnary corpus = final_process_context(corpus, dictionary) dir_for_index = get_tmpfile("index_sim") #Similarity function to compare each context to the question sim = Similarity(dir_for_index, corpus, num_features=len(dictionary)) #result list of similarity scores res = (sim[corpus_quest].tolist()[0]) #Get 3 best most similar context from result list max_index = sorted(range(len(res)), key=lambda sub: res[sub])[-3:] #create dict of index (to be able able to find the context in the context list) and similarity value dict_best = {} for e in max_index: dict_best[e] = res[e] #get index of best falue best_index = max(dict_best, key=dict_best.get) print("Best context") print(ctx[best_index]) #use function sim_metric to find out if it is the appropriate context (it will return 1) sim_accuracy = sim_metric(quest, ctx[best_index]) print("similarity metric", sim_accuracy) #Look for other good solutions if the first option is not satisfactory #top3metric tells if there is a adequate solution in the 3 most similar context returned #if the first example was a good fit , it will automatically return 1 top3metric = sim_accuracy if sim_accuracy == 0: other_solutions_index = [] for j, value in dict_best.items(): if (j != best_index): other_solutions_index.append(j) if len(other_solutions_index) != 0: print("Autres solutions possibles") for k in other_solutions_index: #print(ctx[k]) metric = sim_metric(quest, ctx[k]) if (metric == 1): top3metric = 1 print("similarity metric", metric) return [sim_accuracy, top3metric]
def lsi_indexes_fill(self):
    try:
        dct = self.kwargs["lsi_parameters"]["dictionary"]
        lsi_model = self.kwargs["lsi_parameters"]["model"]
        rules, corpus = zip(*self.kwargs["lsi_parameters"]["rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        txt_vect = lsi_model[txt_corp]
        corpus_vects = [lsi_model[x] for x in corpus]
        index = Similarity(None, corpus_vects,
                           num_features=self.kwargs["lsi_parameters"]["num_topics"])
        rules_similarity = list(zip(rules, index[txt_vect]))
        return rules_similarity
    except:
        return None
def tfidf_text2vector(self):
    try:
        dct = self.tfidf_parameters["tf_idf_dictionary"]
        tfidf_model = self.tfidf_parameters["tfidf_model"]
        rules, corpus = zip(*self.tfidf_parameters["tf_idf_rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        txt_tf_idf_vect = tfidf_model[txt_corp]
        corpus_tf_idf_vects = [tfidf_model[x] for x in corpus]
        index = Similarity(None, corpus_tf_idf_vects, num_features=len(dct))
        rules_similarity = list(zip(rules, index[txt_tf_idf_vect]))
        return rules_similarity
    except:
        return None
def load(self, path):
    if type(path) == str:
        path = Path(path)
    with open(path / 'paragraph-ids.txt') as f:
        self.paragraph_ids = [paragraph_id.strip() for paragraph_id in f]
    dictionary_path = str(path / 'dct.pkl')
    self.dictionary = Dictionary.load(dictionary_path)
    index_path = str(path / 'indexes' / 'master-index')
    self.index = Similarity.load(index_path)
    self.index.num_best = self.num_best
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    # Store the similarities in a file with a simple format
    # NOTE: THE INDEX ALREADY STORES THE SIMILARITIES. THERE IS NO NEED TO COMPUTE THEM AGAIN
    for idioma, salida in self.output()['langs'].iteritems():
        file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
        for n_topics, o in salida.iteritems():
            index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)
            sims = arrange_similarities(index, file_list, num_sims=self.num_similar_docs)
            sims = '\n'.join(['\t'.join([str(i) for i in t]) for t in sims])
            with o.open('w') as f:
                f.write(sims)
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

# tfidf = TfidfModel(BOW_corpus)
# tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix', tfidf[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
                   logent_corpus, num_features=num_feat)
index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
bz2_wiki = bz2.BZ2File(wiki_file, "r")
extract = corpora.wikicorpus.extract_pages(bz2_wiki)
i = 0
matches = open('../data/title_matches.txt', 'a')
for title, doc, z in extract:
    wiki_filt = corpora.wikicorpus.filter_wiki(doc)
    doc_token = corpora.wikicorpus.tokenize(wiki_filt)
    bowbow = diction.doc2bow(doc_token)
    if bowbow == BOW_corpus[i]:
        i += 1
class KMedoids(object):
    '''
    Implementation of kmedoids clustering.

    There are two ways to find a medoid:
    - Use the element which is closest to the centroid. This is the "kmeans based" kmedoids.
    - Use the element which has the smallest summed distance to the other cluster members.
      This is the "kmedian based" kmedoids.
    So far this implementation uses the kmeans based approach.
    '''

    def __init__(self, corpus, num_features, num_clusters, max_iterations):
        self.similarity_index = Similarity(output_prefix='similarities',
                                           corpus=corpus,
                                           num_features=num_features)
        self.num_docs = len(self.similarity_index)
        self.num_clusters = num_clusters
        self.max_iterations = max_iterations
        self.num_features = num_features
        self.corpus = corpus
        self.MIN_CLUSTER_SIZE = 2

    def get_medoids(self):
        '''
        Returns a matrix containing the medoids
        '''
        return self.medoid_similarity_index.index

    def __medoid_generator(self):
        '''
        Yields all medoid documents
        '''
        for medoid_id in self.medoids.iterkeys():
            yield self.similarity_index.vector_by_id(medoid_id)

    def __create_medoid_similarity_index(self):
        self.medoid_similarity_index = MatrixSimilarity(
            corpus=list(self.__medoid_generator()),
            num_features=self.num_features)

    def __random_init_medoids(self):
        # the keys are the indices of the medoids
        # the values are index lists of the elements belonging to the medoid
        self.medoids = defaultdict(list)
        # init random medoids
        for x in xrange(self.num_clusters):
            medoid_index = random.randrange(self.num_docs)
            self.medoids[medoid_index] = []
        # create similarity index of medoids
        self.__create_medoid_similarity_index()

    def __assign(self):
        # We use cosine-similarity as metric
        # NOTE: the closer the cosine is to 1 the closer the documents are.
        # The cosine distance is in <-1, 1> where 1 is the closest and -1 the farthest;
        # we might convert it to <0, 2> where 0 is the closest and 2 the farthest in the future:
        # dis = (dis * -1) + 1

        # clear all clusters
        for id, _ in self.medoids.iteritems():
            self.medoids[id] = []
        # assign each doc to the closest medoid
        args = itertools.izip(enumerate(self.corpus),
                              itertools.repeat(self.medoid_similarity_index))
        pool = multiprocessing.Pool(POOL_SIZE)
        # for id, pos in pool.imap_unordered(assign_doc_to_cluster, args,
        #                                    chunksize=CHUNK_SIZE):
        for id, pos in pool.imap_unordered(assign_doc_to_cluster, args):
            self.medoids[self.medoids.keys()[pos]].append(id)

    def __get_centroid(self, cluster):
        # averages all docs in cluster
        count = 0
        centroid = numpy.zeros(self.num_features, dtype=numpy.float32)
        for doc_id in cluster:
            doc = self.similarity_index.vector_by_id(doc_id).toarray().flatten()
            # full_doc = matutils.sparse2full(doc, self.num_features)
            centroid = centroid + doc
            count += 1
        if count != 0:
            centroid = centroid / count
        return matutils.full2sparse(centroid)

    def __recalculate_medoids(self):
        changed = False
        count = 0
        for medoid_id, cluster in self.medoids.items():
            if count % 1000 == 0:
                logger.info("PROGRESS: Recalculate medoid for cluster #%d id%d"
                            % (count, medoid_id))
            count += 1
            if len(cluster) < self.MIN_CLUSTER_SIZE:
                # cluster is too small, init a new random medoid
                # remove medoid
                del self.medoids[medoid_id]
                # add new random medoid. the id could already be used as medoid.
                # for now we just risk it ;)
                medoid_index = random.randrange(self.num_docs)
                self.medoids[medoid_index] = []
                changed = True
            else:
                logger.debug("Find new centroid for cluster %d." % medoid_id)
                # calculate centroid and assign the closest doc as new medoid
                centroid = self.__get_centroid(cluster)
                old_num_best = self.similarity_index.num_best
                # similarity index should only return the best fit
                self.similarity_index.num_best = 1
                try:
                    new_medoid_id, _ = self.similarity_index[centroid][0]
                except IndexError as e:
                    logger.error("Could not find best fit for centroid: %s." % (e))
                    # use random medoid index
                    new_medoid_id = random.randrange(self.num_docs)
                self.similarity_index.num_best = old_num_best
                if new_medoid_id != medoid_id:
                    changed = True
                    # remove old medoid
                    del self.medoids[medoid_id]
                # empty medoid in any case
                self.medoids[new_medoid_id] = []
        if changed:
            self.__create_medoid_similarity_index()
        return changed

    def cluster(self):
        logger.info("Init random medoids.")
        self.__random_init_medoids()
        logger.info("Assign elements to random clusters.")
        self.__assign()
        changed = True
        count = 0
        while changed and count < self.max_iterations:
            changed = False
            count += 1
            logger.info("Entering iteration #%d." % count)
            # recalculate medoids
            logger.info("Recalculate medoids.")
            changed = self.__recalculate_medoids()
            # assign all docs to medoids
            logger.info("Assign elements to new clusters.")
            assignment = self.__assign()
        if count < self.max_iterations:
            logger.info("Converged in %d iterations." % count)
        else:
            logger.info("May not have converged after %d iterations."
                        % self.max_iterations)
        return self.medoids
input_file, output_prefix = sys.argv[1:3]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True
similarity_index.preload_reverse_index()

logger.info("Finished loading model files.")

logger.info("Processing input documents...")
try:
    infile = open(input_file, 'r')
except IOError:
    print('cannot open %s' % (input_file,))
    sys.exit(1)

for docnum, line in enumerate(infile):
    line = line.rstrip()
# load models
print "\n Loading models, etc..\n"
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity
print "\n Load similarity indices.\n"
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):
import gensim
from gensim.similarities import Similarity, MatrixSimilarity
# from pgfin_timing import Timer
from pgfin_helpers import tokenize

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# load the corpora
print "\n Loading corpora.\n"
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print "\n Start similarity index.\n"
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print index

index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print index_dense
passwd = "8269202"
DBName = "bullhorn"
db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)

app = Flask(__name__)
CORS(app)

resultTuple = generateCorpus()
dictionary = resultTuple['dictionary']
corpus = resultTuple['corpus']
socTitleDict = resultTuple['socTitleDict']

num_topics = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
gensimIndex.num_best = 3


@app.before_request
def before_request():
    db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)
    resultTuple = generateCorpus()
    # dictionary = resultTuple['dictionary']
    # corpus = resultTuple['corpus']
    # socTitleDict = resultTuple['socTitleDict']
    #
    # num_topics = 200
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    # gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
    # gensimIndex.num_best = 3
model_prefix = sys.argv[1]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True

logger.info("Finished loading model files.")

mismatches = 0
for doc_idx in range(0, len(similarity_index)):
    logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
    rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
    fwd_doc = similarity_index.vector_by_id(doc_idx)
    for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
        if val == 0:
            continue
        feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
        rev_doc[0, feature_id] = feat_rev_docs[doc_idx]
    rev_doc = rev_doc.tocsr()