def vars():
    mydct = load('mydct.joblib')
    noval_corp = load('noval_corp.joblib')
    noval_ind = get_tmpfile('index')
    noval_index = Similarity(noval_ind, noval_corp, len(mydct))
    val_corp = load('val_corp.joblib')
    val_ind = get_tmpfile('index')
    val_index = Similarity(val_ind, val_corp, len(mydct))
    pca8 = load('pca8.joblib')
    nlp = yelp_tool.spacy.load('en_core_web_md', disable=['tagger', 'ner'])
    read = yelp_tool.Readability()
    nlp.add_pipe(read, last=True)
    return mydct, noval_index, val_index, pca8, nlp
def create_similarity_index(self):
    if not os.path.isfile(self.similarity_file):
        self.similarity_index = Similarity('./LSM/', self.corpora, self.num_topics)
        self.similarity_index.save(self.similarity_file)
    else:
        self.similarity_index = Similarity.load(self.similarity_file)
def cosine_similarity_only_syn(self):
    print("Cosine Similarity with only synsets")
    cos_sim = []
    for data in self.data:
        sent1 = [word[0] for word in data[1]]
        sent2 = [word[0] for word in data[2]]
        sent3, sent4 = [], []
        for word in sent1:
            if self.preprocessdata_o.synsets.get(word):
                sent3.append(list(self.preprocessdata_o.synsets.get(word))[0])
        sent1 += sent3
        for word in sent2:
            if self.preprocessdata_o.synsets.get(word):
                sent4.append(list(self.preprocessdata_o.synsets.get(word))[0])
        sent2 += sent4
        text = [sent3] + [sent4]
        sent_dict = corpora.Dictionary(text)
        corpus = [sent_dict.doc2bow(t) for t in text]
        sim = Similarity('-Similarity-index', corpus, num_features=len(sent_dict))
        test_corpus_1 = sent_dict.doc2bow(sent1)
        cos_sim_each = sim[test_corpus_1][1]
        cos_sim.append(cos_sim_each)
    self.feature['cos_sim_only_syn'] = cos_sim
def create_document_similarity_model(alternate_path=False) -> dict:
    DATA_MODEL_NAME = "data_model.pickle"
    DICT_MODEL_NAME = "dictSim.pickle"
    INDEX_NAME = "gensim_index.pickle"
    # initial word tokenization
    if not os.path.exists(DATA_MODEL_NAME):
        print("loading data files from scratch")
        train_X, train_Y = load_robots_txt_files(alternate_path)
        save_model((train_X, train_Y), DATA_MODEL_NAME)
    else:
        print("loading data files by pickle")
        train_X, train_Y = load_model(DATA_MODEL_NAME)
    # create gensim dictionary
    if not os.path.exists(DICT_MODEL_NAME):
        print("loading gensim dict from scratch")
        gensim_dict = Dictionary(train_X)
        save_model(gensim_dict, DICT_MODEL_NAME)
    else:
        print("loading gensim dict with pickle")
        gensim_dict = load_model(DICT_MODEL_NAME)
    # create searchable index
    if not os.path.exists(INDEX_NAME):
        print("building index from scratch")
        iterator = tqdm(map(lambda x: gensim_dict.doc2bow(x), train_X))
        index = Similarity("gensim_index.models", corpus=iterator,
                           num_features=len(gensim_dict) + 1, num_best=100)
        save_model(index, INDEX_NAME)
    else:
        print("loading index with pickle")
        index = load_model(INDEX_NAME)
def get_bow(graph, with_children=False):
    docs = []
    for vertex in graph.vertices():
        articles_text = ""
        for article in graph.vp.articles[vertex]:
            articles_text = articles_text + article
        docs.append(articles_text.split())
    # create & save a dictionary
    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document if word not in stoplist]
             for document in docs]
    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    for vertex in graph.vertices():
        category_articles = " ".join(graph.vp.articles[vertex]).split()
        graph.vp.bow[vertex] = dictionary.doc2bow(category_articles)
    dictionary.save('/tmp/bag_of_words.dict')
    corpus = [graph.vp.bow[vertex] for vertex in graph.vertices()]
    index = Similarity('/tmp/tst', corpus=corpus, num_features=len(dictionary))
def check(docs, target):
    """
    Calculate the similarity between target and docs.

    Parameters
    ----------
    docs : list
        A list of strings to be compared against.
    target : string
        The target string to be compared.

    Returns
    -------
    float
        The mean TF-IDF cosine similarity of target against docs.
    """
    stemmer = PorterStemmer()
    tok_docs = [tokenize(text) for text in docs]
    stem_docs = [[stemmer.stem(word) for word in doc] for doc in tok_docs]
    dictionary = Dictionary(stem_docs)
    corpus = [dictionary.doc2bow(doc) for doc in stem_docs]
    tfidf = TfidfModel(corpus)
    sims = Similarity('/tmp/sims.index', tfidf[corpus], num_features=len(dictionary))
    query = [stemmer.stem(word) for word in tokenize(target)]
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]
    return sum(sims[query_tfidf]) / len(sims[query_tfidf])
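# A minimal usage sketch for check() above; the two documents and the target
# are hypothetical data, and tokenize() is assumed to be the same helper the
# function itself relies on.
if __name__ == '__main__':
    example_docs = [
        "the cat sat on the mat",
        "dogs and cats living together",
    ]
    example_target = "a cat on a mat"
    # prints the mean TF-IDF cosine similarity of the target against the docs
    print(check(example_docs, example_target))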
def build_index(
    self, premises: Iterable[Sentence]
) -> Tuple[Similarity, Callable[[TokenList], Vector], Iterable[Sentence]]:
    """Builds an index from given premises that can be used to answer similarity queries."""
    if Irsel.index_cache:
        # if an index has already been built for these TF-IDF parameters, reuse it
        cached_smart, cached_dimensions, cached_index, cached_query_transformer, cached_premises = Irsel.index_cache
        if cached_smart == self.smart and cached_dimensions == self.dimensions and cached_premises is premises:
            printq("Hitting index cache.")
            return cached_index, cached_query_transformer, cached_premises
        else:
            printq("Skipping index cache.")
    dictionary, corpus = self.build_corpus(premises)  # create a term-document matrix
    corpus, query_transformer = self.transform_corpus(dictionary, corpus)  # apply TF-IDF and LSI models
    with Message("Storing index"):
        # builds an index which we can compare queries against
        index = Similarity(get_tmpfile("irsel_index"), corpus, num_features=len(dictionary))
    printq(index)
    # allows us to reuse this index for later proof attempts with the same parameters
    Irsel.index_cache = self.smart, self.dimensions, index, query_transformer, premises
    return index, query_transformer, premises
def find_answer(question):
    # tokenize the input question
    question = question.replace('\t', '').replace(' ', '')  # .replace('\n', '')
    question_gen = jieba.cut(question)
    questionList = list(question_gen)
    question_seg = " ".join(questionList)
    print(question_seg)
    print(question, questionList, question_seg)
    answerList = []
    # check whether the question can be answered from the knowledge base
    if is_KB_QA(question_seg):
        print("Is KB QA:")
        info_list = KB_answer(questionList)
        for answer in info_list:
            answerDic = {}
            answerDic["answer"] = answer[2] + "为" + answer[3]
            answerDic["percentage"] = int(answer[0] * 100)
            answerList.append(answerDic)
    # if the answer list is empty, look for a similar answer among already-answered questions
    if not answerList:
        print("Is not KB QA:")
        # build a question-to-answer dictionary (item_id is assumed to be defined at module scope)
        dic = {}
        question, answer = getSellerQA(item_id)
        # with open(SENTENCE_PATH, "r", encoding="utf-8") as question:
        #     with open(ANSWER_PATH, "r", encoding="utf-8") as answer:
        for q, a in zip(question, answer):
            dic[q] = a
        # read the pre-tokenized corpus
        sentences = []
        for line in question:
            line = line.replace('\t', '').replace(' ', '')  # .replace('\n', '')
            seg_list = jieba.cut(line)
            sentences.append(list(seg_list))
        print('input done')
        # build the dictionary and the vectorized corpus
        # pprint(sentences)
        dictionary = corpora.Dictionary(sentences)
        corpus = [dictionary.doc2bow(text) for text in sentences]
        index = Similarity('-Similarity-index', corpus, num_features=400)
        print("training done:", questionList)  # question_gen is already exhausted at this point
        # find the existing questions most similar to the one asked
        resultList = find_simillar(questionList, dictionary, index)
        # collect the resulting answers into a list and return it
        for answer in resultList:
            answerDic = {}
            # answerList.append(''.join(sentences[answer[0]]))
            answerDic["answer"] = dic[''.join(sentences[answer[0]])]
            answerDic["percentage"] = int(answer[1] * 100)
            answerList.append(answerDic)
            # print(dic[''.join(sentences[answer[0]])])
        print(resultList)
    reDic = {}
    reDic["answer"] = answerList
    reDic["cnt"] = len(answerList)
    print(reDic)
    return reDic
def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp, corpus=corpus,
                         num_features=len(dct), num_best=6)
    indexer.save(file_name)
    return indexer
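# A hedged sketch of driving train_and_save_indexer() with toy data; the texts
# and file name below are illustrative, not from the original project.
def demo_train_indexer():
    from gensim.corpora import Dictionary
    texts = [["hello", "world"], ["hello", "gensim"], ["similarity", "index"]]
    dct = Dictionary(texts)
    corpus = [dct.doc2bow(t) for t in texts]
    indexer = train_and_save_indexer(corpus, dct, file_name='demo_indexer.model')
    # with num_best=6, a query returns up to six (doc_id, score) pairs
    print(indexer[dct.doc2bow(["hello", "world"])])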
def similarity(self, sent1, sent2):
    text1 = self.wordTokenize(sent1)
    text2 = self.wordTokenize(sent2)
    texts = [text1, text2]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
    return similarity[dictionary.doc2bow(text1)][1]
def similarity_matrix(self, corpus, dictionary):
    """Compute cosine similarity against a corpus of documents, storing the index in shards on disk
    (the commented-out MatrixSimilarity line is the in-memory variant)."""
    # index = MatrixSimilarity(corpus, num_features=len(dictionary))
    index_temp = get_tmpfile("index")
    index = Similarity(index_temp, corpus, num_features=len(dictionary))  # create index
    for sims in index[corpus]:
        pprint(sims)
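# A self-contained sketch of the same batch-query idiom on toy data (the names
# below are illustrative; only the gensim calls mirror the method above):
def demo_similarity_matrix():
    from gensim.corpora import Dictionary
    from gensim.similarities import Similarity
    from gensim.test.utils import get_tmpfile
    texts = [["apple", "banana"], ["banana", "cherry"], ["apple", "cherry"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    index = Similarity(get_tmpfile("demo_index"), corpus, num_features=len(dictionary))
    for sims in index[corpus]:  # one row of cosine similarities per document
        print(sims)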
def get_sim(f1, f2):
    c1 = open(f1, encoding='utf8').read()
    c1 = removePunctuation(c1)
    print(c1)
    # tokenize with jieba
    data1 = jieba.cut(c1)
    data11 = ""
    # collect the tokenized content
    for i in data1:
        data11 += i + " "
    doc1 = [data11]
    # sanity-check the tokenization; can be removed once the program works
    print("Tokenized content:\n")
    print(doc1)
    t1 = [[word for word in doc.split()] for doc in doc1]
    # print(t1)
    # word frequencies
    freq = defaultdict(int)
    for i in t1:
        for j in i:
            freq[j] += 1
    # print(freq)
    # limit by word frequency
    # t2 = [[token for token in k if freq[j] >= 3] for k in t1]
    # build a corpora dictionary
    dic1 = corpora.Dictionary(t1)
    # the file to compare against
    c2 = open(f2, encoding='utf8').read()
    c2 = removePunctuation(c2)
    # tokenize with jieba
    data2 = jieba.cut(c2)
    data21 = ""
    for i in data2:
        data21 += i + " "
    new_doc = data21
    # print(new_doc)
    # doc2bow turns the document into a sparse vector
    new_vec = dic1.doc2bow(new_doc.split())
    # apply doc2bow with the dictionary to get the reference corpus
    new_corpor = [dic1.doc2bow(t3) for t3 in t1]
    # number of features
    featurenum = len(dic1.token2id)
    # build the similarity index over the reference corpus
    idx = Similarity('-Similarity-index', new_corpor, featurenum)
    # query with the f2 vector; querying new_corpor would only compare f1 against itself
    sims = idx[new_vec][0]
    f = open(r'/output.txt', 'w')
    print('%.2f' % sims, file=f)
    f.close()
    print('%.2f' % sims)
def main():
    stopword = open('D:\code/test\哈工大停用词表.txt', encoding='utf8')
    # load the stopword list
    stopwordlist = list(jieba.cut(stopword.read()))
    print(stopwordlist)
    try:
        orig_path, add_path, save_path = sys.argv[1:4]
    except Exception as e:
        print(sys.argv)
        print(e)
    # save_path = 'D:\code/test/out.txt'
    # preprocess the source text
    # orig_path = 'D:\code/test/orig.txt'
    orig_file = open(orig_path, 'r', encoding="utf-8")
    text = orig_file.read()
    text = remove_punctuation(text)
    text = list(text)
    afterswlis = []
    for each in text:
        if each not in stopwordlist:
            afterswlis.append(each)
    text = afterswlis
    text = "".join(text)
    orig_file.close()
    # preprocess the text being checked
    # add_path = 'D:\code/test/orig_0.8_dis_15.txt'
    add_file = open(add_path, 'r', encoding="utf-8")
    add_text = add_file.read()
    add_file.close()
    add_text = remove_punctuation(add_text)
    add_text = list(add_text)
    afterswlis = []
    for each in add_text:
        if each not in stopwordlist:
            afterswlis.append(each)
    add_text = afterswlis
    add_text = "".join(add_text)
    # convert texts to vectors
    texts = [jieba.lcut(text)]
    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    add_vec = dictionary.doc2bow(jieba.lcut(add_text))
    # compute similarity from the vectors
    similarity = Similarity('-Similarity-index', corpus, num_features)
    # convert the type, truncating to two decimal places
    # (str(b), not str(a): a is the whole result array)
    a = similarity[add_vec]
    b = a[0]
    b = str(b).split('.')[0] + '.' + str(b).split('.')[1][:2]
    print("Similarity result: %s" % b)
    # write the result to the specified file
    f = open(save_path, 'w', encoding="utf-8")
    f.write("Similarity result: %s" % b)
    f.close()
def GetLsm(self, dictionary, corpus):
    lsi = models.lsimodel.LsiModel(corpus, id2word=dictionary)  # num_topics=len(corpus)/2
    vec_lsi = lsi[corpus[0]]
    index = Similarity('l_index', corpus, len(dictionary))
    # return the similarity row of the second document in the index
    cnt = 0
    for similarities in index:
        if cnt == 1:
            return list(enumerate(similarities))
        cnt += 1
def tf_text2vector(self):
    try:
        dct = self.tf_parameters["tf_dictionary"]
        rules, corpus = zip(*self.tf_parameters["tf_rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        # output_prefix=None lets gensim pick a random temp file for the index shards
        index = Similarity(None, corpus, num_features=len(dct))
        rules_similarity = list(zip(rules, index[txt_corp]))
        return rules_similarity
    except Exception:
        return None
def GetTfidf(self, dictionary, corpus):
    tfidf = models.TfidfModel(corpus)
    vec_lsi = tfidf[corpus[0]]
    index = Similarity('t_index', corpus, len(dictionary))
    # tsims = index[vec_lsi]
    # return the similarity row of the second document in the index
    cnt = 0
    for similarities in index:
        if cnt == 1:
            return list(enumerate(similarities))
        cnt += 1
def get_sim(model, corps):
    """Build a Similarity index for the given corpus and model.

    Args:
        model (TfidfModel): TF-IDF model used to transform the corpus.
        corps: bag-of-words corpus to index.

    Returns:
        Similarity: index over the TF-IDF-transformed corpus
            (num_features=400 assumes the dictionary has at most 400 entries).
    """
    return Similarity(None, model[corps], num_features=400)
def __init__(self, loader_obj):
    self.model_types = [("lsi", None)]
    self.model = loader_obj
    self.tknz = TokenizerApply(self.model)
    self.tkz_model = self.tknz.model_tokenize()
    self.et_vectors = self.tkz_model.application_field["texts"]
    self.coeffs = self.tkz_model.application_field["coeff"]
    self.tags = self.tkz_model.application_field["tags"]
    self.index = Similarity(
        None, self.et_vectors,
        num_features=self.model.texts_algorithms["num_topics"])
def main(path="train.json"): #get a random question quest = select_question(path) print("Random question : ") print(quest) #Tokenize and create gensim dictionnary dictionary, corpus_quest = process_question(quest) tfidf = gensim.models.TfidfModel(corpus_quest) #corpus of contexts processing ctx = import_context(path) corpus = process_contexts(ctx) #Global corpus dictionnary corpus = final_process_context(corpus, dictionary) dir_for_index = get_tmpfile("index_sim") #Similarity function to compare each context to the question sim = Similarity(dir_for_index, corpus, num_features=len(dictionary)) #result list of similarity scores res = (sim[corpus_quest].tolist()[0]) #Get 3 best most similar context from result list max_index = sorted(range(len(res)), key=lambda sub: res[sub])[-3:] #create dict of index (to be able able to find the context in the context list) and similarity value dict_best = {} for e in max_index: dict_best[e] = res[e] #get index of best falue best_index = max(dict_best, key=dict_best.get) print("Best context") print(ctx[best_index]) #use function sim_metric to find out if it is the appropriate context (it will return 1) sim_accuracy = sim_metric(quest, ctx[best_index]) print("similarity metric", sim_accuracy) #Look for other good solutions if the first option is not satisfactory #top3metric tells if there is a adequate solution in the 3 most similar context returned #if the first example was a good fit , it will automatically return 1 top3metric = sim_accuracy if sim_accuracy == 0: other_solutions_index = [] for j, value in dict_best.items(): if (j != best_index): other_solutions_index.append(j) if len(other_solutions_index) != 0: print("Autres solutions possibles") for k in other_solutions_index: #print(ctx[k]) metric = sim_metric(quest, ctx[k]) if (metric == 1): top3metric = 1 print("similarity metric", metric) return [sim_accuracy, top3metric]
def getSimilarity(df_content_o):
    logging.debug('preparing docSim')
    raw_documents = list(df_content_o['content'])
    corpora_documents = []
    for item_text in raw_documents:
        item_str = item_text.split(' ')
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    nf = len(set(itertools.chain.from_iterable(corpora_documents))) + 1
    similarity = Similarity('-Similarity-index', corpus, num_features=nf)
    similarity.num_best = max_similar_num
    return similarity, dictionary
def tfidf_text2vector(self):
    try:
        dct = self.tfidf_parameters["tf_idf_dictionary"]
        tfidf_model = self.tfidf_parameters["tfidf_model"]
        rules, corpus = zip(*self.tfidf_parameters["tf_idf_rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        txt_tf_idf_vect = tfidf_model[txt_corp]
        corpus_tf_idf_vects = [tfidf_model[x] for x in corpus]
        index = Similarity(None, corpus_tf_idf_vects, num_features=len(dct))
        rules_similarity = list(zip(rules, index[txt_tf_idf_vect]))
        return rules_similarity
    except Exception:
        return None
def __init__(self, initializer):
    preprocessed_documents = initializer.getPreprocessedDocuments()
    dictionary = initializer.getDictionary()
    corpus = [dictionary.doc2bow(text) for text in preprocessed_documents]
    tf_idf = initializer.getTfIdf()
    query_doc_tf_idf = tf_idf[dictionary.doc2bow(preprocessed_documents[0])]
    similarity_object = Similarity('tfidf', tf_idf[corpus], num_features=len(dictionary))
    similarities = similarity_object[query_doc_tf_idf]
    similarity_object.destroy()
    self.scores = similarities[1:]
def lsi_indexes_fill(self):
    try:
        dct = self.kwargs["lsi_parameters"]["dictionary"]
        lsi_model = self.kwargs["lsi_parameters"]["model"]
        rules, corpus = zip(*self.kwargs["lsi_parameters"]["rules_corpus"])
        txt_corp = dct.doc2bow(self.lemm_txt.split())
        txt_vect = lsi_model[txt_corp]
        corpus_vects = [lsi_model[x] for x in corpus]
        index = Similarity(None, corpus_vects,
                           num_features=self.kwargs["lsi_parameters"]["num_topics"])
        rules_similarity = list(zip(rules, index[txt_vect]))
        return rules_similarity
    except Exception:
        return None
def tdif_metrics(corpus_path: str) -> None:
    prep = DSSMPrepare()
    raw_ques, raw_docs, rels = prep.from_one_corpus(corpus_path)
    docs = [[w.lower() for w in word_tokenize(text)] for text in raw_docs.values()]
    ques = [[w.lower() for w in word_tokenize(text)] for text in raw_ques.values()]
    docs = docs + ques
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    tf_idf = TfidfModel(corpus)
    right = {}
    for did, doc_text in raw_docs.items():
        dense_input = [w.lower() for w in word_tokenize(doc_text)]
        dense_input = dictionary.doc2bow(dense_input)
        dense_input = tf_idf[dense_input]
        right[did] = dense_input
    left = {}
    for qid, ques_text in raw_ques.items():
        # was word_tokenize(doc_text), which reused the last document text
        dense_input = [w.lower() for w in word_tokenize(ques_text)]
        dense_input = dictionary.doc2bow(dense_input)
        dense_input = tf_idf[dense_input]
        left[qid] = dense_input
    relations = pd.DataFrame(rels, columns=['label', 'id_left', 'id_right'])
    res = {}
    res['MAP'] = 0.0
    res['NDCG@3'] = 0.0
    res['NDCG@5'] = 0.0
    num_valid = 0
    for group in relations.groupby('id_left'):
        qid, data = group
        dids = data['id_right'].values.tolist()
        labels = data['label'].values.tolist()
        # the entries of c are already TF-IDF vectors; re-applying tf_idf would weight them twice
        c = [right[did] for did in dids]
        sims = Similarity('tf_idf', c, num_features=len(dictionary))
        scores = sims[left[qid]]
        rank = list(zip(labels, scores))
        random.shuffle(rank)
        rank = sorted(rank, key=lambda x: x[1], reverse=True)
        rank = [float(r[0]) for r in rank]
        res['MAP'] += average_precision(rank)
        res['NDCG@3'] += ndcg_at_k(rank, 3)
        res['NDCG@5'] += ndcg_at_k(rank, 5)
        num_valid += 1
    click.echo('\t'.join([f"{k}={v / num_valid:.3f}" for k, v in res.items()]))
def calculateAverageSimilarity(singleDocument, arrayOfDocuments):
    gen_docs = [stemAndTokenizeArray(document) for document in arrayOfDocuments]
    dictionary = Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = TfidfModel(corpus)
    sims = Similarity('/tmp/', tf_idf[corpus], num_features=len(dictionary))
    query_doc = stemAndTokenizeArray(singleDocument)
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    similarities = sims[query_doc_tf_idf]
    return sum(similarities) / len(similarities)
def main():
    orig_qns = [qn.strip() for qn in open('data/questions.txt')]
    aug = [qn.strip() for qn in open('data/augmented.txt')]
    all_qns = []
    for idx, qn in tqdm(enumerate(orig_qns)):
        all_qns.append(qn)
        if aug[idx] != qn:
            all_qns.append(aug[idx])
    print("Combined original questions and augmented questions")
    pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))
    qns = pickle.load(open("precompute/questions.pkl", 'rb'))
    documents = []
    for qn in tqdm(qns):
        document = get_similar.preprocess_text(qn)
        if len(document) < 1:
            document = ['UNK']
        documents.append(document)
    print(f"Finished preprocessing {len(documents)} questions")
    pickle.dump(documents, open("precompute/documents.pkl", "wb"))
    print("Saved tokens to documents.pkl")
    documents = pickle.load(open("precompute/documents.pkl", "rb"))
    dct = corpora.Dictionary(documents)
    pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
    dct.save('precompute/dct.dict')
    dct = corpora.Dictionary.load('precompute/dct.dict')
    corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
    pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
    print("Corpus generated")
    tfidf = models.TfidfModel(corpus, smartirs='bfn')
    pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
    corpus_tfidf = tfidf[corpus]
    pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
    print("tfidf generated")
    index_temp = get_tmpfile("index")
    index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
    index.save("precompute/similarities.pkl")
    print("Similarity index saved")
    PIPE = subprocess.PIPE
    # NLU = subprocess.Popen(['rasa', 'train', '--data', 'nlu-train-data', '--fixed-model-name', 'model', '-vv', 'nlu'], stdout=PIPE, stderr=PIPE)
    NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data',
                            '--config', 'config.yml', '--fixed-model-name', 'model'])
    NLU.wait()
    print("Rasa NLU trained")
def cosine_similarity_no_syn(self):
    print("Cosine Similarity without synsets")
    cos_sim = []
    for data in self.data:
        sent1 = [word[0] for word in data[1]]
        sent2 = [word[0] for word in data[2]]
        text = [sent1] + [sent2]
        sent_dict = corpora.Dictionary(text)
        corpus = [sent_dict.doc2bow(t) for t in text]
        sim = Similarity('-Similarity-index', corpus, num_features=len(sent_dict))
        test_corpus_1 = sent_dict.doc2bow(sent1)
        cos_sim_each = sim[test_corpus_1][1]
        cos_sim.append(cos_sim_each)
    self.feature['cos_sim_no_syn'] = cos_sim
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph IDs to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')
    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')
    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')
    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus], len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
def get_docsim_feature(contents, remarks=""):
    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks
    corpora_documents = []
    tokenizer = Tokenizer()
    for item_text in contents:
        item_str = tokenizer(item_text)
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    similarity = Similarity('-Similarity-index', corpus, num_features=300)
    similarity.num_best = 3
    pickle.dump(dictionary, open(dictionary_path, "wb"), protocol=4)
    pickle.dump(corpus, open(corpus_path, "wb"), protocol=4)
    return similarity, corpus
def create_sim_matrix(tfidf, corpus, dictionary, outputDir):
    """Creates a gensim similarity index for document similarity comparison and saves it.

    tfidf (gensim TfidfModel): gensim TF-IDF model
    corpus (gensim corpus object): gensim corpus
    dictionary (gensim Dictionary object): gensim dictionary
    outputDir (string): location to save the index
    """
    indicesFile = outputDir + 'indices'
    simFile = outputDir + 'Index'
    sims = Similarity(indicesFile, tfidf[corpus], num_features=len(dictionary))
    sims.close_shard()
    sims.save(simFile)
    print('Similarity matrix created and stored at: ' + simFile)
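# A hedged usage sketch for create_sim_matrix(); the toy texts and output
# prefix below are illustrative, not part of the original pipeline.
def demo_create_sim_matrix():
    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel
    texts = [["red", "green"], ["green", "blue"], ["red", "blue"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    tfidf = TfidfModel(corpus)
    # writes shards under /tmp/simdemo_indices* and the index to /tmp/simdemo_Index
    create_sim_matrix(tfidf, corpus, dictionary, '/tmp/simdemo_')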