def getSparseMatrixSimilarity(keyword, texts):
    # 1. Tokenize the text collection into lists of words
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the tokenized texts and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector with the same dictionary
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create a TF-IDF model trained on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Apply the trained TF-IDF model to the searchable texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus itself serves as the searchable texts
    tf_kw = tfidf[kw_vector]

    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('Similarity between kw and text%d: %.2f' % (e, s))
    print(sparse_matrix)
    print(similarities)
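# Hedged usage sketch (not from the original source): the snippet above assumes that
# jieba and gensim's Dictionary, TfidfModel and SparseMatrixSimilarity are imported.
# A minimal, self-contained call might look like this; the sample sentences are the
# example strings that appear later in this collection.
import jieba
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

sample_texts = [
    '吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
    '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
    '我吃鸡翅,你吃鸡腿',
]
getSparseMatrixSimilarity('吃鸡', sample_texts)  # prints a similarity score per text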
def init_model(self):
    corpus = []
    for text in tqdm(self.texts, desc="make corpus (BoW)"):
        corpus.append(self.parse(text))
    self.model = TfidfModel(corpus)
    self.index = SparseMatrixSimilarity(self.model[corpus],
                                        num_features=len(self.vocab))
def load(conf: Configuration, force: Optional[bool] = False,
         persist: Optional[bool] = True) -> "TFIDFRanker":
    model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'corpus.mm')) \
            or (not os.path.isfile(model_path + 'tfidf.model')):
        utils.mk_dir_if_not_exists(model_path)
        dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
        dictionary = corpora.Dictionary([Ranker.get_text(conf, data)
                                         for (index, data) in dataset.iterrows()])
        bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                      for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = TFIDFRanker.build_index_mapping(names)
        corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                             num_features=mm_corpus.num_terms)
        ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                             model=tfidf_model, index=tfidf_index,
                             index_mapping=index_mapping, conf=conf)
        ranker.persist(model_path)
        logging.info('TFIDFRanker : initialized')
        logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
        logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
        return ranker
    else:
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
        logging.info('TFIDFRanker : initialized')
        return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                           model=tfidf_model, index=tfidf_index,
                           index_mapping=index_mapping, conf=conf)
def mergeTags():
    res = {}  # empty dict collecting {merged tag: similarity}
    for i in range(len(displayArr)):
        texts = default_tags
        keyword = displayArr[i]
        # 1. Tokenize the text collection into lists of words
        texts = [lcut(text) for text in texts]
        # 2. Build a dictionary and get the number of features
        dictionary = Dictionary(texts)
        num_features = len(dictionary.token2id)
        # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # 3.2 Likewise, convert the keyword into a sparse vector
        kw_vector = dictionary.doc2bow(lcut(keyword))
        # 4. Train a TF-IDF model on the corpus
        tfidf = TfidfModel(corpus)
        # 5. Apply the model to the searchable texts and the keyword
        tf_texts = tfidf[corpus]  # the corpus itself serves as the searchable texts
        tf_kw = tfidf[kw_vector]
        # 6. Compute similarities
        sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
        similarities = sparse_matrix.get_similarities(tf_kw)
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                # print(keyword, 'and', ''.join(texts[e - 1]), 'similarity:', s)
                key = ''.join(texts[e - 1]).strip()
                res[key] = s
        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        for ind, (k, v) in enumerate(arrSorted):
            if ind == 0:
                ids = textsOld[i].strip().split('.')[0]
                textsOld[i] = textsOld[i] + '----------' + k
                # textsOld[i] = ids + '.' + k
        res = {}  # reset the dict for the next tag
    return textsOld
def samilarRate(texts, keyword):
    # texts: the text collection, keyword: the search term
    # 1. Tokenize the text collection into lists of words
    texts = [lcut(text) for text in texts]
    # 2. Build a dictionary and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))
    # 4. Train a TF-IDF model on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the model to the searchable texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus itself serves as the searchable texts
    tf_kw = tfidf[kw_vector]
    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    result = []
    sorft = []
    for e, s in enumerate(similarities, 1):
        result.append('Similarity between kw and text%d: %.2f' % (e, s))
        sorft.append(s)
    return result, sorft
def check(news):
    """Check whether the news item duplicates an existing one."""
    dictionary, corpus, num_features = Similar.dictionary()
    kw_vector = dictionary.doc2bow(lcut(news))
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    tf_kw = tfidf[kw_vector]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        if 0.6 < s < 0.98:
            return  # treated as a duplicate: return None
    return news
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus], num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
def fit(self):
    """
    Fit the TF-IDF model.

    Each argument should be a list of lists, where each inner list is a list
    of keyphrases, e.g.:

        submission_kps = [
            ['deep_learning', 'natural_language_processing'],
            ['neural_imaging', 'fmri', 'functional_magnetic_resonance']
        ]
    """
    self.bow_archives_by_paperid = {
        userid: [self.dictionary.doc2bow(doc) for doc in archive]
        for userid, archive in self.kp_archives_by_paperid.items()}
    self.bow_archives_by_userid = {
        userid: [self.dictionary.doc2bow(doc) for doc in archive]
        for userid, archive in self.kp_archives_by_userid.items()}

    flattened_archives = [bow for archive in self.bow_archives_by_paperid.values()
                          for bow in archive]

    self.index = SparseMatrixSimilarity(
        [self.tfidf[bow] for bow in flattened_archives],
        num_features=len(self.dictionary))
def create_index(self, docs_with_urls):
    logger.info("Creating index out of {} documents".format(len(docs_with_urls)))
    urls, doc_bows = zip(*self.infer_all(docs_with_urls))
    self.urls = urls
    self.index = SparseMatrixSimilarity(doc_bows,
                                        num_features=len(self.dictionary))
def get_sim(previous_req, new_req, flag):
    # Dictionary built from both the cosmic and non-cosmic information
    dictionary = corpora.Dictionary.load_from_text('./data/total.dic')
    # print('new_req.shape:', new_req.shape, 'raw_info.shape:', raw_info.shape,
    #       'previous_req.shape:', previous_req.shape)
    corpus = [dictionary.doc2bow(text) for text in previous_req[flag]]
    tfidf = models.TfidfModel(corpus)
    index = SparseMatrixSimilarity(tfidf[corpus], 4000)
    new = [dictionary.doc2bow(t) for t in new_req['joint_info']]
    sim_dict = {}
    sim = index[new]
    relation_exist = []
    for i in range(new_req.shape[0]):
        key = (new_req['batch'].iloc[i], new_req['projectNo'].iloc[i],
               new_req['requirementNO'].iloc[i])
        value = {}
        current = {key, }
        # print(key)
        for j in range(len(sim[i])):
            if sim[i][j] >= 0.8:
                inner_key = (previous_req['batch'].iloc[j],
                             previous_req['projectNo'].iloc[j],
                             previous_req['requirementNO'].iloc[j])
                if inner_key == key:
                    continue
                else:
                    value[inner_key] = sim[i][j]
                    current.add(inner_key)
        if value and (current not in relation_exist):
            relation_exist.append(current)
            sim_dict[key] = value
    return sim_dict, len(relation_exist)
def __tfidf_sim_match(self, query, ans_list, threshold=0.10):
    cut_query, _ = self.reader.clean_cut_trim([query])  # clean and tokenize the query
    cut_ans_list, _ = self.reader.clean_cut_trim(ans_list)  # clean and tokenize the answer list
    ans_bow = [self.tfidf_dict.doc2bow(line) for line in cut_ans_list]  # bag of words for the answer list
    text_tfidf = self.tfidf_model[ans_bow]  # apply the TF-IDF model
    sim_index = SparseMatrixSimilarity(text_tfidf, self.n_features)
    query_bow = [self.tfidf_dict.doc2bow(cut_query[0])]  # bag of words for the query
    query_tfidf = self.tfidf_model[query_bow]  # encode the query with the TF-IDF model
    similarities = sim_index[query_tfidf][0]  # compute similarities
    sorted_scores = sorted(similarities, reverse=True)  # scores in descending order
    max_pos = np.argsort(similarities)[::-1]  # indices sorted from high to low (not the values)
    answers = self.__max_pos2answers(max_pos, ans_list)
    # Filter the results once more with the QQ-match threshold
    sorted_scores, max_pos, answers, questions = \
        self.__filter_by_threshold(sorted_scores, max_pos, answers, [], threshold)
    if len(answers) > 0:
        return True
    else:
        return False
async def load_index():
    if 'index' not in model:
        index_file = await tasks['index']
        model['index'] = SparseMatrixSimilarity.load(index_file)
        for shard in model['index'].shards:
            shard.dirname = os.path.dirname(index_file)
    return model['index']
def mergeTags(textArr):
    res = []
    for i in range(len(displayArr)):
        try:
            exampleArr = textArr
            if i == 0:
                texts = textArr
            else:
                for item in res:
                    if item in exampleArr:
                        exampleArr.remove(item)
                texts = exampleArr
                res = []
            # print(exampleArr)
            # Text collection and search term, e.g.:
            # texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
            #          '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
            #          '我吃鸡翅,你吃鸡腿']
            keyword = texts[i]
            # 1. Tokenize the text collection into lists of words
            texts = [lcut(text) for text in texts]
            # 2. Build a dictionary and get the number of features
            dictionary = Dictionary(texts)
            num_features = len(dictionary.token2id)
            # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
            corpus = [dictionary.doc2bow(text) for text in texts]
            # 3.2 Likewise, convert the keyword into a sparse vector
            kw_vector = dictionary.doc2bow(lcut(keyword))
            # 4. Train a TF-IDF model on the corpus
            tfidf = TfidfModel(corpus)
            # 5. Apply the model to the searchable texts and the keyword
            tf_texts = tfidf[corpus]  # the corpus itself serves as the searchable texts
            tf_kw = tfidf[kw_vector]
            # 6. Compute similarities
            sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
            similarities = sparse_matrix.get_similarities(tf_kw)
            for e, s in enumerate(similarities, 1):
                if s > 0.5:
                    res.append(exampleArr[e - 1])
                    print(keyword + ' and ' + exampleArr[e - 1] + ' similarity:', s)
            print('---------------------------------------------------')
        except:
            print('')
    print('Merging finished!')
def predict_tfid(pred_data, data):
    path = input("Enter path to LDA model: ")
    tfid = gensim.models.TfidfModel.load(path + "tfid_model")
    corpus = MmCorpus(path + "tfid_corpus.mm")
    tfid_corpus = tfid[corpus]
    new_dictionary = Dictionary(data['tokens'])
    new_corpus = [new_dictionary.doc2bow(doc) for doc in data['tokens']]
    index_sparse = SparseMatrixSimilarity(tfid_corpus, num_features=corpus.num_terms)
    index_sparse.num_best = 500
    idx = index_sparse[new_corpus]
    print("Most Similar users are as follows: ")
    print("Name\t\t\tscore ")
    m = 1
    for i in idx[0]:
        display("{}. {} {}".format(m, data.iloc[i[0]]['handles'], i[1]))
        m += 1
    return
def build_model(self, listoftextlist):
    self.dictionary = Dictionary(listoftextlist)
    self.num_features = len(self.dictionary.token2id)
    self.corpus = [self.dictionary.doc2bow(text) for text in listoftextlist]
    self.tfidf = TfidfModel(self.corpus)
    self.index = SparseMatrixSimilarity(self.tfidf[self.corpus], self.num_features)
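# Hedged query sketch (not part of the original class): once build_model() has run on
# some instance (here a hypothetical object named `searcher`), a new tokenized text can
# be scored against the indexed corpus using only the gensim calls shown above.
query_bow = searcher.dictionary.doc2bow(lcut('吃鸡'))  # lcut tokenizer as used elsewhere in this collection
scores = searcher.index[searcher.tfidf[query_bow]]     # one cosine score per indexed text
best_match = scores.argmax()                           # position of the most similar text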
def create_neg_from_cluster(data, cluster_id, all_clusters):
    '''
    Creates negative pairs from a cluster
    '''
    # Get the cluster
    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)

    pairs = []
    hard_neg = len(cluster) // 2

    # Hard negatives are those that are from different clusters, but we get the pair with the highest similarity
    for row in range(hard_neg):
        # Keep choosing random titles until we get one that is not our own
        neg_cluster_id = cluster_id
        while neg_cluster_id == cluster_id:
            neg_cluster_id = random.choice(all_clusters)

        # Extract data about this cluster
        neg_cluster = data.loc[data["cluster_id"].values == neg_cluster_id].copy()
        neg_cluster = extract_key_features(neg_cluster)

        # Add the current title of the cluster to the beginning of this random cluster so that
        # the first row in the similarity matrix will refer to this title
        neg_cluster = pd.concat([pd.DataFrame([cluster.iloc[row].values],
                                              columns=["id", "description", "title", "titleDesc"]),
                                 neg_cluster])

        # Get the similarity between the title and the random cluster
        dictionary = corpora.Dictionary(neg_cluster["titleDesc"])
        neg_cluster_dict = [dictionary.doc2bow(title)
                            for title in neg_cluster["title"].map(lambda x: x.split(" "))]
        sim_matrix = np.array(SparseMatrixSimilarity(neg_cluster_dict, num_features=len(dictionary)))

        # First row is the similarity between the current title and the rest of the random cluster
        # so get the max similarity of this (+1 is because we don't include the similarity with ourself)
        max_val = sim_matrix[0][1:].argmax() + 1

        # Add the pair
        pair = [cluster["title"].iloc[row], neg_cluster["title"].iloc[max_val], 0]
        pairs.append(pair)

    for row in range(hard_neg, len(cluster)):
        # Keep choosing random titles until we get one that is not our own
        neg_cluster_id = cluster_id
        while neg_cluster_id == cluster_id:
            neg_cluster_id = random.choice(all_clusters)

        # Randomly get a title from the random cluster
        neg_cluster = data.loc[data["cluster_id"].values == neg_cluster_id].copy()
        neg_cluster = extract_key_features(neg_cluster)
        neg_title = neg_cluster["title"].iloc[random.choice(list(range(len(neg_cluster))))]

        # Add the pair
        pair = [cluster["title"].iloc[row], neg_title, 0]
        pairs.append(pair)

    return pd.DataFrame(pairs, columns=["title_one", "title_two", "label"])
def load(self, dir_path):
    dir_path = Path(dir_path)
    vocab_path = str(dir_path / self.VOCAB_FNAME)
    model_path = str(dir_path / self.TFIDF_FNAME)
    index_path = str(dir_path / self.INDEX_FNAME)
    self.vocab = Dictionary.load(vocab_path)
    self.model = TfidfModel.load(model_path)
    self.index = SparseMatrixSimilarity.load(index_path)
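# A matching save() is not shown in the source; this is a minimal sketch assuming the
# same VOCAB_FNAME / TFIDF_FNAME / INDEX_FNAME attributes and gensim's standard save() API.
def save(self, dir_path):
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    self.vocab.save(str(dir_path / self.VOCAB_FNAME))   # Dictionary.save
    self.model.save(str(dir_path / self.TFIDF_FNAME))   # TfidfModel.save
    self.index.save(str(dir_path / self.INDEX_FNAME))   # SparseMatrixSimilarity.save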
def similar(aim):
    aim_text = aim.title + aim.abstract
    simple = [x.title + x.abstract for x in ret[0:-10]]
    text = [set(posseg.lcut(x)) for x in simple]
    text = list({y for x in text for y in x})
    dictionary = Dictionary(text)
    length = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(lcut(src)) for src in simple]
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, length)
    vector = dictionary.doc2bow(lcut(aim_text))
    tf_kw = tfidf[vector]
    similarities = sparse_matrix.get_similarities(tf_kw)
    print(aim.title)
    for e, s in enumerate(similarities, 1):
        if s > 0.1:
            print(s, ret[e - 1].title)
def create_pos_from_cluster(data, cluster_id):
    '''
    Creates positive pairs from a cluster
    '''
    MAX_PAIRS = 16
    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    max_combos = combinations(len(cluster), 2)

    dictionary = corpora.Dictionary(cluster["titleDesc"])
    cluster_dict = [dictionary.doc2bow(title)
                    for title in cluster["title"].map(lambda x: x.split(" "))]
    sim_matrix = np.array(SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary)))

    # Because the matrix is redundant (the rows and columns represent the same titles)
    # we set the bottom half of the similarities (including the diagonal) to 100
    # so that we don't have to worry about them when doing argmin()
    for row in range(sim_matrix.shape[0]):
        for column in range(sim_matrix.shape[1]):
            if row >= column:
                sim_matrix[row][column] = 100

    # If the maximum amount of combinations we can make is less than our set max,
    # set the maximum to the max combos
    if max_combos < MAX_PAIRS:
        MAX_PAIRS = max_combos

    # Half of the pairs should be hard positives and the other half random
    hard_pos = MAX_PAIRS // 2
    random_pos = MAX_PAIRS - hard_pos
    pairs = []

    # Hard positives are those that are from the same cluster, but with the least similarity
    for x in range(hard_pos):
        # Keep getting the pairs with the lowest similarity score
        min_sim = np.unravel_index(sim_matrix.argmin(), sim_matrix.shape)
        pair = [cluster["title"].iloc[min_sim[0]], cluster["title"].iloc[min_sim[1]], 1]
        pairs.append(pair)
        sim_matrix[min_sim[0]][min_sim[1]] = 100

    # The amount of available pairs (given that some are gone from hard positive creation)
    avail_indices = np.argwhere(sim_matrix != 100)

    # Get random pairs within the same cluster
    for x in range(random_pos):
        ran_idx = random.sample(list(range(len(avail_indices))), 1)
        choice = avail_indices[ran_idx][0]
        pair = [cluster["title"].iloc[choice[0]], cluster["title"].iloc[choice[1]], 1]
        pairs.append(pair)
        avail_indices = np.delete(avail_indices, ran_idx, 0)

    return pd.DataFrame(pairs, columns=["title_one", "title_two", "label"])
def index(self, corpus, mode="MatrixSimilarity"):
    if mode == "MatrixSimilarity":
        self._index = MatrixSimilarity(self.corpus, num_features=self.num_features)
    elif mode == "SparseMatrixSimilarity":
        self._index = SparseMatrixSimilarity(self.corpus, num_features=self.num_features)
    else:
        raise TypeError("mode has to be either MatrixSimilarity or SparseMatrixSimilarity")
    return self._index[corpus]
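# Hedged usage sketch: `ranker` is a hypothetical instance whose .corpus and .num_features
# were populated beforehand. MatrixSimilarity keeps the index as a dense array, while
# SparseMatrixSimilarity stores it sparsely, which is usually preferable for
# high-dimensional bag-of-words / TF-IDF vectors.
sims_dense = ranker.index(ranker.corpus, mode="MatrixSimilarity")
sims_sparse = ranker.index(ranker.corpus, mode="SparseMatrixSimilarity")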
def to_sparse_matrix(self):
    # 1. Tokenize the text collection into lists of words
    # texts = [lcut(text) for text in texts_list]
    texts = [lcut(str(text)) for text in self.texts_list]
    # 2. Build a dictionary and get the number of features
    dictionary = Dictionary(texts)
    self.dictionary = dictionary
    num_features = len(dictionary.token2id)
    # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 4. Train a TF-IDF model on the corpus
    tfidf = TfidfModel(corpus)
    self.tfidf = tfidf
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    return sparse_matrix
def get_sim(all_reqs, new_req):
    # Dictionary built from both the cosmic and non-cosmic information
    dictionary = corpora.Dictionary.load_from_text('./data/total.dic')
    corpus = [dictionary.doc2bow(text) for text in all_reqs['joint_info']]
    # Build the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    # Build the sparse-matrix similarity index
    index = SparseMatrixSimilarity(tfidf[corpus], 4000)

    # Nested dict holding the similarity results, in the form
    # {new_req_id1: {sim_req_id1: sim_value, ...}, ...}
    # where each id is a tuple of (type, batch, projectNO, requirementNO, file_trail)
    sim_dict = {}

    # First convert new_req into bag-of-words vectors with `dictionary`, then into TF-IDF
    # vectors with `tfidf`; feeding those into `index` (the SparseMatrixSimilarity model)
    # yields the similarity matrix `sim` of size [len(new_req) x len(all_reqs)]
    sim = index[tfidf[[dictionary.doc2bow(t) for t in new_req['joint_info']]]]
    print('Shape of the similarity matrix:', sim.shape)

    relation_exist = []
    for i in range(sim.shape[0]):
        key = (new_req['type'].iloc[i], new_req['batch'].iloc[i], new_req['projectNO'].iloc[i],
               new_req['requirementNO'].iloc[i], new_req['file_trail'].iloc[i])
        # print(key)
        value = {}
        for j in range(sim.shape[1]):
            if sim[i][j] >= 0.86:
                inner_key = (all_reqs['type'].iloc[j], all_reqs['batch'].iloc[j],
                             all_reqs['projectNO'].iloc[j], all_reqs['requirementNO'].iloc[j],
                             all_reqs['file_trail'].iloc[j])
                if inner_key == key:
                    continue
                else:
                    # current_relation is the (unordered) relation pair
                    current_relation = {key, inner_key}
                    if current_relation not in relation_exist:
                        value[inner_key] = sim[i][j]
                        relation_exist.append(current_relation)
        if value:
            sim_dict[key] = value
    return sim_dict, len(relation_exist)
def compute_section_text_similarity(corpus: Sequence[str]):
    """Convert the text bundle into TF-IDF vectors and return the mean pairwise similarity."""
    # Use generators to increase memory efficiency
    def tokenized_corpus() -> typing.Generator[list[str], None, None]:
        yield from (tokenize(doc) for doc in corpus)

    def bag_of_words_corpus(
            dct: corpora.Dictionary
    ) -> typing.Generator[list[tuple[int, int]], None, None]:
        yield from (dct.doc2bow(doc) for doc in tokenized_corpus())

    word_id_map = corpora.Dictionary(tokenized_corpus())
    tfidf = models.TfidfModel(bag_of_words_corpus(word_id_map), dictionary=word_id_map)
    similarity_index = SparseMatrixSimilarity(
        tfidf[bag_of_words_corpus(word_id_map)],
        num_features=len(word_id_map))

    # Keep only the upper triangle of the pairwise similarity matrix
    # (each unordered pair once, self-similarities excluded)
    pairwise_similarity = [
        simi for idx, similarities in enumerate(similarity_index)
        for simi in similarities[idx + 1:]
    ]
    return sum(pairwise_similarity) / len(pairwise_similarity)
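# Hedged usage sketch: the sample sentences are made up, and tokenize() is whatever
# tokenizer the surrounding module defines (an assumption here).
sections = [
    "the cat sat on the mat",
    "a cat was sitting on the mat",
    "stock prices fell sharply on monday",
]
print(compute_section_text_similarity(sections))  # mean pairwise TF-IDF cosine similarity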
dictionary = corpora.Dictionary(worked_texts)
# convert the dictionary to a bag-of-words corpus for reference
corpus = [dictionary.doc2bow(worked_text) for worked_text in worked_texts]

lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=12)

query = "April is the fourth month of the year, and comes between March \
and May. It has 30 days. April begins on the same day of week as July in \
all years and also January in leap years."
query = "IF YOU'RE LIKE most iPhone users, when you upgraded to the newest version of iOS, Apple automatically migrated your settings, apps, and text messages. While there are benefits to wiping your phone and starting over—c'mon, you don't really need all those apps—there's also the possibility that you might lose valuable info hidden within your text messages."

tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, Similarity

index_sparse = SparseMatrixSimilarity(corpus, num_features=len(dictionary))

# note: sklearn.externals.joblib was removed in newer scikit-learn releases;
# with a recent version, `import joblib` and call joblib.dump directly instead
import sklearn
sklearn.externals.joblib.dump(tfidf_model, 'tfidf_model.pkl')
sklearn.externals.joblib.dump(lsi_model, 'lsi_model.pkl')
sklearn.externals.joblib.dump(texts, 'texts.pkl')
sklearn.externals.joblib.dump(worked_texts, 'worked_texts.pkl')
sklearn.externals.joblib.dump(titles, 'titles.pkl')
sklearn.externals.joblib.dump(dictionary, 'dictionary.pkl')
sklearn.externals.joblib.dump(corpus, 'corpus.pkl')
sklearn.externals.joblib.dump(index_sparse, 'index_sparse.pkl')
def __init__(self):
    # Load the stopword list
    with open(FilePool.stopword_txt, 'r') as f_stopword:
        doc = f_stopword.readlines()
    self.stopwords = [line.rstrip('\n') for line in doc]

    # Load the answers
    if args.answer_base == 'long':
        # use long answers
        ans_json = FilePool.long_answers_json
        ans_txt = FilePool.long_answers_txt
    elif args.answer_base == 'cleaned':
        # use short (cleaned) answers
        ans_json = FilePool.cleaned_answers_json
        ans_txt = FilePool.cleaned_answers_txt
    else:
        # use small answers
        ans_json = FilePool.small_answers_json
        ans_txt = FilePool.small_answers_txt
    with open(ans_json, 'r') as f_json:
        text = json.load(f_json)
        if args.trim_stop:
            self.cut_answers = [[ele for ele in answer if ele not in self.stopwords]
                                for answer in text]
        else:
            self.cut_answers = text
    with open(ans_txt, 'r') as f_ans_txt:
        text = f_ans_txt.readlines()
        self.uncut_answers = [line.rstrip('\n') for line in text]

    # Load the QA base and the list of known base questions
    if args.method == Method.mix or args.method == Method.qq_match:
        with open(FilePool.qa_file, 'r') as f_qa:
            self.qa = json.load(f_qa)
        with open(FilePool.base_ques_list_file, 'r') as f_base_ques_list:
            self.base_ques_list = json.load(f_base_ques_list)

    # Instantiate the BM25 model ahead of time to improve performance.
    # Even when questions have been pre-classified, instantiate it anyway as a
    # fallback for questions whose category is empty.
    if args.method == Method.bm25 or args.method == Method.bm25_syn:
        self.bm25_model_uncat = BM25(self.cut_answers)
    if args.method == Method.mix or args.method == Method.bm25_new:
        self.bm25_model_uncat = NewBM25(self.cut_answers)

    # Instantiate the TF-IDF model ahead of time to improve performance
    if args.method == Method.mix or args.method == Method.qq_match:
        self.tfidf_dict = Dictionary(self.base_ques_list)  # fit dictionary
        n_features = len(self.tfidf_dict.token2id)
        bow = [self.tfidf_dict.doc2bow(line) for line in self.base_ques_list]  # convert corpus to BoW format
        # build the TF-IDF model
        self.tfidf_model = TfidfModel(bow)  # fit model
        text_tfidf = self.tfidf_model[bow]  # apply model
        self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)
    elif args.method == Method.tfidf_sim:
        self.tfidf_dict = Dictionary(self.cut_answers)  # fit dictionary
        n_features = len(self.tfidf_dict.token2id)
        bow = [self.tfidf_dict.doc2bow(line) for line in self.cut_answers]  # convert corpus to BoW format
        # build the TF-IDF model
        self.tfidf_model = TfidfModel(bow)  # fit model
        text_tfidf = self.tfidf_model[bow]  # apply model
        self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)

    # Instantiate the Stanford dependency parser
    self.parser = StanfordDependencyParser(path_to_jar=FilePool.stanford_parser,
                                           path_to_models_jar=FilePool.stanford_chinese_model)
def build_positive_pairs(corpus, clusters, attribute, num_pos):
    pos_pairs = []
    for current_cluster in tqdm(clusters):
        cluster_data = corpus[corpus['cluster_id'] == current_cluster]

        # build gensim dictionary, corpus and search index for selected cluster
        dct = Dictionary(cluster_data[attribute], prune_at=5000000)
        dct.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
        gensim_corpus = [dct.doc2bow(text) for text in cluster_data[attribute]]
        index = SparseMatrixSimilarity(gensim_corpus, num_features=len(dct), num_best=80)

        # query up to 80 most similar offers, only offers with similarity > 0 will be returned
        query = index[gensim_corpus]

        for i, offer_sim_dup in enumerate(query):
            current_num_pos = num_pos
            current_id = cluster_data.iloc[i]['id']

            # remove self
            offer_sim = []
            for x in offer_sim_dup:
                if x[0] != i:
                    offer_sim.append(x)

            # check if any pairs > 0 similarity remain
            if len(offer_sim) == 0:
                pos_pairs.append((current_id, [[], []]))
                continue

            # adapt number of selectable pairs if too few available
            offer_len = len(offer_sim)
            if offer_len < current_num_pos:
                current_num_pos = offer_len

            if current_num_pos == 1:
                hard_pos = 1
                random_pos = 0
            elif current_num_pos % 2 == 1:
                hard_pos = int(current_num_pos / 2) + 1
                random_pos = int(current_num_pos / 2)
            else:
                hard_pos = int(current_num_pos / 2)
                random_pos = int(current_num_pos / 2)

            # get hard offers from bottom of list
            hard_offers = offer_sim[-hard_pos:]

            if random_pos == 0:
                pos_pairs.append((current_id,
                                  [[cluster_data.iloc[x[0]]['id'] for x in hard_offers], []]))
                continue

            # remaining offers
            rest = offer_sim[:-hard_pos]

            # randomly select from remaining
            random_select = random.sample(range(len(rest)), random_pos)
            random_offers = [rest[idx] for idx in random_select]

            hard_ids = [cluster_data.iloc[x[0]]['id'] for x in hard_offers]
            random_ids = [cluster_data.iloc[x[0]]['id'] for x in random_offers]
            pos_pairs.append((current_id, [hard_ids, random_ids]))

    return pos_pairs
else:
    """Build the dictionary and get the number of features"""
    dictionary = corpora.Dictionary(diff_word_list)
    feature_cnt = len(dictionary.token2id.keys())
    """Convert the tokenized lists into sparse vectors based on the dictionary"""
    corpus = [dictionary.doc2bow(codes) for codes in diff_word_list]
    # print("key")
    # print([x for x in word_list if x not in stopwords])
    kw_vector = dictionary.doc2bow([x for x in word_list if x not in stopwords])
    """Create a TF-IDF model and train it on the corpus"""
    tfidf = TfidfModel(corpus)
    """Apply the trained TF-IDF model to the searchable texts and the keyword"""
    tf_texts = tfidf[corpus]
    tf_kw = tfidf[kw_vector]
    """Compute similarities"""
    sparse_matrix = SparseMatrixSimilarity(tf_texts, feature_cnt)
    similarities = sparse_matrix.get_similarities(tf_kw)
    # print("similarities")
    # print(similarities)
    # for e, s in enumerate(similarities, 1):
    #     print('Similarity between kw and text%d: %.2f' % (e, s))
    conceptualSimilarity.append(max(similarities))

    """keyword ratio"""
    keywordsInComments = [x for x in word_list if x in languageKeyWords]
    stopKeyRatio.append(keywordsInComments.__len__() / word_list.__len__())

print(readable)
print(max(readable), min(readable))
def fit(self, source, target, sourcetext='text', sourcedate='publication_date', targettext='text', targetdate='publication_date', keyword_source=None, keyword_target=None, keyword_source_must=False, keyword_target_must=False, condition_source=None, condition_target=None, days_before=None, days_after=None, merge_weekend=False, threshold=None, from_time=None, to_time=None, to_csv=False, destination='comparisons', to_pajek=False, filter_above=0.5, filter_below=5): ''' source/target = doctype of source/target (can also be a list of multiple doctypes) sourcetext/targettext = field where text of target/source can be found (defaults to 'text') sourcdate/targetedate = field where date of source/target can be found (defaults to 'publication_date') keyword_source/_target = optional: specify keywords that need to be present in the textfield; list or string (lowercase) keyword_source/_target_must = optional: In case of a list, do all keywords need to appear in the text (logical AND) or does at least one of the words need to be in the text (logical OR). Defaults to False (logical OR) condition_source/target = optional: supply the field and its value as a dict as a condition for analysis, e.g. {'topic':1} (defaults to None) days_before = days target is before source (e.g. 2); days_after = days target is after source (e.g. 2) -> either both or none should be supplied. Additionally, merge_weekend = True will merge articles published on Saturday and Sunday. threshold = threshold to determine at which point similarity is sufficient; if supplied only the rows who pass it are included in the dataset from_time, to_time = optional: specifying a date range to filter source and target articles. Supply the date in the yyyy-MM-dd format. to_csv = if True save the resulting data in a csv file - otherwise a pandas dataframe is returned destination = optional: where should the resulting datasets be saved? 
(defaults to 'comparisons' folder) to_pajek = if True save - in addition to csv/pickle - the result (source, target and similarity score) as pajek file to be used in the Infomap method (defaults to False) - not available in combination with days_before/days_after parameters filter_above = Words occuring in more than this fraction of all documents will be filtered filter_below = Words occuring in less than this absolute number of docments will be filtered ''' now = time.localtime() logger.info( "The results of the similarity analysis could be inflated when not using the recommended text processing steps (stopword removal, punctuation removal, stemming) beforehand" ) #Construct source and target queries for elasticsearch if isinstance(source, list): # multiple doctypes source_query = { 'query': { 'bool': { 'filter': { 'bool': { 'must': [{ 'terms': { 'doctype': source } }] } } } } } elif isinstance(source, str): # one doctype source_query = { 'query': { 'bool': { 'filter': { 'bool': { 'must': [{ 'term': { 'doctype': source } }] } } } } } if isinstance(target, list): # multiple doctypes target_query = { 'query': { 'bool': { 'filter': { 'bool': { 'must': [{ 'terms': { 'doctype': target } }] } } } } } elif isinstance(target, str): # one doctype target_query = { 'query': { 'bool': { 'filter': { 'bool': { 'must': [{ 'term': { 'doctype': target } }] } } } } } #Change query if date range was specified source_range = {'range': {sourcedate: {}}} target_range = {'range': {targetdate: {}}} if from_time: source_range['range'][sourcedate].update({'gte': from_time}) target_range['range'][targetdate].update({'gte': from_time}) if to_time: source_range['range'][sourcedate].update({'lte': to_time}) target_range['range'][targetdate].update({'lte': to_time}) if from_time or to_time: source_query['query']['bool']['filter']['bool']['must'].append( source_range) target_query['query']['bool']['filter']['bool']['must'].append( target_range) #Change query if keywords were specified if isinstance(keyword_source, str) == True: source_query['query']['bool']['filter']['bool']['must'].append( {'term': { sourcetext: keyword_source }}) elif isinstance(keyword_source, list) == True: if keyword_source_must == True: for item in keyword_source: source_query['query']['bool']['filter']['bool'][ 'must'].append({'term': { sourcetext: item }}) elif keyword_source_must == False: source_query['query']['bool']['should'] = [] source_query['query']['bool']['minimum_should_match'] = 1 for item in keyword_source: source_query['query']['bool']['should'].append( {'term': { sourcetext: item }}) if isinstance(keyword_target, str) == True: target_query['query']['bool']['filter']['bool']['must'].append( {'term': { targettext: keyword_target }}) elif isinstance(keyword_target, list) == True: if keyword_target_must == True: for item in keyword_target: target_query['query']['bool']['filter']['bool'][ 'must'].append({'term': { targettext: item }}) elif keyword_target_must == False: target_query['query']['bool']['should'] = [] target_query['query']['bool']['minimum_should_match'] = 1 for item in keyword_target: target_query['query']['bool']['should'].append( {'term': { targettext: item }}) #Change query if condition_target or condition_source is specified if isinstance(condition_target, dict) == True: target_query['query']['bool']['filter']['bool']['must'].append( {'match': condition_target}) if isinstance(condition_source, dict) == True: source_query['query']['bool']['filter']['bool']['must'].append( {'match': condition_source}) #Retrieve source and 
target articles as generators source_query = scroll_query(source_query) target_query = scroll_query(target_query) #Make generators into lists and filter out those who do not have the specified keys (preventing KeyError) target_query = [ a for a in target_query if targettext in a['_source'].keys() and targetdate in a['_source'].keys() ] source_query = [ a for a in source_query if sourcetext in a['_source'].keys() and sourcedate in a['_source'].keys() ] #Target and source texts (split) target_text = [] for doc in target_query: target_text.append(doc['_source'][targettext].split()) source_text = [] for doc in source_query: source_text.append(doc['_source'][sourcetext].split()) logger.info('Preparing dictionary') dictionary = Dictionary(source_text + target_text) logger.info( 'Removing all tokens that occur in less than {} documents or in more than {:.1f}% or all documents from dictionary' .format(filter_below, filter_above * 100)) dictionary.filter_extremes(no_below=filter_below, no_above=filter_above) logger.info('Preparing tfidf model') tfidf = TfidfModel(dictionary=dictionary) #extract additional information from sources source_dates = [doc['_source'][sourcedate] for doc in source_query] source_ids = [doc['_id'] for doc in source_query] source_doctype = [doc['_source']['doctype'] for doc in source_query] source_dict = dict(zip(source_ids, source_dates)) source_dict2 = dict(zip(source_ids, source_doctype)) #extract information from targets target_ids = [doc['_id'] for doc in target_query] target_dates = [doc['_source'][targetdate] for doc in target_query] target_dict = dict(zip(target_ids, target_dates)) target_doctype = [doc['_source']['doctype'] for doc in target_query] target_dict2 = dict(zip(target_ids, target_doctype)) #If specified, comparisons compare docs within sliding date window if days_before != None or days_after != None: logger.info('Performing sliding window comparisons...') # merge queries including identifier key for i in source_query: i.update({'identifier': 'source'}) for i in target_query: i.update({'identifier': 'target'}) source_query.extend(target_query) # sourcedate and targetdate need to be the same key (bc everything is done for sourcedate) if targetdate is not sourcedate: logger.info( 'Make sure that sourcedate and targetdate are the same key.' ) else: # convert dates into datetime objects for a in source_query: if isinstance(a['_source'][sourcedate], datetime.date) == True: pass # is already datetime object else: a['_source'][sourcedate] = [ int(i) for i in a['_source'][sourcedate][:10].split("-") ] a['_source'][sourcedate] = datetime.date( a['_source'][sourcedate][0], a['_source'][sourcedate][1], a['_source'][sourcedate][2]) # sort query by date source_query.sort(key=lambda item: item['_source'][sourcedate]) # create list of all possible dates d1 = source_query[0]['_source'][sourcedate] d2 = source_query[-1]['_source'][sourcedate] delta = d2 - d1 date_list = [] for i in range(delta.days + 1): date_list.append(d1 + datetime.timedelta(i)) # create list of docs grouped by date (dates without docs are empty lists) grouped_query = [] for d in date_list: dt = [] for a in source_query: if a['_source'][sourcedate] == d: dt.append(a) grouped_query.append(dt) # Optional: merges saturday and sunday into one weekend group # Checks whether group is Sunday, then merge together with previous (saturday) group. 
if merge_weekend == True: grouped_query_new = [] for group in grouped_query: # if group is sunday, extend previous (saturday) list, except when it is the first day in the data. if group[0]['_source'][sourcedate].weekday() == 6: if not grouped_query_new: grouped_query_new.append(group) else: grouped_query_new[-1].extend(group) # if empty, append empty list elif not group: grouped_query_new.append([]) # for all other weekdays, append new list else: grouped_query_new.append(group) grouped_query = grouped_query_new # Sliding window starts here... How it works: # A sliding window cuts the documents into groups that should be compared to each other based on their publication dates. A list of source documents published on the reference date is created. For each of the target dates in the window, the source list is compared to the targets, the information is put in a dataframe, and the dataframe is added to a list. This process is repeated for each window. We end up with a list of dataframes, which are eventually merged together into one dataframe. len_window = days_before + days_after + 1 source_pos = days_before # source position is equivalent to days_before (e.g. 2 days before, means 3rd day is source with the index position [2]) n_window = 0 for e in tqdm(self.window(grouped_query, n=len_window)): n_window += 1 df_window = [] source_texts = [] source_ids = [] if not 'source' in [ l2['identifier'] for l2 in e[source_pos] ]: pass else: for doc in e[source_pos]: try: if doc['identifier'] == 'source': # create sourcetext list to compare against source_texts.append( doc['_source'][sourcetext].split()) # extract additional information source_ids.append(doc['_id']) except: logger.error( 'This does not seem to be a valid document' ) print(doc) # create index of source texts query = tfidf[[ dictionary.doc2bow(d) for d in source_texts ]] # iterate through targets for d in e: target_texts = [] target_ids = [] for doc in d: try: if doc['identifier'] == 'target': target_texts.append( doc['_source'][targettext].split()) # extract additional information target_ids.append(doc['_id']) except: logger.error( 'This does not seem to be a valid document' ) print(doc) # do comparison index = SparseMatrixSimilarity( tfidf[[ dictionary.doc2bow(d) for d in target_texts ]], num_features=len(dictionary)) sims = index[query] #make dataframe df_temp = pd.DataFrame( sims, columns=target_ids, index=source_ids).stack().reset_index() df_window.append(df_temp) df = pd.concat(df_window, ignore_index=True) df.columns = ['source', 'target', 'similarity'] df['source_date'] = df['source'].map(source_dict) df['target_date'] = df['target'].map(target_dict) df['source_doctype'] = df['source'].map(source_dict2) df['target_doctype'] = df['target'].map(target_dict2) #Optional: if threshold is specified if threshold: df = df.loc[df['similarity'] >= threshold] #Make exports folder if it does not exist yet if not 'comparisons' in os.listdir('.'): os.mkdir('comparisons') #Optional: save as csv file if to_csv == True: df.to_csv( os.path.join( destination, r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.csv" .format(now=now, target=target, source=source, n_window=n_window))) #Otherwise: save as pickle file else: df.to_pickle( os.path.join( destination, r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.pkl" .format(now=now, target=target, source=source, n_window=n_window))) #Optional: save as pajek 
file not for days_before/days_after if to_pajek == True: logger.info( "Does not save as Pajek file with days_before/days_after because of the size of the files." ) #Same procedure as above, but without specifying a time frame (thus: comparing all sources to all targets) else: #Create index out of target texts logger.info("Preparing the index out of target texts...") index = SparseMatrixSimilarity( tfidf[[dictionary.doc2bow(d) for d in target_text]], num_features=len(dictionary)) #Retrieve source IDs and make generator to compute similarities between each source and the index logger.info("Preparing the query out of source texts...") query = tfidf[[dictionary.doc2bow(d) for d in source_text]] query_generator = (item for item in query) #Retrieve similarities logger.info("Starting comparisons...") i = 0 s_ids = 0 for doc in query_generator: i += 1 # count each round of comparisons # if doc is empty (which may happen due to pruning) # then we skip this comparison if len(doc) == 0: s_ids += 1 logger.info('Skipped one empty document') continue #sims_list = [index[doc] for doc in query_generator] sims = index[doc] #make dataframe #df = pd.DataFrame(sims_list, columns=target_ids, index = source_ids).stack(). reset_index() df = pd.DataFrame([sims]).transpose() logger.debug('Created dataframe of shape {}'.format(df.shape)) logger.debug('Length of target_id list: {}'.format( len(target_ids))) df['target'] = target_ids df['source'] = source_ids[s_ids] df.columns = ['similarity', 'target', 'source'] df["source_date"] = df["source"].map(source_dict) df["target_date"] = df["target"].map(target_dict) df['source_doctype'] = df['source'].map(source_dict2) df['target_doctype'] = df['target'].map(target_dict2) df = df.set_index('source') #Optional: if threshold is specified if threshold: df = df.loc[df['similarity'] >= threshold] #Make exports folder if it does not exist yet if not 'comparisons' in os.listdir('.'): os.mkdir('comparisons') #Optional: save as csv file if to_csv == True: df.to_csv( os.path.join( destination, r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.csv" .format(now=now, target=target, source=source, i=i))) #Otherwise: save as pickle file else: df.to_pickle( os.path.join( destination, r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.pkl" .format(now=now, target=target, source=source, i=i))) #Optional: additionally save as pajek file if to_pajek == True: G = nx.Graph() # change int to str (necessary for pajek format) df['similarity'] = df['similarity'].apply(str) # change column name to 'weights' to faciliate later analysis df.rename({'similarity': 'weight'}, axis=1, inplace=True) # notes and weights from dataframe G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr='weight') # write to pajek nx.write_pajek( G, os.path.join( destination, r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.net" .format(now=now, target=target, source=source, i=i))) s_ids += 1 # move one doc down in source_ids logger.info("Done with source " + str(i) + " out of " + str(len(source_text)))
    resultDir=gensim_build.RESULT_DIR,
    acceptLangs=[language])

logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids" % len(id2word))

corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
input = MmCorpus(config.resultFile('_%s.mm' % method))
assert len(input) == len(corpus), \
    "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))

# initialize structure for similarity queries
if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
    index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms)
else:
    index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)

# do not normalize query vectors during similarity queries
# (the index is already built normalized, so it would be a no-op)
index.normalize = False

# for each document, print MAX_SIMILAR nearest documents to an xml file,
# in the dml-cz specific format
generateSimilar(corpus, index, method)

logging.info("finished running %s" % program)
async def create_file(keyword: str, threshold: float, file: UploadFile = File(...)):
    contents = file.file.read()
    now = time.time()
    with open("./cache_file/" + str(now) + file.filename, "w+") as f:
        f.write(contents.decode("utf-8"))
    with open("./cache_file/" + str(now) + file.filename, "r") as f_read:
        data = f_read.readlines()

    # 1. Tokenize the text collection into lists of words
    texts = [lcut(text.strip("\n")) for text in tqdm(data)]
    # 2. Build a dictionary and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    # 3.1 Convert the tokenized texts into sparse bag-of-words vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
    # 3.2 Likewise, convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))
    # 4. Train a TF-IDF model on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the model to the searchable texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus itself serves as the searchable texts
    tf_kw = tfidf[kw_vector]
    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    # print(similarities)

    new_now = datetime.datetime.now()
    # simple management of the output directory
    os.makedirs("./static/" + keyword + str(new_now))
    db.insert({"name": keyword + str(new_now), "type": "dir"})
    f = open("./static/" + keyword + str(new_now) + "/result.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    f1 = open("./static/" + keyword + str(new_now) + "/result_er.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result_er.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    # end

    Semantic_list = []
    for e, s in enumerate(similarities, 1):
        su = (e, s)
        Semantic_list.append(su)
        try:
            if s >= threshold:
                f.write(data[e - 1].strip("\n") + str(s) + "\n")
            else:
                f1.write(data[e - 1].strip("\n") + str(s) + "\n")
        except Exception as e:
            pass

    Semantic_list.sort(key=takeSecond, reverse=True)
    rs_list = []
    for item in Semantic_list[0:101]:
        rs_dic = {"msg": data[item[0] - 1], "Similaritydegree": str(item[1])}
        rs_list.append(rs_dic)
    # Semantic_list
    os.remove("./cache_file/" + str(now) + file.filename)
    return {"semantic": rs_list}