def getSparseMatrixSimilarity(keyword, texts):
    """Print the TF-IDF cosine similarity between ``keyword`` and each text.

    Args:
        keyword: query string, tokenized with jieba.
        texts: iterable of document strings to compare against.

    Side effects:
        Prints one similarity line per document, then the similarity index
        and the raw score array. Returns None.
    """
    # Tokenize every document with jieba.
    tokenized_docs = [jieba.lcut(doc) for doc in texts]

    # Build the dictionary over all documents; its vocabulary size is the
    # feature count of the sparse vectors.
    dictionary = Dictionary(tokenized_docs)
    feature_count = len(dictionary.token2id)

    # Bag-of-words corpus for the documents, plus the query as a BoW vector
    # over the same dictionary.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    query_bow = dictionary.doc2bow(jieba.lcut(keyword))

    # Train TF-IDF on the corpus and re-weight both sides with it.
    tfidf_model = TfidfModel(corpus)
    weighted_corpus = tfidf_model[corpus]
    weighted_query = tfidf_model[query_bow]

    # Sparse similarity index over the weighted corpus; query it once.
    index = SparseMatrixSimilarity(weighted_corpus, feature_count)
    scores = index.get_similarities(weighted_query)

    for doc_no, score in enumerate(scores, 1):
        print('kw 与 text%d 相似度为:%.2f' % (doc_no, score))
    print(index)
    print(scores)
def samilarRate(texts, keyword):
    """Compute TF-IDF similarity of ``keyword`` against every text.

    Args:
        texts: iterable of document strings (the search pool).
        keyword: query string.

    Returns:
        A pair ``(result, sorft)`` where ``result`` holds one formatted
        description line per document and ``sorft`` the raw scores, both
        in document order.
    """
    # Tokenize documents and build the shared dictionary.
    tokenized_docs = [lcut(doc) for doc in texts]
    dictionary = Dictionary(tokenized_docs)
    feature_count = len(dictionary.token2id)

    # BoW corpus and BoW query over the same vocabulary.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    query_bow = dictionary.doc2bow(lcut(keyword))

    # TF-IDF weighting trained on the corpus, applied to both sides.
    tfidf_model = TfidfModel(corpus)
    index = SparseMatrixSimilarity(tfidf_model[corpus], feature_count)
    scores = index.get_similarities(tfidf_model[query_bow])

    # One formatted line plus the raw score per document, in order.
    result = ['kw 与 text%d 相似度为:%.2f' % (doc_no, score)
              for doc_no, score in enumerate(scores, 1)]
    sorft = [score for score in scores]
    return result, sorft
def mergeTags():
    """Append the best-matching default tag to each entry of ``textsOld``.

    For every display string in the module-level ``displayArr``, find the
    ``default_tags`` entry with the highest TF-IDF similarity (> 0.5) and
    append it to the corresponding ``textsOld[i]`` as
    ``'----------' + tag``.

    Returns:
        The mutated module-level ``textsOld`` list.

    NOTE(review): relies on module globals ``displayArr``, ``default_tags``
    and ``textsOld`` — confirm they are populated before calling.
    """
    # The tag corpus is constant across iterations, so the dictionary,
    # BoW corpus, TF-IDF model and similarity index are built ONCE here
    # instead of once per display string (they only depend on default_tags).
    tag_tokens = [lcut(text) for text in default_tags]
    dictionary = Dictionary(tag_tokens)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(tokens) for tokens in tag_tokens]
    tfidf = TfidfModel(corpus)
    sparse_matrix = SparseMatrixSimilarity(tfidf[corpus], num_features)

    for i in range(len(displayArr)):
        keyword = displayArr[i]
        # Per-keyword work: vectorize the query and score it against tags.
        kw_vector = dictionary.doc2bow(lcut(keyword))
        similarities = sparse_matrix.get_similarities(tfidf[kw_vector])

        # Collect tags scoring above the 0.5 threshold, keyed by tag text.
        res = {}
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                key = ''.join(tag_tokens[e - 1]).strip()
                res[key] = s

        # Take only the single best-scoring tag (if any) for this entry.
        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        if arrSorted:
            best_tag = arrSorted[0][0]
            textsOld[i] = textsOld[i] + '----------' + best_tag
    return textsOld
def check(news):
    """检查是否重复 — return ``news`` if it is not a near-duplicate.

    Scores ``news`` against the corpus supplied by ``Similar.dictionary()``;
    any similarity strictly between 0.6 and 0.98 is treated as a duplicate.

    Returns:
        ``news`` when no duplicate is found, otherwise ``None``.
    """
    dictionary, corpus, num_features = Similar.dictionary()

    # Vectorize the incoming text over the shared dictionary.
    query_bow = dictionary.doc2bow(lcut(news))

    # TF-IDF weight the corpus and the query, then score the query.
    tfidf = TfidfModel(corpus)
    index = SparseMatrixSimilarity(tfidf[corpus], num_features)
    scores = index.get_similarities(tfidf[query_bow])

    # A score in (0.6, 0.98) means "near-duplicate": reject.
    if any(0.6 < score < 0.98 for score in scores):
        return None
    return news
def mergeTags(textArr):
    """Greedily group similar strings from ``textArr`` and report matches.

    On each pass, texts already matched on the previous pass are removed
    from the pool, then the i-th remaining text is scored against the pool
    and matches with similarity > 0.5 are printed and collected.

    Args:
        textArr: list of strings to merge.

    Side effects:
        Prints matched pairs and a completion banner; mutates ``textArr``
        in place via ``exampleArr.remove`` (they alias the same list).

    NOTE(review): ``exampleArr = textArr`` is an alias, not a copy — the
    caller's list shrinks across iterations; confirm this is intended.
    NOTE(review): iterates ``range(len(displayArr))`` (a module global),
    not ``range(len(textArr))`` — verify against callers.
    """
    res = []
    for i in range(len(displayArr)):
        try:
            exampleArr = textArr
            if i == 0:
                texts = textArr
            else:
                # Drop texts matched on the previous pass from the pool.
                for item in res:
                    if item in exampleArr:
                        exampleArr.remove(item)
                texts = exampleArr
                res = []
            keyword = texts[i]
            # Standard gensim TF-IDF similarity pipeline over the pool.
            texts = [lcut(text) for text in texts]
            dictionary = Dictionary(texts)
            num_features = len(dictionary.token2id)
            corpus = [dictionary.doc2bow(text) for text in texts]
            kw_vector = dictionary.doc2bow(lcut(keyword))
            tfidf = TfidfModel(corpus)
            tf_texts = tfidf[corpus]
            tf_kw = tfidf[kw_vector]
            sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
            similarities = sparse_matrix.get_similarities(tf_kw)
            for e, s in enumerate(similarities, 1):
                if s > 0.5:
                    res.append(exampleArr[e - 1])
                    print(keyword + ' 与 ' + exampleArr[e - 1] + ' 的相似度为 :', s)
            print('---------------------------------------------------')
        # Narrowed from a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit; output is unchanged.
        except Exception:
            print('')
    print('合并完成!')
def similar(aim):
    # Print titles from the module-level ``ret`` whose TF-IDF similarity to
    # ``aim`` (title + abstract) exceeds 0.1.
    # NOTE(review): ``ret`` is presumably a list of article-like objects with
    # .title/.abstract attributes — confirm against the caller.
    aim_text = aim.title + aim.abstract
    # Candidate pool: all but the last 10 entries of ``ret``.
    simple = [x.title + x.abstract for x in ret[0:-10]]
    text = [set(posseg.lcut(x)) for x in simple]
    # Flatten to the set of unique tokens across all candidates.
    text = list({y for x in text for y in x})
    # NOTE(review): ``text`` is a flat list of tokens here, but gensim's
    # Dictionary expects a list of token LISTS (one per document) — passed a
    # flat list it iterates each element, which may build a per-character
    # vocabulary. Verify this is intentional.
    dictionary = Dictionary(text)
    length = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(lcut(src)) for src in simple]
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, length)
    # Query vector for the target article, TF-IDF weighted like the corpus.
    vector = dictionary.doc2bow(lcut(aim_text))
    tf_kw = tfidf[vector]
    similarities = sparse_matrix.get_similarities(tf_kw)
    print(aim.title)
    for e, s in enumerate(similarities, 1):
        if s > 0.1:
            print(s, ret[e - 1].title)
"""建立词典 获得特征数""" dictionary = corpora.Dictionary(diff_word_list) feature_cnt = len(dictionary.token2id.keys()) """基于词典 分词列表转稀疏向量集""" corpus = [dictionary.doc2bow(codes) for codes in diff_word_list] # print("key") # print([x for x in word_list if x not in stopwords]) kw_vector = dictionary.doc2bow([x for x in word_list if x not in stopwords]) """创建tf-idf模型 传入语料库训练""" tfidf = TfidfModel(corpus) """训练好的tf-idf模型处理检索文本和搜索词""" tf_texts = tfidf[corpus] tf_kw = tfidf[kw_vector] """相似度计算""" sparse_matrix = SparseMatrixSimilarity(tf_texts, feature_cnt) similarities = sparse_matrix.get_similarities(tf_kw) # print("similarities") # print(similarities) # for e, s in enumerate(similarities, 1): # print('kw 与 text%d 相似度为:%.2f' % (e, s)) conceptualSimilarity.append(max(similarities)) """key word ratio""" keywordsInComments = [x for x in word_list if x in languageKeyWords] stopKeyRatio.append(keywordsInComments.__len__() / word_list.__len__()) print(readable) print(max(readable), min(readable)) fig = plt.figure()
async def create_file(keyword: str, threshold: float, file: UploadFile = File(...)):
    """Score every line of an uploaded file against ``keyword`` by TF-IDF.

    Caches the upload to disk, splits it into lines, computes per-line
    similarity to ``keyword``, writes lines at/above ``threshold`` to
    ``result.txt`` and the rest to ``result_er.txt`` under a fresh
    ``./static/<keyword><timestamp>/`` directory (each registered in ``db``),
    then deletes the cache file.

    Returns:
        ``{"semantic": [...]}`` — the top ~100 lines by similarity, each as
        ``{"msg": line, "Similaritydegree": score}``.
    """
    contents = file.file.read()
    now = time.time()
    cache_path = "./cache_file/" + str(now) + file.filename
    with open(cache_path, "w+") as f:
        f.write(contents.decode("utf-8"))
    with open(cache_path, "r") as f_read:
        data = f_read.readlines()

    # 1) tokenize each line, 2) build the dictionary, 3) BoW corpus plus the
    # keyword vector, 4) train TF-IDF, 5) weight both sides, 6) score.
    texts = [lcut(text.strip("\n")) for text in tqdm(data)]
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
    kw_vector = dictionary.doc2bow(lcut(keyword))
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    tf_kw = tfidf[kw_vector]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)

    # Simple file-based result management: one directory per run.
    new_now = datetime.datetime.now()
    os.makedirs("./static/" + keyword + str(new_now))
    db.insert({"name": keyword + str(new_now), "type": "dir"})

    Semantic_list = []
    # Context managers fix the original leak: ``f``/``f1`` were never closed.
    with open("./static/" + keyword + str(new_now) + "/result.txt", "w") as f, \
            open("./static/" + keyword + str(new_now) + "/result_er.txt", "w") as f1:
        db.insert({
            "name": keyword + str(new_now) + "/result.txt",
            "type": "file",
            "dir": keyword + str(new_now)
        })
        db.insert({
            "name": keyword + str(new_now) + "/result_er.txt",
            "type": "file",
            "dir": keyword + str(new_now)
        })
        for e, s in enumerate(similarities, 1):
            Semantic_list.append((e, s))
            try:
                if s >= threshold:
                    f.write(data[e - 1].strip("\n") + str(s) + "\n")
                else:
                    f1.write(data[e - 1].strip("\n") + str(s) + "\n")
            # Anonymous except: the original ``as e`` shadowed the loop
            # variable ``e``. Best-effort write, failures deliberately skipped.
            except Exception:
                pass

    # Highest-similarity lines first; return roughly the top 100.
    Semantic_list.sort(key=takeSecond, reverse=True)
    rs_list = []
    for item in Semantic_list[0:101]:
        rs_dic = {"msg": data[item[0] - 1], "Similaritydegree": str(item[1])}
        rs_list.append(rs_dic)
    os.remove(cache_path)
    return {"semantic": rs_list}
# --- Script fragment: tail of ``get_texts()`` plus corpus/TF-IDF setup ---
# NOTE(review): the ``def get_texts():`` header and the loop that supplies
# ``file_path``/``texts`` begin before this fragment; the indentation below
# is reconstructed and should be checked against the full file.
        # Read the whole file into one string.
        with open(file_path, encoding='UTF-8') as text:
            text_as_string = ""
            for line in text:
                text_as_string += line
        text = preprocess(text_as_string)
        texts.append(text)
    return texts


texts = get_texts()
texts = get_ngrams(texts)
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = TfidfModel(corpus)
# NOTE(review): ``corpus_tfidf`` is never used afterwards in this fragment.
corpus_tfidf = tfidf[corpus]
index = SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
# Not 100% how to interpret these results
# NOTE(review): the index was built from TF-IDF-weighted vectors but is
# queried with raw BoW vectors — verify this mismatch is intended.
for i in range(0, len(corpus)):
    print(index.get_similarities(corpus[i]))
# # # # # # #