import pandas as pd
import synonyms as syn


def clean1_3(book_labels):
    print('book_label 1.3 cleaning start:', book_labels.shape, '---------------------------')
    # 1.3 Drop labels that do not exist in the dictionary (bag of words)
    indexbyWordBag = []
    labelsvector = []
    for i in range(book_labels.shape[0]):
        try:
            labelvector = syn.v(book_labels.iloc[i, 1])
            # print(labelvector, len(labelvector))
            # The word exists in the bag of words: keep its vector and index
            labelsvector.append(labelvector)
            indexbyWordBag.append(i)
        except KeyError:
            # The word is not in the bag of words: drop the row
            continue
    book_labels = book_labels.iloc[indexbyWordBag]
    book_labels = book_labels.reset_index(drop=True)
    labelvector = pd.DataFrame(labelsvector)
    book_labels = pd.concat([book_labels, labelvector], ignore_index=True, axis=1)
    print('book_label 1.3 cleaning finished:', book_labels.shape, '---------------------------')  # e.g. (10118, 102)
    return book_labels

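# A minimal usage sketch for clean1_3. The two-column layout (book id, one
# label string per row) is inferred from the iloc[i, 1] access above; the
# sample rows are made up:
demo = pd.DataFrame({'bookid': [1, 2, 3],
                     'label': ['金融', '人脸', 'NOT_EXIST']})
cleaned = clean1_3(demo)
# Out-of-vocabulary labels are dropped; each surviving row gains its
# 100-dimensional synonyms vector as extra columns.
print(cleaned.shape)  # expected: (2, 102)
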
import numpy as np
import synonyms
import Levenshtein


def synonym_search(self, question, method):
    # Method of an entity-linking helper class: self.symptom_vec maps each
    # known symptom entity to its Bert vector, self.symptom_w2v_vec to its
    # word2vec vector; `bc` is an external BertClient instance assumed to be
    # initialized elsewhere.
    n_grams = [2, 3, 4, 5]
    term = []
    scores = []
    # Slide n-gram windows over the question and vectorize each candidate
    for n in n_grams:
        for i in range(len(question) - n + 1):
            word = question[i:i + n]
            if method == 'Bert':
                wd_vec = bc.encode([word])[0]
            elif method == 'Word2Vec':
                try:
                    wd_vec = synonyms.v(word)
                except KeyError:
                    wd_vec = [0] * 100  # out-of-vocabulary fallback
            else:
                wd_vec = 0
            term.append((word, wd_vec))
    # Score every candidate n-gram against every known symptom entity and
    # keep the best-matching entity per candidate
    for word, wd_vec in term:
        max_score = 0
        prob_entity = None
        for wd, vec in self.symptom_vec.items():
            if method == 'Bert':
                score = np.inner(wd_vec, vec) / \
                    (np.linalg.norm(wd_vec) * np.linalg.norm(vec))
            elif method == 'Levenshtein':
                score = Levenshtein.jaro(word, wd)
            elif method == 'Word2Vec':
                try:
                    vec = self.symptom_w2v_vec[wd]
                except KeyError:
                    vec = [0] * 100
                score = np.inner(wd_vec, vec) / \
                    ((np.linalg.norm(wd_vec) + 1e-5) * (np.linalg.norm(vec) + 1e-5))
            if score >= max_score:
                max_score = score
                prob_entity = wd
        scores.append((word, max_score, prob_entity))
    # Rank candidates by score and return the top three matches
    scores.sort(key=takeSecond, reverse=True)
    return scores[:3]

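# `takeSecond` is used as the sort key above but not defined in this snippet;
# a minimal sketch consistent with the call (sort the (word, score, entity)
# tuples by their score):
def takeSecond(elem):
    return elem[1]
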
def test_word_vector(self):
    print("test_word_vector")
    word = "三国"
    print(word, "vector", synonyms.v(word))

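# A quick shape check to accompany the test above; the 100-dimension figure
# is inferred from the `[0] * 100` out-of-vocabulary fallback used in the
# other snippets here, not from the synonyms documentation:
import numpy as np
import synonyms

vec = np.asarray(synonyms.v("三国"))
print(vec.shape)  # expected: (100,)
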
# -*- "coding: utf-8" -*- import synonyms import numpy as np from cosine import Cosine cosine = Cosine(n_recommendation=4) with open("vocabulary_filter.txt", "r", encoding="utf-8") as f: vocabulary = f.read().split()[:-1] vectors = [] for word in vocabulary: try: vectors.append(synonyms.v(word)) # 使用 synonyms 获得词向量 except: pass vectors = np.array(vectors) indices, similarities = cosine.cal_similarity(vectors, vectors) # 调用cosine模块计算余弦相似度 with open("method_synonyms.csv", "w", encoding="utf-8") as f: for nrow, row in enumerate(indices): for ncol, col in enumerate(row): if ncol == 0: # 跳过自身 continue f.write("{},{},{}\n".format(vocabulary[nrow], vocabulary[col], similarities[nrow][ncol]))
import re
import pandas as pd
import synonyms


def cleanData(book_labels_table):
    '''
    Preprocess the book labels. Cleaning rules:
    0. Keep only labels written in Chinese characters
    1. Drop labels longer than four Chinese characters
    2. Drop books that have no labels; their book ids go into deletebookid=[]
    3. Drop duplicate labels attached to the same book
    :param book_labels_table:
    :return: book_labels_table
    '''
    # First drop books without labels
    book_labels_table = delteNanRow(book_labels_table)
    # Lists holding the book ids and the label vectors of each book
    print('-------------------- Removing labels longer than four characters and labels not in the dictionary ---------------------------')
    bookidlist = []
    booklabelsvector = []
    # 1. Drop labels longer than four characters (keep only labels of 2-4
    #    Chinese characters), convert traditional labels to simplified,
    #    and drop duplicate labels
    print('Drop labels longer than four characters (keep 2-4 character labels), convert traditional to simplified, drop duplicates')
    for i in range(book_labels_table.shape[0]):
        # item_book_label holds the cleaned labels of the current book
        item_book_label = []
        for j in range(len(book_labels_table.iloc[i, 1])):
            pattern = re.compile("^[\u4e00-\u9fa5]{2,4}$")
            if re.match(pattern, book_labels_table.iloc[i, 1][j]):
                # Convert traditional characters to simplified
                book_labels_table.iloc[i, 1][j] = Traditional2Simplified(book_labels_table.iloc[i, 1][j])
                item_book_label.append(book_labels_table.iloc[i, 1][j])
        # Drop duplicate labels
        item_book_label = set(item_book_label)
        book_labels_table.iloc[i, 1] = list(item_book_label)
    # After this first pass some books may have lost all labels; drop them
    print('After this first pass some books may have lost all labels; drop them')
    book_labels_table = delteNanRow(book_labels_table)
    # Second pass: drop words that do not appear in our bag of words
    print('Second pass: drop words that do not appear in our bag of words')
    for i in range(book_labels_table.shape[0]):
        # item_book_labelVector holds the vectors of the cleaned labels
        item_book_label = []
        item_book_labelVector = []
        # Some words that survived the first pass may not be in our bag of
        # words and must also be removed. Note that this can again leave a
        # book without labels, so the empty-row removal has to run once more.
        for j in range(len(book_labels_table.iloc[i, 1])):
            try:
                wordvector = synonyms.v(book_labels_table.iloc[i, 1][j])
            except KeyError:
                # The word is not in the bag of words: drop it
                continue
            # The word is in the bag of words: keep the word, then its vector
            item_book_label.append(book_labels_table.iloc[i, 1][j])
            item_book_labelVector.append(wordvector)
        # Update the labels of this book
        book_labels_table.iloc[i, 1] = item_book_label
        # Record the book id to build the book id -> label vectors table
        if len(book_labels_table.iloc[i, 1]) != 0:
            bookidlist.append(book_labels_table.iloc[i, 0])
            booklabelsvector.append(item_book_labelVector)
    book_labels_table = delteNanRow(book_labels_table)
    book_labels_vector_table = pd.DataFrame(zip(bookidlist, booklabelsvector))
    book_labels_vector_table.columns = ['bookid', 'book_labels_vector']
    print(book_labels_table)
    # At this point we have the book id -> label vectors table
    print(book_labels_vector_table)
    print('Writing to storage')
    book_labels_table.index = book_labels_table['bookid']
    del book_labels_table['bookid']
    print(book_labels_table)
    book_labels_vector_table.index = book_labels_vector_table['bookid']
    del book_labels_vector_table['bookid']
    print(book_labels_vector_table)
    return book_labels_table, book_labels_vector_table

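# `delteNanRow` and `Traditional2Simplified` are project helpers not shown in
# this snippet. Minimal sketches under two assumptions: labels live in a
# list-valued second column, and the zhconv package is an acceptable stand-in
# for whatever traditional-to-simplified conversion is actually used:
from zhconv import convert


def delteNanRow(table):
    # Drop rows whose label list is missing or empty
    mask = table.iloc[:, 1].apply(lambda labels: isinstance(labels, list) and len(labels) > 0)
    return table[mask].reset_index(drop=True)


def Traditional2Simplified(text):
    # Convert traditional Chinese characters to simplified ones
    return convert(text, 'zh-hans')
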
data_long ("国际劳工组织") cixing=[] for i in range(0,len(data_long)): cixing.append(synonyms.seg(data_long[i])) test=synonyms.nearby("人脸") test[0] print("识别: %s" % (synonyms.nearby("识别"))) print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST"))) synonyms.display("金融") synonyms.v() print(1) cixiangliang=[] for i in range(0,len(data_long)): try: cixiangliang.append(synonyms.v (data_long[i])) except: cixiangliang.append(-1) ciqinggan=[] for i in range(0,len(data_long)): s = SnowNLP(data_long[i])
def testWordVec(word):
    # Return the synonyms vector of this word
    wordvector = synonyms.v(word)
    # print(wordvector)
    return wordvector

def word2vector(word):
    # Return the synonyms vector of `word`, or None if it is out of vocabulary
    try:
        vector = synonyms.v(word)
    except Exception:
        vector = None
    return vector

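# Usage sketch for word2vector: batch-convert a word list and keep only the
# in-vocabulary entries (the word list is illustrative):
words = ["金融", "人脸", "NOT_EXIST"]
vectors = {w: word2vector(w) for w in words}
in_vocab = {w: v for w, v in vectors.items() if v is not None}
print(list(in_vocab))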