import re
import jieba
from collections import defaultdict
from jieba.analyse.tfidf import TFIDF
from jieba.posseg import POSTokenizer


def review_model_predict_entities(model_predict_entities):
    """Re-check model-predicted entities against their source sentences.

    Re-tokenizes each sentence with jieba, snaps the predicted span to token
    boundaries, then applies type-specific filters before keeping the entity.
    """
    word_tag_map = POSTokenizer().word_tag_tab
    idf_freq = TFIDF().idf_freq
    reviewed_entities = defaultdict(list)
    for ent_type, ent_and_sent_list in model_predict_entities.items():
        for ent, sent in ent_and_sent_list:
            start = sent.lower().find(ent)
            if start == -1:
                continue
            # Convert to 1-based, inclusive character offsets.
            start += 1
            end = start + len(ent) - 1
            tokens = jieba.lcut(sent)
            offset = 0
            selected_tokens = []
            for token in tokens:
                offset += len(token)
                if offset >= start:
                    selected_tokens.append(token)
                if offset >= end:
                    break
            fixed_entity = ''.join(selected_tokens)
            # Strip a trailing percentage such as "12.5%".
            fixed_entity = re.sub(r'\d*\.?\d+%$', '', fixed_entity)
            if ent_type == '人物':  # person entities get extra filtering
                if len(fixed_entity) >= 10:
                    continue
                if len(fixed_entity) <= 1:
                    continue
                if re.findall(r'^\d+$', fixed_entity):
                    continue
                # Drop low-IDF verbs, which are unlikely to be person names.
                # .get() avoids a KeyError for words missing from the IDF table.
                if word_tag_map.get(fixed_entity, '') == 'v' and idf_freq.get(fixed_entity, 0) < 7:
                    continue
            reviewed_entities[ent_type].append(fixed_entity)
    return reviewed_entities
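# Hedged usage sketch: the input shape {entity_type: [(entity, sentence), ...]}
# is inferred from the loop above; the sample data is purely illustrative.
predictions = {
    '人物': [('张三', '张三在大会上发表了讲话')],
    '机构': [('某某公司', '某某公司今日发布了财报')],
}
cleaned = review_model_predict_entities(predictions)
print(dict(cleaned))  # entities snapped to token boundaries, grouped by type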
from gensim.models import Word2Vec
from jieba.analyse.tfidf import TFIDF


def load_model():
    """Load the IDF table, the wiki word2vec model, and the stop-word list."""
    print('loading data')
    idf, _ = TFIDF().idf_loader.get_idf()
    model = Word2Vec.load('D:\\data\\wiki_embedding\\model\\wiki.zh.model')
    with open('file/stopwords.txt', 'r', encoding='utf-8-sig') as sw:
        stop_words = [line.strip() for line in sw]
    print('load success')
    return idf, model, stop_words
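# A sketch of how the returned triple is typically consumed, assuming a gensim
# Word2Vec model; the sample words are illustrative only.
import jieba

idf, model, stop_words = load_model()
print(idf.get('保险', 0.0))                   # IDF lookup with an OOV fallback
print(model.wv.most_similar('保险', topn=5))  # nearest neighbours in vector space
tokens = [w for w in jieba.cut('如何购买重疾险') if w not in stop_words]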
import jieba
import numpy as np
import pandas as pd
from jieba.analyse.tfidf import TFIDF


class SentenceVec:
    def __init__(self, sentence_vec_path='D:\\data\\videos\\title_vec.csv', model=None):
        self.sentence_vec_path = sentence_vec_path
        # Avoid a heavyweight default argument: build the embedding only when needed.
        # WordEmbedding comes from elsewhere in the project.
        self.model = model if model is not None else WordEmbedding()
        self.idf, self.median = TFIDF().idf_loader.get_idf()

    def get_tfidf(self, sentence):
        """Return a {word: tf-idf weight} dict for the sentence."""
        words = jieba.cut(sentence)
        freq = {}
        for w in words:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        tfidf_dict = {}
        for w in freq:
            tfidf_dict[w] = self.idf.get(w, 0.) * freq[w] / total
        return tfidf_dict

    def get_sentence_vec(self, sentence):
        """Unweighted sentence vector: the mean of the word vectors."""
        seg_list = self.model.__jieba_cut__(sentence)
        sen_vec = np.zeros(200)
        for word in seg_list:
            sen_vec += self.model.get_word_vec(word)
        if not seg_list:  # guard against division by zero on empty input
            return sen_vec
        return sen_vec / len(seg_list)

    def get_weight_sent_vec(self, sentence):
        """TF-IDF-weighted sentence vector."""
        tfidf_dict = self.get_tfidf(sentence)
        weight_total = sum(tfidf_dict.values())
        sen_vec = np.zeros(200)
        for word in jieba.cut(sentence):
            sen_vec += self.model.get_word_vec(word) * tfidf_dict[word]
        if weight_total > 0.0:
            return sen_vec / weight_total
        return sen_vec

    def save_sentence_vec(self, sentence):
        """Append the sentence and its vector as a new row and persist the CSV."""
        df = pd.read_csv(self.sentence_vec_path, header=0, encoding='utf-8-sig')
        new_row = pd.DataFrame([{'title': sentence,
                                 'title_vec': self.get_sentence_vec(sentence).tolist()}])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(self.sentence_vec_path, index=False, encoding='utf-8-sig')
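# Minimal sketch: comparing two TF-IDF-weighted sentence vectors with cosine
# similarity. Cosine is an assumption here; the class itself does not
# prescribe a distance measure.
sv = SentenceVec()  # assumes the default CSV path and a usable WordEmbedding
v1 = sv.get_weight_sent_vec('重疾险如何理赔')
v2 = sv.get_weight_sent_vec('患重疾怎么赔')
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-12)
print(cos)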
origin = "first" print(calc.nearest_neighbors(origin)) model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl' with open(model_file, "rb") as f: w2v_model = pickle.load(f, encoding='iso-8859-1') # 此处耗内存 60.8 MiB words_list = [] w_emb = [] for word, emb in w2v_model.items(): words_list.append(word) w_emb.append(emb) from jieba.analyse.tfidf import TFIDF tf_idf = TFIDF() # tf_idf.idf_freq.get('我') query = '发生重疾如何理赔 ' database = [ '尊享惠康如何理赔', '患重疾怎么赔', '重疾险如何理赔', '重疾赔付几次', '重疾保额', '重疾赔付次数', '重疾赔付', '尊享惠康发生争议如何处理', '重疾保险金怎么赔付', '重疾给付', '重疾包括哪些', '发生争议如何处理', '重疾险保后理赔流程是如何的', '医疗事故导致的重疾可以理赔吗', '赔了重疾可以赔轻疾吗', '如何申请理赔', '赔了重疾可以赔轻症吗', '关于轻症重疾确诊就赔吗', '如何购买重疾险', '重疾申请理赔流程', '重疾险发生争议如何处理', '重疾种类', '保哪些重疾', '重疾90种', '重疾种类解释', '如何理赔', '发生事故如何通知保险公司', '重疾保险金赔付后身故还有的赔吗' ] start_time = time.time() def generator_nbow(question):
def __init__(self):
    self.local_threshold = 0.8
    self.global_hook_threshold = 0.8
    self.tfidf = TFIDF()
import jieba
import jieba.posseg
from gensim.models import Word2Vec
from jieba.analyse.tfidf import TFIDF


class GenerateKeyWords(object):
    def __init__(self, PATH_DICT='dict.txt', PATH_STOPWORDS='stopwords.txt',
                 PATH_WORD2VEC_MODEL='wiki.zh.model', TOPK_TITLE=5, TOPK_VOICE=10,
                 ENCODING_STOPWORDS='utf-8-sig', ONLY_NV=True, HMM=False):
        """
        Initialization: load the models and parameters.
        :param PATH_DICT: path to the user dictionary (the curated points of interest)
        :param PATH_STOPWORDS: path to the stop words (the earlier stop words plus newly curated ones)
        :param PATH_WORD2VEC_MODEL: word2vec model (four files on disk)
        :param TOPK_TITLE: number of keywords to extract from the title, default 5
        :param TOPK_VOICE: number of keywords to extract from the voice text, default 10
        :param ENCODING_STOPWORDS: stop-word file encoding, default BOM-less utf-8
        :param ONLY_NV: whether to filter by part of speech; defaults to True since keeping
            only nouns and verbs works best. 'x' marks unknown POS: some words in the user
            dictionary have unknown POS and need to be kept.
        :param HMM: whether to use the HMM during segmentation; results are better without
            it. In jieba the HMM is mainly used for new-word discovery.
        """
        print('load')
        self._path_dict = PATH_DICT
        self._path_stopwords = PATH_STOPWORDS
        self._path_words2vec = PATH_WORD2VEC_MODEL
        jieba.load_userdict(self._path_dict)
        with open(self._path_stopwords, 'r', encoding=ENCODING_STOPWORDS) as sw:
            self._stop_words = [line.strip() for line in sw]
        self._idf, _ = TFIDF().idf_loader.get_idf()
        self._model = Word2Vec.load(self._path_words2vec)
        self._topk_title = TOPK_TITLE
        self._topk_voice = TOPK_VOICE
        self._only_nv = ONLY_NV
        self._hmm = HMM
        self.predicted_poi = []
        print('model load success!')

    def __simi(self, word1, word2):
        """
        Word-to-word similarity.
        :param word1:
        :param word2:
        :return: similarity, float64
        """
        try:
            value = 0.0
            if word1 in self._model and word2 in self._model:
                value = self._model.similarity(word1, word2)
            return value
        except Exception:
            return 0.0

    def __comput_tfidf(self, sentence, topk=10):
        """
        Extract keywords from a sentence with TF-IDF.
        :param sentence: segmented sentence as a comma-separated string
        :param topk: number of keywords to extract
        :return: list of keywords sorted by weight
        """
        words = sentence.split(',')
        freq = {}
        for w in words:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        dict_weights = {}
        for w in freq:
            dict_weights[w] = self._idf.get(w, 0.) * freq[w] / total
        if len(dict_weights) < 1:
            return []
        sorted_pair = sorted(dict_weights.items(), key=lambda x: x[1], reverse=True)
        new_words, _ = zip(*sorted_pair)
        topk = min(len(new_words), topk)
        return new_words[:topk]

    def __jieba_cut(self, sentence, n_v=True, HMM=True):
        """
        Segment with jieba and drop stop words.
        :param sentence: the sentence
        :param n_v: whether to filter by POS, keeping only nouns and verbs
        :param HMM:
        :return: string, e.g. 'ac,adsa,qwe'
        """
        if n_v:
            voice_tok = jieba.posseg.cut(sentence, HMM=HMM)
            tok_list = []
            for w in voice_tok:
                if w.word not in self._stop_words:  # drop stop words
                    # Keep only nouns and verbs; some user-defined words are tagged 'x'.
                    if w.flag[0] in ('n', 'v', 'x'):
                        tok_list.append(w.word)
            return ','.join(tok_list)
        else:
            # jieba.cut yields plain strings; HMM must be passed by keyword,
            # since the second positional parameter is cut_all.
            sentence_tok = jieba.cut(sentence, HMM=HMM)
            tok_list = [w for w in sentence_tok if w not in self._stop_words]  # drop stop words
            return ','.join(tok_list)

    def __list_clean(self, sentence_list):
        """
        Deduplicate and drop single-character words while preserving the original order.
        :param sentence_list: [...]
        :return: [...]
""" words_dict = {} iterator = 0 for w in sentence_list: w = w.strip() if w not in words_dict and len(w) > 1: words_dict[w] = iterator iterator += 1 sorted_w = sorted(words_dict.items(), key=lambda x: x[1], reverse=False) words, _ = list(zip(*sorted_w)) return words def generate(self, title, voice): """ 根据title和voice两个字符串生成最终的关键词 :param title: title文本,字符串类型 :param voice: 声音文本,字符串类型 :return: 无返回,将最终的结果保存到类的属性里面,那么关键字的属性为:self.predicted_poi """ # 分词 title_cut = self.__jieba_cut(title, self._only_nv, self._hmm) voice_cut = self.__jieba_cut(voice, self._only_nv, self._hmm) # 提取关键词 important_title_words = self.__comput_tfidf(title_cut, self._topk_title) important_voice_words = self.__comput_tfidf(voice_cut, self._topk_voice) # 如果title 和 voice 的关键词为空,异常处理 if len(important_title_words) < 1: self.predicted_poi = important_voice_words elif len(important_voice_words) < 1: self.predicted_poi = important_title_words elif len(important_title_words) < 1 and len(important_voice_words) < 1: self.predicted_poi = [] else: # 正常状态 simi_dict = {} # 计算相似度 for w1 in important_title_words: for w2 in important_voice_words: simi_value = self.__simi(w1, w2) simi_dict[(w1, w2)] = simi_value # 相似度排序 sorted_pairs = sorted(simi_dict.items(), key=lambda x: x[1], reverse=True) big_pairs, _ = zip(*sorted_pairs) # 去重得到最终结果 important_words = [] for tuple_p in big_pairs: important_words.append(tuple_p[0]) important_words.append(tuple_p[1]) self.predicted_poi = self.__list_clean(important_words)