def get_wmd(model, s1, s2):
    """
    Compute the Word Mover's Distance between two sentences with a gensim word-vector model.
    :param model: gensim word-vector model
    :param s1: sentence 1
    :param s2: sentence 2
    :return: the WMD between the two segmented sentences
    """
    words_s1, tag_s1 = seg_doc(s1)
    words_s2, tag_s2 = seg_doc(s2)
    wmd = model.wmdistance(words_s1, words_s2)
    return wmd
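# A minimal usage sketch for get_wmd, assuming a hypothetical word2vec file at
# data/word2vec.bin; KeyedVectors and the example sentences are illustrative only.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('data/word2vec.bin', binary=True)
distance = get_wmd(w2v, u'细胞的基本结构是什么', u'细胞由哪些结构组成')
print(distance)  # a smaller distance means the two sentences are closer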
def _sort_retrieval_docs(self, query, triple_docs):
    """
    Rerank the retrieved triples.
    :param query: the question
    :param triple_docs: triples returned by retrieval
    :return: triples whose score passes the threshold, sorted by score
    """
    self.debug('>>> start _sort_retrieval_docs <<<')
    filter_triple_docs = []
    query_words, query_tags = seg_doc(query)
    target_field = self.query_fields[0]
    for doc_item in triple_docs:
        attribute = doc_item.get(target_field, "")
        attribute_words = attribute.strip().split()
        # A classifier is needed to decide whether the question and the document are
        # related; for now Word Mover's Distance is used instead.
        score = self._is_similarity(query_words, attribute_words)  # WMD-based similarity between question and attribute
        if score > DEFAULT_WMD_THRESHOLD:  # drop low-scoring triples
            doc_item['score'] = score
            filter_triple_docs.append(doc_item)
    filter_triple_docs.sort(key=lambda x: x['score'], reverse=False)  # rerank by score
    self.debug('>>> end _sort_retrieval_docs <<<')
    return filter_triple_docs
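# _is_similarity is not shown in this snippet. A minimal sketch, assuming it maps the
# Word Mover's Distance between the two word lists onto a (0, 1] similarity the same
# way _sort_docs_by_object does below; self.w2v_model is a hypothetical attribute
# holding a loaded gensim word-vector model.
from math import exp

def _is_similarity(self, query_words, attribute_words):
    distance = self.w2v_model.wmdistance(query_words, attribute_words)
    return exp(-distance / 19.0)  # a larger score means the texts are closer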
def _sort_docs_by_object(self, sentence, triple_docs):
    """
    Rerank triple_docs by their Word Mover's Distance to the sentence.
    :param sentence: the question
    :param triple_docs: candidate triples
    :return: triples whose score passes the threshold, sorted by score in descending order
    """
    self.debug('>>> start _sort_docs_by_object <<<')
    words, tags = seg_doc(sentence)
    _words = [w.strip() for w in words if w.strip()]
    match_triple_docs = list()  # triple_docs that pass the matching threshold
    for doc_item in triple_docs:
        item_str = doc_item.get("attribute_date", "")
        item_index = doc_item.get("attribute_date_index", "")
        item_words = [w.strip() for w in item_index.split()]
        distance = calculate_wmd(_words, item_words)  # Word Mover's Distance
        score = exp(-distance / 19.0)  # map the distance onto a (0, 1] score
        doc_item['score'] = score
        if doc_item['score'] > DEFAULT_WMD_THRESHOLD:  # drop triples that are too far away
            self.debug('choose item_str=%s, score=%s', item_str, doc_item['score'])
            match_triple_docs.append(doc_item)
        else:
            self.debug("filter item_str=%s, score=%s", item_str, doc_item['score'])
    if match_triple_docs:  # sort by score in descending order
        match_triple_docs.sort(key=lambda x: x['score'], reverse=True)
    self.debug('>>> end _sort_docs_by_object <<<')
    return match_triple_docs
def generate_training_data(path):
    file_names = os.listdir(path)
    for name in file_names:
        if os.path.isdir(os.path.join(path, name)):
            print('%s is directory' % name)
            continue
        if name.startswith('seg_'):
            # Already-segmented file: only normalise the whitespace.
            if os.path.exists(os.path.join(path, 'seg_corpus/%s' % name)):
                print("%s already exists in seg_corpus, no need to regenerate it" % name)
                continue
            with codecs.open(os.path.join(path, name), mode='r', encoding='utf-8') as fr:
                lines = fr.readlines()
            with codecs.open(os.path.join(path, 'seg_corpus/%s' % name), mode='w', encoding='utf-8') as fw:
                print('start generate seg_corpus/%s' % name)
                for line in tqdm(lines):
                    if line.strip():
                        words = line.strip().split()
                        fw.write(' '.join([w.strip() for w in words if w.strip()]))
                        fw.write('\n')
        else:
            # Raw file: segment each line before writing it out.
            if os.path.exists(os.path.join(path, 'seg_corpus/seg_%s' % name)):
                print("seg_%s already exists in seg_corpus, no need to regenerate it" % name)
                continue
            with codecs.open(os.path.join(path, name), mode='r', encoding='utf-8') as fr:
                lines = fr.readlines()
            with codecs.open(os.path.join(path, 'seg_corpus/seg_%s' % name), mode='w', encoding='utf-8') as fw:
                print('start generate seg_corpus/seg_%s' % name)
                for line in tqdm(lines):
                    if line.strip():
                        words, tags = seg_doc(line.strip())
                        fw.write(' '.join([w.strip() for w in words if w.strip()]))
                        fw.write('\n')
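# A usage sketch for generate_training_data, assuming a hypothetical data/corpus
# directory that already contains a seg_corpus/ subdirectory for the output files.
generate_training_data('data/corpus')
# raw files are segmented into data/corpus/seg_corpus/seg_<name>; files already named
# seg_* are only whitespace-normalised into data/corpus/seg_corpus/<name>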
def save_seg_books_txt(corpus_path, target_path):
    with codecs.open(corpus_path, mode='r', encoding='utf-8') as fr:
        doc = fr.read()
    if doc:
        with codecs.open(target_path, mode='w', encoding='utf-8') as fw:
            words, flags = seg_doc(doc)
            fw.write(" ".join(words))
    else:
        logger.warn('read from %s, got None', corpus_path)
def _sort_docs_by_subject(self, sentence, triple_docs):
    """
    Rank triple_docs by their longest common substring with the sentence.
    :param sentence: the question
    :param triple_docs: candidate triples
    :return: triples whose match score passes the threshold, sorted by length in descending order
    """
    self.debug('>>> start _sort_docs_by_subject <<<')
    words, tags = seg_doc(sentence)
    _words = [w.strip() for w in words if w.strip()]
    chosen_triple_docs = []
    for doc_item in triple_docs:
        item_str = doc_item.get('attribute_date', "")
        item_index = doc_item.get("attribute_date_index", "")
        item_words = [w.strip() for w in item_index.split()]
        # longest common substring between the question words and the subject words
        sub_string, length = longest_common_substring(_words, item_words)
        scores = [
            len(sub_string) / float(len(item_words)),
        ]
        if item_str in self.entity_synonym:
            # The subject has synonyms: also match each synonym against the sentence.
            for extend_str in self.entity_synonym[item_str]:
                syn_words, syn_tags = seg_doc(extend_str)
                extend_str_words = [w.strip() for w in syn_words if w.strip()]
                sub_string, length = longest_common_substring(_words, extend_str_words)
                scores.append(len(sub_string) / float(len(extend_str_words)))
        doc_item['score'] = max(scores)  # best match among the subject and its synonyms
        doc_item['length'] = len(item_words)
        if doc_item['score'] >= TRIPLE_MATCH_THRESHOLD:  # keep triples above the threshold
            self.debug('choose item_str=%s, score=%s, length=%s',
                       item_str, doc_item['score'], doc_item['length'])
            chosen_triple_docs.append(doc_item)
        else:  # drop triples below the threshold
            self.debug("filter item_str=%s, score=%s, length=%s",
                       item_str, doc_item['score'], doc_item['length'])
    if chosen_triple_docs:  # sort by length in descending order
        chosen_triple_docs.sort(key=lambda x: x['length'], reverse=True)
    self.debug('>>> end _sort_docs_by_subject <<<')
    return chosen_triple_docs
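# longest_common_substring is not defined in this snippet. A minimal sketch of a
# dynamic-programming version over token lists, assuming it returns the matched
# tokens together with their count.
def longest_common_substring(words_a, words_b):
    """Longest common contiguous run of tokens between two word lists."""
    dp = [[0] * (len(words_b) + 1) for _ in range(len(words_a) + 1)]
    best_len, best_end = 0, 0
    for i in range(1, len(words_a) + 1):
        for j in range(1, len(words_b) + 1):
            if words_a[i - 1] == words_b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best_len:
                    best_len, best_end = dp[i][j], i
    return words_a[best_end - best_len:best_end], best_len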
def clear_p_content(content):
    # Clean the regular expressions of a template page and extract each expression's keywords.
    logger.debug('>>> start clear_p_content <<<')
    ret_content = list()
    keywords_list = list()
    for c in content:
        # Rewrite Java-style named groups in the template into the Python syntax.
        c_text = c.get_text()
        ret_content.append(c_text.replace('(?<', '(?P<'))
        clear_c_text = c_text.replace('(?<title>(.*)?)', '').\
            replace('?<title>', '').\
            replace('.{0,4}', '').\
            replace('.{0,6}', '').\
            replace('(.*)?', '')
        # Extract the Chinese keywords of the expression; they are used to filter out irrelevant templates.
        words_str = clear_c_text.replace('(', ' ').replace(')', ' ').replace('?', ' ').replace('|', ' ')
        words, tags = seg_doc(words_str)
        keywords = " ".join(set([w for w in words if w.strip()]))
        keywords_list.append(keywords)
    logger.debug('>>> end clear_p_content <<<')
    return ret_content, keywords_list
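# For illustration, the named-group rewrite applied by clear_p_content, shown on a
# hypothetical template pattern; only the '(?<' -> '(?P<' replacement is the point here.
import re

java_style = u'(?<title>(.*)?)是什么'          # Java-style named group
py_style = java_style.replace('(?<', '(?P<')   # -> u'(?P<title>(.*)?)是什么'
match = re.match(py_style, u'细胞壁是什么')
print(match.group('title'))                    # -> 细胞壁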
def write2mongodb(path):
    logger.debug('>>> start write2mongodb <<<')
    triple_docs = load_xlsx(path, start_row=1, start_col=1)
    logger.debug('load from %s, got triple_docs=%s', path, len(triple_docs))
    for doc in triple_docs:
        query = doc[0]
        query_words, query_tags = seg_doc(query)
        answer = doc[1]
        triple_subject = ""
        triple_predicate = ""
        triple_object = ""
        info = {
            "query": query,
            "answer": answer,
            "query_index": " ".join(query_words),
            "triple_subject": triple_subject,
            "triple_predicate": triple_predicate,
            "triple_object": triple_object
        }
        logger.debug('info=%s', json.dumps(info))
        collection.insert(info)
def _seg_words(self, sentence):
    words, flags = seg_doc(sentence)
    return words
# NOTE: the import of the MongoDB settings is truncated in the source; MONGODB_HOST,
# MONGODB_PORT, MONGODB_DBNAME, MONGODB_BIOLOGY_TRIPLE and MONGODB_BIOLOGY_NODE are
# assumed to come from the project's configuration module.
import json

from pymongo import MongoClient
from tqdm import tqdm

from logger import BaseLogger
from utils import seg_doc

client = MongoClient(MONGODB_HOST, MONGODB_PORT)
db = client.get_database(MONGODB_DBNAME)
t_collection = db.get_collection(MONGODB_BIOLOGY_TRIPLE)
n_collection = db.get_collection(MONGODB_BIOLOGY_NODE)
logger = BaseLogger()

node_docs = n_collection.find()  # read every node from MONGODB_BIOLOGY_NODE
logger.debug('start extract triple and write to %s', MONGODB_BIOLOGY_TRIPLE)
for doc in tqdm(node_docs):  # walk all nodes, read their attributes and write them to MONGODB_BIOLOGY_TRIPLE
    _id = str(doc['_id'])
    for key in doc.keys():
        if key not in ['_id', 'update_time', 'label', 'create_time']:  # skip bookkeeping fields
            if key == 'name':  # the name attribute is already a plain string
                triple = {"node_id": _id, "attribute_name": key, "attribute_date": doc[key]}
            else:  # other attributes are lists and need to be joined
                triple = {"node_id": _id, "attribute_name": key, "attribute_date": "\n".join(doc[key])}
            words, tags = seg_doc(triple['attribute_date'])
            triple['attribute_date_index'] = " ".join([w for w in words if w.strip()])
            logger.debug('triple=%s', json.dumps(triple, ensure_ascii=False))
            t_collection.insert(triple)