def debug():
    ner = NamedEntityReg()
    entity_linker = PageMemoryEntityLinker()
    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()
    table_parser = Resource.get_singleton().get_table_parser(entity_linker, ner)
    important_domains = Resource.get_singleton().get_important_domains()

    url = 'baike.so.com/doc/8342332-8659322.html'
    path = os.path.join(cache_dir, 'tables/兰萨法姆.html')
    entity_types = entity_linker.bk_info_map[url].types
    names = entity_linker.url2names[url]
    page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                         get_url_domains(entity_types, important_domains), entity_types)

    html = load_html_file(path)
    entity_linker.start_new_page(url)
    entity_linker.team_suffix_dict.meet_url('baike.so.com/doc/6644091-6857906.html')

    tables = parse_tables_from_html(html)
    tables = [encode_table(table) for table in tables]
    for table in tables:
        print table['columns']
        kns = table_parser.parse(table, page_info, entity_types)
        for kn in kns:
            print kn.info()
def collect_team_suffix(suffix_out_path):
    Print("collect team suffix, write to [%s]" % suffix_out_path)
    ename_title_map = Resource.get_singleton().get_baike_ename_title()
    baike_info_map = Resource.get_singleton().get_baike_info()
    ltp = Resource.get_singleton().get_ltp()

    suffix_cnt = {}
    Print("collect suffix")
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        e_types = baike_info_map[bk_url].types
        if not is_org(e_types):
            continue
        enames = ename_title_map[bk_url]
        for name in enames:
            words = ltp.cut(name)
            ed = len(words)
            for st in range(1, ed):
                suffix = "".join(words[st:])
                if not suffix in suffix_cnt:
                    suffix_cnt[suffix] = 0
                suffix_cnt[suffix] += 1

    threshold = 10
    outf = file(suffix_out_path, 'w')
    for key in sorted(suffix_cnt, key=lambda x: suffix_cnt[x], reverse=True):
        cnt = suffix_cnt[key]
        if cnt < threshold:
            continue
        outf.write("%s\t%d\n" % (key, cnt))
    outf.close()
def gen_name_map(extractor):
    baike_ename_title = Resource.get_singleton().get_baike_ename_title()
    url2names = Resource.get_singleton().get_url2names()
    bk_static_info = Resource.get_singleton().get_baike_info()
    location_dict = Resource.get_singleton().get_location_dict()
    location_dict = set([x.decode('utf-8') for x in location_dict])

    all_names = set()
    for bk_url in url2names:
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = url2names[bk_url]
        is_son = False
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(ename)
            if parent_name:
                is_son = True
        if is_son:
            continue
        for ename in enames:
            all_names.add(ename)

    name_map = {}
    Print("extract parent name")
    for bk_url in tqdm(baike_ename_title, total=len(baike_ename_title)):
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = baike_ename_title[bk_url]
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(ename)  # returns unicode or None
            if not parent_name:
                continue
            if parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, parent_name, ename.decode('utf-8'))
            second_parent_name = del_loc_prefix(parent_name, location_dict)
            if second_parent_name and second_parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, second_parent_name, ename.decode('utf-8'))
    return name_map
def train_extract_summary_name(summary_path, out_path):
    outf = file(out_path, 'w')
    url2names = Resource.get_singleton().get_url2names()
    extor = SummaryNameExtractor()
    Print('train summary extra name')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = extor.find_name_sentence(summary, names)
        if ret is None:
            continue
        sent, first_name = ret

        ret = extor.find_extra_name(sent, first_name, names)
        if ret is None:
            continue
        first_name, sent, second_name = ret
        outs = [url, first_name, sent, second_name]
        outf.write('%s\n' % ('\t'.join(outs)))
    outf.close()
def gen_team_suffix_dict(suffixes):
    suffixes = set(suffixes)
    outpath = os.path.join(extra_name_dir, 'extra_team_name_dict.tsv')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()

    team_dicts = TeamDicts(suffixes)
    for bk_url in baike_info_map:
        types = baike_info_map[bk_url].types
        if not is_team(types):
            continue
        ori_names = set(url2names[bk_url])
        enames = ename_title_map[bk_url]
        for ename in enames:
            suffix = try_get_str_suffix(ename, suffixes)
            if len(suffix) == 0:
                continue
            team_dicts.add_url(bk_url, suffix)
            new_name = ename[:len(ename) - len(suffix)]
            if not new_name in ori_names:
                team_dicts.add_name(bk_url, new_name, suffix)
            new_name = new_name + "队"
            if not new_name in ori_names:
                team_dicts.add_name(bk_url, new_name, suffix)
    team_dicts.save(outpath)
def __init__(self, doc_processor, rel_extractor, linker, link_map_out=False):
    self.ltp = Resource.get_singleton().get_ltp()
    self.doc_processor = doc_processor
    self.ner = doc_processor.ner
    self.rel_extractor = rel_extractor
    self.linker = linker
    self.title2url = Resource.get_singleton().get_title2url()
    if link_map_out:
        self.link_map_outf = file(os.path.join(cache_dir, 'link_map.json'), 'w')
    else:
        self.link_map_outf = None
def __init__(self, entity_linker, rel_linker):
    self.entity_linker = entity_linker
    self.rel_linker = rel_linker
    self.schema = Resource.get_singleton().get_schema()
    self.title2rel = {
        'profession': 'fb:people.person.profession',
        'nationality': 'fb:people.person.nationality',
    }
def try_get_str_suffix(name, suffixes):
    ltp = Resource.get_singleton().get_ltp()
    words = list(ltp.cut(name))
    for st in range(1, len(words)):
        text = "".join(words[st:])
        if text in suffixes:
            return text
    return ""
def try_get_suffix(name, suffixes):
    ltp = Resource.get_singleton().get_ltp()
    words = list(ltp.cut(name.encode('utf-8')))
    words = [w.decode('utf-8') for w in words]
    for st in range(1, len(words)):
        text = u"".join(words[st:])
        if text in suffixes:
            return text
    return u""
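# Usage sketch for the two helpers above (hypothetical demo, not part of the
# original module; the exact result depends on how LTP segments the name):
# try_get_str_suffix works on utf-8 byte strings, try_get_suffix on unicode.
# Both return the longest proper suffix of the segmented name that appears in
# `suffixes`, or an empty string when nothing matches.
def _demo_try_get_suffix():
    suffixes = set([u'俱乐部', u'足球俱乐部'])
    # assuming LTP splits off 足球俱乐部 as trailing words, this prints u'足球俱乐部'
    print try_get_suffix(u'北京国安足球俱乐部', suffixes)
    # byte-string variant with a utf-8 encoded suffix set
    print try_get_str_suffix('北京国安足球俱乐部', set(['足球俱乐部', '俱乐部']))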
def __init__(self, name_dict=None, process_bracket_flag=True, add_time_entity=True):
    resource = Resource.get_singleton()
    if name_dict is None:
        name_dict = resource.get_vertical_domain_baike_dict()
    self.ltp = resource.get_ltp()
    self.post_processor = NamedEntityPostProcessor(name_dict, process_bracket_flag,
                                                   add_time_entity)
def __init__(self, ner=None):
    self.ner = ner
    self.ltp = Resource.get_singleton().get_ltp()
    self.subj_miss_patterns = [
        ['p', '《'],
        ['p', 'n'],
        ['p', 'j'],
        ['p', 'v'],
        ['v'],
        ['a', 'v'],
        ['d', 'v'],
    ]
def generate_data_from_doc(doc_path, bk2fb, fb_uris, outpath):
    resource = Resource.get_singleton()
    fb_rels_map = resource.get_half_named_fb_info()
    ner = NamedEntityReg()
    e_linker = PageMemoryEntityLinker()
    doc_processor = DocProcessor(ner)
    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    important_domains = resource.get_important_domains()
    rel_extractor = VerbRelationExtractor()
    schema = resource.get_schema()

    Print('generate data from [%s]' % os.path.basename(doc_path))
    outf = file(outpath, 'w')
    cnt = 0
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        if bk_url not in bk2fb:
            continue
        fb_uri = bk2fb[bk_url]
        if fb_uri not in fb_rels_map:
            continue
        fb_rels = fb_rels_map[fb_uri]
        cnt += 1
        if cnt % 100 == 0:
            print ""
            Print('cnt = %d' % cnt)
        # Print('parsing %s' % bk_url)
        # outf.write('##start parsing %s\n' % (bk_url))

        bk_info = bk_info_map[bk_url]
        if bk_info.pop < 4 + 5:
            continue
        types = bk_info.types
        names = url2names[bk_url]
        page_info = PageInfo(names[-1], names, bk_url,
                             get_url_domains(types, important_domains), types)
        e_linker.start_new_page(bk_url)

        # summary = [json.loads(summary)['summary']]
        # chapter_title = 'intro_summary'

        doc = json.loads(doc)
        for chapter_title, chapter in doc:
            chapter = [para for para in chapter if para.find('</table>') == -1]
            if len(chapter) == 0:
                continue
            generate_data_from_chapter(chapter_title, chapter, page_info,
                                       doc_processor, fb_rels, rel_extractor,
                                       outf, e_linker, schema)
    outf.close()
def test_path():
    ner = NamedEntityReg()
    entity_linker = PageMemoryEntityLinker()
    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()
    table_parser = Resource.get_singleton().get_table_parser(entity_linker, ner)
    important_domains = Resource.get_singleton().get_important_domains()

    doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    for line in file(doc_path, 'r'):
        url, doc = line.split('\t')
        if not url in entity_linker.bk_info_map:
            print 'error url', url
            continue  # skip urls missing from the linker's info map
        entity_types = entity_linker.bk_info_map[url].types
        names = entity_linker.url2names[url]
        page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                             get_url_domains(entity_types, important_domains),
                             entity_types)
        entity_linker.start_new_page(url)

        doc = json.loads(doc)
        for chapter_title, html in doc:
            if not type(html) is unicode:
                continue
            tables = parse_tables_from_html(html)
            for table in tables:
                table = encode_table(table)
                table_kns = table_parser.parse(table, page_info, entity_types)
                if len(table_kns) > 0:
                    print chapter_title, table['columns']
                    for line, row_kns in table_kns:
                        print "\t%s" % line
                        for kn in row_kns:
                            print "\t\t%s" % kn.info()
def load_local_info(path, cnt_path):
    if not os.path.exists(path):
        outf = file(path, 'w')
        bk_info_map = Resource.get_singleton().get_baike_info()
        for line in file(cnt_path):
            bk_url = line.split('\t')[0]
            types = bk_info_map[bk_url].types
            info = {"types": types}
            outf.write("%s\t%s\n" % (bk_url, json.dumps(info)))
        outf.close()

    local_info = {}
    for line in file(path):
        bk_url, info = line.split('\t')
        local_info[bk_url] = json.loads(info)
    return local_info
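# Hypothetical usage sketch: this mirrors the call in the table __main__ block
# further below. The first call builds the per-url type cache from the count
# file; later calls just reload the cached json lines.
# local_info = load_local_info(os.path.join(table_dir, 'local_info.json'),
#                              os.path.join(table_dir, 'table_column_count.tsv'))
# types = local_info[some_bk_url]['types']   # some_bk_url is a placeholder key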
def __init__(self, lowercase=True):
    resource = Resource.get_singleton()
    resource.load_baike_names(lowercase=lowercase)
    self.bk_info_map = resource.get_baike_info()
    self.name2bk = resource.get_name2bk(lowercase)
    self.url2names = resource.get_url2names(lowercase)
    self.team_suffix_dict = resource.get_team_suffix_dict()
    if lowercase:
        self.lower_name2bk = resource.get_lower_name2bk()
    self.summary_map = resource.get_summary_with_infobox()
    self.location_dict = resource.get_location_dict()
    self.lowercase = lowercase

    self.memory = PageMemory()
    self.adjust_pop_by_summary()
def gen_province_dict():
    Print('generate province dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    out_path = os.path.join(dict_dir, 'province.txt')

    province_names = set()
    error_domains = get_error_domains()
    for bk_url in tqdm(ename_title_map, total=len(ename_title_map)):
        enames = ename_title_map[bk_url]
        if not bk_url in baike_info_map:
            continue
        bk_info = baike_info_map[bk_url]
        bk_types = bk_info.types
        if not "fb:location.location" in bk_types:
            continue

        is_province = False
        for bk_type in bk_types:
            if get_domain(bk_type) == 'fb:location' and ('state' in bk_type or "province" in bk_type):
                print "province bk_type: %s" % bk_url
                is_province = True
        for ename in enames:
            ename = ename.decode('utf-8')
            if len(ename) > 2 and (ename.endswith(u'省') or ename.endswith(u"州")):
                print "province ename: %s %s" % (ename, bk_url)
                is_province = True

        # if is_province:
        #     for bk_type in bk_types:
        #         if get_domain(bk_type) in error_domains:
        #             is_province = False
        #             print "province error type: %s" % (bk_url)

        if is_province:
            province_names.update(enames)

    outf = file(out_path, 'w')
    for name in province_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % (name))
    outf.close()
def process(inpath, outpath, name_map, fb_uris):
    schema = Resource.get_singleton().get_schema()
    error_props = load_error_property()

    Print('process %s' % inpath)
    outf = file(outpath, 'w')
    error_outf = file('log/error.log', 'w')
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        fb_key, rels = line.split('\t')
        if not fb_key in fb_uris:
            continue
        rels = json.loads(rels)
        new_rels = {}
        for fb_property, obj in rels:
            if schema.reverse_property(fb_property) == fb_property:
                continue
            if fb_property in error_props:
                continue
            if obj in name_map:
                names = name_map[obj]
            else:
                literal = process_fb_value(obj)
                if literal.startswith('fb:m.'):
                    # error_outf.write('error property %s, entity %s\n' % (fb_property, fb_key))
                    names = []
                else:
                    names = [process_fb_value(obj)]
            if len(names) == 0:
                continue
            if not fb_property in new_rels:
                new_rels[fb_property] = []
            new_rels[fb_property].extend(names)

        big = False
        for fb_property in new_rels:
            new_rels[fb_property] = list(set(new_rels[fb_property]))
            if len(new_rels[fb_property]) > 300:
                error_outf.write('big size property of url = %s, property = %s, size = %d\n'
                                 % (fb_key, fb_property, len(new_rels[fb_property])))
        outf.write("%s\t%s\n" % (fb_key, json.dumps(new_rels, ensure_ascii=False)))
    outf.close()
    error_outf.close()
def test_chapt():
    import json
    import os
    urls = [
        'baike.so.com/doc/1287918-1361771.html',
        'baike.so.com/doc/4835393-5052275.html',
        'baike.so.com/doc/2526484-2669235.html',
        'baike.so.com/doc/5382393-5618748.html',
        'baike.so.com/doc/6662392-6876216.html',
        'baike.so.com/doc/3056594-3221987.html',
        'baike.so.com/doc/8716294-9038723.html',
        'baike.so.com/doc/5390356-5627004.html'
    ]
    # urls = ["baike.so.com/doc/1287918-1361771.html"]
    resource = Resource.get_singleton()
    url2names = resource.get_url2names()
    baike_info_map = resource.get_baike_info()
    baike_doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    doc_processor = DocProcessor()
    for line in file(baike_doc_path):
        url, doc = line.split('\t')
        if not url in urls:
            continue
        doc = json.loads(doc)
        names = url2names[url]
        ename = names[0]
        types = baike_info_map[url].types
        page_info = PageInfo(ename, names, url, [], types)
        print " ".join(page_info.names)
        for chapter_title, chapter in doc:
            print 'parsing %s' % chapter_title
            for ltp_result, str_entities, subj_miss in doc_processor.parse_chapter(
                    chapter_title, chapter, page_info, parse_ner=True):
                # print '\t' + ltp_result.sentence
                if subj_miss:
                    print "\t\t" + ltp_result.sentence
def gen_citytown_dict():
    Print('generate citytown dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()

    citytown_names = set()
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        if not bk_url in baike_info_map or not bk_url in ename_title_map:
            continue
        bk_types = baike_info_map[bk_url].types
        if not 'fb:location.location' in bk_types:
            continue
        if not "fb:location.citytown" in bk_types:
            continue
        if 'fb:people.person' in bk_types:
            continue
        enames = ename_title_map[bk_url]

        # is_error_name = False
        # error_suffix = ['乡', "镇", '村', '街道', '道路']
        # for ename in enames:
        #     for suffix in error_suffix:
        #         if ename.endswith(error_suffix):
        #             is_error_name = True
        # if is_error_name:
        #     continue

        citytown_names.update(enames)

    out_path = os.path.join(dict_dir, 'citytown.txt')
    outf = file(out_path, 'w')
    for name in citytown_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % name)
    outf.close()
def __init__(self, name_dict, process_bracket_flag, add_time_entity):
    self.dict = name_dict
    self.process_bracket_flag = process_bracket_flag
    self.add_time_entity = add_time_entity
    self.ltp = Resource.get_singleton().get_ltp()
def link_partial_match_predicate(self, predicate):
    mapped_probs = {}
    for infobox_pred in self.predicate_map:
        if infobox_pred.find(predicate) == -1:
            continue
        match_ratio = len(predicate.decode('utf-8')) / float(len(infobox_pred.decode('utf-8')))
        if match_ratio < 0.5:
            continue
        probs = self.predicate_map[infobox_pred]
        for fb_prop in probs:
            prob = probs[fb_prop] * match_ratio
            if prob < 0.1:
                continue
            if mapped_probs.get(fb_prop, 0) < prob:
                mapped_probs[fb_prop] = prob
    return mapped_probs


if __name__ == "__main__":
    # rel_linker = MatchRelLinker()
    # probs = rel_linker.link_partial_match_predicate(u'出版')
    resource = Resource.get_singleton()
    predicate_map = resource.get_predicate_map()
    probs = predicate_map['决赛']
    for prop in probs:
        print prop, probs[prop]
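# Worked example for link_partial_match_predicate above (toy numbers, not taken
# from the real predicate_map): with predicate '职业' and an infobox predicate
# '主要职业' mapped to {'fb:people.person.profession': 0.8}, the match ratio is
# len(u'职业') / len(u'主要职业') = 2 / 4 = 0.5, which survives the 0.5 ratio cut,
# so the property is scored 0.8 * 0.5 = 0.4. Scaled scores below 0.1 are dropped,
# and only the highest score per Freebase property is kept across all matching
# infobox predicates.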
def __init__(self):
    resource = Resource.get_singleton()
    self.predicate_map = resource.get_predicate_map()
        return False
    # if re_eng.match(name):
    #     return False
    # if has_punc_eng(name):
    #     return False
    if BaikeDatetime.parse(name, strict=True) is not None:
        return False
    return True


def get_domain(fb_type):
    return fb_type.split('.')[0]


# valid_domains = set(['fb:film', 'fb:tv', 'fb:soccer', 'fb:sports', 'fb:astronomy', 'fb:music', 'fb:book', 'fb:award'])
valid_domains = Resource.get_singleton().get_important_domains()


def is_vertical_domain(types):
    global valid_domains
    for fb_type in types:
        if get_domain(fb_type) in valid_domains or fb_type in valid_domains:
            return True
    return False


if __name__ == "__main__":
    name2bk = Resource.get_singleton().get_name2bk()
    keys = sorted(name2bk.keys())
def test_ltp_extractor(datas_map, doc_processor, rel_extractor, linker):
    resource = Resource.get_singleton()
    schema = resource.get_schema()
    # base_dir = os.path.join(data_dir, '标注数据')
    # stf_results_map = load_stanford_result(os.path.join(base_dir, 'sentences.txt'),
    #                                        os.path.join(base_dir, 'sentences_stanf_nlp.json'))
    link_maps = None
    link_maps = load_links_map(os.path.join(cache_dir, 'link_map.json'))
    ltp_extractor = SimpleLTPExtractor(doc_processor, rel_extractor, linker, link_maps is None)
    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    url_map = load_url_map()
    important_domains = resource.get_important_domains()
    same_link_map = load_same_linkings()

    estimation = {
        "total output": 0,
        'total labeled': 0,
        'right output': 0,
    }
    str_estimation = {
        "total output": 0,
        'total labeled': 0,
        'right output': 0,
    }
    for baike_name in datas_map:
        datas = datas_map[baike_name]
        url = url_map[baike_name]
        # print baike_name
        # print url
        names = url2names[url]
        types = bk_info_map[url].types
        page_info = PageInfo(baike_name, names, url,
                             get_url_domains(types, important_domains), types)
        linker.entity_linker.start_new_page(url)
        for data in datas:
            sentence = data.sentence
            # if sentence != '《生活大爆炸》(The Big Bang Theory)是由查克·洛尔和比尔·普拉迪创作的一出美国情景喜剧,此剧由华纳兄弟电视公司和查克·洛尔制片公司共同制作。':
            #     continue
            print sentence
            para_info = ParagraphInfo(3, names, baike_name, False, True)
            ltp_result, _ = doc_processor.parse_sentence(sentence, para_info)
            str_entities = doc_processor.ner.recognize(ltp_result.sentence, ltp_result,
                                                       page_info, None)
            triples, ltp_result = ltp_extractor.parse_sentence(ltp_result, str_entities,
                                                               page_info, link_maps)

            kl_set = set()
            str_set = set()
            for kl in data.knowledges:
                kl.subj_url = same_link_map.get(kl.subj_url, kl.subj_url)
                kl.obj_url = same_link_map.get(kl.obj_url, kl.obj_url)
                str_set.add("%s\t%s\t%s" % (kl.subj.encode('utf-8'), kl.prop.encode('utf-8'),
                                            kl.obj.encode('utf-8')))
                str_set.add("%s\t%s\t%s" % (kl.obj.encode('utf-8'), kl.prop.encode('utf-8'),
                                            kl.subj.encode('utf-8')))
                kl_set.add("%s\t%s\t%s" % (kl.subj_url, kl.prop_uri, kl.obj_url))
                reverse_prop_uri = schema.reverse_property(kl.prop_uri)
                if reverse_prop_uri:
                    kl_set.add("%s\t%s\t%s" % (kl.obj_url, reverse_prop_uri, kl.subj_url))
            estimation['total labeled'] += len(data.knowledges)
            str_estimation['total labeled'] += len(data.knowledges)

            for triple in triples:
                str_estimation['total output'] += 1
                subj = ltp_result.text(triple.baike_subj.st, triple.baike_subj.ed)
                obj = ltp_result.text(triple.baike_obj.st, triple.baike_obj.ed)
                rel = ltp_result.text(triple.fb_rel.st, triple.fb_rel.ed)
                subj_url = same_link_map.get(triple.baike_subj.baike_url, triple.baike_subj.baike_url)
                obj_url = same_link_map.get(triple.baike_obj.baike_url, triple.baike_obj.baike_url)
                prop = triple.fb_rel.fb_prop

                triple_str = "%s\t%s\t%s" % (subj, rel, obj)
                if triple_str in str_set:
                    flag_str = 'str_right'
                    str_estimation['right output'] += 1
                else:
                    flag_str = "str_error"

                if prop == 'None':
                    info = "%s:%s\t%s\t%s:%s" % (subj, subj_url, rel, obj, obj_url)
                    print "\t%s\t%s" % (info, flag_str)
                    continue

                info = triple.info(ltp_result)
                estimation['total output'] += 1
                if "%s\t%s\t%s" % (subj_url, prop, obj_url) in kl_set \
                        or "%s\t%s\t%s" % (subj_url, prop.split("^")[0], obj_url) in kl_set \
                        or "%s\t%s\t%s" % (subj_url, prop.split("^")[-1], obj_url) in kl_set:
                    estimation['right output'] += 1
                    print '\t%s\t%s' % (info, 'full_right')
                else:
                    print '\t%s\t%s' % (info, 'full_error')

            # str_estimation['total labeled'] += len(data.knowledges)
            # str_estimation['total output'] += len(half_linked_triples)
            # for triple in half_linked_triples:
            #     subj = ltp_result.text(triple.baike_subj.st, triple.baike_subj.ed)
            #     obj = ltp_result.text(triple.baike_obj.st, triple.baike_obj.ed)
            #     subj_url = same_link_map.get(triple.baike_subj.baike_url, triple.baike_subj.baike_url)
            #     obj_url = same_link_map.get(triple.baike_obj.baike_url, triple.baike_obj.baike_url)
            #     rel = ltp_result.text(triple.str_rel.st, triple.str_rel.ed)
            #     triple_str = "%s\t%s\t%s" % (subj_url, rel, obj_url)
            #     info = "%s:%s\t%s\t%s:%s" % (subj, subj_url, rel, obj, obj_url)
            #     if triple_str in str_set:
            #         str_estimation['right output'] += 1
            #         print "\t%s\t%s" % (info, 'str_right')
            #     else:
            #         print "\t%s\t%s" % (info, 'str_error')

            for kl in data.knowledges:
                print '\t\t%s' % kl.info()

    print estimation
    print str_estimation
    ltp_extractor.finish()
def work(inpath, outpath):
    ner = NamedEntityReg()
    doc_processor = DocProcessor(ner)
    rel_extractor = VerbRelationExtractor()
    entity_linker = PageMemoryEntityLinker()
    rel_linker = MatchRelLinker()
    linker = SeparatedLinker(entity_linker, rel_linker)
    table_parser = Resource.get_singleton().get_table_parser(entity_linker, ner)
    ltp_extractor = SimpleLTPExtractor(doc_processor, rel_extractor, linker)

    url2names = linker.entity_linker.url2names
    bk_info_map = linker.entity_linker.bk_info_map
    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()
    important_domains = Resource.get_singleton().get_important_domains()
    schema = Resource.get_singleton().get_schema()

    outf = file(outpath, 'w')
    total = nb_lines_of(inpath)
    for cnt, line in enumerate(file(inpath), start=1):
        url, chapters = line.split('\t')
        if not url in url2names or not url in bk_info_map or not url in baike_ename_title_map:
            print 'error url %s' % (url)
            continue
        chapters = json.loads(chapters)
        outf.write('##parse url:%s\n' % url)
        Print('parse url:%s (%d/%d)' % (url, cnt, total))

        names = url2names[url]
        types = bk_info_map[url].types
        page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                             get_url_domains(types, important_domains), types)
        entity_linker.start_new_page(url)

        kn_writer = PageKnowledgeHandler()
        for title, chapter in chapters:
            try:
                if type(chapter) is unicode:
                    tables = parse_tables_from_html(chapter)
                    tables = [encode_table(table) for table in tables]
                    for table in tables:
                        table_kns = table_parser.parse(table, page_info, types)
                        if len(table_kns) > 0:
                            for line, row_kns in table_kns:
                                kns = []
                                for kn in row_kns:
                                    kns.append("\t%s\t1" % kn.info())
                                kn_writer.add(line, kns)
                                # outf.write("%s\n" % line)
                                # for kn in row_kns:
                                #     outf.write("\t%s\t1\n" % kn.info())
                else:
                    for ltp_result, str_entities, _ in doc_processor.parse_chapter(
                            title, chapter, page_info, parse_ner=True):
                        if ltp_result is None:
                            continue
                        triples, _ = ltp_extractor.parse_sentence(ltp_result, str_entities,
                                                                  page_info, None, False)
                        triples = [triple for triple in triples if triple.score() > 0.01]
                        if len(triples) > 0:
                            kns = []
                            for triple in triples:
                                kns.append("\t%s" % triple.info(ltp_result))
                            kn_writer.add(ltp_result.sentence, kns)
                            # outf.write("%s\n" % (ltp_result.sentence))
                            # for triple in triples:
                            #     outf.write("\t%s\n" % triple.info(ltp_result))
            except Exception, e:
                print "error at url:%s chapter:%s" % (url, title)
                print str(e)

        kn_writer.handle_uniq_prop(schema)
        kn_writer.output(outf)
    outf.close()
def gen_title_rel_dict(fb_type, count_filepath, out_path, cnt_threshold,
                       extra_name_filepath=None, error_func=None, url_path=None):
    Print('gen dict by type [%s]' % fb_type)
    candidate_urls = set()
    resource = Resource.get_singleton()
    baike_static_info_map = resource.get_baike_info()
    Print("gen candidate baike_url")
    for bk_url in tqdm(baike_static_info_map, total=len(baike_static_info_map)):
        types = baike_static_info_map[bk_url].types
        if fb_type in types:
            candidate_urls.add(bk_url)

    candidate_names = set()
    if count_filepath is not None:
        for line in file(count_filepath):
            p = line.strip().split('\t')
            if len(p) == 2:
                name, cnt = p
                cnt = int(cnt)
                if cnt >= cnt_threshold:
                    candidate_names.add(name)
    Print('#candidate urls = %d, #candidate names = %d'
          % (len(candidate_urls), len(candidate_names)))

    # ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()
    title_names = set()
    title2url = {}
    for candidate_url in candidate_urls:
        enames = url2names[candidate_url]
        for ename in enames:
            if ename in candidate_names or count_filepath is None:
                # assert ename not in title_names
                title_names.add(ename)
                if ename in title2url:
                    # keep the url with the higher popularity for a shared name
                    pre_pop = baike_static_info_map[title2url[ename]].pop
                    pop = baike_static_info_map[candidate_url].pop
                    if pre_pop > pop:
                        title_url = title2url[ename]
                    else:
                        title_url = candidate_url
                else:
                    title_url = candidate_url
                title2url[ename] = title_url
            else:
                print "%s: miss name: %s" % (fb_type, ename)

    if extra_name_filepath is not None:
        Print("add extra name from [%s]" % extra_name_filepath)
        for line in file(extra_name_filepath):
            title_names.add(line.rstrip())

    outf = file(out_path, 'w')
    if url_path:
        url_outf = file(url_path, 'w')
    for title_name in sorted(title_names):
        if title_name == '无':
            continue
        if error_func is not None and error_func(title_name):
            print "%s: error func name: %s" % (fb_type, title_name)
            continue
        if len(title_name.decode('utf-8')) < 2:
            print "%s: short name: %s" % (fb_type, title_name)
            continue
        if is_chinese(title_name):
            outf.write(title_name + '\n')
            if url_path:
                url_outf.write("%s\t%s\n" % (title_name, title2url[title_name]))
    outf.close()
    if url_path:
        url_outf.close()
def load_and_write_baike_name(bk_name_map, error_bracket_names, out_path):
    resource = Resource.get_singleton()
    bk_info_map = resource.get_baike_info()
    baike_entity_info_path = os.path.join(result_dir, '360/360_entity_info.json')
    total = 21710208
    Print('load and write baike name to [%s]' % out_path)
    baike_name_attrs = load_baike_name_attrs()

    art_work_types = set([
        'fb:film.film', 'fb:book.book', 'fb:book.written_work',
        'fb:cvg.computer_videogame', 'fb:tv.tv_program'
    ])

    outf = file(out_path, 'w')
    error_f = file('log/error.log', 'a')
    for line in tqdm(file(baike_entity_info_path), total=total):
        bk_url, obj = line.split('\t')
        bk_url = bk_url.decode('utf-8')
        static_info = bk_info_map[bk_url]
        bk_types = static_info.types

        is_art_work = False
        for bk_type in bk_types:
            if bk_type in art_work_types:
                is_art_work = True

        obj = json.loads(obj)
        names = [obj['ename'], obj['title']]
        info_names = set()
        info = obj['info']
        for baike_name in baike_name_attrs:
            if not baike_name in info:
                continue
            info_values = info[baike_name]
            for info_value in info_values:
                if info_value in names:
                    continue
                info_bracket_names = []
                info_value_names = unfold(info_value, info_bracket_names,
                                          [obj['ename'], obj['title']])
                for info_value_name in info_value_names:
                    in_bracket = info_value_name in info_bracket_names
                    if in_bracket and info_value_name in error_bracket_names \
                            and not is_abbre(info_value_names, info_value_name):
                        continue
                    if in_bracket and is_art_work:
                        continue
                    info_names.add(info_value_name)
        names.extend(info_names)

        fb_names = bk_name_map.get(bk_url, [])
        fb_names = [x for x in fb_names if not x in names]
        if len(fb_names) < 10:
            names.extend(fb_names)
        else:
            error_f.write('%s\t%s\t%s\n' % ('long_baike_names', bk_url, ' '.join(fb_names)))

        names = list(set(names))
        names = [html_unescape(x.replace('\n', "")).strip() for x in names]
        outf.write("%s\t%s\n" % (bk_url, "\t".join(names)))

        # is_person = "fb:people.person" in bk_types_map[bk_url]
        # if is_person:
        #     extra_names = []
        #     for name in names:
        #         extra_names = person_extra_names(name)
        #         names.extend(extra_names)
    outf.close()
    error_f.close()
import os

from .table_rule_parser import TableParser
from .collect_table import collect_tables, load_local_info
from src.extractor.resource import Resource

if __name__ == "__main__":
    table_parser = TableParser(None, None)
    table_parser.init(paths=None)
    table_parser.load_extra_table(None)

    table_cnt_path = os.path.join(table_dir, 'table_column_count.tsv')
    local_type_info_path = os.path.join(table_dir, 'local_info.json')
    local_info = load_local_info(local_type_info_path, table_cnt_path)
    cols_cnt, cols_type_cnt, cols_title_cnt = collect_tables(table_cnt_path, local_info)

    ruled_tables = set()
    schema = Resource.get_singleton().get_schema()
    for table_rule in table_parser.table_rules:
        table_rule.check(schema)
        for t in table_rule.register_tables:
            ruled_tables.add(t)

    coverred_cnt = 0
    miss_cnt = 0
    total = 0
    for table in cols_cnt:
        cnt = cols_cnt[table]
        total += cnt
        if cnt < 20:
            # if cnt > 20 and cnt < 30:
            #     print table, cnt
            continue
def extract_summary_name(summary_path, keywords, outpath, bracket_name_outpath):
    Print('extract extra name from [%s]' % summary_path)
    # url2names = Resource.get_singleton().get_url2names()
    url2names = load_url2names()
    bk_info_map = Resource.get_singleton().get_baike_info()
    error_domains = ['fb:chemistry']
    ext = SummaryNameExtractor()
    outf = file(outpath, 'w')
    bracket_name_outf = file(bracket_name_outpath, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')

        types = bk_info_map[url].types
        in_error_domain = False
        for bk_type in types:
            if get_domain(bk_type) in error_domains:
                in_error_domain = True
        if in_error_domain:
            continue

        summary = json.loads(summary)['summary']
        summary = summary.replace(u'(', u'(').replace(u')', u')')
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = ext.find_name_sentence(summary, names)
        if ret is None:
            extra_name = None
            sentences = split_sentences(summary)
            if len(sentences) > 0:
                first_sentence = sentences[0]
                no_subj = True
                for name in names:
                    if name in first_sentence:
                        no_subj = False
                if no_subj:
                    extra_name = ext.find_no_subj_name(summary, keywords)
        else:
            rest_sentence, first_name = ret
            extra_name = ext.find_new_extra_name(rest_sentence, keywords)

        if extra_name is not None:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            succeed_names = []
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                        and not too_long_name(extra_name, names) \
                        and not extra_name in names \
                        and not error_bracket_name(extra_name, names) \
                        and not too_short_name(extra_name) \
                        and not is_error_name(extra_name) \
                        and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
            if len(succeed_names) > 0:
                succeed_names = list(set(succeed_names))
                outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))
                names.extend(succeed_names)

        # extract bracket name
        extra_bracket_names = ext.extract_bracket_names(summary, keywords, names)
        succeed_names = []
        for extra_name in extra_bracket_names:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                        and not too_long_name(extra_name, names) \
                        and not extra_name in names \
                        and not too_short_name(extra_name) \
                        and not is_error_name(extra_name) \
                        and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
        if len(succeed_names) > 0:
            succeed_names = list(set(succeed_names))
            bracket_name_outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))

    outf.close()
    bracket_name_outf.close()
def extract_type_info(son_name_map_path, bk_type_info_path):
    if os.path.exists(bk_type_info_path):
        return
    resource = Resource.get_singleton()
    baike_ename_title = resource.get_baike_ename_title()
    name2bk = resource.get_name2bk()
    baike_info_map = resource.get_baike_info()

    ename2bk = {}
    for bk_url in baike_ename_title:
        if not bk_url in baike_info_map:
            continue
        types = baike_info_map[bk_url].types
        if is_art_work(types):
            continue
        enames = baike_ename_title[bk_url]
        for ename in enames:
            if not ename in ename2bk:
                ename2bk[ename] = []
            ename2bk[ename].append(bk_url)

    local_info_map = {}
    for line in file(son_name_map_path, 'r'):
        names = line.strip().split('\t')
        parent_name = names[0]
        son_names = names[1:]

        parent_urls = name2bk[parent_name]
        for parent_url in parent_urls:
            types = baike_info_map[parent_url].types
            if is_art_work(types):
                continue
            if not parent_url in local_info_map:
                local_info_map[parent_url] = {}
            info = local_info_map[parent_url]
            info['type'] = types
            info['fb'] = baike_info_map[parent_url].fb_uri
            if not 'name' in info:
                info['name'] = []
            info['name'].append(parent_name)

        for son_name in son_names:
            son_urls = ename2bk[son_name]
            for son_url in son_urls:
                if not son_url in local_info_map:
                    local_info_map[son_url] = {}
                types = baike_info_map[son_url].types
                info = local_info_map[son_url]
                info['type'] = types
                info['fb'] = baike_info_map[parent_url].fb_uri
                if not 'name' in info:
                    info['name'] = []
                info['name'].append(son_name)

    outf = file(bk_type_info_path, 'w')
    for url in local_info_map:
        outf.write('%s\t%s\n' % (url, json.dumps(local_info_map[url], ensure_ascii=False)))
    outf.close()