def debug():
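    # Debug helper: parse the tables of one cached baike page and print the
    # knowledge extracted from each table.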

    ner = NamedEntityReg()
    entity_linker = PageMemoryEntityLinker()

    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()

    table_parser = Resource.get_singleton().get_table_parser(
        entity_linker, ner)
    important_domains = Resource.get_singleton().get_important_domains()

    url = 'baike.so.com/doc/8342332-8659322.html'
    path = os.path.join(cache_dir, 'tables/兰萨法姆.html')
    entity_types = entity_linker.bk_info_map[url].types
    names = entity_linker.url2names[url]
    page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                         get_url_domains(entity_types, important_domains),
                         entity_types)
    html = load_html_file(path)
    entity_linker.start_new_page(url)
    entity_linker.team_suffix_dict.meet_url(
        'baike.so.com/doc/6644091-6857906.html')

    tables = parse_tables_from_html(html)
    tables = [encode_table(table) for table in tables]

    for table in tables:
        print table['columns']
        kns = table_parser.parse(table, page_info, entity_types)
        for kn in kns:
            print kn.info()
Example #2
def collect_team_suffix(suffix_out_path):
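    # Segment every organization name with LTP, count its word-level suffixes,
    # and write the suffixes seen at least `threshold` times, most frequent first.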
    Print("collect team suffix, write to [%s]" % suffix_out_path)

    ename_title_map = Resource.get_singleton().get_baike_ename_title()
    baike_info_map = Resource.get_singleton().get_baike_info()
    ltp = Resource.get_singleton().get_ltp()
    suffix_cnt = {}

    Print("collect suffix")
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        e_types = baike_info_map[bk_url].types
        if not is_org(e_types):
            continue
        enames = ename_title_map[bk_url]

        for name in enames:
            words = ltp.cut(name)
            ed = len(words)
            for st in range(1, ed):
                suffix = "".join(words[st:])
                if not suffix in suffix_cnt:
                    suffix_cnt[suffix] = 0
                suffix_cnt[suffix] += 1

    threshold = 10
    outf = file(suffix_out_path, 'w')
    for key in sorted(suffix_cnt, key=lambda x: suffix_cnt[x], reverse=True):
        cnt = suffix_cnt[key]
        if cnt < threshold:
            continue
        outf.write("%s\t%d\n" % (key, cnt))
    outf.close()
def gen_name_map(extractor):
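    # Build name_map: parent name -> enames that extend it. Only parent names that
    # are themselves names of non-art-work, non-"son" entities are kept; a location
    # prefix may also be stripped from the parent name.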
    baike_ename_title = Resource.get_singleton().get_baike_ename_title()
    url2names = Resource.get_singleton().get_url2names()
    bk_static_info = Resource.get_singleton().get_baike_info()
    location_dict = Resource.get_singleton().get_location_dict()

    location_dict = set([x.decode('utf-8') for x in location_dict])
    all_names = set()

    for bk_url in url2names:
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = url2names[bk_url]
        is_son = False
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(ename)
            if parent_name:
                is_son = True
        if is_son:
            continue

        for ename in enames:
            all_names.add(ename)

    name_map = {}
    Print("extract parent name")
    for bk_url in tqdm(baike_ename_title, total=len(baike_ename_title)):
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue

        enames = baike_ename_title[bk_url]
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(
                ename)  # return unicode or None
            if not parent_name:
                continue
            if parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, parent_name, ename.decode('utf-8'))
            second_parent_name = del_loc_prefix(parent_name, location_dict)
            if second_parent_name and second_parent_name.encode(
                    'utf-8') in all_names:
                add_to_dict_list(name_map, second_parent_name,
                                 ename.decode('utf-8'))

    return name_map
Example #4
def train_extract_summary_name(summary_path, out_path):
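    # For each summary, find a sentence that mentions a known name plus an extra
    # name, and write (url, first name, sentence, second name) to out_path.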
    outf = file(out_path, 'w')
    url2names = Resource.get_singleton().get_url2names()
    extor = SummaryNameExtractor()
    Print('train summary extra name')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = extor.find_name_sentence(summary, names)
        if ret is None:
            continue
        sent, first_name = ret

        ret = extor.find_extra_name(sent, first_name, names)

        if ret is None:
            continue

        first_name, sent, second_name = ret
        outs = [url, first_name, sent, second_name]
        outf.write('%s\n' % ('\t'.join(outs)))
    outf.close()
def gen_team_suffix_dict(suffixes):
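    # For every team entity, strip a known suffix from its enames and register the
    # shortened name (and its "队" variant) as extra team names.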
    suffixes = set(suffixes)
    outpath = os.path.join(extra_name_dir, 'extra_team_name_dict.tsv')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()

    team_dicts = TeamDicts(suffixes)
    for bk_url in baike_info_map:
        types = baike_info_map[bk_url].types
        if not is_team(types):
            continue

        ori_names = set(url2names[bk_url])

        enames = ename_title_map[bk_url]
        for ename in enames:
            suffix = try_get_str_suffix(ename, suffixes)
            if len(suffix) == 0:
                continue
            team_dicts.add_url(bk_url, suffix)
            new_name = ename[:len(ename) - len(suffix)]

            if not new_name in ori_names:
                team_dicts.add_name(bk_url, new_name, suffix)

            new_name = new_name + "队"
            if not new_name in ori_names:
                team_dicts.add_name(bk_url, new_name, suffix)

    team_dicts.save(outpath)
    def __init__(self,
                 doc_processor,
                 rel_extractor,
                 linker,
                 link_map_out=False):
        self.ltp = Resource.get_singleton().get_ltp()
        self.doc_processor = doc_processor
        self.ner = doc_processor.ner
        self.rel_extractor = rel_extractor
        self.linker = linker
        self.title2url = Resource.get_singleton().get_title2url()
        if link_map_out:
            self.link_map_outf = file(
                os.path.join(cache_dir, 'link_map.json'), 'w')
        else:
            self.link_map_outf = None
Example #7
    def __init__(self, entity_linker, rel_linker):
        self.entity_linker = entity_linker
        self.rel_linker = rel_linker
        self.schema = Resource.get_singleton().get_schema()
        self.title2rel = {
            'profession': 'fb:people.person.profession',
            'nationality': 'fb:people.person.nationality',
        }
Example #8
def try_get_str_suffix(name, suffixes):
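    # Return the longest word-level suffix of name (by LTP segmentation) that
    # appears in suffixes, or "" if none matches.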
    ltp = Resource.get_singleton().get_ltp()
    words = list(ltp.cut(name))

    for st in range(1, len(words)):
        text = "".join(words[st:])
        if text in suffixes:
            return text
    return ""
Example #9
def try_get_suffix(name, suffixes):
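    # Unicode variant of try_get_str_suffix: name is encoded for LTP and the
    # matched suffix is returned as unicode.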
    ltp = Resource.get_singleton().get_ltp()
    words = list(ltp.cut(name.encode('utf-8')))
    words = [w.decode('utf-8') for w in words]

    for st in range(1, len(words)):
        text = u"".join(words[st:])
        if text in suffixes:
            return text
    return u""
Example #10
    def __init__(self,
                 name_dict=None,
                 process_bracket_flag=True,
                 add_time_entity=True):
        resource = Resource.get_singleton()
        if name_dict is None:
            name_dict = resource.get_vertical_domain_baike_dict()
        self.ltp = resource.get_ltp()
        self.post_processor = NamedEntityPostProcessor(
            name_dict, process_bracket_flag, add_time_entity)
Example #11
    def __init__(self, ner=None):
        self.ner = ner
        self.ltp = Resource.get_singleton().get_ltp()
        self.subj_miss_patterns = [
            ['p', '《'],
            ['p', 'n'],
            ['p', 'j'],
            ['p', 'v'],
            ['v'],
            ['a', 'v'],
            ['d', 'v'],
        ]
def generate_data_from_doc(doc_path, bk2fb, fb_uris, outpath):
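    # Stream (bk_url, doc) lines, keep sufficiently popular entities that map to
    # Freebase relations, and run generate_data_from_chapter on each chapter
    # (paragraphs containing tables are dropped first).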
    resource = Resource.get_singleton()
    fb_rels_map = resource.get_half_named_fb_info()
    ner = NamedEntityReg()
    e_linker = PageMemoryEntityLinker()
    doc_processor = DocProcessor(ner)
    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    important_domains = resource.get_important_domains()
    rel_extractor = VerbRelationExtractor()
    schema = resource.get_schema()

    Print('generate data from [%s]' % os.path.basename(doc_path))
    outf = file(outpath, 'w')
    cnt = 0
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        if bk_url not in bk2fb:
            continue
        fb_uri = bk2fb[bk_url]
        if fb_uri not in fb_rels_map:
            continue
        fb_rels = fb_rels_map[fb_uri]
        cnt += 1
        if cnt % 100 == 0:
            print ""
            Print('cnt = %d' % cnt)
        # Print('parsing %s' %bk_url)
        # outf.write('##start parsing %s\n' %(bk_url))

        bk_info = bk_info_map[bk_url]
        if bk_info.pop < 4 + 5:
            continue
        types = bk_info.types
        names = url2names[bk_url]
        page_info = PageInfo(names[-1], names, bk_url,
                             get_url_domains(types, important_domains), types)

        e_linker.start_new_page(bk_url)

        # summary = [json.loads(summary)['summary']]
        # chapter_title = 'intro_summary'

        doc = json.loads(doc)
        for chapter_title, chapter in doc:
            chapter = [para for para in chapter if para.find('</table>') == -1]
            if len(chapter) == 0:
                continue
            generate_data_from_chapter(chapter_title, chapter, page_info,
                                       doc_processor, fb_rels, rel_extractor,
                                       outf, e_linker, schema)

    outf.close()
def test_path():
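    # Walk baike_doc.json, parse the tables of every HTML chapter, and print the
    # chapters and rows that yield knowledge.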
    ner = NamedEntityReg()
    entity_linker = PageMemoryEntityLinker()

    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()

    table_parser = Resource.get_singleton().get_table_parser(
        entity_linker, ner)
    important_domains = Resource.get_singleton().get_important_domains()

    doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    for line in file(doc_path, 'r'):
        url, doc = line.split('\t')
        if not url in entity_linker.bk_info_map:
            print 'error url', url
            continue
        entity_types = entity_linker.bk_info_map[url].types
        names = entity_linker.url2names[url]
        page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                             get_url_domains(entity_types, important_domains),
                             entity_types)

        entity_linker.start_new_page(url)
        doc = json.loads(doc)
        for chapter_title, html in doc:

            if not type(html) is unicode:
                continue

            tables = parse_tables_from_html(html)

            for table in tables:
                table = encode_table(table)

                table_kns = table_parser.parse(table, page_info, entity_types)
                if len(table_kns) > 0:
                    print chapter_title, table['columns']
                    for line, row_kns in table_kns:
                        print "\t%s" % line
                        for kn in row_kns:
                            print "\t\t%s" % kn.info()
def load_local_info(path, cnt_path):
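    # Build (if absent) and then load a per-url cache of {"types": ...} for the
    # urls listed in cnt_path.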
    if not os.path.exists(path):
        outf = file(path, 'w')
        bk_info_map = Resource.get_singleton().get_baike_info()
        for line in file(cnt_path):
            bk_url = line.split('\t')[0]
            types = bk_info_map[bk_url].types
            info = {"types": types}
            outf.write("%s\t%s\n" %(bk_url, json.dumps(info)) )
        outf.close()

    local_info = {}
    for line in file(path):
        bk_url, info = line.split('\t')
        local_info[bk_url] = json.loads(info)
    return local_info
Example #15
    def __init__(self, lowercase=True):
        resource = Resource.get_singleton()
        resource.load_baike_names(lowercase=lowercase)
        self.bk_info_map = resource.get_baike_info()
        self.name2bk = resource.get_name2bk(lowercase)
        self.url2names = resource.get_url2names(lowercase)
        self.team_suffix_dict = resource.get_team_suffix_dict()
        if lowercase:
            self.lower_name2bk = resource.get_lower_name2bk()
        self.summary_map = resource.get_summary_with_infobox()

        self.location_dict = resource.get_location_dict()

        self.lowercase = lowercase
        self.memory = PageMemory()

        self.adjust_pop_by_summary()
def gen_province_dict():
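    # Collect Chinese names of location entities that look like provinces (a
    # fb:location *state*/*province* type, or a name ending in 省/州) and write
    # them to province.txt.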
    Print('generate province dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    out_path = os.path.join(dict_dir, 'province.txt')
    province_names = set()
    error_domains = get_error_domains()
    for bk_url in tqdm(ename_title_map, total=len(ename_title_map)):
        enames = ename_title_map[bk_url]
        if not bk_url in baike_info_map:
            continue
        bk_info = baike_info_map[bk_url]
        bk_types = bk_info.types
        if not "fb:location.location" in bk_types:
            continue

        is_province = False
        for bk_type in bk_types:
            if get_domain(bk_type) == 'fb:location' and (
                    'state' in bk_type or "province" in bk_type):
                print "province bk_type: %s" % bk_url
                is_province = True

        for ename in enames:
            ename = ename.decode('utf-8')
            if len(ename) > 2 and (ename.endswith(u'省')
                                   or ename.endswith(u"州")):
                print "province ename: %s %s" % (ename, bk_url)
                is_province = True

        # if is_province:
        #     for bk_type in bk_types:
        #         if get_domain(bk_type) in error_domains:
        #             is_province = False
        #             print "province error type: %s" %(bk_url)

        if is_province:
            province_names.update(enames)

    outf = file(out_path, 'w')
    for name in province_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % (name))
    outf.close()
def process(inpath, outpath, name_map, fb_uris):
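    # Rewrite Freebase relations per entity: skip self-reverse and known-bad
    # properties, map object values through name_map (unnamed mids are dropped),
    # dedupe, log oversized value lists, and write one JSON line per entity.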
    schema = Resource.get_singleton().get_schema()
    error_props = load_error_property()

    Print('process %s' % inpath)
    outf = file(outpath, 'w')
    error_outf = file('log/error.log', 'w')
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        fb_key, rels = line.split('\t')
        if not fb_key in fb_uris:
            continue
        rels = json.loads(rels)
        new_rels = {}
        for fb_property, obj in rels:
            if schema.reverse_property(fb_property) == fb_property:
                continue
            if fb_property in error_props:
                continue
            if obj in name_map:
                names = name_map[obj]
            else:
                literal = process_fb_value(obj)
                if literal.startswith('fb:m.'):
                    # error_outf.write('error property %s, entity %s\n' %(fb_property, fb_key))
                    names = []
                else:
                    names = [process_fb_value(obj)]
            if len(names) == 0:
                continue
            if not fb_property in new_rels:
                new_rels[fb_property] = []
            new_rels[fb_property].extend(names)

        big = False
        for fb_property in new_rels:
            new_rels[fb_property] = list(set(new_rels[fb_property]))
            if len(new_rels[fb_property]) > 300:
                error_outf.write(
                    'big size property of url = %s, property = %s, size = %d\n'
                    % (fb_key, fb_property, len(new_rels[fb_property])))
        outf.write("%s\t%s\n" %
                   (fb_key, json.dumps(new_rels, ensure_ascii=False)))
    outf.close()
    error_outf.close()
Example #18
def test_chapt():
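    # For a hand-picked set of urls, parse every chapter and print the sentences
    # flagged as missing a subject.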
    import json
    import os
    urls = [
        'baike.so.com/doc/1287918-1361771.html',
        'baike.so.com/doc/4835393-5052275.html',
        'baike.so.com/doc/2526484-2669235.html',
        'baike.so.com/doc/5382393-5618748.html',
        'baike.so.com/doc/6662392-6876216.html',
        'baike.so.com/doc/3056594-3221987.html',
        'baike.so.com/doc/8716294-9038723.html',
        'baike.so.com/doc/5390356-5627004.html'
    ]

    # urls = ["baike.so.com/doc/1287918-1361771.html"]
    resource = Resource.get_singleton()
    url2names = resource.get_url2names()
    baike_info_map = resource.get_baike_info()

    baike_doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    doc_processor = DocProcessor()

    for line in file(baike_doc_path):
        url, doc = line.split('\t')
        if not url in urls:
            continue
        doc = json.loads(doc)
        names = url2names[url]
        ename = names[0]
        types = baike_info_map[url].types
        page_info = PageInfo(ename, names, url, [], types)
        print " ".join(page_info.names)

        for chapter_title, chapter in doc:
            print 'parsing %s' % chapter_title
            for ltp_result, str_entities, subj_miss in doc_processor.parse_chapter(
                    chapter_title, chapter, page_info, parse_ner=True):
                # print '\t' + ltp_result.sentence
                if subj_miss:
                    print "\t\t" + ltp_result.sentence
def gen_citytown_dict():
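    # Collect Chinese names of fb:location.citytown entities (skipping anything
    # also typed as a person) and write them to citytown.txt.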
    Print('generate citytown dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()

    citytown_names = set()

    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        if not bk_url in baike_info_map or not bk_url in ename_title_map:
            continue
        bk_types = baike_info_map[bk_url].types
        if not 'fb:location.location' in bk_types:
            continue
        if not "fb:location.citytown" in bk_types:
            continue

        if 'fb:people.person' in bk_types:
            continue

        enames = ename_title_map[bk_url]
        # is_error_name = False
        # error_suffix = ['乡', "镇", '村', '街道', '道路']
        # for ename in enames:
        #     for suffix in error_suffix:
        #         if ename.endswith(error_suffix):
        #             is_error_name = True
        # if is_error_name:
        #     continue

        citytown_names.update(enames)

    out_path = os.path.join(dict_dir, 'citytown.txt')
    outf = file(out_path, 'w')
    for name in citytown_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % name)
    outf.close()
Example #20
    def __init__(self, name_dict, process_bracket_flag, add_time_entity):
        self.dict = name_dict
        self.process_bracket_flag = process_bracket_flag
        self.add_time_entity = add_time_entity
        self.ltp = Resource.get_singleton().get_ltp()
Example #21
    def link_partial_match_predicate(self, predicate):
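        # For every infobox predicate containing the query predicate (char match
        # ratio >= 0.5), scale its property probabilities by that ratio and keep
        # the best score per Freebase property (>= 0.1).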
        mapped_probs = {}
        for infobox_pred in self.predicate_map:
            if infobox_pred.find(predicate) == -1:
                continue
            match_ratio = len(predicate.decode('utf-8')) / float(
                len(infobox_pred.decode('utf-8')))
            if match_ratio < 0.5:
                continue
            probs = self.predicate_map[infobox_pred]
            for fb_prop in probs:
                prob = probs[fb_prop] * match_ratio
                if prob < 0.1:
                    continue
                if mapped_probs.get(fb_prop, 0) < prob:
                    mapped_probs[fb_prop] = prob
        return mapped_probs


if __name__ == "__main__":
    # rel_linker = MatchRelLinker()
    # probs = rel_linker.link_partial_match_predicate(u'出版')

    resource = Resource.get_singleton()
    predicate_map = resource.get_predicate_map()

    probs = predicate_map['决赛']
    for prop in probs:
        print prop, probs[prop]
Example #22
    def __init__(self):
        resource = Resource.get_singleton()
        self.predicate_map = resource.get_predicate_map()
Example #23
        return False
    # if re_eng.match(name):
    #     return False
    # if has_punc_eng(name):
    #     return False
    if BaikeDatetime.parse(name, strict=True) is not None:
        return False
    return True


def get_domain(fb_type):
    return fb_type.split('.')[0]


# valid_domains = set(['fb:film', 'fb:tv', 'fb:soccer', 'fb:sports', 'fb:astronomy', 'fb:music', 'fb:book', 'fb:award'])
valid_domains = Resource.get_singleton().get_important_domains()


def is_vertical_domain(types):
    global valid_domains
    for fb_type in types:
        if get_domain(fb_type) in valid_domains or fb_type in valid_domains:
            return True
    return False


if __name__ == "__main__":
    name2bk = Resource.get_singleton().get_name2bk()

    keys = sorted(name2bk.keys())
def test_ltp_extractor(datas_map, doc_processor, rel_extractor, linker):
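    # Evaluate SimpleLTPExtractor on labeled sentences: compare extracted triples
    # with the gold knowledge by surface strings and by linked urls, print a flag
    # per triple and the aggregate counts at the end.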
    resource = Resource.get_singleton()
    schema = resource.get_schema()
    # base_dir = os.path.join(data_dir, '标注数据')
    # stf_results_map = load_stanford_result(os.path.join(base_dir, 'sentences.txt'), os.path.join(base_dir, 'sentences_stanf_nlp.json'))

    link_maps = None
    link_maps = load_links_map(os.path.join(cache_dir, 'link_map.json'))
    ltp_extractor = SimpleLTPExtractor(doc_processor, rel_extractor, linker,
                                       link_maps is None)

    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    url_map = load_url_map()
    important_domains = resource.get_important_domains()
    same_link_map = load_same_linkings()

    estimation = {
        "total output": 0,
        'total labeled': 0,
        'right output': 0,
    }

    str_estimation = {
        "total output": 0,
        'total labeled': 0,
        'right output': 0,
    }

    for baike_name in datas_map:
        datas = datas_map[baike_name]
        url = url_map[baike_name]
        # print baike_name
        # print url
        names = url2names[url]
        types = bk_info_map[url].types
        page_info = PageInfo(baike_name, names, url,
                             get_url_domains(types, important_domains), types)
        linker.entity_linker.start_new_page(url)
        for data in datas:
            sentence = data.sentence
            # if sentence != '《生活大爆炸》(The Big Bang Theory)是由查克·洛尔和比尔·普拉迪创作的一出美国情景喜剧,此剧由华纳兄弟电视公司和查克·洛尔制片公司共同制作。':
            #     continue
            print sentence
            para_info = ParagraphInfo(3, names, baike_name, False, True)
            ltp_result, _ = doc_processor.parse_sentence(sentence, para_info)
            str_entities = doc_processor.ner.recognize(ltp_result.sentence,
                                                       ltp_result, page_info,
                                                       None)
            triples, ltp_result = ltp_extractor.parse_sentence(
                ltp_result, str_entities, page_info, link_maps)

            kl_set = set()
            str_set = set()
            for kl in data.knowledges:

                kl.subj_url = same_link_map.get(kl.subj_url, kl.subj_url)
                kl.obj_url = same_link_map.get(kl.obj_url, kl.obj_url)

                str_set.add("%s\t%s\t%s" %
                            (kl.subj.encode('utf-8'), kl.prop.encode('utf-8'),
                             kl.obj.encode('utf-8')))
                str_set.add("%s\t%s\t%s" %
                            (kl.obj.encode('utf-8'), kl.prop.encode('utf-8'),
                             kl.subj.encode('utf-8')))

                kl_set.add("%s\t%s\t%s" %
                           (kl.subj_url, kl.prop_uri, kl.obj_url))
                reverse_prop_uri = schema.reverse_property(kl.prop_uri)

                if reverse_prop_uri:
                    kl_set.add("%s\t%s\t%s" %
                               (kl.obj_url, reverse_prop_uri, kl.subj_url))
            estimation['total labeled'] += len(data.knowledges)
            str_estimation['total labeled'] += len(data.knowledges)
            for triple in triples:
                str_estimation['total output'] += 1

                subj = ltp_result.text(triple.baike_subj.st,
                                       triple.baike_subj.ed)
                obj = ltp_result.text(triple.baike_obj.st, triple.baike_obj.ed)
                rel = ltp_result.text(triple.fb_rel.st, triple.fb_rel.ed)

                subj_url = same_link_map.get(triple.baike_subj.baike_url,
                                             triple.baike_subj.baike_url)
                obj_url = same_link_map.get(triple.baike_obj.baike_url,
                                            triple.baike_obj.baike_url)
                prop = triple.fb_rel.fb_prop

                triple_str = "%s\t%s\t%s" % (subj, rel, obj)

                if triple_str in str_set:
                    flag_str = 'str_right'
                    str_estimation['right output'] += 1
                else:
                    flag_str = "str_error"

                if prop == 'None':
                    info = "%s:%s\t%s\t%s:%s" % (subj, subj_url, rel, obj,
                                                 obj_url)
                    print "\t%s\t%s" % (info, flag_str)
                    continue

                info = triple.info(ltp_result)
                estimation['total output'] += 1
                if "%s\t%s\t%s" % (subj_url, prop,
                                   obj_url) in kl_set or "%s\t%s\t%s" % (
                                       subj_url, prop.split("^")[0],
                                       obj_url) in kl_set or "%s\t%s\t%s" % (
                                           subj_url, prop.split("^")[-1],
                                           obj_url) in kl_set:
                    estimation['right output'] += 1
                    print '\t%s\t%s' % (info, 'full_right')
                else:
                    print '\t%s\t%s' % (info, 'full_error')

            # str_estimation['total labeled'] += len(data.knowledges)
            # str_estimation['total output'] += len(half_linked_triples)
            # for triple in half_linked_triples:

            #     subj = ltp_result.text(triple.baike_subj.st, triple.baike_subj.ed)
            #     obj = ltp_result.text(triple.baike_obj.st, triple.baike_obj.ed)
            #     subj_url = same_link_map.get(triple.baike_subj.baike_url, triple.baike_subj.baike_url)
            #     obj_url = same_link_map.get(triple.baike_obj.baike_url, triple.baike_obj.baike_url)
            #     rel = ltp_result.text(triple.str_rel.st, triple.str_rel.ed)

            #     triple_str = "%s\t%s\t%s" %(subj_url, rel, obj_url)
            #     info = "%s:%s\t%s\t%s:%s" %(subj, subj_url, rel, obj, obj_url)
            #     if triple_str in str_set:
            #         str_estimation['right output'] += 1
            #         print "\t%s\t%s" %(info, 'str_right')
            #     else:
            #         print "\t%s\t%s" %(info, 'str_error')

            for kl in data.knowledges:
                print '\t\t%s' % kl.info()
    print estimation
    print str_estimation
    ltp_extractor.finish()
Example #25
def work(inpath, outpath):
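    # End-to-end extraction over inpath: parse table chapters with the table
    # parser and text chapters with the LTP extractor, then post-process and write
    # the knowledge collected for each page.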
    ner = NamedEntityReg()
    doc_processor = DocProcessor(ner)
    rel_extractor = VerbRelationExtractor()

    entity_linker = PageMemoryEntityLinker()
    rel_linker = MatchRelLinker()
    linker = SeparatedLinker(entity_linker, rel_linker)

    table_parser = Resource.get_singleton().get_table_parser(
        entity_linker, ner)

    ltp_extractor = SimpleLTPExtractor(doc_processor, rel_extractor, linker)
    url2names = linker.entity_linker.url2names
    bk_info_map = linker.entity_linker.bk_info_map
    baike_ename_title_map = Resource.get_singleton().get_baike_ename_title()

    important_domains = Resource.get_singleton().get_important_domains()
    schema = Resource.get_singleton().get_schema()

    outf = file(outpath, 'w')
    total = nb_lines_of(inpath)
    for cnt, line in enumerate(file(inpath), start=1):
        url, chapters = line.split('\t')

        if not url in url2names or not url in bk_info_map or not url in baike_ename_title_map:
            print 'error url %s' % (url)
            continue
        chapters = json.loads(chapters)

        outf.write('##parse url:%s\n' % url)
        Print('parse url:%s (%d/%d)' % (url, cnt, total))
        names = url2names[url]
        types = bk_info_map[url].types
        page_info = PageInfo(baike_ename_title_map[url][0], names, url,
                             get_url_domains(types, important_domains), types)
        entity_linker.start_new_page(url)

        kn_writer = PageKnowledgeHandler()

        for title, chapter in chapters:
            try:
                if type(chapter) is unicode:
                    tables = parse_tables_from_html(chapter)
                    tables = [encode_table(table) for table in tables]
                    for table in tables:
                        table_kns = table_parser.parse(table, page_info, types)
                        if len(table_kns) > 0:
                            for line, row_kns in table_kns:
                                kns = []
                                for kn in row_kns:
                                    kns.append("\t%s\t1" % kn.info())
                                kn_writer.add(line, kns)

                                # outf.write("%s\n" %line)
                                # for kn in row_kns:
                                #     outf.write("\t%s\t1\n" %kn.info())

                else:
                    for ltp_result, str_entities, _ in doc_processor.parse_chapter(
                            title, chapter, page_info, parse_ner=True):
                        if ltp_result is None:
                            continue
                        triples, _ = ltp_extractor.parse_sentence(
                            ltp_result, str_entities, page_info, None, False)
                        triples = [
                            triple for triple in triples
                            if triple.score() > 0.01
                        ]
                        if len(triples) > 0:
                            kns = []
                            for triple in triples:
                                kns.append("\t%s" % triple.info(ltp_result))
                            kn_writer.add(ltp_result.sentence, kns)

                            # outf.write("%s\n" %(ltp_result.sentence))
                            # for triple in triples:
                            #     outf.write("\t%s\n" %triple.info(ltp_result))

            except Exception, e:
                print "error at url:%s chapter:%s" % (url, title)
                print str(e)
        kn_writer.handle_uniq_prop(schema)
        kn_writer.output(outf)
Example #26
def gen_title_rel_dict(fb_type,
                       count_filepath,
                       out_path,
                       cnt_threshold,
                       extra_name_filepath=None,
                       error_func=None,
                       url_path=None):
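    # Build a name dictionary for one Freebase type: take urls of that type, keep
    # names frequent enough in count_filepath (or all names when no count file is
    # given), resolve name->url conflicts by popularity, then write the filtered
    # Chinese names and, optionally, their urls.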
    Print('gen dict by type [%s]' % fb_type)
    candidate_urls = set()
    resource = Resource.get_singleton()
    baike_static_info_map = resource.get_baike_info()
    Print("gen candidate baike_url")
    for bk_url in tqdm(baike_static_info_map,
                       total=len(baike_static_info_map)):
        types = baike_static_info_map[bk_url].types
        if fb_type in types:
            candidate_urls.add(bk_url)

    candidate_names = set()
    if count_filepath is not None:
        for line in file(count_filepath):
            p = line.strip().split('\t')
            if len(p) == 2:
                name, cnt = p
                cnt = int(cnt)
                if cnt >= cnt_threshold:
                    candidate_names.add(name)
    Print('#candidate urls = %d, #candidate names = %d' %
          (len(candidate_urls), len(candidate_names)))
    # ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()

    title_names = set()
    title2url = {}
    for candidate_url in candidate_urls:
        enames = url2names[candidate_url]
        for ename in enames:
            if ename in candidate_names or count_filepath is None:
                # assert ename not in title_names
                title_names.add(ename)
                if ename in title2url:
                    pre_pop = baike_static_info_map[title2url[ename]].pop
                    pop = baike_static_info_map[candidate_url].pop
                    if pre_pop > pop:
                        title_url = title2url[ename]
                    else:
                        title_url = candidate_url
                else:
                    title_url = candidate_url
                title2url[ename] = title_url
            else:
                print "%s: miss name: %s" % (fb_type, ename)

    if extra_name_filepath is not None:
        Print("add extra name from [%s]" % extra_name_filepath)
        for line in file(extra_name_filepath):
            title_names.add(line.rstrip())

    outf = file(out_path, 'w')
    if url_path:
        url_outf = file(url_path, 'w')

    for title_name in sorted(title_names):
        if title_name == '无':
            continue
        if error_func is not None and error_func(title_name):
            print "%s: error func name: %s" % (fb_type, title_name)
            continue
        if len(title_name.decode('utf-8')) < 2:
            print "%s: short name: %s" % (fb_type, title_name)
            continue
        if is_chinese(title_name):
            outf.write(title_name + '\n')
            if url_path:

                url_outf.write("%s\t%s\n" %
                               (title_name, title2url[title_name]))
    outf.close()

    if url_path:
        url_outf.close()
Example #27
def load_and_write_baike_name(bk_name_map, error_bracket_names, out_path):
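    # Merge each entry's ename/title, unfolded infobox name values (bracket names
    # are filtered for art works and known-bad cases), and its Freebase names
    # (only when fewer than 10) into one de-duplicated name line per url.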
    resource = Resource.get_singleton()
    bk_info_map = resource.get_baike_info()
    baike_entity_info_path = os.path.join(result_dir,
                                          '360/360_entity_info.json')
    total = 21710208
    Print('load and write baike name to [%s]' % out_path)
    baike_name_attrs = load_baike_name_attrs()

    art_work_types = set([
        'fb:film.film', 'fb:book.book', 'fb:book.written_work',
        'fb:cvg.computer_videogame', 'fb:tv.tv_program'
    ])

    outf = file(out_path, 'w')
    error_f = file('log/error.log', 'a')
    for line in tqdm(file(baike_entity_info_path), total=total):
        bk_url, obj = line.split('\t')
        bk_url = bk_url.decode('utf-8')

        static_info = bk_info_map[bk_url]
        bk_types = static_info.types
        is_art_work = False
        for bk_type in bk_types:
            if bk_type in art_work_types:
                is_art_work = True

        obj = json.loads(obj)
        names = [obj['ename'], obj['title']]
        info_names = set()

        info = obj['info']
        for baike_name in baike_name_attrs:
            if not baike_name in info:
                continue
            info_values = info[baike_name]
            for info_value in info_values:
                if info_value in names:
                    continue
                info_bracket_names = []
                info_value_names = unfold(info_value, info_bracket_names,
                                          [obj['ename'], obj['title']])
                for info_value_name in info_value_names:

                    in_bracket = info_value_name in info_bracket_names
                    if in_bracket and info_value_name in error_bracket_names and not is_abbre(
                            info_value_names, info_value_name):
                        continue
                    if in_bracket and is_art_work:
                        continue
                    info_names.add(info_value_name)

        names.extend(info_names)
        fb_names = bk_name_map.get(bk_url, [])
        fb_names = [x for x in fb_names if not x in names]
        if len(fb_names) < 10:
            names.extend(fb_names)
        else:
            error_f.write('%s\t%s\t%s\n' %
                          ('long_baike_names', bk_url, ' '.join(fb_names)))

        names = list(set(names))
        names = [html_unescape(x.replace('\n', "")).strip() for x in names]
        outf.write("%s\t%s\n" % (bk_url, "\t".join(names)))

        # is_person = "fb:people.person" in bk_types_map[bk_url]
        # if is_person:
        #     extra_names = []
        #     for name in names:
        #         extra_names = person_extra_names(name)
        #     names.extend(extra_names)

    outf.close()
    error_f.close()
Example #28
from .table_rule_parser import TableParser
from .collect_table import collect_tables, load_local_info
from src.extractor.resource import Resource

if __name__ == "__main__":
    table_parser = TableParser(None, None)
    table_parser.init(paths = None)
    table_parser.load_extra_table(None)

    table_cnt_path = os.path.join(table_dir, 'table_column_count.tsv')
    local_type_info_path = os.path.join(table_dir, 'local_info.json')
    local_info = load_local_info(local_type_info_path, table_cnt_path)
    cols_cnt, cols_type_cnt, cols_title_cnt = collect_tables(table_cnt_path, local_info)

    ruled_tables = set()
    schema = Resource.get_singleton().get_schema()
    for table_rule in table_parser.table_rules:
        table_rule.check(schema)
        for t in table_rule.register_tables:
            ruled_tables.add(t)

    coverred_cnt = 0
    miss_cnt = 0
    total = 0
    for table in cols_cnt:
        cnt = cols_cnt[table]
        total += cnt
        if cnt < 20:
            # if cnt > 20 and cnt < 30:
            #     print table, cnt
            continue
Example #29
def extract_summary_name(summary_path, keywords, outpath,
                         bracket_name_outpath):
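    # Mine extra names from summaries: locate a name sentence (or a subject-less
    # first sentence), unfold and filter the candidates, and write them; bracket
    # names are extracted and written to a separate file.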
    Print('extract extra name from [%s]' % summary_path)
    # url2names = Resource.get_singleton().get_url2names()
    url2names = load_url2names()
    bk_info_map = Resource.get_singleton().get_baike_info()
    error_domains = ['fb:chemistry']
    ext = SummaryNameExtractor()
    outf = file(outpath, 'w')
    bracket_name_outf = file(bracket_name_outpath, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        types = bk_info_map[url].types

        in_error_domain = False
        for bk_type in types:
            if get_domain(bk_type) in error_domains:
                in_error_domain = True
        if in_error_domain:
            continue

        summary = json.loads(summary)['summary']
        summary = summary.replace(u'(', u'(').replace(u')', u')')

        names = url2names[url]
        names = [x.decode('utf-8') for x in names]
        ret = ext.find_name_sentence(summary, names)
        if ret is None:
            extra_name = None
            sentences = split_sentences(summary)
            if len(sentences) > 0:
                first_sentence = sentences[0]
                no_subj = True
                for name in names:
                    if name in first_sentence:
                        no_subj = False
                if no_subj:
                    extra_name = ext.find_no_subj_name(summary, keywords)
        else:
            rest_sentence, first_name = ret
            extra_name = ext.find_new_extra_name(rest_sentence, keywords)

        if extra_name is not None:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            succeed_names = []
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                    and not too_long_name(extra_name, names) \
                    and not extra_name in names \
                    and not error_bracket_name(extra_name, names) \
                    and not too_short_name(extra_name) \
                    and not is_error_name(extra_name) \
                    and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
            if len(succeed_names) > 0:
                succeed_names = list(set(succeed_names))
                outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))
                names.extend(succeed_names)

        # extract bracket name
        extra_bracket_names = ext.extract_bracket_names(
            summary, keywords, names)
        succeed_names = []
        for extra_name in extra_bracket_names:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                    and not too_long_name(extra_name, names) \
                    and not extra_name in names \
                    and not too_short_name(extra_name) \
                    and not is_error_name(extra_name) \
                    and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
        if len(succeed_names) > 0:
            succeed_names = list(set(succeed_names))
            bracket_name_outf.write('%s\t%s\n' %
                                    (url, "\t".join(succeed_names)))

    outf.close()
    bracket_name_outf.close()
Example #30
def extract_type_info(son_name_map_path, bk_type_info_path):
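    # Dump local type info for the parent and son names in son_name_map_path: for
    # each matching url record its types, a Freebase uri and the names that map
    # to it.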
    if os.path.exists(bk_type_info_path):
        return
    resource = Resource.get_singleton()
    baike_ename_title = resource.get_baike_ename_title()
    name2bk = resource.get_name2bk()
    baike_info_map = resource.get_baike_info()

    ename2bk = {}
    for bk_url in baike_ename_title:
        if not bk_url in baike_info_map:
            continue
        types = baike_info_map[bk_url].types
        if is_art_work(types):
            continue

        enames = baike_ename_title[bk_url]
        for ename in enames:
            if not ename in ename2bk:
                ename2bk[ename] = []
            ename2bk[ename].append(bk_url)

    local_info_map = {}
    for line in file(son_name_map_path, 'r'):
        names = line.strip().split('\t')
        parent_name = names[0]
        son_names = names[1:]
        parent_urls = name2bk[parent_name]
        for parent_url in parent_urls:
            types = baike_info_map[parent_url].types
            if is_art_work(types):
                continue

            if not parent_url in local_info_map:
                local_info_map[parent_url] = {}
            info = local_info_map[parent_url]
            info['type'] = types
            info['fb'] = baike_info_map[parent_url].fb_uri
            if not 'name' in info:
                info['name'] = []
            info['name'].append(parent_name)

        for son_name in son_names:
            son_urls = ename2bk[son_name]

            for son_url in son_urls:
                if not son_url in local_info_map:
                    local_info_map[son_url] = {}
                types = baike_info_map[son_url].types
                info = local_info_map[son_url]
                info['type'] = types
                info['fb'] = baike_info_map[parent_url].fb_uri
                if not 'name' in info:
                    info['name'] = []
                info['name'].append(son_name)

    outf = file(bk_type_info_path, 'w')
    for url in local_info_map:
        outf.write('%s\t%s\n' %
                   (url, json.dumps(local_info_map[url], ensure_ascii=False)))
    outf.close()