Example #1
def collect_team_suffix(suffix_out_path):
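    # Segment each organization entity's names with LTP and count every proper word
    # suffix; suffixes that occur at least `threshold` (10) times are written to
    # suffix_out_path as "suffix\tcount", most frequent first.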
    Print("collect team suffix, write to [%s]" % suffix_out_path)

    ename_title_map = Resource.get_singleton().get_baike_ename_title()
    baike_info_map = Resource.get_singleton().get_baike_info()
    ltp = Resource.get_singleton().get_ltp()
    suffix_cnt = {}

    Print("collect suffix")
    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        e_types = baike_info_map[bk_url].types
        if not is_org(e_types):
            continue
        enames = ename_title_map[bk_url]

        for name in enames:
            words = ltp.cut(name)
            ed = len(words)
            for st in range(1, ed):
                suffix = "".join(words[st:])
                if not suffix in suffix_cnt:
                    suffix_cnt[suffix] = 0
                suffix_cnt[suffix] += 1

    threshold = 10
    outf = file(suffix_out_path, 'w')
    for key in sorted(suffix_cnt, key=lambda x: suffix_cnt[x], reverse=True):
        cnt = suffix_cnt[key]
        if cnt < threshold:
            continue
        outf.write("%s\t%d\n" % (key, cnt))
    outf.close()
Example #2
def merge_summary_and_infobox(summary_path, infobox_path, out_path):
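    # Merge two inputs keyed by baike URL: load each entity's summary, append all of its
    # infobox values (prefixed with "。" and joined by "#") to that summary, and write one
    # "url\t{'summary': ...}" JSON line per entity.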
    Print("load summary from [%s]" %summary_path)
    summary_map = {}
    for line in tqdm(file(summary_path, 'r'), total = nb_lines_of(summary_path)):
        p = line.split('\t')
        key = p[0]
        summary = json.loads(p[1])['summary']
        # summary = filter_bad_summary(summary)
        summary_map[key] = summary.encode('utf-8')
    Print('add infobox value to summary, path is [%s]' %infobox_path)
    for line in tqdm(file(infobox_path), total = nb_lines_of(infobox_path)):
        p = line.split('\t')
        key = p[0]
        info_values = list()
        info = json.loads(p[1])['info']
        for value_list in info.values():
            for value in value_list:
                info_values.append(value)
        if len(info_values) == 0:
            continue
        
        text = u"。" + u"#".join(info_values)
        text = text.encode('utf-8')
        if not key in summary_map:
            summary_map[key] = text
        else:
            summary_map[key] = summary_map[key] + text
    
    Print("write summary and infobox to [%s]" %out_path)
    outf = file(out_path, 'w')
    for bk_url in tqdm(sorted(summary_map.keys()), total = len(summary_map)):
        summary = {'summary': summary_map[bk_url]}
        outf.write('%s\t%s\n' %(bk_url, json.dumps(summary, ensure_ascii = False)) )
    outf.close()
Example #3
def generate_data_from_doc(doc_path, bk2fb, fb_uris, outpath):
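    # For each baike document whose URL maps (via bk2fb) to a Freebase entity with known
    # relations, build a PageInfo and run chapter-level relation extraction
    # (generate_data_from_chapter), writing the results to outpath. Documents whose
    # entity has a pop score below 9 and table-only chapters are skipped.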
    resource = Resource.get_singleton()
    fb_rels_map = resource.get_half_named_fb_info()
    ner = NamedEntityReg()
    e_linker = PageMemoryEntityLinker()
    doc_processor = DocProcessor(ner)
    url2names = resource.get_url2names()
    bk_info_map = resource.get_baike_info()
    important_domains = resource.get_important_domains()
    rel_extractor = VerbRelationExtractor()
    schema = resource.get_schema()

    Print('generate data from [%s]' % os.path.basename(doc_path))
    outf = file(outpath, 'w')
    cnt = 0
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        if bk_url not in bk2fb:
            continue
        fb_uri = bk2fb[bk_url]
        if fb_uri not in fb_rels_map:
            continue
        fb_rels = fb_rels_map[fb_uri]
        cnt += 1
        if cnt % 100 == 0:
            print ""
            Print('cnt = %d' % cnt)
        # Print('parsing %s' %bk_url)
        # outf.write('##start parsing %s\n' %(bk_url))

        bk_info = bk_info_map[bk_url]
        if bk_info.pop < 4 + 5:
            continue
        types = bk_info.types
        names = url2names[bk_url]
        page_info = PageInfo(names[-1], names, bk_url,
                             get_url_domains(types, important_domains), types)

        e_linker.start_new_page(bk_url)

        # summary = [json.loads(summary)['summary']]
        # chapter_title = 'intro_summary'

        doc = json.loads(doc)
        for chapter_title, chapter in doc:
            chapter = [para for para in chapter if para.find('</table>') == -1]
            if len(chapter) == 0:
                continue
            generate_data_from_chapter(chapter_title, chapter, page_info,
                                       doc_processor, fb_rels, rel_extractor,
                                       outf, e_linker, schema)

    outf.close()
Example #4
def train_extract_summary_name(summary_path, out_path):
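    # For every entity summary, find a sentence containing one of the entity's known
    # names (find_name_sentence) and then look for a second name in it (find_extra_name);
    # each hit is written as "url\tfirst_name\tsentence\tsecond_name".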
    outf = file(out_path, 'w')
    url2names = Resource.get_singleton().get_url2names()
    extor = SummaryNameExtractor()
    Print('train summary extra name')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        names = url2names[url]
        names = [x.decode('utf-8') for x in names]

        ret = extor.find_name_sentence(summary, names)
        if ret is None:
            continue
        sent, first_name = ret

        ret = extor.find_extra_name(sent, first_name, names)

        if ret is None:
            continue

        first_name, sent, second_name = ret
        outs = [url, first_name, sent, second_name]
        outf.write('%s\n' % ('\t'.join(outs)))
    outf.close()
Example #5
def collect_table_cnt(cnt_path, outpath, local_info):
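    # Aggregate table column-signature counts produced by collect_tables. Signatures are
    # visited longest first; a signature is kept when it is not judged covered by an
    # already kept one (check_in) and occurs at least 20 times, and is written together
    # with its top 8 Freebase types and top 4 chapter titles. `total` (printed at the
    # end) sums the counts of kept signatures and of those covered by them.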
    Print("collect table cols cnt from [%s], write to [%s]" %(os.path.basename(cnt_path), os.path.basename(outpath)))
    cols_cnt, cols_type_cnt, cols_title_cnt = collect_tables(cnt_path, local_info)

    outf = file(outpath, 'w')
    useful_cols = []
    total = 0
    for cols in sorted(cols_cnt.keys(), key = lambda x: (len(x), x), reverse = True):
        cols_obj = cols.split(" # ")
        if not check_in(useful_cols, cols_obj):
            if cols_cnt[cols]  < 20:
                continue
            total += cols_cnt[cols]
            useful_cols.append(cols_obj)
            
            types_cnt = topk_keys(cols_type_cnt[cols], 8)
            titles_cnt = topk_keys(cols_title_cnt[cols], 4)
            types_str = " ".join([fb_type + "#" + str(cnt) for fb_type, cnt in types_cnt])
            titles_str = " ".join([title + "#" + str(cnt) for title, cnt in titles_cnt])
            outf.write("%s\t%d\n" %(cols, cols_cnt[cols] ))
            for fb_type, cnt in types_cnt:
                outf.write("\t%s\t%d\n" %(fb_type, cnt))
            for title, cnt in titles_cnt:
                outf.write('\t%s\t%d\n' %(title, cnt))

        else:
            total += cols_cnt[cols]
    print total
    outf.close()
Example #6
    def init(self, paths=None):
        if paths is None:
            rules_dir = os.path.join(table_dir, 'rules')
            paths = glob.glob(rules_dir + "/*rule")

        path_str = " ".join([os.path.basename(x) for x in paths])
        Print('load table rule from [%s]' % path_str)
        for path in paths:
            self.load_from_file(path)
Example #7
def filter_summary(ori_path, new_path):
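    # Rewrite the summary file, passing every summary through filter_bad_summary and
    # writing the cleaned result as one JSON line per baike URL.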
    Print('filter summary from [%s] to [%s]' %(os.path.basename(ori_path), os.path.basename(new_path)))
    outf = file(new_path, 'w')
    for line in tqdm(file(ori_path), total = nb_lines_of(ori_path)):
        bk_url, summary = line.split('\t')
        summary = json.loads(summary)['summary']
        new_summary = filter_bad_summary(summary)
        new_summary = {'summary': new_summary}
        outf.write("%s\t%s\n" %(bk_url, json.dumps(new_summary, ensure_ascii = False)) )
    outf.close()
    def save(self, filepath):
        Print("save team dict to [%s]" % filepath)
        outf = file(filepath, 'w')
        for suffix in self.dicts:
            team_dict = self.dicts[suffix]
            for bk_url in team_dict:
                out = "%s\t%s\t%s" % (suffix, bk_url, "\t".join(
                    team_dict[bk_url]))
                outf.write(out.rstrip() + '\n')
        outf.close()
def gen_name_map(extractor):
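    # Build a map from a parent name to the entity names derived from it. First collect
    # all names of non-artwork entities that have no extractable parent name themselves
    # (all_names); then, for every ename whose extracted parent name (or that parent name
    # with a leading location removed via del_loc_prefix) appears in all_names, record
    # the ename under that parent name.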
    baike_ename_title = Resource.get_singleton().get_baike_ename_title()
    url2names = Resource.get_singleton().get_url2names()
    bk_static_info = Resource.get_singleton().get_baike_info()
    location_dict = Resource.get_singleton().get_location_dict()

    location_dict = set([x.decode('utf-8') for x in location_dict])
    all_names = set()

    for bk_url in url2names:
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue
        enames = url2names[bk_url]
        is_son = False
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(ename)
            if parent_name:
                is_son = True
        if is_son:
            continue

        for ename in enames:
            all_names.add(ename)

    name_map = {}
    Print("extract parent name")
    for bk_url in tqdm(baike_ename_title, total=len(baike_ename_title)):
        if not bk_url in bk_static_info:
            continue
        bk_types = bk_static_info[bk_url].types
        if is_art_work(bk_types):
            continue

        enames = baike_ename_title[bk_url]
        for ename in enames:
            parent_name = extractor.try_extract_parent_name(
                ename)  # return unicode or None
            if not parent_name:
                continue
            if parent_name.encode('utf-8') in all_names:
                add_to_dict_list(name_map, parent_name, ename.decode('utf-8'))
            second_parent_name = del_loc_prefix(parent_name, location_dict)
            if second_parent_name and second_parent_name.encode(
                    'utf-8') in all_names:
                add_to_dict_list(name_map, second_parent_name,
                                 ename.decode('utf-8'))

    return name_map
Example #10
def load_baike_ename_title():
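    # Load each baike entity's ename (and its title, when the title differs from the
    # ename) from 360_entity_info_processed.json, keyed by baike URL.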
    path = os.path.join(result_dir, '360/360_entity_info_processed.json')
    Print('load baike\'s ename and title from [%s]' % path)
    ename_title_map = {}
    for line in tqdm(file(path), total=nb_lines_of(path)):
        bk_url, obj = line.split('\t')
        obj = json.loads(obj)
        ename, title = obj['ename'].encode('utf-8'), obj['title'].encode(
            'utf-8')
        if title != ename:
            ename_title_map[bk_url] = [ename, title]
        else:
            ename_title_map[bk_url] = [ename]
    return ename_title_map
Example #11
    def load_extra_table(self, path=None):
        if path is None:
            path = os.path.join(table_dir, 'rules/extra_table.tsv')
        Print('load extra table from [%s]' % os.path.basename(path))
        for line in file(path):
            p = line.strip().split("\t")
            table = p[0]
            rule_names = p[1:]
            for rule in rule_names:
                find = False
                for table_rule in self.table_rules:
                    if table_rule.name == rule:
                        find = True
                        table_rule.register_table(table)
                        break
                assert find
def gen_province_dict():
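    # Collect the names of location entities that look like provinces: either a type in
    # the fb:location domain containing "state" or "province", or a name longer than two
    # characters ending with 省 or 州. Only Chinese names are written to
    # dict_dir/province.txt.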
    Print('generate province dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()
    out_path = os.path.join(dict_dir, 'province.txt')
    province_names = set()
    error_domains = get_error_domains()
    for bk_url in tqdm(ename_title_map, total=len(ename_title_map)):
        enames = ename_title_map[bk_url]
        if not bk_url in baike_info_map:
            continue
        bk_info = baike_info_map[bk_url]
        bk_types = bk_info.types
        if not "fb:location.location" in bk_types:
            continue

        is_province = False
        for bk_type in bk_types:
            if get_domain(bk_type) == 'fb:location' and (
                    'state' in bk_type or "province" in bk_type):
                print "province bk_type: %s" % bk_url
                is_province = True

        for ename in enames:
            ename = ename.decode('utf-8')
            if len(ename) > 2 and (ename.endswith(u'省')
                                   or ename.endswith(u"州")):
                print "province ename: %s %s" % (ename, bk_url)
                is_province = True

        # if is_province:
        #     for bk_type in bk_types:
        #         if get_domain(bk_type) in error_domains:
        #             is_province = False
        #             print "province error type: %s" %(bk_url)

        if is_province:
            province_names.update(enames)

    outf = file(out_path, 'w')
    for name in province_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % (name))
    outf.close()
Example #13
def extract_table_columns():
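    # Count table columns over the whole corpus: parse the HTML tables in every chapter
    # of every baike document and write one line per table containing the baike URL, the
    # chapter title and the table's column names joined by " # ".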
    outpath = os.path.join(table_dir, 'table_column_count.tsv')
    outf = file(outpath, 'w')
    doc_path = os.path.join(rel_ext_dir, 'baike_doc.json')
    Print('count table\'s columns')
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        bk_url, doc = line.split('\t')
        doc = json.loads(doc)
        for title, chapter in doc:
            if type(chapter) is not unicode:
                continue
            try:
                tables = parse_tables_from_html(chapter)
                for table in tables:
                    outf.write('%s\t%s\t%s\n' %
                               (bk_url, title, " # ".join(table['columns'])))
            except Exception as e:
                print 'error at parse %s title = %s: %s' % (bk_url, title, e)
def process(inpath, outpath, name_map, fb_uris):
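    # Clean one Freebase-relations file: keep only entities listed in fb_uris, skip
    # properties for which schema.reverse_property returns the property itself as well as
    # properties in error_props, and replace every object either with its names from
    # name_map or with its literal value (unresolved "fb:m." ids are dropped). The
    # deduplicated relations are written as "fb_key\tJSON"; properties with more than
    # 300 values are reported in log/error.log.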
    schema = Resource.get_singleton().get_schema()
    error_props = load_error_property()

    Print('process %s' % inpath)
    outf = file(outpath, 'w')
    error_outf = file('log/error.log', 'w')
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        fb_key, rels = line.split('\t')
        if not fb_key in fb_uris:
            continue
        rels = json.loads(rels)
        new_rels = {}
        for fb_property, obj in rels:
            if schema.reverse_property(fb_property) == fb_property:
                continue
            if fb_property in error_props:
                continue
            if obj in name_map:
                names = name_map[obj]
            else:
                literal = process_fb_value(obj)
                if literal.startswith('fb:m.'):
                    # error_outf.write('error property %s, entity %s\n' %(fb_property, fb_key))
                    names = []
                else:
                    names = [process_fb_value(obj)]
            if len(names) == 0:
                continue
            if not fb_property in new_rels:
                new_rels[fb_property] = []
            new_rels[fb_property].extend(names)

        big = False
        for fb_property in new_rels:
            new_rels[fb_property] = list(set(new_rels[fb_property]))
            if len(new_rels[fb_property]) > 300:
                error_outf.write(
                    'big size property of url = %s, property = %s, size = %d\n'
                    % (fb_key, fb_property, len(new_rels[fb_property])))
        outf.write("%s\t%s\n" %
                   (fb_key, json.dumps(new_rels, ensure_ascii=False)))
    outf.close()
    error_outf.close()
def gen_citytown_dict():
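    # Collect the names of entities typed fb:location.citytown (also requiring
    # fb:location.location and excluding fb:people.person) and write the Chinese ones to
    # dict_dir/citytown.txt.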
    Print('generate citytown dict')
    resource = Resource.get_singleton()
    baike_info_map = resource.get_baike_info()
    ename_title_map = resource.get_baike_ename_title()

    citydown_names = set()

    for bk_url in tqdm(baike_info_map, total=len(baike_info_map)):
        if not bk_url in baike_info_map or not bk_url in ename_title_map:
            continue
        bk_types = baike_info_map[bk_url].types
        if not 'fb:location.location' in bk_types:
            continue
        if not "fb:location.citytown" in bk_types:
            continue

        if 'fb:people.person' in bk_types:
            continue

        enames = ename_title_map[bk_url]
        # is_error_name = False
        # error_suffix = ['乡', "镇", '村', '街道', '道路']
        # for ename in enames:
        #     for suffix in error_suffix:
        #         if ename.endswith(error_suffix):
        #             is_error_name = True
        # if is_error_name:
        #     continue

        citydown_names.update(enames)

    out_path = os.path.join(dict_dir, 'citytown.txt')
    outf = file(out_path, 'w')
    for name in citydown_names:
        if not is_chinese(name):
            continue
        outf.write("%s\n" % name)
    outf.close()
Example #16
    prof_dict_path = os.path.join(dict_dir, 'profession.txt')
    prof_url_dict_path = os.path.join(dict_dir, 'profession_url.txt')
    prof_dict = set(load_file(prof_dict_path))

    candidate_profs = set()
    for line in file(prof_cnt_path):
        p = line.split('\t')
        if len(p) == 2:
            prof = p[0]
            cnt = int(p[1])
            if cnt >= prof_cnt_threshold and is_chinese(
                    prof) and prof not in prof_dict and len(
                        prof.decode('utf-8')) >= 2:
                candidate_profs.add(prof)

    Print("#candidate name = %d" % len(candidate_profs))
    resource = Resource.get_singleton()
    baike_ename_title_map = resource.get_baike_ename_title()
    prof2bk = {}
    for bk_url in baike_ename_title_map:
        enames = baike_ename_title_map[bk_url]
        for ename in enames:
            if ename in candidate_profs:
                if not ename in prof2bk:
                    prof2bk[ename] = set()
                prof2bk[ename].add(bk_url)

    Print("#hit candidate name = %d" % len(prof2bk))
    extra_prof_out_path = os.path.join(dict_dir, 'extra_profession.txt')
    extra_prof_url_out_path = os.path.join(dict_dir,
                                           'extra_profession_url.txt')
Example #17
            return True
    return False


if __name__ == "__main__":
    name2bk = Resource.get_singleton().get_name2bk()

    keys = sorted(name2bk.keys())

    year_pattern = re.compile(ur'(公元前|公元)?\d{1,4}年$')
    re_digit = re.compile(r'^[0-9+\-=!?]+$')
    re_eng = re.compile(r"^[a-zA-Z]+$")

    valid_func = is_vertical_domain
    out_path = os.path.join(dict_dir, 'vertical_domain_baike_dict.txt')
    Print('use valid_func: is_vertical_domain')

    bk_info_map = Resource.get_singleton().get_baike_info()

    outf = file(out_path, 'w')
    Print('write dict to %s' % out_path)
    for name in tqdm(keys, total=len(keys)):

        if not is_valid_dict_name(name):
            continue
        # if has_punc_eng(name):
        #     continue
        bks = name2bk[name]
        # pop = 0
        valid = False
        for bk_url in bks:
Example #18
    def adjust_pop_by_summary(self):
        Print('adjust entity popularity according to its summary length')
        for bk_url in tqdm(self.bk_info_map, total=len(self.bk_info_map)):
            summary_length = len(self.summary_map.get(bk_url, "")) / 100
            self.bk_info_map[bk_url].pop += min(summary_length * 2, 10)
def extract_team_extra_name_from_summary(summary_path, out_path):
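    # Mine extra names for team/organization entities from their summaries. Candidates
    # come from three sources: each ename with a known team suffix stripped (skipped if
    # the remainder is a location; a "队" variant is also added), names parsed from the
    # summary sentences, and juxtaposed names (j_names) seen at least twice, the latter
    # two kept only if they are a good sub-sequence of some ename (is_good_sub_seq).
    # Names the entity already has or containing strange punctuation are dropped, and
    # the survivors are written one line per baike URL.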

    resource = Resource.get_singleton()
    url2names = resource.get_url2names()
    ename_title_map = resource.get_baike_ename_title()
    baike_info_map = resource.get_baike_info()
    location_dict = resource.get_location_dict()
    ltp = resource.get_ltp()

    ner = NamedEntityReg()
    team_suffixes = load_team_suffix()
    team_suffixes = [x.decode('utf-8') for x in team_suffixes]
    team_suffixes = set(team_suffixes)

    Print('extract org\'s extra name from summary [%s]' % summary_path)
    Print("result write to [%s]" % out_path)
    outf = file(out_path, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        bk_url, summary = line.split('\t')
        types = baike_info_map[bk_url].types
        if not is_team(types):
            continue

        enames = ename_title_map[bk_url]
        enames = [x.decode('utf-8') for x in enames]
        ori_names = url2names[bk_url]
        ori_names = set([x.decode('utf-8') for x in ori_names])

        summary = json.loads(summary)['summary']
        sentences = split_sentences(summary)

        parsed_names = []
        j_names_cnt = {}
        for sentence in sentences:
            names, j_names = parse_entity(sentence, ltp, ner, location_dict)
            parsed_names.extend(names)
            for j_name in j_names:
                if not j_name in j_names_cnt:
                    j_names_cnt[j_name] = 0
                j_names_cnt[j_name] += 1

        succeed_names = set()
        suffixes = []
        for ename in enames:
            if "fb:sports.sports_team" in types:
                suffix = try_get_suffix(ename, team_suffixes)
            else:
                suffix = u''
            suffixes.append(suffix)

            if has_eng_digit(ename):
                continue

            if len(suffix) > 0:
                new_name = ename[:len(ename) - len(suffix)]
                if not is_location(new_name, location_dict):
                    succeed_names.add(new_name)
                    succeed_names.add(new_name + u"队")

        parsed_names = set(parsed_names)
        for parsed_name in parsed_names:
            valid = False
            for ename, suffix in zip(enames, suffixes):
                if has_eng_digit(ename):
                    continue
                if is_good_sub_seq(parsed_name, ename, suffix):
                    valid = True
            if valid:
                succeed_names.add(parsed_name)

        for j_name in j_names_cnt:
            if j_names_cnt[j_name] >= 2:
                valid = False
                for ename, suffix in zip(enames, suffixes):
                    if has_eng_digit(ename):
                        continue
                    if j_name not in ename and is_good_sub_seq(
                            j_name, ename, suffix):
                        valid = True
                if valid:
                    succeed_names.add(j_name)

        succeed_names = [
            new_name for new_name in succeed_names if not new_name in ori_names
        ]
        succeed_names = [
            new_name for new_name in succeed_names
            if not has_strange_punc(new_name)
        ]
        # succeed_names = [new_name for new_name in succeed_names if not is_location(new_name, location_dict)]
        if len(succeed_names) > 0:
            succeed_names = set(succeed_names)
            outf.write('%s\t%s\n' % (bk_url, "\t".join(succeed_names)))

    outf.close()
Example #20
def gen_title_rel_dict(fb_type,
                       count_filepath,
                       out_path,
                       cnt_threshold,
                       extra_name_filepath=None,
                       error_func=None,
                       url_path=None):
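    # Build a name dictionary for entities of the given Freebase type. Candidate URLs are
    # all entities carrying fb_type; their names are kept if count_filepath is None or
    # the name's count there reaches cnt_threshold, each name being mapped to its most
    # popular URL. Names from extra_name_filepath are added, and names equal to "无",
    # failing error_func, shorter than two characters or not Chinese are skipped. The
    # remaining names go to out_path and, if url_path is given, "name\turl" pairs are
    # written there as well.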
    Print('gen dict by type [%s]' % fb_type)
    candidate_urls = set()
    resource = Resource.get_singleton()
    baike_static_info_map = resource.get_baike_info()
    Print("gen candidate baike_url")
    for bk_url in tqdm(baike_static_info_map,
                       total=len(baike_static_info_map)):
        types = baike_static_info_map[bk_url].types
        if fb_type in types:
            candidate_urls.add(bk_url)

    candidate_names = set()
    if count_filepath is not None:
        for line in file(count_filepath):
            p = line.strip().split('\t')
            if len(p) == 2:
                name, cnt = p
                cnt = int(cnt)
                if cnt >= cnt_threshold:
                    candidate_names.add(name)
    Print('#candidate urls = %d, #candidate names = %d' %
          (len(candidate_urls), len(candidate_names)))
    # ename_title_map = resource.get_baike_ename_title()
    url2names = resource.get_url2names()

    title_names = set()
    title2url = {}
    for candidate_url in candidate_urls:
        enames = url2names[candidate_url]
        for ename in enames:
            if ename in candidate_names or count_filepath is None:
                # assert ename not in title_names
                title_names.add(ename)
                if ename in title2url:
                    pre_pop = baike_static_info_map[title2url[ename]].pop
                    pop = baike_static_info_map[candidate_url].pop
                    if pre_pop > pop:
                        title_url = title2url[ename]
                    else:
                        title_url = candidate_url
                else:
                    title_url = candidate_url
                title2url[ename] = title_url
            else:
                print "%s: miss name: %s" % (fb_type, ename)

    if extra_name_filepath is not None:
        Print("add extra name from [%s]" % extra_name_filepath)
        for line in file(extra_name_filepath):
            title_names.add(line.rstrip())

    outf = file(out_path, 'w')
    if url_path:
        url_outf = file(url_path, 'w')

    for title_name in sorted(title_names):
        if title_name == '无':
            continue
        if error_func is not None and error_func(title_name):
            print "%s: error func name: %s" % (fb_type, title_name)
            continue
        if len(title_name.decode('utf-8')) < 2:
            print "%s: short name: %s" % (fb_type, title_name)
            continue
        if is_chinese(title_name):
            outf.write(title_name + '\n')
            if url_path:

                url_outf.write("%s\t%s\n" %
                               (title_name, title2url[title_name]))
    outf.close()

    if url_path:
        url_outf.close()
def count_predicate(inpath, outpath, predicate):
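    # Count how often each value occurs for the given infobox predicate across all
    # entities in inpath and write the Chinese values as "value\tcount", most frequent
    # first.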
    cnt_map = {}
    for line in tqdm(file(inpath), total=nb_lines_of(inpath)):
        _, obj = line.split('\t')
        info = json.loads(obj)['info']
        values = info.get(predicate, [])
        for value in values:
            if not value in cnt_map:
                cnt_map[value] = 1
            else:
                cnt_map[value] += 1
    outf = file(outpath, 'w')
    for key in sorted(cnt_map.keys(), key=lambda x: cnt_map[x], reverse=True):
        if is_chinese(key):
            outf.write("%s\t%s\n" % (key, cnt_map[key]))
    outf.close()


if __name__ == "__main__":
    base_dir = os.path.join(rel_ext_dir, 'infobox_count')
    predicates = [u'职业', u'国籍']
    infobox_path = os.path.join(result_dir,
                                '360/360_entity_info_processed.json')
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
    for predicate in predicates:
        Print("process predicate %s" % predicate)
        outpath = os.path.join(base_dir, '%s_cnt.tsv' % (predicate))
        count_predicate(infobox_path, outpath, predicate)
Example #22
            main_pred = predicate.split("#")[0]
            if len(main_pred.decode('utf-8')) < 2:
                continue
            pred_map = self.map[predicate]
            for prop in pred_map:
                prop_cnt = pred_map[prop]
                self.add(main_pred, [prop], prop_cnt)
        for predicate in error_preds:
            self.map.pop(predicate)


if __name__ == '__main__':
    inpath = sys.argv[1]
    outpath = sys.argv[2]

    Print('collect from [%s] write to [%s]' % (inpath, outpath))
    pred_maps = PredicateMaps()
    for line in file(inpath):
        if not line.startswith('\t'):
            continue
        l = line.strip()
        p = l.split('\t')
        predicate = p[0]
        predicate = predicate.strip("'\" :#")
        if len(predicate) == 0:
            continue
        if len(predicate.decode('utf-8')) < 2 and predicate != '是':
            continue

        props = p[1:]
        pred_maps.add(predicate, props)
Example #23
def extract_summary_name(summary_path, keywords, outpath,
                         bracket_name_outpath):
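    # Mine extra names from entity summaries (entities in error_domains are skipped).
    # If a sentence introducing a known name is found, look for a new name after it
    # (find_new_extra_name); otherwise, when no known name occurs in the first sentence,
    # fall back to find_no_subj_name. Candidates are unfolded against the known names and
    # filtered (punctuation, length, digits, duplicates, error names) before being
    # written to outpath; names found in brackets are extracted separately with the same
    # filters and written to bracket_name_outpath.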
    Print('extract extra name from [%s]' % summary_path)
    # url2names = Resource.get_singleton().get_url2names()
    url2names = load_url2names()
    bk_info_map = Resource.get_singleton().get_baike_info()
    error_domains = ['fb:chemistry']
    ext = SummaryNameExtractor()
    outf = file(outpath, 'w')
    bracket_name_outf = file(bracket_name_outpath, 'w')
    for line in tqdm(file(summary_path), total=nb_lines_of(summary_path)):
        url, summary = line.split('\t')
        types = bk_info_map[url].types

        in_error_domain = False
        for bk_type in types:
            if get_domain(bk_type) in error_domains:
                in_error_domain = True
        if in_error_domain:
            continue

        summary = json.loads(summary)['summary']
        summary = summary.replace(u'(', u'(').replace(u')', u')')

        names = url2names[url]
        names = [x.decode('utf-8') for x in names]
        ret = ext.find_name_sentence(summary, names)
        if ret is None:
            extra_name = None
            sentences = split_sentences(summary)
            if len(sentences) > 0:
                first_sentence = sentences[0]
                no_subj = True
                for name in names:
                    if name in first_sentence:
                        no_subj = False
                if no_subj:
                    extra_name = ext.find_no_subj_name(summary, keywords)
        else:
            rest_sentence, first_name = ret
            extra_name = ext.find_new_extra_name(rest_sentence, keywords)

        if extra_name is not None:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            succeed_names = []
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                    and not too_long_name(extra_name, names) \
                    and not extra_name in names \
                    and not error_bracket_name(extra_name, names) \
                    and not too_short_name(extra_name) \
                    and not is_error_name(extra_name) \
                    and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
            if len(succeed_names) > 0:
                succeed_names = list(set(succeed_names))
                outf.write('%s\t%s\n' % (url, "\t".join(succeed_names)))
                names.extend(succeed_names)

        # extract bracket name
        extra_bracket_names = ext.extract_bracket_names(
            summary, keywords, names)
        succeed_names = []
        for extra_name in extra_bracket_names:
            extra_name = extra_name.strip()
            extra_names = unfold(extra_name, names)
            for extra_name in extra_names:
                extra_name = extra_name.strip(u'\'" \t\n”“')
                if not has_strange_punc(extra_name) \
                    and not too_long_name(extra_name, names) \
                    and not extra_name in names \
                    and not too_short_name(extra_name) \
                    and not is_error_name(extra_name) \
                    and not digit_in_name(extra_name):
                    succeed_names.append(extra_name)
        if len(succeed_names) > 0:
            succeed_names = list(set(succeed_names))
            bracket_name_outf.write('%s\t%s\n' %
                                    (url, "\t".join(succeed_names)))

    outf.close()
    bracket_name_outf.close()
Example #24
from src.IOUtil import Print, rel_ext_dir, result_dir, nb_lines_of
from tqdm import tqdm
import json
import os


def load_baike_ename_title():
    path = os.path.join(result_dir, '360/360_entity_info_processed.json')
    Print('load baike\'s ename and title from [%s]' % path)
    ename_title_map = {}
    for line in tqdm(file(path), total=nb_lines_of(path)):
        bk_url, obj = line.split('\t')
        obj = json.loads(obj)
        ename, title = obj['ename'].encode('utf-8'), obj['title'].encode(
            'utf-8')
        if title != ename:
            ename_title_map[bk_url] = [ename, title]
        else:
            ename_title_map[bk_url] = [ename]
    return ename_title_map


if __name__ == "__main__":
    ename_title_map = load_baike_ename_title()
    out_path = os.path.join(rel_ext_dir, 'baike_ename_title.tsv')
    Print("write to [%s]" % out_path)
    outf = file(out_path, 'w')
    for bk_url in tqdm(sorted(ename_title_map.keys()),
                       total=len(ename_title_map)):
        outf.write('%s\t%s\n' % (bk_url, "\t".join(ename_title_map[bk_url])))
    outf.close()