Exemplo n.º 1
0
def get_fudanperiod_entity(entity_dict=False):
    """Collect the fudaninc entity names stored in the period cache.

    Every file in the cache directory is parsed as a JSON object whose keys
    are entity names. The de-duplicated names are written to
    ``fudankg_entities_dict.txt`` (when ``entity_dict`` is true) or
    ``fudankg_entities.txt`` (otherwise) through ``write_file``.

    Files that are not valid JSON are skipped: their raw content is appended
    to ``failed.txt`` and the running failure count is printed.

    :param entity_dict: when true, a name matched by ``regdropbrackets`` is
        replaced by group 1 of that match before being stored.
    """
    from hzlib.libfile import write_file
    saved_dir = '/data/hproject/2016/fudaninc-20160825'
    entities = set()
    failed = 0
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        # Read the raw text first so it is still available for the failure
        # log; after a failed json.load() the stream is already exhausted.
        with open(fname) as fd:
            raw = fd.read()
        try:
            data = json.loads(raw)
        except ValueError:  # json decode errors are ValueError subclasses
            failed += 1
            with open('failed.txt', 'a') as out:
                out.write(raw + '\n')
            print(failed)
            # Skip this file; otherwise `data` would be undefined or would
            # still hold the previous file's content.
            continue
        for entity in data:
            m = regdropbrackets.match(entity) if entity_dict else None
            entities.add(m.group(1) if m else entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
Exemplo n.º 2
0
def comic_song_extract_entity(fname, persistent=False):
    """Build the set of entity names found in ``fname``.

    Each line yielded by ``read_file_iter`` is decoded as UTF-8 and tested
    against ``regdropbrackets``. On a match, group 1 (re-encoded as UTF-8)
    is stored; otherwise the line is stored unchanged.

    :param fname: path handed to ``read_file_iter``.
    :param persistent: when true, the set is also written to
        ``entities/comic_song_entities.txt`` via ``write_file``.
    :return: the set of collected entity names.
    """

    def _entity_of(raw_line):
        # Matching is done on the decoded text; the result is stored encoded.
        hit = regdropbrackets.match(raw_line.decode('utf-8'))
        if hit:
            return hit.group(1).encode('utf-8')
        return raw_line

    names = set()
    for raw_line in read_file_iter(fname):
        names.add(_entity_of(raw_line))

    print('comic song entities length: ', len(names))
    if persistent:
        write_file('entities/comic_song_entities.txt', names)
    return names
Exemplo n.º 3
0
def load_merge_step5_wiki_simplified(dirname, fname):
    """Load a mapping from resource label to its alias value.

    The file ``dirname/fname`` is read with ``read_file_iter(..., jsn=True)``.
    Each yielded object is treated as a dict whose values carry a
    ``resource_label`` and, optionally, a ``resource_alias``. Entries without
    an alias are ignored. When ``regdropbrackets`` matches the label, group 1
    of the match is registered as an additional key for the same alias value.

    :param dirname: directory containing the file.
    :param fname: file name inside ``dirname``.
    :return: dict mapping labels (and their group-1 variants) to the alias
        value.
    """
    alias_by_label = {}
    path = os.path.join(dirname, fname)

    for record in read_file_iter(path, jsn=True):
        for _, info in record.iteritems():
            label = info[u'resource_label']
            if u'resource_alias' not in info:
                continue
            alias_by_label[label] = info[u'resource_alias']
            stripped = regdropbrackets.match(label)
            if stripped:
                alias_by_label[stripped.group(1)] = info[u'resource_alias']

    return alias_by_label
Exemplo n.º 4
0
def get_fudankg_entity(entity_dict=False):
    """Dump the entity names found in the saved fudankg cache files.

    Every file in the cache directory is loaded as JSON and its top-level
    keys are collected into a set. The set is then written, as a list, to
    ``fudankg_entities_dict.txt`` when ``entity_dict`` is true and to
    ``fudankg_entities.txt`` otherwise.

    :param entity_dict: when true, a key matched by ``regdropbrackets`` is
        replaced by group 1 of that match before being stored.
    """
    from hzlib.libfile import write_file

    cache_root = '/data/crawler_file_cache/fudankg_saved'
    collected = set()

    for entry in os.listdir(cache_root):
        path = os.path.join(cache_root, entry)
        with open(path) as handle:
            for label, _ in json.load(handle).iteritems():
                # The regex is only consulted in entity_dict mode.
                hit = regdropbrackets.match(label) if entity_dict else None
                collected.add(hit.group(1) if hit else label)

    if entity_dict:
        target = 'fudankg_entities_dict.txt'
    else:
        target = 'fudankg_entities.txt'
    write_file(target, list(collected))
Exemplo n.º 5
0
def fudan_ea_to_json(entity,
                     attribute,
                     attribute_name,
                     extra_tag,
                     values,
                     category=None,
                     searchscore=None,
                     alias=None):
    """Build the JSON-ready document for one entity/attribute pair.

    :param entity: type(entity) is unicode
    :param attribute: attribute key; combined with ``entity`` to derive the
        document id via ``gen_es_id``.
    :param attribute_name: stored as-is under ``attribute_name``.
    :param extra_tag: additional tag appended to the tag list.
    :param values: sequence of values; the first one is also stored under
        ``value`` (empty string when the sequence is empty).
    :param category: stored under ``category`` only when truthy.
    :param searchscore: always stored under ``searchscore``.
    :param alias: accepted for interface compatibility; not used here.
    :return: dict describing the document.
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    entity_name = entity

    m = regdropbrackets.match(entity)
    if m:
        # Also tag the case variants of the group-1 form of the name.
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())

    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))

    # entity(index: yes) used for full text retrieval, tags(not_analyzed) used for exactly match
    ret = {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        'value': values[0] if values else '',
        'values': values,
        'tags': list(set(tags)),  # de-duplicated; order is not preserved
        'searchscore': searchscore,
    }
    if category:
        ret['category'] = category
    return ret
Exemplo n.º 6
0
def ea_to_json(entity, attribute, attribute_name, extra_tag, values):
    """Build the JSON-ready document for one entity/attribute pair.

    :param entity: type(entity) is unicode
    :param attribute: attribute key; combined with ``entity`` to derive the
        document id via ``gen_es_id``.
    :param attribute_name: stored as-is under ``attribute_name``.
    :param extra_tag: additional tag appended to the tag list.
    :param values: sequence of values; the first one is also stored under
        ``value`` (empty string when the sequence is empty, matching
        ``fudan_ea_to_json``).
    :return: dict describing the document.
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    alias = get_all_aliases(entity)
    if alias:
        tags.extend(alias)

    # NOTE: a lookup through load_alias_mapping() used to extend the tags
    # here; it was disabled in the original code and remains disabled.

    entity_name = entity

    m = regdropbrackets.match(entity)
    if m:
        # Also tag the case variants of the group-1 form of the name.
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())

    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))

    # entity(index: yes) used for full text retrieval, tags(not_analyzed) used for exactly match
    return {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        # Guard against an empty sequence instead of raising IndexError.
        'value': values[0] if values else '',
        'values': values,
        'tags': list(set(tags))  # de-duplicated; order is not preserved
    }