def get_fudanperiod_entity(entity_dict=False):
    """Collect entity names from the fudaninc periodCache JSON dumps.

    Scans every file under the hard-coded ``saved_dir``, parses each as a
    JSON object mapping entity -> attribute dict, and writes the unique
    entity names out via ``write_file``.

    :param entity_dict: when True, strip a trailing bracketed qualifier
        from each entity via ``regdropbrackets`` and write to the
        ``*_dict`` output file; otherwise keep names verbatim.
    """
    from hzlib.libfile import write_file
    saved_dir = '/data/hproject/2016/fudaninc-20160825'
    entities = set()
    failed = 0
    for f in os.listdir(saved_dir):
        fname = os.path.join(saved_dir, f)
        with open(fname) as fd:
            try:
                data = json.load(fd)
            except ValueError:  # was a bare except; json.load raises ValueError on bad input
                failed += 1
                # json.load consumed the stream, so rewind before dumping the
                # raw content (the original wrote an empty string here).
                fd.seek(0)
                with open('failed.txt', 'a') as out:
                    out.write(fd.read() + '\n')
                print(failed)
                # Skip this file: the original fell through and re-processed
                # stale `data` from the previous iteration (NameError on the
                # very first failure).
                continue
        for entity, dic in data.iteritems():
            if entity_dict:
                m = regdropbrackets.match(entity)
                if m:
                    entities.add(m.group(1))
                else:
                    entities.add(entity)
            else:
                entities.add(entity)
    if entity_dict:
        write_file('fudankg_entities_dict.txt', list(entities))
    else:
        write_file('fudankg_entities.txt', list(entities))
def comic_song_extract_entity(fname, persistent=False):
    """Read comic-song entity names from *fname*, one per line.

    Lines whose text matches ``regdropbrackets`` are replaced by the
    captured (bracket-stripped) group, re-encoded as UTF-8 bytes; all
    other lines are kept verbatim.

    :param fname: path handed to ``read_file_iter``.
    :param persistent: when True, also write the unique names to
        ``entities/comic_song_entities.txt``.
    :return: set of entity names (byte strings).
    """
    entities = set()
    for raw in read_file_iter(fname):
        matched = regdropbrackets.match(raw.decode('utf-8'))
        if matched:
            entities.add(matched.group(1).encode('utf-8'))
        else:
            entities.add(raw)
    print('comic song entities length: ', len(entities))
    if persistent:
        write_file('entities/comic_song_entities.txt', entities)
    return entities
def load_merge_step5_wiki_simplified(dirname, fname):
    """Build a label -> alias-list mapping from a step-5 wiki JSON file.

    Each JSON record maps an id to an info dict; only entries carrying a
    ``resource_alias`` key contribute. Both the full ``resource_label``
    and, when ``regdropbrackets`` matches, its bracket-stripped form are
    mapped to the same alias value.

    :param dirname: directory containing the file.
    :param fname: file name, read with ``read_file_iter(..., jsn=True)``.
    :return: dict mapping label (and stripped label) to resource_alias.
    """
    result = {}
    path = os.path.join(dirname, fname)
    for record in read_file_iter(path, jsn=True):
        for _key, info in record.iteritems():
            label = info[u'resource_label']
            if u'resource_alias' not in info:
                continue
            result[label] = info[u'resource_alias']
            matched = regdropbrackets.match(label)
            if matched:
                result[matched.group(1)] = info[u'resource_alias']
    return result
def get_fudankg_entity(entity_dict=False):
    """Collect entity names from the cached fudankg JSON dumps.

    Walks every file under the hard-coded ``saved_dir``, parses each as a
    JSON object mapping entity -> attribute dict, and writes the unique
    entity names out via ``write_file``.

    :param entity_dict: when True, strip a trailing bracketed qualifier
        via ``regdropbrackets`` and write to the ``*_dict`` output file;
        otherwise keep names verbatim.
    """
    from hzlib.libfile import write_file
    saved_dir = '/data/crawler_file_cache/fudankg_saved'
    entities = set()
    for entry in os.listdir(saved_dir):
        with open(os.path.join(saved_dir, entry)) as fd:
            payload = json.load(fd)
        for entity, _dic in payload.iteritems():
            if not entity_dict:
                entities.add(entity)
                continue
            matched = regdropbrackets.match(entity)
            entities.add(matched.group(1) if matched else entity)
    target = 'fudankg_entities_dict.txt' if entity_dict else 'fudankg_entities.txt'
    write_file(target, list(entities))
def fudan_ea_to_json(entity, attribute, attribute_name, extra_tag, values,
                     category=None, searchscore=None, alias=None):
    """Build an ES-style entity/attribute document for the fudan source.

    :param entity: type(entity) is unicode
    :param attribute: attribute key, combined with entity to derive the id.
    :param attribute_name: human-readable attribute label.
    :param extra_tag: additional tag folded into the ``tags`` list.
    :param values: list of attribute values; ``value`` is the first or ''.
    :param category: optional category, included only when truthy.
    :param searchscore: optional ranking score (always present in output).
    :param alias: optional alias list (default was a shared mutable ``[]``
        — fixed to None).
    :return: dict ready for indexing.
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    entity_name = entity
    # NOTE(review): `alias` was bound to a local (`aliases = alias`) but never
    # folded into the output in the original code; behavior preserved —
    # confirm whether it should extend `tags` (as ea_to_json does).
    if alias is None:
        alias = []
    m = regdropbrackets.match(entity)
    if m:
        # Drop the bracketed qualifier for the display name and tag the
        # stripped form in both cases.
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())
    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))
    # entity (index: yes) used for full text retrieval; tags (not_analyzed)
    # used for exact match.
    ret = {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        'value': values[0] if len(values) > 0 else '',
        'values': values,
        'tags': list(set(tags)),
        # Key is always present (possibly None); the original's conditional
        # ret.update for searchscore was a redundant no-op and was removed.
        'searchscore': searchscore,
    }
    if category:
        ret.update({'category': category})
    return ret
def ea_to_json(entity, attribute, attribute_name, extra_tag, values):
    """Build an ES-style entity/attribute document.

    :param entity: type(entity) is unicode
    :param attribute: attribute key, combined with entity to derive the id.
    :param attribute_name: human-readable attribute label.
    :param extra_tag: additional tag folded into the ``tags`` list.
    :param values: list of attribute values; ``value`` is the first or ''.
    :return: dict ready for indexing.
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    # Fold in any known aliases for exact-match lookup.
    alias = get_all_aliases(entity)
    if alias:
        tags.extend(list(alias))
    entity_name = entity
    m = regdropbrackets.match(entity)
    if m:
        # Drop the bracketed qualifier for the display name and tag the
        # stripped form in both cases.
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())
    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))
    # entity (index: yes) used for full text retrieval; tags (not_analyzed)
    # used for exact match.
    return {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        # Fix: guard empty `values` (was an unconditional values[0], raising
        # IndexError), matching the sibling fudan_ea_to_json.
        'value': values[0] if values else '',
        'values': values,
        'tags': list(set(tags)),
    }