Example #1
def read_jsn(self, data_dir):
    # First pass over the data directory: parse the basic info of each record.
    for fname in os.listdir(data_dir):
        for js in libfile.read_file_iter(os.path.join(data_dir, fname),
                                         jsn=True):
            self.parse_info(js)
    # Second pass over the same files: parse subcompanies and aliases.
    for fname in os.listdir(data_dir):
        for js in libfile.read_file_iter(os.path.join(data_dir, fname),
                                         jsn=True):
            self.parse_subcompany(js)
            self.parse_alias(js)
Example #2
def begin_filter_with_chinese(fname):
    entities = set()
    for line in read_file_iter(fname):
        if regchinese.match(line.decode('utf-8')):
            entities.add(line)

    write_file('entities_chinese_2_10.txt', entities)
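These examples are Python 2 throughout (u'' literals, dict.iteritems(), decoding raw str lines), so the print('...', n) calls actually print tuples unless print_function is imported, and they assume module-level import os / import re plus two project helpers that are not part of the dump. A minimal sketch of what read_file_iter and write_file presumably do, inferred only from how they are called here (libfile.read_file_iter in Examples #1 and #3 appears to be the same helper):

import json

def read_file_iter(fname, jsn=False):
    # Assumed helper: yield a file line by line; with jsn=True, parse each
    # line as one JSON object (i.e. the inputs are JSON-lines files).
    with open(fname) as fin:
        for line in fin:
            line = line.rstrip('\n')
            if not line:
                continue
            yield json.loads(line) if jsn else line

def write_file(fname, entities):
    # Assumed helper: write one entity per line.
    with open(fname, 'w') as fout:
        for entity in entities:
            fout.write(entity + '\n')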
Example #3
def read_jsn(self, data_dir):
    for fname in os.listdir(data_dir):
        for js in libfile.read_file_iter(os.path.join(data_dir, fname),
                                         jsn=True):
            try:
                self.parse(js)
            except Exception as e:
                # Log and skip malformed records instead of aborting the load.
                print('{}: {}'.format(type(e), e))
Example #4
def begin_filter_with_search(fname):
    entities = set()

    for line in read_file_iter(fname):
        m = regentityfilt.match(line.decode('utf-8'))
        if m:
            entities.add(m.group(1))

    return entities
Example #5
def load_zhwiki_alias(dirname, fname):
    zhwiki_entity_alias = {}

    name = os.path.join(dirname, fname)
    for js in read_file_iter(name, jsn=True):
        if u'chinese_aliases' in js:
            zhwiki_entity_alias[js[u'chinese_label']] = js[u'chinese_aliases']

    return zhwiki_entity_alias
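The keys accessed above suggest that each line of the zhwiki/wikidata dump is one JSON object with at least chinese_label and, optionally, chinese_aliases; a made-up record and call, just for orientation:

# Hypothetical JSON-lines record (field names taken from the lookups above):
#   {"chinese_label": "...", "chinese_aliases": ["...", "..."]}
alias_map = load_zhwiki_alias('data', 'zhwiki_alias.json')  # path is illustrative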
Example #6
def comic_song_extract_entity(fname, persistent=False):
    entities = set()

    for line in read_file_iter(fname):
        m = regdropbrackets.match(line.decode('utf-8'))
        entity = m.group(1).encode('utf-8') if m else line
        entities.add(entity)

    print('comic song entities length: ', len(entities))
    if persistent:
        write_file('entities/comic_song_entities.txt', entities)
    return entities
Example #7
def wiki_title_entity(fname, persistent=False):
    entities = set()

    for line in read_file_iter(fname):
        m = regdisambiguation.match(line.strip().decode('utf-8'))
        item = m.group(1).encode('utf-8') if m else line.strip()
        if not item.startswith('\xee'):  # skip human-unreadable strings
            entities.add(item.strip())

    print('wiki title entities length: ', len(entities))
    if persistent:
        # Persist to disk only on request, matching the other extractors.
        write_file('entities/{}_title'.format(fname), entities)
    return entities
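regchinese, regdropbrackets, and regdisambiguation are module-level compiled patterns that are not included in these snippets. Judging only from how their match objects are used, they could look roughly like the sketch below; the exact patterns are an assumption, not the project's actual definitions:

import re

# Assumed reconstructions:
# regchinese        - line begins with a CJK character
# regdropbrackets   - capture a title minus a trailing "(...)" suffix
# regdisambiguation - same idea, probably restricted to disambiguation-style suffixes
regchinese = re.compile(u'^[\u4e00-\u9fa5]')
regdropbrackets = re.compile(u'^(.+?)\\s*[(\uff08][^()\uff08\uff09]*[)\uff09]\\s*$')
regdisambiguation = re.compile(u'^(.+?)\\s*[(\uff08][^()\uff08\uff09]*[)\uff09]\\s*$')
# regentityfilt (Example #4) cannot be reconstructed from its usage alone.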
Example #8
def dbpedia_extract_entity(fname, persistent=False):
    entities = set()

    for jsn in read_file_iter(fname, jsn=True):
        # Each record maps a resource key to its attributes; take the first
        # (normally only) key/value pair.  items()[0] is Python 2 only.
        _, value = jsn.items()[0]
        label = value[u'resource_label'].strip()

        m = regdisambiguation.match(label)
        entity = m.group(1) if m else label
        entities.add(entity.encode('utf-8'))

    print('dbpedia entities length: ', len(entities))
    if persistent:
        write_file('entities/dbpedia_entities.txt', entities)
    return entities
Example #9
def bdbk_extract_entity(ifilename, persistent=False):
    entities = set()
    last_line = '</>'

    for line in read_file_iter(ifilename):
        if last_line == '</>':
            # The line right after a '</>' separator is an entry title.
            entities.add(line)
        elif line.startswith('@@@LINK='):
            # Redirect entries; keep the link target as an entity too.
            entities.add(line[8:])
        last_line = line

    print('bdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/{}_entities.txt'.format(ifilename), entities)
    return entities
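The input bdbk_extract_entity expects looks like an MDict-style dictionary dump, where a '</>' line closes an entry and '@@@LINK=' marks a redirect. A tiny made-up sample, from which the function would collect entry_a, entry_b, and entry_c:

entry_a
...definition text...
</>
entry_b
@@@LINK=entry_c
</>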
Example #10
def load_dbpedia():
    data = {}
    for line in read_file_iter(DIR + 'merge_step_5_simplified.json', jsn=True):
        for key, value in line.items():
            entity = value[u'resource_label']
            data[entity] = {}

            if u'short_abstract' in value:
                data[entity]['definition'] = value[u'short_abstract']

#            if u'resource_alias' in value:
#                data[entity]['aliases'] = value[u'resource_alias']

    send_definition_to_es(data, 'definition')
    return data
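The lookups above (and in Examples #11 and #15) imply that every line of merge_step_5_simplified.json maps a resource key to its attributes; a hypothetical record, showing only the fields actually read:

{"<resource_key>": {"resource_label": "...",
                    "short_abstract": "...",
                    "resource_alias": ["...", "..."]}}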
Example #11
def load_merge_step5_wiki_simplified(dirname, fname):
    merge_step5_wiki_simplified = {}

    name = os.path.join(dirname, fname)
    for js in read_file_iter(name, jsn=True):
        for key, value in js.iteritems():
            entity = value[u'resource_label']
            if u'resource_alias' in value:
                aliases = value[u'resource_alias']
                merge_step5_wiki_simplified[entity] = aliases
                # Also index the variant produced by regdropbrackets
                # (the label minus any bracketed suffix).
                m = regdropbrackets.match(entity)
                if m:
                    merge_step5_wiki_simplified[m.group(1)] = aliases

    return merge_step5_wiki_simplified
Example #12
def load_wikidata():
    """ this function cost too much memory
    """
    data = {}
    for jsn in read_file_iter('wikidata_zh_simplified.json', jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entity = item.strip().encode('utf-8')
        data[entity] = {}

        if u'chinese_aliases' in jsn:
            data[entity]['aliases'] = jsn[u'chinese_aliases']
            jsn.pop(u'chinese_aliases')

        data[entity]['attributes'] = jsn
Example #13
def wiki_extract_entity(fname, persistent=False):
    entities = set()

    for jsn in read_file_iter(fname, jsn=True):
        m = regdisambiguation.match(jsn[u'chinese_label'])
        item = m.group(1) if m else jsn[u'chinese_label']
        entities.add(item.encode('utf-8').strip())
        if u'chinese_aliases' in jsn:
            # Add every alias as well, utf-8 encoded and stripped.
            entities.update(alias.encode('utf-8').strip()
                            for alias in jsn[u'chinese_aliases'])

    print('wiki entities length: ', len(entities))
    if persistent:
        write_file('entities/wiki_entities.txt', entities)
    return entities
Example #14
def zgdbk_extract_entity(infilename, persistent=False):
    entities = set()
    # Entry titles sit inside a fixed <span> in the zgdbk HTML pages.
    re_entity = re.compile('<span id="span2" class="STYLE2">(.+)</span')

    for line in read_file_iter(infilename):
        m = re_entity.match(line)
        if m:
            # regrmlabel and zgdbk_parse_entity are project helpers (not shown)
            # used to clean up the captured title.
            entity = regrmlabel(m.group(1))
            entity = zgdbk_parse_entity(entity)
            if entity:
                entities.add(entity.strip())

    print('zgdbk entities length: ', len(entities))
    if persistent:
        write_file('entities/zgdbk_entities.txt', entities)
    return entities
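For reference, a line that re_entity would pick up looks roughly like this (made-up title); everything outside the captured group is discarded:

<span id="span2" class="STYLE2">某条目</span>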
Example #15
def load_dbpedia():
    data = {}
    for line in read_file_iter('merge_step_5_simplified.json', jsn=True):
        for key, value in line.items():
            entity = value[u'resource_label'].encode('utf-8')
            data[entity] = {}

            if u'short_abstract' in value:
                data[entity]['definition'] = value[u'short_abstract']
                value.pop(u'short_abstract')

            if u'resource_alias' in value:
                data[entity]['aliases'] = value[u'resource_alias']
                value.pop(u'resource_alias')

            data[entity]['attributes'] = value
Example #16
def begin_filter_with_lower(fname):
    entities = set()
    for line in read_file_iter(fname):
        entities.add(line.lower())
    return entities
Example #17
def read_jsn(self, data_dir):
    for fname in os.listdir(data_dir):
        for js in libfile.read_file_iter(os.path.join(data_dir, fname),
                                         jsn=True):
            self.parse(js)
Example #18
def load_zgdbk_info(dirname='.'):
    fname = os.path.join(dirname, 'zgdbk_entity_info.txt')
    send_definition_to_es(read_file_iter(fname, jsn=True), field=None)
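send_definition_to_es is another project helper that the dump does not include. Given that it is fed either an {entity: {...}} dict plus a field name (Example #10) or a raw JSON-record iterator with field=None (Example #18), a very rough sketch using an older elasticsearch-py client might look like this; the index name, doc type, and connection details are all assumptions:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumed local ES instance

def send_definition_to_es(data, field, index='entities', doc_type='entity'):
    # Rough sketch only: index one document per entity.  When `field` is
    # given, `data` is assumed to be {entity: {field: ...}}; when it is None,
    # `data` is assumed to be an iterable of ready-made documents.
    if field is not None:
        docs = ({'entity': entity, field: info.get(field)}
                for entity, info in data.items())
    else:
        docs = iter(data)
    for doc in docs:
        es.index(index=index, doc_type=doc_type, body=doc)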