Example #1
from collections import defaultdict

def get_mid2wiki(filename):
    # Build a MID -> bool map marking every entity that has a Wikipedia link.
    # rdf2fb and clean_uri are the project's own URI-normalization helpers.
    mid2wiki = defaultdict(bool)
    with open(filename) as fin:
        for line in fin:
            items = line.strip().split('\t')
            sub = rdf2fb(clean_uri(items[0]))
            mid2wiki[sub] = True
    return mid2wiki
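
A minimal usage sketch (the file path and MIDs are illustrative placeholders; rdf2fb and clean_uri are assumed to be the project's URI helpers):

# hypothetical invocation; the path and MID formats are assumptions
mid2wiki = get_mid2wiki('entity2wikipedia.tsv')
print(mid2wiki['fb:m.0f8l9c'])          # True if that MID appeared in the file
print(mid2wiki['fb:m.does_not_exist'])  # False, courtesy of defaultdict(bool)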
Example #2
def get_names_for_entities(namespath):
    # Build an entity -> [name literals] map from a 4-column, tab-separated names dump.
    print("getting names map...")
    names = {}
    with open(namespath, 'r') as f:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))

            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue  # skip malformed lines rather than indexing past the split
            entity = clean_uri(items[0])
            literal = clean_uri(items[2]).lower()
            names.setdefault(entity, []).append(literal)
    return names
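
A hedged usage sketch (the path and lookup key are placeholders; actual keys depend on what clean_uri() returns):

# illustrative call; 'names.txt' stands in for any 4-column names dump
names = get_names_for_entities('names.txt')
some_entity = 'fb:m.0f8l9c'        # placeholder key
print(names.get(some_entity, []))  # lowercased name literals recorded for that entity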
Example #3
def trim_names(fbsubsetpath, namespath, outpath):
    # Keep only the name lines whose entity MID occurs in the Freebase subset.
    print("getting all entity MIDs from Freebase subset...")
    mids_to_check = get_all_entity_mids(fbsubsetpath)
    print("trimming names...")
    with open(namespath, 'r') as f, open(outpath, 'w') as outfile:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))

            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue  # skip malformed lines
            entity = www2fb(clean_uri(items[0]))

            if entity in mids_to_check:
                outfile.write(line)
Example #4
import pickle

def create_inverted_index_entity(namespath, outpath):
    # Map every n-gram of every entity name to the set of MIDs that carry that name.
    print("creating the index map...")
    index = {}
    size = 0
    with open(namespath, 'r') as f:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))

            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue  # skip malformed lines

            entity_mid = clean_uri(items[0])
            entity_name = clean_uri(items[2])

            name_ngrams = get_name_ngrams(entity_name)

            for ngram_tuple in name_ngrams:
                size += 1
                ngram = strip_accents(" ".join(ngram_tuple))
                index.setdefault(ngram, set()).add(entity_mid)

    print("num keys: {}".format(len(index)))
    print("total key-value pairs: {}".format(size))

    print("dumping to pickle...")
    with open(outpath, 'wb') as f:
        pickle.dump(index, f)

    print("DONE")
Example #5
        match_mid_list.extend(mids)
        for mid in mids:
            if mid_dic.get(mid) is not None:
                tuplelist.append((mid, name))
    tupleset.extend(tuplelist)
    head_mid_idx[i] = list(set(tuplelist))
    if tuplelist:
        id_match.add(i)
tupleset = set(tupleset)
tuple_topic = []
with open('data/FB5M.name.txt', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if (www2fb(clean_uri(items[0])), processed_text(clean_uri(items[2]))) in tupleset and items[1] == "<fb:type.object.name>":
            tuple_topic.append((www2fb(clean_uri(items[0])), processed_text(clean_uri(items[2]))))
tuple_topic = set(tuple_topic)


######################## Learn entity representation  ########################
head_emb = np.zeros((total_num, args.embed_dim))
TEXT = data.Field(lower=True)
ED = data.Field(sequential=False, use_vocab=False)
train, dev = data.TabularDataset.splits(path=args.output, train='entity_train.txt', validation='entity_valid.txt', format='tsv', fields=[('text', TEXT), ('mid', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None), ('text', TEXT), ('ed', None)]
test = data.TabularDataset(path=os.path.join(args.output, 'test.txt'), format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)  # build the vocabulary over the train, dev and test splits

args.gpu = -1  # run on CPU
# load the model
Example #6
with open(os.path.join(args.dataset, "annotated_wd_data_train_answerable.txt"), 'r') as f:
    for i, line in enumerate(f):
        items = line.strip().split("\t")
        if len(items) != 4:
            print("ERROR: line - {}".format(line))
            break
        entiset.add(items[0])  # entiset.add(www2fb(items[2]))
outfile = open(os.path.join(args.output, 'names.trimmed.txt'), 'w')  # output file path for trimmed names file
with open(args.names, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if len(items) != 3:
            print("ERROR: line - {}".format(line))
            continue  # skip malformed lines
        entity = clean_uri(items[0])
        if entity in fb_mids:
            name = processed_text(clean_uri(items[2]))
            if name.strip() != "":
                if entity in entiset:
                    outfile.write("{}\t{}\n".format(entity, name))
                elif name in gramset:
                    entiset.add(entity)
                    outfile.write("{}\t{}\n".format(entity, name))
                    #name_gram = [name]
                    #tokens = name.split()
                    #maxlen = len(tokens)
                    #if maxlen > 2:
                    #    j = maxlen - 1
                    #    for token in [tokens[idx:idx + j] for idx in range(maxlen - j + 1)]:
                    #        name_gram.append(' '.join(token))
Example #7
        match_mid_list.extend(mids)
        for mid in mids:
            if mid_dic.get(mid) is not None:
                tuplelist.append((mid, name))
    tupleset.extend(tuplelist)
    head_mid_idx[i] = list(set(tuplelist))
    if tuplelist:
        id_match.add(i)
tupleset = set(tupleset)
tuple_topic = []
with open('data/FB5M.name.txt', 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if (www2fb(clean_uri(items[0])), processed_text(clean_uri(items[2]))
            ) in tupleset and items[1] == "<fb:type.object.name>":
            tuple_topic.append((www2fb(clean_uri(items[0])),
                                processed_text(clean_uri(items[2]))))
tuple_topic = set(tuple_topic)

######################## Learn entity representation  ########################
head_emb = np.zeros((total_num, args.embed_dim))
TEXT = data.Field(lower=True)
ED = data.Field(sequential=False, use_vocab=False)
train, dev = data.TabularDataset.splits(path=args.output,
                                        train='entity_train.txt',
                                        validation='entity_valid.txt',
                                        format='tsv',
                                        fields=[('text', TEXT), ('mid', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None),
         ('obj', None), ('text', TEXT), ('ed', None)]
Example #8
    for i, line in enumerate(f):
        items = line.strip().split("\t")
        if len(items) != 4:
            print("ERROR: line - {}".format(line))
            break
        entiset.add(www2fb(items[0]))  # entiset.add(www2fb(items[2]))
outfile = open(os.path.join(args.output, 'names.trimmed.txt'), 'w')  # output file path for trimmed names file
with open(args.names, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if len(items) != 4:
            print("ERROR: line - {}".format(line))
            continue  # skip malformed lines
        entity = www2fb(clean_uri(items[0]))
        if entity in fb_mids:
            name = processed_text(clean_uri(items[2]))
            if name.strip() != "":
                if entity in entiset:
                    outfile.write("{}\t{}\n".format(entity, name))
                elif name in gramset:
                    entiset.add(entity)
                    outfile.write("{}\t{}\n".format(entity, name))
                    #name_gram = [name]
                    #tokens = name.split()
                    #maxlen = len(tokens)
                    #if maxlen > 2:
                    #    j = maxlen - 1
                    #    for token in [tokens[idx:idx + j] for idx in range(maxlen - j + 1)]:
                    #        name_gram.append(' '.join(token))