Пример #1
0
def generalize_question(a, b, parser=None):
    """Swap entity mentions for a generic symbol and shorten the query's URIs.

    Args:
        a: Question text in which entity mentions are looked up.
        b: SPARQL query string that accompanies the question.
        parser: Object providing ``parse_sparql``; when omitted a new
            ``LC_Qaud_LinkedParser`` is instantiated.

    Returns:
        A ``(question, query)`` tuple: the question with each found mention
        replaced by a placeholder, and the query with the matching URIs
        replaced by the same placeholder, the known namespace prefixes
        removed and the angle brackets stripped.
    """
    parser = LC_Qaud_LinkedParser() if parser is None else parser

    # Only URIs that report themselves as entities are searched for.
    _, _, parsed_uris = parser.parse_sparql(b)
    entity_uris = [candidate for candidate in parsed_uris
                   if candidate.is_entity()]

    # NOTE(review): this counter is never advanced, so every mention ends up
    # with the same "#ent" placeholder. Confirm whether distinct symbols
    # ("#ent", "#entt", ...) were intended before changing it.
    mention_index = 0
    for mention in find_mentions(a, entity_uris):
        suffix = "t" * (mention_index + 1)
        # NOTE(review): the offsets are applied to the text as rewritten by
        # earlier iterations; whether they remain valid depends on what
        # find_mentions returns -- verify against its implementation.
        head = a[:mention["start"]]
        tail = a[mention["end"]:]
        a = "{} #en{} {}".format(head, suffix, tail)
        b = b.replace(mention["uri"].raw_uri, "#en{}".format(suffix))

    # Drop the namespace part of the relation URIs and of any entity URIs
    # that were not replaced above, leaving only the local names.
    namespaces = (
        "http://dbpedia.org/resource/",
        "http://dbpedia.org/ontology/",
        "http://dbpedia.org/property/",
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    )
    for namespace in namespaces:
        b = b.replace(namespace, "")

    # Remove the angle brackets that wrapped the URIs.
    for bracket in "<>":
        b = b.replace(bracket, "")

    return a, b
Пример #2
0
    # split into separate files
    train_filepath = os.path.join(lc_quad_dir, 'LCQuad_train.json')
    trail_filepath = os.path.join(lc_quad_dir, 'LCQuad_trial.json')
    test_filepath = os.path.join(lc_quad_dir, 'LCQuad_test.json')

    ds = json.load(open("../../../output/lc_quad_gold.json"))
    total = len(ds)
    train_size = int(.7 * total)
    dev_size = int(.2 * total)
    test_size = int(.1 * total)

    json.dump(ds[:train_size], open(train_filepath, "w"))
    json.dump(ds[train_size:train_size + dev_size], open(trail_filepath, "w"))
    json.dump(ds[train_size + dev_size:], open(test_filepath, "w"))

    parser = LC_Qaud_LinkedParser()

    print('Split train set')
    save_split(train_dir, *split(train_filepath, parser))
    print('Split dev set')
    save_split(dev_dir, *split(trail_filepath, parser))
    print('Split test set')
    save_split(test_dir, *split(test_filepath, parser))

    # parse sentences
    print("parse train set")
    parse(train_dir, cp=classpath)
    print("parse dev set")
    parse(dev_dir, cp=classpath)
    print("parse test set")
    parse(test_dir, cp=classpath)