def generalize_question(a, b, parser=None):
    """Replace entity mentions in question *a* and SPARQL query *b* with
    generic placeholder symbols, and strip namespace prefixes from *b*.

    Args:
        a: natural-language question string.
        b: SPARQL query string whose URIs correspond to mentions in ``a``.
        parser: optional object exposing ``parse_sparql``; defaults to a
            fresh ``LC_Qaud_LinkedParser``.

    Returns:
        Tuple ``(a, b)``: the question with each entity mention replaced by
        a placeholder, and the query with the matching URIs replaced by the
        same placeholders and DBpedia/RDF namespace prefixes removed.
    """
    if parser is None:
        parser = LC_Qaud_LinkedParser()

    # Pull the URIs out of the query and keep only entity URIs
    # (relations are handled separately by prefix stripping below).
    _, _, uris = parser.parse_sparql(b)
    uris = [uri for uri in uris if uri.is_entity()]

    i = 0
    for item in find_mentions(a, uris):
        # Each mention gets a distinct symbol: #ent, #entt, #enttt, ...
        placeholder = "#en{}".format("t" * (i + 1))
        a = "{} {} {}".format(a[:item["start"]], placeholder, a[item["end"]:])
        b = b.replace(item["uri"].raw_uri, placeholder)
        # BUG FIX: the original never advanced i, so every mention collapsed
        # to the same "#ent" symbol, conflating distinct entities.
        i += 1

    # Remove namespace prefixes from the relation URIs and any remaining
    # entity URIs so the query uses bare local names.
    for prefix in [
            "http://dbpedia.org/resource/",
            "http://dbpedia.org/ontology/",
            "http://dbpedia.org/property/",
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#"]:
        b = b.replace(prefix, "")
    b = b.replace("<", "").replace(">", "")

    return a, b
# Split the gold dataset into train / trial (dev) / test files, then run the
# project's split/save/parse pipeline over each part.
train_filepath = os.path.join(lc_quad_dir, 'LCQuad_train.json')
trial_filepath = os.path.join(lc_quad_dir, 'LCQuad_trial.json')
test_filepath = os.path.join(lc_quad_dir, 'LCQuad_test.json')

# Use a context manager so the input handle is closed deterministically
# (the original leaked the handle from json.load(open(...))).
with open("../../../output/lc_quad_gold.json") as f:
    ds = json.load(f)

# 70/20/10 split; the test slice takes everything after train+dev so no
# record is lost to integer truncation.
total = len(ds)
train_size = int(.7 * total)
dev_size = int(.2 * total)

# Context managers guarantee the dumped JSON is flushed and closed
# (the original never closed the output handles).
with open(train_filepath, "w") as f:
    json.dump(ds[:train_size], f)
with open(trial_filepath, "w") as f:
    json.dump(ds[train_size:train_size + dev_size], f)
with open(test_filepath, "w") as f:
    json.dump(ds[train_size + dev_size:], f)

parser = LC_Qaud_LinkedParser()

print('Split train set')
save_split(train_dir, *split(train_filepath, parser))
print('Split dev set')
save_split(dev_dir, *split(trial_filepath, parser))
print('Split test set')
save_split(test_dir, *split(test_filepath, parser))

# Parse sentences in each split (cp is the Java classpath for the parser).
print("parse train set")
parse(train_dir, cp=classpath)
print("parse dev set")
parse(dev_dir, cp=classpath)
print("parse test set")
parse(test_dir, cp=classpath)