def download_from_wikidata() -> None:
    """Fetch the Wikidata JSON document for every entity in the relation data.

    Reads its configuration from the command line:

    --datapath  passed to ``Relations`` to locate the relation files.
    --outpath   directory component of each destination path; every entity
                is handed to ``download_entity`` with the destination
                ``<outpath>/<entity>.json``.
    --use       boolean flag; accepted by the parser but not read here.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--datapath", default=None, type=str, required=True, help="")
    cli.add_argument("--outpath", default=None, type=str, required=True, help="")
    cli.add_argument("--use", action="store_true", help="")
    options = cli.parse_args()

    # Load every available relation file so all referenced entities are known.
    relation_store = Relations(options.datapath)
    relation_store.load_data(relation_store.get_available_filenames())

    # Entities are collected from both the object and the subject URI fields.
    entity_ids = relation_store.get_all_entities(["obj_uri", "sub_uri"])

    url_template = "https://www.wikidata.org/wiki/Special:EntityData/{}.json"
    for entity_id in tqdm.tqdm(entity_ids):
        source_url = url_template.format(entity_id)
        destination = os.path.join(options.outpath, entity_id + ".json")
        download_entity(source_url, destination)
def main():
    """Write a per-language copy of every relation file with translated labels.

    Reads its configuration from the command line:

    --data             passed to ``Relations`` to locate the relation files.
    --entities         passed to ``get_entity_surface`` for label lookups.
    --outpath          root output directory; created if it does not exist.
    --languagemapping  passed to ``load_languagemapping``; iterating its
                       result yields the languages to process.

    For each language ``lang`` this produces:

    * ``<outpath>/<lang>/<filename>.jsonl`` with one JSON object per relation
      that carries all of ``sub_uri``, ``obj_uri``, ``sub_label`` and
      ``obj_label``. If both entities have a surface form in ``lang`` those
      are used and ``from_english`` is ``False``; otherwise the labels
      already present in the relation are kept and ``from_english`` is
      ``True``.
    * ``<outpath>/<lang>.log`` with one ``filename|converted|available|in_file``
      summary line per relation file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default=None, type=str, required=True, help="")
    parser.add_argument("--entities", default=None, type=str, required=True, help="")
    parser.add_argument("--outpath", default=None, type=str, required=True, help="")
    parser.add_argument("--languagemapping", default=None, type=str, required=True, help="")
    args = parser.parse_args()

    lang2translateid = load_languagemapping(args.languagemapping)

    # The relation data does not depend on the target language, so it is
    # loaded once instead of once per language.
    t = Relations(args.data)
    t.load_data(t.get_available_filenames())

    required_keys = ("sub_uri", "obj_uri", "sub_label", "obj_label")

    for lang in lang2translateid:
        # Counters are created per language and never reset per file, so each
        # summary line is a running total over the files processed so far.
        count = collections.Counter()

        # Creating the language directory also creates args.outpath, which
        # must exist before the log file next to it can be opened.
        outdirectory = os.path.join(args.outpath, lang)
        os.makedirs(outdirectory, exist_ok=True)

        # The context manager closes the log even if a relation file fails.
        with open(os.path.join(args.outpath, lang + ".log"), "w") as logfile:
            for filename, relations in t.data.items():
                LOG.info("Processing relation: {}".format(filename))
                with open(os.path.join(outdirectory, filename + ".jsonl"), "w") as fout:
                    for relation in relations:
                        count["in_file"] += 1
                        # Relations missing any required field are counted
                        # but not written out.
                        if not all(key in relation for key in required_keys):
                            continue
                        count["available"] += 1
                        obj_uri = relation["obj_uri"]
                        sub_uri = relation["sub_uri"]

                        # Load entity information for the target language.
                        obj_surface = get_entity_surface(args.entities, obj_uri, lang)
                        sub_surface = get_entity_surface(args.entities, sub_uri, lang)

                        if obj_surface and sub_surface:
                            count["converted"] += 1
                            to_write = {
                                "sub_uri": sub_uri,
                                "obj_uri": obj_uri,
                                "obj_label": obj_surface,
                                "sub_label": sub_surface,
                                "from_english": False,
                            }
                        else:
                            # Fall back to the labels already in the relation.
                            to_write = {
                                "sub_uri": sub_uri,
                                "obj_uri": obj_uri,
                                "obj_label": relation["obj_label"],
                                "sub_label": relation["sub_label"],
                                "from_english": True,
                            }
                        fout.write(json.dumps(to_write) + "\n")

                summary = "{}|{}|{}|(converted/available/in_file)".format(
                    count["converted"], count["available"], count["in_file"])
                LOG.info(summary)
                logfile.write("{}|{}\n".format(filename, summary))