Пример #1
0
def download_from_wikidata() -> None:
    """Fetch the Wikidata JSON document for every entity in the relation data.

    Command-line arguments:
        --datapath: required; passed to ``Relations`` to locate the data.
        --outpath: required; each entity's destination path is built as
            ``<outpath>/<entity>.json`` and handed to ``download_entity``.
        --use: boolean flag; parsed but not read by this function.
    """
    cli = argparse.ArgumentParser()
    # Both path options share exactly the same argparse configuration.
    for flag in ("--datapath", "--outpath"):
        cli.add_argument(flag, default=None, type=str, required=True, help="")
    cli.add_argument("--use", action="store_true", help="")
    options = cli.parse_args()

    relations = Relations(options.datapath)
    relations.load_data(relations.get_available_filenames())
    entity_ids = relations.get_all_entities(["obj_uri", "sub_uri"])

    url_template = "https://www.wikidata.org/wiki/Special:EntityData/{}.json"
    for entity_id in tqdm.tqdm(entity_ids):
        source_url = url_template.format(entity_id)
        target_path = os.path.join(options.outpath, entity_id + ".json")
        download_entity(source_url, target_path)
Пример #2
0
def main() -> None:
    """Write per-language relation files, using localized labels if available.

    Command-line arguments (all required):
        --data: passed to ``Relations`` to locate the relation data.
        --entities: passed to ``get_entity_surface`` for label lookups.
        --outpath: root output directory.
        --languagemapping: passed to ``load_languagemapping``; iterating its
            result yields the languages to process.

    For every language ``<lang>`` and every relation file ``<name>`` this
    writes ``<outpath>/<lang>/<name>.jsonl`` with one JSON object per relation
    that has all of ``sub_uri``, ``obj_uri``, ``sub_label`` and ``obj_label``.
    When both localized surface forms are found they are used and
    ``from_english`` is ``False``; otherwise the labels already present on the
    relation are kept and ``from_english`` is ``True``.

    A summary line per relation file is logged and written to
    ``<outpath>/<lang>.log``. The counters are created once per language, so
    each summary is cumulative over the files processed so far.
    """
    parser = argparse.ArgumentParser()
    # All four options share exactly the same argparse configuration.
    for flag in ("--data", "--entities", "--outpath", "--languagemapping"):
        parser.add_argument(flag, default=None, type=str, required=True,
                            help="")
    args = parser.parse_args()
    lang2translateid = load_languagemapping(args.languagemapping)

    required_keys = ("sub_uri", "obj_uri", "sub_label", "obj_label")

    for lang in lang2translateid:
        # NOTE(review): the relation data is reloaded for every language, as
        # in the original; it looks loop-invariant and could probably be
        # loaded once -- confirm that Relations.load_data has no per-call
        # state before hoisting.
        t = Relations(args.data)
        filenames = t.get_available_filenames()
        t.load_data(filenames)
        count = collections.Counter()

        # Create the output tree BEFORE opening the log file: the log lives
        # directly in args.outpath, which would otherwise not exist yet on a
        # fresh run and make open() fail with FileNotFoundError.
        outdirectory = os.path.join(args.outpath, lang)
        os.makedirs(outdirectory, exist_ok=True)

        log_path = os.path.join(args.outpath, lang + ".log")
        # The context manager closes the log even if processing raises.
        with open(log_path, "w", encoding="utf-8") as logfile:
            for filename, relations in t.data.items():
                LOG.info("Processing relation: {}".format(filename))
                relation_path = os.path.join(outdirectory,
                                             filename + ".jsonl")
                with open(relation_path, "w", encoding="utf-8") as fout:
                    for relation in relations:
                        count["in_file"] += 1
                        # Skip relations lacking any of the needed fields.
                        if not all(key in relation for key in required_keys):
                            continue
                        count["available"] += 1
                        obj_uri = relation["obj_uri"]
                        sub_uri = relation["sub_uri"]
                        # Look up the surface forms in the target language.
                        obj_surface = get_entity_surface(
                            args.entities, obj_uri, lang)
                        sub_surface = get_entity_surface(
                            args.entities, sub_uri, lang)
                        if obj_surface and sub_surface:
                            count["converted"] += 1
                            obj_label = obj_surface
                            sub_label = sub_surface
                            from_english = False
                        else:
                            # Fall back to the labels stored on the relation.
                            obj_label = relation["obj_label"]
                            sub_label = relation["sub_label"]
                            from_english = True
                        to_write = {
                            "sub_uri": sub_uri,
                            "obj_uri": obj_uri,
                            "obj_label": obj_label,
                            "sub_label": sub_label,
                            "from_english": from_english,
                        }
                        fout.write(json.dumps(to_write) + "\n")
                summary = "{}|{}|{}|(converted/available/in_file)".format(
                    count["converted"], count["available"], count["in_file"])
                LOG.info(summary)
                logfile.write("{}|{}\n".format(filename, summary))