def main(ranking, all_lemmas, language_code, top_n, dump_enriched,
         dump_top_lemmas, pid_batch, prop_batch):
    """Extract FrameNet data given a ranking of corpus Lexical Units (lemmas).

    Return frames only if FEs map to Wikidata properties via exact matching
    of labels and aliases.

    :param ranking: open file-like object with a ``name`` attribute,
        containing the ranked LUs as JSON; key order is preserved on load
    :param all_lemmas: open file-like object containing JSON, loaded and
        handed to ``extract_top_corpus_tokens``
    :param language_code: forwarded to ``get_labels_and_aliases``
    :param top_n: number of top LUs to consider, forwarded to
        ``get_top_n_lus``
    :param dump_enriched: writable file-like object with a ``name``
        attribute; receives the enriched LUs as indented JSON
    :param dump_top_lemmas: writable file-like object with a ``name``
        attribute; receives the top lemmas with tokens as indented JSON
    :param pid_batch: forwarded to ``get_property_ids``
    :param prop_batch: forwarded to ``get_entities``
    :return: always ``0``
    """
    # Logger arguments are passed lazily: formatting is deferred to the
    # logging machinery, and a tuple-valued argument cannot break the
    # ``%`` operator the way eager formatting would.
    logger.info("Loading ranked corpus Lexical Units (LUs) from '%s' ...",
                ranking.name)
    # Remember to preserve the order: the ranking is encoded in key order
    lus = json.load(ranking, object_pairs_hook=OrderedDict)
    logger.info("Loaded %d LUs", len(lus))
    logger.info("Will consider the top %d LUs", top_n)
    top_lus = get_top_n_lus(lus, top_n)
    logger.debug("Top LUs: %s", top_lus)

    logger.info("Retrieving the full list of Wikidata properties ...")
    all_pids = get_property_ids(pid_batch)
    all_properties = get_entities(all_pids, prop_batch)
    logger.info("Extracting label and aliases only ...")
    clean_properties = get_labels_and_aliases(all_properties, language_code)

    enriched = intersect_lemmas_with_framenet(top_lus, clean_properties)
    logger.info("Managed to enrich %d LUs with FrameNet data", len(enriched))
    logger.info("Dumping top enriched LUs to '%s' ...", dump_enriched.name)
    json.dump(enriched, dump_enriched, indent=2)

    # Distinct name: this is a different value from the top LUs above
    top_lemmas = extract_top_corpus_tokens(enriched, json.load(all_lemmas))
    logger.info("Dumping top lemmas with tokens to '%s' ...",
                dump_top_lemmas.name)
    json.dump(top_lemmas, dump_top_lemmas, indent=2)
    return 0
def main(ranking, all_lemmas, language_code, top_n, dump_enriched, dump_top_lemmas, pid_batch, prop_batch):
    """Extract FrameNet data given a ranking of corpus Lexical Units (lemmas).

    Return frames only if FEs map to Wikidata properties via exact matching
    of labels and aliases.

    :param ranking: open file-like object with a ``name`` attribute,
        containing the ranked LUs as JSON; key order is preserved on load
    :param all_lemmas: open file-like object containing JSON, loaded and
        handed to ``extract_top_corpus_tokens``
    :param language_code: forwarded to ``get_labels_and_aliases``
    :param top_n: number of top LUs to consider, forwarded to
        ``get_top_n_lus``
    :param dump_enriched: writable file-like object with a ``name``
        attribute; receives the enriched LUs as indented JSON
    :param dump_top_lemmas: writable file-like object with a ``name``
        attribute; receives the top lemmas with tokens as indented JSON
    :param pid_batch: forwarded to ``get_property_ids``
    :param prop_batch: forwarded to ``get_entities``
    :return: always ``0``
    """
    # Hand the values to the logger instead of pre-formatting with ``%``:
    # interpolation is then done by the logging machinery, and a tuple
    # argument cannot trip up the format operator.
    logger.info("Loading ranked corpus Lexical Units (LUs) from '%s' ...", ranking.name)
    # Remember to preserve the order: the ranking is encoded in key order
    lus = json.load(ranking, object_pairs_hook=OrderedDict)
    logger.info("Loaded %d LUs", len(lus))
    logger.info("Will consider the top %d LUs", top_n)
    top_lus = get_top_n_lus(lus, top_n)
    logger.debug("Top LUs: %s", top_lus)

    logger.info("Retrieving the full list of Wikidata properties ...")
    all_pids = get_property_ids(pid_batch)
    all_properties = get_entities(all_pids, prop_batch)
    logger.info("Extracting label and aliases only ...")
    clean_properties = get_labels_and_aliases(all_properties, language_code)

    enriched = intersect_lemmas_with_framenet(top_lus, clean_properties)
    logger.info("Managed to enrich %d LUs with FrameNet data", len(enriched))
    logger.info("Dumping top enriched LUs to '%s' ...", dump_enriched.name)
    json.dump(enriched, dump_enriched, indent=2)

    # Separate name so the top LUs and the lemma/token mapping are not confused
    top_lemmas = extract_top_corpus_tokens(enriched, json.load(all_lemmas))
    logger.info("Dumping top lemmas with tokens to '%s' ...", dump_top_lemmas.name)
    json.dump(top_lemmas, dump_top_lemmas, indent=2)
    return 0
# Example #3
0
def main(corpus_frames, language_code, pid_batch, prop_batch, outfile):
    """Map FEs to Wikidata properties via exact matches.

    :param corpus_frames: open file-like object containing JSON, loaded and
        handed to ``compute_exact_matches``
    :param language_code: forwarded to ``get_labels_and_aliases``
    :param pid_batch: forwarded to ``get_property_ids``
    :param prop_batch: forwarded to ``get_entities``
    :param outfile: writable file-like object with a ``name`` attribute;
        receives the exact matches as indented JSON
    :return: always ``0``
    """
    all_pids = get_property_ids(pid_batch)
    all_properties = get_entities(all_pids, prop_batch)
    clean_properties = get_labels_and_aliases(all_properties, language_code)
    # NOTE(review): the JSON serialization below runs even when DEBUG
    # logging is disabled, since the argument is evaluated before the call.
    logger.debug(json.dumps(clean_properties, indent=2))

    logger.info("Computing exact matches mapping ...")
    frames = json.load(corpus_frames)
    exact_matches = compute_exact_matches(frames, clean_properties)
    # Lazy logger arguments: interpolation is left to the logging machinery
    logger.info("Total matches: %d Will dump to '%s' ...",
                len(exact_matches), outfile.name)
    json.dump(exact_matches, outfile, indent=2)
    return 0