def main(ranking, all_lemmas, language_code, top_n, dump_enriched, dump_top_lemmas, pid_batch, prop_batch):
    """
    Extract FrameNet data given a ranking of corpus Lexical Units (lemmas).
    Return frames only if FEs map to Wikidata properties via exact matching of labels and aliases.
    """
    logger.info("Loading ranked corpus Lexical Units (LUs) from '%s' ..." % ranking.name)
    # Remember to preserve the ranking order
    lus = json.load(ranking, object_pairs_hook=OrderedDict)
    logger.info("Loaded %d LUs" % len(lus))
    logger.info("Will consider the top %d LUs" % top_n)
    top = get_top_n_lus(lus, top_n)
    logger.debug("Top LUs: %s" % top)
    logger.info("Retrieving the full list of Wikidata properties ...")
    all_pids = get_property_ids(pid_batch)
    all_properties = get_entities(all_pids, prop_batch)
    logger.info("Extracting labels and aliases only ...")
    clean_properties = get_labels_and_aliases(all_properties, language_code)
    enriched = intersect_lemmas_with_framenet(top, clean_properties)
    logger.info("Managed to enrich %d LUs with FrameNet data" % len(enriched))
    logger.info("Dumping top enriched LUs to '%s' ..." % dump_enriched.name)
    json.dump(enriched, dump_enriched, indent=2)
    top = extract_top_corpus_tokens(enriched, json.load(all_lemmas))
    logger.info("Dumping top lemmas with tokens to '%s' ..." % dump_top_lemmas.name)
    json.dump(top, dump_top_lemmas, indent=2)
    return 0
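
# A minimal sketch of the label/alias extraction used above (hypothetical, not
# necessarily this project's implementation). It assumes get_entities returns a
# {PID: entity} mapping in standard Wikidata entity JSON, where 'labels' and
# 'aliases' are keyed by language code and each term carries a 'value' field.
def get_labels_and_aliases(all_properties, language_code):
    """Return a {PID: [label, alias, ...]} mapping for the given language."""
    clean = {}
    for pid, entity in all_properties.items():
        terms = []
        label = entity.get('labels', {}).get(language_code)
        if label:
            terms.append(label['value'])
        for alias in entity.get('aliases', {}).get(language_code, []):
            terms.append(alias['value'])
        if terms:
            clean[pid] = terms
    return clean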
def main(corpus_frames, language_code, pid_batch, prop_batch, outfile):
    """
    Map FEs to Wikidata properties via exact matching of labels and aliases.
    """
    all_pids = get_property_ids(pid_batch)
    all_properties = get_entities(all_pids, prop_batch)
    clean_properties = get_labels_and_aliases(all_properties, language_code)
    logger.debug(json.dumps(clean_properties, indent=2))
    logger.info("Computing exact matches mapping ...")
    exact_matches = compute_exact_matches(json.load(corpus_frames), clean_properties)
    logger.info("Total matches: %d. Will dump to '%s' ..." % (len(exact_matches), outfile.name))
    json.dump(exact_matches, outfile, indent=2)
    return 0
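
# Hypothetical sketch of the exact-matching step (assumed data shapes, not the
# project's actual code): each frame is assumed to carry its FEs as a list of
# dicts with a 'label' key, and clean_properties to map a PID to its label and
# aliases, as in the sketch above. An FE maps to a PID when its label equals
# any of those terms, compared case-insensitively.
def compute_exact_matches(corpus_frames, clean_properties):
    """Map FE labels to Wikidata property IDs via exact string matches."""
    matches = {}
    for lu, frame in corpus_frames.items():
        for fe in frame.get('FEs', []):
            fe_label = fe['label']
            for pid, terms in clean_properties.items():
                if fe_label.lower() in (term.lower() for term in terms):
                    matches.setdefault(lu, {})[fe_label] = pid
                    break
    return matches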