def main(): args = parse_args() config = load_config(args.config, ["wiki", "language_path", "wikidata", "redirections"], defaults={ "num_names_to_load": 0, "prefix": None, "sample_size": 100 }, relative_to=args.relative_to) prefix = config.prefix or induce_wikipedia_prefix(config.wiki) collection = TypeCollection(config.wikidata, num_names_to_load=0) collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json")) trie_index2indices = OffsetArray.load(join(config.language_path, "trie_index2indices"), compress=True) trie_index2indices_counts = OffsetArray( np.load(join(config.language_path, "trie_index2indices_counts.npy")), trie_index2indices.offsets) if exists( join(config.language_path, "trie_index2indices_transition_values.npy")): trie_index2indices_transitions = OffsetArray( np.load( join(config.language_path, "trie_index2indices_transition_values.npy")), np.load( join(config.language_path, "trie_index2indices_transition_offsets.npy")), ) else: trie_index2indices_transitions = None anchor_trie = marisa_trie.Trie().load( join(config.language_path, "trie.marisa")) wiki_trie = marisa_trie.RecordTrie('i').load( join(config.wikidata, "wikititle2wikidata.marisa")) redirections = load_redirections(config.redirections) seen = 0 with open(args.out, "wt") as fout: try: for i, (article_name, article) in tqdm(enumerate(iterate_articles(config.wiki))): if i == 5409: continue fixed_article, article_qid = convert( article_name, article, collection=collection, anchor_trie=anchor_trie, wiki_trie=wiki_trie, trie_index2indices=trie_index2indices, trie_index2indices_counts=trie_index2indices_counts, trie_index2indices_transitions= trie_index2indices_transitions, redirections=redirections, prefix=prefix) if fixed_article is False: continue for paragraph in fixed_article: for word, qids in paragraph: if len(qids) > 0: fout.write(word.rstrip() + "\t" + "\t".join(qids + [article_qid]) + "\n") else: fout.write(word.rstrip() + "\n") fout.write("\n") seen += 1 if seen >= config.sample_size: break finally: fout.flush() fout.close()
def main(): args = parse_args() config = load_config(args.config, [ "wiki", "language_path", "wikidata", "redirections", "classification", "path" ], defaults={ "num_names_to_load": 0, "prefix": None, "sample_size": 100, "wiki": None, "min_count": 0, "min_percent": 0.0 }, relative_to=args.relative_to) if config.wiki is None: raise ValueError("must provide path to 'wiki' in config.") prefix = get_prefix(config) print("Load type_collection") collection = TypeCollection(config.wikidata, num_names_to_load=config.num_names_to_load, prefix=prefix, verbose=True) fname = config.wiki all_tags = fix_and_parse_tags(config, collection, config.sample_size) test_tags = all_tags[:config.sample_size] train_tags = all_tags[config.sample_size:] oracles = [ load_oracle_classification(classification) for classification in config.classification ] def get_name(idx): if idx < config.num_names_to_load: if idx in collection.known_names: return collection.known_names[idx] + " (%s)" % ( collection.ids[idx], ) else: return collection.ids[idx] else: return maybe_web_get_name( collection.ids[idx]) + " (%s)" % (collection.ids[idx], ) while True: total_report, ambiguous_tags = disambiguate_batch( test_tags, train_tags, oracles) summarize_disambiguation(total_report) if args.log is not None: with open(args.log, "at") as fout: summarize_disambiguation(total_report, file=fout) if args.verbose: try: summarize_ambiguities(ambiguous_tags, oracles, get_name) except KeyboardInterrupt as e: pass if args.interactive: enter_or_quit() else: break
def main(): args = parse_args() config = load_config(args.config, [ "wiki", "language_path", "wikidata", "redirections", "classification" ], defaults={ "num_names_to_load": 0, "prefix": None, "sample_size": 100, "wiki": None, "fix_links": False, "min_count": 0, "min_percent": 0.0 }, relative_to=args.relative_to) if config.wiki is None: raise ValueError("must provide path to 'wiki' in config.") prefix = get_prefix(config) collection = TypeCollection(config.wikidata, num_names_to_load=config.num_names_to_load, prefix=prefix, verbose=True) collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json")) fname = config.wiki test_tags = fix_and_parse_tags(config, collection, config.sample_size) aucs = load_aucs() ids = sorted( set([ idx for doc_tags in test_tags for _, tag in doc_tags if tag is not None for idx in tag[2] if len(tag[2]) > 1 ])) id2pos = {idx: k for k, idx in enumerate(ids)} # use reduced identity system: remapped_tags = [] for doc_tags in test_tags: for text, tag in doc_tags: if tag is not None: remapped_tags.append( (id2pos[tag[1]] if len(tag[2]) > 1 else tag[1], np.array([id2pos[idx] for idx in tag[2]]) if len(tag[2]) > 1 else tag[2], tag[3])) test_tags = remapped_tags aucs = {key: value for key, value in aucs.items() if value > 0.5} print("%d relations to pick from with %d ids." % (len(aucs), len(ids)), flush=True) cached_satisfy = get_cached_satisfy(collection, aucs, ids, mmap=args.method == "greedy") del collection key2row = {key: k for k, key in enumerate(sorted(aucs.keys()))} if args.method == "greedy": picks, _ = beam_project(cached_satisfy, key2row, remapped_tags, aucs, ids, beam_width=1, penalty=args.penalty, log=args.log) elif args.method == "beam": picks, _ = beam_project(cached_satisfy, key2row, remapped_tags, aucs, ids, beam_width=args.beam_width, penalty=args.penalty, log=args.log) elif args.method == "cem": picks, _ = cem_project(cached_satisfy, key2row, remapped_tags, aucs, ids, n_samples=args.samples, penalty=args.penalty, log=args.log) elif args.method == "ga": picks, _ = ga_project(cached_satisfy, key2row, remapped_tags, aucs, ids, ngen=args.ngen, n_samples=args.samples, penalty=args.penalty, log=args.log) else: raise ValueError("unknown method %r." % (args.method, )) with open(args.out, "wt") as fout: json.dump(picks, fout)