def get_prefix(config):
    """Return the prefix to use for *config*.

    A truthy ``config.prefix`` is returned unchanged; otherwise the prefix
    is derived by calling ``induce_wikipedia_prefix`` on ``config.wiki``.
    """
    explicit_prefix = config.prefix
    if explicit_prefix:
        return explicit_prefix
    return induce_wikipedia_prefix(config.wiki)
def _write_article(fout, fixed_article, article_qid):
    """Write one converted article to *fout*, one word per line.

    Each paragraph is an iterable of ``(word, qids)`` pairs. A word with a
    non-empty ``qids`` list is followed by its tab-separated qids plus
    ``article_qid``; a word without qids is written alone. A blank line is
    written after every paragraph.
    """
    for paragraph in fixed_article:
        for word, qids in paragraph:
            token = word.rstrip()
            if qids:
                fout.write(token + "\t" + "\t".join(qids + [article_qid]) + "\n")
            else:
                fout.write(token + "\n")
        fout.write("\n")


def main():
    """Convert a sample of articles and write them to ``args.out``.

    Loads the configuration, the type collection (with its blacklist), the
    trie-index offset arrays, both marisa tries and the redirections, then
    runs ``convert`` over the articles yielded by
    ``iterate_articles(config.wiki)``. Successfully converted articles are
    written as tab-separated lines until ``config.sample_size`` of them have
    been written.
    """
    args = parse_args()
    config = load_config(
        args.config,
        ["wiki", "language_path", "wikidata", "redirections"],
        defaults={
            "num_names_to_load": 0,
            "prefix": None,
            "sample_size": 100
        },
        relative_to=args.relative_to)
    # Reuse the module-level helper rather than repeating its expression.
    prefix = get_prefix(config)

    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    language_path = config.language_path
    trie_index2indices = OffsetArray.load(
        join(language_path, "trie_index2indices"), compress=True)
    # The counts array shares its offsets with trie_index2indices.
    trie_index2indices_counts = OffsetArray(
        np.load(join(language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)

    # Transition data is optional: it is only loaded when the values file
    # is present on disk.
    transition_values_path = join(
        language_path, "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(join(language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(join(language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    seen = 0
    # The context manager flushes and closes the file on every exit path,
    # so no explicit flush()/close() is needed. UTF-8 is requested explicitly
    # so that writing article text does not depend on the locale's default
    # encoding.
    with open(args.out, "wt", encoding="utf-8") as fout:
        for i, (article_name, article) in tqdm(
                enumerate(iterate_articles(config.wiki))):
            # NOTE(review): article index 5409 is skipped unconditionally;
            # the reason is not visible here -- confirm it is still needed.
            if i == 5409:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            # convert() returns False in place of the article when there is
            # nothing to write for it.
            if fixed_article is False:
                continue
            _write_article(fout, fixed_article, article_qid)
            seen += 1
            if seen >= config.sample_size:
                break