def main():
    """Build the anchor trie plus its index/context mapping arrays and
    persist everything under ``args.out``.

    Side effects only: writes ``trie.marisa`` and six ``.npy`` arrays.
    """
    args = parse_args()
    makedirs(args.out, exist_ok=True)
    # Precomputed Wikipedia-title -> Wikidata-index lookup trie.
    wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load(
        args.wikipedia2wikidata_trie)
    print('loaded trie')
    redirections = load_redirections(args.redirections)
    anchor_trie = construct_anchor_trie(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix)
    anchor_trie.save(join(args.out, 'trie.marisa'))
    # construct_mapping returns two (offsets, values, counts) triples:
    # one for anchor->entity indices, one for anchor->context indices.
    indices_triple, contexts_triple = construct_mapping(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix,
        anchor_trie=anchor_trie)
    # Persist each triple under its conventional filename stem, in the
    # same order the original script wrote them.
    for stem, (offsets, values, counts) in [
            ("trie_index2indices", indices_triple),
            ("trie_index2contexts", contexts_triple)]:
        np.save(join(args.out, stem + "_offsets.npy"), offsets)
        np.save(join(args.out, stem + "_values.npy"), values)
        np.save(join(args.out, stem + "_counts.npy"), counts)
def fix_and_parse_tags(config, collection, size):
    """Load the language's tries/offset arrays, then tag up to ``size``
    Wikipedia documents, keeping only documents where at least one token
    resolved to an entity.

    Returns a list of per-document tag sequences (pairs whose second
    element is the resolved entity or ``None``).
    """
    language_dir = config.language_path
    trie_index2indices = OffsetArray.load(
        join(language_dir, "trie_index2indices"), compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(language_dir, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    # Transition arrays are optional artifacts; older builds omit them.
    transition_values_path = join(
        language_dir, "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(join(language_dir,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(join(language_dir, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)
    # Retry until the blacklist parses: the operator may edit the file
    # and press enter (or quit) between attempts.
    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
            break
        except (ValueError, ) as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())
    all_tags = []
    progress = get_progress_bar('fixing links', item='article')
    for doc in progress(docs):
        doc_tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        # Keep the document only if some token resolved to an entity.
        if any(resolved is not None for _, resolved in doc_tags):
            all_tags.append(doc_tags)
    collection.reset_cache()
    return all_tags
def main():
    """Convert Wikipedia articles into a token/QID TSV sample.

    Writes one token per line to ``args.out``; linked tokens carry their
    QIDs plus the article's own QID, tab-separated, and paragraphs are
    separated by blank lines. Stops after ``config.sample_size`` articles.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)
    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    # Transition arrays are optional artifacts; fall back to None when absent.
    if exists(join(config.language_path,
                   "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(join(config.language_path,
                         "trie_index2indices_transition_values.npy")),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)
    seen = 0
    # The context manager already guarantees flush+close on every exit
    # path (including exceptions), so the previous manual
    # ``finally: fout.flush(); fout.close()`` was redundant and removed.
    with open(args.out, "wt") as fout:
        for i, (article_name, article) in tqdm(
                enumerate(iterate_articles(config.wiki))):
            # NOTE(review): article index 5409 is skipped deliberately —
            # presumably a known-bad article; confirm before removing.
            if i == 5409:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            # ``False`` is convert()'s sentinel for "article unusable".
            if fixed_article is False:
                continue
            for paragraph in fixed_article:
                for word, qids in paragraph:
                    if len(qids) > 0:
                        fout.write(word.rstrip() + "\t" +
                                   "\t".join(qids + [article_qid]) + "\n")
                    else:
                        fout.write(word.rstrip() + "\n")
                # Blank line delimits paragraphs in the output TSV.
                fout.write("\n")
            seen += 1
            if seen >= config.sample_size:
                break
help="Location where anchor tags were saved (tsv).") parser.add_argument("redirections", type=str, help="Location where redirections were saved (tsv).") parser.add_argument("out", type=str, help="Directory to save trie/data in.") return parser.parse_args(argv) def main(): args = parse_args() makedirs(args.out, exist_ok=True) wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load( args.wikipedia2wikidata_trie ) print('loaded trie') redirections = load_redirections(args.redirections) anchor_trie = construct_anchor_trie( anchor_tags=args.anchor_tags, wikipedia2wikidata_trie=wikipedia2wikidata_trie, redirections=redirections, prefix=args.prefix ) anchor_trie.save(join(args.out, 'trie.marisa')) ( ( trie_index2indices_offsets, trie_index2indices_values, trie_index2indices_counts ), ( trie_index2contexts_offsets,