示例#1
0
def load_trie(language_path):
    trie_index2indices_values = OffsetArray.load(
        join(language_path, "trie_index2indices"))
    trie_index2indices_counts = OffsetArray(
        np.load(join(language_path, "trie_index2indices_counts.npy")),
        trie_index2indices_values.offsets)
    trie = marisa_trie.Trie().load(join(language_path, "trie.marisa"))
    return trie_index2indices_values, trie_index2indices_counts, trie
示例#2
0
def fix_and_parse_tags(config, collection, size):
    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(
            join(config.language_path,
                 "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_values.npy")),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)

    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except (ValueError, ) as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
            continue
        break

    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())

    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        if any(x is not None for _, x in tags):
            all_tags.append(tags)
    collection.reset_cache()
    return all_tags
def main():
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)

    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(
            join(config.language_path,
                 "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_values.npy")),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    seen = 0
    with open(args.out, "wt") as fout:
        try:
            for i, (article_name,
                    article) in tqdm(enumerate(iterate_articles(config.wiki))):
                if i == 5409:
                    continue
                fixed_article, article_qid = convert(
                    article_name,
                    article,
                    collection=collection,
                    anchor_trie=anchor_trie,
                    wiki_trie=wiki_trie,
                    trie_index2indices=trie_index2indices,
                    trie_index2indices_counts=trie_index2indices_counts,
                    trie_index2indices_transitions=
                    trie_index2indices_transitions,
                    redirections=redirections,
                    prefix=prefix)
                if fixed_article is False:
                    continue
                for paragraph in fixed_article:
                    for word, qids in paragraph:
                        if len(qids) > 0:
                            fout.write(word.rstrip() + "\t" +
                                       "\t".join(qids + [article_qid]) + "\n")
                        else:
                            fout.write(word.rstrip() + "\n")
                    fout.write("\n")
                seen += 1
                if seen >= config.sample_size:
                    break
        finally:
            fout.flush()
            fout.close()