def main():
    """Build the anchor trie plus its index/context mappings and save them.

    Command-line driven: loads the wikipedia-title -> wikidata trie and the
    redirections table, constructs a marisa trie over anchor strings, then
    builds (offsets, values, counts) triples for both the indices and the
    contexts mappings and writes every artifact under ``args.out``.
    """
    args = parse_args()
    makedirs(args.out, exist_ok=True)
    # Precomputed mapping from wikipedia article title to wikidata index.
    wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load(
        args.wikipedia2wikidata_trie)
    print('loaded trie')
    redirections = load_redirections(args.redirections)
    anchor_trie = construct_anchor_trie(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix)
    anchor_trie.save(join(args.out, 'trie.marisa'))
    indices_arrays, contexts_arrays = construct_mapping(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix,
        anchor_trie=anchor_trie)
    # Each mapping yields an (offsets, values, counts) triple; persist every
    # array under its conventional filename.
    saves = list(zip(("trie_index2indices_offsets.npy",
                      "trie_index2indices_values.npy",
                      "trie_index2indices_counts.npy"), indices_arrays))
    saves += list(zip(("trie_index2contexts_offsets.npy",
                       "trie_index2contexts_values.npy",
                       "trie_index2contexts_counts.npy"), contexts_arrays))
    for filename, array in saves:
        np.save(join(args.out, filename), array)
# Exemplo n.º 2 ("Example no. 2" — stray scraper artifact, commented out:
# the bare text and the lone "0" below it were not valid Python.
# 0
def fix_and_parse_tags(config, collection, size):
    """Run link fixing over up to ``size`` wikipedia docs and collect tags.

    Loads the per-language anchor/index tries (plus the optional transition
    arrays when present on disk), reads the type blacklist — prompting the
    user to repair it on parse errors — then applies ``obtain_tags`` to each
    document. Documents with no resolved tag at all are dropped.

    Returns:
        list: one tag sequence per document that produced at least one
        non-``None`` tag.
    """
    index_array = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    # Counts share the same offsets as the index values.
    count_array = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        index_array.offsets)
    transition_values_path = join(
        config.language_path, "trie_index2indices_transition_values.npy")
    # Transition arrays are optional; fall back to None when absent.
    if exists(transition_values_path):
        transition_array = OffsetArray(
            np.load(transition_values_path),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")))
    else:
        transition_array = None

    anchors = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    titles = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)

    # The blacklist is hand-edited JSON: keep prompting until it parses.
    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except (ValueError, ) as parse_error:
            print("issue reading blacklist, please fix.")
            print(str(parse_error))
            enter_or_quit()
            continue
        break

    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as name_file:
        first_names = set(name_file.read().splitlines())

    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        doc_tags = obtain_tags(
            doc,
            wiki_trie=titles,
            anchor_trie=anchors,
            trie_index2indices=index_array,
            trie_index2indices_counts=count_array,
            trie_index2indices_transitions=transition_array,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        # Keep only documents where at least one token resolved to a tag.
        if any(tag is not None for _, tag in doc_tags):
            all_tags.append(doc_tags)
    collection.reset_cache()
    return all_tags
def main():
    """Convert wikipedia articles into a tab-separated (token, qids) corpus.

    Loads the per-language tries and offset arrays, iterates the articles in
    ``config.wiki``, converts each article via ``convert``, and writes one
    token per line (``token\\tqid[\\tqid...]\\tarticle_qid`` when tagged,
    just ``token`` otherwise) to ``args.out``, stopping once
    ``config.sample_size`` articles have been successfully converted.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)

    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    # Counts share the same offsets as the index values.
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    # Transition arrays are optional; fall back to None when absent.
    transition_values_path = join(
        config.language_path, "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    seen = 0
    # The with-statement closes (and thus flushes) fout on every exit path,
    # including exceptions, so no explicit try/finally close is needed.
    with open(args.out, "wt") as fout:
        for i, (article_name,
                article) in tqdm(enumerate(iterate_articles(config.wiki))):
            if i == 5409:
                # HACK: this specific article index breaks conversion and is
                # skipped deliberately. TODO: find and fix the root cause.
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            # convert signals an unconvertible article with False.
            if fixed_article is False:
                continue
            for paragraph in fixed_article:
                for word, qids in paragraph:
                    if qids:
                        fout.write(word.rstrip() + "\t" +
                                   "\t".join(qids + [article_qid]) + "\n")
                    else:
                        fout.write(word.rstrip() + "\n")
                fout.write("\n")
            seen += 1
            if seen >= config.sample_size:
                break
        help="Location where anchor tags were saved (tsv).")
    parser.add_argument("redirections", type=str,
        help="Location where redirections were saved (tsv).")
    parser.add_argument("out", type=str,
        help="Directory to save trie/data in.")
    return parser.parse_args(argv)


def main():
    args = parse_args()
    makedirs(args.out, exist_ok=True)
    wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load(
        args.wikipedia2wikidata_trie
    )
    print('loaded trie')
    redirections = load_redirections(args.redirections)
    anchor_trie = construct_anchor_trie(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix
    )
    anchor_trie.save(join(args.out, 'trie.marisa'))
    (
        (
            trie_index2indices_offsets,
            trie_index2indices_values,
            trie_index2indices_counts
        ),
        (
            trie_index2contexts_offsets,