Exemplo n.º 1
0
def load_trie(language_path):
    """Load the anchor trie of a language along with its lookup arrays.

    Three artifacts are read from ``language_path``:

    * ``trie_index2indices`` (via ``OffsetArray.load``),
    * ``trie_index2indices_counts.npy``, wrapped in an ``OffsetArray`` that
      reuses the offsets of the array above,
    * ``trie.marisa``, loaded as a ``marisa_trie.Trie``.

    Args:
        language_path: directory holding the files listed above.

    Returns:
        A ``(values, counts, trie)`` tuple, in that order.
    """
    values = OffsetArray.load(join(language_path, "trie_index2indices"))

    # The counts are stored as a flat array; they share the offsets of the
    # values array loaded above.
    flat_counts = np.load(
        join(language_path, "trie_index2indices_counts.npy"))
    counts = OffsetArray(flat_counts, values.offsets)

    trie_path = join(language_path, "trie.marisa")
    trie = marisa_trie.Trie().load(trie_path)

    return values, counts, trie
Exemplo n.º 2
0
def fix_and_parse_tags(config, collection, size):
    """Load the language resources and tag a set of Wikipedia documents.

    Args:
        config: configuration object; its ``language_path``, ``wikidata``,
            ``redirections``, ``wiki``, ``min_count`` and ``min_percent``
            attributes are read here, and the whole object is handed to
            ``get_prefix``.
        collection: object on which ``load_blacklist`` and ``reset_cache``
            are called; it is also forwarded to ``obtain_tags``.
        size: forwarded to ``load_wikipedia_docs``.

    Returns:
        A list with the result of ``obtain_tags`` for every document in
        which at least one ``(_, x)`` pair has an ``x`` that is not None.
    """
    lang_dir = config.language_path

    trie_index2indices = OffsetArray.load(
        join(lang_dir, "trie_index2indices"), compress=True)
    flat_counts = np.load(join(lang_dir, "trie_index2indices_counts.npy"))
    trie_index2indices_counts = OffsetArray(flat_counts,
                                            trie_index2indices.offsets)

    # The transition arrays are optional: they are only loaded when the
    # values file is present on disk.
    transition_values_path = join(
        lang_dir, "trie_index2indices_transition_values.npy")
    trie_index2indices_transitions = None
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(
                join(lang_dir, "trie_index2indices_transition_offsets.npy")),
        )

    anchor_trie = marisa_trie.Trie().load(join(lang_dir, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)

    # Keep retrying until the blacklist loads without a ValueError, giving
    # the user the chance to fix the file (or quit) between attempts.
    blacklist_loaded = False
    while not blacklist_loaded:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except ValueError as err:
            print("issue reading blacklist, please fix.")
            print(str(err))
            enter_or_quit()
        else:
            blacklist_loaded = True

    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())

    # Keyword arguments that are identical for every document.
    tagging_kwargs = dict(
        wiki_trie=wiki_trie,
        anchor_trie=anchor_trie,
        trie_index2indices=trie_index2indices,
        trie_index2indices_counts=trie_index2indices_counts,
        trie_index2indices_transitions=trie_index2indices_transitions,
        redirections=redirections,
        prefix=prefix,
        first_names=first_names,
        collection=collection,
        fix_destination=fix_destination,
        min_count=config.min_count,
        min_percent=config.min_percent,
    )

    progress = get_progress_bar('fixing links', item='article')
    all_tags = []
    for doc in progress(docs):
        tags = obtain_tags(doc, **tagging_kwargs)
        # Documents whose pairs all carry None contribute nothing.
        if all(x is None for _, x in tags):
            continue
        all_tags.append(tags)

    collection.reset_cache()
    return all_tags
def main():
    """Write a tab-separated sample of converted articles to ``args.out``.

    The script loads the configuration, the type collection (with its
    blacklist), the anchor/wiki tries and the associated offset arrays, then
    iterates over the articles of ``config.wiki`` and passes each one to
    ``convert``.  For every converted article it writes one line per
    ``(word, qids)`` pair:

    * ``word<TAB>qid...<TAB>article_qid`` when ``qids`` is non-empty,
    * ``word`` alone otherwise,

    followed by an empty line after each paragraph.  Articles for which
    ``convert`` returns ``False`` as first element are ignored.  Processing
    stops once ``config.sample_size`` articles have been written.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)

    # NOTE(review): ``config.num_names_to_load`` has a default above but the
    # literal 0 is what gets passed here -- confirm this is intended.
    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)

    # The transition arrays are optional: they are only loaded when the
    # values file is present on disk.
    transition_values_path = join(config.language_path,
                                  "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    # NOTE(review): the article at this position of the iteration is skipped
    # unconditionally; the reason is not recorded in the code (presumably a
    # problematic article in one specific dump) -- confirm it is still needed.
    skipped_article_index = 5409

    seen = 0
    # The ``with`` block flushes and closes the file on every exit path,
    # including exceptions, so no explicit flush()/close() is required.
    with open(args.out, "wt") as fout:
        articles = tqdm(enumerate(iterate_articles(config.wiki)))
        for i, (article_name, article) in articles:
            if i == skipped_article_index:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            if fixed_article is False:
                continue
            for paragraph in fixed_article:
                for word, qids in paragraph:
                    if len(qids) > 0:
                        # The article's own qid is appended after the
                        # qids attached to the word.
                        fout.write(word.rstrip() + "\t" +
                                   "\t".join(qids + [article_qid]) + "\n")
                    else:
                        fout.write(word.rstrip() + "\n")
                # Blank line marks the end of a paragraph.
                fout.write("\n")
            seen += 1
            if seen >= config.sample_size:
                break
Exemplo n.º 4
0
def _sample_without_replacement(candidates, num_samples):
    """Draw at most ``num_samples`` distinct entries from ``candidates``.

    ``np.random.choice(..., replace=False)`` raises ``ValueError`` when asked
    for more samples than there are candidates, so the requested size is
    clamped to the population size.  An empty population is returned as-is.
    """
    if len(candidates) == 0:
        return candidates
    return np.random.choice(candidates,
                            size=min(num_samples, len(candidates)),
                            replace=False)


def main():
    """Fix the links of a language trie and save the result elsewhere.

    Steps performed:

    1. Load the ``trie_index2indices`` values/offsets/counts arrays and the
       ``trie.marisa`` trie from ``args.language_path``.
    2. Run ``fix`` ``args.steps`` times, tracking through ``location_shift``
       where every original entry ends up, and report the changes per step.
    3. Print a few sample remappings and sample anchors left without links.
    4. Filter the anchors without links out of the trie and remap the
       arrays (including the optional ``trie_index2contexts`` arrays) onto
       the filtered trie.
    5. Save the arrays, the filtered trie and the transition arrays (original
       value next to fixed value) under ``args.new_language_path``.

    Raises:
        ValueError: if ``args.new_language_path`` equals
            ``args.language_path``, or if ``args.steps`` is smaller than 1.
    """
    args = parse_args()
    if args.new_language_path == args.language_path:
        raise ValueError("new_language_path and language_path must be "
                         "different: cannot generate a fixed trie in "
                         "the same directory as the original trie.")
    if args.steps < 1:
        # Without at least one step the loop below never binds
        # ``location_shift``/``pre_reduced_values``; fail early and clearly
        # instead of with a NameError after loading all the data.
        raise ValueError("steps must be at least 1, got %r." % (args.steps,))

    c = TypeCollection(args.wikidata, num_names_to_load=0)
    c.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    original_values = np.load(
        join(args.language_path, "trie_index2indices_values.npy"))
    original_offsets = np.load(
        join(args.language_path, "trie_index2indices_offsets.npy"))
    original_counts = np.load(
        join(args.language_path, "trie_index2indices_counts.npy"))
    original_trie_path = join(args.language_path, 'trie.marisa')
    trie = marisa_trie.Trie().load(original_trie_path)
    initialize_globals(c)
    t0 = time.time()

    old_location_shift = None
    values, offsets, counts = original_values, original_offsets, original_counts
    for step in range(args.steps):
        anchor_length = get_trie_properties(trie, offsets, values)
        (offsets, values,
         counts), location_shift = fix(collection=c,
                                       offsets=offsets,
                                       values=values,
                                       counts=counts,
                                       anchor_length=anchor_length,
                                       num_category_link=8)
        if old_location_shift is not None:
            # see where newly shifted values are now pointing
            # to (extra indirection level):
            location_shift = location_shift[old_location_shift]
            # indexing with -1 picked the last element; restore the marker.
            location_shift[old_location_shift == -1] = -1
        old_location_shift = location_shift
        # Values as seen from the original positions; -1 marks positions
        # whose location shift is -1.
        pre_reduced_values = values[location_shift]
        pre_reduced_values[location_shift == -1] = -1
        changed = pre_reduced_values != original_values
        num_changes = int(changed.sum())
        change_volume = int(original_counts[changed].sum())
        print("step %d with %d changes, %d total links" %
              (step, num_changes, change_volume))
    # ``pre_reduced_values``, ``changed`` and ``num_changes`` from the last
    # step are exactly the final state, so they are reused as-is.
    t1 = time.time()
    print("Done with link fixing in %.3fs, with %d changes." %
          (t1 - t0, num_changes))

    # show some remappings:
    np.random.seed(1234)
    num_samples = 10
    remapped = np.where(
        np.logical_and(
            np.logical_and(changed, pre_reduced_values != -1),
            original_values != -1))[0]
    samples = _sample_without_replacement(remapped, num_samples)
    print("Sample fixes:")
    for index in samples:
        print("   %r (%d) -> %r (%d)" %
              (c.get_name(int(
                  original_values[index])), int(original_values[index]),
               c.get_name(int(pre_reduced_values[index])),
               int(pre_reduced_values[index])))
    print("")

    # Computed once; used both for the sample of deletions and for pruning.
    edges = OffsetArray(values, offsets).edges()
    samples = _sample_without_replacement(np.where(edges == 0)[0],
                                          num_samples)
    print("Sample deletions:")
    for index in samples:
        print("   %r" % (trie.restore_key(int(index))))

    # prune out anchors where there are no more linked items:
    print("Removing empty anchors from trie...")
    t0 = time.time()
    non_empty_offsets = np.where(edges != 0)[0]
    fixed_trie = filter_trie(trie, non_empty_offsets)

    contexts_found = true_exists(
        join(args.language_path, "trie_index2contexts_values.npy"))
    if contexts_found:
        contexts_values = np.load(
            join(args.language_path, "trie_index2contexts_values.npy"))
        contexts_offsets = np.load(
            join(args.language_path, "trie_index2contexts_offsets.npy"))
        contexts_counts = np.load(
            join(args.language_path, "trie_index2contexts_counts.npy"))

    # Each entry is an (offsets, values, counts)-shaped triple.  The second
    # triple carries the fixed and original values (the latter in the
    # "counts" slot) so both get remapped along with ``original_offsets``.
    to_port = [(offsets, values, counts),
               (original_offsets, pre_reduced_values, original_values)]
    if contexts_found:
        to_port.append((contexts_offsets, contexts_values, contexts_counts))

    ported = remap_trie_offset_array(trie, fixed_trie, to_port)
    offsets, values, counts = ported[0]
    original_offsets, pre_reduced_values, original_values = ported[1]
    t1 = time.time()
    print("Removed %d empty anchors from trie in %.3fs" % (
        len(trie) - len(fixed_trie),
        t1 - t0,
    ))

    print("Saving...")
    makedirs(args.new_language_path, exist_ok=True)

    np.save(join(args.new_language_path, "trie_index2indices_values.npy"),
            values)
    np.save(join(args.new_language_path, "trie_index2indices_offsets.npy"),
            offsets)
    np.save(join(args.new_language_path, "trie_index2indices_counts.npy"),
            counts)
    if contexts_found:
        contexts_offsets, contexts_values, contexts_counts = ported[2]
        np.save(join(args.new_language_path, "trie_index2contexts_values.npy"),
                contexts_values)
        np.save(
            join(args.new_language_path, "trie_index2contexts_offsets.npy"),
            contexts_offsets)
        np.save(join(args.new_language_path, "trie_index2contexts_counts.npy"),
                contexts_counts)
    new_trie_path = join(args.new_language_path, 'trie.marisa')
    fixed_trie.save(new_trie_path)

    # Two columns per entry: original value, then fixed value.
    transition = np.vstack([original_values, pre_reduced_values]).T
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_values.npy"), transition)
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_offsets.npy"), original_offsets)
    print("Done.")