def main():
    """Convert a sample of wiki articles and write them to ``args.out``.

    Loads the config, the type collection (with its blacklist), the trie
    index arrays found under ``config.language_path``, both marisa tries and
    the redirections, then runs ``convert`` on the articles yielded by
    ``iterate_articles(config.wiki)``.

    Output format, one token per line:
      * ``word<TAB>qid...<TAB>article_qid`` when the token has qids,
      * ``word`` alone otherwise,
      * an empty line after every paragraph.

    Articles for which ``convert`` returns ``False`` are skipped and do not
    count towards the sample. Iteration stops once ``config.sample_size``
    articles have been written.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)

    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    language_path = config.language_path
    trie_index2indices = OffsetArray.load(
        join(language_path, "trie_index2indices"), compress=True)
    # The counts share the offsets of the indices array.
    trie_index2indices_counts = OffsetArray(
        np.load(join(language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)

    # Transition data is optional: only load it when the values file exists.
    transition_values_path = join(
        language_path, "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(
                join(language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(join(language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    seen = 0
    # The context manager flushes and closes the file on every exit path
    # (normal completion, ``break`` or exception), so no explicit
    # flush()/close() is needed.
    with open(args.out, "wt") as fout:
        articles = tqdm(enumerate(iterate_articles(config.wiki)))
        for i, (article_name, article) in articles:
            # NOTE(review): article index 5409 is skipped unconditionally.
            # The reason is not visible here; presumably one specific dump
            # had a problematic article at that position. Confirm or remove.
            if i == 5409:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            if fixed_article is False:
                continue
            for paragraph in fixed_article:
                for word, qids in paragraph:
                    line = word.rstrip()
                    if len(qids) > 0:
                        # The article's own qid is appended after the
                        # token's qids.
                        line += "\t" + "\t".join(qids + [article_qid])
                    fout.write(line + "\n")
                # Blank line terminates the paragraph.
                fout.write("\n")
            seen += 1
            if seen >= config.sample_size:
                break
예제 #2
0
def main():
    """Evaluate disambiguation on a test/train split of parsed wiki tags.

    The first ``config.sample_size`` entries returned by
    ``fix_and_parse_tags`` form the test set and the remainder the training
    set. One oracle is loaded per entry in ``config.classification``. Each
    round runs ``disambiguate_batch``, prints its summary, appends the same
    summary to ``args.log`` when a log path is given, and lists the ambiguous
    tags when ``args.verbose`` is set. With ``args.interactive`` the round is
    repeated after ``enter_or_quit()`` returns; otherwise a single round runs.

    Raises:
        ValueError: if the config has no 'wiki' path.
    """
    args = parse_args()
    required_keys = [
        "wiki", "language_path", "wikidata", "redirections", "classification",
        "path"
    ]
    config = load_config(args.config,
                         required_keys,
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100,
                             "wiki": None,
                             "min_count": 0,
                             "min_percent": 0.0
                         },
                         relative_to=args.relative_to)
    if config.wiki is None:
        raise ValueError("must provide path to 'wiki' in config.")
    prefix = get_prefix(config)

    print("Load type_collection")
    collection = TypeCollection(config.wikidata,
                                num_names_to_load=config.num_names_to_load,
                                prefix=prefix,
                                verbose=True)

    all_tags = fix_and_parse_tags(config, collection, config.sample_size)
    sample_size = config.sample_size
    test_tags = all_tags[:sample_size]
    train_tags = all_tags[sample_size:]

    oracles = []
    for classification in config.classification:
        oracles.append(load_oracle_classification(classification))

    def get_name(idx):
        """Return a display name for the entity at index ``idx``.

        Indices below ``config.num_names_to_load`` are looked up in
        ``collection.known_names`` (falling back to the bare id); any other
        index is resolved with ``maybe_web_get_name``.
        """
        if idx >= config.num_names_to_load:
            return maybe_web_get_name(
                collection.ids[idx]) + " (%s)" % (collection.ids[idx], )
        if idx not in collection.known_names:
            return collection.ids[idx]
        return collection.known_names[idx] + " (%s)" % (
            collection.ids[idx], )

    while True:
        total_report, ambiguous_tags = disambiguate_batch(
            test_tags, train_tags, oracles)
        summarize_disambiguation(total_report)
        if args.log is not None:
            with open(args.log, "at") as log_file:
                summarize_disambiguation(total_report, file=log_file)
        if args.verbose:
            try:
                summarize_ambiguities(ambiguous_tags, oracles, get_name)
            except KeyboardInterrupt:
                # Ctrl-C only cuts the listing short; the run continues.
                pass
        if not args.interactive:
            break
        enter_or_quit()
예제 #3
0
def main():
    """Pick relations with the search method in ``args.method``; dump as JSON.

    Steps:
      1. Load the config and type collection (with blacklist), and parse
         ``config.sample_size`` worth of tags with ``fix_and_parse_tags``.
      2. Collect every id that appears in a tag whose ``tag[2]`` has more
         than one entry, and remap those tags onto compact positions in the
         sorted id list. Tags with a single entry are kept unchanged.
      3. Keep only the AUC entries above 0.5 and build the cached "satisfy"
         data for them.
      4. Run the projection selected by ``args.method`` ("greedy", "beam",
         "cem" or "ga") and write the resulting picks to ``args.out``.

    NOTE(review): ``tag[1]`` looks like the correct id and ``tag[2]`` the
    candidate ids — inferred from usage only, confirm against
    ``fix_and_parse_tags``.

    Raises:
        ValueError: if the config has no 'wiki' path, or ``args.method`` is
            not one of the supported methods.
    """
    args = parse_args()
    required_keys = [
        "wiki", "language_path", "wikidata", "redirections", "classification"
    ]
    config = load_config(args.config,
                         required_keys,
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100,
                             "wiki": None,
                             "fix_links": False,
                             "min_count": 0,
                             "min_percent": 0.0
                         },
                         relative_to=args.relative_to)
    if config.wiki is None:
        raise ValueError("must provide path to 'wiki' in config.")
    prefix = get_prefix(config)
    collection = TypeCollection(config.wikidata,
                                num_names_to_load=config.num_names_to_load,
                                prefix=prefix,
                                verbose=True)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    test_tags = fix_and_parse_tags(config, collection, config.sample_size)
    aucs = load_aucs()

    # Gather the ids of every tag that has more than one entry in tag[2].
    ambiguous_ids = set()
    for doc_tags in test_tags:
        for _, tag in doc_tags:
            if tag is not None and len(tag[2]) > 1:
                ambiguous_ids.update(tag[2])
    ids = sorted(ambiguous_ids)
    id2pos = {idx: pos for pos, idx in enumerate(ids)}

    # Use a reduced identity system: multi-entry tags are re-expressed as
    # positions into `ids`; single-entry tags pass through untouched.
    remapped_tags = []
    for doc_tags in test_tags:
        for _, tag in doc_tags:
            if tag is None:
                continue
            candidates = tag[2]
            if len(candidates) > 1:
                target = id2pos[tag[1]]
                candidates = np.array([id2pos[idx] for idx in candidates])
            else:
                target = tag[1]
            remapped_tags.append((target, candidates, tag[3]))
    # Rebind so the un-remapped tags are no longer referenced from here.
    test_tags = remapped_tags

    aucs = {key: value for key, value in aucs.items() if value > 0.5}
    print("%d relations to pick from with %d ids." % (len(aucs), len(ids)),
          flush=True)
    cached_satisfy = get_cached_satisfy(collection,
                                        aucs,
                                        ids,
                                        mmap=args.method == "greedy")
    del collection
    key2row = {key: row for row, key in enumerate(sorted(aucs))}

    # Choose the projection routine and its method-specific keywords. The
    # attributes on `args` are only read in the branch that needs them.
    method = args.method
    if method == "greedy":
        # Greedy search is a beam search of width one.
        project, extra = beam_project, {"beam_width": 1}
    elif method == "beam":
        project, extra = beam_project, {"beam_width": args.beam_width}
    elif method == "cem":
        project, extra = cem_project, {"n_samples": args.samples}
    elif method == "ga":
        project = ga_project
        extra = {"ngen": args.ngen, "n_samples": args.samples}
    else:
        raise ValueError("unknown method %r." % (args.method, ))

    picks, _ = project(cached_satisfy,
                       key2row,
                       remapped_tags,
                       aucs,
                       ids,
                       penalty=args.penalty,
                       log=args.log,
                       **extra)
    with open(args.out, "wt") as fout:
        json.dump(picks, fout)