Example #1
def build_fixed_point(out, prefix):
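    # For every "<prefix>/Category:..." key in the wikititle -> wikidata trie,
    # look up a matching article page and record its wikidata index, then save
    # the mapping alongside the other precomputed wikidata properties.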
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix, ))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix, ))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}

        category_prefix = "%s/Category:" % (prefix, )
        article_prefix = "%s/" % (prefix, )
        relevant_items = trie.iteritems(category_prefix)

        for name, category_idx in relevant_items:
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(
                    article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation),
            prefix,
        ))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix, )),
            fixed_point_relation, num_items)
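The trie access pattern used above can be shown in isolation; the keys and indices below are toy data for illustration only:

# Standalone sketch (toy data, not part of the pipeline): iteritems() enumerates
# every key under a prefix, and get() returns a list of value tuples, which is
# why the loop above reads matches[0][0].
import marisa_trie

toy = marisa_trie.RecordTrie("i", [
    ("enwiki/Category:Physicists", (0, )),
    ("enwiki/Physicist", (1, )),
])
for name, value in toy.iteritems("enwiki/Category:"):
    print(name, value)                  # e.g. enwiki/Category:Physicists (0,)
print(toy.get("enwiki/Physicist"))      # e.g. [(1,)]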
Example #2
def get_wikidata_mapping(name2id_path,
                         wikidata_ids_path,
                         jsons,
                         relation_names,
                         verbose=False):
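    # Stream wikidata JSON documents, writing out: a "<wiki>/<title>\t<row index>"
    # TSV of sitelinks, the wikidata ids (one per line), and, for each requested
    # relation, one tab-separated line per document with its related entities or
    # parsed times.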
    # rough number of documents in the dump; used only for progress reporting/ETA
    approx_max_quantity = 24642416
    if verbose:
        pbar = None
        from IPython.display import clear_output
    else:
        pbar = get_progress_bar("collect wikilinks",
                                max_value=approx_max_quantity)
        pbar.start()
        clear_output = None
    wikidata_ids = []
    entity_types = []
    subclass = []
    seen = 0

    relations = {
        name: (open(outfile, "wt"), is_temporal)
        for name, outfile, is_temporal in relation_names
    }
    fout_name2id = None if true_exists(name2id_path) else open(
        name2id_path, "wt")
    fout_wikidata_ids = None if true_exists(wikidata_ids_path) else open(
        wikidata_ids_path, "wt")
    try:
        t_then = time.time()
        seen_last = 0
        speed = None
        index = 0
        for doc in jsons:
            seen += 1
            if seen % 2000 == 0:
                if verbose:
                    t_now = time.time()
                    new_speed = (seen - seen_last) / (t_now - t_then)
                    if speed is None:
                        speed = new_speed
                    else:
                        speed = 0.9 * speed + 0.1 * new_speed
                    clear_output(wait=True)
                    print("%.3f%% done (%d seen, %.3f docs/s, ETA: %ds)" %
                          (100.0 * seen / approx_max_quantity, seen, speed,
                           int((approx_max_quantity - seen) / speed)),
                          flush=True)
                    seen_last = seen
                    t_then = t_now
                else:
                    if seen < approx_max_quantity:
                        pbar.update(seen)
            if fout_name2id is not None:
                if "sitelinks" in doc:
                    for key, value in doc["sitelinks"].items():
                        if key.endswith("wiki"):
                            fout_name2id.write(key + "/" + value["title"] +
                                               "\t" + str(index) + "\n")
            index += 1
            if fout_wikidata_ids is not None:
                fout_wikidata_ids.write(doc["id"] + "\n")
            for name, (outfile, is_temporal) in relations.items():
                if is_temporal:
                    outfile.write("\t".join(
                        get_claim_time(doc["claims"].get(name, []))) + "\n")
                else:
                    outfile.write("\t".join(
                        get_related_entities(doc["claims"].get(name, []))) +
                                  "\n")
        if pbar is not None:
            pbar.finish()
    finally:
        for name, (outfile, _) in relations.items():
            outfile.close()
        if fout_name2id is not None:
            fout_name2id.close()
        if fout_wikidata_ids is not None:
            fout_wikidata_ids.close()
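Each line of the name2id TSV written above has the form "<sitelink>/<title>\t<row index>", so reading it back only requires splitting on the last tab; the helper name and sample line below are illustrative, not part of the module:

def read_name2id_line(line):
    # "enwiki/Douglas Adams\t42" -> ("enwiki/Douglas Adams", 42)
    name, index = line.rstrip("\n").rsplit("\t", 1)
    return name, int(index)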
Example #3
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)

    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata",
             'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key
        for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name, join(args.wikidata, "wikidata_%s.txt" % (name, )), name
         in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]

    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]

    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)

    work_to_be_done = (not true_exists(wikidata_ids_path)
                       or not true_exists(wikititle2wikidata_path) or
                       len(missing_wikidata_important_properties_fnames) > 0)

    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path, wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)

    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name, )))
    ]

    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" %
                                        (relname, ))(lines)
            if is_temporal:
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname, )), value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname, )), relation)
        del id2index

    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path,
                                  sep="\t",
                                  header=None,  # the TSV is written without a header row
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value, )) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)

    build_fixed_point(args.wikidata, "enwiki")
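The "*_values.sparse.npy" files produced by the temporal branch above store the total item count in slot 0 followed by flat (item index, parsed year) pairs; a decoder written under that assumption could look like the sketch below (treating 0 as "no year" is this sketch's own convention):

import numpy as np

def load_sparse_years(path):
    value = np.load(path)
    num_items = int(value[0])            # value[0] = total number of items
    pairs = value[1:].reshape(-1, 2)     # columns: item index, parsed year
    years = np.zeros(num_items, dtype=np.int32)
    years[pairs[:, 0]] = pairs[:, 1]
    return years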
Example #4
def values_exist(path):
    return (true_exists(path + "_values.npy")
            or true_exists(path + "_values.sparse.npy"))
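main() calls this on paths such as join(args.wikidata, "wikidata_P31") so that relations already converted to numpy arrays are skipped; a hypothetical call:

if not values_exist("wikidata/wikidata_P31"):  # illustrative path
    print("wikidata_P31 still needs to be converted to numpy arrays")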
Example #5
def main():
    args = parse_args()
    if args.new_language_path == args.language_path:
        raise ValueError("new_language_path and language_path must be "
                         "different: cannot generate a fixed trie in "
                         "the same directory as the original trie.")

    c = TypeCollection(args.wikidata, num_names_to_load=0)
    c.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    original_values = np.load(
        join(args.language_path, "trie_index2indices_values.npy"))
    original_offsets = np.load(
        join(args.language_path, "trie_index2indices_offsets.npy"))
    original_counts = np.load(
        join(args.language_path, "trie_index2indices_counts.npy"))
    original_trie_path = join(args.language_path, 'trie.marisa')
    trie = marisa_trie.Trie().load(original_trie_path)
    initialize_globals(c)
    t0 = time.time()

    old_location_shift = None
    values, offsets, counts = original_values, original_offsets, original_counts
    for step in range(args.steps):
        anchor_length = get_trie_properties(trie, offsets, values)
        (offsets, values,
         counts), location_shift = fix(collection=c,
                                       offsets=offsets,
                                       values=values,
                                       counts=counts,
                                       anchor_length=anchor_length,
                                       num_category_link=8)
        if old_location_shift is not None:
            # see where newly shifted values are now pointing
            # to (extra indirection level):
            location_shift = location_shift[old_location_shift]
            location_shift[old_location_shift == -1] = -1
        old_location_shift = location_shift
        pre_reduced_values = values[location_shift]
        pre_reduced_values[location_shift == -1] = -1
        num_changes = int((pre_reduced_values != original_values).sum())
        change_volume = int(
            (original_counts[pre_reduced_values != original_values].sum()))
        print("step %d with %d changes, %d total links" %
              (step, num_changes, change_volume))
    pre_reduced_values = values[location_shift]
    pre_reduced_values[location_shift == -1] = -1
    t1 = time.time()
    num_changes = int((pre_reduced_values != original_values).sum())
    print("Done with link fixing in %.3fs, with %d changes." %
          (t1 - t0, num_changes))

    # show some remappings:
    np.random.seed(1234)
    num_samples = 10
    samples = np.random.choice(np.where(
        np.logical_and(
            np.logical_and(pre_reduced_values != original_values,
                           pre_reduced_values != -1),
            original_values != -1))[0],
                               size=num_samples,
                               replace=False)
    print("Sample fixes:")
    for index in samples:
        print("   %r (%d) -> %r (%d)" %
              (c.get_name(int(
                  original_values[index])), int(original_values[index]),
               c.get_name(int(pre_reduced_values[index])),
               int(pre_reduced_values[index])))
    print("")

    samples = np.random.choice(
        np.where(OffsetArray(values, offsets).edges() == 0)[0],
        size=num_samples,
        replace=False)
    print("Sample deletions:")
    for index in samples:
        print("   %r" % (trie.restore_key(int(index))))

    # prune out anchors where there are no more linked items:
    print("Removing empty anchors from trie...")
    t0 = time.time()
    non_empty_offsets = np.where(OffsetArray(values, offsets).edges() != 0)[0]
    fixed_trie = filter_trie(trie, non_empty_offsets)

    contexts_found = true_exists(
        join(args.language_path, "trie_index2contexts_values.npy"))
    if contexts_found:
        contexts_values = np.load(
            join(args.language_path, "trie_index2contexts_values.npy"))
        contexts_offsets = np.load(
            join(args.language_path, "trie_index2contexts_offsets.npy"))
        contexts_counts = np.load(
            join(args.language_path, "trie_index2contexts_counts.npy"))

    to_port = [(offsets, values, counts),
               (original_offsets, pre_reduced_values, original_values)]
    if contexts_found:
        to_port.append((contexts_offsets, contexts_values, contexts_counts))

    ported = remap_trie_offset_array(trie, fixed_trie, to_port)
    offsets, values, counts = ported[0]
    original_offsets, pre_reduced_values, original_values = ported[1]
    t1 = time.time()
    print("Removed %d empty anchors from trie in %.3fs" % (
        len(trie) - len(fixed_trie),
        t1 - t0,
    ))

    print("Saving...")
    makedirs(args.new_language_path, exist_ok=True)

    np.save(join(args.new_language_path, "trie_index2indices_values.npy"),
            values)
    np.save(join(args.new_language_path, "trie_index2indices_offsets.npy"),
            offsets)
    np.save(join(args.new_language_path, "trie_index2indices_counts.npy"),
            counts)
    if contexts_found:
        contexts_offsets, contexts_values, contexts_counts = ported[2]
        np.save(join(args.new_language_path, "trie_index2contexts_values.npy"),
                contexts_values)
        np.save(
            join(args.new_language_path, "trie_index2contexts_offsets.npy"),
            contexts_offsets)
        np.save(join(args.new_language_path, "trie_index2contexts_counts.npy"),
                contexts_counts)
    new_trie_path = join(args.new_language_path, 'trie.marisa')
    fixed_trie.save(new_trie_path)

    transition = np.vstack([original_values, pre_reduced_values]).T
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_values.npy"), transition)
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_offsets.npy"), original_offsets)
    print("Done.")