def build_fixed_point(out, prefix):
    """Build the "fixed point" relation for `prefix`: for every
    "<prefix>/Category:X" title in the trie, store the wikidata index of a
    matching article title (e.g. "<prefix>/X"), when one exists."""
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix, ))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix, ))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}
        category_prefix = "%s/Category:" % (prefix, )
        article_prefix = "%s/" % (prefix, )
        wikititle2wikidata_path = join(out, WIKITILE_2_WIKIDATA_TSV_NAME)
        relevant_items = trie.iteritems(category_prefix)
        for name, category_idx in relevant_items:
            # turn "enwiki/Category:X" into the candidate article "enwiki/X":
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(
                    article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation), prefix, ))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix, )),
            fixed_point_relation,
            num_items)
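# `fixed_point_name_alternates` is defined elsewhere in this package. The
# sketch below only illustrates the kind of candidate titles the loop above
# expects it to yield (the article title itself plus simple variants such as a
# de-pluralized form); the name and the exact variants are assumptions, not
# the package's actual logic:
def _fixed_point_name_alternates_sketch(article_name):
    alternates = [article_name]
    # e.g. "enwiki/Novels" -> "enwiki/Novel" for plural category names
    if article_name.endswith("s"):
        alternates.append(article_name[:-1])
    return alternates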
def get_wikidata_mapping(name2id_path, wikidata_ids_path, jsons,
                         relation_names, verbose=False):
    """Stream Wikidata JSON documents and write, in document order:
    a "wiki-title -> document index" TSV (`name2id_path`), the list of
    wikidata ids (`wikidata_ids_path`), and one TSV file per relation in
    `relation_names`. Outputs that already exist are left untouched."""
    approx_max_quantity = 24642416
    if verbose:
        pbar = None
        from IPython.display import clear_output
    else:
        pbar = get_progress_bar("collect wikilinks",
                                max_value=approx_max_quantity)
        pbar.start()
        clear_output = None
    wikidata_ids = []
    entity_types = []
    subclass = []
    seen = 0
    relations = {
        name: (open(outfile, "wt"), is_temporal)
        for name, outfile, is_temporal in relation_names
    }
    fout_name2id = None if true_exists(name2id_path) else open(
        name2id_path, "wt")
    fout_wikidata_ids = None if true_exists(wikidata_ids_path) else open(
        wikidata_ids_path, "wt")
    try:
        t_then = time.time()
        seen_last = 0
        speed = None
        index = 0
        for doc in jsons:
            seen += 1
            if seen % 2000 == 0:
                if verbose:
                    t_now = time.time()
                    new_speed = (seen - seen_last) / (t_now - t_then)
                    if speed is None:
                        speed = new_speed
                    else:
                        speed = 0.9 * speed + 0.1 * new_speed
                    clear_output(wait=True)
                    print("%.3f%% done (%d seen, %.3f docs/s, ETA: %ds)" % (
                        100.0 * seen / approx_max_quantity,
                        seen,
                        speed,
                        int((approx_max_quantity - seen) / speed)),
                        flush=True)
                    seen_last = seen
                    t_then = t_now
                else:
                    if seen < approx_max_quantity:
                        pbar.update(seen)
            if fout_name2id is not None:
                if "sitelinks" in doc:
                    for key, value in doc["sitelinks"].items():
                        if key.endswith("wiki"):
                            fout_name2id.write(
                                key + "/" + value["title"] + "\t" +
                                str(index) + "\n")
            index += 1
            if fout_wikidata_ids is not None:
                fout_wikidata_ids.write(doc["id"] + "\n")
            for name, (outfile, is_temporal) in relations.items():
                if is_temporal:
                    outfile.write("\t".join(
                        get_claim_time(doc["claims"].get(name, []))) + "\n")
                else:
                    outfile.write("\t".join(
                        get_related_entities(doc["claims"].get(name, []))) + "\n")
        if pbar is not None:
            pbar.finish()
    finally:
        for name, (outfile, _) in relations.items():
            outfile.close()
        if fout_name2id is not None:
            fout_name2id.close()
        if fout_wikidata_ids is not None:
            fout_wikidata_ids.close()
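# `get_claim_time` and `get_related_entities` are used above but defined
# elsewhere in the package. The sketches below only illustrate what those
# calls are assumed to return, based on the standard Wikidata JSON dump claim
# layout; the names and exact handling here are illustrative, not the
# package's actual implementation:
def _get_related_entities_sketch(claims):
    """Yield the Q-ids referenced by the claims of a single property."""
    for claim in claims:
        datavalue = claim.get("mainsnak", {}).get("datavalue", {})
        if datavalue.get("type") == "wikibase-entityid":
            yield datavalue["value"]["id"]


def _get_claim_time_sketch(claims):
    """Yield raw time strings (e.g. '+1994-01-26T00:00:00Z') found in the
    claims of a single property."""
    for claim in claims:
        datavalue = claim.get("mainsnak", {}).get("datavalue", {})
        if datavalue.get("type") == "time":
            yield datavalue["value"]["time"]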
def main():
    args = parse_args()
    makedirs(args.wikidata, exist_ok=True)
    wikidata_names2prop_names = property_names(
        join(PROJECT_DIR, "data", "wikidata", 'wikidata_property_names.json'))
    wikidata_names2temporal_prop_names = temporal_property_names(
        join(PROJECT_DIR, "data", "wikidata",
             'wikidata_time_property_names.json'))
    # fields to make easily accessible:
    important_properties = [
        wikidata_properties.INSTANCE_OF,
        wikidata_properties.SUBCLASS_OF,
        wikidata_properties.PART_OF,
        wikidata_properties.OCCUPATION,
        wikidata_properties.FIELD_OF_WORK,
        wikidata_properties.FIELD_OF_THIS_OCCUPATION,
        wikidata_properties.MEDICAL_SPECIALITY,
        wikidata_properties.GENRE,
        wikidata_properties.SEX_OR_GENDER,
        wikidata_properties.COUNTRY_OF_CITIZENSHIP,
        wikidata_properties.COUNTRY,
        wikidata_properties.CONTINENT,
        wikidata_properties.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY,
        wikidata_properties.SPORT,
        wikidata_properties.STUDIES,
        wikidata_properties.SERIES,
        wikidata_properties.USE,
        wikidata_properties.LOCATION,
        wikidata_properties.FACE_OF,
        wikidata_properties.IS_A_LIST_OF,
        wikidata_properties.COUNTRY_OF_ORIGIN,
        wikidata_properties.PRODUCT_OR_MATERIAL_PRODUCED,
        wikidata_properties.INDUSTRY,
        wikidata_properties.PARENT_TAXON,
        wikidata_properties.APPLIES_TO_TERRITORIAL_JURISDICTION,
        wikidata_properties.POSITION_HELD,
        wikidata_properties.CATEGORYS_MAIN_TOPIC,
        # temporal properties
        wikidata_properties.PUBLICATION_DATE,
        wikidata_properties.DATE_OF_BIRTH,
        wikidata_properties.DATE_OF_DEATH,
        wikidata_properties.INCEPTION,
        wikidata_properties.DISSOLVED_OR_ABOLISHED,
        wikidata_properties.POINT_IN_TIME,
        wikidata_properties.START_TIME,
        wikidata_properties.END_TIME
    ]
    prop_names2wikidata_names = {
        value: key for key, value in wikidata_names2prop_names.items()
    }
    wikidata_important_properties = [
        prop_names2wikidata_names[prop] for prop in important_properties
    ]
    wikidata_important_properties_fnames = [
        (name,
         join(args.wikidata, "wikidata_%s.txt" % (name, )),
         name in wikidata_names2temporal_prop_names)
        for name in wikidata_important_properties
    ]
    missing_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not true_exists(outfile)
    ]
    wikidata_ids_path = join(args.wikidata, WIKIDATA_IDS_NAME)
    wikititle2wikidata_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TSV_NAME)
    work_to_be_done = (not true_exists(wikidata_ids_path) or
                       not true_exists(wikititle2wikidata_path) or
                       len(missing_wikidata_important_properties_fnames) > 0)
    if work_to_be_done:
        get_wikidata_mapping(
            wikititle2wikidata_path,
            wikidata_ids_path,
            open_wikidata_file(args.wikidata_dump, args.batch_size),
            missing_wikidata_important_properties_fnames)
    numpy_wikidata_important_properties_fnames = [
        (name, outfile, is_temporal)
        for name, outfile, is_temporal in wikidata_important_properties_fnames
        if not values_exist(join(args.wikidata, "wikidata_%s" % (name, )))
    ]
    # obtain a mapping from id -> number
    if len(numpy_wikidata_important_properties_fnames) > 0:
        _, id2index = load_wikidata_ids(args.wikidata)
        # make relations numerical:
        for relname, outfile, is_temporal in numpy_wikidata_important_properties_fnames:
            with open(outfile, "rt") as fin:
                lines = fin.read().splitlines()
            fin_pbar = get_progress_bar("loading relation %r" % (relname, ))(lines)
            if is_temporal:
                value = np.zeros(len(lines) * 2 + 1, dtype=np.int32)
                position = 1
                seen = 0
                for idx, line in enumerate(fin_pbar):
                    for wikidata_id in line.split('\t'):
                        if len(wikidata_id) > 0:
                            value[position] = idx
                            value[position + 1] = parse_year(wikidata_id)
                            position += 2
                            seen += 1
                            break
                value[0] = len(lines)
                value = value[:position]
                np.save(
                    join(args.wikidata,
                         "wikidata_%s_values.sparse.npy" % (relname, )),
                    value)
            else:
                relation = [line2indices(id2index, line) for line in fin_pbar]
                save_record_with_offset(
                    join(args.wikidata, "wikidata_%s" % (relname, )),
                    relation)
        del id2index
    # convert the mapping from wikinames to integer values:
    trie_save_path = join(args.wikidata, WIKITILE_2_WIKIDATA_TRIE_NAME)
    if not true_exists(trie_save_path):
        print("loading wikipedia name -> wikidata")
        name2id = pandas.read_csv(wikititle2wikidata_path, sep="\t",
                                  encoding='utf-8')
        print("loaded")
        trie = marisa_trie.RecordTrie(
            'i',
            get_progress_bar("convert to trie", max_value=name2id.shape[0])(
                (key, (value, )) for _, key, value in name2id.itertuples()))
        trie.save(trie_save_path)
    build_fixed_point(args.wikidata, "enwiki")
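# `parse_year` and `line2indices` above are package helpers whose definitions
# are not shown here. Illustrative sketches of the behavior the conversion
# loop relies on (assumptions; the real implementations may differ):
def _parse_year_sketch(wikidata_time):
    """Extract the year from a Wikidata time string such as
    '+1994-01-26T00:00:00Z' (a leading '-' denotes BCE)."""
    sign = -1 if wikidata_time.startswith("-") else 1
    return sign * int(wikidata_time.lstrip("+-").split("-")[0])


def _line2indices_sketch(id2index, line):
    """Map a tab-separated list of wikidata ids to integer indices using the
    id -> index mapping, skipping ids that are missing from it."""
    return [id2index[wikidata_id]
            for wikidata_id in line.split("\t")
            if len(wikidata_id) > 0 and wikidata_id in id2index]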
def values_exist(path):
    return (true_exists(path + "_values.npy") or
            true_exists(path + "_values.sparse.npy"))
def main():
    args = parse_args()
    if args.new_language_path == args.language_path:
        raise ValueError("new_language_path and language_path must be "
                         "different: cannot generate a fixed trie in "
                         "the same directory as the original trie.")
    c = TypeCollection(args.wikidata, num_names_to_load=0)
    c.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    original_values = np.load(
        join(args.language_path, "trie_index2indices_values.npy"))
    original_offsets = np.load(
        join(args.language_path, "trie_index2indices_offsets.npy"))
    original_counts = np.load(
        join(args.language_path, "trie_index2indices_counts.npy"))
    original_trie_path = join(args.language_path, 'trie.marisa')
    trie = marisa_trie.Trie().load(original_trie_path)
    initialize_globals(c)
    t0 = time.time()
    old_location_shift = None
    values, offsets, counts = original_values, original_offsets, original_counts
    for step in range(args.steps):
        anchor_length = get_trie_properties(trie, offsets, values)
        (offsets, values, counts), location_shift = fix(
            collection=c,
            offsets=offsets,
            values=values,
            counts=counts,
            anchor_length=anchor_length,
            num_category_link=8)
        if old_location_shift is not None:
            # see where newly shifted values are now pointing
            # to (extra indirection level):
            location_shift = location_shift[old_location_shift]
            location_shift[old_location_shift == -1] = -1
        old_location_shift = location_shift
        pre_reduced_values = values[location_shift]
        pre_reduced_values[location_shift == -1] = -1
        num_changes = int((pre_reduced_values != original_values).sum())
        change_volume = int(
            (original_counts[pre_reduced_values != original_values].sum()))
        print("step %d with %d changes, %d total links" % (
            step, num_changes, change_volume))
    pre_reduced_values = values[location_shift]
    pre_reduced_values[location_shift == -1] = -1
    t1 = time.time()
    num_changes = int((pre_reduced_values != original_values).sum())
    print("Done with link fixing in %.3fs, with %d changes." % (
        t1 - t0, num_changes))

    # show some remappings:
    np.random.seed(1234)
    num_samples = 10
    samples = np.random.choice(
        np.where(
            np.logical_and(
                np.logical_and(pre_reduced_values != original_values,
                               pre_reduced_values != -1),
                original_values != -1))[0],
        size=num_samples,
        replace=False)
    print("Sample fixes:")
    for index in samples:
        print("    %r (%d) -> %r (%d)" % (
            c.get_name(int(original_values[index])),
            int(original_values[index]),
            c.get_name(int(pre_reduced_values[index])),
            int(pre_reduced_values[index])))
    print("")
    samples = np.random.choice(
        np.where(OffsetArray(values, offsets).edges() == 0)[0],
        size=num_samples,
        replace=False)
    print("Sample deletions:")
    for index in samples:
        print("    %r" % (trie.restore_key(int(index))))

    # prune out anchors where there are no more linked items:
    print("Removing empty anchors from trie...")
    t0 = time.time()
    non_empty_offsets = np.where(OffsetArray(values, offsets).edges() != 0)[0]
    fixed_trie = filter_trie(trie, non_empty_offsets)

    contexts_found = true_exists(
        join(args.language_path, "trie_index2contexts_values.npy"))
    if contexts_found:
        contexts_values = np.load(
            join(args.language_path, "trie_index2contexts_values.npy"))
        contexts_offsets = np.load(
            join(args.language_path, "trie_index2contexts_offsets.npy"))
        contexts_counts = np.load(
            join(args.language_path, "trie_index2contexts_counts.npy"))

    to_port = [
        (offsets, values, counts),
        (original_offsets, pre_reduced_values, original_values)
    ]
    if contexts_found:
        to_port.append((contexts_offsets, contexts_values, contexts_counts))

    ported = remap_trie_offset_array(trie, fixed_trie, to_port)
    offsets, values, counts = ported[0]
    original_offsets, pre_reduced_values, original_values = ported[1]
    t1 = time.time()
    print("Removed %d empty anchors from trie in %.3fs" % (
        len(trie) - len(fixed_trie), t1 - t0, ))

    print("Saving...")
    makedirs(args.new_language_path, exist_ok=True)
    np.save(join(args.new_language_path, "trie_index2indices_values.npy"),
            values)
    np.save(join(args.new_language_path, "trie_index2indices_offsets.npy"),
            offsets)
    np.save(join(args.new_language_path, "trie_index2indices_counts.npy"),
            counts)
    if contexts_found:
        contexts_offsets, contexts_values, contexts_counts = ported[2]
        np.save(join(args.new_language_path, "trie_index2contexts_values.npy"),
                contexts_values)
        np.save(join(args.new_language_path, "trie_index2contexts_offsets.npy"),
                contexts_offsets)
        np.save(join(args.new_language_path, "trie_index2contexts_counts.npy"),
                contexts_counts)
    new_trie_path = join(args.new_language_path, 'trie.marisa')
    fixed_trie.save(new_trie_path)

    transition = np.vstack([original_values, pre_reduced_values]).T
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_values.npy"),
        transition)
    np.save(
        join(args.new_language_path,
             "trie_index2indices_transition_offsets.npy"),
        original_offsets)
    print("Done.")
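# `OffsetArray` is imported from the package's utilities. The sketch below
# shows the flat layout this script appears to assume: `values` concatenates
# the candidate entity ids of every anchor, and `offsets[i]` marks the end of
# anchor i's slice, so `edges()[i]` counts how many candidates anchor i still
# has (zero means the anchor can be pruned). The layout and method names here
# are an assumption for illustration only:
class _OffsetArraySketch(object):
    def __init__(self, values, offsets):
        self.values = values
        self.offsets = offsets

    def __getitem__(self, index):
        start = 0 if index == 0 else self.offsets[index - 1]
        return self.values[start:self.offsets[index]]

    def edges(self):
        # number of values stored under each trie index
        return np.diff(self.offsets, prepend=0)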