def _collect_mapped_to(self):
    result = set()
    io_time = 0
    working_time = 0
    log_step = max(1, int(len(self._tasks) / 20.0))
    for index, task in enumerate(self._tasks):
        if index % log_step == 0:
            logger.info(
                " {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
                    index, len(self._tasks), io_time, working_time))
        io_start = time.time()
        entity = read_json(task.in_path)
        io_time += time.time() - io_start
        work_start = time.time()
        for mapped_to in self._iter_mapped_to_for_entity(entity):
            result.add(mapped_to)
        working_time += time.time() - work_start
    logger.info(" {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
        len(self._tasks), len(self._tasks), io_time, working_time))
    return result
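# The io/working progress pattern above repeats in several methods below.
# A minimal sketch of how it could be factored out; `TaskProgress` is a
# hypothetical helper, not part of the original code base.

import logging
import time


class TaskProgress:
    """Track io/working time and log progress every `step` tasks."""

    def __init__(self, total: int, progress_logger: logging.Logger):
        self.total = total
        self.step = max(1, int(total / 20.0))
        self.io_time = 0.0
        self.working_time = 0.0
        self.logger = progress_logger

    def log(self, index: int):
        # Same format as the inline logging above.
        if index % self.step == 0:
            self.logger.info(
                " {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
                    index, self.total, self.io_time, self.working_time))

    def timed_io(self, action, *args):
        # Run `action` and account its duration as io time.
        start = time.time()
        result = action(*args)
        self.io_time += time.time() - start
        return result

    def timed_work(self, action, *args):
        # Run `action` and account its duration as working time.
        start = time.time()
        result = action(*args)
        self.working_time += time.time() - start
        return result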
def __call__(self, chunk: TransformationChunk):
    self._tasks = chunk.tasks
    self._context = chunk.context
    #
    io_time = 0
    working_time = 0
    log_step = max(1, int(len(self._tasks) / 20.0))
    for index, task in enumerate(self._tasks):
        if index % log_step == 0:
            logger.info(
                " {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
                    index, len(self._tasks), io_time, working_time))
        io_start = time.time()
        entity = read_json(task.in_path)
        io_time += time.time() - io_start
        work_start = time.time()
        self._prune_hierarchy(entity)
        working_time += time.time() - work_start
        io_start = time.time()
        write_json(task.out_path, entity)
        io_time += time.time() - io_start
    logger.info(" {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
        len(self._tasks), len(self._tasks), io_time, working_time))
def _write_terms_mapping(self, terms_to_entities: WikidataEntityMap):
    log_progress_step = max(1, int(len(self._tasks) / 20.0))
    io_time = 0
    working_time = 0
    for index, task in enumerate(self._tasks):
        if index % log_progress_step == 0:
            logger.info(
                " {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
                    index, len(self._tasks), io_time, working_time))
        io_time_start = time.time()
        entity = read_json(task.in_path)
        io_time += time.time() - io_time_start
        working_time_start = time.time()
        for selector in self._context.mapping_selector(entity):
            self._add_mappings_for_selector(selector, terms_to_entities)
        working_time += time.time() - working_time_start
        io_time_start = time.time()
        write_json(task.out_path, entity)
        io_time += time.time() - io_time_start
    logger.info(" {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
        len(self._tasks), len(self._tasks), io_time, working_time))
def _map_terms_to_wikidata(self, terms) -> WikidataEntityMap:
    # First we collect a mapping from terms to entity IDs and
    # save all found entities.
    # Then we replace the entity IDs with the entities at the end.
    terms_to_entities = collections.defaultdict(list)
    entities = {}
    for index, (entity_id, entity) in enumerate(self._iterate_labels()):
        search_label = self._get_search_label(entity)
        search_alias = self._get_search_alias(entity)
        search_alias_terms = [
            term
            for alias_terms in search_alias
            for term in alias_terms
        ]
        entity_has_mapped = False
        for search_term in (set(search_label) | set(search_alias_terms)):
            if search_term not in terms:
                continue
            # Given term is in the Wikidata entity text.
            if entity_id not in terms_to_entities[search_term]:
                terms_to_entities[search_term].append(entity_id)
                entity_has_mapped = True
        if entity_has_mapped:
            entities[entity_id] = \
                WikidataEntity(entity_id, search_label, search_alias)
        if index % 100000 == 0:
            logger.info(" {:>7}".format(index))
    return {
        key: [entities[entity_id] for entity_id in values]
        for key, values in terms_to_entities.items()
    }
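# A self-contained sketch of the inverted-index technique used in
# _map_terms_to_wikidata: terms are mapped to entity ids first, and the ids
# are swapped for entity objects only once at the end, so each entity is
# stored a single time. All names and data below are illustrative.

import collections


def build_term_index(terms, labelled_entities):
    term_to_ids = collections.defaultdict(list)
    entities = {}
    for entity_id, entity_terms in labelled_entities:
        for term in set(entity_terms):
            if term in terms and entity_id not in term_to_ids[term]:
                term_to_ids[term].append(entity_id)
                entities[entity_id] = entity_terms
    # Replace ids with the stored entities in one final pass.
    return {
        term: [entities[entity_id] for entity_id in ids]
        for term, ids in term_to_ids.items()
    }


# Example: prints {'cat': [['cat', 'feline']]}
print(build_term_index({"cat"}, [("Q1", ["cat", "feline"]), ("Q2", ["dog"])]))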
def _load_index(self):
    index_path = os.path.join(self.directory, "index.json")
    if not os.path.exists(index_path):
        logger.info("No index file found '%s'", index_path)
        # Keep self.index consistent with the loaded case.
        self.index = {}
        return self.index
    with open(index_path, encoding="utf-8") as in_stream:
        self.index = json.load(in_stream)
    logger.info(
        "Index of size %i loaded from '%s'", len(self.index), index_path)
    return self.index
def __call__(self, chunk: TransformationChunk):
    self._tasks = chunk.tasks
    self._context = chunk.context
    #
    logger.info("Loading terms for %i entities", len(chunk.tasks))
    terms = self._collect_terms()
    logger.info("Terms count %i", len(terms))
    logger.info("Mapping to labels ...")
    terms_to_entities = self._map_terms_to_wikidata(terms)
    logger.info("Saving mappings ...")
    self._write_terms_mapping(terms_to_entities)
def _save_hierarchy(self, hierarchy):
    logger.info("Saving hierarchy ...")
    io_time = 0
    working_time = 0
    log_step = max(1, int(len(self._tasks) / 20.0))
    for index, task in enumerate(self._tasks):
        if index % log_step == 0:
            logger.info(
                " {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
                    index, len(self._tasks), io_time, working_time))
        io_start = time.time()
        entity = read_json(task.in_path)
        io_time += time.time() - io_start
        add_time = time.time()
        self._add_hierarchy(entity, hierarchy)
        working_time += time.time() - add_time
        io_start = time.time()
        write_json(task.out_path, entity)
        io_time += time.time() - io_start
    logger.info(" {:>5} / {} io: {:0.0f}s working: {:0.0f}s".format(
        len(self._tasks), len(self._tasks), io_time, working_time))
def execute(self):
    start_time = time.time()
    input_directory = None
    output_directory = None
    skip_step = False
    for index, step in enumerate(self._steps):
        logger.info("Executing step %i / %i", index + 1, len(self._steps))
        if "directory" in step:
            input_directory = output_directory
            output_directory = os.path.join(self._root, step["directory"])
            skip_step = os.path.exists(output_directory)
            logger.info("Output directory changed to '%s'", output_directory)
            os.makedirs(output_directory, exist_ok=True)
        elif skip_step:
            logger.info("Step skipped as output directory already exists")
        elif "transformer" in step:
            transformer: AbstractTransformation = step["transformer"]
            logger.info("Running '%s'", type(transformer).__name__)
            transformer(input_directory, output_directory)
        else:
            raise Exception("Unknown step {}".format(index))
    logger.info("All done in: %s", time.time() - start_time)
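# A sketch of the step list that execute() iterates over; the directory names
# and transformer classes are illustrative, not taken from the pipeline:
#
#   steps = [
#       {"directory": "01-pruned"},
#       {"transformer": PruneHierarchy()},
#       {"directory": "02-mapped"},
#       {"transformer": MapTerms()},
#   ]
#
# Each "directory" step shifts the previous output directory to be the next
# input, and marks the following transformers as skippable when the output
# directory already exists, giving a simple resume-after-crash behaviour.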
def __call__(self, chunk: TransformationChunk):
    self._tasks = chunk.tasks
    self._context = chunk.context
    #
    logger.info("Loading individuals for: %i tasks", len(self._tasks))
    entities = self._collect_mapped_to()
    logger.info("Collecting hierarchy for: %i entities", len(entities))
    hierarchy = self._collect_hierarchy(entities)
    logger.info("Hierarchy size: %i", len(hierarchy))
    self._save_hierarchy(hierarchy)
def _collect_hierarchy(self, entities):
    to_resolve = set(entities)
    resolved = {}
    while len(to_resolve) > 0:
        logger.info(" Iterating hierarchy file with %i entries",
                    len(to_resolve))
        resolved_before_pass = len(resolved)
        for line in self._iterate_json_lines(self._context.hierarchy_file):
            line_id = self._extract_id_from_line(line)
            if line_id not in to_resolve:
                continue
            line_entry = self._line_to_entry(line)
            to_resolve.remove(line_id)
            resolved[line_id] = {
                "instanceof": [item for item in line_entry["instanceof"]],
                "subclassof": [item for item in line_entry["subclassof"]]
            }
            # Add new entities to resolve, if they have not already been
            # resolved. We follow subclassof as the primary source and use
            # instanceof only as a backup.
            if line_entry["subclassof"]:
                to_resolve.update([
                    item for item in line_entry["subclassof"]
                    if item not in resolved
                ])
            else:
                to_resolve.update([
                    item for item in line_entry["instanceof"]
                    if item not in resolved
                ])
        if len(resolved) == resolved_before_pass:
            # Nothing was resolved in the last pass, so the remaining ids
            # have no record in the hierarchy file.
            for item in to_resolve:
                resolved[item] = {
                    "instanceof": [],
                    "subclassof": [],
                    "type": "not-found"
                }
                logger.info("Missing record for: %s", item)
            break
    return resolved
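# A self-contained sketch of the fixed-point resolution above, run over an
# in-memory list instead of a file: each pass resolves the pending ids and
# queues their parents, and the loop stops once a full pass resolves nothing.
# The entries and ids below are illustrative.

def resolve_hierarchy(seed_ids, entries):
    to_resolve = set(seed_ids)
    resolved = {}
    while to_resolve:
        before = len(resolved)
        for entry in entries:
            if entry["id"] not in to_resolve:
                continue
            to_resolve.remove(entry["id"])
            resolved[entry["id"]] = entry
            # subclassof is the primary source, instanceof the backup.
            parents = entry["subclassof"] or entry["instanceof"]
            to_resolve.update(p for p in parents if p not in resolved)
        if len(resolved) == before:
            break  # remaining ids have no record in `entries`
    return resolved


entries = [
    {"id": "A", "instanceof": [], "subclassof": ["B"]},
    {"id": "B", "instanceof": ["C"], "subclassof": []},
]
# Resolves A, then B, then stops with "C" unresolved (no record exists);
# the real method stamps such ids with "type": "not-found" instead.
print(resolve_hierarchy({"A"}, entries))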
def _write_index(self):
    index_path = os.path.join(self.directory, "index.json")
    logger.info("Saving index to '%s'", index_path)
    with open(index_path, "w", encoding="utf-8") as out_stream:
        json.dump(self.index, out_stream)
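# _load_index and _write_index form a simple JSON round trip. A hypothetical
# usage, assuming an owner object with `directory` and `index` attributes:
#
#   cache._load_index()                  # populates cache.index ({} if absent)
#   cache.index["Q42"] = "entry-001.json"
#   cache._write_index()                 # persists to <directory>/index.json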