    # The methods below belong to transformation classes; they assume
    # module-level imports (collections, json, logging, os, time), a module
    # "logger", and helpers such as read_json and write_json.
    def _collect_mapped_to(self):
        result = set()

        io_time = 0
        working_time = 0
        log_step = max(1, int(len(self._tasks) / 20.0))
        for index, task in enumerate(self._tasks):
            if index % log_step == 0:
                logger.info(
                    "  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
                        index, len(self._tasks), io_time, working_time))

            io_start = time.time()
            entity = read_json(task.in_path)
            io_time += time.time() - io_start

            work_start = time.time()
            for mapped_to in self._iter_mapped_to_for_entity(entity):
                result.add(mapped_to)
            working_time += time.time() - work_start

        logger.info("  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
            len(self._tasks), len(self._tasks), io_time, working_time))

        return result
    def __call__(self, chunk: TransformationChunk):
        self._tasks = chunk.tasks
        self._context = chunk.context
        #
        io_time = 0
        working_time = 0
        log_step = max(1, int(len(self._tasks) / 20.0))
        for index, task in enumerate(self._tasks):
            if index % log_step == 0:
                logger.info(
                    "  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
                        index, len(self._tasks), io_time, working_time))

            io_start = time.time()
            entity = read_json(task.in_path)
            io_time += time.time() - io_start

            work_start = time.time()
            self._prune_hierarchy(entity)
            working_time += time.time() - work_start

            io_start = time.time()
            write_json(task.out_path, entity)
            io_time += time.time() - io_start

        logger.info("  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
            len(self._tasks), len(self._tasks), io_time, working_time))
    def _write_terms_mapping(self, terms_to_entities: WikidataEntityMap):
        log_progress_step = max(1, int(len(self._tasks) / 20.0))

        io_time = 0
        working_time = 0

        for index, task in enumerate(self._tasks):
            if index % log_progress_step == 0:
                logger.info(
                    "  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
                        index, len(self._tasks), io_time, working_time))
            io_time_start = time.time()
            entity = read_json(task.in_path)
            io_time += time.time() - io_time_start

            working_time_start = time.time()
            for selector in self._context.mapping_selector(entity):
                self._add_mappings_for_selector(selector, terms_to_entities)
            working_time += time.time() - working_time_start

            io_time_start = time.time()
            write_json(task.out_path, entity)
            io_time += time.time() - io_time_start

        logger.info("  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
            len(self._tasks), len(self._tasks), io_time, working_time))
    def _map_terms_to_wikidata(self, terms) -> WikidataEntityMap:
        # First collect a mapping from terms to entity IDs and store every
        # matched entity. At the end, replace each entity ID with the
        # corresponding entity object.
        terms_to_entities = collections.defaultdict(list)
        entities = {}
        for index, (entity_id, entity) in enumerate(self._iterate_labels()):
            search_label = self._get_search_label(entity)
            search_alias = self._get_search_alias(entity)
            search_alias_terms = [
                term for alias_terms in search_alias for term in alias_terms
            ]

            entity_has_mapped = False
            for search_term in (set(search_label) | set(search_alias_terms)):
                if search_term not in terms:
                    continue
                # Given term is in the Wikidata entity text.
                if entity_id not in terms_to_entities[search_term]:
                    terms_to_entities[search_term].append(entity_id)
                entity_has_mapped = True

            if entity_has_mapped:
                entities[entity_id] = \
                    WikidataEntity(entity_id, search_label, search_alias)

            if index % 100000 == 0:
                logger.info("  {:>7}".format(index))

        return {
            key: [entities[entity_id] for entity_id in values]
            for key, values in terms_to_entities.items()
        }
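
The method above returns a WikidataEntityMap: a dictionary from search term
to the list of WikidataEntity objects whose label or alias contains that
term. A minimal sketch of the resulting shape, using a hypothetical
namedtuple stand-in for WikidataEntity (fields inferred from the
constructor call above):

import collections

WikidataEntity = collections.namedtuple(
    "WikidataEntity", ["entity_id", "label", "aliases"])

# Hypothetical result for two search terms.
terms_to_entities = {
    "berlin": [WikidataEntity("Q64", ["berlin"], [])],
    "germany": [WikidataEntity("Q183", ["germany"], [["deutschland"]])],
}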
Example #5
    def _load_index(self):
        index_path = os.path.join(self.directory, "index.json")
        if not os.path.exists(index_path):
            logger.info("No index file found '%s'", index_path)
            # Keep self.index consistent with the return value.
            self.index = {}
            return self.index
        with open(index_path, encoding="utf-8") as in_stream:
            self.index = json.load(in_stream)
        logger.info("Index of size %s loaded from '%s'", len(self.index),
                    index_path)
        return self.index
    def __call__(self, chunk: TransformationChunk):
        self._tasks = chunk.tasks
        self._context = chunk.context
        #
        logger.info("Loading terms for %i entities", len(chunk.tasks))
        terms = self._collect_terms()
        logger.info("Terms count %i", len(terms))
        logger.info("Mapping to labels ...")
        terms_to_entities = self._map_terms_to_wikidata(terms)
        logger.info("Saving mappings ...")
        self._write_terms_mapping(terms_to_entities)
    def _save_hierarchy(self, hierarchy):
        logger.info("Saving hierarchy ...")

        io_time = 0
        working_time = 0

        log_step = max(1, int(len(self._tasks) / 20.0))
        for index, task in enumerate(self._tasks):
            if index % log_step == 0:
                logger.info(
                    "  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
                        index, len(self._tasks), io_time, working_time))

            io_start = time.time()
            entity = read_json(task.in_path)
            io_time += time.time() - io_start

            work_start = time.time()
            self._add_hierarchy(entity, hierarchy)
            working_time += time.time() - work_start

            io_start = time.time()
            write_json(task.out_path, entity)
            io_time += time.time() - io_start

        logger.info("  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
            len(self._tasks), len(self._tasks), io_time, working_time))
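
The per-task loop above repeats the same bookkeeping as _collect_mapped_to
and __call__: time I/O and work separately, and log roughly 20 progress
lines per run. A minimal extracted helper is sketched below; the name and
signature are hypothetical, not part of the original code:

import logging

logger = logging.getLogger(__name__)

def log_progress(index, total, io_time, working_time):
    # Emit a progress line for roughly every 5% of the tasks,
    # mirroring the inline "log_step" pattern above.
    log_step = max(1, int(total / 20.0))
    if index % log_step == 0:
        logger.info(
            "  {:>5} / {}   io: {:0.0f}s working: {:0.0f}s".format(
                index, total, io_time, working_time))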
Example #8
    def execute(self):
        start_time = time.time()
        input_directory = None
        output_directory = None
        skip_step = False
        for index, step in enumerate(self._steps):
            logger.info("Executing step %i / %i", index + 1, len(self._steps))
            if "directory" in step:
                input_directory = output_directory
                output_directory = os.path.join(self._root, step["directory"])
                skip_step = os.path.exists(output_directory)
                logger.info("Output directory changed to '%s'",
                            output_directory)
                os.makedirs(output_directory, exist_ok=True)
            elif skip_step:
                logger.info("Step skipped as output directory already exists")
            elif "transformer" in step:
                transformer: AbstractTransformation = step["transformer"]
                logger.info("Running '%s'", type(transformer).__name__)
                transformer(input_directory, output_directory)
            else:
                raise Exception("Unknown step {}".format(index))
        logger.info("All done in: %s", time.time() - start_time)
    def __call__(self, chunk: TransformationChunk):
        self._tasks = chunk.tasks
        self._context = chunk.context
        #
        logger.info("Loading individuals for: %i tasks", len(self._tasks))
        entities = self._collect_mapped_to()
        logger.info("Collecting hierarchy for: %i entities", len(entities))
        hierarchy = self._collect_hierarchy(entities)
        logger.info("Hierarchy size: %i", len(hierarchy))
        self._save_hierarchy(hierarchy)
    def _collect_hierarchy(self, entities):
        to_resolve = set(entities)
        resolved = {}

        while len(to_resolve) > 0:
            logger.info("  Iterating hierarchy file with %i entries",
                        len(to_resolve))
            last_size = len(to_resolve)
            for line in self._iterate_json_lines(self._context.hierarchy_file):
                line_id = self._extract_id_from_line(line)
                if line_id not in to_resolve:
                    continue
                line_entry = self._line_to_entry(line)
                to_resolve.remove(line_id)
                resolved[line_id] = {
                    "instanceof": [item for item in line_entry["instanceof"]],
                    "subclassof": [item for item in line_entry["subclassof"]]
                }
                # Queue newly referenced entities that have not been resolved
                # yet. Follow "subclassof" as the primary source, falling back
                # to "instanceof" only when no superclass is present.
                if line_entry["subclassof"]:
                    to_resolve.update([
                        item for item in line_entry["subclassof"]
                        if item not in resolved
                    ])
                else:
                    to_resolve.update([
                        item for item in line_entry["instanceof"]
                        if item not in resolved
                    ])

            if len(to_resolve) == last_size:
                # A full pass resolved nothing, so the remaining IDs have no
                # record in the hierarchy file; mark them as not found to
                # avoid iterating forever.
                for item in to_resolve:
                    resolved[item] = {
                        "instanceof": [],
                        "subclassof": [],
                        "type": "not-found"
                    }
                    logger.info("Missing record for: %s", item)
                break

        return resolved
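
The loop above streams self._context.hierarchy_file as JSON lines and keeps
only records for the IDs it still needs. A hypothetical parsed line, with
field names inferred from the _line_to_entry and _extract_id_from_line
usage above:

# Assumed shape of one parsed hierarchy-file entry (illustrative values).
line_entry = {
    "id": "Q64",             # as returned by _extract_id_from_line
    "instanceof": ["Q515"],  # "instance of" target IDs
    "subclassof": [],        # "subclass of" target IDs
}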
Example #11
    def _write_index(self):
        index_path = os.path.join(self.directory, "index.json")
        logger.info("Saving index to '%s'", index_path)
        with open(index_path, "w", encoding="utf-8") as out_stream:
            json.dump(self.index, out_stream)