Exemplo n.º 1
0
    def initialize(self, resources: Resources, configs: Config):
        """Store the redirect table and create the two NIF context readers.

        Args:
            resources: Shared resources; the value stored under the
                ``'redirects'`` key is kept on ``self.redirects``.
            configs: Configuration providing ``nif_page_structure`` and
                ``nif_text_links``; each one is handed to its own
                ``NIFBufferedContextReader``.
        """
        redirect_table = resources.get('redirects')
        self.redirects = redirect_table

        # The buffered NIF readers group statements by RDF context. Here one
        # context corresponds to one wiki page, so the information belonging
        # to a page can be read together.
        structure_source = configs.nif_page_structure
        self.struct_reader = NIFBufferedContextReader(structure_source)

        link_source = configs.nif_text_links
        self.link_reader = NIFBufferedContextReader(link_source)
Exemplo n.º 2
0
    def initialize(self, resources: Resources, configs: Config):
        """Load the pack index, open the mapping readers and set up logging.

        Args:
            resources: Shared resources; the value stored under the
                ``'redirects'`` key is kept on ``self.redirects``.
            configs: Configuration providing ``pack_index``, ``pack_dir``,
                ``mapping_literals``, ``mapping_objects`` and
                ``reading_log``.
        """
        # pylint: disable=attribute-defined-outside-init
        index_path = configs.pack_index
        self.pack_index = read_index(index_path)
        self.pack_dir = configs.pack_dir

        self.redirects = resources.get('redirects')

        literal_source = configs.mapping_literals
        self.literal_info_reader = NIFBufferedContextReader(literal_source)
        object_source = configs.mapping_objects
        self.object_info_reader = NIFBufferedContextReader(object_source)

        # Send this reader's log records to the configured file; the
        # assignment below replaces whatever handlers the logger had before.
        log_handler = logging.FileHandler(configs.reading_log)
        log_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.handlers = [log_handler]
Exemplo n.º 3
0
class DBpediaWikiReader(PackReader):
    """Reader that turns DBpedia NIF contexts into ``DataPack`` objects.

    For every NIF context statement carrying the page text, the reader
    collects the text together with the structure and link statements of the
    same context, and builds one pack containing a ``WikiPage`` that spans
    the whole text.
    """

    def __init__(self):
        super().__init__()
        # Both readers are created in ``initialize``.
        self.struct_reader = None
        self.link_reader = None
        self.redirects: Dict[str, str] = {}

    def initialize(self, resources: Resources, configs: Config):
        """Store the redirect table and create the two NIF context readers.

        Args:
            resources: Shared resources; the value stored under the
                ``'redirects'`` key is used as the redirect table.
            configs: Configuration providing ``nif_page_structure`` and
                ``nif_text_links``.
        """
        redirects = resources.get('redirects')
        # ``_parse_pack`` performs ``in`` checks on the table, so keep the
        # empty default from ``__init__`` if no table was provided.
        # NOTE(review): assumes a missing entry is reported as ``None``.
        self.redirects = {} if redirects is None else redirects

        # These NIF readers organize the statements in the specific RDF
        # context. In this case each context corresponds to one wiki page,
        # which allows us to read the information more systematically.
        self.struct_reader = NIFBufferedContextReader(
            configs.nif_page_structure)
        self.link_reader = NIFBufferedContextReader(configs.nif_text_links)

    def _collect(
        self,
        nif_context: str  # type: ignore
    ) -> Iterator[Tuple[Dict[str, str], Dict[str, List[state_type]]]]:
        """Yield the text and the related statements of each NIF context.

        Args:
            nif_context: Source handed to ``NIFParser``.

        Yields:
            A pair ``(str_data, node_data)``. ``str_data`` holds the keys
            ``'text'``, ``'doc_name'`` and ``'oldid'``; ``node_data`` holds
            the keys ``'struct'`` and ``'links'``. Fresh dictionaries are
            created for every pair, so consumers may safely keep them.
        """
        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia context: [{c.identifier}]')

                # Only the ``isString`` statement of a context resource
                # carries the page text.
                if nif_type == "context" and get_resource_fragment(
                        v) == 'isString':
                    str_data: Dict[str, str] = {
                        'text': o.toPython(),
                        'doc_name': get_resource_name(s),
                        'oldid': get_resource_attribute(
                            c.identifier, 'oldid'),
                    }
                    node_data: Dict[str, List[state_type]] = {
                        'struct': self.struct_reader.get(c),
                        'links': self.link_reader.get(c),
                    }

                    yield str_data, node_data
        print(' ..Done')

    def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        """Build a ``DataPack`` from one item produced by ``_collect``.

        Args:
            doc_data: The ``(str_data, node_data)`` pair of one context.

        Yields:
            A single pack whose text is the page text, with a ``WikiPage``
            covering the full text and, when available, the structure and
            anchor link annotations.
        """
        str_data, node_data = doc_data

        pack = DataPack()
        doc_name: str = str_data['doc_name']
        # Use the redirect target as the page name when one is known.
        if doc_name in self.redirects:
            doc_name = self.redirects[doc_name]

        full_text: str = str_data['text']

        pack.set_text(full_text)
        page = WikiPage(pack, 0, len(full_text))
        page.page_id = str_data['oldid']
        page.page_name = doc_name

        # Missing structure or link information is reported, not fatal.
        if node_data['struct']:
            add_struct(pack, node_data['struct'])
        else:
            logging.warning('Structure info for %s not found.', doc_name)

        if node_data['links']:
            add_anchor_links(pack, node_data['links'], self.redirects)
        else:
            logging.warning('Links for [%s] not found.', doc_name)

        pack.meta.doc_id = doc_name

        yield pack

    @classmethod
    def default_configs(cls):
        """Return the default configuration of this reader.

        Returns:
            A dictionary with the keys ``'redirect_path'``,
            ``'nif_page_structure'`` and ``'nif_text_links'``, all defaulting
            to ``None``.
        """
        return {
            'redirect_path': None,
            'nif_page_structure': None,
            'nif_text_links': None,
        }
Exemplo n.º 4
0
class DBpediaInfoBoxReader(PackReader):
    """Reader that adds DBpedia info box data to previously stored packs.

    Each group of info box statements is matched to a pack file through the
    pack index; the pack is deserialized, enriched with the literal, object
    and property statements, and yielded.
    """

    def __init__(self):
        super().__init__()
        # Declared here, populated in ``initialize``.
        self.pack_index: Dict[str, str]
        self.pack_dir: str
        self.redirects: Dict[str, str]
        self.logger = logging.getLogger(__name__)

    def initialize(self, resources: Resources, configs: Config):
        """Load the pack index, open the mapping readers and set up logging.

        Args:
            resources: Shared resources; the value stored under the
                ``'redirects'`` key is used as the redirect table.
            configs: Configuration providing ``pack_index``, ``pack_dir``,
                ``mapping_literals``, ``mapping_objects`` and
                ``reading_log``.
        """
        # pylint: disable=attribute-defined-outside-init
        self.pack_index = read_index(configs.pack_index)
        self.pack_dir = configs.pack_dir

        redirects = resources.get('redirects')
        # ``_parse_pack`` performs ``in`` checks on the table, so fall back
        # to an empty one if none was provided.
        # NOTE(review): assumes a missing entry is reported as ``None``.
        self.redirects = {} if redirects is None else redirects

        self.literal_info_reader = NIFBufferedContextReader(
            configs.mapping_literals)
        self.object_info_reader = NIFBufferedContextReader(
            configs.mapping_objects)

        # Set up logging. The assignment replaces any handlers the logger
        # had before.
        f_handler = logging.FileHandler(configs.reading_log)
        f_format = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        f_handler.setFormatter(f_format)
        self.logger.handlers = [f_handler]

    def _collect(self, info_box_raw: str  # type: ignore
                 ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
        """Yield the info box statements of each context with their mappings.

        Args:
            info_box_raw: Source handed to ``ContextGroupedNIFReader``.

        Yields:
            A pair of the resource name, taken from the subject of the first
            statement of the group, and a dictionary with the keys
            ``'properties'``, ``'literals'`` and ``'objects'``.
        """
        for c, statements in ContextGroupedNIFReader(info_box_raw):
            yield get_resource_name(statements[0][0]), {
                'properties': statements,
                'literals': self.literal_info_reader.get(c),
                'objects': self.object_info_reader.get(c),
            }

    def _parse_pack(
            self, collection: Tuple[str, Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        """Load the stored pack of a resource and add its info box data.

        Args:
            collection: The ``(resource_name, info_box_data)`` pair of one
                context, as produced by ``_collect``.

        Yields:
            The enriched pack. Nothing is yielded when the resource is not
            in the pack index or its pack file does not exist; both cases
            are logged.
        """
        resource_name, info_box_data = collection

        # Use the redirect target as the resource name when one is known.
        if resource_name in self.redirects:
            resource_name = self.redirects[resource_name]

        if resource_name not in self.pack_index:
            print_notice(f"Resource {resource_name} is not in the raw packs.")
            self.logger.warning("Resource %s is not in the raw packs.",
                                resource_name)
            return

        print_progress(f'Add infobox to resource: [{resource_name}]')

        pack_path = os.path.join(
            self.pack_dir,
            self.pack_index[resource_name]
        )

        if not os.path.exists(pack_path):
            # Indexed but missing on disk: report it instead of skipping
            # silently.
            self.logger.warning(
                "Pack file %s for resource %s does not exist.",
                pack_path, resource_name)
            return

        # Read the file completely and close it before yielding, so the
        # handle is not kept open while the generator is suspended.
        # NOTE(review): opened with the platform default encoding, as
        # before; confirm this matches how the packs were written.
        with open(pack_path) as pack_file:
            serialized_pack = pack_file.read()

        pack = data_utils.deserialize(self._pack_manager, serialized_pack)

        add_info_boxes(pack, info_box_data['literals'])
        add_info_boxes(pack, info_box_data['objects'])
        add_property(pack, info_box_data['properties'])
        yield pack

    @classmethod
    def default_configs(cls):
        """Return the default configuration of this reader.

        Returns:
            A dictionary with the keys ``'pack_index'``, ``'pack_dir'``,
            ``'mapping_literals'``, ``'mapping_objects'`` and
            ``'reading_log'`` and their default values.
        """
        return {
            'pack_index': 'article.idx',
            'pack_dir': '.',
            'mapping_literals': None,
            'mapping_objects': None,
            'reading_log': 'infobox.log',
        }