Example #1
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (that contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # If results are returned.
        elements = tree.xpath("/root/items/*")
        if len(elements) > 0:
            for element in elements:
                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding="unicode")
                element_dict = xmltodict.parse(element_xml_str)

                # Temporary solution for public-only content.
                # TODO: remove this when licensed content is enabled!
                if not self.is_public(element_dict["data"]):
                    continue

                # TODO: It's probably a pointless attribute.
                # del element_dict["data"]["score"]

                # Passing the dictionary for easier access to attributes.
                copyResponse.meta["item"] = element_dict["data"]

                # In case JSON string representation is preferred:
                # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                copyResponse._set_body(element_xml_str)

                if self.hasChanged(copyResponse):
                    yield self.handleEntry(copyResponse)

                # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                LomBase.parse(self, copyResponse)

        # TODO: Throttle requests so as not to stress the REST API.
        # time.sleep(0.1)

        # If the number of returned results equals the imposed limit, more results are probably available.
        if len(elements) == self.limit:
            self.page += 1
            url = self.apiUrl.replace(
                "%start", str(self.page * self.limit)
            ).replace("%anzahl", str(self.limit))
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    "Accept": "application/xml",
                    "Content-Type": "application/xml",
                },
            )
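
Example #1 walks an XML listing, converts each element to a dictionary with xmltodict, and pages through the API by substituting the "%start" and "%anzahl" placeholders in apiUrl. Below is a minimal, self-contained sketch of those two steps in isolation; the sample XML and the API template are illustrative assumptions, not taken from the actual crawler.

# Sketch of the xmltodict conversion and the pagination URL used above.
# The XML string and the API template are assumptions for illustration.
import xmltodict

element_xml_str = "<data><id>42</id><score>0.87</score></data>"
element_dict = xmltodict.parse(element_xml_str)
print(element_dict["data"]["id"])  # -> "42" (xmltodict returns all values as strings)

# Pagination: substitute the offset ("%start") and page-size ("%anzahl") placeholders.
apiUrl = "https://example.org/api/items?start=%start&anzahl=%anzahl"  # hypothetical template
page, limit = 1, 100
url = apiUrl.replace("%start", str(page * limit)).replace("%anzahl", str(limit))
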
Example #2
    def parse(self, response: scrapy.http.Response):

        # Call Splash only once per page (that contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data
        elements = json.loads(response.text)  # response.text replaces the deprecated body_as_unicode()
        for i, element in enumerate(elements):
            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element

            # In case JSON string representation is preferred:
            json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
            copyResponse._set_body(json_str)
            print(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
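
Both examples above overwrite the copied response body through the private Response._set_body() helper. The sketch below shows the same idea with the public Response.replace() API; `element` stands for one entry of the decoded JSON list, and the dummy request/response are assumptions so the snippet runs on its own. Note that replace() returns a new response bound to the same request, so meta is still shared with the original.

# Sketch: swap the body with the public Response.replace() API instead of _set_body().
import json
from scrapy.http import Request, TextResponse

element = {"id": 42, "title": "Beispiel"}  # stands for one decoded JSON entry
request = Request(url="https://example.org/api")  # hypothetical request
response = TextResponse(url=request.url, body=b"[]", encoding="utf-8", request=request)

json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
copyResponse = response.replace(body=json_str.encode("utf-8"))
copyResponse.meta["item"] = element  # meta lives on the request, which replace() keeps
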
Example #3
    def parse_content_api_json(self, response: scrapy.http.Response):
        # Build up an item
        md = json.loads(response.body)
        item = LandingPageItem()
        item['landing_page_url'] = response.url
        item['title'] = md['title']
        item['first_publication_date'] = md['first_published_at']
        item['publication_date'] = md['first_published_at']
        # Pick up the 'public updated' date instead, if present:
        if 'public_updated_at' in md:
            item['publication_date'] = md['public_updated_at']
        item['publishers'] = []
        for org in md['links']['organisations']:
            item['publishers'].append(org['title'])
        # item['publisher_metadata'] = md

        # Copy the response so the HTML fragments embedded in the API payload can be parsed with selectors:
        resp = response.copy()

        # Go through the documents:
        item['documents'] = []
        for doc in md['details']['documents']:
            resp._set_body(doc)
            doc_item = DocumentItem()
            doc_item['title'] = resp.css('.title ::text').extract_first()
            doc_item['document_url'] = response.urljoin(
                resp.css('.attachment-details a::attr(href)').extract_first())
            doc_item['isbn'] = resp.css(
                'span[class=isbn] ::text').extract_first()
            doc_item['command_paper_number'] = resp.css(
                'span[class=command_paper_number] ::text').extract_first()
            doc_item['house_of_commons_paper_number'] = resp.css(
                'span[class=house_of_commons_paper_number] ::text'
            ).extract_first()
            item['documents'].append(dict(doc_item))

        # Return the composite item:
        yield item
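
The loop over md['details']['documents'] reuses the copied response as a container for each HTML fragment. The same extraction can be done with a standalone scrapy Selector built directly from the fragment string, as sketched below; the fragment and the base URL are illustrative assumptions.

# Sketch: parse an HTML fragment with a standalone Selector instead of mutating a response copy.
from urllib.parse import urljoin

from scrapy import Selector

doc = '<div class="attachment-details"><span class="title">Report</span><a href="/doc.pdf">PDF</a></div>'  # illustrative fragment
sel = Selector(text=doc)
title = sel.css('.title ::text').get()
href = sel.css('.attachment-details a::attr(href)').get()
document_url = urljoin("https://example.org/", href)  # or response.urljoin(href) inside the spider
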
Example #4
    def parse(self, response: scrapy.http.Response):
        def contains_exceptions(cat_tree):
            return bool(set(cat_tree) & set(self.cat_2_exceptions))

        page_type = AlloSpider.get_page_type(response)
        if page_type == PageType.OTHER:
            self.logger.info('Skipping page (unknown type) ' + response.url)
            return
        elif page_type == PageType.NO_GOODS:
            self.logger.info('Skipping page (no goods) ' + response.url)
            return
        if contains_exceptions(response.meta.get(Names.CAT_TREE_KEY)):
            self.logger.info('Skipping page (Exception) ' + response.url)
            return

        parser = MainAlloParser(self, response.copy())
        if page_type == PageType.PRODUCTS:
            return parser.yield_products()
        elif page_type == PageType.CATALOG:
            return parser.yield_catalog()
        elif page_type == PageType.GOOD:
            return parser.yield_item()

    def check_for_dead_ends_before_parsing(self,
                                           response: scrapy.http.Response):
        """
        Checks if the current response.url has already been parsed or is on the "skip_these_urls"-list.
        If the current url hasn't been parsed already, copies the response and calls parse to gather metadata
        from the current .html
        :param response:
        :return:
        """
        if response is not None:
            # Only call the parse method if the current url is not a dead end without content:
            table_body = response.xpath('//table[@class="smalltable"]')
            if table_body:  # .xpath() returns a (possibly empty) SelectorList, never None
                no_entry_regex = re.compile(r'Bisher kein Eintrag')
                for table_item in table_body:
                    if (no_entry_regex.search(table_item.get())) is not None:
                        self.debug_dead_end_counter += 1
                        # print("The URL", response.url, "is a 'Bisher kein Eintrag'-dead-end.")
                        # print("check_for_dead_ends... Method: already parsed URLs =", len(self.debug_parsed_urls),
                        #       "| gathered urls =", len(self.navigation_urls), "| skip_these_urls =",
                        #       len(self.skip_these_urls), "| Total amount of dead-ends:", self.debug_dead_end_counter)

                    # check if the current url has already been parsed:
                    # The guard at the top of the method already ensures response is not None,
                    # so only the "already parsed?" check matters here:
                    elif response.url not in self.debug_parsed_urls:
                        # check if current url contains an undesired url-pattern
                        skip_check = False
                        for url_pattern in self.skip_these_urls:
                            current_regex = re.compile(url_pattern)
                            if current_regex.search(response.url) is not None:
                                skip_check = True
                        # if the current url is a "fresh" one, call the parse method to extract metadata
                        if skip_check is False:
                            # print("URL TO BE PARSED: ", response.url)
                            self.debug_parsed_urls.add(response.url)
                            response_copy = response.copy()
                            yield from self.parse(response_copy)
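
check_for_dead_ends_before_parsing recompiles every skip pattern for each response. A small sketch of precompiling the patterns once (for example in the spider's __init__) and reusing them is shown below; the patterns themselves are illustrative assumptions.

# Sketch: precompile the skip patterns once and reuse them for every response.
# `skip_these_urls` is assumed to hold plain regex strings, as in the spider above.
import re

skip_these_urls = [r"/impressum", r"\?print=1"]  # illustrative patterns
skip_patterns = [re.compile(pattern) for pattern in skip_these_urls]

def should_skip(url: str) -> bool:
    """Return True if the url matches any of the precompiled skip patterns."""
    return any(pattern.search(url) for pattern in skip_patterns)

# Usage inside the dead-end check:
# if not should_skip(response.url):
#     yield from self.parse(response.copy())
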