Example #1
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (that contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # If results are returned.
        elements = tree.xpath("/root/items/*")
        if len(elements) > 0:
            for element in elements:
                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding="unicode")
                element_dict = xmltodict.parse(element_xml_str)

                # Temporary solution for public-only content.
                # TODO: remove this when licensed content is enabled!
                if not self.is_public(element_dict["data"]):
                    continue

                # TODO: It's probably a pointless attribute.
                # del element_dict["data"]["score"]

                # Passing the dictionary for easier access to attributes.
                copyResponse.meta["item"] = element_dict["data"]

                # In case JSON string representation is preferred:
                # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                copyResponse._set_body(element_xml_str)

                if self.hasChanged(copyResponse):
                    yield self.handleEntry(copyResponse)

                # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                LomBase.parse(self, copyResponse)

        # TODO: Throttle requests so as not to stress the REST API.
        # time.sleep(0.1)

        # If the number of returned results equals the imposed limit, more results are probably available.
        if len(elements) == self.limit:
            self.page += 1
            url = self.apiUrl.replace(
                "%start", str(self.page * self.limit)
            ).replace("%anzahl", str(self.limit))
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    "Accept": "application/xml",
                    "Content-Type": "application/xml",
                },
            )
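
Example #1 walks an XML listing, converts each element to a dictionary with xmltodict, and pages through the API by substituting the "%start" and "%anzahl" placeholders in apiUrl. Below is a minimal, self-contained sketch of those two steps in isolation; the sample XML and the API template are illustrative assumptions, not taken from the actual crawler.

# Sketch of the xmltodict conversion and the pagination URL used above.
# The XML string and the API template are assumptions for illustration.
import xmltodict

element_xml_str = "<data><id>42</id><score>0.87</score></data>"
element_dict = xmltodict.parse(element_xml_str)
print(element_dict["data"]["id"])  # -> "42" (xmltodict returns all values as strings)

# Pagination: substitute the offset ("%start") and page-size ("%anzahl") placeholders.
apiUrl = "https://example.org/api/items?start=%start&anzahl=%anzahl"  # hypothetical template
page, limit = 1, 100
url = apiUrl.replace("%start", str(page * limit)).replace("%anzahl", str(limit))
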
Example #2
    def parse(self, response: scrapy.http.Response):

        # Call Splash only once per page (that contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data
        elements = json.loads(response.text)  # response.text replaces the deprecated body_as_unicode()
        for i, element in enumerate(elements):
            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element

            # In case JSON string representation is preferred:
            json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
            copyResponse._set_body(json_str)
            print(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
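
Both examples above overwrite the copied response body through the private Response._set_body() helper. The sketch below shows the same idea with the public Response.replace() API; `element` stands for one entry of the decoded JSON list, and the dummy request/response are assumptions so the snippet runs on its own. Note that replace() returns a new response bound to the same request, so meta is still shared with the original.

# Sketch: swap the body with the public Response.replace() API instead of _set_body().
import json
from scrapy.http import Request, TextResponse

element = {"id": 42, "title": "Beispiel"}  # stands for one decoded JSON entry
request = Request(url="https://example.org/api")  # hypothetical request
response = TextResponse(url=request.url, body=b"[]", encoding="utf-8", request=request)

json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
copyResponse = response.replace(body=json_str.encode("utf-8"))
copyResponse.meta["item"] = element  # meta lives on the request, which replace() keeps
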
Example #3
    def parse_content_api_json(self, response: scrapy.http.Response):
        # Build up an item
        md = json.loads(response.body)
        item = LandingPageItem()
        item['landing_page_url'] = response.url
        item['title'] = md['title']
        item['first_publication_date'] = md['first_published_at']
        item['publication_date'] = md['first_published_at']
        # Pick up the 'public updated' date instead, if present:
        if 'public_updated_at' in md:
            item['publication_date'] = md['public_updated_at']
        item['publishers'] = []
        for org in md['links']['organisations']:
            item['publishers'].append(org['title'])
        # item['publisher_metadata'] = md

        # Copy the response so the HTML fragments embedded in the API payload can be parsed with selectors:
        resp = response.copy()

        # Go through the documents:
        item['documents'] = []
        for doc in md['details']['documents']:
            resp._set_body(doc)
            doc_item = DocumentItem()
            doc_item['title'] = resp.css('.title ::text').extract_first()
            doc_item['document_url'] = response.urljoin(
                resp.css('.attachment-details a::attr(href)').extract_first())
            doc_item['isbn'] = resp.css(
                'span[class=isbn] ::text').extract_first()
            doc_item['command_paper_number'] = resp.css(
                'span[class=command_paper_number] ::text').extract_first()
            doc_item['house_of_commons_paper_number'] = resp.css(
                'span[class=house_of_commons_paper_number] ::text'
            ).extract_first()
            item['documents'].append(dict(doc_item))

        # Return the composite item:
        yield item
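
The loop over md['details']['documents'] reuses the copied response as a container for each HTML fragment. The same extraction can be done with a standalone scrapy Selector built directly from the fragment string, as sketched below; the fragment and the base URL are illustrative assumptions.

# Sketch: parse an HTML fragment with a standalone Selector instead of mutating a response copy.
from urllib.parse import urljoin

from scrapy import Selector

doc = '<div class="attachment-details"><span class="title">Report</span><a href="/doc.pdf">PDF</a></div>'  # illustrative fragment
sel = Selector(text=doc)
title = sel.css('.title ::text').get()
href = sel.css('.attachment-details a::attr(href)').get()
document_url = urljoin("https://example.org/", href)  # or response.urljoin(href) inside the spider
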
Example #4
    def parse(self, response: scrapy.http.Response):
        def contains_exceptions(cat_tree):
            return bool(set(cat_tree) & set(self.cat_2_exceptions))

        page_type = AlloSpider.get_page_type(response)
        if page_type == PageType.OTHER:
            self.logger.info('Skipping page (unknown type) ' + response.url)
            return
        elif page_type == PageType.NO_GOODS:
            self.logger.info('Skipping page (no goods) ' + response.url)
            return
        if contains_exceptions(response.meta.get(Names.CAT_TREE_KEY)):
            self.logger.info('Skipping page (Exception) ' + response.url)
            return

        parser = MainAlloParser(self, response.copy())
        if page_type == PageType.PRODUCTS:
            return parser.yield_products()
        elif page_type == PageType.CATALOG:
            return parser.yield_catalog()
        elif page_type == PageType.GOOD:
            return parser.yield_item()

    def check_for_dead_ends_before_parsing(self,
                                           response: scrapy.http.Response):
        """
        Checks if the current response.url has already been parsed or is on the "skip_these_urls"-list.
        If the current url hasn't been parsed already, copies the response and calls parse to gather metadata
        from the current .html
        :param response:
        :return:
        """
        if response is not None:
            # Only call the parse method if the current url is not a dead end without content:
            table_body = response.xpath('//table[@class="smalltable"]')
            if table_body:  # .xpath() returns a (possibly empty) SelectorList, never None
                no_entry_regex = re.compile(r'Bisher kein Eintrag')
                for table_item in table_body:
                    if (no_entry_regex.search(table_item.get())) is not None:
                        self.debug_dead_end_counter += 1
                        # print("The URL", response.url, "is a 'Bisher kein Eintrag'-dead-end.")
                        # print("check_for_dead_ends... Method: already parsed URLs =", len(self.debug_parsed_urls),
                        #       "| gathered urls =", len(self.navigation_urls), "| skip_these_urls =",
                        #       len(self.skip_these_urls), "| Total amount of dead-ends:", self.debug_dead_end_counter)

                    # check if the current url has already been parsed:
                    # The guard at the top of the method already ensures response is not None,
                    # so only the "already parsed?" check matters here:
                    elif response.url not in self.debug_parsed_urls:
                        # check if current url contains an undesired url-pattern
                        skip_check = False
                        for url_pattern in self.skip_these_urls:
                            current_regex = re.compile(url_pattern)
                            if current_regex.search(response.url) is not None:
                                skip_check = True
                        # if the current url is a "fresh" one, call the parse method to extract metadata
                        if skip_check is False:
                            # print("URL TO BE PARSED: ", response.url)
                            self.debug_parsed_urls.add(response.url)
                            response_copy = response.copy()
                            yield from self.parse(response_copy)
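
check_for_dead_ends_before_parsing recompiles every skip pattern for each response. A small sketch of precompiling the patterns once (for example in the spider's __init__) and reusing them is shown below; the patterns themselves are illustrative assumptions.

# Sketch: precompile the skip patterns once and reuse them for every response.
# `skip_these_urls` is assumed to hold plain regex strings, as in the spider above.
import re

skip_these_urls = [r"/impressum", r"\?print=1"]  # illustrative patterns
skip_patterns = [re.compile(pattern) for pattern in skip_these_urls]

def should_skip(url: str) -> bool:
    """Return True if the url matches any of the precompiled skip patterns."""
    return any(pattern.search(url) for pattern in skip_patterns)

# Usage inside the dead-end check:
# if not should_skip(response.url):
#     yield from self.parse(response.copy())
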