Exemplo n.º 1
0
    def parse_binary_search(self, response, minimum=None, maximum=None):
        """
        Narrow down, by binary search, the lowest offset (to within ``THRESHOLD``) at which a request succeeds.

        The search bounds are either passed as arguments (to start a new search) or, when this method is used as a
        request callback, derived from the probed offset and the ``minimum`` or ``maximum`` key of the request's meta.

        :param response: the response for the probed offset; its request's meta must contain ``first``, and also
                         ``minimum`` (on success) or ``maximum`` (on failure) if the bounds aren't passed as arguments
        :param minimum: the lower bound of the search, if starting a new search
        :param maximum: the upper bound of the search, if starting a new search
        :returns: a generator of either the results of ``self.parse`` or the next request to make
        """
        offset = int(get_parameter_value(response.request.url, 'offset'))

        first_offset = response.request.meta['first']

        # Compare to None rather than test truthiness: a bound of 0 is a valid value, and must start a new search
        # instead of falling through to the meta lookups below.
        if minimum is not None and maximum is not None:
            self.logger.info(f'Starting binary search for {first_offset:,} within [{minimum:,}, {maximum:,}]')
        elif self.is_http_success(response):
            # The probed offset succeeded, so it becomes the new upper bound.
            minimum = response.request.meta['minimum']
            maximum = offset
        else:
            # The probed offset failed, so the lower bound moves just past it.
            minimum = offset + 1
            maximum = response.request.meta['maximum']

        # If the search succeeded, parse the response as usual. We use a threshold, because getting the exact
        # millisecond requires 27 requests.
        if minimum + THRESHOLD >= maximum:
            self.logger.info(f'New offset found after {first_offset:,} at {maximum:,}!')
            if offset == maximum:
                # If the last request used the offset, we can reuse its response.
                yield from self.parse(response)
            else:
                url = replace_parameters(response.request.url, offset=maximum)
                yield self._build_request(url, self.parse, {})
        else:
            # Otherwise, probe the midpoint of the remaining range.
            url = replace_parameters(response.request.url, offset=(minimum + maximum) // 2)
            yield self._build_request(url, self.parse_binary_search, {'minimum': minimum, 'maximum': maximum,
                                                                      'first': first_offset})
Exemplo n.º 2
0
    def parse_list(self, response):
        """
        Yield a request for each item in the response's ``data`` array, then a request for the next page of the list.

        Nothing is yielded if the response's JSON is empty.
        """
        payload = response.json()
        # An empty JSON object marks the last page: there is nothing left to request.
        if payload:
            for entry in payload['data']:
                # The item's URL is the list's URL with offset=None (presumably removing the parameter), followed by
                # the item's OCID.
                item_url = replace_parameters(response.request.url, offset=None) + entry['ocid']
                yield self.build_request(item_url, formatter=components(-2))

            # The next page starts at the offset reported by this page.
            next_url = replace_parameters(response.request.url, offset=payload['offset'])
            next_formatter = join(components(-1), parameters('offset'))
            yield self.build_request(next_url, formatter=next_formatter, callback=self.parse_list)
Exemplo n.º 3
0
    def parse_list(self, response):
        """
        Yield a request for each tender in the response's ``data`` array, then a request for the next page of the list.

        Nothing is yielded if the response's JSON is empty.
        """
        payload = json.loads(response.text)
        # An empty JSON object marks the last page: there is nothing left to request.
        if payload:
            tenders_url = 'http://public.eprocurement.systems/ocds/tenders/'
            for entry in payload['data']:
                tender_url = tenders_url + entry['ocid']
                yield self.build_request(tender_url, formatter=components(-1))

            # The next page starts at the offset reported by this page.
            next_url = replace_parameters(response.request.url, offset=payload['offset'])
            next_formatter = parameters('offset')
            yield self.build_request(next_url, formatter=next_formatter, callback=self.parse_list)
Exemplo n.º 4
0
 def parse_pages(self, response):
     """
     Yield a request for each file to download from this page, then a request for the next page, if any.

     The files are those returned by ``self.get_files_to_download`` for the response's parsed JSON.
     """
     payload = json.loads(response.text)

     for file_url in self.get_files_to_download(payload):
         yield self.build_request(file_url, formatter=components(-1), dont_filter=True)

     # Request the following page, unless the current page is the last one.
     page_info = payload['pagination']
     current_page = page_info['current_page']
     if current_page < page_info['total_pages']:
         next_url = replace_parameters(response.request.url, page=current_page + 1)
         next_formatter = parameters('fecha_desde', 'page')
         yield self.build_request(
             next_url,
             formatter=next_formatter,
             dont_filter=True,
             callback=self.parse_pages,
         )
Exemplo n.º 5
0
    def parse_date_range(self, response):
        """
        Probe offsets at exponentially increasing distances from the first offset, until one succeeds or a limit is
        reached.

        On success, hand over to ``parse_binary_search`` with the previous and current offsets as bounds. On reaching
        a limit (the crawl's start time or ``EXPONENT_LIMIT``), yield a file error. Otherwise, yield the next request.
        """
        request = response.request
        offset = int(get_parameter_value(request.url, 'offset'))

        # Scrapy uses `datetime.datetime.utcnow()`, so we don't need to worry about time zones.
        start_time = int(self.crawler.stats.get_value('start_time').timestamp() * 1000)
        # We use the first offset to calculate the new offset, and in log messages.
        first_offset = request.meta.get('first', offset)
        # The exponent for the exponential search.
        exponent = request.meta.get('exponent', -1) + 1

        # If this offset succeeded, do a binary search from the previous offset to this offset.
        if self.is_http_success(response):
            yield from self.parse_binary_search(response, request.meta['prev'], offset)
            return

        # If this offset failed and reached a limit, stop.
        if offset >= start_time or exponent > EXPONENT_LIMIT:
            self.logger.info(f'No offset found after {first_offset:,} within {2 ** EXPONENT_LIMIT} days.')
            yield self.build_file_error_from_response(response)
            return

        # Otherwise, continue, without probing past the crawl's start time.
        step = MILLISECONDS_PER_DAY * 2 ** exponent
        next_offset = min(first_offset + step, start_time)
        next_url = replace_parameters(request.url, offset=next_offset)
        next_meta = {'prev': offset, 'exponent': exponent, 'first': first_offset}
        yield self._build_request(next_url, self.parse_date_range, next_meta)
Exemplo n.º 6
0
 def _build_url(self, params):
     """
     Return the result of replacing the base URL's parameters with ``params`` and ``self.additional_params``.

     The additional parameters take precedence over ``params``, which is left unmodified.
     """
     # Work on a copy, so that the caller's mapping isn't mutated.
     merged = params.copy()
     merged.update(self.additional_params)
     built_url = util.replace_parameters(self.base_url, **merged)
     return built_url
Exemplo n.º 7
0
 def _set_base_url(self, url):
     """Store as the base URL the result of replacing ``url``'s page, limit and offset parameters with None."""
     # NOTE(review): None presumably removes a parameter from the URL; confirm against util.replace_parameters.
     cleared = {'page': None, 'limit': None, 'offset': None}
     self.base_url = util.replace_parameters(url, **cleared)
Exemplo n.º 8
0
def test_replace_parameters(url, value, expected):
    """Check that replacing the ``page`` parameter of ``url`` with ``value`` produces ``expected``."""
    actual = replace_parameters(url, page=value)
    assert actual == expected