def parse_list(self, response):
    data = json.loads(response.text)
    release_type = response.request.meta['release_type']

    if 'links' in data and isinstance(data['links'], dict) and 'next' in data['links'] and not self.sample:
        yield self.build_request(
            data['links']['next'],
            formatter=parameters('event', 'startRow'),
            meta={'release_type': release_type},
            callback=self.parse_list
        )

    for release in data['releases']:
        if release_type == 'planning':
            uuid = release['tender']['plannedProcurementUUID']
            yield self.build_request(
                'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=' + uuid,
                formatter=parameters('event', 'PlannedProcurementUUID')
            )
        elif release_type == 'tender':
            uuid = release['tender']['RFTUUID']
            yield self.build_request(
                'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=' + uuid,
                formatter=parameters('event', 'RFTUUID')
            )
        elif release_type == 'contract':
            for award in release['awards']:
                uuid = award['CNUUID']
                yield self.build_request(
                    'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=' + uuid,
                    formatter=parameters('event', 'CNUUID')
                )
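# A hedged refactoring sketch, not the original spider's code: the 'planning'
# and 'tender' branches above differ only in the event name, query parameter
# and UUID key, so they could be table-driven; 'contract' still needs its own
# loop over awards. The mapping below is hypothetical, with values copied from
# the URLs in parse_list.
VIEW_EVENTS = {
    'planning': ('public.api.planning.view', 'PlannedProcurementUUID', 'plannedProcurementUUID'),
    'tender': ('public.api.tender.view', 'RFTUUID', 'RFTUUID'),
}
# Sketch of the usage inside parse_list:
#     event, param, key = VIEW_EVENTS[release_type]
#     uuid = release['tender'][key]
#     yield self.build_request(
#         f'https://tenders.nsw.gov.au/?event={event}&{param}={uuid}',
#         formatter=parameters('event', param),
#     )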
class Uganda(IndexSpider):
    """
    Domain
      Government Procurement Portal (GPP) of Public Procurement and Disposal of Public Assets Authority (PPDA)
    API documentation
      https://docs.google.com/spreadsheets/d/10tVioy-VOQa1FwWoRl5e1pMbGpiymA0iycNcoDFkvks/edit#gid=365266172
    """
    name = 'uganda_releases'
    data_type = 'release_package'
    total_pages_pointer = '/data/last_page'
    yield_list_results = False
    formatter = staticmethod(parameters('page'))
    base_url = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes'
    download_delay = 0.9

    def start_requests(self):
        yield scrapy.Request(
            'https://gpp.ppda.go.ug/adminapi/public/api/pdes',
            meta={'file_name': 'page-1.json'},
            callback=self.parse_list,
            cb_kwargs={'callback': self.parse_data}
        )

    @handle_http_error
    def parse_data(self, response):
        pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}'

        data = json.loads(response.text)
        for pdes in data['data']['data']:
            for plans in pdes['procurement_plans']:
                for tag in ('planning', 'tender', 'award', 'contract'):
                    yield self.build_request(
                        pattern.format(tag, plans['financial_year'], plans['pde_id']),
                        formatter=join(components(-1), parameters('fy', 'pde'))
                    )
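# A hedged illustration of the formatter composition used above. These demo
# functions are simplified stand-ins for the kingfisher_scrapy.util helpers,
# assuming they build file names from path components and query-string values;
# the sample fy/pde values in the URL are hypothetical.
from urllib.parse import parse_qs, urlsplit

def demo_components(index):
    # Return the URL path segment at `index`, e.g. -1 -> last segment.
    return lambda url: urlsplit(url).path.rstrip('/').split('/')[index]

def demo_parameters(*keys):
    # Return the values of the named query-string parameters, joined with '-'.
    return lambda url: '-'.join(parse_qs(urlsplit(url).query)[key][0] for key in keys)

def demo_join(*formatters):
    # Compose several formatters into one file-name stem.
    return lambda url: '-'.join(formatter(url) for formatter in formatters)

url = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/planning?fy=2020-2021&pde=12'
print(demo_join(demo_components(-1), demo_parameters('fy', 'pde'))(url))
# -> planning-2020-2021-12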
def start_requests(self):
    pattern = 'https://tenders.nsw.gov.au/?event=public.api.{}.search&ResultsPerPage=1000'

    for release_type in ('planning', 'tender', 'contract'):
        yield self.build_request(
            pattern.format(release_type),
            formatter=parameters('event'),
            meta={'release_type': release_type},
            callback=self.parse_list
        )
class MexicoQuienEsQuien(IndexSpider):
    """
    Domain
      QuiénEsQuién.Wiki
    API documentation
      https://quienesquienapi.readthedocs.io/es/latest/
    Swagger API documentation
      https://api.quienesquien.wiki/v2/docs/
    """
    name = 'mexico_quien_es_quien'
    download_delay = 0.9
    count_pointer = '/data/0/collections/contracts/count'
    limit = 1000
    base_url = 'https://api.quienesquien.wiki/v2/contracts'
    formatter = staticmethod(parameters('offset'))
    data_type = 'record_package_list'

    def start_requests(self):
        yield scrapy.Request(
            'https://api.quienesquien.wiki/v2/sources',
            meta={'file_name': 'list.json'},
            callback=self.parse_list
        )

    @handle_http_error
    def parse(self, response):
        data = json.loads(response.text)
        yield self.build_file_from_response(
            response,
            data=json.dumps(data['data']).encode(),
            data_type=self.data_type
        )
class MexicoAdministracionPublicaFederal(IndexSpider):
    """
    Domain
      Administración Pública Federal (APF)
    Bulk download documentation
      https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf
    """
    name = 'mexico_administracion_publica_federal'

    # BaseSpider
    root_path = 'results.item'

    # SimpleSpider
    data_type = 'record_package'

    # IndexSpider
    count_pointer = '/pagination/total'
    limit = '/pagination/pageSize'
    use_page = True
    formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'https://api.datos.gob.mx/v1/contratacionesabiertas'
        yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
class KenyaMakueni(IndexSpider):
    """
    Domain
      Makueni County
    Swagger API documentation
      https://opencontracting.makueni.go.ke/swagger-ui.html#/ocds-controller
    """
    name = 'kenya_makueni'
    data_type = 'release_package_list'
    limit = 10
    additional_params = {'pageSize': limit}
    yield_list_results = False
    param_page = 'pageNumber'
    formatter = staticmethod(parameters('pageNumber'))
    base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={limit}&pageNumber={page}'

    def start_requests(self):
        yield scrapy.Request(
            'https://opencontracting.makueni.go.ke/api/ocds/release/count',
            meta={'file_name': 'count.json'},
            callback=self.parse_list
        )

    def range_generator(self, data, response):
        return range(ceil(int(response.text) / self.limit))

    def url_builder(self, value, data, response):
        return self.pages_url_builder(value, data, response)
def parse_redirect(self, response):
    if response.status == 301:
        url = response.headers['Location'].decode('utf-8').replace('open?', 'uc?export=download&')
        yield self.build_request(url, formatter=parameters('id'))
    else:
        yield self.build_file_error_from_response(response)
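# Worked example of the rewrite above: the 'open?' -> 'uc?export=download&'
# substitution matches Google Drive's direct-download URL pattern. The file id
# below is hypothetical.
location = 'https://drive.google.com/open?id=FILE_ID'
print(location.replace('open?', 'uc?export=download&'))
# -> https://drive.google.com/uc?export=download&id=FILE_ID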
def parse(self, response):
    data = json.loads(response.text)
    pattern = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}'

    # If this is the first URL, iterate over all the pages to collect all the process ids to query.
    if response.request.meta['first']:
        total = data['meta']['totalPages']
        for page in range(2, total + 1):
            yield self.build_request(
                self.base_list_url.format(page),
                formatter=parameters('page'),
                meta={
                    'meta': True,
                    'first': False,
                },
                dont_filter=True
            )

    # A "meta" request is a page that lists the process ids to query.
    if response.request.meta['meta']:
        # Now that we have the ids, iterate over them without duplicates, and make the
        # final requests, this time for the release packages.
        for row in data['results']:
            if row['idLlamado'] and row['idLlamado'] not in self.release_ids:
                self.release_ids.append(row['idLlamado'])
                yield self.build_request(
                    pattern.format(row['idLlamado']),
                    formatter=components(-1),
                    meta={
                        'meta': False,
                        'first': False,
                    },
                    dont_filter=True
                )
    else:
        yield self.build_file_from_response(response, data_type=self.data_type)
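# Design note as a hedged sketch, not the original spider's code: release_ids
# is only used for membership tests, so a set gives O(1) lookups where the
# list's `in` scans every id seen so far. The ids below are hypothetical.
release_ids = set()
for id_llamado in (388252, 193725, 388252):  # the third id is a duplicate
    if id_llamado and id_llamado not in release_ids:
        release_ids.add(id_llamado)
print(sorted(release_ids))
# -> [193725, 388252]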
class DominicanRepublicPortal(LinksSpider):
    """
    Domain
      Dirección General de Contrataciones Públicas (DGCP)
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2018-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
    API documentation
      http://148.101.176.123:48080/ocdsdr/docs
    """
    name = 'dominican_republic_api'

    # BaseSpider
    default_from_date = '2018-01-01'

    # SimpleSpider
    data_type = 'release_package'

    # LinksSpider
    next_page_formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'http://148.101.176.123:48080/ocdsdr/api/v1/releases'
        if self.from_date and self.until_date:
            url = f"{url}/byDatesBetween/{self.from_date.strftime('%Y-%m-%d')}/{self.until_date.strftime('%Y-%m-%d')}"
        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
class Colombia(LinksSpider):
    """
    Domain
      Colombia Compra Eficiente (CCE)
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2011-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
      start_page
        The page number from which to start crawling.
    API documentation
      https://www.colombiacompra.gov.co/transparencia/api
    Swagger API documentation
      https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/
    """
    name = 'colombia'
    next_page_formatter = staticmethod(parameters('_id'))
    default_from_date = '2011-01-01'
    data_type = 'release_package'

    def start_requests(self):
        base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
        if self.from_date and self.until_date:
            from_date = self.from_date.strftime(self.date_format)
            until_date = self.until_date.strftime(self.date_format)
            base_url += f'/dates/{from_date}/{until_date}'
        base_url += '?page={}'

        start_page = 1
        if hasattr(self, 'start_page'):
            start_page = int(self.start_page)
        yield self.build_request(base_url.format(start_page), formatter=parameters('page'))

    def retry(self, response, reason):
        url = response.request.url
        self.logger.info(reason.format(url=url, status=response.status))
        time.sleep(120 * 60)
        yield scrapy.Request(url, dont_filter=True, meta=response.request.meta)

    def parse(self, response):
        # Every day at a certain hour, Colombia runs a process that drops the database and makes the services
        # unavailable for about 120 minutes. Since Colombia has a lot of data, the spider takes more than one
        # day to scrape it all, so it will eventually hit this window. When the problem occurs (a 503 status
        # or invalid JSON), wait 120 minutes and then continue.
        try:
            if self.is_http_success(response):
                yield self.build_file_from_response(response, data_type=self.data_type)
                yield self.next_link(response)
            elif response.status == 503:
                # `retry` is a generator, so it must be consumed with `yield from`.
                yield from self.retry(response, 'Sleeping due to HTTP error {status} from {url}')
            else:
                yield self.build_file_error_from_response(response)
        except JSONDecodeError:
            yield from self.retry(response, 'Sleeping due to JSONDecodeError from {url}')
def parse_list(self, response):
    yield from self.parse(response)

    if not self.sample:
        data = json.loads(response.text)
        total = data['maxPage']
        for page in range(2, total + 1):
            url = replace_parameter(response.request.url, 'page', page)
            yield self.build_request(url, formatter=parameters('page'))
class PortugalBase(LinksSpider):
    # BaseSpider
    default_from_date = '2010-01-01'

    # LinksSpider
    next_page_formatter = staticmethod(parameters('offset'))

    # We will wait 1, 2, 4, 8, 16 minutes (31 minutes in total).
    max_retries = 5
    initial_wait_time = 60

    def start_requests(self):
        url = self.url
        if self.from_date and self.until_date:
            url = f'{url}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'
        yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})

    # https://github.com/scrapy/scrapy/blob/master/scrapy/downloadermiddlewares/retry.py
    def parse(self, response):
        retries = response.request.meta.get('retries', 0) + 1
        wait_time = response.request.meta.get('wait_time', self.initial_wait_time // 2) * 2

        # Every ~36,000 requests, the API returns HTTP errors. After a few minutes, it starts working again.
        # The number of failed attempts in the log messages includes the original request.
        # https://github.com/open-contracting/kingfisher-collect/issues/545#issuecomment-762768460
        if self.is_http_success(response):
            yield from super().parse(response)
        elif retries <= self.max_retries:
            request = response.request.copy()
            request.meta['retries'] = retries
            request.meta['wait_time'] = wait_time
            request.dont_filter = True

            self.logger.debug(
                'Retrying %(request)s in %(wait_time)ds (failed %(failures)d times): HTTP %(status)d',
                {'request': response.request, 'failures': retries, 'status': response.status,
                 'wait_time': wait_time},
                extra={'spider': self}
            )
            yield request
        else:
            self.logger.error(
                'Gave up retrying %(request)s (failed %(failures)d times): HTTP %(status)d',
                {'request': response.request, 'failures': retries, 'status': response.status},
                extra={'spider': self}
            )
            yield self.build_file_error_from_response(response)
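# A quick check of the backoff comment above: with initial_wait_time = 60, the
# expression meta.get('wait_time', initial_wait_time // 2) * 2 yields 60s on
# the first retry and doubles on each later one.
initial_wait_time = 60
max_retries = 5
wait_time = initial_wait_time // 2
for retry in range(1, max_retries + 1):
    wait_time *= 2
    print(f'retry {retry}: wait {wait_time // 60} min')
# -> 1, 2, 4, 8, 16 minutes, i.e. 31 minutes in total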
def start_requests(self):
    if self.sample:
        url = self.url.format(step=self.step, page=0)
        yield self.build_request(url, formatter=parameters('pageNumber'))
    else:
        yield scrapy.Request(
            'https://opencontracting.makueni.go.ke/api/ocds/release/count',
            meta={'file_name': 'count.json'},
            callback=self.parse_count
        )
def request_range(self, start_date, end_date, search_h):
    return self.build_request(
        self.base_page_url.format(start_date, end_date),
        formatter=parameters('releasedate__gte', 'releasedate__lte'),
        meta={
            'release_date': start_date,
            'search_h': search_h,
        },
        headers={'Accept': '*/*', 'Content-Type': 'application/json'}
    )
def parse_list(self, response):
    yield from self.parse(response)

    if not self.sample:
        data = json.loads(response.text)
        limit = data['meta']['pagination']['limit']
        total = data['meta']['count']
        # The first response already covers the first `limit` results, so start at that offset.
        for offset in range(limit, total, self.step):
            url = replace_parameter(response.request.url, 'offset', offset)
            yield self.build_request(url, formatter=parameters('offset'))
def parse_list(self, response):
    pattern = 'https://api.quienesquien.wiki/v2/contracts?limit={limit}&offset={offset}'

    limit = 1000
    count = json.loads(response.text)['data'][0]['collections']['contracts']['count']
    for offset in range(ceil(count / limit)):
        url = pattern.format(limit=limit, offset=offset * limit)
        yield self.build_request(url, formatter=parameters('offset'))
        if self.sample:
            break
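# Worked example of the offset arithmetic above, with a hypothetical count:
# 2,500 contracts at limit=1000 need ceil(2500 / 1000) = 3 requests, at
# offsets 0, 1000 and 2000.
from math import ceil

count, limit = 2500, 1000
print([offset * limit for offset in range(ceil(count / limit))])
# -> [0, 1000, 2000]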
def start_requests(self):
    url = f'{self.base_url}/search/processes?tipo_fecha=fecha_release&' \
          f'fecha_desde={self.from_date.strftime(self.date_format)}-04:00&' \
          f'fecha_hasta={self.until_date.strftime(self.date_format)}-04:00'
    yield self.build_request(
        url,
        formatter=parameters('fecha_desde'),
        # Send duplicate requests when the token expires and when continuing from the saved last_request.
        dont_filter=True,
        callback=self.parse_pages
    )
def parse_list(self, response):
    base_url = 'http://public.eprocurement.systems/ocds/tenders/'

    data = json.loads(response.text)
    # The last page returns an empty JSON object.
    if not data:
        return

    for item in data['data']:
        yield self.build_request(base_url + item['ocid'], formatter=components(-1))

    url = replace_parameters(response.request.url, offset=data['offset'])
    yield self.build_request(url, formatter=parameters('offset'), callback=self.parse_list)
def parse_data(self, response):
    pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}'

    data = response.json()
    for pdes in data['data']['data']:
        for plans in pdes['procurement_plans']:
            for tag in ('planning', 'tender', 'award', 'contract'):
                yield self.build_request(
                    pattern.format(tag, plans['financial_year'], plans['pde_id']),
                    formatter=join(components(-1), parameters('fy', 'pde'))
                )
def parse_list(self, response):
    data = response.json()
    for item in data['data']:
        for resource in item['resources']:
            description = resource['description']
            if description and 'ocds' in description.lower():
                yield self.build_request(resource['url'], formatter=components(-2))

    next_page = data.get('next_page')
    if next_page:
        yield self.build_request(next_page, formatter=parameters('page'), callback=self.parse_list)
def parse_list(self, response):
    yield from self.parse(response)

    if not self.sample:
        data = json.loads(response.text)
        page = data['pagination']['page']
        total = data['pagination']['total']
        limit = data['pagination']['pageSize']
        # range() excludes its stop value, so add 1 to reach the last page.
        for page in range(page + 1, ceil(total / limit) + 1):
            url = replace_parameter(response.request.url, 'page', page)
            yield self.build_request(url, formatter=parameters('page'))
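# Worked example of the page arithmetic above, with hypothetical totals:
# 95 records at pageSize=10 span pages 1..10, and page 1 is the response
# being parsed, so the loop must produce range(2, ceil(95 / 10) + 1).
from math import ceil

page, total, limit = 1, 95, 10
print(list(range(page + 1, ceil(total / limit) + 1)))
# -> [2, 3, 4, 5, 6, 7, 8, 9, 10]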
class Kyrgyzstan(LinksSpider):
    """
    Domain
      Ministry of Finance
    """
    name = 'kyrgyzstan'
    data_type = 'release_package'
    next_page_formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        yield scrapy.Request('http://ocds.zakupki.gov.kg/api/tendering', meta={'file_name': 'offset-0.json'})
def parse_list(self, response):
    data = response.json()
    # The last page returns an empty JSON object.
    if not data:
        return

    for item in data['data']:
        url = replace_parameters(response.request.url, offset=None) + item['ocid']
        yield self.build_request(url, formatter=components(-2))

    url = replace_parameters(response.request.url, offset=data['offset'])
    yield self.build_request(url, formatter=join(components(-1), parameters('offset')), callback=self.parse_list)
def start_requests(self):
    # Paraguay Hacienda has a service that returns all the ids we need in order to get the release
    # packages, so we first iterate over this paginated list.
    yield self.build_request(
        self.base_list_url.format(1),
        formatter=parameters('page'),
        meta={
            'meta': True,
            'first': True,
        },
        # Send duplicate requests when the token expires and when continuing from the saved last_request.
        dont_filter=True,
    )
def parse_list(self, response):
    pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes?page={}'

    if self.sample:
        total = 1
    else:
        data = json.loads(response.text)
        total = data['data']['last_page']

    for page in range(2, total + 1):
        yield self.build_request(pattern.format(page), formatter=parameters('page'), callback=self.parse_data)
class PortugalBase(LinksSpider):
    default_from_date = '2010-01-01'
    next_page_formatter = staticmethod(parameters('offset'))

    # The API returns HTTP 429 after a certain number of requests.
    download_delay = 1
    # The API sometimes returns HTTP 503.
    custom_settings = {'RETRY_TIMES': 10}

    def start_requests(self):
        url = self.url
        if self.from_date and self.until_date:
            url = f'{url}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'
        yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})
class Armenia(LinksSpider):
    """
    Spider arguments
      sample
        Download only the first release package in the dataset.
    """
    name = 'armenia'
    data_type = 'release_package'
    next_pointer = '/next_page/uri'
    next_page_formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        url = 'https://armeps.am/ocds/release'
        yield scrapy.Request(url, meta={'file_name': 'offset-0.json'})
def parse_list(self, response):
    data = response.json()
    for item in data['data']:
        url = item['uri']
        if url:
            yield self.build_request(url, self.get_formatter())
        else:
            next_page_url = data.get('next_page_url')
            if next_page_url:
                yield self.build_request(
                    next_page_url,
                    formatter=join(self.get_formatter(), parameters('page')),
                    callback=self.parse_list
                )
class GeorgiaReleases(LinksSpider):
    """
    Domain
      State Procurement Agency (SPA)
    Swagger API documentation
      https://odapi.spa.ge/api/swagger.ui
    """
    name = 'georgia_releases'
    data_type = 'release_package'
    next_page_formatter = staticmethod(parameters('page'))

    def start_requests(self):
        url = 'https://odapi.spa.ge/api/releases.json'
        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})