def url_builder(self, value, data, response):
    system = components(-5, -4)(response.request.url)
    year = int(components(-4, -3)(response.request.url))
    # int() already ignores leading zeros, so the month component needs no stripping.
    month = int(components(-3, -2)(response.request.url))
    return self.pattern.format(system, date(year, month, 1), value, self.limit)
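# These snippets rely on formatter helpers such as components(start, stop), which return
# callables that derive a file name from a URL. A minimal sketch, assuming the helper
# selects a slice of the URL's non-empty path components (the project's real utility
# function may differ):
from urllib.parse import urlsplit

def components(start, stop=None):
    """Returns a function that joins the selected path components of a URL."""
    def wrapper(url):
        path = [part for part in urlsplit(url).path.split('/') if part]
        return '-'.join(path[start:stop])
    return wrapper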
def parse_list(self, response):
    data = response.json()
    # The last page returns an empty JSON object.
    if not data:
        return

    for item in data['data']:
        url = replace_parameters(response.request.url, offset=None) + item['ocid']
        yield self.build_request(url, formatter=components(-2))

    url = replace_parameters(response.request.url, offset=data['offset'])
    yield self.build_request(url, formatter=join(components(-1), parameters('offset')),
                             callback=self.parse_list)
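# replace_parameters is assumed to rewrite a URL's query string, removing a parameter
# when its value is None. A minimal sketch under that assumption:
from urllib.parse import parse_qs, urlencode, urlsplit

def replace_parameters(url, **kwargs):
    """Returns the URL with the given query string parameters replaced or removed."""
    parsed = urlsplit(url)
    query = parse_qs(parsed.query)
    for key, value in kwargs.items():
        if value is None:
            query.pop(key, None)
        else:
            query[key] = [value]
    return parsed._replace(query=urlencode(query, doseq=True)).geturl()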
def parse_list(self, response):
    data = json.loads(response.text)
    for item in data['data']:
        url = item['uri']
        if url:
            yield self.build_request(url, formatter=components(-1))
        if self.sample:
            break
    else:
        # The for-else requests the next page only if the loop wasn't cut short by sampling.
        next_page_url = data.get('next_page_url')
        if next_page_url:
            yield self.build_request(next_page_url, formatter=join(components(-1), parameters('page')),
                                     callback=self.parse_list)
def parse_list(self, response):
    # Remove the trailing item so that the text parses as valid JSON.
    urls = json.loads(
        response.xpath('//body//text()').getall()[6].replace(
            ",\r\n\r\nhttps://www.ppra.org.pk", ""))
    for url in urls:
        yield self.build_request(url, formatter=components(-2))
def parse_list(self, response):
    # Select only the links to JSON files.
    urls = response.css('.fileLink::attr(href)').getall()
    for url in urls:
        if '/JSON_DGCP_' in url:
            yield self.build_request(url, formatter=components(-1))
def parse_list(self, response):
    urls = response.xpath('//item/link/text()').getall()
    if self.sample:
        urls = [urls[0]]
    for url in urls:
        yield self.build_request(url, formatter=components(-1))
def parse(self, response):
    data = json.loads(response.text)
    pattern = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}'

    # If this is the first page, iterate over all the pages to collect the process IDs to query.
    if response.request.meta['first']:
        total = data['meta']['totalPages']
        for page in range(2, total + 1):
            yield self.build_request(
                self.base_list_url.format(page),
                formatter=parameters('page'),
                meta={'meta': True, 'first': False},
                dont_filter=True
            )

    # A 'meta' request returns a page that lists the process IDs to query.
    if response.request.meta['meta']:
        # Iterate over the IDs, skipping duplicates, and make the final requests,
        # this time for the release package of each process.
        for row in data['results']:
            if row['idLlamado'] and row['idLlamado'] not in self.release_ids:
                self.release_ids.append(row['idLlamado'])
                yield self.build_request(
                    pattern.format(row['idLlamado']),
                    formatter=components(-1),
                    meta={'meta': False, 'first': False},
                    dont_filter=True
                )
    else:
        yield self.build_file_from_response(response, data_type=self.data_type)
def start_requests(self):
    # Fill in the date parameters. The URL looks like
    # http://www.gekoware.com/swmp/api/ocds/20190101/20201005
    url = self.url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d"))
    yield self.build_request(url, formatter=components(-2))
def parse_list(self, response):
    urls = response.xpath('//a[@class="enlaces_contenido"]/@href').getall()
    if self.sample:
        urls = [urls[0]]
    for url in urls:
        # URL looks like https://apiocds.colombiacompra.gov.co:8443/ArchivosSECOP/Archivos/SI2011.zip
        yield self.build_request(url, formatter=components(-1))
def parse_list(self, response):
    data = json.loads(response.text)
    for resource in data['result']['resources']:
        if resource['format'].upper() == 'JSON':
            # Presently, only one URL matches.
            yield self.build_request(resource['url'], formatter=components(-1))
def start_requests(self):
    today = date.today()
    if hasattr(self, 'year'):
        year = int(self.year)
        start = date(year, 1, 1)
        stop = date(year, 12, 1)
        if year == today.year:
            stop = stop.replace(month=today.month)
    else:
        start = date(2008, 1, 1)
        stop = today
    if self.sample:
        start = stop

    for d in date_range_by_month(start, stop):
        yield self.build_request(
            self.base_list_url.format(d, 0, self.limit),
            formatter=components(-4, -1),
            meta={'year': d.year, 'month': d.month},
            callback=self.parse_list
        )
def parse_list(self, response):
    for path in response.xpath('//div[@role="rowheader"]/span/a/@href').getall():
        if path.endswith('.xlsx'):
            yield self.build_request(f'https://github.com{path}?raw=true', formatter=components(-1))
def parse_list(self, response):
    urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
    if self.sample:
        urls = [urls[0]]
    for url in urls:
        # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
        yield self.build_request(url, formatter=components(-1))
def parse_list(self, response):
    urls = json.loads(response.text)
    if self.sample:
        urls = [urls[0]]
    for url in urls:
        # URL looks like https://ocds.ageops.net/api/record/5ed2a62c4192f32c8c74a4e5
        yield self.build_request(url, formatter=components(-1))
def parse_record_package(self, response):
    yield self.build_file_from_response(response, data_type='record_package')
    data = json.loads(response.text)
    if 'packages' in data:
        for url in data['packages']:
            yield self.build_request(url, formatter=components(-1))
def parse_list(self, response):
    urls = response.json()['packagesPerMonth']
    for url in urls:
        # URL looks like https://www.zppa.org.zm/ocds/services/recordpackage/getrecordpackage/2016/7
        yield self.build_request(url, formatter=join(components(-2), extension='zip'))
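# parameters and join are assumed to compose formatters: parameters extracts query string
# values, and join concatenates the results of several formatters, optionally appending a
# file extension. A minimal sketch under those assumptions:
from urllib.parse import parse_qs, urlsplit

def parameters(*keys):
    """Returns a function that joins the selected query string parameters of a URL."""
    def wrapper(url):
        query = parse_qs(urlsplit(url).query)
        return '-'.join(f'{key}-{query[key][0]}' for key in keys)
    return wrapper

def join(*formatters, extension=None):
    """Returns a function that joins the results of the given formatters."""
    def wrapper(url):
        value = '-'.join(formatter(url) for formatter in formatters)
        if extension:
            value += f'.{extension}'
        return value
    return wrapper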
def parse_list(self, response):
    data = response.json()
    for result in data['result']['results']:
        for resource in result['resources']:
            if resource['format'] == 'JSON':
                # http://bit.ly/ConcentradoINAI
                yield self.build_request(resource['url'], formatter=components(-1),
                                         meta={'dont_redirect': True}, callback=self.parse_redirect)
def parse_list(self, response):
    data = response.json()
    for item in data['data']:
        for resource in item['resources']:
            description = resource['description']
            if description and 'ocds' in description.lower():
                # Presently, only one URL matches.
                yield self.build_request(resource['url'], formatter=components(-2))
def parse_list(self, response):
    urls = response.json()
    for url in urls:
        if self.from_date and self.until_date:
            # Parse the date from the end of the URL; the local name avoids shadowing
            # the date class from the datetime module.
            url_date = datetime.strptime(url[-10:], self.date_format)
            if not (self.from_date <= url_date <= self.until_date):
                continue
        yield self.build_request(url, formatter=components(-2), callback=self.parse_release_list)
def parse_list(self, response):
    html_urls = response.xpath('//a/@href').getall()
    if html_urls:
        # Each link is a different version of SERCOP's emergency dataset; only the newest
        # should be downloaded. URLs look like ./archivos/ocds-YYYY-MM-DD.json, so the
        # lexicographic maximum is the newest.
        yield self.build_request(f'{response.request.url}{max(html_urls)}', formatter=components(-1))
def start_requests(self):
    pattern = 'https://birms.bandung.go.id/api/packages/year/{}'
    start = 2013
    stop = date.today().year
    for year in date_range_by_year(start, stop):
        yield self.build_request(pattern.format(year), formatter=components(-1), callback=self.parse_list)
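# date_range_by_year is assumed to yield each year in the closed range as an integer.
# A minimal sketch under that assumption:
def date_range_by_year(start, stop):
    """Yields each year from start to stop, inclusive."""
    return range(start, stop + 1)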
def parse_list(self, response):
    items = json.loads(response.text)
    if self.sample:
        items = [items[0]]
    for item in items:
        url = item['urls']['json']
        yield self.build_request(url, formatter=components(-1))
def parse_list(self, response):
    # A JSON array of URL strings, in reverse chronological order.
    urls = json.loads(response.text)
    for url in urls:
        # URL looks like https://ocds.ageops.net/api/ocds/releases/2020-05-30
        yield self.build_request(url, formatter=components(-1), callback=self.parse_release_list)
def parse_list(self, response):
    urls = json.loads(response.text)['packagesPerMonth']
    if self.sample:
        urls = [urls[0]]
    for url in urls:
        # URL looks like https://www.zppa.org.zm/ocds/services/recordpackage/getrecordpackage/2016/7
        yield self.build_request(url, formatter=components(-2))
def parse_list(self, response):
    pattern = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
    data = response.json()
    for item in data['data']['fiscal_years']:
        # A URL might redirect to https://admin.ims.susasan.org/login
        yield self.build_request(pattern.format(item['name']), formatter=components(-1),
                                 meta={'dont_redirect': True})
def start_requests(self):
    url = 'https://ocds.blob.core.windows.net/ocds/{0.year:d}{0.month:02d}.zip'
    start = date(2009, 1, 1)
    stop = date.today().replace(day=1)
    for d in date_range_by_month(start, stop):
        yield self.build_request(url.format(d), formatter=components(-1))
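# date_range_by_month is assumed to yield the first day of each month between two dates,
# inclusive. A minimal sketch under that assumption (the real helper may differ, e.g. in
# iteration order):
from datetime import date

def date_range_by_month(start, stop):
    """Yields the first day of each month from start to stop, inclusive."""
    d = date(start.year, start.month, 1)
    while d <= stop:
        yield d
        if d.month == 12:
            d = d.replace(year=d.year + 1, month=1)
        else:
            d = d.replace(month=d.month + 1)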
def parse_list(self, response):
    pattern = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
    data = json.loads(response.text)
    for item in data['data']['fiscal_years']:
        yield self.build_request(pattern.format(item['name']), formatter=components(-1))
        if self.sample:
            break
def parse_list(self, response):
    urls = response.json()['packagesPerMonth']
    netloc = urlsplit(response.request.url).netloc
    for url in urls:
        # URL looks like http://malta-demo-server.eurodyn.com/ocds/services/recordpackage/getrecordpackage/2020/1
        yield self.build_request(
            urlsplit(url)._replace(netloc=netloc).geturl(),
            formatter=join(components(-2), extension='zip'))
def parse_list(self, response):
    data = json.loads(response.text)
    for resource in data['result']['resources']:
        if 'ocds' in resource['description']:
            # Presently, only one URL matches.
            yield scrapy.Request(
                resource['url'],
                meta={'file_name': components(-1)(resource['url'])},
                callback=self.parse_data)
def parse_list(self, response):
    data = json.loads(response.text)
    for result in data['result']['results']:
        for resource in result['resources']:
            if resource['format'] == 'JSON':
                yield self.build_request(resource['url'], formatter=components(-1),
                                         meta={'dont_redirect': True}, callback=self.parse_redirect)