Пример #1
0
    def url_builder(self, value, data, response):
        """
        Build a URL from ``self.pattern``, using parts of the current request's URL.

        The system, year and month are extracted from ``response.request.url`` with ``components(...)``. The pattern
        is then formatted with the system, the first day of that month, ``value`` and ``self.limit``.

        ``data`` is accepted but not used by this method.
        """
        request_url = response.request.url

        system = components(-5, -4)(request_url)
        year = int(components(-4, -3)(request_url))
        # Leading zeros are stripped from the month before it is converted to an integer.
        month_text = components(-3, -2)(request_url)
        month = int(month_text.lstrip('0'))

        first_of_month = date(year, month, 1)
        return self.pattern.format(system, first_of_month, value, self.limit)
Пример #2
0
    def parse_list(self, response):
        """
        Yield one request per item on the current page, followed by a request for the next page.

        Nothing is yielded if the response's JSON is empty.
        """
        listing = response.json()
        if not listing:
            # The last page returns an empty JSON object, so there is nothing more to request.
            return

        page_url = response.request.url

        for entry in listing['data']:
            # The item's URL is the page's URL without its offset parameter, followed by the item's OCID.
            item_url = replace_parameters(page_url, offset=None) + entry['ocid']
            yield self.build_request(item_url, formatter=components(-2))

        # The next page is requested with the offset reported by the current page.
        next_page_url = replace_parameters(page_url, offset=listing['offset'])
        next_page_formatter = join(components(-1), parameters('offset'))
        yield self.build_request(next_page_url, formatter=next_page_formatter, callback=self.parse_list)
 def parse_list(self, response):
     """
     Yield a request for each item that has a non-empty ``uri``, then a request for the next page, if any.

     In sample mode (``self.sample``), stop after the first item request, without requesting the next page.
     """
     page = json.loads(response.text)

     for entry in page['data']:
         uri = entry['uri']
         if not uri:
             continue
         yield self.build_request(uri, formatter=components(-1))
         if self.sample:
             # One request is enough for a sample: skip the remaining items and the pagination.
             return

     next_page_url = page.get('next_page_url')
     if next_page_url:
         next_page_formatter = join(components(-1), parameters('page'))
         yield self.build_request(next_page_url, formatter=next_page_formatter, callback=self.parse_list)
Пример #4
0
 def parse_list(self, response):
     """
     Yield a request for each URL in the JSON array found in the seventh text node of the page's body.
     """
     text_nodes = response.xpath('//body//text()').getall()
     # Remove the last item in the list, so that the string is valid JSON.
     url_array = text_nodes[6].replace(",\r\n\r\nhttps://www.ppra.org.pk", "")

     for url in json.loads(url_array):
         yield self.build_request(url, formatter=components(-2))
Пример #5
0
    def parse_list(self, response):
        """
        Yield a request for each ``.fileLink`` href that contains ``/JSON_DGCP_``.

        Links that don't contain that marker are ignored.
        """
        # A single pass is enough: each link is tested once, as it is visited.
        for url in response.css('.fileLink::attr(href)').getall():
            if '/JSON_DGCP_' in url:
                yield self.build_request(url, formatter=components(-1))
Пример #6
0
    def parse_list(self, response):
        """
        Yield a request for each ``//item/link`` URL in the response.

        In sample mode (``self.sample``), only the first URL is requested. If the response contains no URLs,
        nothing is yielded.
        """
        urls = response.xpath('//item/link/text()').getall()
        if self.sample:
            # Slicing, unlike indexing, doesn't raise IndexError if the list is empty.
            urls = urls[:1]

        for url in urls:
            yield self.build_request(url, formatter=components(-1))
Пример #7
0
    def parse(self, response):
        """
        Handle the three kinds of response, which are distinguished by the request's ``first`` and ``meta`` meta keys.

        - ``first`` is truthy: yield a request for each remaining list page (2 to ``totalPages``).
        - ``meta`` is truthy: yield a release package request for each not-yet-seen ``idLlamado`` on the list page,
          recording the ID in ``self.release_ids``.
        - ``meta`` is falsy: yield the response as a file.
        """
        pattern = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}'
        data = json.loads(response.text)

        # The first response reports the number of pages, so the other list pages can be requested.
        if response.request.meta['first']:
            last_page = data['meta']['totalPages']
            for page_number in range(2, last_page + 1):
                yield self.build_request(
                    self.base_list_url.format(page_number),
                    formatter=parameters('page'),
                    meta={'meta': True, 'first': False},
                    dont_filter=True,
                )

        # A response that isn't a list page is a release package.
        if not response.request.meta['meta']:
            yield self.build_file_from_response(response, data_type=self.data_type)
            return

        # A list page has the process IDs to query. Each ID is requested only once.
        for row in data['results']:
            process_id = row['idLlamado']
            if not process_id or process_id in self.release_ids:
                continue
            self.release_ids.append(process_id)
            yield self.build_request(
                pattern.format(process_id),
                formatter=components(-1),
                meta={'meta': False, 'first': False},
                dont_filter=True,
            )
Пример #8
0
 def start_requests(self):
     """
     Yield a single request, whose URL is ``self.url`` formatted with ``self.from_date`` and ``self.until_date``.
     """
     url_template = self.url
     date_format = "%Y%m%d"
     # The dates are substituted as YYYYMMDD, so that the URL looks like
     # http://www.gekoware.com/swmp/api/ocds/20190101/20201005
     start = self.from_date.strftime(date_format)
     end = self.until_date.strftime(date_format)
     yield self.build_request(url_template.format(start, end), formatter=components(-2))
Пример #9
0
 def parse_list(self, response):
     """
     Yield a request for each href of the page's ``a.enlaces_contenido`` links.

     In sample mode (``self.sample``), only the first URL is requested. If the page has no such links, nothing
     is yielded.
     """
     urls = response.xpath('//a[@class="enlaces_contenido"]/@href').getall()
     if self.sample:
         # Slicing, unlike indexing, doesn't raise IndexError if the list is empty.
         urls = urls[:1]
     for url in urls:
         # URL looks like https://apiocds.colombiacompra.gov.co:8443/ArchivosSECOP/Archivos/SI2011.zip
         yield self.build_request(url, formatter=components(-1))
 def parse_list(self, response):
     """
     Yield a request for each resource whose format is JSON (compared case-insensitively).
     """
     resources = json.loads(response.text)['result']['resources']
     for resource in resources:
         if resource['format'].upper() != 'JSON':
             continue
         # Presently, only one URL matches.
         yield self.build_request(resource['url'], formatter=components(-1))
Пример #11
0
    def start_requests(self):
        """
        Yield a list request for each month that ``date_range_by_month`` produces between the start and stop dates.

        If the spider has a ``year`` attribute, the range is January to December of that year, or January to the
        current month if it is the current year. Otherwise, the range is January 2008 to today.

        In sample mode (``self.sample``), the range is reduced to the stop date only.
        """
        today = date.today()

        if not hasattr(self, 'year'):
            start = date(2008, 1, 1)
            stop = today
        else:
            year = int(self.year)
            start = date(year, 1, 1)
            # Don't request months that are in the future.
            last_month = today.month if year == today.year else 12
            stop = date(year, last_month, 1)

        if self.sample:
            start = stop

        for month_start in date_range_by_month(start, stop):
            url = self.base_list_url.format(month_start, 0, self.limit)
            meta = {
                'year': month_start.year,
                'month': month_start.month,
            }
            yield self.build_request(url, formatter=components(-4, -1), meta=meta, callback=self.parse_list)
Пример #12
0
 def parse_list(self, response):
     """
     Yield a request for the raw content of each listed file whose path ends with ``.xlsx``.
     """
     paths = response.xpath('//div[@role="rowheader"]/span/a/@href').getall()
     for path in paths:
         if not path.endswith('.xlsx'):
             continue
         raw_url = f'https://github.com{path}?raw=true'
         yield self.build_request(raw_url, formatter=components(-1))
Пример #13
0
 def parse_list(self, response):
     """
     Yield a request for the href of each link whose text contains ``[json]``.

     In sample mode (``self.sample``), only the first URL is requested. If the page has no such links, nothing
     is yielded.
     """
     urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
     if self.sample:
         # Slicing, unlike indexing, doesn't raise IndexError if the list is empty.
         urls = urls[:1]
     for url in urls:
         # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
         yield self.build_request(url, formatter=components(-1))
Пример #14
0
 def parse_list(self, response):
     """
     Yield a request for each URL in the response's JSON array.

     In sample mode (``self.sample``), only the first URL is requested. If the array is empty, nothing is yielded.
     """
     urls = json.loads(response.text)
     if self.sample:
         # Slicing, unlike indexing, doesn't raise IndexError if the array is empty.
         urls = urls[:1]
     for url in urls:
         # URL looks like https://ocds.ageops.net/api/record/5ed2a62c4192f32c8c74a4e5
         yield self.build_request(url, formatter=components(-1))
Пример #15
0
    def parse_record_package(self, response):
        """
        Yield the response as a record package file, then a request for each URL in its ``packages`` array, if any.
        """
        yield self.build_file_from_response(response, data_type='record_package')

        package = json.loads(response.text)
        if 'packages' not in package:
            return

        for package_url in package['packages']:
            yield self.build_request(package_url, formatter=components(-1))
Пример #16
0
    def parse_list(self, response):
        """
        Yield a request for each URL in the ``packagesPerMonth`` array of the JSON response.

        The formatter is built with ``extension='zip'``.
        """
        for package_url in response.json()['packagesPerMonth']:
            # URL looks like https://www.zppa.org.zm/ocds/services/recordpackage/getrecordpackage/2016/7
            file_name_formatter = join(components(-2), extension='zip')
            yield self.build_request(package_url, formatter=file_name_formatter)
Пример #17
0
 def parse_list(self, response):
     """
     Yield a request for each resource whose format is exactly ``JSON``, across all results.

     Redirects are disabled on these requests (``dont_redirect``), and responses are handled by
     ``parse_redirect``.
     """
     results = response.json()['result']['results']
     for result in results:
         json_resources = (resource for resource in result['resources'] if resource['format'] == 'JSON')
         for resource in json_resources:
             # http://bit.ly/ConcentradoINAI
             yield self.build_request(
                 resource['url'],
                 formatter=components(-1),
                 meta={'dont_redirect': True},
                 callback=self.parse_redirect,
             )
Пример #18
0
 def parse_list(self, response):
     """
     Yield a request for each resource whose description contains "ocds" (in any letter case).

     Resources with a null or empty description are skipped.
     """
     for item in response.json()['data']:
         for resource in item['resources']:
             description = resource['description']
             if not description or 'ocds' not in description.lower():
                 continue
             # Presently, only one URL matches.
             yield self.build_request(resource['url'], formatter=components(-2))
Пример #19
0
 def parse_list(self, response):
     """
     Yield a request for each URL in the response's JSON array, handled by ``parse_release_list``.

     If both ``self.from_date`` and ``self.until_date`` are set, the last 10 characters of each URL are parsed
     with ``self.date_format``, and URLs whose date is outside that range (inclusive) are skipped.
     """
     for url in response.json():
         if self.from_date and self.until_date:
             url_date = datetime.strptime(url[-10:], self.date_format)
             in_range = self.from_date <= url_date <= self.until_date
             if not in_range:
                 continue
         yield self.build_request(url, formatter=components(-2), callback=self.parse_release_list)
Пример #20
0
 def parse_list(self, response):
     """
     Yield a request for the link that sorts last among the page's hrefs, if the page has any links.
     """
     hrefs = response.xpath('//a/@href').getall()
     if not hrefs:
         return

     # Each link contains different versions of SERCOP's emergency dataset, only the newest should be downloaded
     # URL format: ./archivos/ocds-YYYY-MM-DD.json
     newest = max(hrefs)
     yield self.build_request(f'{response.request.url}{newest}', formatter=components(-1))
    def start_requests(self):
        """
        Yield a request for each year that ``date_range_by_year`` produces between 2013 and the current year.

        Responses are handled by ``parse_list``.
        """
        url_template = 'https://birms.bandung.go.id/api/packages/year/{}'
        first_year = 2013
        current_year = date.today().year

        for year in date_range_by_year(first_year, current_year):
            url = url_template.format(year)
            yield self.build_request(url, formatter=components(-1), callback=self.parse_list)
Пример #22
0
    def parse_list(self, response):
        """
        Yield a request for the ``urls.json`` URL of each item in the response's JSON array.

        In sample mode (``self.sample``), only the first item is requested. If the array is empty, nothing is
        yielded.
        """
        items = json.loads(response.text)
        if self.sample:
            # Slicing, unlike indexing, doesn't raise IndexError if the array is empty.
            items = items[:1]

        for item in items:
            yield self.build_request(item['urls']['json'], formatter=components(-1))
 def parse_list(self, response):
     """
     Yield a request for each URL in the response's JSON array, handled by ``parse_release_list``.
     """
     # A JSON array of URL strings, in reverse chronological order.
     release_list_urls = json.loads(response.text)
     for release_list_url in release_list_urls:
         # URL looks like https://ocds.ageops.net/api/ocds/releases/2020-05-30
         yield self.build_request(release_list_url, formatter=components(-1), callback=self.parse_release_list)
Пример #24
0
    def parse_list(self, response):
        """
        Yield a request for each URL in the ``packagesPerMonth`` array of the JSON response.

        In sample mode (``self.sample``), only the first URL is requested. If the array is empty, nothing is
        yielded.
        """
        urls = json.loads(response.text)['packagesPerMonth']
        if self.sample:
            # Slicing, unlike indexing, doesn't raise IndexError if the array is empty.
            urls = urls[:1]

        for url in urls:
            # URL looks like https://www.zppa.org.zm/ocds/services/recordpackage/getrecordpackage/2016/7
            yield self.build_request(url, formatter=components(-2))
Пример #25
0
 def parse_list(self, response):
     """
     Yield a request for the JSON file of each fiscal year listed in the response, with redirects disabled.
     """
     url_template = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
     fiscal_years = response.json()['data']['fiscal_years']
     for fiscal_year in fiscal_years:
         url = url_template.format(fiscal_year['name'])
         # A URL might redirect to https://admin.ims.susasan.org/login
         yield self.build_request(url, formatter=components(-1), meta={'dont_redirect': True})
    def start_requests(self):
        """
        Yield a request for the ZIP file of each month that ``date_range_by_month`` produces between January 2009
        and the first day of the current month.
        """
        url_template = 'https://ocds.blob.core.windows.net/ocds/{0.year:d}{0.month:02d}.zip'
        first_month = date(2009, 1, 1)
        current_month = date.today().replace(day=1)

        for month_start in date_range_by_month(first_month, current_month):
            # The file is named after the four-digit year and the zero-padded month, like 200901.zip
            yield self.build_request(url_template.format(month_start), formatter=components(-1))
Пример #27
0
 def parse_list(self, response):
     """
     Yield a request for the JSON file of each fiscal year listed in the response.

     In sample mode (``self.sample``), stop after the first request.
     """
     url_template = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
     fiscal_years = json.loads(response.text)['data']['fiscal_years']
     for fiscal_year in fiscal_years:
         url = url_template.format(fiscal_year['name'])
         yield self.build_request(url, formatter=components(-1))
         if self.sample:
             # One request is enough for a sample.
             return
Пример #28
0
    def parse_list(self, response):
        """
        Yield a request for each URL in the ``packagesPerMonth`` array of the JSON response.

        The network location of each URL is replaced with that of the current request's URL.
        """
        package_urls = response.json()['packagesPerMonth']
        request_netloc = urlsplit(response.request.url).netloc

        for package_url in package_urls:
            # URL looks like http://malta-demo-server.eurodyn.com/ocds/services/recordpackage/getrecordpackage/2020/1
            rewritten = urlsplit(package_url)._replace(netloc=request_netloc)
            yield self.build_request(rewritten.geturl(), formatter=join(components(-2), extension='zip'))
Пример #29
0
 def parse_list(self, response):
     """
     Yield a request for each resource whose description contains "ocds", handled by ``parse_data``.

     Resources with a null or empty description are skipped. The request's ``file_name`` meta key is set to the
     result of ``components(-1)`` applied to the resource's URL.
     """
     data = json.loads(response.text)
     for resource in data['result']['resources']:
         description = resource['description']
         # Test for a null description first, because `in` raises TypeError on None.
         if description and 'ocds' in description:
             # Presently, only one URL matches.
             url = resource['url']
             yield scrapy.Request(url, meta={'file_name': components(-1)(url)}, callback=self.parse_data)
Пример #30
0
 def parse_list(self, response):
     """
     Yield a request for each resource whose format is exactly ``JSON``, across all results.

     Redirects are disabled on these requests (``dont_redirect``), and responses are handled by
     ``parse_redirect``.
     """
     payload = json.loads(response.text)
     for result in payload['result']['results']:
         for resource in result['resources']:
             if resource['format'] != 'JSON':
                 continue
             yield self.build_request(
                 resource['url'],
                 formatter=components(-1),
                 meta={'dont_redirect': True},
                 callback=self.parse_redirect,
             )