Example #1
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:gacm&rows=500'
        )
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        urls = []
        for result in data['result']['results']:
            for resource in result['resources']:
                # In sample mode, collect at most 10 URLs.
                if not self.sample or len(urls) < 10:
                    if resource['format'] == 'JSON' and \
                            resource['url'] != "http://datos.gob.mx/adela/api/v1/organizations/gacm/documents":
                        urls.append({
                            'url': resource['url'],
                            'filename': 'file-%s.json' % hashlib.md5(
                                resource['url'].encode('utf-8')).hexdigest(),
                            'data_type': 'release_package_list'
                            if resource['name'] == "CONCENTRADO ARCHIVO JSON"
                            else 'release_package',
                        })
        return urls
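
Every fetcher on this page calls util.get_url_request and unpacks the result as (response, errors). The helper itself is not shown here; below is a minimal sketch of what the call sites assume, built on the requests library. The stream and headers keyword arguments and the error-string format are taken from Example #11; the retry behaviour is an assumption.

import requests


def get_url_request(url, stream=False, headers=None, tries=3):
    # Sketch only: returns (response, []) on success, or (None, errors)
    # after exhausting the retries.
    errors = []
    for _ in range(tries):
        try:
            response = requests.get(url, stream=stream, headers=headers)
            if response.status_code == 200:
                return response, []
            errors.append('Request exception (Code %s): %s' %
                          (response.status_code, response.reason))
        except requests.RequestException as e:
            errors.append(str(e))
    return None, errors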
Example #2
    def gather_all_download_urls(self):
        url = 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d'
        if self.sample:
            return [{
                'url': url % 1,
                'filename': 'sample.json',
                'data_type': 'record_package_list_in_results',
            }]

        # Fetch one page up front just to read the pagination metadata.
        r = util.get_url_request(url % 2)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['pagination']['total']
        page = 1
        out = []
        limit = data['pagination']['pageSize']
        while ((page - 1) * limit) < total:
            out.append({
                'url': url % page,
                'filename': 'page%d.json' % page,
                'data_type': 'record_package_list_in_results',
            })
            page += 1
        return out
Example #3
    def gather_all_download_urls(self):
        if self.sample:
            return [{
                'url': 'https://api.colombiacompra.gov.co/releases/?page=1',
                'filename': 'sample.json',
                'data_type': 'release_package',
            }]

        r = util.get_url_request('https://api.colombiacompra.gov.co/releases/?page=1')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['links']['count']
        page = 1
        out = []
        # The page size is not passed to the API via the URL, but the API is
        # currently returning 1000 results per page, so we hard-code it.
        limit = 1000
        while (page - 1) * limit < total:
            out.append({
                'url': 'https://api.colombiacompra.gov.co/releases/?page=%d' % page,
                'filename': 'page%d.json' % page,
                'data_type': 'release_package',
            })
            page += 1
        return out
Example #4
    def gather_all_download_urls(self):
        if self.sample:
            return [{
                'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1000&offset=0',
                'filename': 'offset0.json',
                'data_type': 'release_package',
            }]

        # Request a single release just to read the total count from the metadata.
        url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        total = data['meta']['count']
        offset = 0
        out = []
        limit = 10000
        while offset < total:
            out.append({
                'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d'
                       % (limit, offset),
                'filename': 'offset%d.json' % offset,
                'data_type': 'release_package',
            })
            offset += limit
        return out
Example #5
    def save_url(self, filename, data, file_path):
        # 'meta' entries point at an HTML index page; scrape it for the
        # actual package URLs and return them as additional downloads.
        if data['data_type'] == 'meta':
            r = util.get_url_request(data['url'])
            if r[1]:
                raise Exception(r[1])
            r = r[0]
            doc = lxml.html.fromstring(r.text)

            additional = []

            for item in doc.xpath('//li'):
                url_bit = item.xpath('a')[0].get('href')
                if url_bit != 'index.html':
                    url = '%s/%s' % (data['url'], url_bit)
                    # In sample mode, collect at most 3 additional URLs.
                    if not self.sample or len(additional) < 3:
                        additional.append({
                            'url': url,
                            'filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest(),
                            'data_type': 'release_package',
                        })

            # Return (additional URLs to fetch, errors).
            return additional, []

        else:
            return [], save_content(data['url'], file_path)
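
save_url returns a pair: new download entries discovered while processing the current one, and a list of error strings. The framework's real dispatch loop is not shown on this page; a hypothetical driver consuming that contract (the queue handling and the /data/ path are placeholders) might look like:

def run_fetcher(fetcher, urls):
    # Hypothetical driver: work through a queue of download entries,
    # re-queueing whatever additional URLs save_url discovers
    # (e.g. the package links scraped from a 'meta' index page).
    queue = list(urls)
    all_errors = []
    while queue:
        entry = queue.pop(0)
        additional, errors = fetcher.save_url(
            entry['filename'], entry, '/data/%s' % entry['filename'])
        queue.extend(additional)
        all_errors.extend(errors)
    return all_errors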
Example #6
    def gather_all_download_urls(self):
        if self.sample:
            return [{
                'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1',
                'filename': 'page1.json',
                'data_type': 'release_package_list_in_results',
                'encoding': 'ISO-8859-1',
            }]

        url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]

        data = r.json()
        total = data['maxPage']
        out = []
        for page in range(1, total + 1):
            out.append({
                'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' % page,
                'filename': 'page%d.json' % page,
                'data_type': 'release_package_list_in_results',
                'encoding': 'ISO-8859-1',
            })
        return out
Example #7
    def gather_all_download_urls(self):
        url = 'http://datos.gob.mx/busca/api/3/action/'
        url += 'package_search?q=organization:inai&rows=500'
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        data = r.json()
        out = []
        for result in data['result']['results']:
            for resource in result['resources']:
                if resource['format'] == 'JSON':
                    # The resource URL redirects to a Google Drive viewer
                    # link; issue a HEAD request to read the Location header,
                    # then rewrite 'open?' into the direct-download form
                    # 'uc?export=download&'.
                    temp = resource['url'].split("//")[1]
                    conn = http.client.HTTPConnection(temp.split("/")[0])
                    name = temp.split("/")[1]
                    conn.request('HEAD', "/" + name)
                    response = conn.getresponse()
                    url = response.getheader('Location').replace(
                        "open?", "uc?export=download&")
                    out.append({
                        'url': url,
                        'filename': '{}.json'.format(name),
                        'data_type': 'release_package_list',
                        'encoding': 'utf-8-sig',  # ignore BOM
                    })
                    if self.sample:
                        return out
        return out
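
For instance, a Location header of https://drive.google.com/open?id=XYZ (the exact host and id are illustrative here) becomes the direct-download form https://drive.google.com/uc?export=download&id=XYZ after the replace.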
Example #8
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        datas = r.json()
        out = []
        for data in datas:
            # In sample mode, collect at most 10 URLs.
            if not self.sample or len(out) < 10:
                out.append({
                    'url': data['uri'],
                    'filename': 'id%s.json' % data['id'],
                    'data_type': 'release_package',
                })
        return out
Example #9
    def fetchRecordPackageIDs(self, year):
        '''
        Download the CSV file for a particular year, and
        extract the list of record package IDs.
        '''
        url = 'https://www.contrataciones.gov.py/'
        url += 'images/opendata/planificaciones/%s.csv' % year
        r = util.get_url_request(url)
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        decoded_content = r.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        id_list = []
        for row in cr:
            id_list.append(row[2])
        # Drop the first entry, which comes from the CSV header row.
        return id_list[1:]
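
A gather_all_download_urls built on this helper would iterate over the years of interest and turn each ID into a download entry. The sketch below is hypothetical: the year range is illustrative, and RECORD_PACKAGE_URL stands in for the publisher's real record-package endpoint, which this example does not show.

    def gather_all_download_urls(self):
        out = []
        for year in range(2010, 2019):  # illustrative year range
            for package_id in self.fetchRecordPackageIDs(year):
                out.append({
                    # RECORD_PACKAGE_URL is a hypothetical '%s' template for
                    # the publisher's record-package endpoint.
                    'url': RECORD_PACKAGE_URL % package_id,
                    'filename': 'id%s.json' % package_id,
                    'data_type': 'record_package',
                })
                if self.sample and len(out) >= 10:
                    return out
        return out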
Example #10
    def gather_all_download_urls(self):
        r = util.get_url_request(
            'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts'
        )
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        datas = r.json()
        out = []
        for data in datas:
            # In sample mode, collect at most 10 URLs.
            if not self.sample or len(out) < 10:
                out.append({
                    'url': data['URIContract'],
                    'filename': 'id%s.json' % data['ocid'],
                    'data_type': 'record_package',
                })
        return out
Example #11
    def save_content(self, url, filepath, headers=None):
        request, errors = get_url_request(url, stream=True, headers=headers)
        # If the token has expired, clear it and retry once with a fresh one.
        if any('Request exception (Code %s): %s' %
               (401, 'Invalid or expired token') in s for s in errors):
            self.access_token = None
            errors = self.save_content(
                url,
                filepath,
                headers={"Authorization": self.getAccessToken()})
        if not request:
            return errors

        try:
            with open(filepath, 'wb') as f:
                # Stream the response to disk in 1 MiB chunks.
                for chunk in request.iter_content(1024 * 1024):
                    f.write(chunk)
            return []
        except Exception as e:
            return [str(e)]
Example #12
    def gather_all_download_urls(self):
        r = util.get_url_request('http://ocds.prozorro.openprocurement.io/')
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        doc = lxml.html.fromstring(r.text)

        # Walk the index page, keeping the last dump listed; if a specific
        # date was requested, return that dump as soon as it is found.
        last_url = None
        for item in doc.xpath('//li'):
            url = item.xpath('a')[0].get('href')
            last_url = {
                'url': 'http://ocds.prozorro.openprocurement.io/%s' % url,
                'filename': 'meta-%s.json' % url,
                'data_type': 'meta',
            }
            if self.argument_date and url == 'merged_with_extensions_' + self.argument_date:
                return [last_url]

        if self.argument_date:
            raise Exception(
                "You requested the Ukraine data dated " + self.argument_date +
                " but we couldn't find that!")
        else:
            return [last_url]
Example #13
    def gather_all_download_urls(self):
        # The original fetcher also requested the 'award' and 'contract' tags,
        # but those currently return HTTP 500 errors.
        tags = ['planning', 'tender']
        out = []

        for tag in tags:
            if self.sample:
                out.append({
                    'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag,
                    'filename': 'tag%spage1.json' % tag,
                    'data_type': 'release_package',
                })
            else:
                r = util.get_url_request(
                    'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag)
                if r[1]:
                    raise Exception(r[1])
                r = r[0]
                data = r.json()
                last_page = data['pagination']['last_page']
                for page in range(1, last_page + 1):
                    out.append({
                        'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=%d' % (tag, page),
                        'filename': 'tag-%s-page-%d.json' % (tag, page),
                        'data_type': 'release_package',
                    })

        return out
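
Examples #2, #3 and #13 repeat the same page-counting loop (Example #6 is the for-loop variant of it). The pattern factors out into a small helper; this is a sketch of that refactoring, not part of the fetcher framework:

def paginated_urls(url_template, total, limit, data_type, prefix='page'):
    # Build one download entry per page, given the total number of results
    # and the page size the API serves. url_template must contain one '%d'.
    out = []
    page = 1
    while (page - 1) * limit < total:
        out.append({
            'url': url_template % page,
            'filename': '%s%d.json' % (prefix, page),
            'data_type': data_type,
        })
        page += 1
    return out

Example #3's loop, for instance, collapses to paginated_urls('https://api.colombiacompra.gov.co/releases/?page=%d', total, 1000, 'release_package').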