def gather_all_download_urls(self):
    r = util.get_url_request(
        'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:gacm&rows=500'
    )
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    urls = []
    for result in data['result']['results']:
        for resource in result['resources']:
            if not self.sample or (self.sample and len(urls) < 10):
                if resource['format'] == 'JSON' and \
                        resource['url'] != "http://datos.gob.mx/adela/api/v1/organizations/gacm/documents":
                    urls.append({
                        'url': resource['url'],
                        'filename': 'file-%s.json' % hashlib.md5(
                            resource['url'].encode('utf-8')).hexdigest(),
                        'data_type': 'release_package_list'
                        if resource['name'] == "CONCENTRADO ARCHIVO JSON"
                        else 'release_package',
                    })
    return urls

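# Everything in this section unpacks util.get_url_request as a (response, errors)
# pair: r[0] is a requests.Response and r[1] is a list of error strings, empty on
# success. The helper itself is not shown here, so the sketch below is only an
# assumed minimal implementation of that contract, not the real one.
import requests


def get_url_request(url, stream=False, headers=None):
    """Fetch a URL, returning (response, errors); errors is non-empty on failure."""
    try:
        response = requests.get(url, stream=stream, headers=headers)
        response.raise_for_status()
        return response, []
    except requests.HTTPError as e:
        # Loosely follows the message format that save_content() checks for further
        # down in this section (assumed, not the exact wording of the real helper).
        return None, ['Request exception (Code %s): %s' % (e.response.status_code, e.response.reason)]
    except requests.RequestException as e:
        return None, [str(e)]
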
def gather_all_download_urls(self):
    url = 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d'
    if self.sample:
        return [{
            'url': url % 1,
            'filename': 'sample.json',
            'data_type': 'record_package_list_in_results',
        }]
    # Any page returns the pagination metadata; this request is only made to read
    # the total record count and the page size.
    r = util.get_url_request(url % 2)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    total = data['pagination']['total']
    page = 1
    out = []
    limit = data['pagination']['pageSize']
    while ((page - 1) * limit) < total:
        out.append({
            'url': url % page,
            'filename': 'page%d.json' % page,
            'data_type': 'record_package_list_in_results',
        })
        page += 1
    return out

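# Sanity check of the loop bound above, using hypothetical figures rather than real
# API values: with total = 2500 and pageSize = 1000, pages 1, 2 and 3 all satisfy
# (page - 1) * limit < total, so the partial last page is included in the list.
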
def gather_all_download_urls(self):
    if self.sample:
        return [{
            'url': 'https://api.colombiacompra.gov.co/releases/?page=1',
            'filename': 'sample.json',
            'data_type': 'release_package',
        }]
    r = util.get_url_request('https://api.colombiacompra.gov.co/releases/?page=1')
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    total = data['links']['count']
    page = 1
    out = []
    # This limit is not passed to the API via the URL, but the API currently returns
    # 1000 results per page, so we hard-code it.
    limit = 1000
    while ((page - 1) * limit) < total:
        out.append({
            'url': 'https://api.colombiacompra.gov.co/releases/?page=%d' % page,
            'filename': 'page%d.json' % page,
            'data_type': 'release_package',
        })
        page += 1
    return out

def gather_all_download_urls(self):
    if self.sample:
        return [{
            'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1000&offset=0',
            'filename': 'offset0.json',
            'data_type': 'release_package',
        }]
    # Request a single release just to read the total count from the metadata.
    url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=1'
    r = util.get_url_request(url)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    total = data['meta']['count']
    offset = 0
    out = []
    limit = 10000
    while offset < total:
        out.append({
            'url': 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % (limit, offset),
            'filename': 'offset%d.json' % offset,
            'data_type': 'release_package',
        })
        offset += limit
    return out

def save_url(self, filename, data, file_path):
    if data['data_type'] == 'meta':
        # 'meta' entries point at an HTML directory listing; follow each linked
        # file (except index.html) and queue it for download as a release package.
        r = util.get_url_request(data['url'])
        if r[1]:
            raise Exception(r[1])
        r = r[0]
        doc = lxml.html.fromstring(r.text)
        additional = []
        for item in doc.xpath('//li'):
            url_bit = item.xpath('a')[0].get('href')
            if url_bit != 'index.html':
                url = '%s/%s' % (data['url'], url_bit)
                if not self.sample or (self.sample and len(additional) < 3):
                    additional.append({
                        'url': url,
                        'filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest(),
                        'data_type': 'release_package',
                    })
        return additional, []
    else:
        return [], save_content(data['url'], file_path)

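# save_url() above expects the 'meta' URL to serve a plain HTML directory listing.
# The markup below is a made-up illustration of the shape it parses (the real page
# will differ): every <li><a href="..."> other than index.html becomes one queued
# release package.
#
#   <ul>
#     <li><a href="index.html">index.html</a></li>
#     <li><a href="releases-0001.json">releases-0001.json</a></li>
#     <li><a href="releases-0002.json">releases-0002.json</a></li>
#   </ul>
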
def gather_all_download_urls(self):
    if self.sample:
        return [{
            'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1',
            'filename': 'page1.json',
            'data_type': 'release_package_list_in_results',
            'encoding': "ISO-8859-1",
        }]
    url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1'
    r = util.get_url_request(url)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    total = data['maxPage']
    out = []
    for page in range(1, total + 1):
        out.append({
            'url': 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' % page,
            'filename': 'page%d.json' % page,
            'data_type': 'release_package_list_in_results',
            'encoding': "ISO-8859-1",
        })
    return out

def gather_all_download_urls(self):
    url = 'http://datos.gob.mx/busca/api/3/action/'
    url += 'package_search?q=organization:inai&rows=500'
    r = util.get_url_request(url)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    data = r.json()
    out = []
    for result in data['result']['results']:
        for resource in result['resources']:
            if resource['format'] == 'JSON':
                # Issue a HEAD request to find where the resource redirects to,
                # then rewrite the link into a direct-download URL.
                temp = resource['url'].split("//")[1]
                conn = http.client.HTTPConnection(temp.split("/")[0])
                name = temp.split("/")[1]
                conn.request('HEAD', "/" + name)
                response = conn.getresponse()
                url = response.getheader('Location').replace(
                    "open?", "uc?export=download&")
                out.append({
                    'url': url,
                    'filename': '{}.json'.format(name),
                    'data_type': 'release_package_list',
                    'encoding': 'utf-8-sig',  # ignore BOM
                })
                if self.sample:
                    # In sample mode, one resource is enough.
                    return out
    return out

def gather_all_download_urls(self):
    r = util.get_url_request(
        'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos')
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    datas = r.json()
    out = []
    for data in datas:
        if not self.sample or (self.sample and len(out) < 10):
            out.append({
                'url': data['uri'],
                'filename': 'id%s.json' % data['id'],
                'data_type': 'release_package',
            })
    return out

def fetchRecordPackageIDs(self, year):
    '''
    Download the CSV file for a particular year, and extract the list of
    record package IDs.
    '''
    url = 'https://www.contrataciones.gov.py/'
    url += 'images/opendata/planificaciones/%s.csv' % year
    r = util.get_url_request(url)
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    decoded_content = r.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    id_list = []
    for row in cr:
        id_list.append(row[2])
    # Drop the header row.
    return id_list[1:]

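# An equivalent way to drop the header row is to consume it from the reader before
# collecting IDs, instead of slicing the finished list. extract_ids below is a
# hypothetical helper, not part of the fetcher, shown only to illustrate the idiom.
import csv


def extract_ids(decoded_content):
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    next(cr, None)  # discard the header row
    return [row[2] for row in cr]  # the third column holds the record package ID
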
def gather_all_download_urls(self):
    r = util.get_url_request(
        'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts'
    )
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    datas = r.json()
    out = []
    for data in datas:
        if not self.sample or (self.sample and len(out) < 10):
            out.append({
                'url': data['URIContract'],
                'filename': 'id%s.json' % data['ocid'],
                'data_type': 'record_package',
            })
    return out

def save_content(self, url, filepath, headers=None):
    request, errors = get_url_request(url, stream=True, headers=headers)
    if any('Request exception (Code %s): %s' % (401, 'Invalid or expired token') in s
           for s in errors):
        # The access token has expired; clear it and retry with a fresh one.
        self.access_token = None
        errors = self.save_content(
            url, filepath, headers={"Authorization": self.getAccessToken()})
    if not request:
        return errors
    try:
        with open(filepath, 'wb') as f:
            # Stream the response body to disk in 1 MiB chunks.
            for chunk in request.iter_content(1024 * 1024):
                f.write(chunk)
        return []
    except Exception as e:
        return [str(e)]

def gather_all_download_urls(self):
    r = util.get_url_request('http://ocds.prozorro.openprocurement.io/')
    if r[1]:
        raise Exception(r[1])
    r = r[0]
    doc = lxml.html.fromstring(r.text)
    last_url = None
    for item in doc.xpath('//li'):
        url = item.xpath('a')[0].get('href')
        last_url = {
            'url': 'http://ocds.prozorro.openprocurement.io/%s' % url,
            'filename': 'meta-%s.json' % url,
            'data_type': 'meta',
        }
        if self.argument_date and url == 'merged_with_extensions_' + self.argument_date:
            # Found the listing entry for the requested date.
            return [last_url]
    if self.argument_date:
        raise Exception("You requested the Ukraine data dated " + self.argument_date +
                        " but we couldn't find that!")
    else:
        # No date requested: take the last entry in the listing.
        return [last_url]

def gather_all_download_urls(self):
    # The original fetcher also includes 'award' and 'contract', but those tags
    # appear to return 500 errors.
    tags = [
        'planning',
        'tender',
    ]
    out = []
    for tag in tags:
        if self.sample:
            out.append({
                'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag,
                'filename': 'tag%spage1.json' % tag,
                'data_type': 'release_package',
            })
        else:
            r = util.get_url_request(
                'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=1' % tag)
            if r[1]:
                raise Exception(r[1])
            r = r[0]
            data = r.json()
            last_page = data['pagination']['last_page']
            for page in range(1, last_page + 1):
                out.append({
                    'url': 'http://gpp.ppda.go.ug/api/v1/releases?tag=%s&page=%d' % (tag, page),
                    'filename': 'tag-%s-page-%d.json' % (tag, page),
                    'data_type': 'release_package',
                })
    return out