def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    datapackage["resources"].append({
        PROP_STREAMING: True,
        "name": "collections",
        "path": "collections.csv",
        "schema": {
            "fields": [
                {"name": "id", "type": "string"},
                {"name": "label", "type": "string"},
                {"name": "json", "type": "string"},
            ]
        }
    })

    def get_resource():
        root_data = json.loads(requests.get(parameters["root_url"]).content)
        for collection in root_data["collections"]:
            yield {
                "id": get_collection_id(collection),
                "label": collection["label"],
                "json": collection["@id"],
            }

    spew(datapackage, [get_resource()], aggregations["stats"])
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        field, op, arg = parameters['field'], parameters['op'], parameters['arg']
        spew(dp, logger.log_rows(dp, process_resources(res_iter, field, op, arg)))
def main():
    params, datapackage, res_iter = ingest()
    key = params['key']
    url_key = params['url-key']
    resource_name = params['resource-name']
    resource = {
        'name': resource_name,
        PROP_STREAMING: True,
        'path': 'data/{}.csv'.format(resource_name),
        'schema': {
            'fields': [
                {'name': '{}_Number'.format(key), 'type': 'string'},
                {'name': '{}_Name'.format(key), 'type': 'string'},
                {'name': '{}_Registration_Date'.format(key), 'type': 'string'},
            ]
        }
    }
    datapackage['resources'].append(resource)
    spew(datapackage, [get_entities(url_key)])
def main():
    params, dp, res_iter = ingest()
    os.makedirs('/var/datapackages/sitemaps', exist_ok=True)
    kind = params['kind']
    db_table = params['db-table']
    doc_id = params['doc-id']
    page_title = params['page-title']
    if not dp.get('resources'):
        dp['resources'] = [{
            'name': 'sitemaps',
            'path': 'sitemaps.csv',
            PROP_STREAMING: True,
            'schema': {
                'fields': [
                    {'name': 'filename', 'type': 'string'},
                ]
            }
        }]
    spew(dp, [process_rows(res_iter, kind, db_table, doc_id, page_title)])
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        dp = process_datapackage(dp, parameters)
        spew(dp, logger.log_rows(dp, process_resources(res_iter, parameters, logger)))
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    schema, sheets = load_sheets()
    datapackage["resources"] = get_datapackage_resources(schema)
    spew(datapackage, [
        get_resource(deleted=False, sheets=sheets),
        get_resource(deleted=True, sheets=sheets),
    ], stats)
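# Note on the `ingest() + ({},)` idiom used here and in several processors
# below: ingest() returns a (parameters, datapackage, resource iterator)
# 3-tuple, so concatenating a one-element tuple unpacks a fresh stats dict in
# the same assignment instead of a separate `stats = {}` line.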
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        amount = int(parameters['amount'])
        spew(dp, logger.log_rows(dp, map(lambda r: islice(r, amount, None), res_iter)))
def __call__(self, *args, **kwargs):
    parameters, datapackage, res_iter = ingest()
    self.get_parameters(parameters)
    if self.resource_id:
        resource_show_url = self.get_resource_show_url()
        resource = self.get_ckan_resource(resource_show_url)
        self.update_ckan_resource(resource)
        datapackage['resources'].append(resource)
    spew(datapackage, res_iter)
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    get_resources(resources)
    datapackage["resources"] = [datapackage['resources'][1]]
    datapackage['resources'][0].update(name='bill_tazkirim',
                                       path='bill_tazkirim.csv')
    fields = [{'name': 'tazkir_offices', 'type': 'string'}]
    datapackage['resources'][0]['schema']['fields'] += fields
    spew(datapackage, [get_resources(resources)], stats)
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        spew(dp, logger.log_rows(dp, chain(res_iter, [
            load(logger, dp, parameters['url'], parameters['res_name'])
        ])))
def process():
    stats = {
        'num_parts': 0,
        'num_parts_from_cache': 0,
        'num_sessions': 0,
    }
    parameters, datapackage, resource_iterator = ingest(debug=False)
    datapackage = modify_datapackage(datapackage, parameters, stats)
    new_iter = generic_process_resources(resource_iterator, parameters, stats,
                                         process_row)
    spew(datapackage, new_iter, stats)
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        first_res = next(res_iter)
        headers = next(first_res)
        dp = process_datapackage(dp, headers)
        spew(dp, logger.log_rows(dp, process_resources(chain([first_res], res_iter),
                                                       headers)))
def main():
    parameters, datapackage, resources = ingest()
    stats = {}
    all_rows = []
    for descriptor, resource in zip(datapackage["resources"], resources):
        for row in resource:
            all_rows.append(row)
    all_appointment_dates = []
    appointment_dates = []
    for row_num, row in enumerate(sorted(all_rows, key=get_sort_key, reverse=True)):
        if len(appointment_dates) == 0:
            cur_date, appointment_dates = init_dates(row["date"],
                                                     all_appointment_dates)
        elif appointment_dates[-1]["date"] != row["date"]:
            if sum([len(d["appointments"]) for d in appointment_dates]) \
                    >= parameters["appointments-per-page"]:
                cur_date, appointment_dates = init_dates(row["date"],
                                                         all_appointment_dates)
            else:
                appointment_dates.append({"date": row["date"], "appointments": []})
        if not row.get("when") and row.get("when_from"):
            if row.get("when_to"):
                row["when"] = "{}-{}".format(row["when_from"], row["when_to"])
            else:
                row["when"] = row["when_from"]
        row["bg"] = str(row_num % 2 + 1)
        # Hebrew month names, January through December
        row["month_heb"] = [
            "ינואר", "פברואר", "מרץ", "אפריל", "מאי", "יוני",
            "יולי", "אוגוסט", "ספטמבר", "אוקטובר", "נובמבר", "דצמבר",
        ][row["date"].month - 1]
        appointment_dates[-1]["appointments"].append(row)
    jinja_env = get_jinja_env()
    context = parameters["context"]
    first_page = get_page(1, all_appointment_dates)
    last_page = get_page(len(all_appointment_dates), all_appointment_dates)
    for page_num, appointment_dates in enumerate(all_appointment_dates, start=1):
        if len(appointment_dates) > 0:
            context["dates"] = appointment_dates
            context["pages"] = {
                "first": first_page,
                "prev": get_page(page_num - 1, all_appointment_dates),
                "cur": get_page(page_num, all_appointment_dates),
                "next": get_page(page_num + 1, all_appointment_dates),
                "last": last_page,
            }
            output_file_name = get_page_output_filename(page_num,
                                                        all_appointment_dates)
            build_template(jinja_env, "appointments.html", context,
                           output_file_name)
    spew({}, [], stats)
def main():
    parameters, dp, res_iter = ingest()
    with Logger(parameters) as logger:
        columns_to_remove = []
        for _ in range(parameters['amount']):
            f = dp['resources'][0]['schema']['fields'].pop(0)
            columns_to_remove.append(f['name'])
        spew(dp, logger.log_rows(dp, process_resources(res_iter, columns_to_remove)))
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    fields = [
        {'name': 'num_pages', 'type': 'number'},
        {'name': 'num_pages_comment', 'type': 'string'},
    ]
    datapackage['resources'][0]['schema']['fields'] += fields
    datapackage['resources'][0].update(name='bill_count_pages',
                                       path='bill_count_pages.csv')
    spew(datapackage, [get_resource(resources)], stats)
def main(): parameters, datapackage, resources = ingest() aggregations = {"stats": {}, "sysnum_images": {}} fields = [{ "name": "manifest_label", "type": "string" }, { "name": "manifest_sysnum", "type": "string" }, { "name": "resource_id", "type": "string" }, { "name": "resource_type", "type": "string" }, { "name": "resource_format", "type": "string" }, { "name": "resource_width", "type": "number" }, { "name": "resource_height", "type": "number" }, { "name": "resource_filepath", "type": "string" }, { "name": "url", "type": "string" }, { "name": "downloaded", "type": "boolean" }] output_resources = [] output_descriptors = [] for resource, descriptor in zip(resources, datapackage["resources"]): logging.info("creating images archive for collection {}".format( descriptor["name"])) output_resources.append( get_resource(resource, aggregations, descriptor["name"])) output_descriptors.append({ PROP_STREAMING: True, "name": descriptor["name"], "path": "{}.csv".format(descriptor["name"]), "schema": { "fields": fields } }) datapackage["resources"] = output_descriptors spew(datapackage, output_resources, aggregations["stats"])
def main():
    parameters, datapackage, resources = ingest()
    for resource in datapackage["resources"]:
        if resource["name"] == "manifests":
            for field in resource["schema"]["fields"]:
                if field["name"] in ["attribution", "subject", "alternative_title",
                                     "title", "the_creator", "publisher", "label",
                                     "description"]:
                    field["es:type"] = "text"
                elif field["name"] in ["map", "sysnum", "language", "collection",
                                       "base"]:
                    field["es:type"] = "keyword"
                else:
                    field["es:type"] = "text"
    spew(datapackage, resources)
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    max_year = parameters.get('max-year')
    file_path_template = parameters.get('file-path-template')
    missing_image = parameters.get('missing-image')
    datapackage['resources'] = []
    for resource in resources:
        for rownum, row in enumerate(resource):
            if max_year and row['year'] > max_year:
                stats['invalid year'] += 1
                continue
            if parameters.get('download-thumbnails'):
                if not row['thumb_url']:
                    stats['missing thumb_url'] += 1
                    continue
                name = 'rownum_{}'.format(rownum)
                if file_path_template:
                    photo_filename = file_path_template.format(rownum=rownum)
                    if not path.exists(photo_filename):
                        stats['full size photo missing'] += 1
                        continue
                    if missing_image:
                        if filecmp.cmp(photo_filename, missing_image, shallow=False):
                            stats['photo is missing_image photo'] += 1
                            continue
                stats['valid thumbnail'] += 1
                url = row['thumb_url']
                datapackage['resources'].append({
                    PROP_STREAMED_FROM: url,
                    'name': name,
                    'path': ['files/' + name + '.jpg'],
                })
            else:
                if row['image_url']:
                    url = parameters['image_url_prefix'] + row['image_url']
                    name = 'rownum_{}'.format(rownum)
                    datapackage['resources'].append({
                        PROP_STREAMED_FROM: url,
                        'name': name,
                        'path': ['files/' + name + '.png'],
                    })
    spew(datapackage, [], stats)
def main():
    parameters, datapackage, resources = ingest()
    resources = list(resources)
    stats = {}
    mk_individuals = _get_resource_from_datapackage(datapackage, resources,
                                                    'mk_individual')
    votes = _get_resource_from_datapackage(datapackage, resources,
                                           'vote_rslts_kmmbr_shadow')
    mk_individuals = list(mk_individuals)
    stats["total votes"] = 0
    datapackage["resources"][1]["schema"]["fields"].append({
        "name": "mk_individual_id",
        "type": "integer",
    })
    spew(datapackage, [mk_individuals, get_resource(votes, mk_individuals, stats)],
         stats)
def main(): parameters, datapackage, resources = ingest() aggregations = {"stats": {}, "sysnum_images": {}} resources = list(resources) for descriptor in datapackage["resources"]: descriptor["schema"] = get_resource_row_image_schema() def get_resource(resource, descriptor): init_resource_stats(aggregations["stats"], descriptor) bucket = get_bucket(*list( map(os.environ.get, [ "GCS_SERVICE_ACCOUNT_B64_KEY", "GCS_IMAGES_BUCKET", "GCS_PROJECT" ]))) queue, threads = None, None if not os.environ.get("GCS_DISABLE_DOWNLOAD"): numthreads = int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5")) poolsize = 20 if numthreads < 50 else int(numthreads / 2) logging.info("poolsize={}, numthreads={}".format( poolsize, numthreads)) queue, threads = start_downloader( poolsize, numthreads, worker=partial(download_blob, bucket, aggregations, descriptor["name"]), max_retries=5) yield from get_images(resource, aggregations, descriptor["name"], bucket, queue) if queue: stop_downloader( queue, threads, int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5"))) def get_resources(): for resource, descriptor in zip(resources, datapackage["resources"]): yield get_resource(resource, descriptor) spew(datapackage, get_resources(), aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    datapackage['resources'] = [{
        PROP_STREAMING: True,
        "name": "zio",
        "path": "zio.csv",
        "schema": {
            "fields": [
                {"name": "description", "type": "string"},
                {"name": "year", "type": "year"},
                {"name": "id", "type": "string"},
                {"name": "thumb_url", "type": "string"},
                {"name": "details_url", "type": "string"},
                {"name": "scrape_year", "type": "year"},
                {"name": "page_number", "type": "integer"},
                {"name": "rownum", "type": "integer"},
                {"name": "error", "type": "string"},
            ]
        }
    }]
    spew(datapackage, [get_resource(parameters)], stats)
def main(): parameters, datapackage, resources = ingest() stats = {} aggregations = {"stats": stats} jinja_env = get_jinja_env() committees = {} committees_descriptor = None for descriptor, resource in zip(datapackage["resources"], resources): if descriptor["name"] == "kns_committee": committees_descriptor = descriptor for committee in resource: committees[int(committee["CommitteeID"])] = committee elif descriptor["name"] == "kns_committeesession": build_meeting_templates(resource, committees, jinja_env, descriptor, committees_descriptor, aggregations) build_committee_templates(jinja_env, committees, committees_descriptor, aggregations) build_committee_knessets_list_template(jinja_env, committees, aggregations) build_committees_index_template(jinja_env, committees, aggregations) spew({}, [], stats)
def __init__(self, ingest_response=None,
             default_input_resource=None,
             default_output_resource=None,
             default_replace_resource=True,
             table_schema=None,
             resource_filter=None):
    if not ingest_response:
        ingest_response = ingest()
    self.parameters, self.datapackage, self.resource_iterator = ingest_response
    self.set_default_parameters(default_input_resource,
                                default_output_resource,
                                default_replace_resource)
    self._resource_filter_param = resource_filter
    self.input_resource_matcher = ResourceMatcher(
        self.parameters["input_resource"], self.datapackage)
    self.output_resource_name = self.parameters["output_resource"]
    self.output_resource_descriptor = {
        "name": self.output_resource_name,
        PROP_STREAMING: True,
        "path": "data/{}.csv".format(self.output_resource_name),
        "schema": table_schema,
    }
def main():
    params, dp, res_iter = ingest()
    dp['name'] = 'category-explanations'
    dp['resources'] = [{
        'name': 'category-explanations',
        'path': 'data/category-explanations.csv',
        PROP_STREAMING: True,
        'schema': {
            'fields': [
                {'name': 'budget_code', 'type': 'string'},
                {'name': 'explanation', 'type': 'string'},
                {'name': 'explanation_short', 'type': 'string'},
                {'name': 'source', 'type': 'string'},
            ]
        }
    }]
    spew(dp, [
        itertools.chain(
            process_file('category-explanations.md', 'explanation'),
            process_file('category-explanations-short.md', 'explanation_short'),
        )
    ])
def main(): parameters, datapackage, resources = ingest() aggregations = {"stats": {}} resources = list(resources) for descriptor in datapackage["resources"]: descriptor["schema"]["fields"] = [{ "name": "doc_id", "type": "string" }, { "name": "system_number", "type": "string" }, { "name": "manifest_url", "type": "string" }, { "name": "manifest_file", "type": "string" }] def get_resources(): for resource, descriptor in zip(resources, datapackage["resources"]): yield (parse_row(row) for row in resource) spew(datapackage, get_resources(), aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    resource_names = [r['name'] for r in datapackage['resources']]
    datapackage['resources'] = [r for r in datapackage['resources']
                                if r['name'] == 'foi_offices']
    datapackage['resources'][0]['schema']['fields'] += [
        {'name': 'update_type', 'type': 'string'},
        {'name': 'update_title', 'type': 'string'},
        {'name': 'entity_id', 'type': 'string'},
    ]

    def get_resources():
        existing_entities = {}
        for resource_name, resource in zip(resource_names, resources):
            if resource_name == 'existing_entities':
                # consume the resource to populate existing_entities
                for row in get_existing_entities(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi-groups-matching':
                for row in get_foi_groups_matching(resource, existing_entities,
                                                   stats):
                    pass
            elif resource_name == 'foi_offices':
                yield get_foi_offices_resource(resource, existing_entities, stats,
                                               parameters.get('dry-run'))
            else:
                # drain any other resource without emitting it
                for row in resource:
                    pass

    spew(datapackage, get_resources(), stats)
def main(): parameters, datapackage, resources = ingest() aggregations = {"stats": {}} collections = {} for descriptor, resource in zip(datapackage["resources"], resources): if descriptor["name"] == "collections": collections = list(resource) else: list(resource) datapackage["resources"] = [] for collection in collections: datapackage["resources"].append({ PROP_STREAMING: True, "name": collection["id"], "path": "{}.csv".format(collection["id"]), "schema": { "fields": [{ "name": "label", "type": "string" }, { "name": "manifest", "type": "string" }] } }) def get_resource(collection): for member in json.loads(requests.get( collection["json"]).content)["members"]: yield {"label": member["label"], "manifest": member["@id"]} spew(datapackage, (get_resource(collection) for collection in collections), aggregations["stats"])
def main():
    parameters, dp, res_iter = ingest()
    connection_string = get_connection_string()
    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)
            existing_ids = get_all_existing_ids(connection_string,
                                                parameters['db-table'],
                                                db_key_fields, db_hash_fields)
            break
    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))
    spew(dp, process_resources(res_iter, resource_name, input_key_fields,
                               input_hash_fields, existing_ids))
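# Reading of the parameters above (an inference from the code, not from
# upstream docs): `key-fields` identify a row in the target DB table, while
# `hash-fields` (defaulting to all non-key schema fields) are compared to
# detect whether a known row has changed. A hypothetical configuration:
#
#   resource-name: members
#   db-table: members
#   key-fields: ['member_id']
#   hash-fields: ['name', 'address']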
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    bills = {}
    israel_law_bill_ids = {}
    for bill in next(resources):
        bill['law_ministry_ids'] = []
        bills[bill['BillID']] = bill
        if bill['IsraelLawID']:
            for israel_law_id in bill['IsraelLawID']:
                israel_law_bill_ids.setdefault(israel_law_id, [])
                israel_law_bill_ids[israel_law_id].append(bill['BillID'])
    for law_ministry in next(resources):
        for bill_id in israel_law_bill_ids.get(law_ministry['IsraelLawID'], []):
            if law_ministry['GovMinistryID'] not in bills[bill_id]['law_ministry_ids']:
                bills[bill_id]['law_ministry_ids'].append(
                    law_ministry['GovMinistryID'])
    gov_ministries = {}
    for gov_ministry in next(resources):
        gov_ministries[gov_ministry['GovMinistryID']] = gov_ministry['Name']
    for bill in bills.values():
        ministry_names = set()
        for ministry_id in bill['law_ministry_ids']:
            ministry_names.add(gov_ministries[ministry_id])
        bill['law_ministry_names'] = ', '.join(ministry_names)
    datapackage["resources"] = [datapackage['resources'][0]]
    fields = [
        {'name': 'law_ministry_ids', 'type': 'array'},
        {'name': 'law_ministry_names', 'type': 'string'},
    ]
    datapackage["resources"][0]['schema']['fields'] += fields
    spew(datapackage, [bills.values()], stats)
"""Map the raw columns names to fiscal fields where indicated.""" from datapackage_pipelines.wrapper import ingest, spew parameters_, datapackage_, resources_ = ingest() thresholds = parameters_['thresholds'] allowed_values = parameters_['allowed_values'] threshold_columns = thresholds.keys() allowed_value_columns = allowed_values.keys() def is_empty(value): if value is None: return True if type(value) is str and value.strip() == '': return True return False def process(resources): def process_single(resource): counter = 0 nones = dict((c, 0) for c in threshold_columns) for row in resource: counter += 1 for column in threshold_columns: value = row.get(column) if is_empty(value): nones[column] += 1 for column in allowed_value_columns:
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING
import elasticsearch.helpers
from elasticsearch import Elasticsearch

from temp_loglevel import temp_loglevel
import settings

parameters, datapackage, resource_iterator = ingest()

stats = {"total loaded rows": 0, "not allowed rows": 0}


def is_allowed_row(row):
    return row["StatusDesc"] == "Completed" and \
        row["RightsDesc"] == "Full" and \
        row["DisplayStatusDesc"] != "Internal Use" and \
        (row["UnitText1_En"] not in [None, ''] or
         row["UnitText1_He"] not in [None, ''])


def filter_row(row):
    global stats
    stats["total loaded rows"] += 1
    if is_allowed_row(row):
        yield {k: str(v) for k, v in row.items()}
    else:
        stats["not allowed rows"] += 1


def get_resources():
    yield get_resource()
"""A processor to concatenate resources that have a common set of fields.""" from datapackage_pipelines.wrapper import ingest, spew def concatenate(resources): """Concatenate multiple resources.""" for resource in resources: for row in resource: yield row if __name__ == '__main__': _, datapackage, resources_ = ingest() single_resource = concatenate(resources_) datapackage['resources'] = [datapackage['resources'][0]] spew(datapackage, [single_resource])
        if index != 1:
            yield index, headers, values

    @staticmethod
    def _fixed_points(rows):
        """Convert floats to 2-digit fixed precision strings."""
        for index, headers, values in rows:
            values = [
                '%.2f' % value if type(value) is float else value
                for value in values
            ]
            yield index, headers, values


XLSXIngestor = XLSIngestor


def ingest_resources(datapackage):
    """Ingest each resource one by one into the pipeline."""
    for resource in datapackage['resources']:
        ingestor = BaseIngestor.load(resource)
        yield ingestor.rows


if __name__ == '__main__':
    _, datapackage_, _ = ingest()
    resources = list(ingest_resources(datapackage_))
    spew(datapackage_, resources)
def main(cls):
    from datapackage_pipelines.wrapper import ingest, spew
    spew(*cls(*ingest()).spew())
import logging

import arrow
from arrow.parser import ParserError
from datapackage_pipelines.wrapper import ingest, spew

from common.utilities import process


def parse_currencies(row):
    """Clean up date columns, setting unparseable values to None."""
    date_columns = (
        'Datum van laatste bijwerking',
        'Einddatum',
        'Begindatum',
    )
    for key in date_columns:
        try:
            row[key] = arrow.get(row[key], 'DD.MM.YYYY HH:mm')
        except ParserError:
            if row[key] != '0000-00-00 00:00:00':
                message = 'Could not parse %s to a date, returning None'
                logging.warning(message, row[key])
            row[key] = None
    return row


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
import os

import requests
import gobble
import yaml
from datapackage_pipelines.utilities.resources import PROP_STREAMED_FROM, PATH_PLACEHOLDER
from datapackage_pipelines.wrapper import ingest, spew

FILENAME = 'pipeline-spec.yaml'

resources = []
datapackage = {
    'name': 'placeholder',
    'resources': resources,
    'profile': 'data-package',
}

parameters, _, _ = ingest()
country = parameters.get('country').lower()
userid = gobble.user.User().id

for dirpath, dirnames, filenames in os.walk('.'):
    if dirpath == '.':
        continue
    if FILENAME in filenames:
        pipeline = yaml.load(open(os.path.join(dirpath, FILENAME)))
        dataset_name = pipeline[list(pipeline.keys())[0]]['pipeline'][0][
            'parameters']['datapackage']['name']
        url_base = 'http://datastore.openspending.org/{}/{}'.format(userid,
                                                                    dataset_name)
        resp = requests.get(url_base + '/datapackage.json')
        if resp.status_code == 200:
            datapackage_json = resp.json()
            if len(country) > 0:
                if datapackage_json.get('geo', {}).get('country_code', 'xx').lower() != country:
def main(cls):
    # Can be used like this in datapackage processor files:
    #     if __name__ == '__main__':
    #         Processor.main()
    spew(*cls(*ingest()).spew())
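# A minimal sketch of a class that fits the main() classmethod above. The
# class and its methods are assumptions inferred from the expression
# `spew(*cls(*ingest()).spew())`: the constructor receives the ingest() triple
# and spew() returns the positional arguments for the framework's spew().
class UppercaseNames:
    def __init__(self, parameters, datapackage, res_iter):
        self.parameters = parameters
        self.datapackage = datapackage
        self.res_iter = res_iter

    def process_rows(self, resource):
        # Hypothetical transform: upper-case the 'name' column of every row.
        for row in resource:
            row['name'] = row.get('name', '').upper()
            yield row

    def spew(self):
        return self.datapackage, (self.process_rows(r) for r in self.res_iter)

# With main() attached to the class as a classmethod, UppercaseNames.main()
# would ingest, transform, and spew in one call.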
Datapackage mutation
--------------------

None.

"""

from datapackage_pipelines.wrapper import ingest, spew

from common.utilities import process


def add_geocodes(row, **kw):
    """Fill in the country and region fields."""
    row['beneficiary_country_code'] = kw['country_code']
    row['beneficiary_country'] = kw['country']
    row['beneficiary_nuts_code'] = kw['nuts_code']
    row['beneficiary_nuts_region'] = kw['region']
    return row


if __name__ == '__main__':
    parameters_, datapackage, resources = ingest()
    new_resources = process(resources, add_geocodes, **parameters_)
    spew(datapackage, new_resources)
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Build the lookup tables."""
    def lookup_table(mapping):
        for key, aliases in mapping.items():
            for alias in aliases:
                yield alias, key

    return {
        mapping['field']: dict(lookup_table(mapping['mapping']))
        for mapping in mappings
    }


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases, lookup_tables=lookup_tables_)
    spew(datapackage_, new_resources)
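# An illustrative example (hypothetical data) of the shape build_lookup_tables()
# expects and returns:
#
#   mappings = [{'field': 'currency',
#                'mapping': {'EUR': ['euro', 'eur'], 'USD': ['dollar']}}]
#   build_lookup_tables(mappings)
#   # -> {'currency': {'euro': 'EUR', 'eur': 'EUR', 'dollar': 'USD'}}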
"""The template for writing PDF and web scrapers.""" from datapackage_pipelines.wrapper import ingest, spew from logging import debug def scrape_beneficiaries(**params): """Return a generator of beneficiaries. Each beneficiary is a dictionary whose keys match the fields described in source.description.yaml. Parameters come from pipeline-specs.yaml. """ debug('%s', **params) beneficiaries = [ {'field1': 'foo', 'field2': 'spam'}, {'field1': 'bar', 'field2': 'eggs'}, ] for beneficiary in beneficiaries: yield beneficiary if __name__ == '__main__': parameters, datapackage, _ = ingest() rows = scrape_beneficiaries(**parameters) spew(datapackage, [rows])
import json
import os

from datapackage_pipelines.wrapper import ingest, spew

print(os.getcwd())

datapackage_file, _, _ = ingest()
datapackage = json.loads(datapackage_file)
spew(datapackage, [])
import datetime
import itertools
import logging
import time

import requests
import simplejson

from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.config import settings

log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

# GitHub allows 30 authenticated search requests per minute, so wait 3 seconds
# (or the GITHUB_REQUEST_WAIT_INTERVAL env var) before each request
# (https://developer.github.com/v3/search/#rate-limit).
REQUEST_WAIT_INTERVAL = int(settings.get('GITHUB_REQUEST_WAIT_INTERVAL', 3))


def _make_github_request(url):
    try:
        headers = {
            'Authorization': 'token {}'.format(settings['GITHUB_API_TOKEN'])
        }
        response = requests.get(url, headers=headers)
        json_response = response.json()
    except simplejson.scanner.JSONDecodeError:
        log.error('Expected JSON in response from: {}'.format(url))
        raise
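# A sketch of how REQUEST_WAIT_INTERVAL is presumably applied (an assumption
# based on the rate-limit comment above and the `time` import; the wrapper
# name is hypothetical):
#
#   def _rate_limited_github_request(url):
#       time.sleep(REQUEST_WAIT_INTERVAL)  # stay under 30 requests/minute
#       return _make_github_request(url)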
    parameters['post_parse'] = [get_skip_rows(row_numbers)]
    parameters.update(**resource.get('parser_options'))

    if extension == '.csv':
        parameters['post_parse'].append(drop_bad_rows)
        parameters.update(encoding=get_encoding(parameters, resource))
    if extension in ('.xls', '.xlsx'):
        parameters['post_parse'].append(force_strings)
    if extension == '.json':
        fill_missing_fields(path)
        parameters['post_parse'].append(force_strings)

    info('Ingesting file = %s', path)
    info('Ingestion parameters = %s', format_to_json(parameters))

    parameters.update(headers=get_headers(parameters, path))
    with Stream(path, **parameters) as stream:
        check_fields_match(resource, stream)
        log_sample_table(stream)
        yield stream.iter(keyed=True)


if __name__ == '__main__':
    parameters_, datapackage_, _ = ingest()
    parameters_ = {} if parameters_ is None else parameters_
    resources = stream_local_file(datapackage_, **parameters_)
    spew(datapackage_, resources)