def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    fields = [{"name": "manifest_label", "type": "string"},
              {"name": "manifest_sysnum", "type": "string"},
              {"name": "resource_id", "type": "string"},
              {"name": "resource_type", "type": "string"},
              {"name": "resource_format", "type": "string"},
              {"name": "resource_width", "type": "number"},
              {"name": "resource_height", "type": "number"},
              {"name": "resource_filepath", "type": "string"},
              {"name": "url", "type": "string"},
              {"name": "downloaded", "type": "boolean"}]
    output_resources = []
    output_descriptors = []
    for resource, descriptor in zip(resources, datapackage["resources"]):
        logging.info("creating images archive for collection {}".format(descriptor["name"]))
        output_resources.append(get_resource(resource, aggregations, descriptor["name"]))
        output_descriptors.append({PROP_STREAMING: True,
                                   "name": descriptor["name"],
                                   "path": "{}.csv".format(descriptor["name"]),
                                   "schema": {"fields": fields}})
    datapackage["resources"] = output_descriptors
    spew(datapackage, output_resources, aggregations["stats"])
def main():
    parameters, datapackage, resources = ingest()
    for resource in datapackage["resources"]:
        if resource["name"] == "manifests":
            for field in resource["schema"]["fields"]:
                if field["name"] in ["attribution", "subject", "alternative_title", "title",
                                     "the_creator", "publisher", "label", "description"]:
                    field["es:type"] = "text"
                elif field["name"] in ["map", "sysnum", "language", "collection", "base"]:
                    field["es:type"] = "keyword"
                else:
                    field["es:type"] = "text"
    spew(datapackage, resources)
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    max_year = parameters.get('max-year')
    file_path_template = parameters.get('file-path-template')
    missing_image = parameters.get('missing-image')
    datapackage['resources'] = []
    for resource in resources:
        for rownum, row in enumerate(resource):
            if max_year and row['year'] > max_year:
                stats['invalid year'] += 1
                continue
            if parameters.get('download-thumbnails'):
                if not row['thumb_url']:
                    stats['missing thumb_url'] += 1
                    continue
                name = 'rownum_{}'.format(rownum)
                if file_path_template:
                    photo_filename = file_path_template.format(rownum=rownum)
                    if not path.exists(photo_filename):
                        stats['full size photo missing'] += 1
                        continue
                    if missing_image:
                        if filecmp.cmp(photo_filename, missing_image, shallow=False):
                            stats['photo is missing_image photo'] += 1
                            continue
                stats['valid thumbnail'] += 1
                url = row['thumb_url']
                datapackage['resources'].append({
                    PROP_STREAMED_FROM: url,
                    'name': name,
                    'path': ['files/' + name + '.jpg'],
                })
            else:
                if row['image_url']:
                    url = parameters['image_url_prefix'] + row['image_url']
                    name = 'rownum_{}'.format(rownum)
                    datapackage['resources'].append({
                        PROP_STREAMED_FROM: url,
                        'name': name,
                        'path': ['files/' + name + '.png'],
                    })
    spew(datapackage, [], stats)
def __call__(self):
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)
    name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    resource_index = resource if isinstance(resource, int) else None

    selected_resources = []
    found = False
    dp = datapackage.DataPackage(url)
    dp = self.process_datapackage(dp)
    for i, orig_res in enumerate(dp.resources):
        if resource_index == i or \
                (name_matcher is not None and
                 name_matcher.match(orig_res.descriptor.get('name'))):
            found = True
            desc = copy.deepcopy(orig_res.descriptor)
            if 'primaryKey' in desc.get('schema', {}):
                # Avoid duplication checks
                del orig_res.descriptor['schema']['primaryKey']
                orig_res.commit()
            desc[PROP_STREAMED_FROM] = orig_res.source
            self.dp['resources'].append(desc)
            if tabular(desc) and stream:
                desc[PROP_STREAMING] = True
                orig_res_iter = orig_res.iter(keyed=True)
                if limit_rows:
                    orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                selected_resources.append(orig_res_iter)
            else:
                desc[PROP_STREAMING] = False
    assert found, "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
def test_spew_finalizer_runs_before_we_signal_that_were_done(self):
    '''Assert that the finalizer param is executed before spew is finished.

    We signal to other processors that we're done by writing an empty line
    to STDOUT. The finalizer parameter to spew() must be executed before
    that, as there can be processors that depend on us finishing our
    processing before they're able to run. For example, a processor that
    depends on `dump_to_zip` must wait until it has finished writing to the
    local filesystem.
    '''
    datapackage = {}
    resources_iterator = iter([])

    with mock.patch('datapackage_pipelines.wrapper.wrapper.stdout') as stdout_mock:
        def finalizer():
            last_call_args = stdout_mock.write.call_args_list[-1]
            assert last_call_args != mock.call('\n')

        spew(datapackage, resources_iterator, finalizer=finalizer)
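# Hedged usage sketch for the behaviour the test above documents; it is not
# part of the test suite. The processor body and the name `notify_downstream`
# are illustrative assumptions; only spew()'s `finalizer` keyword comes from
# the source.
import logging

from datapackage_pipelines.wrapper import ingest, spew

parameters, datapackage, resources = ingest()


def notify_downstream():
    # Runs after all rows have been flushed but before spew() writes the
    # empty line that tells dependent processors we are done.
    logging.info('processing finished; dependent pipelines may now run')


spew(datapackage, resources, finalizer=notify_downstream)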
def main():
    parameters, datapackage, resources = ingest()
    resources = list(resources)
    stats = {}
    mk_individuals = _get_resource_from_datapackage(datapackage, resources, 'mk_individual')
    votes = _get_resource_from_datapackage(datapackage, resources, 'vote_rslts_kmmbr_shadow')
    mk_individuals = list(mk_individuals)
    stats["total votes"] = 0
    datapackage["resources"][1]["schema"]["fields"].append({
        "name": "mk_individual_id",
        "type": "integer"
    })
    spew(datapackage,
         [mk_individuals, get_resource(votes, mk_individuals, stats)],
         stats)
def main():
    params, datapackage, res_iter = ingest()
    key = params['key']
    url_key = params['url-key']
    resource_name = params['resource-name']
    resource = {
        'name': resource_name,
        PROP_STREAMING: True,
        'path': 'data/{}.csv'.format(resource_name),
        'schema': {
            'fields': [
                {'name': '{}_Number'.format(key), 'type': 'string'},
                {'name': '{}_Name'.format(key), 'type': 'string'},
                {'name': '{}_Registration_Date'.format(key), 'type': 'string'},
            ]
        }
    }
    datapackage['resources'].append(resource)
    spew(datapackage, [get_entities(url_key)])
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"] = get_resource_row_image_schema()

    def get_resource(resource, descriptor):
        init_resource_stats(aggregations["stats"], descriptor)
        bucket = get_bucket(*list(map(os.environ.get, [
            "GCS_SERVICE_ACCOUNT_B64_KEY", "GCS_IMAGES_BUCKET", "GCS_PROJECT"
        ])))
        queue, threads = None, None
        if not os.environ.get("GCS_DISABLE_DOWNLOAD"):
            numthreads = int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5"))
            poolsize = 20 if numthreads < 50 else int(numthreads / 2)
            logging.info("poolsize={}, numthreads={}".format(poolsize, numthreads))
            queue, threads = start_downloader(
                poolsize, numthreads,
                worker=partial(download_blob, bucket, aggregations, descriptor["name"]),
                max_retries=5)
        yield from get_images(resource, aggregations, descriptor["name"], bucket, queue)
        if queue:
            stop_downloader(queue, threads,
                            int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5")))

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield get_resource(resource, descriptor)

    spew(datapackage, get_resources(), aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    datapackage['resources'] = [{
        PROP_STREAMING: True,
        "name": "zio",
        "path": "zio.csv",
        "schema": {
            "fields": [{"name": "description", "type": "string"},
                       {"name": "year", "type": "year"},
                       {"name": "id", "type": "string"},
                       {"name": "thumb_url", "type": "string"},
                       {"name": "details_url", "type": "string"},
                       {"name": "scrape_year", "type": "year"},
                       {"name": "page_number", "type": "integer"},
                       {"name": "rownum", "type": "integer"},
                       {"name": "error", "type": "string"}]
        }
    }]
    spew(datapackage, [get_resource(parameters)], stats)
def main():
    parameters, datapackage, resources = ingest()
    stats = {}
    aggregations = {"stats": stats}
    jinja_env = get_jinja_env()
    committees = {}
    committees_descriptor = None
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "kns_committee":
            committees_descriptor = descriptor
            for committee in resource:
                committees[int(committee["CommitteeID"])] = committee
        elif descriptor["name"] == "kns_committeesession":
            build_meeting_templates(resource, committees, jinja_env, descriptor,
                                    committees_descriptor, aggregations)
    build_committee_templates(jinja_env, committees, committees_descriptor, aggregations)
    build_committee_knessets_list_template(jinja_env, committees, aggregations)
    build_committees_index_template(jinja_env, committees, aggregations)
    spew({}, [], stats)
def main():
    params, dp, res_iter = ingest()
    dp['name'] = 'category-explanations'
    dp['resources'] = [{
        'name': 'category-explanations',
        'path': 'data/category-explanations.csv',
        PROP_STREAMING: True,
        'schema': {
            'fields': [
                {'name': 'budget_code', 'type': 'string'},
                {'name': 'explanation', 'type': 'string'},
                {'name': 'explanation_short', 'type': 'string'},
                {'name': 'source', 'type': 'string'},
            ]
        }
    }]
    spew(dp, [
        itertools.chain(
            process_file('category-explanations.md', 'explanation'),
            process_file('category-explanations-short.md', 'explanation_short'),
        )
    ])
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    bills = {}
    israel_law_bill_ids = {}
    for bill in next(resources):
        bill['law_ministry_ids'] = []
        bills[bill['BillID']] = bill
        if bill['IsraelLawID']:
            for israel_law_id in bill['IsraelLawID']:
                israel_law_bill_ids.setdefault(israel_law_id, [])
                israel_law_bill_ids[israel_law_id].append(bill['BillID'])
    for law_ministry in next(resources):
        for bill_id in israel_law_bill_ids.get(law_ministry['IsraelLawID'], []):
            if law_ministry['GovMinistryID'] not in bills[bill_id]['law_ministry_ids']:
                bills[bill_id]['law_ministry_ids'].append(law_ministry['GovMinistryID'])
    gov_ministries = {}
    for gov_ministry in next(resources):
        gov_ministries[gov_ministry['GovMinistryID']] = gov_ministry['Name']
    for bill in bills.values():
        ministry_names = set()
        for ministry_id in bill['law_ministry_ids']:
            ministry_names.add(gov_ministries[ministry_id])
        bill['law_ministry_names'] = ', '.join(ministry_names)
    datapackage["resources"] = [datapackage['resources'][0]]
    fields = [{'name': 'law_ministry_ids', 'type': 'array'},
              {'name': 'law_ministry_names', 'type': 'string'}]
    datapackage["resources"][0]['schema']['fields'] += fields
    spew(datapackage, [bills.values()], stats)
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    collections = {}
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "collections":
            collections = list(resource)
        else:
            list(resource)
    datapackage["resources"] = []
    for collection in collections:
        datapackage["resources"].append({
            PROP_STREAMING: True,
            "name": collection["id"],
            "path": "{}.csv".format(collection["id"]),
            "schema": {
                "fields": [{"name": "label", "type": "string"},
                           {"name": "manifest", "type": "string"}]
            }
        })

    def get_resource(collection):
        for member in json.loads(requests.get(collection["json"]).content)["members"]:
            yield {"label": member["label"], "manifest": member["@id"]}

    spew(datapackage,
         (get_resource(collection) for collection in collections),
         aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    resource_names = [r['name'] for r in datapackage['resources']]
    datapackage['resources'] = [r for r in datapackage['resources']
                                if r['name'] == 'foi_offices']
    datapackage['resources'][0]['schema']['fields'] += [
        {'name': 'update_type', 'type': 'string'},
        {'name': 'update_title', 'type': 'string'},
        {'name': 'entity_id', 'type': 'string'},
    ]

    def get_resources():
        existing_entities = {}
        for resource_name, resource in zip(resource_names, resources):
            if resource_name == 'existing_entities':
                for row in get_existing_entities(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi-groups-matching':
                for row in get_foi_groups_matching(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi_offices':
                yield get_foi_offices_resource(resource, existing_entities, stats,
                                               parameters.get('dry-run'))
            else:
                for row in resource:
                    pass

    spew(datapackage, get_resources(), stats)
def main():
    params, dp, res_iter = ingest()
    os.makedirs('/var/datapackages/sitemaps', exist_ok=True)
    kind = params['kind']
    db_table = params['db-table']
    doc_id = params['doc-id']
    page_title = params['page-title']
    if not dp.get('resources'):
        dp['resources'] = [{
            'name': 'sitemaps',
            'path': 'sitemaps.csv',
            PROP_STREAMING: True,
            'schema': {
                'fields': [{'name': 'filename', 'type': 'string'}]
            }
        }]
    spew(dp, [process_rows(res_iter, kind, db_table, doc_id, page_title)])
def main():
    parameters, dp, res_iter = ingest()
    connection_string = get_connection_string()
    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)
            existing_ids = get_all_existing_ids(connection_string,
                                                parameters['db-table'],
                                                db_key_fields, db_hash_fields)
            break
    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))
    spew(dp, process_resources(res_iter, resource_name, input_key_fields,
                               input_hash_fields, existing_ids))
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"]["fields"] = [{"name": "doc_id", "type": "string"},
                                          {"name": "system_number", "type": "string"},
                                          {"name": "manifest_url", "type": "string"},
                                          {"name": "manifest_file", "type": "string"}]

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield (parse_row(row) for row in resource)

    spew(datapackage, get_resources(), aggregations["stats"])
    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError(
                            '%s: Got %r whereas allowed values for this column are %r' %
                            (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100 * nones[column]) // counter
            if ratio_percent < thresholds[column]:
                raise ValueError(
                    '%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)' %
                    (column, counter - nones[column], counter, ratio_percent, thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
import json
import logging

from datapackage_pipelines.wrapper import spew, ingest
# assumed import path for LazyJsonLine; adjust to wherever it lives in your setup
from datapackage_pipelines.utilities.extended_json import LazyJsonLine

parameters, datapackage, res_iter = ingest()

res_name = parameters.get('resource', datapackage['resources'][0]['name'])


def show_sample(res):
    # log the first 10 rows of the matched resource, passing every row through
    logging.info('SAMPLE OF LINES from %s', res.spec['name'])
    for i, row in enumerate(res):
        if i < 10:
            if isinstance(row, LazyJsonLine):
                logging.info('#%s: %s', i, row._evaluate())
            else:
                logging.info('#%s: %r', i, row)
        yield row


def process_resources(res_iter_):
    for res in res_iter_:
        logging.info('? from %s', res.spec['name'])
        if res.spec['name'] == res_name:
            yield show_sample(res)
        else:
            yield res


logging.info(json.dumps(datapackage, indent=2))
spew(datapackage, process_resources(res_iter))
def run_shell_command(command_line_args):
    logging.info('Subprocess: "' + ' '.join(command_line_args) + '"')
    try:
        command_line_process = subprocess.Popen(
            command_line_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        with command_line_process.stdout:
            log_subprocess_output(command_line_process.stdout)
    except (OSError, subprocess.CalledProcessError) as exception:
        logging.info('Exception occurred: ' + str(exception))
        logging.info('Subprocess failed')
        raise exception
    else:
        # no exception was raised
        logging.info('Subprocess finished')
        return True


parameters, datapackage, res_iter = ingest()
run_shell_command(parameters["arguments"])
spew(datapackage, res_iter)
    fields_to_keep = [f['name'] for f in fields]
    fields.extend(extra_keys)
    fields.append(extra_value)
    resource['schema']['fields'] = fields
    return unpivot_fields_without_regex, fields_to_keep


def unpivot(rows, fields_to_unpivot_, fields_to_keep_):
    for row in rows:
        for unpivot_field in fields_to_unpivot_:
            new_row = copy.deepcopy(unpivot_field['keys'])
            for field in fields_to_keep_:
                new_row[field] = row[field]
            new_row[extra_value['name']] = row.get(unpivot_field['name'])
            yield new_row


def process_resources(resource_iterator_, fields_to_unpivot, fields_to_keep):
    for resource in resource_iterator_:
        spec = resource.spec
        if not resources.match(spec['name']):
            yield resource
        else:
            yield unpivot(resource, fields_to_unpivot, fields_to_keep)


old_fields, keep_fields = process_datapackage(datapackage)

spew(datapackage, process_resources(resource_iterator, old_fields, keep_fields))
import logging
import time

from datapackage_pipelines.wrapper import spew, ingest


def filter_resource(resource, sleep_seconds):
    yield from resource
    time.sleep(sleep_seconds)


def filter_resources(datapackage, resources, parameters):
    input_resource_name = parameters.get("resource")
    # sleep 2 seconds between resources by default
    sleep_seconds = float(parameters.get("sleep-seconds", 2))
    for resource_descriptor, resource in zip(datapackage["resources"], resources):
        if not input_resource_name or input_resource_name == resource_descriptor["name"]:
            logging.info("throttling resource {}: sleep_seconds={}".format(
                resource_descriptor["name"], sleep_seconds))
            yield filter_resource(resource, sleep_seconds)
        else:
            yield resource


parameters, datapackage, resources = ingest()
spew(datapackage, filter_resources(datapackage, resources, parameters))
userid = gobble.user.User().id
for dirpath, dirnames, filenames in os.walk('.'):
    if dirpath == '.':
        continue
    if FILENAME in filenames:
        pipeline = yaml.load(open(os.path.join(dirpath, FILENAME)))
        dataset_name = \
            pipeline[list(pipeline.keys())[0]]['pipeline'][0]['parameters']['datapackage']['name']
        url_base = 'http://datastore.openspending.org/{}/{}'.format(userid, dataset_name)
        resp = requests.get(url_base + '/datapackage.json')
        if resp.status_code == 200:
            datapackage_json = resp.json()
            if len(country) > 0:
                if datapackage_json.get('geo', {}).get('country_code', 'xx').lower() != country:
                    continue
            resource = datapackage_json['resources'][0]
            resource_url = '{}/{}'.format(url_base, resource['path'])
            resources.append({
                PROP_STREAMED_FROM: resource_url,
                'path': PATH_PLACEHOLDER,
                'name': dataset_name,
                'encoding': 'utf-8',
                'delimiter': ',',
                'doublequote': True,
                'quotechar': '"',
                'skipinitialspace': False
            })
            logging.error(resource_url)

spew(datapackage, [])
def main(cls):
    from datapackage_pipelines.wrapper import ingest, spew
    spew(*cls(*ingest()).spew())
"""Map the raw columns names to fiscal fields where indicated.""" import logging from datapackage_pipelines.wrapper import ingest, spew from common.utilities import get_fiscal_field_names def update_datapackage(datapackage): valid_fiscal_fields = get_fiscal_field_names() for resource in datapackage['resources']: for field in resource['schema']['fields']: if field['maps_to'] in valid_fiscal_fields: field['name'] = field.pop('maps_to') else: logging.info('Unmapped = %s', field['name']) return _, datapackage_, resources_ = ingest() spew(update_datapackage(datapackage_), resources_)
    return html


def filter_resource(descriptor, data, stats):
    for row in data:
        stats[descriptor["name"]] += 1
        yield row


def filter_resources(datapackage, resources, parameters, stats):
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"],
                                        schema["fields"], schema["primaryKey"]))
        yield filter_resource(resource_descriptor, resource_data, stats)
    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(
        tables="".join(tables))
    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        save_schema_html = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="html")
        save_schema_json = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="json")
        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html,
                             public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json,
                             json.dumps(datapackage["resources"], indent=2, ensure_ascii=False),
                             public_bucket=True)


stats = {}
spew(datapackage, filter_resources(datapackage, resources, parameters, stats), stats)
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Build the lookup tables."""

    def lookup_table(mapping):
        for key, aliases in mapping.items():
            for alias in aliases:
                yield alias, key

    return {
        mapping['field']: dict(lookup_table(mapping['mapping']))
        for mapping in mappings
    }


if __name__ == '__main__':
    parameters, _, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases, lookup_tables=lookup_tables_)
    spew(_, new_resources)
            mk_attendance.append(mk_aggs)
        except Exception:
            logging.exception("Failed to process mk_individual name {}".format(mk_name))
            raise
    else:
        raise Exception("Failed to find mk_individual name for mk_individual id {}".format(
            mk_individual["mk_individual_id"]))

meeting_aggs_fields = [
    {"name": "knesset_num", "type": "integer"},
    {"name": "committee_id", "type": "integer"},
    {"name": "committee_name", "type": "string"},
    {"name": "meeting_start_date", "type": "datetime"},
    {"name": "meeting_topics", "type": "string"},
]

datapackage["resources"] = []
datapackage["resources"] += [{
    "name": "errors",
    "path": "errors.csv",
    PROP_STREAMING: True,
    "schema": {"fields": [{"name": "error", "type": "string"}]},
}]
datapackage["resources"] += [{
    PROP_STREAMING: True,
    "name": "mk_attendance",
    "path": "mk_attendance.csv",
    "schema": {"fields": meeting_aggs_fields + [
        {"name": "mk_id", "type": "integer"},
        {"name": "mk_name", "type": "string"},
        {"name": "mk_membership_committee_names", "type": "string"},
        {"name": "mk_faction_id", "type": "integer"},
        {"name": "mk_faction_name", "type": "string"},
    ]},
}]

spew(datapackage, [errors, mk_attendance])
def get_project_urls():
    """Return the complete list of project URLs."""
    counter = 0
    paths = []
    while True:
        counter += 1
        project = PAGINATION_URL.format(counter=counter)
        response = session.get(project)
        if response.text:
            doc = fromstring(response.content)
            more_links = doc.findall(PROJECT_URLS_XPATH)
            more_paths = list(map(lambda x: x.get('href'), more_links))
            paths.extend(more_paths)
            info('Collected %s urls on page %s', len(more_paths), counter)
        else:
            return paths


if __name__ == '__main__':
    _, datapackage, _ = ingest()
    project_paths = get_project_urls()
    project_rows = scrape_projects(project_paths)
    spew(datapackage, [project_rows])
"""A processor to inject constant values into the data.""" from datapackage_pipelines.wrapper import ingest, spew row_count = 0 def process_rows(prefix, rows): global row_count for row in rows: row['internal_id'] = '{}-{}'.format(prefix, row_count) yield row row_count += 1 def process(prefix, resources): for resource in resources: yield process_rows(prefix, resource) if __name__ == '__main__': """Ingest, process and spew out.""" parameters_, datapackage_, resources_ = ingest() spew(datapackage_, process(parameters_['prefix'], resources_))
"""Grab the source description and convert it into a datapackage""" import json import logging from datapackage_pipelines.wrapper import ingest, spew from common.config import JSON_FORMAT from common.utilities import get_fiscal_datapackage if __name__ == '__main__': _, datapackage, resources = ingest() fiscal_datapackage = get_fiscal_datapackage(source=datapackage) fiscal_datapackage_as_json = json.dumps(fiscal_datapackage, **JSON_FORMAT) logging.debug('Loaded fiscal datapackage:\n%s', fiscal_datapackage_as_json) spew(fiscal_datapackage, resources)
"""The template for writing PDF and web scrapers.""" from datapackage_pipelines.wrapper import ingest, spew from logging import debug def scrape_beneficiaries(**params): """Return a generator of beneficiaries. Each beneficiary is a dictionary whose keys match the fields described in source.description.yaml. Parameters come from pipeline-specs.yaml. """ debug('%s', **params) beneficiaries = [ {'field1': 'foo', 'field2': 'spam'}, {'field1': 'bar', 'field2': 'eggs'}, ] for beneficiary in beneficiaries: yield beneficiary if __name__ == '__main__': parameters, datapackage, _ = ingest() rows = scrape_beneficiaries(**parameters) spew(datapackage, [rows])
import logging

import arrow
from arrow.parser import ParserError

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_currencies(row):
    """Parse the date columns into dates; unparseable values become None."""
    date_columns = (
        'Datum van laatste bijwerking',
        'Einddatum',
        'Begindatum'
    )
    for key in date_columns:
        try:
            row[key] = arrow.get(row[key], 'DD.MM.YYYY HH:mm')
        except ParserError:
            if row[key] != '0000-00-00 00:00:00':
                message = 'Could not parse %s to a date, returning None'
                logging.warning(message, row[key])
            row[key] = None
    return row


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
        'fields': [{
            'name': h,
            'type': 'string'
        } for h in headers]
    }
    datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, domain):

    def get_latest_row(first):
        latest_row = None
        my_rows = []
        for row in first:
            if row['domain'] == domain and row['source'] == 'discourse':
                latest_row = row
            my_rows.append(row)
        return latest_row, iter(my_rows)

    if len(datapackage['resources']):
        if datapackage['resources'][0]['name'] == 'latest-project-entries':
            latest_row, latest_iter = get_latest_row(next(res_iter))
            yield latest_iter
        else:
            latest_row = None
            yield from res_iter
    yield discourse_collector(domain, latest_row)


spew(datapackage, process_resources(res_iter, datapackage, domain))
    yield field_['name'], converters[field_['type']]


converter = dict(get_fiscal_types())
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast values to fiscal types."""
    for key, value in row.items():
        if value:
            try:
                if value is None or (type(value) is str and len(value.strip()) == 0):
                    row[key] = None
                else:
                    row[key] = converter[key](value)
            except (ValueError, arrow.parser.ParserError):
                message = 'Could not cast %s = %s to %s, returning None' % \
                    (key, row[key], converter[key])
                logging.warning(message)
                assert False, message
    return row


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
        all_attendee_names = set()
        for attendee_names in (meeting["mks"], meeting["invitees"],
                               meeting["legal_advisors"], meeting["manager"]):
            if attendee_names and len(attendee_names) > 0:
                for attendee_name in attendee_names:
                    if type(attendee_name) == str:
                        all_attendee_names.add(attendee_name)
                    else:
                        all_attendee_names.add(attendee_name["name"])
        attended_mk_individual_ids = set()
        for attendee_name in all_attendee_names:
            for mk_individual in filter(
                    lambda mk: meeting["KnessetNum"] in mk["knesset_nums"],
                    map(get_mk_individual, mk_individuals)):
                if meeting["KnessetNum"] in mk_individual["knesset_nums"]:
                    name_equals, name_in = False, False
                    for name in mk_individual["mk_names"]:
                        if name == attendee_name:
                            name_equals += 1
                        if name in attendee_name:
                            name_in += 1
                    if name_equals or name_in:
                        attended_mk_individual_ids.add(mk_individual["mk_individual_id"])
        meeting["attended_mk_individual_ids"] = list(attended_mk_individual_ids)
        yield meeting


datapackage["resources"] = [datapackage["resources"][1]]
datapackage["resources"][0]["schema"]["fields"] += [
    {"name": "attended_mk_individual_ids", "type": "array"}]

spew(datapackage, [get_resource()])
        args = filename, format_data_sample(stream)
        info('Concatenated %s:\n%s', *args)

    info('Done concatenating %s files', nb_files)


def assemble_fiscal_datapackage():
    """Assemble the fiscal datapackage for the concatenated dataset."""
    with open(FISCAL_METADATA_FILE) as stream:
        fdp = yaml.load(stream.read())
    with open(FISCAL_MODEL_FILE) as stream:
        fdp['model'] = yaml.load(stream.read())
    with open(FISCAL_SCHEMA_FILE) as stream:
        fdp['resources'][0]['schema'] = yaml.load(stream.read())
    message = 'Fiscal datapackage: \n%s'
    info(message, format_to_json(fdp))
    return fdp


if __name__ == '__main__':
    parameters, datapackage, _ = ingest()
    datapackage = assemble_fiscal_datapackage()
    datasets = collect_local_datasets(**parameters)
    resource = concatenate(datasets, **parameters)
    spew(datapackage, [resource])
        collated_field_name)] = {
            'fields': inner_fields
        }


def val(v):
    if isinstance(v, Decimal):
        v = float(v)
    elif isinstance(v, date):
        v = v.isoformat()
    return v


def process_resource(res):
    for row in res:
        inner = dict((k, val(v)) for k, v in row.items() if k not in key)
        outer = dict((k, v) for k, v in row.items() if k in key)
        outer[collated_field_name] = inner
        yield outer


def process_resources(res_iter_):
    for res in res_iter_:
        if resource_matcher.match(res.spec['name']):
            yield process_resource(res)
        else:
            yield res


spew(dp, process_resources(res_iter))
            if index != 1:
                yield index, headers, values

    @staticmethod
    def _fixed_points(rows):
        """Convert floats to 2-digit fixed precision strings"""
        for index, headers, values in rows:
            values = [
                '%.2f' % value if type(value) is float else value
                for value in values
            ]
            yield index, headers, values


XLSXIngestor = XLSIngestor


def ingest_resources(datapackage):
    """Ingest each resource one by one into the pipeline."""
    for resource in datapackage['resources']:
        ingestor = BaseIngestor.load(resource)
        yield ingestor.rows


if __name__ == '__main__':
    _, datapackage_, _ = ingest()
    resources = list(ingest_resources(datapackage_))
    spew(datapackage_, resources)
    headers = [hdr_num, hdr_name, hdr_reg_date]
    for data in datums:
        yield dict(zip(headers, treat(data)))


resource = {
    'name': resource_name,
    PROP_STREAMING: True,
    'path': 'data/{}.csv'.format(resource_name),
    'schema': {
        'fields': [
            {'name': hdr_num, 'type': 'string'},
            {'name': hdr_name, 'type': 'string'},
            {'name': hdr_reg_date, 'type': 'string'},
        ]
    }
}

datapackage['resources'].append(resource)

spew(datapackage, [get_entities()])
"""A processor to concatenate resources that have a common set of fields.""" from datapackage_pipelines.wrapper import ingest, spew def concatenate(resources): """Concatenate multiple resources.""" for resource in resources: for row in resource: yield row if __name__ == '__main__': _, datapackage, resources_ = ingest() single_resource = concatenate(resources_) datapackage['resources'] = [datapackage['resources'][0]] spew(datapackage, [single_resource])
resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:
    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
            resource['path'] = path

        resource[PROP_STREAMING] = True

        rows = stream_reader(resource, url, ignore_missing or url == "",
                             limit_rows, resource.pop('http_headers', None))

        new_resource_iterator.append(rows)

    elif streaming(resource):
        new_resource_iterator.append(next(resource_iterator))

spew(datapackage, new_resource_iterator)
"UnitText1_En": doc["_source"]["UnitText1"].get("En") if doc["_source"].get("UnitText1") else "", "UnitText1_He": doc["_source"]["UnitText1"].get("He") if doc["_source"].get("UnitText1") else "", "Header_En": doc["_source"]["Header"].get("En") if doc["_source"].get("Header") else "", "Header_He": doc["_source"]["Header"].get("He") if doc["_source"].get("Header") else "", }) yield from filtered_row else: break datapackage = {"name": "_", "resources": [{"name": "es_data", "path": "es_data.csv", PROP_STREAMING: True, "schema": {"fields": [{"name": "index", "type": "string"}, {"name": "doc_type", "type": "string"}, {"name": "doc_id", "type": "string"}, {"name": "UnitId", "type": "string"}, {"name": "RightsCode", "type": "string"}, {"name": "RightsDesc", "type": "string"}, {"name": "StatusDesc", "type": "string"}, {"name": "DisplayStatusDesc", "type": "string"}, {"name": "UnitType", "type": "string"}, {"name": "Slug_En", "type": "string"}, {"name": "Slug_He", "type": "string"}, {"name": "UnitText1_En", "type": "string"}, {"name": "UnitText1_He", "type": "string"}, {"name": "Header_En", "type": "string"}, {"name": "Header_He", "type": "string"}]}}]} spew(datapackage, get_resources(), stats)
def main():
    parameters, dp, res_iter = ingest()
    connection_string = get_connection_string()
    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    prefix = parameters.get('prefix', '')
    STATUS_FIELDS = [
        {'name': prefix + '__last_updated_at', 'type': 'datetime'},
        {'name': prefix + '__last_modified_at', 'type': 'datetime'},
        {'name': prefix + '__created_at', 'type': 'datetime'},
        {'name': prefix + '__is_new', 'type': 'boolean'},
        {'name': prefix + '__is_stale', 'type': 'boolean'},
        {'name': prefix + '__staleness', 'type': 'integer'},
        {'name': prefix + '__next_update_days', 'type': 'integer'},
        {'name': prefix + '__hash', 'type': 'string'},
    ]
    STATUS_FIELD_NAMES = list(f['name'] for f in STATUS_FIELDS)
    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            input_hash_fields = set(input_hash_fields) - set(STATUS_FIELD_NAMES)
            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)
            existing_ids = get_all_existing_ids(
                connection_string, parameters['db-table'], db_key_fields,
                [
                    prefix + '__last_updated_at',
                    prefix + '__next_update_days',
                    prefix + '__hash',
                    prefix + '__created_at',
                ]
            )
            break
    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))
    spew(dp,
         process_resources(res_iter, resource_name, input_key_fields,
                           input_hash_fields, existing_ids, prefix))
def process(resources):

    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError(
                            '%s: Got %r whereas allowed values for this column are %r' %
                            (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100 * nones[column]) // counter
            if ratio_percent < thresholds[column]:
                raise ValueError(
                    '%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)' %
                    (column, counter - nones[column], counter, ratio_percent, thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
# ISSUES & PR COUNTS base_issue_url = '{}/search/issues?q=repo:{}'.format(base_url, current_repo_name) row['open_prs'] = _get_issue_count_for_request( '{}%20state:open%20is:pr'.format(base_issue_url)) row['closed_prs'] = _get_issue_count_for_request( '{}%20state:closed%20is:pr'.format(base_issue_url)) row['open_issues'] = _get_issue_count_for_request( '{}%20state:open%20is:issue'.format(base_issue_url)) row['closed_issues'] = _get_issue_count_for_request( '{}%20state:closed%20is:issue'.format(base_issue_url)) resource = {'name': name, 'path': 'data/{}.csv'.format(name)} # Temporarily set all types to string, will use `set_types` processor in # pipeline to assign correct types resource['schema'] = { 'fields': [{ 'name': h, 'type': 'string' } for h in row.keys()] } datapackage['resources'].append(resource) resource_content.append(row) spew(datapackage, itertools.chain(res_iter, [resource_content]))
def main(cls):
    # can be used like this in datapackage processor files:
    # if __name__ == '__main__':
    #     Processor.main()
    spew(*cls(*ingest()).spew())
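# Hedged illustration of the classmethod pattern above; the class name, the
# `source` parameter and the added column are invented for the example. It only
# assumes what the pattern itself implies: the constructor receives the
# ingest() triple and spew() returns the arguments for the wrapper's spew().
from datapackage_pipelines.wrapper import ingest, spew


class AddSourceColumn:

    def __init__(self, parameters, datapackage, resources):
        self.parameters = parameters
        self.datapackage = datapackage
        self.resources = resources

    def process_resources(self):
        # pass every resource through, adding a constant 'source' value to each row
        for resource in self.resources:
            yield (dict(row, source=self.parameters.get('source', '')) for row in resource)

    def spew(self):
        # datapackage, resource iterator and stats, unpacked into the wrapper's spew()
        return self.datapackage, self.process_resources(), {}

    @classmethod
    def main(cls):
        spew(*cls(*ingest()).spew())


if __name__ == '__main__':
    AddSourceColumn.main()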