"""A processor to map raw columns to fiscal fields."""

from logging import info

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process, get_fiscal_field_names  # assumed location


def build_lookup_table(datapackage_):
    """Yield (name, maps_to) pairs for fields with a valid fiscal mapping."""
    # Signature reconstructed from the call site below; the top of the
    # original file is truncated.
    fields = datapackage_['resources'][0]['schema']['fields']
    fiscal_fields = get_fiscal_field_names()
    for field in fields:
        checks = ('maps_to' in field,
                  field.get('maps_to'),
                  field.get('maps_to') in fiscal_fields)
        if all(checks):
            yield field['name'], field['maps_to']
            info('%s mapped to %s', field['name'], field['maps_to'])
        else:
            info('Skipping %s', field['name'])


def apply_mapping(row, mapping=None):
    """Apply the mapping to one row."""
    if mapping:
        for old, new in mapping:
            row[new] = row.pop(old)
    return row


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    lookup_table = tuple(build_lookup_table(datapackage))
    new_resources = process(resources, apply_mapping, mapping=lookup_table)
    spew(datapackage, new_resources)
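# --- Usage sketch (illustrative, not part of the processor above) ---
# apply_mapping renames row keys in place, given the (old, new) pairs that
# build_lookup_table yields; the column names below are made up.
def _demo_apply_mapping():
    mapping = (('Project Name', 'title'), ('Amount Paid', 'amount'))
    row = apply_mapping({'Project Name': 'A40', 'Amount Paid': '100'},
                        mapping=mapping)
    assert row == {'title': 'A40', 'amount': '100'}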
"""A processor to rename mapped fields and drop the rest."""

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process

# NOTE: the top of this file is truncated; `build_mapping_tables` and the
# first part of `update_datapackage` are defined there.


def update_datapackage(datapackage, mappings):
    """Update the datapackage schema to match the renamed fields."""
    # Reconstructed as far as the surviving fragment allows: any use of
    # `mappings` above this point is lost to truncation.
    for resource in datapackage['resources']:
        fields = []
        for field in resource['schema']['fields']:
            if 'translates_to' in field:
                del field['translates_to']
            fields.append(field)
        resource['schema']['fields'] = fields
    return datapackage


def apply_mapping(row, mappings=None, resource_index=None):
    """Rename data keys with a valid mapping and drop the rest."""
    for raw_key, fiscal_key in mappings[resource_index].items():
        if fiscal_key in ('_ignored', '_unknown'):
            del row[raw_key]
        else:
            row[fiscal_key] = row.pop(raw_key)
    return row


if __name__ == '__main__':
    _, datapackage_, resources_ = ingest()
    mappings_ = build_mapping_tables(datapackage_)
    datapackage_ = update_datapackage(datapackage_, mappings_)
    new_resources_ = process(resources_, apply_mapping,
                             mappings=mappings_,
                             pass_resource_index=True)
    spew(datapackage_, new_resources_)
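# --- Usage sketch (illustrative) ---
# Keys mapped to the sentinel values '_ignored' or '_unknown' are dropped;
# the rest are renamed. The raw column names below are made up.
def _demo_apply_mapping():
    mappings = [{'Bedrag': 'amount', 'Opmerking': '_ignored'}]
    row = apply_mapping({'Bedrag': '10', 'Opmerking': 'n/a'},
                        mappings=mappings, resource_index=0)
    assert row == {'amount': '10'}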
import json
import logging

import arrow
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process

# `get_fiscal_types` is defined in the truncated top of this file: it yields
# a (field name, caster) pair for each field of the fiscal schema.
converter = dict(get_fiscal_types())
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast values to fiscal types, nulling whitespace-only strings."""
    for key, value in row.items():
        if value:
            try:
                if isinstance(value, str) and not value.strip():
                    row[key] = None
                else:
                    row[key] = converter[key](value)
            except (ValueError, arrow.parser.ParserError):
                logging.warning('Could not cast %s = %s to %s, returning None',
                                key, row[key], converter[key])
                row[key] = None
    return row


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
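# --- Illustrative sketch ---
# Shows the (name -> caster) pattern `converter` relies on, with made-up
# field names and casters; the real pairs come from get_fiscal_types.
def _demo_casters():
    casters = {'amount': float, 'year': int}
    raw = {'amount': '12.5', 'year': '2015'}
    assert {k: casters[k](v) for k, v in raw.items()} == \
        {'amount': 12.5, 'year': 2015}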
"""A processor to replace field values with their canonical aliases."""

from logging import warning

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def map_aliases(row, lookup_tables=None):
    """Replace each field value with its canonical form, else None."""
    # Signature reconstructed from the call site below; the top of the
    # original file is truncated.
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Build one alias-to-canonical lookup table per field."""

    def lookup_table(mapping):
        for key, aliases in mapping.items():
            for alias in aliases:
                yield alias, key

    return {
        mapping['field']: dict(lookup_table(mapping['mapping']))
        for mapping in mappings
    }


if __name__ == '__main__':
    parameters, datapackage, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases,
                            lookup_tables=lookup_tables_)
    spew(datapackage, new_resources)
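# --- Illustrative sketch ---
# build_lookup_tables inverts each {canonical: [aliases]} mapping into an
# {alias: canonical} table, keyed by field name; the values are made up.
def _demo_build_lookup_tables():
    mappings = [{'field': 'currency', 'mapping': {'EUR': ['euro', 'Euros']}}]
    assert build_lookup_tables(mappings) == \
        {'currency': {'euro': 'EUR', 'Euros': 'EUR'}}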
"""A processor to fill in fields which have constant values.""" from datapackage_pipelines.wrapper import ingest, spew from common.utilities import process def fill_columns(row, constants=None): """Fill columns whose value is constant.""" if constants: for key, constant_value in constants.items(): row[key] = constant_value return row if __name__ == '__main__': parameters, datapackage, resources = ingest() new_resources = process(resources, fill_columns, **parameters) spew(datapackage, new_resources)
"""A processor to cast values, using a data sample to infer the casters."""

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process

# NOTE: the top of this file is truncated; `get_casters`, `cast_values` and
# the first lines of `extract_data_sample` are defined there.


def extract_data_sample(resource, sample_size=10):
    """Split off the first `sample_size` rows of a resource."""
    # Signature reconstructed; the original sample size is unknown.
    data_sample = []
    for i, row in enumerate(resource):
        data_sample.append(row)
        if i + 1 == sample_size:
            break
    return data_sample, resource


def concatenate_data_sample(data_sample, resource):
    """Concatenate the sample rows back with the rest of the resource."""
    yield from data_sample
    yield from resource


if __name__ == '__main__':
    parameters_, datapackage_, resources_ = ingest()
    resources_ = list(resources_)
    assert len(resources_) == 1
    resource_ = resources_[0]
    resource_sample_, resource_left_over_ = extract_data_sample(resource_)
    casters_ = get_casters(datapackage_, resource_sample_,
                           parameters=parameters_)
    resource_ = concatenate_data_sample(resource_sample_, resource_left_over_)
    kwargs = dict(casters=casters_, pass_row_index=True)
    new_resources_ = process([resource_], cast_values, **kwargs)
    spew(datapackage_, new_resources_)
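# --- Usage sketch (illustrative) ---
# The split and re-concatenation preserve row order, because the generator
# passed in keeps its position after the sample is taken.
def _demo_sample_roundtrip():
    rows = iter(range(5))
    sample, rest = extract_data_sample(rows, sample_size=2)
    assert sample == [0, 1]
    assert list(concatenate_data_sample(sample, rest)) == [0, 1, 2, 3, 4]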
"""A processor to parse dates in long, short, quarter or year-only formats."""

from datetime import date

import arrow
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process

# NOTE: the top of this file is truncated; `weekdays`, `long_format`,
# `short_format`, `quarter_pattern`, `quarter_month` and `quarter_end_day`
# are module-level helpers defined there.


def parse_dates(row, date_fields=None):
    """Parse the date fields of one row."""
    # Signature reconstructed: `date_fields` is assumed to carry the names
    # of the date columns, passed in through the pipeline parameters.
    for date_field in date_fields:
        raw_date = row[date_field]
        first_token = raw_date.split(',')[0].lower()
        if first_token in weekdays:
            row[date_field] = arrow.get(raw_date, long_format).date()
        elif '/' in raw_date:
            row[date_field] = arrow.get(raw_date, short_format).date()
        elif 'Q' in raw_date:
            match = quarter_pattern.search(raw_date)
            if match:
                quarter = int(match.group(1))
                year = int(match.group(2))
                month = quarter_month[quarter]
                day = quarter_end_day[quarter] if 'End' in date_field else 1
                row[date_field] = date(year, month, day)
        else:
            year = int(raw_date)
            row[date_field] = date(year, 1, 1)
    return row


if __name__ == '__main__':
    parameters, datapackage, resources = ingest()
    new_resources = process(resources, parse_dates, **parameters)
    spew(datapackage, new_resources)
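# --- Illustrative sketch ---
# Quarter strings resolve to the first day of the quarter, or to its last
# day for fields whose name contains 'End'. The pattern and month table
# below are hypothetical stand-ins for the truncated module-level helpers.
import re
from datetime import date


def _demo_quarter_parsing():
    pattern = re.compile(r'Q([1-4])\s*(\d{4})')
    first_month = {1: 1, 2: 4, 3: 7, 4: 10}
    match = pattern.search('Q3 2015')
    quarter, year = int(match.group(1)), int(match.group(2))
    assert date(year, first_month[quarter], 1) == date(2015, 7, 1)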
"""This processor modifies the datapackage without modifying the resources.""" from datapackage_pipelines.wrapper import ingest from datapackage_pipelines.wrapper import spew from os.path import splitext from common.utilities import process def set_extension_to_csv(package): """Rename the data file with a CSV extension.""" for resource in package['resources']: _, extension = splitext(resource['path']) resource['path'] = resource['path'].replace(extension, '.csv') return package if __name__ == '__main__': _, datapackage, resources = ingest() new_datapackage = set_extension_to_csv(datapackage) new_resources = process(resources, lambda x: x) spew(new_datapackage, new_resources)
"""A processor to parse the date columns of each row."""

import logging

import arrow
from arrow.parser import ParserError
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_dates(row):
    """Parse the date columns, setting unparseable values to None."""
    date_columns = ('Datum van laatste bijwerking', 'Einddatum', 'Begindatum')
    for key in date_columns:
        try:
            row[key] = arrow.get(row[key], 'DD.MM.YYYY HH:mm')
        except ParserError:
            if row[key] != '0000-00-00 00:00:00':
                message = 'Could not parse %s to a date, returning None'
                logging.warning(message, row[key])
            row[key] = None
    return row


if __name__ == '__main__':
    _, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_dates)
    spew(datapackage_, new_resources_)
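# --- Usage sketch (illustrative) ---
# Valid timestamps become arrow objects; the zero placeholder is nulled
# silently, and anything else unparseable is nulled with a warning.
def _demo_parse_dates():
    row = parse_dates({'Begindatum': '01.01.2015 00:00',
                       'Einddatum': '0000-00-00 00:00:00',
                       'Datum van laatste bijwerking': 'not a date'})
    assert row['Begindatum'].year == 2015
    assert row['Einddatum'] is None
    assert row['Datum van laatste bijwerking'] is None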
"""A processor to fill in the geographic fields of each row.

Row mutation
------------
Fills in the following fields with constant values:

- beneficiary_country_code
- beneficiary_country
- beneficiary_nuts_code
- beneficiary_nuts_region

Datapackage mutation
--------------------
None.
"""

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def add_geocodes(row, **kw):
    """Fill up the country and region fields."""
    row['beneficiary_country_code'] = kw['country_code']
    row['beneficiary_country'] = kw['country']
    row['beneficiary_nuts_code'] = kw['nuts_code']
    row['beneficiary_nuts_region'] = kw['region']
    return row


if __name__ == '__main__':
    parameters_, datapackage, resources = ingest()
    new_resources = process(resources, add_geocodes, **parameters_)
    spew(datapackage, new_resources)
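# --- Usage sketch (illustrative values) ---
def _demo_add_geocodes():
    row = add_geocodes({}, country_code='BE', country='Belgium',
                       nuts_code='BE10', region='Brussels')
    assert row['beneficiary_country_code'] == 'BE'
    assert row['beneficiary_nuts_region'] == 'Brussels'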
from common.utilities import process  # assumed location


def test_process_returns_a_generator_of_generators():
    assert next(next(process([['foo']], lambda x: x))) == 'foo'
"""A processor to clean up and convert currency fields to floats."""

from logging import warning

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process, format_to_json  # assumed location


def parse_currencies(row, fields=None, characters=None):
    """Clean up and convert currency fields to floats."""
    assert fields, 'Missing `fields` parameter'
    assert characters, 'Missing `characters` parameter'

    for key in fields:
        if row[key] is not None:
            row[key] = str(row[key])
            if not row[key].strip():
                row[key] = None
            else:
                try:
                    row[key] = float(row[key]
                                     .replace(characters['currency'], '')
                                     .replace(characters['grouping'], '')
                                     .replace(characters['decimal'], '.')
                                     .strip())
                except ValueError as error:
                    warning('%s in row\n%s', error, format_to_json(row))
    return row


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies, **parameters)
    spew(datapackage_, new_resources_)
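# --- Usage sketch (illustrative) ---
# The `characters` mapping tells the parser which symbols to strip; the
# values below assume a European-style format and are made up.
def _demo_parse_currencies():
    characters = {'currency': '€', 'grouping': '.', 'decimal': ','}
    row = parse_currencies({'amount': '€ 1.234,56'},
                           fields=['amount'], characters=characters)
    assert row['amount'] == 1234.56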