Пример #1
0
from arrow.parser import ParserError
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_currencies(row):
    """Parse the date columns of a row with arrow.

    Despite the function name, no currency field is touched here: the three
    columns listed below are read with the ``DD.MM.YYYY HH:mm`` format and
    replaced by whatever ``arrow.get`` returns. A value that raises
    ``ParserError`` is replaced by ``None``; a warning is logged unless the
    value equals the ``0000-00-00 00:00:00`` placeholder.

    :param row: mapping that holds the three date columns
    :returns: the same row, modified in place
    """
    placeholder = '0000-00-00 00:00:00'
    columns = ('Datum van laatste bijwerking', 'Einddatum', 'Begindatum')

    for column in columns:
        raw_value = row[column]
        try:
            parsed = arrow.get(raw_value, 'DD.MM.YYYY HH:mm')
        except ParserError:
            # The placeholder is presumably an "empty date" marker, so it is
            # the one unparsable value that is not reported.
            if raw_value != placeholder:
                logging.warning(
                    'Could not parse %s to a date, returning None', raw_value)
            parsed = None
        row[column] = parsed

    return row


if __name__ == '__main__':
    # Script entry point: read the inputs with ingest(), route the resources
    # through parse_currencies via process(), then hand the datapackage and
    # the processed resources to spew(). `parameters` is not used here.
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
Пример #2
0
    fields = datapackage_['resources'][0]['schema']['fields']
    fiscal_fields = get_fiscal_field_names()

    for field in fields:
        checks = ('maps_to' in field,
                  field.get('maps_to'),
                  field.get('maps_to') in fiscal_fields)

        if all(checks):
            yield field['name'], field['maps_to']
            info('%s mapped to %s', field['name'], field['maps_to'])
        else:
            info('Skipping %s', field['name'])


def apply_mapping(row, mapping=None):
    """Rename the keys of one row according to ``mapping``.

    :param row: mapping to rename keys in; modified in place
    :param mapping: iterable of ``(old, new)`` key pairs; a falsy value
        leaves the row untouched
    :returns: the same row
    :raises KeyError: when an ``old`` key is missing from the row
    """
    for source_key, target_key in mapping or ():
        # Pop first, then store: the value moves to its new key.
        value = row.pop(source_key)
        row[target_key] = value
    return row


if __name__ == '__main__':
    # Script entry point: ingest the inputs, rename row keys with
    # apply_mapping via process(), and pass the results to spew().
    _, datapackage, resources = ingest()
    # Materialise the pairs as a tuple so they can be iterated more than once.
    lookup_table = tuple(build_lookup_table(datapackage))
    new_resources = process(resources, apply_mapping, mapping=lookup_table)
    spew(datapackage, new_resources)
Пример #3
0
    fields = datapackage_['resources'][0]['schema']['fields']
    fiscal_fields = get_fiscal_field_names()

    for field in fields:
        checks = ('maps_to'
                  in field, field.get('maps_to'), field.get('maps_to')
                  in fiscal_fields)

        if all(checks):
            yield field['name'], field['maps_to']
            info('%s mapped to %s', field['name'], field['maps_to'])
        else:
            info('Skipping %s', field['name'])


def apply_mapping(row, mapping=None):
    """Move row values from their old keys to their new keys.

    :param row: mapping to update in place
    :param mapping: iterable of ``(old, new)`` key pairs; nothing happens
        when it is falsy
    :returns: the row that was passed in
    :raises KeyError: when an ``old`` key is not present in the row
    """
    if not mapping:
        return row

    for pair in mapping:
        previous_name, fiscal_name = pair
        row[fiscal_name] = row.pop(previous_name)

    return row


if __name__ == '__main__':
    # Script entry point: ingest the inputs, rename row keys with
    # apply_mapping via process(), and pass the results to spew().
    _, datapackage, resources = ingest()
    # Materialise the pairs as a tuple so they can be iterated more than once.
    lookup_table = tuple(build_lookup_table(datapackage))
    new_resources = process(resources, apply_mapping, mapping=lookup_table)
    spew(datapackage, new_resources)
                if 'translates_to' in field:
                    del field['translates_to']
                fields.append(field)

        resource['schema']['fields'] = fields

    return datapackage


def apply_mapping(row, mappings=None, resource_index=None):
    """Rename mapped keys of a row and drop the ignored or unknown ones.

    :param row: mapping to update in place
    :param mappings: indexable collection holding one ``{raw_key:
        fiscal_key}`` dict per resource
    :param resource_index: index selecting the dict to use from ``mappings``
    :returns: the same row
    :raises KeyError: when a raw key of the mapping is missing from the row
    """
    discarded = ('_ignored', '_unknown')
    table = mappings[resource_index]

    for raw_name, fiscal_name in table.items():
        if fiscal_name not in discarded:
            row[fiscal_name] = row.pop(raw_name)
            continue
        # Keys flagged as ignored or unknown are removed from the row.
        del row[raw_name]

    return row


if __name__ == '__main__':
    # Script entry point. The mapping tables are built once from the
    # datapackage and then used twice: to update the datapackage itself and
    # to rename the keys of the rows.
    _, datapackage_, resources_ = ingest()
    mappings_ = build_mapping_tables(datapackage_)
    datapackage_ = update_datapackage(datapackage_, mappings_)
    # NOTE(review): pass_resource_index=True presumably makes process()
    # supply the `resource_index` argument of apply_mapping - confirm in
    # common.utilities.
    new_resources_ = process(resources_,
                             apply_mapping,
                             mappings=mappings_,
                             pass_resource_index=True)
    spew(datapackage_, new_resources_)
# Lookup table built from the pairs produced by get_fiscal_types();
# cast_values calls converter[key](value) for each value it casts.
converter = dict(get_fiscal_types())
# Keep only the callables' names so the table can be serialised for the log.
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast the truthy values of a row with the module-level ``converter``.

    Falsy values (``None``, ``''``, ``0`` ...) are left untouched.
    Whitespace-only strings become ``None``. Every other value is replaced
    by ``converter[key](value)``.

    :param row: mapping of field name to raw value; modified in place
    :returns: the same row
    :raises AssertionError: when a converter raises ``ValueError`` or
        ``arrow.parser.ParserError``; the failure is logged first
    :raises KeyError: when ``converter`` has no entry for a field being cast
    """
    for key, value in row.items():
        if not value:
            continue

        if isinstance(value, str) and not value.strip():
            row[key] = None
            continue

        try:
            row[key] = converter[key](value)
        except (ValueError, arrow.parser.ParserError):
            message = 'Could not cast %s = %s to %s, returning None' % (
                key, row[key], converter[key])
            logging.warning(message)
            # Raise explicitly: `assert False` is removed under `python -O`,
            # which would let the uncast value through silently.
            raise AssertionError(message)

    return row


if __name__ == '__main__':
    # Script entry point: ingest the inputs, run cast_values over the
    # resources via process(), and pass the results to spew().
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
Пример #6
0
Datapackage mutation
--------------------

None.

"""

from datapackage_pipelines.wrapper import ingest, spew

from common.utilities import process


def add_geocodes(row, **kw):
    """Copy the country and region keyword arguments into the row.

    :param row: mapping to update in place
    :param kw: must contain ``country_code``, ``country``, ``nuts_code`` and
        ``region``; any other keyword is ignored
    :returns: the same row
    :raises KeyError: when one of the four keywords is missing
    """
    # (row field, keyword argument) pairs, in the order they are written.
    field_sources = (
        ('beneficiary_country_code', 'country_code'),
        ('beneficiary_country', 'country'),
        ('beneficiary_nuts_code', 'nuts_code'),
        ('beneficiary_nuts_region', 'region'),
    )
    for field, keyword in field_sources:
        row[field] = kw[keyword]

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside add_geocodes, which reads
    # country_code, country, nuts_code and region from its keywords.
    parameters_, datapackage, resources = ingest()

    new_resources = process(resources, add_geocodes,
                            **parameters_)
    spew(datapackage, new_resources)
Пример #7
0
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Invert the alias lists into one ``alias -> key`` table per field.

    :param mappings: iterable of dicts with a ``field`` entry and a
        ``mapping`` entry of the form ``{key: [alias, ...]}``
    :returns: dict of ``{field: {alias: key}}``
    """
    tables = {}

    for entry in mappings:
        field = entry['field']
        alias_to_key = {}
        for key, aliases in entry['mapping'].items():
            for alias in aliases:
                # A repeated alias keeps the key seen last.
                alias_to_key[alias] = key
        tables[field] = alias_to_key

    return tables


if __name__ == '__main__':
    # Script entry point: build the alias lookup tables from the `mappings`
    # parameter, apply map_aliases via process(), and pass the results on.
    # The datapackage is forwarded to spew(), so it gets a real name rather
    # than the throwaway `_`.
    parameters, datapackage, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases,
                            lookup_tables=lookup_tables_)
    spew(datapackage, new_resources)
"""A processor to fill in fields which have constant values."""

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def fill_columns(row, constants=None):
    """Write every constant into the row under its column name.

    :param row: mapping to update in place
    :param constants: dict of ``{column: value}``; a falsy value leaves the
        row untouched
    :returns: the same row
    """
    if not constants:
        return row

    for column, value in constants.items():
        row[column] = value

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside fill_columns.
    # NOTE(review): presumably process() forwards them to fill_columns, so
    # the parameters need a `constants` entry - confirm in common.utilities.
    parameters, datapackage, resources = ingest()
    new_resources = process(resources, fill_columns, **parameters)
    spew(datapackage, new_resources)
    for i, row in enumerate(resource):
        data_sample.append(row)
        if i + 1 == sample_size:
            break
    return data_sample, resource


def concatenate_data_sample(data_sample, resource):
    """Yield the sample rows, then the remaining rows of the resource.

    :param data_sample: iterable of rows that were already taken
    :param resource: iterable holding the rows that are left
    :returns: generator over the sample rows followed by the resource rows
    """
    # The original kept a row counter here that was never read; it is gone.
    for row in data_sample:
        yield row
    for row in resource:
        yield row


if __name__ == '__main__':
    parameters_, datapackage_, resources_ = ingest()
    resources_ = list(resources_)
    # Check the count before indexing, so an empty list fails with this
    # message rather than with an IndexError.
    assert len(resources_) == 1, (
        'Expected exactly one resource, got %d' % len(resources_))
    resource_ = resources_[0]
    # Take a sample of rows to work out the casters, then put the sample
    # back in front of the rows that are left.
    resource_sample_, resource_left_over_ = extract_data_sample(resource_)
    casters_ = get_casters(datapackage_, resource_sample_,
                           parameters=parameters_)
    resource_ = concatenate_data_sample(resource_sample_, resource_left_over_)
    kwargs = dict(casters=casters_, pass_row_index=True)
    new_resources_ = process([resource_], cast_values, **kwargs)
    spew(datapackage_, new_resources_)
Пример #10
0
        raw_date = row[date_field]
        first_token = raw_date.split(',')[0].lower()

        if first_token in weekdays:
            row[date_field] = arrow.get(raw_date, long_format).date()

        elif '/' in raw_date:
            row[date_field] = arrow.get(raw_date, short_format).date()

        elif 'Q' in raw_date:
            match = quarter_pattern.search(raw_date)

            if match:
                quarter = int(match.group(1))
                year = int(match.group(2))
                month = quarter_month[quarter]
                day = quarter_end_day[quarter] if 'End' in date_field else 1
                row[date_field] = date(year, month, day)

        else:
            year = int(raw_date)
            row[date_field] = date(year, 1, 1)

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside parse_dates; the results go to spew().
    parameters, datapackage, resources = ingest()
    new_resources = process(resources, parse_dates, **parameters)
    spew(datapackage, new_resources)
"""This processor modifies the datapackage without modifying the resources."""

from datapackage_pipelines.wrapper import ingest
from datapackage_pipelines.wrapper import spew
from os.path import splitext

from common.utilities import process


def set_extension_to_csv(package):
    """Give the path of every resource a ``.csv`` extension.

    Only the final extension found by ``splitext`` is swapped. A path with
    no extension gets ``.csv`` appended.

    :param package: datapackage dict with a ``resources`` list whose items
        have a ``path`` entry; modified in place
    :returns: the same package
    """
    for resource in package['resources']:
        # Rebuild from the stem: str.replace() would rewrite every
        # occurrence of the extension text, and an empty extension would
        # insert '.csv' between all characters.
        stem, _ = splitext(resource['path'])
        resource['path'] = stem + '.csv'
    return package


if __name__ == '__main__':
    # Script entry point: only the datapackage is changed. The resources
    # still go through process(), with an identity function.
    _, datapackage, resources = ingest()
    new_datapackage = set_extension_to_csv(datapackage)
    new_resources = process(resources, lambda x: x)
    spew(new_datapackage, new_resources)
Пример #12
0
        raw_date = row[date_field]
        first_token = raw_date.split(',')[0].lower()

        if first_token in weekdays:
            row[date_field] = arrow.get(raw_date, long_format).date()

        elif '/' in raw_date:
            row[date_field] = arrow.get(raw_date, short_format).date()

        elif 'Q' in raw_date:
            match = quarter_pattern.search(raw_date)

            if match:
                quarter = int(match.group(1))
                year = int(match.group(2))
                month = quarter_month[quarter]
                day = quarter_end_day[quarter] if 'End' in date_field else 1
                row[date_field] = date(year, month, day)

        else:
            year = int(raw_date)
            row[date_field] = date(year, 1, 1)

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside parse_dates; the results go to spew().
    parameters, datapackage, resources = ingest()
    new_resources = process(resources, parse_dates, **parameters)
    spew(datapackage, new_resources)
Пример #13
0
    for i, row in enumerate(resource):
        data_sample.append(row)
        if i + 1 == sample_size:
            break
    return data_sample, resource


def concatenate_data_sample(data_sample, resource):
    """Yield the sample rows first, then whatever is left in the resource.

    :param data_sample: iterable of rows taken earlier from the resource
    :param resource: iterable over the rows that were not sampled
    :returns: generator over both sequences, sample first
    """
    # No row counter is kept: the original incremented one without using it.
    for sampled_row in data_sample:
        yield sampled_row
    for remaining_row in resource:
        yield remaining_row


if __name__ == '__main__':
    parameters_, datapackage_, resources_ = ingest()
    resources_ = list(resources_)
    # Check the count before indexing, so an empty list fails with this
    # message rather than with an IndexError.
    assert len(resources_) == 1, (
        'Expected exactly one resource, got %d' % len(resources_))
    resource_ = resources_[0]
    # Take a sample of rows to work out the casters, then put the sample
    # back in front of the rows that are left.
    resource_sample_, resource_left_over_ = extract_data_sample(resource_)
    casters_ = get_casters(datapackage_, resource_sample_,
                           parameters=parameters_)
    resource_ = concatenate_data_sample(resource_sample_, resource_left_over_)
    kwargs = dict(casters=casters_, pass_row_index=True)
    new_resources_ = process([resource_], cast_values, **kwargs)
    spew(datapackage_, new_resources_)
        yield field_['name'], converters[field_['type']]


# Lookup table built from the pairs produced by get_fiscal_types();
# cast_values calls converter[key](value) for each value it casts.
converter = dict(get_fiscal_types())
# Keep only the callables' names so the table can be serialised for the log.
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast the truthy values of a row with the module-level ``converter``.

    Falsy values (``None``, ``''``, ``0`` ...) are left untouched.
    Whitespace-only strings become ``None``. Every other value is replaced
    by ``converter[key](value)``.

    :param row: mapping of field name to raw value; modified in place
    :returns: the same row
    :raises AssertionError: when a converter raises ``ValueError`` or
        ``arrow.parser.ParserError``; the failure is logged first
    :raises KeyError: when ``converter`` has no entry for a field being cast
    """
    for key, value in row.items():
        if not value:
            continue

        if isinstance(value, str) and not value.strip():
            row[key] = None
            continue

        try:
            row[key] = converter[key](value)
        except (ValueError, arrow.parser.ParserError):
            message = 'Could not cast %s = %s to %s, returning None' % (
                key, row[key], converter[key])
            logging.warning(message)
            # Raise explicitly: `assert False` is removed under `python -O`,
            # which would let the uncast value through silently.
            raise AssertionError(message)

    return row


if __name__ == '__main__':
    # Script entry point: ingest the inputs, run cast_values over the
    # resources via process(), and pass the results to spew().
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
import logging
import arrow

from arrow.parser import ParserError
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_currencies(row):
    """Parse the date columns of a row with arrow.

    The name is misleading: no currency field is converted. Each of the
    three columns below is passed to ``arrow.get`` with the
    ``DD.MM.YYYY HH:mm`` format and replaced by the result. When that raises
    ``ParserError`` the column is set to ``None``, and a warning is logged
    unless the value equals ``0000-00-00 00:00:00``.

    :param row: mapping that holds the three date columns
    :returns: the same row, modified in place
    """
    date_format = 'DD.MM.YYYY HH:mm'

    for name in ('Datum van laatste bijwerking', 'Einddatum', 'Begindatum'):
        original = row[name]
        try:
            converted = arrow.get(original, date_format)
        except ParserError:
            converted = None
            # Every unparsable value is reported except the all-zero one.
            is_placeholder = original == '0000-00-00 00:00:00'
            if not is_placeholder:
                logging.warning(
                    'Could not parse %s to a date, returning None', original)
        row[name] = converted

    return row


if __name__ == '__main__':
    # Script entry point: read the inputs with ingest(), route the resources
    # through parse_currencies via process(), then hand the datapackage and
    # the processed resources to spew(). `parameters` is not used here.
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
Пример #16
0
    - beneficiary_nuts_region

Datapackage mutation
--------------------

None.

"""

from datapackage_pipelines.wrapper import ingest, spew

from common.utilities import process


def add_geocodes(row, **kw):
    """Set the four beneficiary geo fields of a row from keyword arguments.

    :param row: mapping to update in place
    :param kw: must provide ``country_code``, ``country``, ``nuts_code`` and
        ``region``; other keywords are ignored
    :returns: the same row
    :raises KeyError: when one of the four keywords is absent
    """
    # Keyword argument -> row field, written in this order.
    targets = (
        ('country_code', 'beneficiary_country_code'),
        ('country', 'beneficiary_country'),
        ('nuts_code', 'beneficiary_nuts_code'),
        ('region', 'beneficiary_nuts_region'),
    )
    for keyword, column in targets:
        row[column] = kw[keyword]

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside add_geocodes, which reads
    # country_code, country, nuts_code and region from its keywords.
    parameters_, datapackage, resources = ingest()

    new_resources = process(resources, add_geocodes, **parameters_)
    spew(datapackage, new_resources)
Пример #17
0
def test_process_returns_a_generator_of_generators():
    """Check that process() yields iterators that in turn yield rows."""
    processed = process([['foo']], lambda x: x)
    first_resource = next(processed)
    first_row = next(first_resource)
    assert first_row == 'foo'
Пример #18
0
def test_process_returns_a_generator_of_generators():
    """Check that process() yields iterators that in turn yield rows."""
    outer = process([['foo']], lambda x: x)
    inner = next(outer)
    assert next(inner) == 'foo'
Пример #19
0

def parse_currencies(row, fields=None, characters=None):
    """Convert the currency fields of a row to floats.

    Each listed field that is not ``None`` is first turned into a string.
    A blank string becomes ``None``. Otherwise the currency symbol and the
    grouping character are removed, the decimal character is replaced by
    ``.`` and the stripped text is passed to ``float``. When ``float``
    raises ``ValueError`` a warning is emitted and the field keeps its
    string form.

    :param row: mapping to update in place
    :param fields: names of the fields to convert
    :param characters: dict with ``currency``, ``grouping`` and ``decimal``
        entries
    :returns: the same row
    :raises AssertionError: when ``fields`` or ``characters`` is falsy
    """
    assert fields, 'Missing `fields` parameter'
    assert characters, 'Missing `characters` parameter'

    for name in fields:
        if row[name] is None:
            continue

        text = str(row[name])
        row[name] = text

        if not text.strip():
            row[name] = None
            continue

        try:
            without_symbol = text.replace(characters['currency'], '')
            without_grouping = without_symbol.replace(
                characters['grouping'], '')
            normalised = without_grouping.replace(characters['decimal'], '.')
            row[name] = float(normalised.strip())
        except ValueError as error:
            warning('%s in row\n%s', error, format_to_json(row))

    return row


if __name__ == '__main__':
    # Script entry point: the ingested parameters are expanded into keyword
    # arguments of process() alongside parse_currencies.
    # NOTE(review): presumably process() forwards them to parse_currencies,
    # which asserts that `fields` and `characters` are set - confirm in
    # common.utilities.
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies, **parameters)
    spew(datapackage_, new_resources_)