示例#1
0
def test_inspector_warnings_row_limit():
    """With row_limit=1 the invalid datapackage yields two row-limit warnings."""
    report = Inspector(row_limit=1).inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert 'row(s) limit' in warnings[0]
    assert 'row(s) limit' in warnings[1]
def test_inspector_datapackage_invalid(log, dp_path):
    """The logged report for ``dp_path`` holds exactly two 'blank-row' entries."""
    expected = [
        (1, 3, None, 'blank-row'),
        (2, 4, None, 'blank-row'),
    ]
    report = Inspector().inspect(dp_path)
    assert log(report) == expected
def test_nested_presets_set_default_preset():
    """A nested entry that names no preset is reported valid with no warnings."""
    sources = [{'source': 'data/datapackages/valid/datapackage.json'}]
    report = Inspector(infer_schema=True).inspect(sources, preset='nested')
    assert report['valid']
    assert report['warnings'] == []
def test_inspector_catch_all_iter_exceptions(log):
    """A malformed inline source is logged as a single 'source-error'."""
    rows = [['h'], [1], 'bad']
    # A sample size of 1 makes the failure surface on iteration, not on open.
    report = Inspector().inspect(rows, sample_size=1)
    assert log(report) == [(1, None, None, 'source-error')]
def test_inspector_warnings_row_limit():
    """With row_limit=1 the invalid datapackage yields two row-limit warnings."""
    report = Inspector(row_limit=1).inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert 'row(s) limit' in warnings[0]
    assert 'row(s) limit' in warnings[1]
def test_inspector_table_invalid_error_limit(log):
    """With error_limit=2 only the two header errors of invalid.csv are logged."""
    expected = [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
    ]
    report = Inspector(error_limit=2, infer_schema=True).inspect('data/invalid.csv')
    assert log(report) == expected
示例#7
0
def test_inspector_table_invalid_error_limit(log):
    """With error_limit=2 only the two header errors of invalid.csv are logged."""
    expected = [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
    ]
    report = Inspector(error_limit=2, infer_schema=True).inspect('data/invalid.csv')
    assert log(report) == expected
示例#8
0
def test_inspector_warnings_table_and_error_limit():
    """table_limit=1 and error_limit=1 produce one warning each, in that order."""
    report = Inspector(table_limit=1, error_limit=1).inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert 'table(s) limit' in warnings[0]
    assert 'error(s) limit' in warnings[1]
示例#9
0
def test_inspector_datapackage_invalid(log, dp_path):
    """The logged report for ``dp_path`` holds exactly two 'blank-row' entries."""
    expected = [
        (1, 3, None, 'blank-row'),
        (2, 4, None, 'blank-row'),
    ]
    report = Inspector().inspect(dp_path)
    assert log(report) == expected
def test_inspector_warnings_table_and_error_limit():
    """table_limit=1 and error_limit=1 produce one warning each, in that order."""
    report = Inspector(table_limit=1, error_limit=1).inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert 'table(s) limit' in warnings[0]
    assert 'error(s) limit' in warnings[1]
示例#11
0
def test_inspector_tables_invalid(log):
    """In a nested run, all seven logged errors belong to the second source."""
    valid_table = {
        'source': 'data/valid.csv',
        'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]},
    }
    invalid_table = {'source': 'data/invalid.csv'}
    report = Inspector(infer_schema=True).inspect(
        [valid_table, invalid_table], preset='nested')
    expected = [
        (2, None, 3, 'blank-header'),
        (2, None, 4, 'duplicate-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
    assert log(report) == expected
示例#12
0
def test_inspector_datapackage_invalid_table_limit(log):
    """With table_limit=1 a single 'blank-row' entry is logged."""
    source = 'data/datapackages/invalid/datapackage.json'
    report = Inspector(table_limit=1).inspect(source, preset='datapackage')
    assert log(report) == [(1, 3, None, 'blank-row')]
示例#13
0
def test_inspector_catch_all_iter_exceptions(log):
    """A malformed inline source is logged as a single 'source-error'."""
    rows = [['h'], [1], 'bad']
    # A sample size of 1 makes the failure surface on iteration, not on open.
    report = Inspector().inspect(rows, sample_size=1)
    assert log(report) == [(1, None, None, 'source-error')]
示例#14
0
    def validate(self, data_dict, action):
        """Validate the ``records`` of a datastore request against a schema.

        Field descriptors are appended to ``self.schema['fields']`` either
        from the request's ``fields`` (``datastore_create``) or from the
        ``datastore_info`` of the target resource (``datastore_upsert``).
        The schema is then dumped to a ``schema.json`` file and handed to a
        goodtables ``Inspector`` together with the records.

        Args:
            data_dict: request payload. ``records`` is always required,
                ``fields`` for ``datastore_create`` and ``resource_id`` for
                ``datastore_upsert``.
            action: name of the datastore action being validated.

        Returns:
            The inspection report. When a required key is missing or empty,
            the result of ``self.error(<key>)``. ``False`` when the resource
            lookup raises ``NotFound``.
        """
        records = data_dict.get('records', None)
        if not records:
            return self.error('records')

        # NOTE(review): descriptors are appended to self.schema on every call;
        # presumably the schema is reset per request elsewhere -- confirm.
        if action == 'datastore_create':
            fields = data_dict.get('fields', None)
            if not fields:
                return self.error('fields')

            for field in fields:
                field_type = field.get('type', None)
                self.schema['fields'].append({
                    'name': field.get('id', None),
                    'type': schema_mapped_types.get(field_type, field_type),
                })
        elif action == 'datastore_upsert':
            resource_id = data_dict.get('resource_id', None)
            if not resource_id:
                return self.error('resource_id')

            try:
                info = get_action('datastore_info')(None, {'id': resource_id})
                resource_schema = info['schema']

                for field_name in resource_schema:
                    field_type = resource_schema[field_name]
                    self.schema['fields'].append({
                        'name': field_name,
                        'type': schema_mapped_types.get(field_type, field_type),
                    })
            except NotFound:
                return False

        fallback_storage_path = os.path.dirname(os.path.realpath(__file__))
        # NOTE(review): the file name is fixed, so concurrent validations that
        # share a storage path would overwrite each other's schema -- confirm.
        schema_file = '{0}/{1}'.format(
            config.get('ckan.storage_path', fallback_storage_path),
            'schema.json')

        with open(schema_file, 'w') as fp:
            json.dump(self.schema, fp)

        try:
            inspector = Inspector(order_fields=True)
            report = inspector.inspect(records, schema=schema_file)
        finally:
            # Remove the temporary schema even when the inspection raises.
            os.remove(schema_file)

        return report
示例#15
0
 def inspect(self):
     """Inspect ``self.file_name`` and raise if the report is not valid.

     Raises:
         ValueError: carrying the JSON-formatted report when ``report['valid']``
             is falsy.
     """
     report = Inspector(
         custom_checks=self.custom_checks,
         order_fields=True,
     ).inspect(self.file_name, preset='table', schema=self.schema)
     if report['valid']:
         return
     raise ValueError(json.dumps(report, indent=4))
示例#16
0
def _quoted_prefix(message):
    """Return ``message`` up to and including its second double quote."""
    closing = message.find("\"", message.find("\"") + 1)
    return message[:closing + 1]


def _quoted_value(prefix):
    """Return the text after the first double quote of ``prefix``, minus its last character."""
    value = prefix[prefix.find("\"") + 1:]
    return value[:len(value) - 1]


def validation(csv, cust):
    """Inspect ``csv`` and send a notification per table when it is invalid.

    Each reported error becomes an entry with ``code``, ``row``, ``col`` and
    ``value`` keys. By default the code is the error's own code and the value
    is the quoted text taken from the error message (empty for
    'required-constraint'). The first entry of ``cust`` whose ``columns``
    contains the error's column overrides the code and extends the message,
    unless the error is a 'required-constraint' or 'type-or-format-error'.

    Args:
        csv: source passed to ``Inspector.inspect``.
        cust: iterable of dicts with ``columns``, ``message`` and ``name``
            keys describing custom error codes. May be empty.
    """
    # Pass the limits to the constructor instead of re-running __init__.
    inspector = Inspector(row_limit=100000,
                          error_limit=100000)  # arbitrary row limit
    report = inspector.inspect(csv)
    email_data = []
    pretty_str = ''
    # An error report is only sent if there are issues to be found.
    if report['valid']:
        return

    for table in report['tables']:
        s = ast.literal_eval(table['datapackage'])
        filename = s['name'] + "_error_dump.txt"
        # NOTE(review): the dump file is created (or truncated) but nothing is
        # written to it -- confirm whether the errors were meant to go here.
        with open(filename, 'w'):
            for error in table['errors']:
                row = error['row-number']
                col = error['column-number']
                err_str = error['message']

                # Defaults taken from the error itself, so that `value` and
                # `code` are always bound, even when `cust` is empty.
                code = error['code']
                if code == 'required-constraint':
                    value = ''
                else:
                    value = _quoted_value(_quoted_prefix(err_str))

                for err in cust:
                    # The custom code wins over any error that is not a
                    # constraint or type error; only the first match is used.
                    if col in err['columns'] and error[
                            'code'] != 'required-constraint' and error[
                                'code'] != 'type-or-format-error':
                        err_str = _quoted_prefix(err_str) + " in row " + str(
                            row) + " and column " + str(col) + err['message']
                        code = err['name']
                        break

                pretty_str = pretty_str + err_str + "\n"
                email_data.append({
                    'code': code,
                    'row': row,
                    'col': col,
                    'value': value
                })
        # NOTE(review): `owner` is not defined in this function; presumably a
        # module-level name -- confirm. `email_data` and `pretty_str`
        # accumulate across tables, as before.
        notification(owner, email_data, pretty_str, s['name'])
示例#17
0
def test_nested_presets_set_default_preset():
    """A nested entry that names no preset is reported valid with no warnings."""
    sources = [{'source': 'data/datapackages/valid/datapackage.json'}]
    report = Inspector(infer_schema=True).inspect(sources, preset='nested')
    assert report['valid']
    assert report['warnings'] == []
def test_inspector_table_invalid(log):
    """Inspecting invalid.csv with an inferred schema logs seven errors."""
    expected = [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
        (1, 3, None, 'duplicate-row'),
        (1, 4, None, 'blank-row'),
        (1, 5, 5, 'extra-value'),
    ]
    report = Inspector(infer_schema=True).inspect('data/invalid.csv')
    assert log(report) == expected
示例#19
0
def test_inspector_table_invalid(log):
    """Inspecting invalid.csv with an inferred schema logs seven errors."""
    expected = [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
        (1, 3, None, 'duplicate-row'),
        (1, 4, None, 'blank-row'),
        (1, 5, 5, 'extra-value'),
    ]
    report = Inspector(infer_schema=True).inspect('data/invalid.csv')
    assert log(report) == expected
示例#20
0
 def __init__(self, inspector=None):
     """Store the inspector to use.

     Args:
         inspector: optional dotted path ``'package.module.attr'``; the
             named attribute is imported and stored as is. When omitted, a
             default ``Inspector()`` instance is stored instead.
     """
     if inspector is None:
         self.__inspector = Inspector()
         return
     module_path, attr_name = inspector.rsplit('.', 1)
     self.__inspector = getattr(import_module(module_path), attr_name)
def test_inspector_tables_invalid(log):
    """In a nested run, all seven logged errors belong to the second source."""
    valid_table = {
        'source': 'data/valid.csv',
        'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]},
    }
    invalid_table = {'source': 'data/invalid.csv'}
    report = Inspector(infer_schema=True).inspect(
        [valid_table, invalid_table], preset='nested')
    expected = [
        (2, None, 3, 'blank-header'),
        (2, None, 4, 'duplicate-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
    assert log(report) == expected
示例#22
0
def validate_data(datapackage):
    """Inspect every CSV resource of ``datapackage`` and build a combined report.

    A resource counts as tabular when its descriptor ``format`` is 'csv', its
    ``mediatype`` is 'text/csv', or its local data path ends with 'csv'.

    Args:
        datapackage: object exposing ``resources``; each resource provides
            ``descriptor``, ``local_data_path`` and ``remote_data_path``.

    Returns:
        dict with the keys ``time`` (seconds, rounded to 3 decimals),
        ``valid``, ``table-count``, ``error-count`` (over all tables),
        ``errors`` (at most the first 1000) and ``tables`` (per-table reports).
    """
    # Start timer
    start = datetime.datetime.now()

    tables = []
    for resource in datapackage.resources:
        descriptor = resource.descriptor
        # The local path may be missing (the remote path is preferred below),
        # so guard against None before calling endswith.
        is_tabular = (
            descriptor.get('format', None) == 'csv'
            or descriptor.get('mediatype', None) == 'text/csv'
            or (resource.local_data_path or '').endswith('csv')
        )
        if not is_tabular:
            continue

        path = resource.remote_data_path or resource.local_data_path
        tables.append({
            'source': path,
            'stream': Stream(path, headers=1),
            'schema': Schema(descriptor['schema']),
            'extra': {}
        })
    inspector = Inspector()

    reports = []
    errors = []
    for table in tables:
        # Calls the inspector's name-mangled private per-table routine directly.
        table_report = inspector._Inspector__inspect_table(table)
        errors.extend(table_report['errors'])
        reports.append(table_report)

    # Stop timer
    stop = datetime.datetime.now()
    return {
        'time': round((stop - start).total_seconds(), 3),
        # all() over an empty list is True, matching the no-table case.
        'valid': all(item['valid'] for item in reports),
        'table-count': len(tables),
        'error-count': sum(len(item['errors']) for item in reports),
        'errors': errors[:1000],
        'tables': reports,
    }
示例#23
0
def validate(validation_conf, job_id):
    """Run the validation job ``job_id`` and persist its report.

    Args:
        validation_conf (dict): VERIFIED validation conf; its optional
            ``settings`` are passed to the inspector and its ``source`` is
            inspected with the 'nested' preset.
        job_id: identifier of the job to load and update.

    See `schemas/validation-conf.yml`.

    Returns:
        The job mapping, updated with the report, the finish time and the
        final status ('success' or 'failure').
    """
    job = models.job.get(job_id)

    # TODO: job not found
    if job['status'] == 'created':
        models.job.update({'id': job_id, 'status': 'running'})

    # Build the report
    settings = validation_conf.get('settings', {})
    report = Inspector(**settings).inspect(
        validation_conf['source'], preset='nested')

    # Persist the outcome and mirror it on the returned job
    outcome = {
        'id': job_id,
        'report': report,
        'finished': datetime.datetime.utcnow(),
        'status': 'success' if report['valid'] else 'failure'
    }
    models.job.update(outcome)
    job.update(outcome)

    return job
def test_inspector_catch_all_open_exceptions(log):
    """Inspecting latin1.csv with encoding='utf-8' logs one 'source-error'."""
    report = Inspector().inspect('data/latin1.csv', encoding='utf-8')
    assert log(report) == [(1, None, None, 'source-error')]
def test_inspector_datapackage_valid(log, dp_path):
    """Inspecting ``dp_path`` logs no errors."""
    logged = log(Inspector().inspect(dp_path))
    assert logged == []
示例#26
0
def test_inspector_datapackage_valid(log, dp_path):
    """Inspecting ``dp_path`` logs no errors."""
    logged = log(Inspector().inspect(dp_path))
    assert logged == []
示例#27
0
def test_inspector_no_headers():
    """With headers=None the first table has 3 rows and one 'extra-value' error."""
    report = Inspector().inspect('data/invalid_no_headers.csv', headers=None)
    table = report['tables'][0]
    assert table['row-count'] == 3
    assert table['error-count'] == 1
    assert table['errors'][0]['code'] == 'extra-value'
示例#28
0
from pprint import pprint
from goodtables import Inspector

# Inspect four sources in a single call through the 'nested' preset (two CSV
# files and two datapackages), then pretty-print the combined report.
inspector = Inspector()
report = inspector.inspect([
    {'source': 'data/valid.csv', 'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]}},
    {'source': 'data/invalid.csv', 'preset': 'table'},
    {'source': 'data/datapackages/valid/datapackage.json', 'preset': 'datapackage'},
    {'source': 'data/datapackages/invalid/datapackage.json', 'preset': 'datapackage'},
], preset='nested')
pprint(report)
示例#29
0
from pprint import pprint
from goodtables import Inspector

# Inspect 'data/valid.csv' and 'data/invalid.csv' with the same inspector and
# pretty-print both reports.
inspector = Inspector()
report1 = inspector.inspect('data/valid.csv')
report2 = inspector.inspect('data/invalid.csv')
pprint(report1)
pprint(report2)
示例#30
0
def test_inspector_datapackage_valid(log):
    """Inspecting the valid datapackage with the 'datapackage' preset logs no errors."""
    source = 'data/datapackages/valid/datapackage.json'
    logged = log(Inspector().inspect(source, preset='datapackage'))
    assert logged == []
示例#31
0
import os
from pprint import pprint
from tabulator import Stream
from jsontableschema import Schema
from goodtables import Inspector, preset


@preset('csvdir')
def csvdir(source):
    """Build one table descriptor per ``*.csv`` entry listed in ``source``.

    Args:
        source: directory path passed to ``os.listdir``.

    Returns:
        A ``(warnings, tables)`` pair. ``warnings`` is always empty and each
        table carries its path, a ``Stream`` with ``headers=1``, no schema and
        the file name under ``extra``.
    """
    tables = []
    for filename in os.listdir(source):
        if not filename.endswith('.csv'):
            continue
        csv_path = os.path.join(source, filename)
        tables.append({
            'source': csv_path,
            'stream': Stream(csv_path, headers=1),
            'schema': None,
            'extra': {'filename': filename},
        })
    return [], tables


# Register the custom preset, inspect the 'data' directory through it and
# pretty-print the report.
inspector = Inspector(custom_presets=[csvdir])
report = inspector.inspect('data', preset='csvdir')
pprint(report)
示例#32
0
from pprint import pprint
from tabulator import Stream
from goodtables import Inspector, preset


@preset('ckan')
def ckan_preset(source, **options):
    """Build one table descriptor per CSV resource returned by a package search.

    Args:
        source: base URL; ``/api/3/action/package_search`` is appended to it.
        **options: accepted but not used.

    Returns:
        A ``(warnings, tables)`` pair. ``warnings`` is always empty and each
        table carries the resource URL, a ``Stream`` with ``headers=1``, no
        schema, and the dataset title, resource name and publisher name under
        ``extra``.
    """
    # Imported locally: the import block of this example does not bring
    # `requests` into scope, so the call below would raise NameError.
    import requests

    warnings = []
    tables = []
    url = '%s/api/3/action/package_search' % source
    data = requests.get(url).json()
    for package in data['result']['results']:
        for resource in package['resources']:
            if not resource['url'].endswith('.csv'):
                continue
            tables.append({
                'source': resource['url'],
                'stream': Stream(resource['url'], headers=1),
                'schema': None,
                'extra': {
                    'dataset': package['title'],
                    'resource': resource['name'],
                    'publisher': package['organization']['name']
                },
            })
    return warnings, tables


# Register the custom preset, inspect 'http://data.surrey.ca' through it and
# pretty-print the report.
inspector = Inspector(custom_presets=[ckan_preset])
report = inspector.inspect('http://data.surrey.ca', preset='ckan')
pprint(report)
def test_inspector_warnings_bad_datapackage_json():
    """An unparsable datapackage JSON yields exactly one parse warning."""
    report = Inspector().inspect('data/invalid_json.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 1
    assert 'Unable to parse JSON' in warnings[0]
示例#34
0
import requests
from pprint import pprint
from tabulator import Stream
from goodtables import Inspector, preset

@preset('ckan')
def ckan_preset(source, **options):
    """Build one table descriptor per CSV resource returned by a package search.

    Args:
        source: base URL; ``/api/3/action/package_search`` is appended to it.
        **options: accepted but not used.

    Returns:
        A ``(warnings, tables)`` pair. ``warnings`` is always empty and each
        table carries the resource URL, a ``Stream`` with ``headers=1``, no
        schema, and the dataset title, resource name and publisher name under
        ``extra``.
    """
    search_url = '%s/api/3/action/package_search' % source
    payload = requests.get(search_url).json()
    tables = []
    for package in payload['result']['results']:
        for resource in package['resources']:
            if not resource['url'].endswith('.csv'):
                continue
            tables.append({
                'source': resource['url'],
                'stream': Stream(resource['url'], headers=1),
                'schema': None,
                'extra': {
                    'dataset': package['title'],
                    'resource': resource['name'],
                    'publisher': package['organization']['name']
                },
            })
    return [], tables

# Register the custom preset, inspect 'http://data.surrey.ca' through it and
# pretty-print the report.
inspector = Inspector(custom_presets=[ckan_preset])
report = inspector.inspect('http://data.surrey.ca', preset='ckan')
pprint(report)
def test_inspector_empty_source():
    """An empty CSV is reported as one table with zero rows and zero errors."""
    table = Inspector().inspect('data/empty.csv')['tables'][0]
    assert table['row-count'] == 0
    assert table['error-count'] == 0
示例#36
0
def test_inspector_warnings_no():
    """Without any limits the invalid datapackage produces no warnings."""
    report = Inspector().inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    assert len(report['warnings']) == 0
def test_inspector_warnings_no():
    """Without any limits the invalid datapackage produces no warnings."""
    report = Inspector().inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    assert len(report['warnings']) == 0
示例#38
0
def test_inspector_warnings_bad_datapackage_json():
    """An unparsable datapackage JSON yields exactly one parse warning."""
    report = Inspector().inspect('data/invalid_json.json', preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 1
    assert 'Unable to parse JSON' in warnings[0]
示例#39
0
from pprint import pprint
from goodtables import Inspector, check


@check('unicode-found', type='structure', context='body', after='duplicate-row')
def unicode_found(errors, columns, row_number, state=None):
    """Append a 'unicode-found' error for each column whose value is '中国人'.

    Only column mappings with exactly four keys are examined.

    Args:
        errors: list that receives the error dicts (mutated in place).
        columns: iterable of column mappings for the current row.
        row_number: number of the row being checked.
        state: accepted but not used.
    """
    template = 'Row {row_number} has unicode in column {column_number}'
    for column in columns:
        if len(column) != 4 or column['value'] != '中国人':
            continue
        column_number = column['column-number']
        errors.append({
            'code': 'unicode-found',
            'message': template.format(row_number=row_number,
                                       column_number=column_number),
            'row-number': row_number,
            'column-number': column_number,
        })


# Register the custom check, inspect 'data/valid.csv' with it and pretty-print
# the report.
inspector = Inspector(custom_checks=[unicode_found])
report = inspector.inspect('data/valid.csv')
pprint(report)
示例#40
0
from pprint import pprint
from goodtables import Inspector

# Inspect two datapackage descriptors with the 'datapackage' preset and
# pretty-print both reports.
inspector = Inspector()
report1 = inspector.inspect(
    'data/datapackages/valid/datapackage.json', preset='datapackage')
report2 = inspector.inspect(
    'data/datapackages/invalid/datapackage.json', preset='datapackage')
pprint(report1)
pprint(report2)
示例#41
0
from goodtables import Inspector, preset

# 1. Create an app to generate an access token at https://www.dropbox.com/developers/apps
# 2. Add some CSV files to FOLDER on Dropbox
ACCESS_TOKEN = '<insert-access-token>'
FOLDER = '/goodtables'

# Get dropbox client
# NOTE(review): `dropbox` is not imported in the visible part of this example
# -- confirm it is imported elsewhere.
client = dropbox.dropbox.Dropbox(ACCESS_TOKEN)

@preset('dropbox')
def dropbox_preset(source, **options):
    """Build one table descriptor per ``.csv`` entry of a Dropbox folder.

    Uses the module-level ``client`` to list ``source`` and to obtain a
    temporary link for every entry whose lower-cased path ends with '.csv'.

    Args:
        source: folder path passed to ``client.files_list_folder``.
        **options: accepted but not used.

    Returns:
        A ``(warnings, tables)`` pair. ``warnings`` is always empty and each
        table carries the temporary link, a CSV ``Stream`` with ``headers=1``,
        no schema and the folder under ``extra``.
    """
    tables = []
    for entry in client.files_list_folder(source).entries:
        if not entry.path_lower.endswith('.csv'):
            continue
        link = client.files_get_temporary_link(entry.path_lower).link
        tables.append({
            'source': link,
            'stream': Stream(link, headers=1, format='csv'),
            'schema': None,
            'extra': {'folder': source},
        })
    return [], tables

# Register the custom preset, inspect FOLDER through it and pretty-print the
# report.
# NOTE(review): `pprint` is not imported in the visible part of this example
# -- confirm it is imported elsewhere.
inspector = Inspector(custom_presets=[dropbox_preset])
report = inspector.inspect(FOLDER, preset='dropbox')
pprint(report)
示例#42
0
def test_inspector_empty_source():
    """An empty CSV is reported as one table with zero rows and zero errors."""
    table = Inspector().inspect('data/empty.csv')['tables'][0]
    assert table['row-count'] == 0
    assert table['error-count'] == 0
def test_inspector_table_valid(log):
    """Inspecting valid.csv logs no errors."""
    logged = log(Inspector().inspect('data/valid.csv'))
    assert logged == []
示例#44
0
def test_features(log, name, feature):
    """Run one feature: inspect with its settings and compare the logged report.

    ``settings`` and ``report`` are popped from ``feature``; whatever remains
    is passed to ``inspect`` as keyword arguments.
    """
    settings = feature.pop('settings', {})
    inspector = Inspector(**settings)
    expect = [tuple(item) for item in feature.pop('report')]
    actual = log(inspector.inspect(**feature))
    assert actual == expect
示例#45
0
def test_inspector_table_valid(log):
    """Inspecting valid.csv logs no errors."""
    logged = log(Inspector().inspect('data/valid.csv'))
    assert logged == []
示例#46
0
from pprint import pprint
from goodtables import Inspector, check

@check('unicode-found', type='structure', context='body', after='duplicate-row')
def unicode_found(errors, columns, row_number, state=None):
    """Append a 'unicode-found' error for each column whose value is '中国人'.

    Only column mappings with exactly four keys are examined.

    Args:
        errors: list that receives the error dicts (mutated in place).
        columns: iterable of column mappings for the current row.
        row_number: number of the row being checked.
        state: accepted but not used.
    """
    template = 'Row {row_number} has unicode in column {column_number}'
    for column in columns:
        if len(column) != 4 or column['value'] != '中国人':
            continue
        column_number = column['column-number']
        errors.append({
            'code': 'unicode-found',
            'message': template.format(row_number=row_number,
                                       column_number=column_number),
            'row-number': row_number,
            'column-number': column_number,
        })


# Register the custom check, inspect 'data/valid.csv' with it and pretty-print
# the report.
inspector = Inspector(custom_checks=[unicode_found])
report = inspector.inspect('data/valid.csv')
pprint(report)
示例#47
0
def test_inspector_catch_all_open_exceptions(log):
    """Inspecting latin1.csv with encoding='utf-8' logs one 'source-error'."""
    report = Inspector().inspect('data/latin1.csv', encoding='utf-8')
    assert log(report) == [(1, None, None, 'source-error')]
import os
from pprint import pprint
from tabulator import Stream
from jsontableschema import Schema
from goodtables import Inspector, preset

@preset('csvdir')
def csvdir(source):
    """Build one table descriptor per ``*.csv`` entry listed in ``source``.

    Args:
        source: directory path passed to ``os.listdir``.

    Returns:
        A ``(warnings, tables)`` pair. ``warnings`` is always empty and each
        table carries its path, a ``Stream`` with ``headers=1``, no schema and
        the file name under ``extra``.
    """
    tables = []
    for filename in os.listdir(source):
        if not filename.endswith('.csv'):
            continue
        csv_path = os.path.join(source, filename)
        tables.append({
            'source': csv_path,
            'stream': Stream(csv_path, headers=1),
            'schema': None,
            'extra': {'filename': filename},
        })
    return [], tables


# Register the custom preset, inspect the 'data' directory through it and
# pretty-print the report.
inspector = Inspector(custom_presets=[csvdir])
report = inspector.inspect('data', preset='csvdir')
pprint(report)
def test_inspector_no_headers():
    """With headers=None the first table has 3 rows and one 'extra-value' error."""
    report = Inspector().inspect('data/invalid_no_headers.csv', headers=None)
    table = report['tables'][0]
    assert table['row-count'] == 3
    assert table['error-count'] == 1
    assert table['errors'][0]['code'] == 'extra-value'