Example #1
def test_inspector_datapackage_invalid(log, dp_path):
    inspector = Inspector()
    report = inspector.inspect(dp_path)
    assert log(report) == [
        (1, 3, None, 'blank-row'),
        (2, 4, None, 'blank-row'),
    ]
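Most of the test examples on this page rely on pytest fixtures named log and dp_path that are not shown here; dp_path presumably points at one of the data/datapackages/.../datapackage.json descriptors. A minimal sketch of what the log helper might look like, assuming the standard goodtables report layout (an illustration, not the project's actual fixture):

import pytest


@pytest.fixture
def log():
    # Flatten a goodtables report into (table-number, row-number,
    # column-number, code) tuples, matching the assertions in the tests.
    def fixture(report):
        result = []
        for table_number, table in enumerate(report['tables'], start=1):
            for error in table['errors']:
                result.append((
                    table_number,
                    error.get('row-number'),
                    error.get('column-number'),
                    error.get('code'),
                ))
        return result
    return fixture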
Example #2
def test_inspector_tables_invalid(log):
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect([
        {'source': 'data/valid.csv',
         'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]}},
        {'source': 'data/invalid.csv'},
    ], preset='nested')
    assert log(report) == [
        (2, None, 3, 'blank-header'),
        (2, None, 4, 'duplicate-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
Example #3
def test_nested_presets_set_default_preset():
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect([
        {'source': 'data/datapackages/valid/datapackage.json'},
    ], preset='nested')
    assert report['valid']
    assert report['warnings'] == []
Example #4
def test_inspector_datapackage_invalid_table_limit(log):
    inspector = Inspector(table_limit=1)
    report = inspector.inspect('data/datapackages/invalid/datapackage.json',
                               preset='datapackage')
    assert log(report) == [
        (1, 3, None, 'blank-row'),
    ]
Example #5
def test_inspector_catch_all_iter_exceptions(log):
    inspector = Inspector()
    # Reduce the sample size so the exception is raised on iteration, not on open
    report = inspector.inspect([['h'], [1], 'bad'], sample_size=1)
    assert log(report) == [
        (1, None, None, 'source-error'),
    ]
Example #6
def test_inspector_table_invalid_error_limit(log):
    inspector = Inspector(error_limit=2, infer_schema=True)
    report = inspector.inspect('data/invalid.csv')
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
    ]
Example #7
def test_inspector_warnings_table_and_error_limit():
    inspector = Inspector(table_limit=1, error_limit=1)
    source = 'data/datapackages/invalid/datapackage.json'
    report = inspector.inspect(source, preset='datapackage')
    assert len(report['warnings']) == 2
    assert 'table(s) limit' in report['warnings'][0]
    assert 'error(s) limit' in report['warnings'][1]
Example #8
def test_inspector_warnings_row_limit():
    inspector = Inspector(row_limit=1)
    source = 'data/datapackages/invalid/datapackage.json'
    report = inspector.inspect(source, preset='datapackage')
    assert len(report['warnings']) == 2
    assert 'row(s) limit' in report['warnings'][0]
    assert 'row(s) limit' in report['warnings'][1]
Example #9
    def validate(self, data_dict, action):

        records = data_dict.get('records', None)

        if not records:
            return self.error('records')

        if action == 'datastore_create':
            fields = data_dict.get('fields', None)

            if not fields:
                return self.error('fields')

            for field in fields:
                type = field.get('type', None)
                type = schema_mapped_types.get(type, type)

                field_descriptor = {
                    'name': field.get('id', None),
                    'type': type
                }

                self.schema['fields'].append(field_descriptor)
        elif action == 'datastore_upsert':
            resource_id = data_dict.get('resource_id', None)

            if not resource_id:
                return self.error('resource_id')

            try:
                data_dict = {'id': resource_id}
                result = get_action('datastore_info')(None, data_dict)
                schema = result['schema']

                for field in schema:
                    type = schema[field]
                    type = schema_mapped_types.get(type, type)

                    field_descriptor = {'name': field, 'type': type}

                    self.schema['fields'].append(field_descriptor)
            except NotFound:
                return False

        fallback_storage_path = os.path.dirname(os.path.realpath(__file__))
        schema_file = '{0}/{1}'.format(
            config.get('ckan.storage_path', fallback_storage_path),
            'schema.json')

        with open(schema_file, 'w') as fp:
            json.dump(self.schema, fp)

        inspector = Inspector(order_fields=True)
        report = inspector.inspect(records, schema=schema_file)

        os.remove(schema_file)

        return report
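The schema_mapped_types dictionary used above is defined elsewhere in the plugin; it apparently translates CKAN DataStore (PostgreSQL) column types into Table Schema types. A plausible sketch, with assumed mappings:

# Assumed mapping from DataStore/PostgreSQL column types to Table Schema
# types; the real table lives elsewhere in the plugin.
schema_mapped_types = {
    'text': 'string',
    'int4': 'integer',
    'int8': 'integer',
    'numeric': 'number',
    'float8': 'number',
    'bool': 'boolean',
    'timestamp': 'datetime',
    'date': 'date',
}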
Example #10
    def inspect(self):
        """Inspect the data frame and return an error report."""
        inspector = Inspector(custom_checks=self.custom_checks,
                              order_fields=True)
        report = inspector.inspect(self.file_name,
                                   preset='table',
                                   schema=self.schema)
        if not report['valid']:
            raise ValueError(json.dumps(report, indent=4))
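The method above assumes the instance carries custom_checks, file_name and schema attributes. A minimal sketch of the kind of class it could sit on (the class name and constructor wiring are assumptions, not the original project's code):

import json

from goodtables import Inspector


class TableValidator(object):
    # Hypothetical host class for the inspect() method shown above.

    def __init__(self, file_name, schema, custom_checks=None):
        self.file_name = file_name        # path or URL of the table to inspect
        self.schema = schema              # Table Schema descriptor (dict or path)
        self.custom_checks = custom_checks or []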
Example #11
def validation(csv, cust):
    # Arbitrary row and error limits
    inspector = Inspector(row_limit=100000, error_limit=100000)
    report = inspector.inspect(csv)
    email_data = []
    pretty_str = ''
    # An error report is only sent if issues were found
    if not report['valid']:
        for table in report['tables']:
            s = ast.literal_eval(table['datapackage'])
            filename = s['name'] + "_error_dump.txt"
            with open(filename, 'w') as fp:
                for error in table['errors']:
                    row = error['row-number']
                    col = error['column-number']
                    err_str = error['message']
                    code = ""
                    for err in cust:
                        if (col in err['columns']
                                and error['code'] != 'required-constraint'
                                and error['code'] != 'type-or-format-error'):
                            # Trim the message down to the quoted offending value
                            err_str = err_str[:err_str.find('"', err_str.find('"') + 1) + 1]
                            value = err_str[err_str.find('"') + 1:]
                            value = value[:len(value) - 1]
                            err_str = (err_str + " in row " + str(row) +
                                       " and column " + str(col) + err['message'])
                            code = err['name']
                            # Multiple codes are possible, but the custom code takes
                            # precedence over non-constraint and non-type errors.
                            break
                        elif error['code'] == 'required-constraint':
                            value = ''
                            code = error['code']
                        else:
                            new_err_str = err_str[:err_str.find('"', err_str.find('"') + 1) + 1]
                            value = new_err_str[new_err_str.find('"') + 1:]
                            value = value[:len(value) - 1]
                            code = error['code']
                    pretty_str = pretty_str + err_str + "\n"
                    email_data.append({
                        'code': code,
                        'row': row,
                        'col': col,
                        'value': value,
                    })
            notification(owner, email_data, pretty_str, s['name'])
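The cust argument holds custom error definitions whose schema is not shown in this example. Judging by the fields the loop reads (columns, message, name), each entry presumably looks something like this (values are illustrative):

# Illustrative custom error definition consumed by validation() above.
cust = [
    {
        'name': 'invalid-department-code',                     # reported as the error code
        'columns': [3, 4],                                     # column numbers the rule applies to
        'message': ' (value must match the department list)',  # appended to the error message
    },
]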
Example #12
def test_inspector_table_invalid(log):
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect('data/invalid.csv')
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
        (1, 3, None, 'duplicate-row'),
        (1, 4, None, 'blank-row'),
        (1, 5, 5, 'extra-value'),
    ]
Example #13
def validate(validation_conf, job_id):
    """Main validation task.

    Args:
        validation_conf (dict): VERIFIED validation conf

    See `schemas/validation-conf.yml`.

    """

    # Get job
    job = models.job.get(job_id)

    # TODO: job not found
    if job['status'] == 'created':
        params = {
            'id': job_id,
            'status': 'running'
        }
        models.job.update(params)

    # Get report
    inspector = Inspector(**validation_conf.get('settings', {}))
    report = inspector.inspect(validation_conf['source'], preset='nested')

    # Save report
    params = {
        'id': job_id,
        'report': report,
        'finished': datetime.datetime.utcnow(),
        'status': 'success' if report['valid'] else 'failure'
    }

    models.job.update(params)

    job.update(params)

    return job
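A validation_conf accepted by this task would look roughly like the following; the authoritative definition is schemas/validation-conf.yml, so the values here are only illustrative:

validation_conf = {
    'source': [
        # Each entry follows the goodtables 'nested' preset format.
        {'source': 'http://example.com/data.csv',
         'schema': 'http://example.com/schema.json'},
    ],
    'settings': {
        # Keyword arguments forwarded to Inspector(...).
        'error_limit': 10,
    },
}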
Example #14
def test_inspector_table_valid(log):
    inspector = Inspector()
    report = inspector.inspect('data/valid.csv')
    assert log(report) == []
Example #15
def test_inspector_empty_source():
    inspector = Inspector()
    report = inspector.inspect('data/empty.csv')
    assert report['tables'][0]['row-count'] == 0
    assert report['tables'][0]['error-count'] == 0
Example #16
def test_inspector_datapackage_valid(log, dp_path):
    inspector = Inspector()
    report = inspector.inspect(dp_path)
    assert log(report) == []
Example #17
def test_inspector_no_headers():
    inspector = Inspector()
    report = inspector.inspect('data/invalid_no_headers.csv', headers=None)
    assert report['tables'][0]['row-count'] == 3
    assert report['tables'][0]['error-count'] == 1
    assert report['tables'][0]['errors'][0]['code'] == 'extra-value'
Example #18
from pprint import pprint
from goodtables import Inspector, check

@check('unicode-found', type='structure', context='body', after='duplicate-row')
def unicode_found(errors, columns, row_number, state=None):
    for column in columns:
        if len(column) == 4:
            if column['value'] == '中国人':
                message = 'Row {row_number} has unicode in column {column_number}'
                message = message.format(
                    row_number=row_number,
                    column_number=column['column-number'])
                errors.append({
                    'code': 'unicode-found',
                    'message': message,
                    'row-number': row_number,
                    'column-number': column['column-number'],
                })


inspector = Inspector(custom_checks=[unicode_found])
report = inspector.inspect('data/valid.csv')
pprint(report)
Example #19
def test_features(log, name, feature):
    inspector = Inspector(**feature.pop('settings', {}))
    expect = list(map(lambda item: tuple(item), feature.pop('report')))
    actual = log(inspector.inspect(**feature))
    assert actual == expect
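Each feature passed to this test is presumably loaded from a fixture file; based on how it is unpacked, one entry could look like this (illustrative content):

# Illustrative feature entry: 'settings' goes to Inspector(), 'report' is the
# expected log, and the remaining keys are passed to inspector.inspect().
feature = {
    'settings': {'infer_schema': True},
    'source': 'data/invalid.csv',
    'report': [
        [1, None, 3, 'blank-header'],
    ],
}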
Example #20
def test_inspector_warnings_no():
    inspector = Inspector()
    source = 'data/datapackages/invalid/datapackage.json'
    report = inspector.inspect(source, preset='datapackage')
    assert len(report['warnings']) == 0
Example #21
import requests
from pprint import pprint
from tabulator import Stream
from goodtables import Inspector, preset


@preset('ckan')
def ckan_preset(source, **options):
    warnings = []
    tables = []
    url = '%s/api/3/action/package_search' % source
    data = requests.get(url).json()
    for package in data['result']['results']:
        for resource in package['resources']:
            if resource['url'].endswith('.csv'):
                tables.append({
                    'source': resource['url'],
                    'stream': Stream(resource['url'], headers=1),
                    'schema': None,
                    'extra': {
                        'dataset': package['title'],
                        'resource': resource['name'],
                        'publisher': package['organization']['name']
                    },
                })
    return warnings, tables


inspector = Inspector(custom_presets=[ckan_preset])
report = inspector.inspect('http://data.surrey.ca', preset='ckan')
pprint(report)
Example #22
import os
from pprint import pprint
from tabulator import Stream
from jsontableschema import Schema
from goodtables import Inspector, preset


@preset('csvdir')
def csvdir(source):
    warnings = []
    tables = []
    for name in os.listdir(source):
        path = os.path.join(source, name)
        if name.endswith('.csv'):
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': None,
                'extra': {
                    'filename': name,
                },
            })
    return warnings, tables


inspector = Inspector(custom_presets=[csvdir])
report = inspector.inspect('data', preset='csvdir')
pprint(report)
Example #23
def test_inspector_warnings_bad_datapackage_json():
    inspector = Inspector()
    source = 'data/invalid_json.json'
    report = inspector.inspect(source, preset='datapackage')
    assert len(report['warnings']) == 1
    assert 'Unable to parse JSON' in report['warnings'][0]
Example #24
def test_inspector_catch_all_open_exceptions(log):
    inspector = Inspector()
    report = inspector.inspect('data/latin1.csv', encoding='utf-8')
    assert log(report) == [
        (1, None, None, 'source-error'),
    ]
Example #25
from pprint import pprint
from goodtables import Inspector

inspector = Inspector()
report1 = inspector.inspect(
    'data/datapackages/valid/datapackage.json', preset='datapackage')
report2 = inspector.inspect(
    'data/datapackages/invalid/datapackage.json', preset='datapackage')
pprint(report1)
pprint(report2)
Example #26
from pprint import pprint
from goodtables import Inspector

inspector = Inspector()
report1 = inspector.inspect('data/valid.csv')
report2 = inspector.inspect('data/invalid.csv')
pprint(report1)
pprint(report2)
Example #27
def test_inspector_datapackage_valid(log):
    inspector = Inspector()
    report = inspector.inspect('data/datapackages/valid/datapackage.json',
                               preset='datapackage')
    assert log(report) == []
Example #28
from pprint import pprint
from goodtables import Inspector

inspector = Inspector()
report = inspector.inspect([
    {'source': 'data/valid.csv',
     'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]}},
    {'source': 'data/invalid.csv', 'preset': 'table'},
    {'source': 'data/datapackages/valid/datapackage.json', 'preset': 'datapackage'},
    {'source': 'data/datapackages/invalid/datapackage.json', 'preset': 'datapackage'},
], preset='nested')
pprint(report)
Example #29
import dropbox
from pprint import pprint
from tabulator import Stream
from goodtables import Inspector, preset

# 1. Create an app and generate an access token at https://www.dropbox.com/developers/apps
# 2. Add some CSV files to FOLDER on Dropbox
ACCESS_TOKEN = '<insert-access-token>'
FOLDER = '/goodtables'

# Get dropbox client
client = dropbox.dropbox.Dropbox(ACCESS_TOKEN)

@preset('dropbox')
def dropbox_preset(source, **options):
    warnings = []
    tables = []
    for item in client.files_list_folder(source).entries:
        if item.path_lower.endswith('.csv'):
            url = client.files_get_temporary_link(item.path_lower).link
            tables.append({
                'source': url,
                'stream': Stream(url, headers=1, format='csv'),
                'schema': None,
                'extra': {
                    'folder': source,
                },
            })
    return warnings, tables

inspector = Inspector(custom_presets=[dropbox_preset])
report = inspector.inspect(FOLDER, preset='dropbox')
pprint(report)
Example #30
def validate(validation_conf, job_id, files={}):
    """Main validation task.

    Args:
        validation_conf (dict): VERIFIED validation conf

    See `schemas/validation-conf.yml`.

    """

    # Get job
    job = models.job.get(job_id)
    # TODO: job not found
    if job['status'] == 'created':
        params = {
            'id': job_id,
            'status': 'running'
        }
        models.job.update(params)

    # Add uploaded files
    for item in validation_conf['source']:
        if item.get('preset', 'table') in ['table', 'datapackage']:
            item['scheme'] = 'http'
            if item['source'] in files:
                item['scheme'] = 'file'
                item['source'] = files[item['source']]
            if item.get('schema') in files:
                item['schema'] = files[item['schema']]

    # Get report
    if 'settings' not in validation_conf:
        validation_conf['settings'] = {}
    max_tables = settings.MAX_TABLES_PER_SOURCE
    if (not validation_conf['settings'].get('table_limit') or
            validation_conf['settings']['table_limit'] > max_tables):
        validation_conf['settings']['table_limit'] = max_tables
    inspector = Inspector(**validation_conf.get('settings', {}))
    report = inspector.inspect(validation_conf['source'], preset='nested')

    # Hide uploaded files
    for table in report['tables']:
        if table['source'].startswith('/'):
            table['source'] = os.path.basename(table['source'])
    for index, warning in enumerate(report['warnings']):
        report['warnings'][index] = re.sub(r'/tmp/.*?/', '', warning)

    # Save report
    params = {
        'id': job_id,
        'finished': datetime.datetime.utcnow(),
    }
    if report['table-count'] > 0:
        params.update({
            'status': 'success' if report['valid'] else 'failure',
            'report': report,
        })
    else:
        params.update({
            'status': 'error',
            'error': {'message': '\n'.join(report['warnings']) or 'No tables found'},
        })
    models.job.update(params)
    job.update(params)

    return job
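The files argument maps the names referenced in validation_conf['source'] to the local paths of uploaded files. A hedged sketch of an invocation (identifiers and paths are illustrative):

# Illustrative call: 'upload.csv' in the conf is swapped for the uploaded temp file.
job = validate(
    validation_conf={'source': [{'source': 'upload.csv'}]},
    job_id='42',
    files={'upload.csv': '/tmp/goodtables/upload.csv'},
)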