def test_inspector_warnings_row_limit():
    # Each table truncated by the row limit contributes its own warning.
    inspector = Inspector(row_limit=1)
    source = 'data/datapackages/invalid/datapackage.json'
    report = inspector.inspect(source, preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert all('row(s) limit' in warning for warning in warnings)
def test_inspector_datapackage_invalid(log, dp_path):
    """An invalid datapackage reports one blank-row error per table."""
    report = Inspector().inspect(dp_path)
    assert log(report) == [
        (1, 3, None, 'blank-row'),
        (2, 4, None, 'blank-row'),
    ]
def test_nested_presets_set_default_preset():
    # Nested items without an explicit preset fall back to the default one.
    inspector = Inspector(infer_schema=True)
    sources = [{'source': 'data/datapackages/valid/datapackage.json'}]
    report = inspector.inspect(sources, preset='nested')
    assert report['valid']
    assert report['warnings'] == []
def test_inspector_catch_all_iter_exceptions(log):
    # Reducing sample size to get raise on iter, not on open
    report = Inspector().inspect([['h'], [1], 'bad'], sample_size=1)
    assert log(report) == [(1, None, None, 'source-error')]
def test_inspector_table_invalid_error_limit(log):
    # Only the first two errors survive an error limit of 2.
    inspector = Inspector(error_limit=2, infer_schema=True)
    report = inspector.inspect('data/invalid.csv')
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
    ]
def test_inspector_warnings_table_and_error_limit():
    # Hitting both limits yields one warning apiece: tables first, then errors.
    inspector = Inspector(table_limit=1, error_limit=1)
    source = 'data/datapackages/invalid/datapackage.json'
    report = inspector.inspect(source, preset='datapackage')
    warnings = report['warnings']
    assert len(warnings) == 2
    assert 'table(s) limit' in warnings[0]
    assert 'error(s) limit' in warnings[1]
def test_inspector_tables_invalid(log):
    """The valid table is clean; every error belongs to the second table."""
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect(
        [
            {
                'source': 'data/valid.csv',
                'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]},
            },
            {'source': 'data/invalid.csv'},
        ],
        preset='nested')
    assert log(report) == [
        (2, None, 3, 'blank-header'),
        (2, None, 4, 'duplicate-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
def test_inspector_datapackage_invalid_table_limit(log):
    # With table_limit=1 only the first table's errors are reported.
    inspector = Inspector(table_limit=1)
    report = inspector.inspect(
        'data/datapackages/invalid/datapackage.json', preset='datapackage')
    assert log(report) == [(1, 3, None, 'blank-row')]
def validate(self, data_dict, action):
    """Build a table schema for *action*, then inspect the records.

    Returns the goodtables report dict on success, ``self.error(...)``
    when required inputs are missing, or ``False`` when the datastore
    resource cannot be found.
    """
    records = data_dict.get('records', None)
    if not records:
        return self.error('records')
    if action == 'datastore_create':
        fields = data_dict.get('fields', None)
        if not fields:
            return self.error('fields')
        for field in fields:
            # Map datastore types to schema types; use ``field_type``
            # rather than shadowing the builtin ``type``.
            field_type = field.get('type', None)
            field_type = schema_mapped_types.get(field_type, field_type)
            self.schema['fields'].append({
                'name': field.get('id', None),
                'type': field_type,
            })
    elif action == 'datastore_upsert':
        resource_id = data_dict.get('resource_id', None)
        if not resource_id:
            return self.error('resource_id')
        try:
            # Use a fresh dict instead of clobbering the ``data_dict``
            # parameter.
            result = get_action('datastore_info')(None, {'id': resource_id})
            schema = result['schema']
            for field in schema:
                field_type = schema_mapped_types.get(schema[field],
                                                     schema[field])
                self.schema['fields'].append({
                    'name': field,
                    'type': field_type,
                })
        except NotFound:
            return False
    # Persist the schema to CKAN's storage path (or next to this module
    # as a fallback) so the inspector can read it from disk.
    fallback_storage_path = os.path.dirname(os.path.realpath(__file__))
    schema_file = '{0}/{1}'.format(
        config.get('ckan.storage_path', fallback_storage_path),
        'schema.json')
    with open(schema_file, 'w') as fp:
        json.dump(self.schema, fp)
    try:
        inspector = Inspector(order_fields=True)
        report = inspector.inspect(records, schema=schema_file)
    finally:
        # Always remove the temporary schema file, even when inspection
        # raises (the original leaked the file on errors).
        os.remove(schema_file)
    return report
def inspect(self):
    """Inspect the data frame and return an error report."""
    inspector = Inspector(
        custom_checks=self.custom_checks, order_fields=True)
    report = inspector.inspect(
        self.file_name, preset='table', schema=self.schema)
    if report['valid']:
        return
    # Surface the full report in the exception for easy debugging.
    raise ValueError(json.dumps(report, indent=4))
def validation(csv, cust):
    """Inspect *csv*, decorate errors with custom messages from *cust*,
    and send an email notification when the data is invalid.
    """
    # BUG FIX: the original built ``Inspector()`` and then called
    # ``inspector.__init__(...)`` a second time; pass the kwargs to the
    # constructor directly instead.  Arbitrary high row limit.
    inspector = Inspector(row_limit=100000, error_limit=100000)
    report = inspector.inspect(csv)
    email_data = []
    pretty_str = ''
    # An error report is only sent if there are issues to be found.
    if not report['valid']:
        for table in report['tables']:
            s = ast.literal_eval(table['datapackage'])
            filename = s['name'] + "_error_dump.txt"
            with open(filename, 'w') as fp:
                for error in table['errors']:
                    row = error['row-number']
                    col = error['column-number']
                    err_str = error['message']
                    code = ""
                    for err in cust:
                        if (col in err['columns']
                                and error['code'] != 'required-constraint'
                                and error['code'] != 'type-or-format-error'):
                            # Trim the message up to the closing quote of
                            # the offending value, then append the custom
                            # description.
                            err_str = err_str[:err_str.find(
                                "\"", err_str.find("\"") + 1) + 1]
                            value = err_str[err_str.find("\"") + 1:]
                            value = value[:len(value) - 1]
                            err_str = (err_str + " in row " + str(row)
                                       + " and column " + str(col)
                                       + err['message'])
                            code = err['name']
                            # Multiple codes are possible, but the custom
                            # code should be given advantage over
                            # non-constraints or type errors.
                            break
                        elif error['code'] == 'required-constraint':
                            value = ''
                            code = error['code']
                        else:
                            new_err_str = err_str[:err_str.find(
                                "\"", err_str.find("\"") + 1) + 1]
                            value = new_err_str[new_err_str.find("\"") + 1:]
                            value = value[:len(value) - 1]
                            code = error['code']
                    pretty_str = pretty_str + err_str + "\n"
                    email_data.append({
                        'code': code,
                        'row': row,
                        'col': col,
                        'value': value
                    })
            # NOTE(review): ``owner`` is a module-level name not visible
            # here — confirm it is defined elsewhere in this module.
            notification(owner, email_data, pretty_str, s['name'])
def test_nested_presets_set_default_preset():
    """Nested entries without a preset are handled via the default preset."""
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect(
        [{'source': 'data/datapackages/valid/datapackage.json'}],
        preset='nested')
    assert report['valid']
    assert report['warnings'] == []
def test_inspector_table_invalid(log):
    # All seven structural/content errors of the fixture must be caught.
    inspector = Inspector(infer_schema=True)
    report = inspector.inspect('data/invalid.csv')
    expected = [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
        (1, 3, None, 'duplicate-row'),
        (1, 4, None, 'blank-row'),
        (1, 5, 5, 'extra-value'),
    ]
    assert log(report) == expected
def __init__(self, inspector=None):
    """Store the inspector to use.

    *inspector* may be a dotted path ('package.module.Name') which is
    resolved by import; when omitted, a plain Inspector is used.
    """
    if inspector is None:
        resolved = Inspector()
    else:
        module_path, attribute = inspector.rsplit('.', 1)
        resolved = getattr(import_module(module_path), attribute)
    self.__inspector = resolved
def test_inspector_tables_invalid(log):
    # Table 1 matches its schema; all reported errors come from table 2.
    inspector = Inspector(infer_schema=True)
    tables = [
        {
            'source': 'data/valid.csv',
            'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]},
        },
        {'source': 'data/invalid.csv'},
    ]
    report = inspector.inspect(tables, preset='nested')
    assert log(report) == [
        (2, None, 3, 'blank-header'),
        (2, None, 4, 'duplicate-header'),
        (2, 2, 3, 'missing-value'),
        (2, 2, 4, 'missing-value'),
        (2, 3, None, 'duplicate-row'),
        (2, 4, None, 'blank-row'),
        (2, 5, 5, 'extra-value'),
    ]
def validate_data(datapackage):
    """Inspect every tabular (CSV) resource of *datapackage*.

    Returns an aggregate goodtables-style report dict.  The flat
    ``errors`` list is capped at 1000 entries while ``error-count``
    still reflects the full total.
    """
    # Start timer
    start = datetime.datetime.now()
    tables = []
    for resource in datapackage.resources:
        is_tabular = (
            resource.descriptor.get('format', None) == 'csv'
            or resource.descriptor.get('mediatype', None) == 'text/csv'
            or resource.local_data_path.endswith('csv'))
        if is_tabular:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {}
            })
    inspector = Inspector()
    reports = []
    errors = []
    for table in tables:
        # NOTE(review): relies on a name-mangled private method of
        # Inspector; this breaks if goodtables changes its internals.
        report = inspector._Inspector__inspect_table(table)
        errors.extend(report['errors'])
        reports.append(report)
    # Stop timer
    stop = datetime.datetime.now()
    errors = errors[:1000]
    report = {
        'time': round((stop - start).total_seconds(), 3),
        # all() of an empty sequence is True, so the original
        # ``True if len(reports) == 0 else ...`` special case was
        # redundant.
        'valid': all(report['valid'] for report in reports),
        'table-count': len(tables),
        'error-count': sum(len(report['errors']) for report in reports),
        'errors': errors,
        'tables': reports,
    }
    return report
def validate(validation_conf, job_id):
    """Main validation task.

    Args:
        validation_conf (dict): VERIFIED validation conf
            See `schemas/validation-conf.yml`.

    """
    # Get job
    job = models.job.get(job_id)
    # TODO: job not found
    if job['status'] == 'created':
        models.job.update({'id': job_id, 'status': 'running'})
    # Get report
    settings = validation_conf.get('settings', {})
    inspector = Inspector(**settings)
    report = inspector.inspect(validation_conf['source'], preset='nested')
    # Save report
    params = {
        'id': job_id,
        'report': report,
        'finished': datetime.datetime.utcnow(),
        'status': 'success' if report['valid'] else 'failure'
    }
    models.job.update(params)
    job.update(params)
    return job
def test_inspector_catch_all_open_exceptions(log):
    # Decoding latin1 data as utf-8 fails on open and is caught as
    # a source error rather than crashing.
    report = Inspector().inspect('data/latin1.csv', encoding='utf-8')
    assert log(report) == [(1, None, None, 'source-error')]
def test_inspector_datapackage_valid(log, dp_path):
    """A valid datapackage produces an empty error log."""
    assert log(Inspector().inspect(dp_path)) == []
def test_inspector_no_headers():
    # Without headers, the single extra value is the only error.
    report = Inspector().inspect('data/invalid_no_headers.csv', headers=None)
    table = report['tables'][0]
    assert table['row-count'] == 3
    assert table['error-count'] == 1
    assert table['errors'][0]['code'] == 'extra-value'
from pprint import pprint

from goodtables import Inspector

# Validate a mix of table and datapackage presets in one nested run.
inspector = Inspector()
report = inspector.inspect(
    [
        {'source': 'data/valid.csv',
         'schema': {'fields': [{'name': 'id'}, {'name': 'name'}]}},
        {'source': 'data/invalid.csv', 'preset': 'table'},
        {'source': 'data/datapackages/valid/datapackage.json',
         'preset': 'datapackage'},
        {'source': 'data/datapackages/invalid/datapackage.json',
         'preset': 'datapackage'},
    ],
    preset='nested')
pprint(report)
from pprint import pprint

from goodtables import Inspector

# Inspect one valid and one invalid table and print both reports.
inspector = Inspector()
valid_report = inspector.inspect('data/valid.csv')
invalid_report = inspector.inspect('data/invalid.csv')
pprint(valid_report)
pprint(invalid_report)
def test_inspector_datapackage_valid(log):
    # A valid datapackage yields no errors at all.
    report = Inspector().inspect(
        'data/datapackages/valid/datapackage.json', preset='datapackage')
    assert log(report) == []
import os
from pprint import pprint

from tabulator import Stream
from jsontableschema import Schema

from goodtables import Inspector, preset


@preset('csvdir')
def csvdir(source):
    """Preset treating every CSV file in a directory as one table."""
    warnings = []
    tables = []
    for name in os.listdir(source):
        if not name.endswith('.csv'):
            continue
        path = os.path.join(source, name)
        tables.append({
            'source': path,
            'stream': Stream(path, headers=1),
            'schema': None,
            'extra': {'filename': name},
        })
    return warnings, tables


inspector = Inspector(custom_presets=[csvdir])
report = inspector.inspect('data', preset='csvdir')
pprint(report)
# BUG FIX: ``requests`` is used below but was never imported.
import requests
from pprint import pprint

from tabulator import Stream

from goodtables import Inspector, preset


@preset('ckan')
def ckan_preset(source, **options):
    """Preset turning every CSV resource of a CKAN portal into a table."""
    warnings = []
    tables = []
    url = '%s/api/3/action/package_search' % source
    data = requests.get(url).json()
    for package in data['result']['results']:
        for resource in package['resources']:
            if resource['url'].endswith('.csv'):
                tables.append({
                    'source': resource['url'],
                    'stream': Stream(resource['url'], headers=1),
                    'schema': None,
                    'extra': {
                        'dataset': package['title'],
                        'resource': resource['name'],
                        'publisher': package['organization']['name']
                    },
                })
    return warnings, tables


inspector = Inspector(custom_presets=[ckan_preset])
report = inspector.inspect('http://data.surrey.ca', preset='ckan')
pprint(report)
def test_inspector_warnings_bad_datapackage_json():
    # Malformed JSON is surfaced as a warning rather than a crash.
    report = Inspector().inspect('data/invalid_json.json',
                                 preset='datapackage')
    assert len(report['warnings']) == 1
    assert 'Unable to parse JSON' in report['warnings'][0]
import requests
from pprint import pprint

from tabulator import Stream

from goodtables import Inspector, preset


@preset('ckan')
def ckan_preset(source, **options):
    """Collect every CSV resource from a CKAN portal as a table entry."""
    warnings = []
    tables = []
    search_url = '%s/api/3/action/package_search' % source
    payload = requests.get(search_url).json()
    for package in payload['result']['results']:
        for resource in package['resources']:
            csv_url = resource['url']
            if not csv_url.endswith('.csv'):
                continue
            tables.append({
                'source': csv_url,
                'stream': Stream(csv_url, headers=1),
                'schema': None,
                'extra': {
                    'dataset': package['title'],
                    'resource': resource['name'],
                    'publisher': package['organization']['name']
                },
            })
    return warnings, tables


inspector = Inspector(custom_presets=[ckan_preset])
report = inspector.inspect('http://data.surrey.ca', preset='ckan')
pprint(report)
def test_inspector_empty_source():
    # An empty CSV is valid: zero rows, zero errors.
    report = Inspector().inspect('data/empty.csv')
    table = report['tables'][0]
    assert table['row-count'] == 0
    assert table['error-count'] == 0
def test_inspector_warnings_no():
    """No limits are configured, so the report carries no warnings."""
    source = 'data/datapackages/invalid/datapackage.json'
    report = Inspector().inspect(source, preset='datapackage')
    assert len(report['warnings']) == 0
from pprint import pprint

from goodtables import Inspector, check


@check('unicode-found', type='structure', context='body',
       after='duplicate-row')
def unicode_found(errors, columns, row_number, state=None):
    """Custom check flagging cells whose value is the string '中国人'."""
    for column in columns:
        # Only columns with 4 keys carry a 'value' entry here —
        # presumably shorter dicts are value-less; verify against
        # the goodtables column layout.
        if len(column) != 4:
            continue
        if column['value'] != '中国人':
            continue
        message = 'Row {row_number} has unicode in column {column_number}'
        message = message.format(
            row_number=row_number,
            column_number=column['column-number'])
        errors.append({
            'code': 'unicode-found',
            'message': message,
            'row-number': row_number,
            'column-number': column['column-number'],
        })


inspector = Inspector(custom_checks=[unicode_found])
report = inspector.inspect('data/valid.csv')
pprint(report)
from pprint import pprint

from goodtables import Inspector

# Inspect a valid and an invalid datapackage and print both reports.
inspector = Inspector()
valid_report = inspector.inspect(
    'data/datapackages/valid/datapackage.json', preset='datapackage')
invalid_report = inspector.inspect(
    'data/datapackages/invalid/datapackage.json', preset='datapackage')
pprint(valid_report)
pprint(invalid_report)
# BUG FIX: ``dropbox``, ``Stream`` and ``pprint`` are all used below
# but none of them were imported.
import dropbox
from pprint import pprint

from tabulator import Stream

from goodtables import Inspector, preset

# 1. Create app to generate access token at https://www.dropbox.com/developers/apps
# 2. And add some csv files to FOLDER on dropbox
ACCESS_TOKEN = '<insert-access-token>'
FOLDER = '/goodtables'

# Get dropbox client
client = dropbox.dropbox.Dropbox(ACCESS_TOKEN)


@preset('dropbox')
def dropbox_preset(source, **options):
    """Preset exposing every CSV in a Dropbox folder as a table."""
    warnings = []
    tables = []
    for item in client.files_list_folder(source).entries:
        if item.path_lower.endswith('.csv'):
            # Temporary links let tabulator stream the file over HTTP.
            url = client.files_get_temporary_link(item.path_lower).link
            tables.append({
                'source': url,
                'stream': Stream(url, headers=1, format='csv'),
                'schema': None,
                'extra': {
                    'folder': source,
                },
            })
    return warnings, tables


inspector = Inspector(custom_presets=[dropbox_preset])
report = inspector.inspect(FOLDER, preset='dropbox')
pprint(report)
def test_inspector_table_valid(log):
    """A valid table produces an empty error log."""
    assert log(Inspector().inspect('data/valid.csv')) == []
def test_features(log, name, feature):
    # Data-driven test: 'settings' configures the inspector, 'report'
    # holds the expected error tuples, remaining keys are inspect() kwargs.
    inspector = Inspector(**feature.pop('settings', {}))
    expected = [tuple(item) for item in feature.pop('report')]
    actual = log(inspector.inspect(**feature))
    assert actual == expected
from pprint import pprint

from goodtables import Inspector, check


@check('unicode-found', type='structure', context='body',
       after='duplicate-row')
def unicode_found(errors, columns, row_number, state=None):
    """Append an error for every cell whose value equals '中国人'."""
    template = 'Row {row_number} has unicode in column {column_number}'
    for column in columns:
        # Only complete column dicts (4 keys) carry a 'value' entry —
        # presumably shorter dicts are value-less; verify against the
        # goodtables column layout.
        if len(column) == 4 and column['value'] == '中国人':
            errors.append({
                'code': 'unicode-found',
                'message': template.format(
                    row_number=row_number,
                    column_number=column['column-number']),
                'row-number': row_number,
                'column-number': column['column-number'],
            })


inspector = Inspector(custom_checks=[unicode_found])
report = inspector.inspect('data/valid.csv')
pprint(report)