def test_different_indicator_datatypes(self):
    """One rule, but three different indicators with different tagspecs and datatypes"""
    schema = [
        ['#valid_tag', '#valid_datatype+consistent'],
        ['#indicator', 'true'],
    ]
    header = ['#indicator+xxx', '#indicator+yyy', '#indicator+zzz']
    # the +yyy column is text in the first row; consistent datatype requires
    # the remaining rows to be blank (good) rather than numeric (bad)
    good_data = [list(header)] + [
        ['100', 'aaa', '100'],
        ['200', '', '200'],
        ['300', '', '300'],
        ['400', '', '400'],
        ['500', '', '500'],
    ]
    bad_data = [list(header)] + [
        ['100', 'aaa', '100'],
        ['200', '2', '200'],
        ['300', '3', '300'],
        ['400', '4', '400'],
        ['500', '5', '500'],
    ]

    def check(data, expect_valid, expect_total):
        # run validation and verify both the validity flag and the error count
        report = hxl.validate(data, schema)
        (self.assertTrue if expect_valid else self.assertFalse)(report['is_valid'])
        self.assertEqual(expect_total, report['stats']['total'])

    check(good_data, True, 0)
    check(bad_data, False, 1)
def test_taxonomy_bad(self):
    """A value outside the external taxonomy yields one hard error and no external issues."""
    result = hxl.validate(hxl.data(DATA_TAXONOMY_BAD), hxl.schema(SCHEMA_TAXONOMY))
    self.assertFalse(result['is_valid'])
    # exactly one hard error; nothing deferred to the external-issue list
    for key, expected in (('error', 1), ('external', 0)):
        self.assertEqual(expected, result['stats'][key])
    self.assertEqual(1, len(result['issues']))
    self.assertEqual(0, len(result['external_issues']))
def test_spellings_multiple(self):
    """Outlier spellings in two parallel indicator columns should both be flagged."""
    schema = [
        ['#valid_tag', '#valid_value+spelling'],
        ['#indicator', 'true'],
    ]
    # 50 rows of the dominant spelling in each column, then one row where
    # the spellings are swapped (one outlier per column)
    rows = [['#indicator+xxx', '#indicator+yyy']]
    rows.extend(['aaaaa', 'aaaab'] for _ in range(50))
    rows.append(['aaaab', 'aaaaa'])
    report = hxl.validate(rows, schema)
    self.assertFalse(report['is_valid'])
    self.assertEqual(2, report['stats']['total'])
def test_spellings_multiple(self):
    """Outlier spellings in two parallel indicator columns should both be flagged."""
    schema = [['#valid_tag', '#valid_value+spelling'], ['#indicator', 'true']]
    # dominant spellings repeated 50 times, then a single swapped (outlier) row
    common = [['aaaaa', 'aaaab'] for _ in range(50)]
    data = [['#indicator+xxx', '#indicator+yyy'], *common, ['aaaab', 'aaaaa']]
    report = hxl.validate(data, schema)
    self.assertFalse(report['is_valid'])
    self.assertEqual(2, report['stats']['total'])
def test_taxonomy_missing(self):
    """Handle a missing external taxonomy."""
    schema = hxl.schema(SCHEMA_TAXONOMY_MISSING)
    result = hxl.validate(hxl.data(DATA_TAXONOMY_GOOD), schema)
    # still valid overall: a missing taxonomy is reported as an external
    # issue rather than a hard validation error
    self.assertTrue(result['is_valid'])
    self.assertIn('external_issues', result)
    for key, count in (('error', 0), ('external', 1)):
        self.assertEqual(count, result['stats'][key])
    self.assertEqual(0, len(result['issues']))
    self.assertEqual(1, len(result['external_issues']))
def test_different_indicator_datatypes(self):
    """One rule, but three different indicators with different tagspecs and datatypes"""
    schema = [['#valid_tag', '#valid_datatype+consistent'], ['#indicator', 'true']]
    tags = ['#indicator+xxx', '#indicator+yyy', '#indicator+zzz']
    # column +yyy starts as text; the good dataset leaves it blank afterwards,
    # the bad dataset follows up with numbers (an inconsistent datatype)
    good_data = [list(tags)]
    bad_data = [list(tags)]
    for n in range(1, 6):
        value = str(n * 100)
        middle = 'aaa' if n == 1 else ''
        good_data.append([value, middle, value])
        bad_data.append([value, middle or str(n), value])
    report = hxl.validate(good_data, schema)
    self.assertTrue(report['is_valid'])
    self.assertEqual(0, report['stats']['total'])
    report = hxl.validate(bad_data, schema)
    self.assertFalse(report['is_valid'])
    self.assertEqual(1, report['stats']['total'])
def test_double_correlation(self):
    """Test correlation when more than one column has same tagspec"""
    schema = [
        ['#valid_tag', '#description', '#valid_correlation', '#valid_value+list'],
        ['#adm1+code', 'xxxxx', '#adm1+name', 'X001|X002'],
    ]
    dataset = [
        ['#adm1+name', '#adm1+code', '#adm1+code'],
        ['Coast', 'X001', 'X001'],
    ]
    # four rows where the second #adm1+code column holds a bad value
    dataset += [['Plains', 'X002', 'X02'] for _ in range(4)]
    report = hxl.validate(dataset, schema)
    self.assertEqual(4, report['stats']['total'])
def test_double_correlation(self):
    """Test correlation when more than one column has same tagspec"""
    schema = [
        ['#valid_tag', '#description', '#valid_correlation', '#valid_value+list'],
        ['#adm1+code', 'xxxxx', '#adm1+name', 'X001|X002'],
    ]
    # 'X02' in the duplicated #adm1+code column is the bad value, repeated 4x
    bad_row = ['Plains', 'X002', 'X02']
    dataset = [
        ['#adm1+name', '#adm1+code', '#adm1+code'],
        ['Coast', 'X001', 'X001'],
        list(bad_row),
        list(bad_row),
        list(bad_row),
        list(bad_row),
    ]
    self.assertEqual(4, hxl.validate(dataset, schema)['stats']['total'])
def test_taxonomy_all(self):
    """Good data validated against the full taxonomy schema passes with no errors."""
    result = hxl.validate(hxl.data(DATA_TAXONOMY_GOOD), hxl.schema(SCHEMA_TAXONOMY_ALL))
    self.assertTrue(result['is_valid'])
    self.assertEqual(0, result['stats']['error'])
import hxl

# Validate a local HXL export against the default schema;
# allow_local is required to read the file from the local filesystem.
DATA_FILE = 'data/unhcr_popstats_export_persons_of_concern_all_data.hxl'

input_source = hxl.io.make_input(DATA_FILE, allow_local=True)
hxl.validate(input_source)
def test_top_level(self):
    """Use the package-level alias"""
    # only exercising the hxl.validate alias; the report's contents
    # are checked by the other tests
    result = hxl.validate(self.DATA)
def run_validation(url, content, content_hash, sheet_index, selector,
                   schema_url, schema_content, schema_content_hash,
                   schema_sheet_index, include_dataset):
    """ Do the actual validation run, using the arguments provided.
    Separated from the controller so that we can cache the result more easily.
    The *_hash arguments exist only to assist with caching.
    @returns: a validation report, suitable for returning as JSON.
    """
    # test for opening error conditions: exactly one of url/content, and at
    # most one of schema_url/schema_content, may be supplied
    # NOTE(review): requests.exceptions has no BadRequest attribute, so these
    # raises would fail with AttributeError at runtime — presumably
    # werkzeug.exceptions.BadRequest was intended; confirm against the web
    # framework this project uses.
    if (url is not None and content is not None):
        raise requests.exceptions.BadRequest("Both 'url' and 'content' specified")
    if (url is None and content is None):
        raise requests.exceptions.BadRequest("Require one of 'url' or 'content'")
    if (schema_url is not None and schema_content is not None):
        raise requests.exceptions.BadRequest("Both 'schema_url' and 'schema_content' specified")

    # set up the main data: inline content takes precedence over a URL
    if content:
        source = hxl.data(hxl.io.make_input(content, sheet_index=sheet_index, selector=selector))
    else:
        source = hxl.data(url, sheet_index=sheet_index, http_headers={'User-Agent': 'hxl-proxy/validation'})

    # cache if we're including the dataset in the results (we have to run over it twice)
    if include_dataset:
        source = source.cache()

    # set up the schema (if present); inline schema content takes precedence
    if schema_content:
        schema_source = hxl.data(hxl.io.make_input(schema_content, sheet_index=schema_sheet_index, selector=selector))
    elif schema_url:
        schema_source = hxl.data(schema_url, sheet_index=schema_sheet_index, http_headers={'User-Agent': 'hxl-proxy/validation'})
    else:
        # no schema supplied — hxl.validate falls back to its default
        schema_source = None

    # Validate the dataset
    report = hxl.validate(source, schema_source)

    # add the URLs (and sheet indices) to the report if supplied
    if url:
        report['data_url'] = url
    if sheet_index is not None:
        report['data_sheet_index'] = sheet_index
    if schema_url:
        report['schema_url'] = schema_url
    if schema_sheet_index is not None:
        report['schema_sheet_index'] = schema_sheet_index

    # include the original dataset if requested: one row of headers, one row
    # of HXL hashtags, then the data rows, with None values blanked out
    if include_dataset:
        content = []
        content.append([hxl_proxy.util.no_none(column.header) for column in source.columns])
        content.append([hxl_proxy.util.no_none(column.display_tag) for column in source.columns])
        for row in source:
            content.append([hxl_proxy.util.no_none(value) for value in row.values])
        report['dataset'] = content
    return report
import hxl

# Run validation over a local HXL export (allow_local permits filesystem reads).
DATA_FILE = 'data/unhcr_popstats_export_persons_of_concern_all_data.hxl'
hxl.validate(hxl.io.make_input(DATA_FILE, allow_local=True))