def test_cast_row_wrong_type_multiple_errors_handled():
    """Two uncastable cells surface as one CastError via the exc handler."""
    schema = Schema(DESCRIPTOR_MAX)
    row = ['string', 'notdecimal', '10.6', 'string', 'string']
    collected = []

    def handler(exc, row_number, row_data, error_data):
        collected.append((exc, row_number, row_data, error_data))

    result = schema.cast_row(row, exc_handler=handler)
    # Failed cells compare equal to their raw values but are FailedCast wrappers.
    assert result == ['string', 'notdecimal', '10.6', 'string', 'string']
    assert isinstance(result[1], FailedCast)
    assert isinstance(result[2], FailedCast)
    assert len(collected) == 1

    expect_row_data = OrderedDict([
        ('id', 'string'),
        ('height', 'notdecimal'),
        ('age', '10.6'),
        ('name', 'string'),
        ('occupation', 'string'),
    ])
    expect_error_data = OrderedDict([
        ('height', 'notdecimal'),
        ('age', '10.6'),
    ])
    _check_error(
        collected[0],
        expect_exc_class=exceptions.CastError,
        expect_exc_str='There are 2 cast errors',
        expect_row_number=None,
        expect_row_data=expect_row_data,
        expect_error_data=expect_error_data)
    # The aggregated exception carries one sub-error per failed cell.
    assert len(collected[0][0].errors) == 2
def test_cast_row_handled():
    """A fully valid row casts normally and never invokes the handler."""
    schema = Schema(DESCRIPTOR_MAX)
    collected = []

    def handler(exc, row_number, row_data, error_data):
        collected.append((exc, row_number, row_data, error_data))

    result = schema.cast_row(
        ['string', '10.0', '1', 'string', 'string'], exc_handler=handler)
    assert result == ['string', Decimal(10.0), 1, 'string', 'string']
    assert len(collected) == 0
def test_cast_row_null_values_handled():
    """Recognized null literals cast to None without triggering the handler."""
    schema = Schema(DESCRIPTOR_MAX)
    collected = []

    def handler(exc, row_number, row_data, error_data):
        collected.append((exc, row_number, row_data, error_data))

    result = schema.cast_row(
        ['string', '', '-', 'string', 'null'], exc_handler=handler)
    assert result == ['string', None, None, 'string', None]
    assert len(collected) == 0
class ResourceIterator(object):
    """Iterator over newline-delimited JSON rows read from *infile*.

    Each call to ``__next__`` reads one line, parses it as a JSON object,
    and (when *validate* is true) casts the row against the resource's
    table schema, raising ``ValueError`` on failure.

    :param infile: file-like object supplying one JSON document per line
    :param spec: processed resource spec (stored as-is)
    :param orig_spec: original spec; must contain ``schema`` with ``fields``
    :param validate: when true, cast every row through the table schema
    :param debug: when true, log each read at error level for tracing
    """

    def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
        self.spec = spec
        self.table_schema = Schema(orig_spec['schema'])
        self.field_names = [f['name'] for f in orig_spec['schema']['fields']]
        self.validate = validate
        self.infile = infile
        self.debug = debug
        # Once an empty line is seen the stream is considered closed for good.
        self.stopped = False

    def __iter__(self):
        return self

    def __next__(self):
        if self.stopped:
            raise StopIteration()
        if self.debug:
            logging.error('WAITING')
        line = self.infile.readline().strip()
        if self.debug:
            logging.error('INGESTING: %r', line)
        if line == '':
            # Blank line (or EOF) terminates the stream permanently.
            self.stopped = True
            raise StopIteration()
        # BUG FIX: was `json.loadl(line)` — the json module has no `loadl`
        # attribute, so this raised AttributeError on every row. Each line is
        # a single JSON document, so `loads` is the correct call.
        line = json.loads(line)
        if self.validate:
            # Order the values per the schema's field list; missing keys
            # become None so the schema decides whether that is acceptable.
            to_validate = [line.get(f) for f in self.field_names]
            try:
                self.table_schema.cast_row(to_validate)
            except CastError as e:
                logging.error('Failed to validate row: %s', e)
                for i, err in enumerate(e.errors):
                    logging.error('%d) %s', i + 1, err.message)
                raise ValueError('Casting failed for row %r' % line) from e
            except TypeError as e:
                raise ValueError('Validation failed for row %r' % line) from e
        return line

    def next(self):
        # Python 2 compatibility alias for __next__.
        return self.__next__()
def test_cast_row_too_short_handled():
    """A short row is padded with None and reported once to the handler."""
    schema = Schema(DESCRIPTOR_MAX)
    collected = []

    def handler(exc, row_number, row_data, error_data):
        collected.append((exc, row_number, row_data, error_data))

    result = schema.cast_row(
        ['string', '10.0', '1', 'string'], exc_handler=handler)
    # Missing trailing values are substituted by None.
    assert result == ['string', Decimal(10.0), 1, 'string', None]
    assert len(collected) == 1

    expect_row_data = OrderedDict([
        ('id', 'string'),
        ('height', '10.0'),
        ('age', '1'),
        ('name', 'string'),
        ('occupation', None),
    ])
    _check_error(
        collected[0],
        expect_exc_class=exceptions.CastError,
        expect_exc_str='Row length',
        expect_row_number=None,
        expect_row_data=expect_row_data,
        expect_error_data=expect_row_data)
def test_cast_row_too_long_handled():
    """Extra trailing values are dropped and reported once to the handler."""
    schema = Schema(DESCRIPTOR_MAX)
    collected = []

    def handler(exc, row_number, row_data, error_data):
        collected.append((exc, row_number, row_data, error_data))

    result = schema.cast_row(
        ['string', '10.0', '1', 'string', 'string', 'string'],
        exc_handler=handler)
    # Superfluous values are left out of the cast result.
    assert result == ['string', Decimal(10.0), 1, 'string', 'string']
    assert len(collected) == 1

    # Superfluous values are keyed by column number for error reporting.
    expect_row_data = OrderedDict([
        ('id', 'string'),
        ('height', '10.0'),
        ('age', '1'),
        ('name', 'string'),
        ('occupation', 'string'),
        ('tableschema-cast-error-extra-col-6', 'string'),
    ])
    _check_error(
        collected[0],
        expect_exc_class=exceptions.CastError,
        expect_exc_str='Row length',
        expect_row_number=None,
        expect_row_data=expect_row_data,
        expect_error_data=expect_row_data)
def _iter_rows(self):
    """Yield each SPARQL result row as an OrderedDict keyed by field name.

    For ASK queries (a ``boolean`` payload) a single ``{'boolean': ...}``
    dict is yielded instead. Yields nothing when the schema is empty.
    """
    if self._schema is None:
        # Empty results — nothing to yield.
        return
    schema_obj = Schema(self._schema)
    if 'results' in self.raw_data:
        field_names = [field.name for field in schema_obj.fields]
        result_vars = self.raw_data['head']['vars']
        for binding in self.raw_data['results']['bindings']:
            ordered_terms = table_schema.order_terms_in_binding(
                result_vars, binding)
            raw_values = [
                term['value'] if term is not None else None
                for term in ordered_terms
            ]
            cast_values = schema_obj.cast_row(raw_values)
            # When the column is a string value, the jsontableschema
            # library incorrectly maps several literal string values
            # ('null', 'none', '-', etc.) to Python None - a deeper fix
            # might be to reconsider using that library, or to fix the
            # issue upstream (since it's probably not a good idea to
            # render a number of strings un-representable). Here we zip
            # over the result set and, wherever a non-null value in a
            # string field was mapped to None, restore the original
            # value before yielding.
            repaired = [
                raw if (not cast) and raw and field.type == 'string' else cast
                for field, raw, cast in zip(
                    schema_obj.fields, raw_values, cast_values)
            ]
            yield OrderedDict(zip(field_names, repaired))
    elif 'boolean' in self.raw_data:
        # Results of an ASK query.
        yield {'boolean': self.raw_data['boolean']}
def test_cast_row():
    """Happy path: every cell casts to its field's declared type."""
    schema = Schema(DESCRIPTOR_MAX)
    given = ['string', '10.0', '1', 'string', 'string']
    wanted = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(given) == wanted
def test_cast_row_wrong_type_multiple_errors():
    """Without a handler, both bad cells raise one aggregated CastError."""
    schema = Schema(DESCRIPTOR_MAX)
    with pytest.raises(exceptions.CastError) as excinfo:
        schema.cast_row(['string', 'notdecimal', '10.6', 'string', 'string'])
    assert len(excinfo.value.errors) == 2
def test_cast_row_wrong_type():
    """An uncastable cell raises CastError when no handler is supplied."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        schema.cast_row(bad_row)
def test_cast_row_too_long():
    """A row with extra values raises CastError when no handler is supplied."""
    schema = Schema(DESCRIPTOR_MAX)
    long_row = ['string', '10.0', '1', 'string', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        schema.cast_row(long_row)
def test_cast_row_null_values():
    """Recognized null literals ('', '-', 'null') cast to None."""
    schema = Schema(DESCRIPTOR_MAX)
    given = ['string', '', '-', 'string', 'null']
    wanted = ['string', None, None, 'string', None]
    assert schema.cast_row(given) == wanted
def test_cast_row_wrong_type_no_fail_fast_true():
    """With no_fail_fast=True, cast failures raise MultipleInvalid."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid):
        schema.cast_row(bad_row, no_fail_fast=True)