def test_read_invalid_extra_cols_handled():
    # Test a schema-invalid extra column in one row
    source = [
        ['key', 'value'],
        ['one', 1, 'unexpected'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = table.read(exc_handler=handler)
    expect = [
        ['one', 1],
        ['two', 2],
    ]
    assert actual == expect
    assert len(errors) == 1
    expect_row_data = OrderedDict([
        ('key', 'one'), ('value', 1),
        ('tableschema-cast-error-extra-col-3', 'unexpected')])
    _check_error(errors[0], expect_exc_class=exceptions.CastError,
                 expect_exc_str='Row length', expect_row_number=2,
                 expect_row_data=expect_row_data,
                 expect_error_data=expect_row_data)
def __get_table(self):
    if not self.__table:

        # Non tabular -> None
        if not self.tabular:
            return None

        # Get source/schema
        source = self.source
        if self.multipart:
            source = _MultipartSource(self)
        schema = self.__current_descriptor.get('schema')

        # Storage resource
        if self.__storage is not None:
            self.__table = Table(source, schema=schema, storage=self.__storage)

        # General resource
        else:
            options = self.__table_options
            descriptor = self.__current_descriptor
            # TODO: this option is experimental
            options['scheme'] = descriptor.get('scheme')
            options['format'] = descriptor.get('format', 'csv')
            if descriptor.get('data'):
                options['format'] = 'inline'
            if descriptor.get('encoding'):
                options['encoding'] = descriptor['encoding']
            if descriptor.get('compression'):
                options['compression'] = descriptor['compression']
            # TODO: these options are experimental
            options['pick_fields'] = descriptor.get(
                'pickFields', options.get('pick_fields', None))
            options['skip_fields'] = descriptor.get(
                'skipFields', options.get('skip_fields', None))
            options['pick_rows'] = descriptor.get(
                'pickRows', options.get('pick_rows', []))
            options['skip_rows'] = descriptor.get(
                'skipRows', options.get('skip_rows', []))
            # TODO: these options are deprecated
            options['pick_fields'] = descriptor.get(
                'pickColumns', options.get('pick_columns', None))
            options['skip_fields'] = descriptor.get(
                'skipColumns', options.get('skip_columns', None))
            dialect = descriptor.get('dialect')
            if dialect:
                if not dialect.get('header', config.DEFAULT_DIALECT['header']):
                    fields = descriptor.get('schema', {}).get('fields', [])
                    options['headers'] = [field['name'] for field in fields] or None
                for key in _DIALECT_KEYS:
                    if key in dialect:
                        options[key.lower()] = dialect[key]
            self.__table = Table(source, schema=schema, **options)

    return self.__table
def test_iter_invalid_col_value_handled():
    # Test a schema-invalid column value in one row, handled
    source = [
        ['key', 'value'],
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(exc_handler=handler))
    expect = [
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    assert actual == expect
    assert isinstance(actual[0][1], FailedCast)
    assert len(errors) == 1
    expect_row_data = OrderedDict([('key', 'one'), ('value', 'not_an_int')])
    expect_error_data = OrderedDict([('value', 'not_an_int')])
    _check_error(errors[0], expect_exc_class=exceptions.CastError,
                 expect_exc_str='There are 1 cast errors',
                 expect_row_number=2, expect_row_data=expect_row_data,
                 expect_error_data=expect_error_data)
def getSchema(self):
    '''
    Get the data schema inferred from records. The number of records to scan
    is defined by SCHEMA_INFER_LIMIT and the confidence threshold is defined
    by SCHEMA_INFER_CONFIDENCE.
    '''
    try:
        tmpCursor = self.getCursor()
        tmpCursor.execute(self.query_sql)
        i = settings.SCHEMA_INFER_LIMIT
        tmpList = []
        row = tmpCursor.fetchone()
        while i > 0 and row is not None:
            tmpList.append(row)
            row = tmpCursor.fetchone()
            i -= 1
        tmpList = [self.getColumns()] + tmpList
        t = Table(tmpList)
        t.infer()
        t.schema.descriptor['missingValues'] = settings.SCHEMA_DATABASE_MISSING_VALUES
        return t.infer(confidence=settings.SCHEMA_INFER_CONFIDENCE)
    except Exception as e:
        tmpCursor.close()
        raise e
def test_iter_with_headers_field_names_mismatch_handled():
    source = [
        ['id', 'bad', 'name'],
        [1, 39, 'Paul'],
        [2, 42, 'Peter'],
    ]
    table = Table(source, schema=SCHEMA_CSV)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = []
    actual = list(table.iter(exc_handler=handler))
    assert actual == expect
    assert len(errors) == 2
    for i, error in enumerate(errors):
        expect_keyed_row_data = OrderedDict(zip(source[0], source[i + 1]))
        exc, row_number, row_data, error_data = error
        assert isinstance(exc, exceptions.CastError)
        assert row_number == i + 2  # actual row number including header line
        assert row_data == expect_keyed_row_data
        assert error_data == expect_keyed_row_data
        assert 'match schema field names' in str(exc)
def test_iter_missing_cols_handled():
    source = [
        ['key', 'value'],
        ['one'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(exc_handler=handler))
    expect = [
        ['one', None],
        ['two', 2],
    ]
    assert actual == expect
    expect_row_data = OrderedDict([('key', 'one'), ('value', None)])
    _check_error(errors[0], expect_exc_class=exceptions.CastError,
                 expect_exc_str='Row length', expect_row_number=2,
                 expect_row_data=expect_row_data,
                 expect_error_data=expect_row_data)
def test_iter_unique_primary_key_violation_handled():
    # Test exception handler option to switch off fail-fast data validation
    # behaviour
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    source = [
        ['id', 'age', 'name'],
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    table = Table(source, schema=schema)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = [
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    actual = list(table.iter(exc_handler=handler))
    assert actual == expect
    assert len(errors) == 1
    exc, row_number, row_data, error_data = errors[0]
    assert isinstance(exc, exceptions.UniqueKeyError)
    assert row_number == 3  # actual row number including header line
    assert row_data == OrderedDict([('id', 1), ('age', 36), ('name', 'Jane')])
    assert error_data == OrderedDict([('id', 1)])
    assert 'duplicates' in str(exc)
def test_multiple_foreign_keys_same_field_invalid():
    schema = deepcopy(FK_SCHEMA)
    relations = deepcopy(FK_RELATIONS)
    relations['gender'] = [
        {'firstname': 'Alex', 'gender': 'male/female'},
        {'firstname': 'Johny', 'gender': 'male'},
        {'firstname': 'Walter', 'gender': 'male'},
        {'firstname': 'Alice', 'gender': 'female'},
    ]
    # The main resource now has two foreignKeys using the same 'name' field
    schema['foreignKeys'].append({
        'fields': 'name',
        'reference': {'resource': 'gender', 'fields': 'firstname'},
    })
    table = Table(FK_SOURCE, schema=schema)
    with pytest.raises(exceptions.RelationError) as excinfo:
        table.read(relations=relations)
    assert 'Foreign key' in str(excinfo.value)
def test_iter_single_field_foreign_key_invalid_handled():
    relations = deepcopy(FK_RELATIONS)
    relations['people'][2]['firstname'] = 'Max'
    table = Table(FK_SOURCE, schema=FK_SCHEMA)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = [
        ['1', {'firstname': 'Alex', 'surname': 'Martin'}, 'Martin'],
        ['2', {'firstname': 'John', 'surname': 'Dockins'}, 'Dockins'],
        ['3', {}, 'White'],
    ]
    actual = list(table.iter(relations=relations, exc_handler=handler))
    assert actual == expect
    assert len(errors) == 1
    exc, row_number, row_data, error_data = errors[0]
    assert row_number == 4
    expect_keyed_row_data = OrderedDict(zip(FK_SOURCE[0], FK_SOURCE[3]))
    assert row_data == expect_keyed_row_data
    assert error_data == OrderedDict([('name', 'Walter')])
    assert isinstance(exc, exceptions.UnresolvedFKError)
    assert 'Foreign key' in str(exc)
def test_iter_with_headers_field_names_mismatch_stream_closed():
    table = Table('data/data_headers_field_names_mismatch.csv',
                  schema=SCHEMA_CSV)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_infer_schema_empty_file():
    s = Table('data/empty.csv')
    d = s.infer()
    assert d == {
        'fields': [],
        'missingValues': [''],
    }
def test_single_field_foreign_key_invalid():
    relations = deepcopy(FK_RELATIONS)
    relations['people'][2]['firstname'] = 'Max'
    table = Table(FK_SOURCE, schema=FK_SCHEMA)
    with pytest.raises(exceptions.RelationError) as excinfo:
        table.read(relations=relations)
    assert 'Foreign key' in str(excinfo.value)
def test_iter_invalid_extra_cols_stream_closed():
    table = Table('data/data_invalid_extra_cols.csv', schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    # Circumvent name mangling to get at the (overly private ;-))
    # __stream attribute
    assert table._Table__stream.closed
def test_iter_unique_primary_key_violation_stream_closed():
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    table = Table('data/data_unique_primary_key_violation.csv', schema=schema)
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_read_with_headers_field_names_mismatch():
    source = [
        ['id', 'bad', 'name'],
        [1, 39, 'Paul'],
    ]
    table = Table(source, schema=SCHEMA_CSV)
    with pytest.raises(exceptions.CastError) as excinfo:
        table.read()
    assert 'match schema field names' in str(excinfo.value)
def test_schema_infer_storage(import_module, apply_defaults):
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    table = Table('table', storage='storage')
    table.infer()
    assert table.headers == ['key', 'value']
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
def test_iter_single_field_foreign_key_invalid():
    relations = deepcopy(FK_RELATIONS)
    relations['people'][2]['firstname'] = 'Max'
    table = Table(FK_SOURCE, schema=FK_SCHEMA)
    with pytest.raises(exceptions.RelationError) as excinfo:
        for _ in table.iter(relations=relations):
            pass
    assert isinstance(excinfo.value, exceptions.UnresolvedFKError)
    assert 'Foreign key' in str(excinfo.value)
def test_schema_infer_missing_values():
    table = Table('data/data_infer_missing_values.csv')
    table.infer(missing_values=['-'])
    schema = deepcopy(SCHEMA_CSV)
    schema['missingValues'] = ['-']
    assert table.schema.descriptor == schema
    assert table.read() == [
        [1, 39, 'Paul'],
        [None, 25, 'Test'],
        [2, 23, 'Jimmy'],
        [None, 25, 'Test'],
        [3, 36, 'Jane'],
        [None, 25, 'Test'],
        [4, 28, 'Judy'],
    ]
def test_multi_field_foreign_key_invalid():
    schema = deepcopy(FK_SCHEMA)
    schema['foreignKeys'][0]['fields'] = ['name', 'surname']
    schema['foreignKeys'][0]['reference']['fields'] = ['firstname', 'surname']
    relations = deepcopy(FK_RELATIONS)
    del relations['people'][2]
    table = Table(FK_SOURCE, schema=schema)
    with pytest.raises(exceptions.RelationError) as excinfo:
        table.read(relations=relations)
    assert 'Foreign key' in str(excinfo.value)
def test_multiple_foreign_keys_same_field():
    schema = deepcopy(FK_SCHEMA)
    relations = deepcopy(FK_RELATIONS)
    relations['gender'] = [
        {'firstname': 'Alex', 'gender': 'male/female'},
        {'firstname': 'John', 'gender': 'male'},
        {'firstname': 'Walter', 'gender': 'male'},
        {'firstname': 'Alice', 'gender': 'female'},
    ]
    # The main resource now has two foreignKeys using the same 'name' field
    schema['foreignKeys'].append({
        'fields': 'name',
        'reference': {'resource': 'gender', 'fields': 'firstname'},
    })
    table = Table(FK_SOURCE, schema=schema)
    keyed_rows = table.read(keyed=True, relations=relations)
    assert keyed_rows == [
        {
            'id': '1',
            'name': {'firstname': 'Alex', 'surname': 'Martin',
                     'gender': 'male/female'},
            'surname': 'Martin',
        },
        {
            'id': '2',
            'name': {'firstname': 'John', 'surname': 'Dockins',
                     'gender': 'male'},
            'surname': 'Dockins',
        },
        {
            'id': '3',
            'name': {'firstname': 'Walter', 'surname': 'White',
                     'gender': 'male'},
            'surname': 'White',
        },
    ]
def test_read_storage(import_module):
    # Mocks
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    # Tests
    table = Table('table', backend='storage')
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
def test_iter_invalid_extra_cols():
    source = [
        ['key', 'value'],
        ['one', 1, 'unexpected'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert 'Row length' in str(excinfo.value)
def task5():
    table = Table('movies.csv')
    table.infer()
    # Fixing fields: IDs should be strings
    table.schema.descriptor['fields'][0]['type'] = 'string'
    table.schema.descriptor['fields'][1]['type'] = 'string'
    table.schema.commit()
    print(table.schema.valid)
    table.schema.save('schema.json')
def infer(self, limit=None, confidence=.75):
    """Infer the current type by looking at the values in the table."""

    # Do initial infer to set up headers and schema.
    Table.infer(self)

    rows = self.read(cast=False)
    headers = self.headers

    # Get descriptor
    fields = []
    type_matches = {}
    for header in headers:
        fields.append({'name': header})

    rindex = 0
    for rindex, row in enumerate(rows):
        if limit is not None and rindex == limit:
            break
        # build a column-wise lookup of type matches
        for cindex, value in enumerate(row):
            typeid = self.__get_type(value, type_matches.get(cindex, None))
            type_matches[cindex] = typeid
    self.row_count = rindex

    url_type = type(urlparse('foo'))
    for index, results in type_matches.items():
        type_name, type_format = None, 'default'
        if results is bool:
            type_name = 'boolean'
        elif results is int:
            type_name = 'integer'
        elif results is float:
            type_name = 'number'
        elif results is str:
            type_name = 'string'
        elif results is datetime.datetime:
            type_name = 'datetime'
            type_format = 'any'
        elif results is url_type:
            type_name = 'string'
            type_format = 'uri'
        else:
            raise DerivaCSVError(msg='Bad type in infer')
        fields[index].update({'type': type_name, 'format': type_format})

    # Now update the schema to have the inferred values.
    self.schema.descriptor['fields'] = fields
    # Reset the key constraints as they were blasted away by the infer.
    self.__set_key_constraints()
    self.schema.commit()
    return
def test_unique_constraint_violation():
    schema = deepcopy(SCHEMA_CSV)
    schema['fields'][0]['constraints'] = {'unique': True}
    source = [
        ['id', 'age', 'name'],
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    table = Table(source, schema=schema)
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        table.read()
    assert 'duplicates' in str(excinfo.value)
def test_unique_primary_key_violation():
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    source = [
        ['id', 'age', 'name'],
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    table = Table(source, schema=schema)
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        table.read()
    assert 'duplicates' in str(excinfo.value)
def reindex(self):
    file_path, params = self.get_file_info()
    t = Table(file_path, ignore_blank_headers=True, **params)
    schema = t.infer()
    data = t.iter(keyed=True)
    self.storage.create(self.index_name, schema,
                        reindex=True, always_recreate=True)
    # storage.write is assumed to return a lazy generator here, so it has to
    # be consumed for the rows to actually be written to the index.
    for res in self.storage.write(self.index_name, data):
        pass
def infer(path, limit=2000):
    table = Table(path)
    table.infer(limit=limit, confidence=0.75)
    data = pd.read_csv(path, low_memory=False)
    num_rows = len(data)
    rows_to_scan = limit if limit < num_rows else num_rows
    metadata_array = []
    for field in table.schema.fields:
        metadata = Metadata()
        metadata.name = field.name
        metadata.type = field.type
        metadata.format = field.format
        object_description = data[field.name].astype(object).describe()
        missing_count = num_rows - int(object_description['count'])
        metadata.missing_count = missing_count
        metadata.missing_percentage = round(
            float(missing_count) / num_rows * 100, 2)
        distinct_count = int(object_description['unique'])
        metadata.distinct_count = distinct_count
        metadata.distinct_percentage = round(
            float(distinct_count) / (num_rows - missing_count) * 100, 2)
        metadata.most_frequent = object_description['top']
        if metadata.type == "string" and metadata.missing_percentage != 100.0:
            if rows_to_scan == data[metadata.name].head(rows_to_scan).apply(
                    lambda x: is_date(x)).sum():
                metadata.type = "date"
        if (metadata.type == "integer" or metadata.type == "number") and (
                data.dtypes[field.name] == "int64"
                or data.dtypes[field.name] == "float64"):
            numeric_description = data[field.name].describe()
            metadata.min = numeric_description['min']
            metadata.max = numeric_description['max']
        metadata.ml_type = metadata.type
        if metadata.type == "integer" or metadata.type == "number":
            metadata.ml_type = "numeric"
        if metadata.type == "string":
            metadata.ml_type = "open_text"
        metadata_array.append(metadata)
    return metadata_array
def test_table_sql(name, resource):

    # Storage
    engine = create_engine('sqlite:///')
    storage = Storage.connect('sql', engine=engine)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']
def test_read_storage_passed_as_instance():
    # Mocks
    storage = Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
        spec=Storage,
    )
    # Tests
    table = Table('table', storage=storage)
    table.infer()
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
def test_processors():

    # Processor
    def skip_under_30(erows):
        for row_number, headers, row in erows:
            krow = dict(zip(headers, row))
            if krow['age'] >= 30:
                yield (row_number, headers, row)

    # Create table
    table = Table('data/data_infer.csv', post_cast=[skip_under_30])
    table.infer()
    expect = [[1, 39, 'Paul'], [3, 36, 'Jane']]
    actual = table.read()
    assert actual == expect
def test_table_bigquery(name, resource):

    # Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
    credentials = GoogleCredentials.get_application_default()
    service = build('bigquery', 'v2', credentials=credentials)
    project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
    dataset = 'resource'
    prefix = '%s_' % uuid.uuid4().hex
    storage = Storage.connect('bigquery', service=service, project=project,
                              dataset=dataset, prefix=prefix)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']

    # Clean
    storage.delete()
# pip install sqlalchemy tableschema-sql
import sqlalchemy as sa
from pprint import pprint
from tableschema import Table

# Data source
SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv'

# Create SQL database
db = sa.create_engine('sqlite://')

# Data processor
def skip_under_30(erows):
    for number, headers, row in erows:
        krow = dict(zip(headers, row))
        if krow['age'] >= 30:
            yield (number, headers, row)

# Work with table
table = Table(SOURCE, post_cast=[skip_under_30])
table.schema.save('tmp/persons.json')  # Save INFERRED schema
table.save('persons', backend='sql', engine=db)  # Save data to SQL
table.save('tmp/persons.csv')  # Save data to DRIVE

# Check the result
pprint(Table('persons', backend='sql', engine=db).read(keyed=True))
pprint(Table('tmp/persons.csv').read(keyed=True))

# Will print (twice)
# [{'age': 39, 'id': 1, 'name': 'Paul'},
#  {'age': 36, 'id': 3, 'name': 'Jane'}]
# pip install tableschema-pandas
from pprint import pprint
from tableschema import Table

# Data source
SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv'

# Data processor
def skip_under_30(erows):
    for number, headers, row in erows:
        krow = dict(zip(headers, row))
        if krow['age'] >= 30:
            yield (number, headers, row)

# Export to pandas
table = Table(SOURCE, post_convert=[skip_under_30])
storage = table.save('persons', backend='pandas')
pprint(storage['persons'])

# Will print (if the skip_under_30 filter is used)
# id  age  name
#  1   39  Paul
#  3   36  Jane
from tableschema import Table

# Data from WEB, schema from MEMORY
SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv'
SCHEMA = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'age', 'type': 'integer'},
        {'name': 'name', 'type': 'string'},
    ]
}

# If schema is not passed it will be inferred
table = Table(SOURCE, schema=SCHEMA)
rows = table.iter()
while True:
    try:
        print(next(rows))
    except StopIteration:
        break
    except Exception as exception:
        print(exception)
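# A minimal companion sketch (an addition, not part of the example above):
# rows can also be iterated as dicts with keyed=True, the same option the
# other snippets use with read(keyed=True) and iter(keyed=True).
# Assumes SOURCE and SCHEMA are defined as in the previous example.
from tableschema import Table

table = Table(SOURCE, schema=SCHEMA)
for keyed_row in table.iter(keyed=True):
    # Each row arrives as a dict keyed by field name,
    # e.g. {'id': 1, 'age': 39, 'name': 'Paul'}
    print(keyed_row)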