def write(self, bucket, rows, keyed=False, as_generator=False, update_keys=None):
    """https://github.com/frictionlessdata/tableschema-sql-py#storage
    """

    # Check update keys
    if update_keys is not None and len(update_keys) == 0:
        message = 'Argument "update_keys" cannot be an empty list'
        raise tableschema.exceptions.StorageError(message)

    # Get table and description
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    fallbacks = self.__fallbacks.get(bucket, [])

    # Write rows to table
    convert_row = partial(self.__mapper.convert_row, schema=schema, fallbacks=fallbacks)
    writer = Writer(table, schema, update_keys, self.__autoincrement, convert_row)
    with self.__connection.begin():
        gen = writer.write(rows, keyed=keyed)
        if as_generator:
            return gen
        collections.deque(gen, maxlen=0)
def get_table_schema(table):
    """Create a Table Schema descriptor from an SQLAlchemy table.

    See: https://frictionlessdata.io/specs/table-schema/

    There are four possible elements in the Table Schema:
      * fields (an array of field descriptors)
      * primaryKey
      * foreignKeys (an array of foreignKey objects)
      * missingValues (an array of strings to be interpreted as null)
    """
    descriptor = {}
    descriptor['fields'] = get_fields(table)
    descriptor['primaryKey'] = get_primary_key(table)
    fkeys = get_foreign_keys(table)
    if fkeys:
        descriptor['foreignKeys'] = fkeys
    descriptor['missingValues'] = get_missing_values(table)

    schema = tableschema.Schema(descriptor)
    if not schema.valid:
        raise AssertionError(f"""
            Invalid table schema for {table}

            Errors:
            {schema.errors}
        """)

    return descriptor
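For reference, a minimal sketch of a descriptor containing all four elements listed in the docstring above; the field and resource names are illustrative, not taken from any real table.

import tableschema

# Illustrative descriptor with the four Table Schema elements (assumed names/values)
EXAMPLE_DESCRIPTOR = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'author_id', 'type': 'integer'},
        {'name': 'title', 'type': 'string'},
    ],
    'primaryKey': ['id'],
    'foreignKeys': [{
        'fields': ['author_id'],
        'reference': {'resource': 'authors', 'fields': ['id']},
    }],
    'missingValues': ['', 'NA'],
}

assert tableschema.Schema(EXAMPLE_DESCRIPTOR).valid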
def test_validate_invalid_milestone_name(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["custom_milestone_field_names"] = {
        "10": "bad_milestone_name"
    }
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def cast(resource, skip=[]):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            if field.type not in skip:
                row[index] = field.cast_value(row[index])
    return resource
def write(self, bucket, rows, keyed=False, as_generator=False,
          update_keys=None, buffer_size=1000, use_bloom_filter=True):
    """Write to bucket

    # Arguments
        keyed (bool):
            accept keyed rows
        as_generator (bool):
            returns generator to provide writing control to the client
        update_keys (str[]):
            update instead of inserting if key values match existent rows
        buffer_size (int=1000):
            maximum number of rows to try and write to the db in one batch
        use_bloom_filter (bool=True):
            should we use a bloom filter to optimize DB update performance
            (in exchange for some setup time)

    """

    # Check update keys
    if update_keys is not None and len(update_keys) == 0:
        message = 'Argument "update_keys" cannot be an empty list'
        raise tableschema.exceptions.StorageError(message)

    # Get table and description
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    fallbacks = self.__fallbacks.get(bucket, [])

    # Write rows to table
    convert_row = partial(self.__mapper.convert_row, schema=schema, fallbacks=fallbacks)
    autoincrement = self.__get_autoincrement_for_bucket(bucket)
    writer = Writer(
        table, schema,
        # Only PostgreSQL supports "returning", so we don't use autoincrement for all
        autoincrement=autoincrement if self.__dialect in ['postgresql'] else None,
        update_keys=update_keys,
        convert_row=convert_row,
        buffer_size=buffer_size,
        use_bloom_filter=use_bloom_filter)
    with self.__connection.begin():
        gen = writer.write(rows, keyed=keyed)
        if as_generator:
            return gen
        collections.deque(gen, maxlen=0)
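A hedged usage sketch for the write() method above, assuming the tableschema-sql Storage wrapper; the engine URL, bucket name, descriptor, and rows are illustrative.

import sqlalchemy as sa
from tableschema_sql import Storage

engine = sa.create_engine('sqlite://')  # in-memory database, just for the example
storage = Storage(engine=engine)

# Create a bucket (table) from a descriptor, then upsert keyed rows:
# rows whose "id" matches an existing row are updated rather than inserted.
storage.create('articles', {
    'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'title', 'type': 'string'}],
    'primaryKey': 'id',
})
storage.write('articles', [{'id': 1, 'title': 'first'}], keyed=True, update_keys=['id'])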
def test_infer_schema_row_limit(self):
    filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = tableschema.compat.csv_reader(stream)
        schema = tableschema.infer(headers, values, row_limit=4)
    schema_model = tableschema.Schema(schema)

    self.assertEqual(schema_model.get_field('id').type, 'integer')
    self.assertEqual(schema_model.get_field('age').type, 'integer')
    self.assertEqual(schema_model.get_field('name').type, 'string')
def test_infer_schema_primary_key_list(self):
    primary_key = ['id', 'age']
    filepath = os.path.join(self.data_dir, 'data_infer.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = tableschema.compat.csv_reader(stream)
        schema = tableschema.infer(headers, values, primary_key=primary_key)
    schema_model = tableschema.Schema(schema)

    self.assertEqual(schema_model.primary_key, primary_key)
def iter(self, bucket):
    schema = tableschema.Schema(self.describe(bucket))

    datastore_search_url = \
        "{}/datastore_search".format(self.__base_endpoint)
    params = {'resource_id': bucket}
    response = self._make_ckan_request(datastore_search_url, params=params)
    while response['result']['records']:
        for row in response['result']['records']:
            row = self.__mapper.restore_row(row, schema=schema)
            yield row
        next_url = self.__base_url + response['result']['_links']['next']
        response = self._make_ckan_request(next_url)
def schema(self):
    return tableschema.Schema(
        {
            'fields': [{
                'name': 'dest_path',
                'type': 'string',
                'constraints': {
                    'required': True,
                    'unique': True
                }
            }]
        },
        strict=True)
def process_resource(spec, rows):
    schema = spec['schema']
    jts = tableschema.Schema(schema)
    field_names = list(map(lambda f: f['name'], schema['fields']))
    for row in rows:
        flattened_row = [row.get(name) for name in field_names]
        try:
            flattened_row = jts.cast_row(flattened_row)
        except Exception:
            logging.error('Failed to cast row %r', flattened_row)
            raise
        row = dict(zip(field_names, flattened_row))
        yield row
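A minimal illustration of the cast_row() call used above; the schema and values are made up for the example.

import tableschema

# Cast string values to the declared field types in one pass
jts = tableschema.Schema({'fields': [
    {'name': 'id', 'type': 'integer'},
    {'name': 'active', 'type': 'boolean'},
]})
print(jts.cast_row(['42', 'true']))  # -> [42, True]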
def cast(resource, skip=[], wrap={}, wrap_each={}):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            value = row[index]
            if field.type not in skip:
                value = field.cast_value(value)
            if field.type in wrap:
                value = wrap[field.type](value)
            if field.type in wrap_each:
                value = list(map(wrap_each[field.type], value))
            row[index] = value
    return resource
def process_one(rows, field, op, arg):
    spec = rows.spec
    mutated_field = field
    fields = spec['schema']['fields']
    schema = {
        'fields': list(filter(lambda f: f['name'] == mutated_field, fields))
    }
    jts = tableschema.Schema(schema).fields[0]
    arg = jts.cast_value(arg)
    op = OPS[op]
    for row in rows:
        if op(arg, row.get(field)):
            yield row
def iter(self, bucket):

    # Check existence
    if bucket not in self.buckets:
        message = 'Bucket "%s" doesn\'t exist.' % bucket
        raise tableschema.exceptions.StorageError(message)

    # Prepare
    descriptor = self.describe(bucket)
    schema = tableschema.Schema(descriptor)

    # Yield rows
    for pk, row in self.__dataframes[bucket].iterrows():
        row = self.__mapper.restore_row(row, schema=schema, pk=pk)
        yield row
def add_extended_metadata(filename, metadata):
    """Update the given metadata dict with additional tableschema metadata.
    This is only available for files that tableschema can read, which are
    only TSVs and CSVs. Tableschema doesn't add much, but it could be handy
    if it can detect extended types like locations."""
    metadata['previewbytes'] = get_preview_byte_count(filename)
    try:
        ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor
        new_field_definitions = []
        for m, ts in zip(metadata['field_definitions'], ts_info['fields']):
            m['format'] = ts['format']
            new_field_definitions.append(m)
        metadata['field_definitions'] = new_field_definitions
    except tabulator.exceptions.FormatError:
        pass
    return metadata
def iter(self, bucket):

    # Get table and fallbacks
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    autoincrement = self.__get_autoincrement_for_bucket(bucket)

    # Open and close transaction
    with self.__connection.begin():
        # Streaming may not work for some backends:
        # http://docs.sqlalchemy.org/en/latest/core/connections.html
        select = table.select().execution_options(stream_results=True)
        result = select.execute()
        for row in result:
            row = self.__mapper.restore_row(
                row, schema=schema, autoincrement=autoincrement)
            yield row
def write_aux(self, bucket, rows, method="upsert"): schema = tableschema.Schema(self.describe(bucket)) datastore_upsert_url = \ "{}/datastore_upsert".format(self.__base_endpoint) records = [] for r in rows: records.append(self.__mapper.convert_row(r, schema)) yield r params = { 'resource_id': bucket, 'method': method, 'force': True, 'records': records } self._make_ckan_request(datastore_upsert_url, method='POST', json=params)
def iter(self, bucket):
    """https://github.com/frictionlessdata/tableschema-sql-py#storage
    """

    # Get table and fallbacks
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))

    # Open and close transaction
    with self.__connection.begin():
        # Streaming may not work for some backends:
        # http://docs.sqlalchemy.org/en/latest/core/connections.html
        select = table.select().execution_options(stream_results=True)
        result = select.execute()
        for row in result:
            row = self.__mapper.restore_row(row, schema=schema)
            yield row
def analyze_dataframe(filename, foreign_keys=None):
    # Pandas analysis
    df = pandas.read_csv(filename, sep='\t')
    pandas_info = df.describe(include='all')

    # Tableschema analysis
    ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor

    column_metadata = []
    for column in ts_info['fields'][:10]:
        df_metadata = column.copy()
        col_name = column['name']
        df_metadata.update(get_pandas_field_metadata(pandas_info, col_name))
        df_metadata.update(get_foreign_key(foreign_keys, column))
        column_metadata.append(df_metadata)

    dataframe_metadata = {
        'name': 'Data Dictionary',
        # df.shape[0] seems to have issues determining rows
        'numrows': len(df.index),
        'numcols': df.shape[1],
        'previewbytes': get_preview_byte_count(filename),
        'field_definitions': column_metadata,
        'labels': {
            'name': 'Column Name',
            'type': 'Data Type',
            'format': 'Format',
            'count': 'Number of non-null entries',
            '25': '25th Percentile',
            '50': '50th Percentile',
            '75': '75th Percentile',
            'std': 'Standard Deviation',
            'mean': 'Mean Value',
            'min': 'Minimum Value',
            'max': 'Maximum Value',
            'unique': 'Unique Values',
            'top': 'Top Common',
            'frequency': 'Frequency of Top Common Value',
            'reference': 'Link to resource definition'
        }
    }
    return dataframe_metadata
def process_resource(spec, rows, parameters, logger: LoggerImpl):
    mutated_field = parameters['field']
    fields = spec['schema']['fields']
    schema = {
        'fields': list(filter(lambda f: f['name'] == mutated_field, fields))
    }
    jts = tableschema.Schema(schema).fields[0]
    bad_count = 0
    for i, row in enumerate(rows):
        if bad_count > 100:
            yield row
            continue
        value = row.get(mutated_field)
        try:
            value = jts.cast_value(value)
            row[mutated_field] = value
            yield row
        except Exception:
            bad_count += 1
            logger.bad_value(spec['name'], i, dict(row), mutated_field,
                             row.get(mutated_field))
            yield row
def convert_descriptor_and_rows(self, descriptor, rows):
    """Convert descriptor and rows to Pandas
    """

    # Prepare
    primary_key = None
    schema = tableschema.Schema(descriptor)
    if len(schema.primary_key) == 1:
        primary_key = schema.primary_key[0]
    elif len(schema.primary_key) > 1:
        message = 'Multi-column primary keys are not supported'
        raise tableschema.exceptions.StorageError(message)

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        values = []
        index = None
        for field, value in zip(schema.fields, row):
            try:
                if isinstance(value, float) and np.isnan(value):
                    value = None
                if value and field.type == 'integer':
                    value = int(value)
                value = field.cast_value(value)
            except tableschema.exceptions.CastError:
                value = json.loads(value)
            # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name == primary_key:
                index = value
            else:
                values.append(value)
        data_rows.append(tuple(values))
        index_rows.append(index)

    # Get dtypes
    dtypes = []
    for field in schema.fields:
        if field.name != primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))

    # Create dataframe
    index = None
    columns = schema.headers
    array = np.array(data_rows, dtype=dtypes)
    if primary_key:
        index_field = schema.get_field(primary_key)
        index_dtype = self.convert_type(index_field.type)
        index_class = pd.Index
        if index_field.type in ['datetime', 'date']:
            index_class = pd.DatetimeIndex
        index = index_class(index_rows, name=primary_key, dtype=index_dtype)
        columns = filter(lambda column: column != primary_key, schema.headers)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe
def __infer(self, sample, headers=None):
    if headers is None:
        headers = [None] * len(sample[0]) if sample else []
    schema = tableschema.Schema()
    schema.infer(sample, headers=headers)
    return schema
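A hedged example of the infer pattern used above, with a tiny inline sample; the headers and values are illustrative, and the expected descriptor is only a rough sketch.

import tableschema

schema = tableschema.Schema()
schema.infer([['1', 'alex'], ['2', 'sam']], headers=['id', 'name'])
print(schema.descriptor['fields'])
# Expected to look roughly like:
# [{'name': 'id', 'type': 'integer', 'format': 'default'},
#  {'name': 'name', 'type': 'string', 'format': 'default'}]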
import unittest

import tableschema

from etl.helpers import table_schema

TEST_SCHEMA1: tableschema.Schema = tableschema.Schema({
    "fields": [{
        "name": "field1"
    }, {
        "name": "field2",
        "milestones": [0]
    }],
    "column_based_milestone_names": ["Intake", "Exit", "NinetyDays"],
})

TEST_SCHEMA2: tableschema.Schema = tableschema.Schema({
    "fields": [
        {
            "name": "field1"
        },
        {
            "name": "field2",
            "milestones": [0]
        },
        {
            "name": "field3",
            "milestones": [0, 1]
        },
        {
            "name": "field4",
            "milestones": [2],
import unittest
import typing

import numpy as np
import pandas as pd
import tableschema

from etl.helpers import common, dataset_shape

MEMBER_ORGANIZATION_ID = "sample_id"

TEST_SCHEMA: tableschema.Schema = tableschema.Schema(
    {
        "fields": [
            {"name": "field1"},
            {"name": "field2"},
            {"name": "field3", "allows_multiple": True},
        ]
    }
)

TEST_COLUMN_MAPPING: typing.Dict[str, str] = {
    "internal_column_name1": "field1",
    "internal_column_name2": "field2",
    "internal_column_name3": "field3",
}

TEST_SCHEMA_COL: tableschema.Schema = tableschema.Schema(
    {
        "fields": [
            {"name": "field1"},
            {"name": "field2", "milestones": [0]},
def convert_descriptor(self, bucket, descriptor, index_fields=[], autoincrement=None):
    """Convert descriptor to SQL
    """

    # Prepare
    columns = []
    indexes = []
    fallbacks = []
    constraints = []
    column_mapping = {}
    table_name = self.convert_bucket(bucket)
    comment = _get_comment(
        descriptor.get('title', ''), descriptor.get('description', ''))
    schema = tableschema.Schema(descriptor)

    # Autoincrement
    if autoincrement is not None:
        columns.append(sa.Column(
            autoincrement, sa.Integer, autoincrement=True, nullable=False))

    # Fields
    for field in schema.fields:
        column_type = self.convert_type(field.type)
        if not column_type:
            column_type = sa.Text
            fallbacks.append(field.name)
        nullable = not field.required
        comment = _get_field_comment(field)
        unique = field.constraints.get('unique', False)
        checks = []
        for name, value in field.constraints.items():
            if name == 'minLength':
                checks.append(Check('LENGTH("%s") >= %s' % (field.name, value)))
            elif name == 'maxLength':
                checks.append(Check('LENGTH("%s") <= %s' % (field.name, value)))
            elif name == 'minimum':
                checks.append(Check('"%s" >= %s' % (field.name, value)))
            elif name == 'maximum':
                checks.append(Check('"%s" <= %s' % (field.name, value)))
            elif name == 'pattern':
                if self.__dialect in ['postgresql']:
                    checks.append(Check('"%s" ~ \'%s\'' % (field.name, value)))
                else:
                    checks.append(Check('"%s" REGEXP \'%s\'' % (field.name, value)))
            elif name == 'enum':
                column_type = sa.Enum(*value, name='%s_%s_enum' % (table_name, field.name))
        column = sa.Column(*([field.name, column_type] + checks),
                           nullable=nullable, comment=comment, unique=unique)
        columns.append(column)
        column_mapping[field.name] = column

    # Primary key
    pk = descriptor.get('primaryKey', None)
    if pk is not None:
        if isinstance(pk, six.string_types):
            pk = [pk]
    if autoincrement is not None:
        if pk is not None:
            pk = [autoincrement] + pk
        else:
            pk = [autoincrement]
    if pk is not None:
        constraint = sa.PrimaryKeyConstraint(*pk)
        constraints.append(constraint)

    # Foreign keys
    if self.__dialect in ['postgresql', 'sqlite']:
        fks = descriptor.get('foreignKeys', [])
        for fk in fks:
            fields = fk['fields']
            resource = fk['reference']['resource']
            foreign_fields = fk['reference']['fields']
            if isinstance(fields, six.string_types):
                fields = [fields]
            if resource != '':
                table_name = self.convert_bucket(resource)
            if isinstance(foreign_fields, six.string_types):
                foreign_fields = [foreign_fields]
            composer = lambda field: '.'.join([table_name, field])
            foreign_fields = list(map(composer, foreign_fields))
            constraint = sa.ForeignKeyConstraint(fields, foreign_fields)
            constraints.append(constraint)

    # Indexes
    if self.__dialect == 'postgresql':
        for index, index_definition in enumerate(index_fields):
            name = table_name + '_ix%03d' % index
            index_columns = [column_mapping[field] for field in index_definition]
            indexes.append(sa.Index(name, *index_columns))

    return columns, constraints, indexes, fallbacks, comment
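To make the constraint handling above concrete, here is a hedged sketch of a field descriptor (made-up names and values) and the column options the loop would derive from it.

# Illustrative field descriptor; the trailing comments show the mapping applied above
EXAMPLE_FIELD = {
    'name': 'status',
    'type': 'string',
    'constraints': {
        'required': True,           # -> nullable=False
        'unique': True,             # -> unique=True
        'minLength': 2,             # -> CHECK (LENGTH("status") >= 2)
        'maxLength': 16,            # -> CHECK (LENGTH("status") <= 16)
        'enum': ['draft', 'live'],  # -> sa.Enum('draft', 'live', name='<table>_status_enum')
    },
}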
def test_validate_invalid_milestone(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["milestones"] = [10]
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def test_validate_numeric_enum_missing_value(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["constraints"]["maximum"] = 3
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def test_validate_numeric_enum_not_int(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["type"] = "string"
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def convert_descriptor(self, bucket, descriptor, index_fields=[], autoincrement=None):
    """Convert descriptor to SQL
    """

    # Prepare
    columns = []
    indexes = []
    fallbacks = []
    constraints = []
    column_mapping = {}
    table_name = self.convert_bucket(bucket)
    schema = tableschema.Schema(descriptor)

    # Autoincrement
    if autoincrement is not None:
        columns.append(sa.Column(
            autoincrement, sa.Integer, autoincrement=True, nullable=False))

    # Fields
    for field in schema.fields:
        column_type = self.convert_type(field.type)
        if not column_type:
            column_type = sa.Text
            fallbacks.append(field.name)
        nullable = not field.required
        column = sa.Column(field.name, column_type, nullable=nullable)
        columns.append(column)
        column_mapping[field.name] = column

    # Primary key
    pk = descriptor.get('primaryKey', None)
    if pk is not None:
        if isinstance(pk, six.string_types):
            pk = [pk]
    if autoincrement is not None:
        if pk is not None:
            pk = [autoincrement] + pk
        else:
            pk = [autoincrement]
    if pk is not None:
        constraint = sa.PrimaryKeyConstraint(*pk)
        constraints.append(constraint)

    # Foreign keys
    if self.__dialect == 'postgresql':
        fks = descriptor.get('foreignKeys', [])
        for fk in fks:
            fields = fk['fields']
            resource = fk['reference']['resource']
            foreign_fields = fk['reference']['fields']
            if isinstance(fields, six.string_types):
                fields = [fields]
            if resource != '':
                table_name = self.convert_bucket(resource)
            if isinstance(foreign_fields, six.string_types):
                foreign_fields = [foreign_fields]
            composer = lambda field: '.'.join([table_name, field])
            foreign_fields = list(map(composer, foreign_fields))
            constraint = sa.ForeignKeyConstraint(fields, foreign_fields)
            constraints.append(constraint)

    # Indexes
    if self.__dialect == 'postgresql':
        for index, index_definition in enumerate(index_fields):
            name = table_name + '_ix%03d' % index
            index_columns = [column_mapping[field] for field in index_definition]
            indexes.append(sa.Index(name, *index_columns))

    return (columns, constraints, indexes, fallbacks)
def convert_descriptor_and_rows(self, descriptor, rows):
    """Convert descriptor and rows to Pandas
    """
    schema = tableschema.Schema(descriptor)

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        data_values = []
        index_values = []
        for field, value in zip(schema.fields, row):
            try:
                if isinstance(value, float) and np.isnan(value):
                    value = None
                if value and field.type == 'integer':
                    value = int(value)
                value = field.cast_value(value)
            except tableschema.exceptions.CastError:
                value = json.loads(value)
            # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name in schema.primary_key:
                index_values.append(value)
            else:
                data_values.append(value)
        if len(schema.primary_key) == 1:
            index_rows.append(index_values[0])
        elif len(schema.primary_key) > 1:
            index_rows.append(tuple(index_values))
        data_rows.append(tuple(data_values))

    # Create index
    index = None
    if schema.primary_key:
        if len(schema.primary_key) == 1:
            index_class = pd.Index
            index_field = schema.get_field(schema.primary_key[0])
            index_dtype = self.convert_type(index_field.type)
            if index_field.type in ['datetime', 'date']:
                index_class = pd.DatetimeIndex
            index = index_class(index_rows, name=index_field.name, dtype=index_dtype)
        elif len(schema.primary_key) > 1:
            index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key)

    # Create dtypes/columns
    dtypes = []
    columns = []
    for field in schema.fields:
        if field.name not in schema.primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))
            columns.append(field.name)

    # Create dataframe
    array = np.array(data_rows, dtype=dtypes)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe