import tempfile

from openpyxl import load_workbook
from tabulator import Stream

# get_sheet_files, sheet_iterator and get_migdar_session are project-local
# helpers assumed to be in scope.


def load_sheets():
    schema = {'fields': []}
    sheets = []
    for sheet_file in get_sheet_files():
        filename = 'data/search_import/{}'.format(sheet_file['name'])
        wb = load_workbook(filename)
        for sheet_number, sheet_name in enumerate(wb.sheetnames, start=1):
            stream = Stream(filename, sheet=sheet_name)
            stream.open()
            # logging.info('{}/{}'.format(filename, sheet_name))
            stream_iter = stream.iter()
            first_row = next(stream_iter)
            if 'migdar_id' not in first_row and sheet_number > 1:
                header_row = first_sheet_header_row
            else:
                header_row = first_row
                first_row = None
            if sheet_number == 1:
                first_sheet_header_row = header_row
            # logging.info(header_row)
            # logging.info(first_row)
            for k in header_row:
                if k and k not in [f['name'] for f in schema['fields']]:
                    # logging.info('found field: {}'.format(k))
                    field_type = 'string'
                    schema['fields'].append({'name': k, 'type': field_type})
            sheets.append({
                'iterator': sheet_iterator(first_row, header_row, stream_iter,
                                           filename, sheet_name, stream),
                'deleted': 'deleted' in sheet_name.strip().lower()
            })
    return schema, sheets
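# Hedged usage sketch for load_sheets(): drain each per-sheet iterator,
# skipping sheets flagged as deleted. The function name and the process()
# callable are illustrative placeholders, not part of the original module.
def _example_load_sheets_usage(process):
    schema, sheets = load_sheets()
    for sheet in sheets:
        if sheet['deleted']:
            continue
        for row in sheet['iterator']:
            process(row)
    return schema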
def load_from_gdrive_files(rows):
    if rows.res.name == 'search_import_index':
        for row_index, row in enumerate(rows, start=1):
            # if row_index != 5:
            #     continue
            file_url = f"https://migdar-internal-search.odata.org.il/__data/search_import/{row['name']}"
            print(file_url)
            with tempfile.NamedTemporaryFile('w+b', suffix='.xlsx') as temp_file:
                with get_migdar_session().get(file_url, stream=True) as response:
                    for chunk in response.iter_content():
                        temp_file.write(chunk)
                temp_file.flush()
                wb = load_workbook(temp_file.name)
                for sheet_number, sheet_name in enumerate(wb.sheetnames, start=1):
                    if 'deleted' in sheet_name.strip().lower():
                        continue
                    stream = Stream(temp_file.name, sheet=sheet_name)
                    stream.open()
                    print('#{}.{}/{}: loading sheet'.format(
                        row_index, row['name'], sheet_name))
                    stream_iter = stream.iter()
                    try:
                        first_row = next(stream_iter)
                    except StopIteration:
                        first_row = None
                    if first_row:
                        if 'migdar_id' not in first_row and sheet_number > 1:
                            header_row = first_sheet_header_row
                        else:
                            header_row = first_row
                            first_row = None
                        if sheet_number == 1:
                            first_sheet_header_row = header_row
                        yield from sheet_iterator(first_row, header_row,
                                                  stream_iter, row['name'],
                                                  sheet_name, stream)
                    else:
                        # no usable first row: drain the sheet without yielding
                        # (renamed loop variable to avoid shadowing the outer `row`)
                        for _ in stream_iter:
                            pass
    else:
        yield from rows
# Module-level imports this excerpt relies on (relative paths assume the
# tableschema package layout):
from copy import copy
from functools import partial
from collections import defaultdict, OrderedDict
from tabulator import Stream
from .schema import Schema
from .storage import Storage
from . import exceptions


class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False, post_cast=[],
                 storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            self.__stream.close()
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_values = tuple(
                                keyed_row[f] for f in foreign_key['fields'])
                            message = 'Foreign key "%s" violation in row "%s": %s not found in %s'
                            message = message % (
                                foreign_key['fields'], row_number,
                                local_values,
                                foreign_key['reference']['resource'])
                            raise exceptions.RelationError(message)
                        elif type(refValue) is dict:
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty:
                            # refValue == row, there is nothing to do
                            # (an empty dict might be a better returned value for this case?)
                            pass
                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False,
             limit=None, foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        # we don't need to load the complete reference table to test relations:
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition.
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign_keys[reference][foreign_keys tuple] = { (foreign_keys_values, ): one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys;
                # to optimize, we prepare an index of existing values.
                # this index should use reference + foreign_keys as key
                # because many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple(row[foreign_field]
                                for foreign_field in fk['reference']['fields'])
                    # here we should choose to pick the first or nth row which matches;
                    # the previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(
                            fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(
                            fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
class Table(object):
    """Table Schema table representation.

    Args:
        source (mixed): data source
        schema (Schema/dict/str): schema instance or descriptor/path/url
        backend (None/str): backend name like `sql` or `bigquery`
        options (dict): tabulator options or backend options

    """

    # Public

    def __init__(self, source, schema=None, post_cast=None, backend=None,
                 **options):

        # Defaults
        if post_cast is None:
            post_cast = []

        # Schema
        self.__schema = None
        if isinstance(schema, (compat.str, dict)):
            self.__schema = Schema(schema)

        # Tabulator
        if backend is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)
            self.__stream.open()
            if self.__schema is None:
                self.__schema = Schema(
                    infer(self.__stream.headers, self.__stream.sample))

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            generator = partial(storage.iter, source)
            if self.__schema is None:
                self.__schema = Schema(storage.describe(source))
            storage.describe(source, self.__schema.descriptor)
            self.__stream = Stream(generator, headers=self.__schema.headers)
            self.__stream.open()

        # Attributes
        self.__post_cast = post_cast

    @property
    def stream(self):
        """tabulator.Stream: stream instance
        """
        return self.__stream

    @property
    def schema(self):
        """Schema: schema instance
        """
        return self.__schema

    def iter(self, keyed=False, extended=False):
        """Yields table rows.

        Args:
            keyed (bool): yield keyed rows
            extended (bool): yield extended rows

        Yields:
            mixed[]/mixed{}: row or keyed row or extended row

        """
        self.__stream.reset()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator)
        for number, headers, row in iterator:
            if extended:
                yield (number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

    def read(self, keyed=False, extended=False, limit=None):
        """Read table rows.

        Args:
            limit (int): return this amount of rows

        Returns:
            list[]: table rows

        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def save(self, target, backend=None, **options):
        """Save table rows.

        NOTE: To save schema use `table.schema.save(target)`

        Args:
            target (str): saving target
            backend (None/str): backend name like `sql` or `bigquery`
            options (dict): tabulator options or backend options

        Returns:
            None/Storage: storage instance if backend used

        """
        # Tabulator
        if backend is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter())
            return storage

    # Internal

    def __apply_processors(self, iterator):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for number, headers, row in extended_rows:
                headers = self.__schema.headers
                row = self.__schema.cast_row(row)
                yield (number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
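# Hedged example of this older API's eager flow (assumes a local 'data.csv'
# with a header row; the schema is inferred when the stream is opened in
# __init__, so no explicit infer step exists in this variant; the function
# name is illustrative):
def _example_table_read():
    table = Table('data.csv')            # schema inferred from headers + sample
    return table.read(keyed=True, limit=10)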
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False, post_cast=[],
                 storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations,
                                                 foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False,
             limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
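# Sketch of the lazy infer-then-save flow this variant exposes (assumes a
# local 'data.csv'; 'out.csv' and the function name are hypothetical):
def _example_infer_and_save():
    table = Table('data.csv')
    descriptor = table.infer(limit=50, confidence=0.85)  # sets table.schema
    table.save('out.csv')                                # streams cast rows to CSV
    return descriptor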
# Module-level imports this excerpt relies on (relative paths assume the
# tableschema package layout):
from copy import copy
from functools import partial
from collections import defaultdict, OrderedDict
from tabulator import Stream
from .schema import Schema
from .storage import Storage
from . import exceptions


class Table(object):
    """Table representation

    # Arguments
        source (str/list[]): data source one of:
            - local file (path)
            - remote file (url)
            - array of arrays representing the rows
        schema (any): data schema in all forms supported by `Schema` class
        strict (bool): strictness option to pass to `Schema` constructor
        post_cast (function[]): list of post cast processors
        storage (None): storage name like `sql` or `bigquery`
        options (dict): `tabulator` or storage's options

    # Raises
        TableSchemaException: raises on any error

    """

    # Public

    def __init__(self, source, schema=None, strict=False, post_cast=[],
                 storage=None, **options):

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Table's headers, if available

        # Returns
            str[]: headers

        """
        return self.__headers

    @property
    def schema(self):
        """Returns schema class instance if available

        # Returns
            Schema: schema

        """
        return self.__schema

    @property
    def size(self):
        """Table's size in BYTES if it's available

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns size of already read contents

        # Returns
            int/None: size in BYTES

        """
        if self.__stream:
            return self.__stream.size

    @property
    def hash(self):
        """Table's SHA256 hash if it's available.

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns hash of already read contents

        # Returns
            str/None: SHA256 hash

        """
        if self.__stream:
            return self.__stream.hash

    def iter(self, keyed=False, extended=False, cast=True, integrity=False,
             relations=False, foreign_keys_values=False, exc_handler=None):
        """Iterates through the table data and emits rows cast based on table schema.

        # Arguments

            keyed (bool):
                yield keyed rows in a form of `{header1\\: value1, header2\\: value2}`
                (default is false; the form of rows is `[value1, value2]`)

            extended (bool):
                yield extended rows in a form of
                `[rowNumber, [header1, header2], [value1, value2]]`
                (default is false; the form of rows is `[value1, value2]`)

            cast (bool):
                disable data casting if false
                (default is true)

            integrity (dict):
                dictionary in a form of `{'size'\\: <bytes>, 'hash'\\: '<sha256>'}`
                to check integrity of the table when it's read completely.
                Both keys are optional.

            relations (dict):
                dictionary of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                If provided, foreign key fields will be checked and resolved
                to one of their references (/!\\ one-to-many fk are not
                completely resolved).

            foreign_keys_values (dict):
                three-level dictionary of foreign key references optimized
                to speed up validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`.
                If not provided but relations is true, it will be created
                before the validation process by the *index_foreign_keys_values*
                method

            exc_handler (func):
                optional custom exception handler callable.
                Can be used to defer raising errors (i.e. "fail late"), e.g.
                for data validation purposes. Must support the signature below

        # Custom exception handler

        ```python
        def exc_handler(exc, row_number=None, row_data=None, error_data=None):
            '''Custom exception handler (example)

            # Arguments:
                exc(Exception):
                    Deferred exception instance
                row_number(int):
                    Data row number that triggers exception exc
                row_data(OrderedDict):
                    Invalid data row source data
                error_data(OrderedDict):
                    Data row source data field subset responsible for the error,
                    if applicable (e.g. invalid primary or foreign key fields).
                    May be identical to row_data.
            '''
            # ...
        ```

        # Raises
            TableSchemaException: base class of any error
            CastError: data cast error
            IntegrityError: integrity checking error
            UniqueKeyError: unique key constraint violation
            UnresolvedFKError: unresolved foreign key reference error

        # Returns
            Iterator[list]: yields rows

        """
        # TODO: Use helpers.default_exc_handler instead. Prerequisite: Use
        # stream context manager to make sure the stream gets properly closed
        # in all situations, see comment below.
        if exc_handler is None:
            stream = self.__stream

            def exc_handler(exc, *args, **kwargs):
                stream.close()
                raise exc

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        # TODO: Use context manager instead to make sure stream gets closed in
        # case of exceptions. Leaving that in for now for the sake of a smaller
        # diff.
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast,
                                           exc_handler=exc_handler)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        message = ('Table headers (%r) don\'t match '
                                   'schema field names (%r) in row %s'
                                   % (self.headers, self.schema.field_names,
                                      row_number))
                        keyed_row = OrderedDict(zip(headers, row))
                        exc_handler(exceptions.CastError(message),
                                    row_number=row_number, row_data=keyed_row,
                                    error_data=keyed_row)
                        continue

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    keyed_values = OrderedDict((headers[i], value)
                                               for i, value in enumerate(row)
                                               if i in indexes)
                    values = tuple(keyed_values.values())
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            message = ('Field(s) "%s" duplicates in row "%s" '
                                       'for values %r'
                                       % (cache['name'], row_number, values))
                            exc_handler(exceptions.UniqueKeyError(message),
                                        row_number=row_number,
                                        row_data=OrderedDict(zip(headers, row)),
                                        error_data=keyed_values)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_keyed_values = {
                                key: keyed_row[key]
                                for key in foreign_key['fields']
                            }
                            local_values = tuple(local_keyed_values.values())
                            message = ('Foreign key "%s" violation in row "%s": '
                                       '%s not found in %s'
                                       % (foreign_key['fields'], row_number,
                                          local_values,
                                          foreign_key['reference']['resource']))
                            exc_handler(exceptions.UnresolvedFKError(message),
                                        row_number=row_number,
                                        row_data=keyed_row,
                                        error_data=local_keyed_values)
                            # If we reach this point we don't fail-early,
                            # i.e. no exception has been raised. As the
                            # reference can't be resolved, use an empty dict
                            # as the "unresolved result".
                            for field in foreign_key['fields']:
                                if not isinstance(row_with_relations[field], dict):
                                    row_with_relations[field] = {}
                        elif type(refValue) is dict:
                            # Substitute the resolved referenced object for the
                            # original referencing field value.
                            # For a composite foreign key, this substitutes
                            # each part of the composite key with the
                            # referenced object.
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty:
                            # refValue == row, there is nothing to do
                            # (an empty dict might be a better returned value for this case?)
                            pass
                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Check integrity
        if integrity:
            violations = []
            size = integrity.get('size')
            hash = integrity.get('hash')
            if size and size != self.__stream.size:
                violations.append('size "%s"' % self.__stream.size)
            if hash and hash != self.__stream.hash:
                violations.append('hash "%s"' % self.__stream.hash)
            if violations:
                message = 'Calculated %s differ(s) from declared value(s)'
                raise exceptions.IntegrityError(message % ' and '.join(violations))

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, limit=None,
             integrity=False, relations=False, foreign_keys_values=False,
             exc_handler=None):
        """Read the whole table and return as array of rows

        > It has the same API as `table.iter` except for

        # Arguments
            limit (int): limit count of rows to read and return

        # Returns
            list[]: returns rows

        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         integrity=integrity, relations=relations,
                         foreign_keys_values=foreign_keys_values,
                         exc_handler=exc_handler)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """Infer a schema for the table.

        It will infer and set Table Schema to `table.schema` based on table data.

        # Arguments
            limit (int): limit rows sample size
            confidence (float): how many casting errors are allowed
                (as a ratio, between 0 and 1)

        # Returns
            dict: Table Schema descriptor

        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Save data source to file locally in CSV format with `,` (comma) delimiter

        > To save schema use `table.schema.save()`

        # Arguments
            target (str): saving target (e.g. file path)
            storage (None/str): storage name like `sql` or `bigquery`
            options (dict): `tabulator` or storage options

        # Raises
            TableSchemaException: raises an error if there is saving problem

        # Returns
            True/Storage: returns true or storage instance

        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        """Creates a three-level dictionary of foreign key references

        We create them optimized to speed up validation process in a form of
        `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.

        For each foreign key of the schema it will iterate through the
        corresponding `relations['resource']` to create an index (i.e. a dict)
        of existing values for the foreign fields and store one keyed row for
        each value combination.

        The optimization relies on the indexation of possible values for one
        foreign key in a hashmap to later speed up resolution.

        This method is public to allow creating the index once to apply it
        on multiple tables sharing the same schema (typically
        [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))

        # Notes

        - the second key of the output is a tuple of the foreign fields,
          a proxy identifier of the foreign key
        - the same relation resource can be indexed multiple times as a schema
          can contain more than one Foreign Key pointing to the same resource

        # Arguments
            relations (dict):
                dict of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                It must contain all resources pointed in the foreign keys
                schema definition.

        # Returns
            dict:
                returns a three-level dictionary of foreign key references
                optimized to speed up the validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}})`

        """
        # we don't need to load the complete reference table to test relations:
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition.
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign_keys[reference][foreign_keys tuple] = { (foreign_keys_values, ): one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys;
                # to optimize, we prepare an index of existing values.
                # this index should use reference + foreign_keys as key
                # because many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple(row[foreign_field]
                                for foreign_field in fk['reference']['fields'])
                    # here we should choose to pick the first or nth row which matches;
                    # the previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(
                            fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(
                            fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True, exc_handler=None):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row, row_number=row_number,
                                                 exc_handler=exc_handler)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
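# A sketch of the relations flow documented above, with two small inline
# sources (the function name and all data values here are illustrative,
# not from the original module):
def _example_foreign_keys():
    relations = {
        'people': [{'id': 1, 'name': 'alice'}, {'id': 2, 'name': 'bob'}],
    }
    table = Table(
        [['person_id', 'amount'], [1, 10], [2, 20]],
        schema={
            'fields': [
                {'name': 'person_id', 'type': 'integer'},
                {'name': 'amount', 'type': 'integer'},
            ],
            'foreignKeys': [{
                'fields': ['person_id'],
                'reference': {'resource': 'people', 'fields': ['id']},
            }],
        })
    # Index once; reusable across tables sharing the same schema
    fk_index = table.index_foreign_keys_values(relations)
    # Each person_id is substituted by its resolved referenced row
    return table.read(keyed=True, relations=relations,
                      foreign_keys_values=fk_index)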
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False, post_cast=[],
                 storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations,
                                                 foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False,
             limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator