def load_sheets():
    schema = {'fields': []}
    sheets = []
    for sheet_file in get_sheet_files():
        filename = 'data/search_import/{}'.format(sheet_file['name'])
        wb = load_workbook(filename)
        for sheet_number, sheet_name in enumerate(wb.sheetnames, start=1):
            stream = Stream(filename, sheet=sheet_name)
            stream.open()
            # logging.info('{}/{}'.format(filename, sheet_name))
            stream_iter = stream.iter()
            first_row = next(stream_iter)
            if 'migdar_id' not in first_row and sheet_number > 1:
                header_row = first_sheet_header_row
            else:
                header_row = first_row
                first_row = None
            if sheet_number == 1:
                first_sheet_header_row = header_row
            # logging.info(header_row)
            # logging.info(first_row)
            for k in header_row:
                if k and k not in [f['name'] for f in schema['fields']]:
                    # logging.info('found field: {}'.format(k))
                    field_type = 'string'
                    schema['fields'].append({'name': k, 'type': field_type})
            sheets.append({
                'iterator': sheet_iterator(first_row, header_row, stream_iter,
                                           filename, sheet_name, stream),
                'deleted': 'deleted' in sheet_name.strip().lower()
            })
    return schema, sheets
def read_list_from_csv(
    url: str,
    headers: Union[int, List[int], List[str], None] = None,
    dict_form: bool = False,
    **kwargs: Any,
) -> List[Union[Dict, List]]:
    """Read a list of rows in dict or list form from a csv. The headers argument is
    either a row number or list of row numbers (in case of multi-line headers) to be
    considered as headers (rows start counting at 1), or the actual headers defined
    as a list of strings. If not set, all rows will be treated as containing values.

    Args:
        url (str): URL or path to read from
        headers (Union[int, List[int], List[str], None]): Row number of headers. Defaults to None.
        dict_form (bool): Return dict (requires headers parameter) or list for each row. Defaults to False (list)
        **kwargs: Other arguments to pass to Tabulator Stream

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form
    """
    if dict_form and headers is None:
        raise ValueError("If dict_form is True, headers must not be None!")
    stream = Stream(url, headers=headers, **kwargs)
    stream.open()
    result = stream.read(keyed=dict_form)
    stream.close()
    return result
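# Usage sketch (hedged): the csv path and its contents below are hypothetical.
# With headers=1 the first row is consumed as the header row, and dict_form=True
# returns each subsequent row keyed by those headers.
rows = read_list_from_csv('data/table.csv', headers=1, dict_form=True)
# rows -> [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}]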
def get_speech_parts_stream(**kwargs):
    stream = Stream(**kwargs)
    stream.open()
    if stream.headers == ['header', 'body']:
        return stream
    else:
        # close the stream before discarding it so the handle is not leaked
        stream.close()
        return None
def get_speech_parts_stream(**kwargs):
    stream = Stream(**kwargs)
    stream.open()
    assert stream.headers == ['header', 'body'], \
        "Invalid committee meeting protocol parts csv header: {}".format(kwargs["source"])
    return stream
def _xls():
    _s = Stream(path, format='xls', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'xls'
    except (FormatError, BadZipFile, ValueError, XLRDError,
            FileNotFoundError, NotImplementedError):
        return None
def _csv(path, encoding):
    _s = Stream(path, format='csv', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'csv'
    except (FormatError, UnicodeDecodeError, FileNotFoundError, BadZipFile):
        return None
def test_stream_reset_on_close_issue_190():
    source = [['1', 'english'], ['2', '中国人']]
    stream = Stream(source)
    stream.open()
    assert stream.read(limit=1) == [['1', 'english']]
    stream.open()
    assert stream.read(limit=1) == [['1', 'english']]
    stream.close()
def _ods():
    _s = Stream(path, format='ods', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return True
    except (FormatError, OSError, BadZipFile, FileNotFoundError, TypeError):
        return False
def _xlsx():
    _s = Stream(path, format='xlsx', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'xlsx'
    except ValueError:
        return 'xlsx'
    except (FormatError, BadZipFile, OSError, FileNotFoundError, KeyError):
        return None
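# Hedged sketch: a hypothetical driver that chains probes like the ones above,
# returning the first format that opens cleanly. The exception sets are abridged
# copies of those used by the probes (XLRDError is omitted to avoid the xlrd
# import, and the xlsx-on-ValueError special case above is not reproduced).
from zipfile import BadZipFile

from tabulator import Stream
from tabulator.exceptions import FormatError


def detect_spreadsheet_format(path, encoding='utf-8'):
    probes = {
        'csv': (FormatError, UnicodeDecodeError, FileNotFoundError, BadZipFile),
        'xlsx': (FormatError, BadZipFile, OSError, FileNotFoundError, KeyError),
        'xls': (FormatError, BadZipFile, ValueError, FileNotFoundError, NotImplementedError),
        'ods': (FormatError, OSError, BadZipFile, FileNotFoundError, TypeError),
    }
    for fmt, expected_errors in probes.items():
        try:
            # opening (and immediately closing) the stream is enough to
            # validate that the file parses in the candidate format
            stream = Stream(path, format=fmt, encoding=encoding)
            stream.open()
            stream.close()
            return fmt
        except expected_errors:
            continue
    return None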
def test_stream_local_csv_zip_multiple_open():
    # That's how `tableschema.iter()` acts
    stream = Stream('data/table.csv.zip')
    stream.open()
    assert stream.headers is None
    assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream.close()
    stream.open()
    assert stream.headers is None
    assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream.close()
def write_list_to_csv(list_of_rows, filepath, headers=None):
    # type: (List[Union[DictUpperBound, List]], str, Optional[List[str]]) -> None
    """Write a list of rows in dict or list form to a csv.

    Args:
        list_of_rows (List[Union[DictUpperBound, List]]): List of rows in dict or list form
        filepath (str): Path to write to
        headers (Optional[List[str]]): Headers to write. Defaults to None.

    Returns:
        None
    """
    stream = Stream(list_of_rows, headers=headers)
    stream.open()
    stream.save(filepath, format='csv')
    stream.close()
def read_list_from_csv(filepath, dict_form=False, headers=None):
    # type: (str, bool, Optional[int]) -> List[Union[Dict, List]]
    """Read a list of rows in dict or list form from a csv.

    Args:
        filepath (str): Path to read from
        dict_form (bool): Return in dict form. Defaults to False.
        headers (Optional[int]): Row number of headers. Defaults to None.

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form
    """
    stream = Stream(filepath, headers=headers)
    stream.open()
    result = stream.read(keyed=dict_form)
    stream.close()
    return result
def load_from_gdrive_files(rows):
    if rows.res.name == 'search_import_index':
        for row_index, row in enumerate(rows, start=1):
            # if row_index != 5:
            #     continue
            file_url = f"https://migdar-internal-search.odata.org.il/__data/search_import/{row['name']}"
            print(file_url)
            with tempfile.NamedTemporaryFile('w+b', suffix='.xlsx') as temp_file:
                with get_migdar_session().get(file_url, stream=True) as response:
                    for chunk in response.iter_content():
                        temp_file.write(chunk)
                temp_file.flush()
                wb = load_workbook(temp_file.name)
                for sheet_number, sheet_name in enumerate(wb.sheetnames, start=1):
                    if 'deleted' in sheet_name.strip().lower():
                        continue
                    stream = Stream(temp_file.name, sheet=sheet_name)
                    stream.open()
                    print('#{}.{}/{}: loading sheet'.format(
                        row_index, row['name'], sheet_name))
                    stream_iter = stream.iter()
                    try:
                        first_row = next(stream_iter)
                    except StopIteration:
                        first_row = None
                    if first_row:
                        if 'migdar_id' not in first_row and sheet_number > 1:
                            header_row = first_sheet_header_row
                        else:
                            header_row = first_row
                            first_row = None
                        if sheet_number == 1:
                            first_sheet_header_row = header_row
                        yield from sheet_iterator(first_row, header_row, stream_iter,
                                                  row['name'], sheet_name, stream)
                    else:
                        # drain the empty sheet; `_` avoids shadowing the outer `row`
                        for _ in stream_iter:
                            pass
    else:
        yield from rows
def write_list_to_csv(
    filepath: str,
    list_of_rows: List[Union[DictUpperBound, List]],
    headers: Union[int, List[int], List[str], None] = None,
) -> None:
    """Write a list of rows in dict or list form to a csv. The headers argument is
    either a row number or list of row numbers (in case of multi-line headers) to be
    considered as headers (rows start counting at 1), or the actual headers defined
    as a list of strings. If not set, all rows will be treated as containing values.

    Args:
        filepath (str): Path to write to
        list_of_rows (List[Union[DictUpperBound, List]]): List of rows in dict or list form
        headers (Union[int, List[int], List[str], None]): Headers to write. Defaults to None.

    Returns:
        None
    """
    stream = Stream(list_of_rows, headers=headers)
    stream.open()
    stream.save(filepath, format="csv")
    stream.close()
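# Hedged round-trip sketch combining this writer with the reader above;
# the path is hypothetical.
rows = [{'id': 1, 'name': 'english'}, {'id': 2, 'name': '中国人'}]
write_list_to_csv('/tmp/table.csv', rows, headers=['id', 'name'])
back = read_list_from_csv('/tmp/table.csv', headers=1, dict_form=True)
# back -> [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}]
# Values come back as strings: tabulator does not cast csv cells by itself.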
def test_stream_format_error_html():
    stream = Stream('data/special/table.csv.html', format='csv')
    with pytest.raises(exceptions.FormatError) as excinfo:
        stream.open()
def test_stream_compression_error_zip():
    source = 'id,filename\n1,archive.zip'
    stream = Stream(source, scheme='text', format='csv')
    stream.open()
def test_stream_source_error_data():
    stream = Stream('[1,2]', scheme='text', format='json')
    with pytest.raises(exceptions.SourceError) as excinfo:
        stream.open()
        stream.read()
def test_stream_gsheet_bad_url():
    stream = Stream('https://docs.google.com/spreadsheets/d/bad')
    with pytest.raises(exceptions.HTTPError) as excinfo:
        stream.open()
def test_stream_compression_error_gz():
    source = 'id,filename\n1,dump.tar.gz'
    stream = Stream(source, scheme='text', format='csv')
    stream.open()
def test_stream_io_error():
    stream = Stream('bad_path.csv')
    with pytest.raises(exceptions.IOError) as excinfo:
        stream.open()
    assert 'bad_path.csv' in str(excinfo.value)
def test_stream_format_error():
    stream = Stream('data/special/table.bad-format')
    with pytest.raises(exceptions.FormatError) as excinfo:
        stream.open()
    assert 'bad-format' in str(excinfo.value)
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost the validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values, foreign_key)
                        if refValue is None:
                            self.__stream.close()
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_values = tuple(
                                keyed_row[f] for f in foreign_key['fields'])
                            message = 'Foreign key "%s" violation in row "%s": %s not found in %s'
                            message = message % (
                                foreign_key['fields'], row_number,
                                local_values, foreign_key['reference']['resource'])
                            raise exceptions.RelationError(message)
                        elif type(refValue) is dict:
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty
                            # refValue == row, there is nothing to do
                            # an empty dict might be a better returned value for this case ?
                            pass
                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False,
             limit=None, foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        # we don't need to load the complete reference table to test relations
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign keys =
        # [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys
                # to optimize we prepare an index of existing values
                # this index should use reference + foreign_keys as key
                # because many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple([row[foreign_field]
                                 for foreign_field in fk['reference']['fields']])
                    # here we should choose to pick the first or nth row which matches
                    # the previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
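# Usage sketch for the Table class above (hedged; the inline data is hypothetical).
# The constructor defaults headers to row 1, infer() attaches a schema sampled
# from the data, and read() casts each row through that schema.
source = [['id', 'name'], ['1', 'english'], ['2', '中国人']]
table = Table(source)
table.infer()
rows = table.read(keyed=True)  # e.g. [{'id': 1, 'name': 'english'}, ...]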
class Table(object):
    """Table representation

    # Arguments
        source (str/list[]): data source one of:
            - local file (path)
            - remote file (url)
            - array of arrays representing the rows
        schema (any): data schema in all forms supported by `Schema` class
        strict (bool): strictness option to pass to `Schema` constructor
        post_cast (function[]): list of post cast processors
        storage (None): storage name like `sql` or `bigquery`
        options (dict): `tabulator` or storage's options

    # Raises
        TableSchemaException: raises on any error
    """

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Table's headers if available

        # Returns
            str[]: headers
        """
        return self.__headers

    @property
    def schema(self):
        """Returns schema class instance if available

        # Returns
            Schema: schema
        """
        return self.__schema

    @property
    def size(self):
        """Table's size in BYTES if it's available

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns the size of already read contents.

        # Returns
            int/None: size in BYTES
        """
        if self.__stream:
            return self.__stream.size

    @property
    def hash(self):
        """Table's SHA256 hash if it's available

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns the hash of already read contents.

        # Returns
            str/None: SHA256 hash
        """
        if self.__stream:
            return self.__stream.hash

    def iter(self, keyed=False, extended=False, cast=True, integrity=False,
             relations=False, foreign_keys_values=False, exc_handler=None):
        """Iterates through the table data and emits rows cast based on table schema.

        # Arguments

            keyed (bool):
                yield keyed rows in a form of `{header1\\: value1, header2\\: value2}`
                (default is false; the form of rows is `[value1, value2]`)
            extended (bool):
                yield extended rows in a form of
                `[rowNumber, [header1, header2], [value1, value2]]`
                (default is false; the form of rows is `[value1, value2]`)
            cast (bool):
                disable data casting if false
                (default is true)
            integrity (dict):
                dictionary in a form of `{'size'\\: <bytes>, 'hash'\\: '<sha256>'}`
                to check integrity of the table when it's read completely.
                Both keys are optional.
            relations (dict):
                dictionary of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                If provided, foreign key fields will be checked and resolved
                to one of their references (/!\\ one-to-many fk are not
                completely resolved).
            foreign_keys_values (dict):
                three-level dictionary of foreign key references optimized
                to speed up the validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`.
                If not provided but relations is true, it will be created
                before the validation process by the *index_foreign_keys_values* method
            exc_handler (func):
                optional custom exception handler callable.
                Can be used to defer raising errors (i.e. "fail late"), e.g.
                for data validation purposes. Must support the signature below

        # Custom exception handler

        ```python
        def exc_handler(exc, row_number=None, row_data=None, error_data=None):
            '''Custom exception handler (example)

            # Arguments:
                exc(Exception):
                    Deferred exception instance
                row_number(int):
                    Data row number that triggers exception exc
                row_data(OrderedDict):
                    Invalid data row source data
                error_data(OrderedDict):
                    Data row source data field subset responsible for the error,
                    if applicable (e.g. invalid primary or foreign key fields).
                    May be identical to row_data.
            '''
            # ...
        ```

        # Raises
            TableSchemaException: base class of any error
            CastError: data cast error
            IntegrityError: integrity checking error
            UniqueKeyError: unique key constraint violation
            UnresolvedFKError: unresolved foreign key reference error

        # Returns
            Iterator[list]: yields rows
        """
        # TODO: Use helpers.default_exc_handler instead. Prerequisite: Use
        # stream context manager to make sure the stream gets properly closed
        # in all situations, see comment below.
        if exc_handler is None:
            stream = self.__stream

            def exc_handler(exc, *args, **kwargs):
                stream.close()
                raise exc

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost the validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        # TODO: Use context manager instead to make sure stream gets closed in
        # case of exceptions. Leaving that in for now for the sake of a smaller
        # diff.
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast,
                                           exc_handler=exc_handler)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        message = ('Table headers (%r) don\'t match '
                                   'schema field names (%r) in row %s'
                                   % (self.headers, self.schema.field_names, row_number))
                        keyed_row = OrderedDict(zip(headers, row))
                        exc_handler(exceptions.CastError(message),
                                    row_number=row_number, row_data=keyed_row,
                                    error_data=keyed_row)
                        continue

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    keyed_values = OrderedDict(
                        (headers[i], value)
                        for i, value in enumerate(row) if i in indexes)
                    values = tuple(keyed_values.values())
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            message = ('Field(s) "%s" duplicates in row "%s" '
                                       'for values %r'
                                       % (cache['name'], row_number, values))
                            exc_handler(exceptions.UniqueKeyError(message),
                                        row_number=row_number,
                                        row_data=OrderedDict(zip(headers, row)),
                                        error_data=keyed_values)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values, foreign_key)
                        if refValue is None:
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_keyed_values = {
                                key: keyed_row[key]
                                for key in foreign_key['fields']}
                            local_values = tuple(local_keyed_values.values())
                            message = ('Foreign key "%s" violation in row "%s": '
                                       '%s not found in %s'
                                       % (foreign_key['fields'], row_number,
                                          local_values,
                                          foreign_key['reference']['resource']))
                            exc_handler(exceptions.UnresolvedFKError(message),
                                        row_number=row_number, row_data=keyed_row,
                                        error_data=local_keyed_values)
                            # If we reach this point we don't fail-early,
                            # i.e. no exception has been raised. As the
                            # reference can't be resolved, use an empty dict
                            # as the "unresolved result".
                            for field in foreign_key['fields']:
                                if not isinstance(row_with_relations[field], dict):
                                    row_with_relations[field] = {}
                        elif type(refValue) is dict:
                            # Substitute the resolved referenced object for the
                            # original referencing field value. For a composite
                            # foreign key, this substitutes each part of the
                            # composite key with the referenced object.
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty
                            # refValue == row, there is nothing to do
                            # an empty dict might be a better returned value for this case ?
                            pass
                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Check integrity
        if integrity:
            violations = []
            size = integrity.get('size')
            hash = integrity.get('hash')
            if size and size != self.__stream.size:
                violations.append('size "%s"' % self.__stream.size)
            if hash and hash != self.__stream.hash:
                violations.append('hash "%s"' % self.__stream.hash)
            if violations:
                message = 'Calculated %s differ(s) from declared value(s)'
                raise exceptions.IntegrityError(message % ' and '.join(violations))

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, limit=None,
             integrity=False, relations=False, foreign_keys_values=False,
             exc_handler=None):
        """Read the whole table and return as array of rows

        > It has the same API as `table.iter` except for

        # Arguments
            limit (int): limit count of rows to read and return

        # Returns
            list[]: returns rows
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast,
                         integrity=integrity, relations=relations,
                         foreign_keys_values=foreign_keys_values,
                         exc_handler=exc_handler)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """Infer a schema for the table.

        It will infer and set Table Schema to `table.schema` based on table data.

        # Arguments
            limit (int): limit rows sample size
            confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)

        # Returns
            dict: Table Schema descriptor
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Save data source to file locally in CSV format with `,` (comma) delimiter

        > To save schema use `table.schema.save()`

        # Arguments
            target (str): saving target (e.g. file path)
            storage (None/str): storage name like `sql` or `bigquery`
            options (dict): `tabulator` or storage options

        # Raises
            TableSchemaException: raises an error if there is saving problem

        # Returns
            True/Storage: returns true or storage instance
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        """Creates a three-level dictionary of foreign key references

        We create them optimized to speed up the validation process in a form of
        `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.

        For each foreign key of the schema it will iterate through the corresponding
        `relations['resource']` to create an index (i.e. a dict) of existing values
        for the foreign fields and store one keyed row for each value combination.

        The optimization relies on the indexation of possible values for one
        foreign key in a hashmap to later speed up resolution.

        This method is public to allow creating the index once and applying it
        to multiple tables sharing the same schema (typically
        [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))

        # Notes

        - the second key of the output is a tuple of the foreign fields,
          a proxy identifier of the foreign key
        - the same relation resource can be indexed multiple times as a schema
          can contain more than one Foreign Key pointing to the same resource

        # Arguments
            relations (dict):
                dict of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                It must contain all resources pointed in the foreign keys schema definition.

        # Returns
            dict:
                returns a three-level dictionary of foreign key references
                optimized to speed up the validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}})`
        """
        # we don't need to load the complete reference table to test relations
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign keys =
        # [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys
                # to optimize we prepare an index of existing values
                # this index should use reference + foreign_keys as key
                # because many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple([row[foreign_field]
                                 for foreign_field in fk['reference']['fields']])
                    # here we should choose to pick the first or nth row which matches
                    # the previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True, exc_handler=None):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row, row_number=row_number,
                                                 exc_handler=exc_handler)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
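# Hedged sketch of the "fail late" pattern enabled by exc_handler above:
# collect deferred validation errors instead of raising on the first one.
# The data and schema below are hypothetical; the duplicated primary key value
# triggers a deferred UniqueKeyError while iteration continues.
errors = []

def collecting_exc_handler(exc, row_number=None, row_data=None, error_data=None):
    errors.append((row_number, exc))

source = [['id'], ['1'], ['1']]
schema = {'fields': [{'name': 'id', 'type': 'integer'}], 'primaryKey': 'id'}
table = Table(source, schema=schema)
rows = table.read(exc_handler=collecting_exc_handler)
# rows holds every row; errors holds (row_number, UniqueKeyError) entries.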
def test_stream_scheme_error():
    stream = Stream('', scheme='bad_scheme')
    with pytest.raises(exceptions.SchemeError) as excinfo:
        stream.open()
    assert 'bad_scheme' in str(excinfo.value)
def test_stream_format_error():
    stream = Stream('', format='bad_format')
    with pytest.raises(exceptions.FormatError) as excinfo:
        stream.open()
    assert 'bad_format' in str(excinfo.value)
def test_stream_http_error():
    stream = Stream('http://github.com/bad_path.csv')
    with pytest.raises(exceptions.HTTPError) as excinfo:
        stream.open()
def spreadsheet_file_format(path, encoding):  # noqa: C901
    encoding = encoding or 'utf-8'
    _s = Stream(path, encoding=encoding)
    _s.open()
    _s.close()
    return _s.format if _s.format != 'inline' else None
def create_datastore(self, schema=None, primary_key=None,
                     delete_first=0, path=None):
    # type: (Optional[List[dict]], Optional[str], Optional[int], Optional[str]) -> None
    """For csvs, create a resource in the HDX datastore which enables data
    preview in HDX. If no schema is provided all fields are assumed to be text.
    If path is not supplied, the file is first downloaded from HDX.

    Args:
        schema (List[dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
        primary_key (Optional[str]): Primary key of schema. Defaults to None.
        delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
        path (Optional[str]): Local path to file that was uploaded. Defaults to None.

    Returns:
        None
    """
    if delete_first == 0:
        pass
    elif delete_first == 1:
        self.delete_datastore()
    elif delete_first == 2:
        if primary_key is None:
            self.delete_datastore()
    else:
        raise HDXError(
            'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)')
    if path is None:
        # Download the resource
        url, path = self.download()
        delete_after_download = True
    else:
        url = self.data.get('url', None)
        if not url:
            raise HDXError('No URL to download!')
        delete_after_download = False
    zip_path = None
    stream = None
    try:
        extension = splitext(path)[1]
        if extension.lower() == '.zip':
            zip_file = zipfile.ZipFile(path)
            filename = zip_file.namelist()[0]
            tempdir = dirname(abspath(path))
            zip_file.extract(filename, tempdir)
            zip_path = path
            path = join(tempdir, filename)

        def convert_to_text(extended_rows):
            for number, headers, row in extended_rows:
                for i, val in enumerate(row):
                    row[i] = str(val)
                yield (number, headers, row)

        tabulator.config.BYTES_SAMPLE_SIZE = 1000000
        stream = Stream(path, headers=1, post_parse=[convert_to_text])
        stream.open()
        if schema is None:
            schema = list()
            for fieldname in stream.headers:
                schema.append({'id': fieldname, 'type': 'text'})
        data = {'resource_id': self.data['id'], 'force': True,
                'fields': schema, 'primary_key': primary_key}
        self._write_to_hdx('datastore_create', data, 'id')
        if primary_key is None:
            method = 'insert'
        else:
            method = 'upsert'
        logger.debug('Uploading data from %s to datastore' % url)
        offset = 0
        chunksize = 100
        rowset = stream.read(keyed=True, limit=chunksize)
        while len(rowset) != 0:
            data = {'resource_id': self.data['id'], 'force': True,
                    'method': method, 'records': rowset}
            self._write_to_hdx('datastore_upsert', data, 'id')
            rowset = stream.read(keyed=True, limit=chunksize)
            logger.debug('Uploading: %s' % offset)
            offset += chunksize
    except Exception as e:
        six.raise_from(HDXError('Upload to datastore of %s failed!' % url), e)
    finally:
        if stream:
            stream.close()
        if delete_after_download:
            unlink(path)
            if zip_path:
                unlink(zip_path)
        else:
            if zip_path:
                unlink(path)  # i.e. we keep the zip but remove the extracted file
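# The convert_to_text helper above is a tabulator post_parse processor; a
# standalone sketch of the same pattern (the inline data is hypothetical).
from tabulator import Stream

def convert_to_text(extended_rows):
    # post_parse processors receive and yield (row_number, headers, row) triples
    for number, headers, row in extended_rows:
        yield (number, headers, [str(val) for val in row])

source = [['id', 'name'], [1, 'english'], [2, '中国人']]
with Stream(source, headers=1, post_parse=[convert_to_text]) as stream:
    print(stream.read(keyed=True))
    # [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}]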
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations, foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
class Table(object):
    """Table Schema table representation.

    Args:
        source (mixed): data source
        schema (Schema/dict/str): schema instance or descriptor/path/url
        backend (None/str): backend name like `sql` or `bigquery`
        options (dict): tabulator options or backend options
    """

    # Public

    def __init__(self, source, schema=None, post_cast=None, backend=None, **options):

        # Defaults
        if post_cast is None:
            post_cast = []

        # Schema
        self.__schema = None
        if isinstance(schema, (compat.str, dict)):
            self.__schema = Schema(schema)

        # Tabulator
        if backend is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)
            self.__stream.open()
            if self.__schema is None:
                self.__schema = Schema(
                    infer(self.__stream.headers, self.__stream.sample))

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            generator = partial(storage.iter, source)
            if self.__schema is None:
                self.__schema = Schema(storage.describe(source))
            storage.describe(source, self.__schema.descriptor)
            self.__stream = Stream(generator, headers=self.__schema.headers)
            self.__stream.open()

        # Attributes
        self.__post_cast = post_cast

    @property
    def stream(self):
        """tabulator.Stream: stream instance
        """
        return self.__stream

    @property
    def schema(self):
        """Schema: schema instance
        """
        return self.__schema

    def iter(self, keyed=False, extended=False):
        """Yields table rows.

        Args:
            keyed (bool): yield keyed rows
            extended (bool): yield extended rows

        Yields:
            mixed[]/mixed{}: row or keyed row or extended row
        """
        self.__stream.reset()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator)
        for number, headers, row in iterator:
            if extended:
                yield (number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

    def read(self, keyed=False, extended=False, limit=None):
        """Read table rows.

        Args:
            limit (int): return this amount of rows

        Returns:
            list[]: table rows
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def save(self, target, backend=None, **options):
        """Save table rows.

        NOTE: To save schema use `table.schema.save(target)`

        Args:
            target (str): saving target
            backend (None/str): backend name like `sql` or `bigquery`
            options (dict): tabulator options or backend options

        Returns:
            None/Storage: storage instance if backend used
        """

        # Tabulator
        if backend is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter())
            return storage

    # Internal

    def __apply_processors(self, iterator):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for number, headers, row in extended_rows:
                headers = self.__schema.headers
                row = self.__schema.cast_row(row)
                yield (number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
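# Usage sketch for this older Table API (hedged; the inline data is hypothetical).
# The constructor opens the stream eagerly and infers a schema from the sample
# when none is given; read() then casts rows via schema.cast_row.
table = Table([['id', 'name'], ['1', 'english'], ['2', '中国人']])
print(table.schema.descriptor)  # inferred Table Schema
print(table.read(keyed=True))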
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations, foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator