def __init__(self, source, schema=None, strict=False, post_cast=None, storage=None, **options):
    """https://github.com/frictionlessdata/tableschema-py#schema

    Build a table backed either by a tabulator Stream or a Storage backend.

    # Arguments
        source: data source passed to Stream, or the table name for a storage
        schema (dict/str/None): table schema descriptor wrapped in Schema
        strict (bool): not read here — presumably consumed elsewhere; confirm
        post_cast (list/None): row processors applied after schema casting
        storage (str/Storage/None): storage name or instance; None = tabulator
        options: extra keyword arguments forwarded to Stream/Storage.connect
    """
    # Set attributes
    self.__source = source
    self.__stream = None
    self.__schema = None
    self.__headers = None
    self.__storage = None
    # BUG FIX: original signature used a shared mutable default (post_cast=[]);
    # use None as the sentinel and copy defensively as before.
    self.__post_cast = copy(post_cast) if post_cast is not None else []

    # Schema
    if schema is not None:
        self.__schema = Schema(schema)

    # Stream (tabulator)
    if storage is None:
        # Default to treating the first row as headers.
        options.setdefault('headers', 1)
        self.__stream = Stream(source, **options)

    # Stream (storage)
    else:
        if not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)
        if self.__schema:
            storage.describe(source, self.__schema.descriptor)
        headers = Schema(storage.describe(source)).field_names
        self.__stream = Stream(partial(storage.iter, source), headers=headers)
        self.__storage = storage
def test_stream_skip_rows_with_headers_example_from_readme():
    # README example: '#'-prefixed rows are skipped before the header
    # row is resolved, so 'name'/'order' become the headers.
    data = [['#comment'], ['name', 'order'], ['John', 1], ['Alex', 2]]
    with Stream(data, headers=1, skip_rows=['#']) as stream:
        assert stream.headers == ['name', 'order']
        assert stream.read() == [['John', 1], ['Alex', 2]]
def test_stream_skip_rows_excel_empty_column():
    # Rows whose first cell is empty are skipped in the xlsx fixture.
    path = 'data/special/skip-rows.xlsx'
    with Stream(path, headers=1, skip_rows=['']) as stream:
        assert stream.read() == [['A', 'B'], [8, 9]]
def test_stream_bytes_sample_size():
    path = 'data/special/latin1.csv'
    # Default byte sample is large enough for detection to report cp1252.
    with Stream(path) as stream:
        assert stream.encoding == 'cp1252'
    # A 10-byte sample misses the non-ASCII bytes, so detection
    # falls back to utf-8.
    with Stream(path, sample_size=0, bytes_sample_size=10) as stream:
        assert stream.encoding == 'utf-8'
def test_stream_html_content_with_allow_html():
    # The GitHub "blob" URL serves an HTML page rather than raw CSV;
    # allow_html=True lets the stream open anyway.
    url = 'https://github.com/frictionlessdata/tabulator-py/blob/master/data/table.csv'
    with Stream(url, allow_html=True) as stream:
        assert stream
def test_stream_encoding_utf_16():
    # Bytes encoded as UTF-16 with a BOM in platform order are detected.
    payload = u'en,English\nja,日本語'.encode('utf-16')
    with Stream(io.BytesIO(payload), format='csv') as stream:
        assert stream.encoding == 'utf-16'
        assert stream.read() == [[u'en', u'English'], [u'ja', u'日本語']]
def test_stream_http_error():
    # Opening a missing remote resource raises HTTPError.
    # Fix: dropped the unused `as excinfo` binding — nothing asserted on it.
    stream = Stream('http://github.com/bad_path.csv')
    with pytest.raises(exceptions.HTTPError):
        stream.open()
def test_stream_bad_options_warning():
    # NOTE(review): the first, uncaptured open looks like a warm-up so the
    # warning fires again inside pytest.warns — confirm intent.
    Stream('', scheme='text', format='csv', bad_option=True).open()
    with pytest.warns(UserWarning) as record:
        Stream('', scheme='text', format='csv', bad_option=True).open()
    # The warning names the unrecognized option.
    assert 'bad_option' in str(record[0].message.args[0])
def test_stream_format_error():
    # An unknown format name is rejected at open() time and the error
    # message names the offending format.
    with pytest.raises(exceptions.FormatError) as excinfo:
        Stream('', format='bad_format').open()
    assert 'bad_format' in str(excinfo.value)
def test_stream_io_error():
    # A nonexistent local file surfaces as IOError naming the path.
    with pytest.raises(exceptions.IOError) as excinfo:
        Stream('bad_path.csv').open()
    assert 'bad_path.csv' in str(excinfo.value)
def test_stream_scheme_error():
    # An unknown scheme name is rejected at open() time and the error
    # message names the offending scheme.
    with pytest.raises(exceptions.SchemeError) as excinfo:
        Stream('', scheme='bad_scheme').open()
    assert 'bad_scheme' in str(excinfo.value)
def test_stream_format_error_html():
    # An HTML payload declared as CSV is rejected as a format error.
    # Fix: dropped the unused `as excinfo` binding — nothing asserted on it.
    stream = Stream('data/special/table.csv.html', format='csv')
    with pytest.raises(exceptions.FormatError):
        stream.open()
def test_stream_source_error_data():
    # '[1,2]' is valid JSON but not tabular data, so consuming the
    # stream raises SourceError.
    # Fix: dropped the unused `as excinfo` binding — nothing asserted on it.
    stream = Stream('[1,2]', scheme='text', format='json')
    with pytest.raises(exceptions.SourceError):
        stream.open()
        stream.read()
def test_stream_gsheet_bad_url():
    # A malformed Google Sheets URL surfaces as HTTPError.
    # Fix: dropped the unused `as excinfo` binding — nothing asserted on it.
    stream = Stream('https://docs.google.com/spreadsheets/d/bad')
    with pytest.raises(exceptions.HTTPError):
        stream.open()
def test_stream_compression_error_gz():
    # A cell value ending in '.tar.gz' must not trigger gz decompression.
    # BUG FIX: original literal was 'id,filename\n\1,dump.tar.gz' — the
    # octal escape '\1' produces chr(1), not the digit '1' (compare the
    # sibling zip test, which uses a plain '1').
    source = 'id,filename\n1,dump.tar.gz'
    stream = Stream(source, scheme='text', format='csv')
    stream.open()
def test_stream_compression_error_zip():
    # A cell value ending in '.zip' must not trigger zip decompression.
    stream = Stream('id,filename\n1,archive.zip', scheme='text', format='csv')
    stream.open()
def test_stream_encoding_explicit_latin1():
    # An explicit 'latin1' is normalized to the canonical 'iso8859-1' name.
    path = 'data/special/latin1.csv'
    with Stream(path, encoding='latin1') as stream:
        assert stream.encoding == 'iso8859-1'
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '©']]
def test_stream_scheme_file():
    # A bare local path resolves to the 'file' scheme.
    with Stream('data/table.csv') as stream:
        assert stream.scheme == 'file'
def test_stream_html_content():
    # The GitHub "blob" URL serves an HTML page rather than raw CSV;
    # by default that is rejected with a FormatError mentioning HTML.
    url = 'https://github.com/frictionlessdata/tabulator-py/blob/master/data/table.csv'
    with pytest.raises(exceptions.FormatError) as excinfo:
        Stream(url).open()
    assert 'HTML' in str(excinfo.value)
def test_stream_scheme_https():
    # A remote URL resolves to the 'https' scheme.
    url = BASE_URL % 'data/table.csv'
    with Stream(url) as stream:
        assert stream.scheme == 'https'
def test_stream_sample():
    # The sample contains data rows only — the header row is excluded.
    rows = [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    with Stream(rows, headers=1) as stream:
        assert stream.headers == ['id', 'name']
        assert stream.sample == [['1', 'english'], ['2', '中国人']]
def test_stream_scheme_stream():
    # A file object is handled by the 'stream' scheme.
    # Fix: the original leaked the file handle; close it deterministically.
    with io.open('data/table.csv', mode='rb') as fobj:
        with Stream(fobj, format='csv') as stream:
            assert stream.scheme == 'stream'
def test_stream_skip_rows():
    # Skip comment rows ('#' prefix) and the absolute row number 5.
    with Stream('data/special/skip-rows.csv', skip_rows=['#', 5]) as stream:
        assert stream.read() == [['id', 'name'], ['1', 'english']]
def test_stream_scheme_text():
    # An inline 'text://' source resolves to the 'text' scheme.
    with Stream('text://a\nb', format='csv') as stream:
        assert stream.scheme == 'text'
def test_stream_skip_rows_with_headers():
    # Comment rows are skipped before the header row is resolved.
    path = 'data/special/skip-rows.csv'
    with Stream(path, headers=1, skip_rows=['#']) as stream:
        assert stream.headers == ['id', 'name']
        assert stream.read() == [['1', 'english'], ['2', '中国人']]
def test_stream_format_ndjson():
    # Format is inferred from the '.ndjson' extension.
    with Stream('data/table.ndjson') as stream:
        assert stream.format == 'ndjson'
def test_stream_json_property():
    # 'property' selects the document key under which the row arrays live.
    doc = '{"root": [["value1", "value2"], ["value3", "value4"]]}'
    with Stream(doc, scheme='text', format='json', property='root') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]
def test_stream_format_ods():
    # Format is inferred from the '.ods' extension.
    with Stream('data/table.ods') as stream:
        assert stream.format == 'ods'
def test_stream_format_tsv():
    # Format is inferred from the '.tsv' extension.
    with Stream('data/table.tsv') as stream:
        assert stream.format == 'tsv'
def test_stream_format_xlsx():
    # Format is inferred from the '.xlsx' extension.
    with Stream('data/table.xlsx') as stream:
        assert stream.format == 'xlsx'
def test_stream_encoding_explicit_utf8():
    # An explicit encoding bypasses detection and is reported as given.
    with Stream('data/table.csv', encoding='utf-8') as stream:
        assert stream.encoding == 'utf-8'
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]
def test_stream_read_closed():
    # Reading before open() fails, and the message points at the fix.
    stream = Stream('data/table.csv')
    with pytest.raises(exceptions.TabulatorException) as excinfo:
        stream.read()
    assert 'stream.open()' in str(excinfo.value)
class Table(object):
    """Tabular data resource combining a tabulator Stream with a Table Schema.

    Rows can be iterated/read with optional schema casting, uniqueness
    checks and foreign-key (relations) resolution, and saved either via
    tabulator or into a Storage backend.
    https://github.com/frictionlessdata/tableschema-py#schema
    """

    # Public

    def __init__(self, source, schema=None, strict=False, post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema

        # Arguments
            source: data source for Stream, or the table name for a storage
            schema (dict/str/None): schema descriptor wrapped in Schema
            strict (bool): not read in this method — presumably used by
                callers or other methods; confirm
            post_cast (list): row processors applied after casting
                (NOTE(review): mutable default — copied defensively below)
            storage (str/Storage/None): storage name or instance
            options: extra keyword arguments for Stream/Storage.connect
        """
        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            # Default to treating the first row as headers
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            # Sync the storage's table description with our schema, if any
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Headers observed on the last iteration (None before iterating).

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """Schema instance, or None if not provided and not yet inferred.

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """Iterate table rows, optionally casting and validating them.

        # Arguments
            keyed (bool): yield dicts keyed by header instead of lists
            extended (bool): yield (row_number, headers, row) tuples
            cast (bool): cast values against the schema and enforce
                header match and unique constraints
            relations: truthy mapping used to resolve foreign keys —
                presumably {resource: rows}; confirm against
                _resolve_relations

        # Raises
            exceptions.CastError: header mismatch or unique violation
            exceptions.RelationError: foreign key violation

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers (first iteration only)
            if not self.__headers:
                self.__headers = headers

            # Check headers against schema field names
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique constraints (all-None tuples are exempt)
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations (foreign keys)
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations, foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row in the requested shape
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """Read rows into a list; stop after `limit` rows if given.

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100):
        """Infer missing schema/headers and return the schema descriptor.

        # Arguments
            limit (int): max sample rows used for inference

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit], headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Save the table to a file (via tabulator) or a storage backend.

        Returns True for the tabulator path, the storage instance otherwise.

        https://github.com/frictionlessdata/tableschema-py#schema
        """
        # Save (tabulator)
        if storage is None:
            # NOTE(review): `self.__schema.headers` — Schema exposes
            # field_names elsewhere in this class; confirm `headers` exists
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            # Write uncast rows; the backend handles typing
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):
        # Apply the builtin casting processor followed by user post_cast
        # processors; each processor wraps the extended-row iterator.
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator