def test_stream_reset_on_close_issue_190():
    """Regression test for issue 190: re-opening a stream must reset it to
    the first row, even when the previous open was not explicitly closed."""
    source = [['1', 'english'], ['2', '中国人']]
    stream = Stream(source)
    stream.open()
    # BUG FIX: the original used bare `==` expressions whose results were
    # discarded, so these checks could never fail. Assert them explicitly.
    assert stream.read(limit=1) == [['1', 'english']]
    stream.open()
    # After re-opening, reading must yield the first row again (the reset).
    assert stream.read(limit=1) == [['1', 'english']]
    stream.close()
def test_stream_local_csv_zip_multiple_open():
    # That's how `tableschema.iter()` acts
    expected_rows = [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream = Stream('data/table.csv.zip')
    # Two full open/read/close cycles must behave identically.
    for _ in range(2):
        stream.open()
        assert stream.headers is None
        assert stream.read() == expected_rows
        stream.close()
def read_list_from_csv(
    url: str,
    headers: Union[int, List[int], List[str], None] = None,
    dict_form: bool = False,
    **kwargs: Any,
) -> List[Union[Dict, List]]:
    """Read a list of rows in dict or list form from a csv. The headers argument is either a row
    number or list of row numbers (in case of multi-line headers) to be considered as headers
    (rows start counting at 1), or the actual headers defined a list of strings. If not set,
    all rows will be treated as containing values.

    Args:
        url (str): URL or path to read from
        headers (Union[int, List[int], List[str], None]): Row number of headers. Defaults to None.
        dict_form (bool): Return dict (requires headers parameter) or list for each row. Defaults to False (list)
        **kwargs: Other arguments to pass to Tabulator Stream

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form

    Raises:
        ValueError: If dict_form is True but headers is None.
    """
    if dict_form and headers is None:
        raise ValueError("If dict_form is True, headers must not be None!")
    stream = Stream(url, headers=headers, **kwargs)
    stream.open()
    # BUG FIX: the original leaked the open stream if read() raised;
    # close it unconditionally via try/finally.
    try:
        result = stream.read(keyed=dict_form)
    finally:
        stream.close()
    return result
def read_list_from_csv(filepath, dict_form=False, headers=None):
    # type: (str, bool, Optional[int]) -> List[Union[Dict, List]]
    """Read a list of rows in dict or list form from a csv.

    Args:
        filepath (str): Path to read from
        dict_form (bool): Return in dict form. Defaults to False.
        headers (Optional[int]): Row number of headers. Defaults to None.

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form
    """
    # DOC FIX: headers was documented as Optional[List[str]] while the type
    # comment declares Optional[int]; the docstring now matches the type.
    stream = Stream(filepath, headers=headers)
    stream.open()
    # BUG FIX: the original leaked the open stream if read() raised;
    # close it unconditionally via try/finally.
    try:
        result = stream.read(keyed=dict_form)
    finally:
        stream.close()
    return result
def create_datastore(self, schema=None, primary_key=None, delete_first=0, path=None):
    # type: (Optional[List[dict]], Optional[str], Optional[int], Optional[str]) -> None
    """For csvs, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

    Args:
        schema (List[dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
        primary_key (Optional[str]): Primary key of schema. Defaults to None.
        delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
        path (Optional[str]): Local path to file that was uploaded. Defaults to None.

    Returns:
        None
    """
    # Optionally delete any pre-existing datastore before (re)creating it.
    if delete_first == 0:
        pass
    elif delete_first == 1:
        self.delete_datastore()
    elif delete_first == 2:
        # Only delete when no primary key: with a primary key, upsert can
        # update rows in place, so the old datastore can be kept.
        if primary_key is None:
            self.delete_datastore()
    else:
        raise HDXError(
            'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)'
        )
    if path is None:
        # Download the resource
        url, path = self.download()
        delete_after_download = True
    else:
        # Caller supplied a local file; only need the URL for log messages.
        url = self.data.get('url', None)
        if not url:
            raise HDXError('No URL to download!')
        delete_after_download = False
    zip_path = None
    stream = None
    try:
        extension = splitext(path)[1]
        if extension.lower() == '.zip':
            # Extract the first member of the zip next to the archive and
            # stream from the extracted file instead of the zip itself.
            zip_file = zipfile.ZipFile(path)
            filename = zip_file.namelist()[0]
            tempdir = dirname(abspath(path))
            zip_file.extract(filename, tempdir)
            zip_path = path
            path = join(tempdir, filename)

        def convert_to_text(extended_rows):
            # Tabulator post-parse hook: coerce every cell to str so rows
            # match the all-text schema generated below.
            for number, headers, row in extended_rows:
                for i, val in enumerate(row):
                    row[i] = str(val)
                yield (number, headers, row)

        # Enlarge tabulator's sampling window (bytes) before opening.
        tabulator.config.BYTES_SAMPLE_SIZE = 1000000
        stream = Stream(path, headers=1, post_parse=[convert_to_text])
        stream.open()
        if schema is None:
            # No schema supplied: treat every header column as text.
            schema = list()
            for fieldname in stream.headers:
                schema.append({'id': fieldname, 'type': 'text'})
        data = {
            'resource_id': self.data['id'],
            'force': True,
            'fields': schema,
            'primary_key': primary_key
        }
        self._write_to_hdx('datastore_create', data, 'id')
        # With a primary key, rows can be upserted (update-or-insert);
        # without one, plain inserts are the only option.
        if primary_key is None:
            method = 'insert'
        else:
            method = 'upsert'
        logger.debug('Uploading data from %s to datastore' % url)
        offset = 0
        chunksize = 100
        # Upload in chunks of `chunksize` keyed rows until exhausted.
        rowset = stream.read(keyed=True, limit=chunksize)
        while len(rowset) != 0:
            data = {
                'resource_id': self.data['id'],
                'force': True,
                'method': method,
                'records': rowset
            }
            self._write_to_hdx('datastore_upsert', data, 'id')
            rowset = stream.read(keyed=True, limit=chunksize)
            # NOTE(review): offset is logged before being incremented, so the
            # first message reports 0 — appears intentional progress logging.
            logger.debug('Uploading: %s' % offset)
            offset += chunksize
    except Exception as e:
        # Wrap any failure in HDXError, chaining the original cause.
        six.raise_from(HDXError('Upload to datastore of %s failed!' % url), e)
    finally:
        if stream:
            stream.close()
        if delete_after_download:
            # We downloaded the file ourselves: remove it (and the archive
            # if the download was a zip we extracted from).
            unlink(path)
            if zip_path:
                unlink(zip_path)
        else:
            if zip_path:
                unlink(
                    path
                )  # ie. we keep the zip but remove the extracted file
def test_stream_read_closed():
    """Reading from a never-opened stream raises a TabulatorException
    whose message tells the user to call stream.open()."""
    unopened_stream = Stream('data/table.csv')
    with pytest.raises(exceptions.TabulatorException) as error_info:
        unopened_stream.read()
    message = str(error_info.value)
    assert 'stream.open()' in message
def test_stream_source_error_data():
    """Inline text data that is not valid tabular JSON raises SourceError."""
    bad_stream = Stream('[1,2]', scheme='text', format='json')
    with pytest.raises(exceptions.SourceError):
        bad_stream.open()
        bad_stream.read()