def test_read_limit(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    batch_kwargs = PathBatchKwargs(
        {
            "path": os.path.join(str(test_folder_connection_path_csv), "test.csv"),
            # "reader_options": {"sep": ",", "header": 0, "index_col": 0},
            "reader_options": {"sep": ","},
        }
    )
    nested_update(batch_kwargs, datasource.process_batch_parameters(limit=1))

    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return an object with a typed batch_id
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)

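# nested_update above deep-merges one dict into another, so the limit produced
# by process_batch_parameters is folded into batch_kwargs without clobbering
# the nested reader_options. A rough, hedged equivalent for illustration only
# (deep_merge is a made-up name, not the library's implementation):
def deep_merge(target, source):
    for key, value in source.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            deep_merge(target[key], value)
        else:
            target[key] = value
    return target


kwargs = {"path": "test.csv", "reader_options": {"sep": ","}}
deep_merge(kwargs, {"limit": 1, "reader_options": {"header": 0}})
assert kwargs == {"path": "test.csv", "reader_options": {"sep": ",", "header": 0}, "limit": 1}
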
def _build_batch_kwargs_from_path(
    self, path, glob_config, reader_options=None, limit=None, partition_id=None
):
    # We could add MD5 (e.g. for smallish files)
    # but currently don't want to assume the extra read is worth it
    # unless it's configurable
    # with open(path, 'rb') as f:
    #     md5 = hashlib.md5(f.read()).hexdigest()
    batch_kwargs = PathBatchKwargs({"path": path})
    computed_partition_id = self._partitioner(path, glob_config)
    if partition_id and computed_partition_id:
        if partition_id != computed_partition_id:
            logger.warning(
                "Provided partition_id does not match computed partition_id; consider explicitly "
                "defining the asset or updating your partitioner."
            )
        batch_kwargs["partition_id"] = partition_id
    elif partition_id:
        batch_kwargs["partition_id"] = partition_id
    elif computed_partition_id:
        batch_kwargs["partition_id"] = computed_partition_id

    # Apply globally-configured reader options first; copy so that locally-specified
    # overrides below do not mutate the generator-wide defaults in place
    batch_kwargs["reader_options"] = dict(self.reader_options)
    if reader_options:
        # Then update with any locally-specified reader options
        batch_kwargs["reader_options"].update(reader_options)
    if limit is not None:
        batch_kwargs["limit"] = limit
    return batch_kwargs

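# A minimal standalone sketch (not part of the library) of the precedence rules
# above: an explicitly provided partition_id always wins, the computed one is
# the fallback, and a mismatch between the two only produces a warning rather
# than an error. The helper name resolve_partition_id is hypothetical.
import logging

logger = logging.getLogger(__name__)


def resolve_partition_id(provided, computed):
    if provided and computed and provided != computed:
        logger.warning("Provided partition_id does not match computed partition_id.")
    return provided or computed


assert resolve_partition_id("2020-01-01", None) == "2020-01-01"
assert resolve_partition_id(None, "2020-01-02") == "2020-01-02"
assert resolve_partition_id("2020-01-01", "2020-01-02") == "2020-01-01"  # provided wins, with a warning
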
def _get_iterator(self, generator_asset, reader_options=None, limit=None):
    logger.debug(
        "Beginning SubdirReaderGenerator _get_iterator for generator_asset: %s" % generator_asset
    )
    # If the generator_asset maps to a single file, return that path;
    # otherwise, treat each file in the matching subdirectory as a batch
    if os.path.isdir(os.path.join(self.base_directory, generator_asset)):
        subdir_options = os.listdir(os.path.join(self.base_directory, generator_asset))
        batches = []
        for file_option in subdir_options:
            for extension in self.known_extensions:
                if file_option.endswith(extension) and not file_option.startswith("."):
                    batches.append(os.path.join(self.base_directory, generator_asset, file_option))

        return self._build_batch_kwargs_path_iter(batches, reader_options=reader_options, limit=limit)
    else:
        for extension in self.known_extensions:
            path = os.path.join(self.base_directory, generator_asset + extension)
            if os.path.isfile(path):
                return iter(
                    [self._build_batch_kwargs_from_path(path, reader_options=reader_options, limit=limit)]
                )
        # If we haven't returned yet, raise
        raise BatchKwargsError(
            "No valid files found when searching {:s} using configured known_extensions: "
            "{:s} ".format(
                os.path.join(self.base_directory, generator_asset),
                ", ".join(map(str, self.known_extensions)),
            ),
            batch_kwargs=PathBatchKwargs(path=os.path.join(self.base_directory, generator_asset)),
        )

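# A hedged illustration (an assumed layout, not taken from the source) of the
# two cases _get_iterator handles. Given a base_directory laid out as below,
# "daily" is a directory asset whose visible files each become a batch, while
# "summary" is a single-file asset matched by trying each known extension:
#
#   base_directory/
#       daily/
#           2020-01-01.csv   -> one batch
#           2020-01-02.csv   -> one batch
#           .hidden.csv      -> skipped (leading dot)
#       summary.csv          -> one batch for asset "summary"
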
def _build_batch_kwargs_from_path(self, path, reader_method=None, reader_options=None, limit=None):
    batch_kwargs = self._datasource.process_batch_parameters(
        reader_method=reader_method or self.reader_method,
        reader_options=reader_options or self.reader_options,
        limit=limit,
    )
    batch_kwargs["path"] = path
    batch_kwargs["datasource"] = self._datasource.name
    return PathBatchKwargs(batch_kwargs)

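# Note the design difference from the glob-based builder above: here
# caller-supplied reader_options replace the generator-wide defaults wholesale
# ("x or y" semantics) rather than being merged key by key. A minimal sketch
# of the distinction, with made-up option values:
defaults = {"sep": ",", "header": 0}

# Replace semantics (this builder):
effective = {"sep": "|"} or defaults  # -> {"sep": "|"}; "header" is dropped

# Merge semantics (the earlier builder):
merged = dict(defaults)
merged.update({"sep": "|"})  # -> {"sep": "|", "header": 0}

assert effective == {"sep": "|"}
assert merged == {"sep": "|", "header": 0}
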
def _build_batch_kwargs_from_path(self, path):
    # We could add MD5 (e.g. for smallish files)
    # but currently don't want to assume the extra read is worth it
    # unless it's configurable
    # with open(path, 'rb') as f:
    #     md5 = hashlib.md5(f.read()).hexdigest()
    batch_kwargs = PathBatchKwargs({"path": path})
    partition_id = self._partitioner(path)
    if partition_id is not None:
        batch_kwargs.update({"partition_id": partition_id})
    batch_kwargs.update(self.reader_options)
    return batch_kwargs

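# Unlike the builders above, this variant spreads self.reader_options into the
# top level of the batch kwargs instead of nesting them under a
# "reader_options" key. A minimal sketch of the two resulting shapes, with
# assumed option values:
reader_options = {"sep": ",", "header": 0}

nested = {"path": "/data/test.csv", "reader_options": reader_options}

flat = {"path": "/data/test.csv"}
flat.update(reader_options)

assert flat == {"path": "/data/test.csv", "sep": ",", "header": 0}
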
def test_standalone_pandas_datasource(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names() == {
        "subdir_reader": {"names": [("test", "file")], "is_complete_list": True}
    }
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path_csv), "test.csv")
    )

    generator = datasource.get_batch_kwargs_generator("subdir_reader")
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    # auto_batch_kwargs.update(
    #     {"reader_options": {"sep": ",", "header": 0, "index_col": 0}}
    # )
    auto_batch_kwargs.update({"reader_options": {"sep": ","}})
    batch = datasource.get_batch(batch_kwargs=auto_batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()
    assert len(dataset) == 5

    # A datasource should always return an object with a typed batch_id
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)