def test_standalone_pandas_datasource(test_folder_connection_path):
    """Exercise a PandasDatasource used on its own against a folder holding test.csv.

    Checks the asset names the datasource reports, that the generator's batch
    kwargs point at the same path as hand-built ones, and that the returned
    data asset carries typed batch metadata.
    """
    source = PandasDatasource(
        "PandasCSV", base_directory=test_folder_connection_path
    )
    asset_names = source.get_available_data_asset_names()
    assert asset_names == {"default": ["test"]}

    csv_path = os.path.join(str(test_folder_connection_path), "test.csv")
    expected_kwargs = PathBatchKwargs(path=csv_path)

    # Get the default (subdir_path) generator and let it build the kwargs.
    default_generator = source.get_generator()
    generated_kwargs = default_generator.yield_batch_kwargs("test")
    assert expected_kwargs["path"] == generated_kwargs["path"]

    # Include some extra kwargs...
    # Note that we are using get_data_asset NOT get_batch here, since we are
    # standalone (no batch concept).
    extra_kwargs = {"sep": ",", "header": 0, "index_col": 0}
    asset = source.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=generated_kwargs,
        **extra_kwargs
    )
    assert isinstance(asset, PandasDataset)

    column_matches = asset["col_1"] == [1, 2, 3, 4, 5]
    assert column_matches.all()

    # A datasource should always return an object with a typed batch_id.
    typed_attributes = (
        ("batch_kwargs", PathBatchKwargs),
        ("batch_id", BatchId),
        ("batch_fingerprint", BatchFingerprint),
    )
    for attribute_name, expected_type in typed_attributes:
        assert isinstance(getattr(asset, attribute_name), expected_type)
def test_read_limit(test_folder_connection_path):
    """Check that a "limit" of 1 in the batch kwargs yields a one-row dataset."""
    csv_path = os.path.join(str(test_folder_connection_path), "test.csv")
    limited_kwargs = PathBatchKwargs({"path": csv_path, "limit": 1})
    csv_reader_options = {"sep": ",", "header": 0, "index_col": 0}

    source = PandasDatasource(
        "PandasCSV", base_directory=test_folder_connection_path
    )
    asset = source.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=limited_kwargs,
        reader_options=csv_reader_options,
    )

    assert isinstance(asset, PandasDataset)
    first_row_matches = asset["col_1"] == [1]
    assert first_row_matches.all()
    assert len(asset) == 1

    # A datasource should always return an object with a typed batch_id.
    assert isinstance(asset.batch_kwargs, PathBatchKwargs)
    assert isinstance(asset.batch_id, BatchId)
    assert isinstance(asset.batch_fingerprint, BatchFingerprint)
def test_invalid_reader_pandas_datasource(tmp_path_factory):
    """Check reader selection for a CSV file with an unrecognized extension.

    The test writes CSV content to a file whose suffix is ".notrecognized" and
    expects:

    * BatchKwargsError when no reader method is given,
    * BatchKwargsError when the reader method "blarg" is given,
    * a readable dataset when reader_method="csv" is passed explicitly.
    """
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    datasource = PandasDatasource("mypandassource", base_directory=basepath)

    asset_name = "idonotlooklikeacsvbutiam.notrecognized"
    asset_path = os.path.join(basepath, asset_name)
    with open(asset_path, "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    # The message checks must run AFTER the ``with`` block: statements placed
    # after the raising call inside the block never execute. ``exc`` is
    # pytest's ExceptionInfo wrapper, so the raised exception is ``exc.value``.
    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset(
            asset_name, batch_kwargs={"path": asset_path})
    assert "Unable to determine reader for path" in str(exc.value)

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset(
            asset_name,
            batch_kwargs={"path": asset_path},
            reader_method="blarg")
    assert "Unknown reader method: blarg" in str(exc.value)

    # With an explicit reader method the same file loads as CSV.
    dataset = datasource.get_data_asset(
        asset_name,
        batch_kwargs={"path": asset_path},
        reader_method="csv",
        header=0)
    assert dataset["a"][0] == 1