def test_subdir_reader_file_partitioning(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.csv",
        "20190102__asset_1.csv",
        "20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv"
    ]
    for file in mock_files:
        if "/" in file:
            safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    # If we have files, we should see them as individual assets
    subdir_reader_generator = SubdirReaderGenerator("test_generator", base_directory=base_directory)

    known_assets = subdir_reader_generator.get_available_data_asset_names()
    assert set(known_assets) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
        "asset_2"
    }

    # SubdirReaderGenerator uses the filename as partition name for root files
    known_partitions = subdir_reader_generator.get_available_partition_ids("20190101__asset_1")
    assert set(known_partitions) == {"20190101__asset_1"}

    kwargs = subdir_reader_generator.build_batch_kwargs_from_partition_id("20190101__asset_1", "20190101__asset_1")
    assert kwargs["path"] == os.path.join(base_directory, "20190101__asset_1.csv")


def test_file_kwargs_generator_extensions(tmp_path_factory):
    """csv, tsv, xls, parquet, json should be recognized file extensions"""
    basedir = str(tmp_path_factory.mktemp("test_file_kwargs_generator_extensions"))

    # Do not include: invalid extension
    with open(os.path.join(basedir, "f1.blarg"), "w") as outfile:
        outfile.write("\n\n\n")
    # Include
    with open(os.path.join(basedir, "f2.csv"), "w") as outfile:
        outfile.write("\n\n\n")
    # Do not include: valid subdir, but no valid files in it
    os.mkdir(os.path.join(basedir, "f3"))
    with open(os.path.join(basedir, "f3", "f3_1.blarg"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f3", "f3_2.blarg"), "w") as outfile:
        outfile.write("\n\n\n")
    # Include: valid subdir with valid files
    os.mkdir(os.path.join(basedir, "f4"))
    with open(os.path.join(basedir, "f4", "f4_1.csv"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f4", "f4_2.csv"), "w") as outfile:
        outfile.write("\n\n\n")
    # Do not include: valid extension, but dot prefix
    with open(os.path.join(basedir, ".f5.csv"), "w") as outfile:
        outfile.write("\n\n\n")

    # Include: valid extensions
    with open(os.path.join(basedir, "f6.tsv"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f7.xls"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f8.parquet"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f9.xls"), "w") as outfile:
        outfile.write("\n\n\n")
    with open(os.path.join(basedir, "f0.json"), "w") as outfile:
        outfile.write("\n\n\n")

    g1 = SubdirReaderGenerator(base_directory=basedir)

    g1_assets = g1.get_available_data_asset_names()
    # Use set in test to avoid order issues
    assert set(g1_assets) == {"f2", "f4", "f6", "f7", "f8", "f9", "f0"}


def test_subdir_reader_configurable_reader_method(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.dat",
        "20190102__asset_1.dat",
        "20190103__asset_1.dat",
        "asset_2/20190101__asset_2.dat",
        "asset_2/20190102__asset_2.dat"
    ]
    for file in mock_files:
        if "/" in file:
            safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    # The configured reader_method should be passed through to the generated batch_kwargs
    subdir_reader_generator = SubdirReaderGenerator(
        "test_generator",
        base_directory=base_directory,
        reader_method='csv',
        known_extensions=['.dat']
    )
    batch_kwargs = next(subdir_reader_generator.get_iterator('asset_2'))
    assert batch_kwargs['reader_method'] == 'csv'


def test_subdir_reader_path_partitioning(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "asset_1/20190101__asset_1.csv",
        "asset_1/20190102__asset_1.csv",
        "asset_1/20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv"
    ]
    for file in mock_files:
        safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    subdir_reader_generator = SubdirReaderGenerator("test_generator", base_directory=base_directory)

    # We should see two assets
    known_assets = subdir_reader_generator.get_available_data_asset_names()
    # Use set in test to avoid order issues
    assert set(known_assets) == {"asset_1", "asset_2"}

    # We should see three partitions for the first:
    known_partitions = subdir_reader_generator.get_available_partition_ids("asset_1")
    assert set(known_partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1"
    }

    asset_1_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("asset_1")]
    asset_2_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("asset_2")]
    with pytest.raises(BatchKwargsError):
        not_an_asset_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("not_an_asset")]

    assert len(asset_1_kwargs) == 3
    paths = [kwargs["path"] for kwargs in asset_1_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_1/20190101__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190102__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190103__asset_1.csv")
    }
    partitions = [kwargs["partition_id"] for kwargs in asset_1_kwargs]
    # SubdirReaderGenerator uses filenames from subdirectories to generate partition names
    assert set(partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1"
    }
    assert len(asset_1_kwargs[0].keys()) == 3

    assert len(asset_2_kwargs) == 2
    paths = [kwargs["path"] for kwargs in asset_2_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_2/20190101__asset_2.csv"),
        os.path.join(base_directory, "asset_2/20190102__asset_2.csv")
    }
    partitions = [kwargs["partition_id"] for kwargs in asset_2_kwargs]
    assert set(partitions) == {
        "20190101__asset_2",
        "20190102__asset_2"
    }
    assert len(asset_2_kwargs[0].keys()) == 3