def test_raises_fileexistserror_on_existing_archive_file(
        self, archive_params: Tuple[str, SortedDict], archive_dir: LocalPath):
    """Archiving with ``overwrite=False`` must fail when a destination exists."""
    filename, schema = archive_params
    datafile = os.path.join(get_data_path(), filename)
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)

    frame = DataFrame(read_csv(datafile, dtype=str))
    collision_paths: SortedList = _get_expected_paths(
        archive_dir, schema, frame, filename)
    # Pre-create every destination file so the archive operation collides.
    for target in collision_paths:
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, mode="w") as handle:
            handle.write(rand_string())

    with pytest.raises(FileExistsError, match=os.path.basename(datafile)):
        syphon.archive(archive_dir, [datafile],
                       schema_filepath=schemafile, overwrite=False)

    # The lock file must be cleaned up even when archiving fails.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
def test_raises_indexerror_when_a_schema_column_does_not_exist(
        self, archive_meta_params: Tuple[str, str, SortedDict],
        archive_dir: LocalPath):
    """A schema naming a column absent from the data must raise IndexError."""
    bad_column = "non_existent_column"
    filename, expectedfilename, schema = archive_meta_params

    # Extend a copy of the schema with a column the data does not contain.
    augmented_schema = schema.copy()
    augmented_schema[str(len(augmented_schema))] = bad_column

    datafile = os.path.join(get_data_path(), filename + ".csv")
    metafile = os.path.join(get_data_path(), filename + ".meta")
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(augmented_schema, schemafile)

    with pytest.raises(IndexError, match=bad_column):
        syphon.archive(
            archive_dir,
            [datafile],
            meta_files=[metafile],
            schema_filepath=schemafile,
            overwrite=True,
        )

    # No lock file may be left behind after the failure.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
def test_raises_filenotfounderror_when_data_cannot_be_found(
        self, archive_dir: LocalPath):
    """Archiving a nonexistent data file must raise FileNotFoundError."""
    missing_file = os.path.join(get_data_path(), "nonexistantfile.csv")
    with pytest.raises(FileNotFoundError, match="data file"):
        syphon.archive(archive_dir, [missing_file])
    # No lock file may be left behind after the failure.
    assert not os.path.exists(
        os.path.join(os.path.dirname(missing_file), "#lock"))
def test_raises_filenotfounderror_when_schema_cannot_be_found(
        self, archive_params: Tuple[str, SortedDict], archive_dir: LocalPath):
    """A bogus ``schema_filepath`` must raise FileNotFoundError."""
    filename, _ = archive_params
    datafile = os.path.join(get_data_path(), filename)
    with pytest.raises(FileNotFoundError, match="schema file"):
        # A random string is (virtually) guaranteed not to be a real path.
        syphon.archive(archive_dir, [datafile], schema_filepath=rand_string())
    # No lock file may be left behind after the failure.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
def test_empty_datafile(self, capsys: CaptureFixture, archive_dir: LocalPath,
                        verbose: bool):
    """Archiving an empty data file fails gracefully without leaving a lock."""
    empty_csv = os.path.join(get_data_path(), "empty.csv")
    succeeded = syphon.archive(archive_dir, [empty_csv], verbose=verbose)
    assert not succeeded
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert not os.path.exists(
        os.path.join(os.path.dirname(empty_csv), "#lock"))
def test_without_metadata_with_schema(
    self,
    capsys: CaptureFixture,
    archive_params: Tuple[str, SortedDict],
    archive_dir: LocalPath,
    overwrite: bool,
    verbose: bool,
):
    """Archive a data file against a schema (no metadata) and verify fidelity.

    Checks that the archived file layout matches the schema-derived paths and
    that concatenating every archived CSV reproduces the original data.
    """
    filename: str
    schema: SortedDict
    filename, schema = archive_params
    datafile = os.path.join(get_data_path(), filename)
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)
    # Read everything as strings so comparisons are not affected by dtype
    # inference; sort and re-index for order-independent frame comparison.
    expected_df = DataFrame(read_csv(datafile, dtype=str))
    expected_df.sort_values(list(expected_df.columns), inplace=True)
    expected_df.reset_index(drop=True, inplace=True)
    expected_paths: SortedList = _get_expected_paths(
        archive_dir, schema, expected_df, filename)
    if overwrite:
        # Pre-fill every destination with junk so the archive operation must
        # overwrite existing files rather than create new ones.
        for e in expected_paths:
            os.makedirs(os.path.dirname(e), exist_ok=True)
            with open(e, mode="w") as fd:
                fd.write(rand_string())
    assert syphon.archive(
        archive_dir,
        [datafile],
        schema_filepath=schemafile,
        overwrite=overwrite,
        verbose=verbose,
    )
    # The lock file must be removed once archiving completes.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
    # Collect every archived CSV and rebuild a single frame from them.
    actual_frame = DataFrame()
    actual_paths = SortedList()
    for root, _, files in os.walk(archive_dir):
        for f in files:
            if ".csv" in f:
                filepath: str = os.path.join(root, f)
                actual_paths.add(filepath)
                actual_frame = concat([
                    actual_frame,
                    DataFrame(read_csv(filepath, dtype=str))
                ])
    actual_frame.sort_values(list(actual_frame.columns), inplace=True)
    actual_frame.reset_index(drop=True, inplace=True)
    assert expected_paths == actual_paths
    assert_frame_equal(expected_df, actual_frame)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_with_metadata_without_schema(
    self,
    capsys: CaptureFixture,
    archive_meta_params: Tuple[str, str, SortedDict],
    archive_dir: LocalPath,
    overwrite: bool,
    verbose: bool,
):
    """Archive a data file with a metadata file but no schema.

    Verifies the archived layout matches the (schema-less) expected paths and
    that the archived contents equal the pre-combined reference CSV.
    """
    filename: str
    expectedfilename: str
    filename, expectedfilename, _ = archive_meta_params
    datafile = os.path.join(get_data_path(), filename + ".csv")
    metafile = os.path.join(get_data_path(), filename + ".meta")
    expected_df = DataFrame(
        # Read our dedicated *-combined.csv file instead of the import target.
        read_csv(os.path.join(get_data_path(), expectedfilename), dtype=str))
    # Sort and re-index for order-independent frame comparison.
    expected_df.sort_values(list(expected_df.columns), inplace=True)
    expected_df.reset_index(drop=True, inplace=True)
    # An empty SortedDict stands in for "no schema" when deriving paths.
    expected_paths: SortedList = _get_expected_paths(
        archive_dir, SortedDict(), expected_df, filename + ".csv")
    if overwrite:
        # Pre-fill every destination with junk so archiving must overwrite.
        for e in expected_paths:
            os.makedirs(os.path.dirname(e), exist_ok=True)
            with open(e, mode="w") as fd:
                fd.write(rand_string())
    assert syphon.archive(
        archive_dir,
        [datafile],
        meta_files=[metafile],
        overwrite=overwrite,
        verbose=verbose,
    )
    # The lock file must be removed once archiving completes.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
    # Collect every archived CSV and rebuild a single frame from them.
    actual_df = DataFrame()
    actual_paths = SortedList()
    for root, _, files in os.walk(archive_dir):
        for f in files:
            if ".csv" in f:
                filepath: str = os.path.join(root, f)
                actual_paths.add(filepath)
                actual_df = concat(
                    [actual_df, DataFrame(read_csv(filepath, dtype=str))])
    actual_df.sort_values(list(actual_df.columns), inplace=True)
    actual_df.reset_index(drop=True, inplace=True)
    assert expected_paths == actual_paths
    assert_frame_equal(expected_df, actual_df)
def test_raises_valueerror_when_metadata_is_inconsistent(
    self,
    archive_meta_params: Tuple[str, str, SortedDict],
    archive_dir: LocalPath,
    import_dir: LocalPath,
):
    """Metadata with conflicting values in a column must raise ValueError."""
    filename, _expectedfilename, schema = archive_meta_params
    datafile = os.path.join(get_data_path(), filename + ".csv")

    # Stage the intentionally inconsistent metadata under the normal name.
    inconsistent_meta = LocalPath(
        os.path.join(get_data_path(), filename + "-inconsistent.meta"))
    metafile = import_dir.join(filename + ".meta")
    inconsistent_meta.copy(metafile)

    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)

    # Find the first metadata column holding more than one distinct value;
    # its name is expected to appear in the raised error message.
    metaframe = DataFrame(read_csv(metafile, dtype=str))
    column: Optional[str] = None
    for column in metaframe.columns:
        if len(metaframe[column].drop_duplicates().values) > 1:
            break
    del metaframe
    assert column is not None

    with pytest.raises(ValueError, match=column):
        syphon.archive(
            archive_dir,
            [datafile],
            meta_files=[metafile],
            schema_filepath=schemafile,
            overwrite=True,
        )

    # No lock file may be left behind after the failure.
    assert not os.path.exists(
        os.path.join(os.path.dirname(datafile), "#lock"))
def test_incremental_fails_when_check_fails(
    capsys: CaptureFixture,
    schema: bool,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    post_hash: bool,
    verbose: bool,
):
    """An incremental build must leave the cache untouched when "check" fails.

    No hash file is ever written, so ``syphon.check`` fails; the incremental
    ``syphon.build`` must then fail as well without modifying the cache.
    """
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    # BUG FIX: the original rebound the ``schema`` bool parameter to a
    # SortedDict, making ``if schema:`` unconditionally true and silently
    # killing the schema/no-schema parametrization. Keep the dict separate.
    schema_dict = SortedDict({"0": "Name"})
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    if schema:
        syphon.init(schema_dict, schemafile)
    assert syphon.archive(archive_dir, [datafile],
                          schema_filepath=schemafile if schema else None)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Snapshot of the cache contents we expect to survive the failed build.
    expected_frame = DataFrame(
        read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    LocalPath(datafile).copy(cache_file)
    assert os.path.exists(cache_file)

    # "check" ought to fail when the hash file does not exist.
    assert not syphon.check(cache_file, hash_filepath=hash_file)
    # If "check" fails, then the incremental build fails.
    assert not syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=True,
        overwrite=True,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(False, cache_file, hash_filepath=hash_file)

    # The failed build must not have altered the cache contents.
    actual_frame = DataFrame(
        read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)
    assert_frame_equal(expected_frame, actual_frame, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_full_build_with_schema_maintains_data_fidelity(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    overwrite: bool,
    post_hash: bool,
    verbose: bool,
):
    """A full build from a schema-organized archive reproduces the source data."""
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    schema = SortedDict({"0": "Name"})
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile, overwrite=overwrite)
    assert syphon.archive(archive_dir, [datafile],
                          schema_filepath=schemafile, overwrite=overwrite)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Baseline frame straight from the source file, sorted for comparison.
    baseline = DataFrame(read_csv(datafile, dtype=str, index_col="Index"))
    baseline.sort_index(inplace=True)

    if overwrite:
        # Seed the cache with junk so the build must replace it.
        cache_file.write(rand_string())

    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=overwrite,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

    rebuilt = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    rebuilt.sort_index(inplace=True)
    assert_frame_equal(baseline, rebuilt, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_only_update_hash_file_when_post_hash_true(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    verbose: bool,
):
    """The hash file is rewritten only when ``post_hash=True``."""
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    assert syphon.archive(archive_dir, [datafile])
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    cache_file.write(rand_string())

    # Fall back to the default hash file location when none is given.
    if hash_file is None:
        resolved_hashfile = cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
    else:
        resolved_hashfile = hash_file
    pathlib.Path(resolved_hashfile).touch()
    with syphon.hash.HashFile(resolved_hashfile) as hashfile:
        hashfile.update(syphon.hash.HashEntry(cache_file))
    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # post_hash=False: the build rewrites the cache but not the hash file,
    # so the stale entry no longer matches.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=False,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # post_hash=True: the hash file is refreshed and matches again.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=True,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
def test_build_uses_unmodified_output_path_in_hash_entry(
        self, fs: "TestBuildHashEntryPath.FS", path_type: PathType):
    """The hash entry must record the cache path exactly as the caller gave it."""
    # NOTE: Current working directory is changed if PathType.NONE!
    target: Union[str, LocalPath] = fs.cache(path_type)
    source: str = os.path.join(get_data_path(), "iris.csv")
    assert syphon.archive(fs.archive, [source])
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    assert syphon.build(
        target,
        *get_data_files(fs.archive),
        hash_filepath=fs.hashfile,
        incremental=False,
        post_hash=True,
    )

    # Only one build ran, so the first line is the entry we care about.
    with fs.hashfile.open(mode="r") as handle:
        first_entry = handle.readline()
    assert str(target) in first_entry
def test_incremental_becomes_full_build_when_cache_does_not_exist(
    capsys: CaptureFixture,
    schema: bool,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    post_hash: bool,
    verbose: bool,
):
    """An incremental build falls back to a full build when no cache exists.

    The cache file is never created beforehand, so an incremental build that
    did NOT fall back would raise ``FileExistsError`` (see inline note).
    """
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    # BUG FIX: the original rebound the ``schema`` bool parameter to a
    # SortedDict, making ``if schema:`` unconditionally true and silently
    # killing the schema/no-schema parametrization. Keep the dict separate.
    schema_dict = SortedDict({"0": "Name"})
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    if schema:
        syphon.init(schema_dict, schemafile)
    assert syphon.archive(archive_dir, [datafile],
                          schema_filepath=schemafile if schema else None)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    expected_frame = DataFrame(
        read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    # Raises a FileExistsError unless a full build is performed.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=True,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

    # The fallback full build must reproduce the archived data exactly.
    actual_frame = DataFrame(
        read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)
    assert_frame_equal(expected_frame, actual_frame, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_raises_fileexistserror_when_cache_exists(
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    incremental: bool,
):
    """Building over an existing cache with ``overwrite=False`` must fail."""
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    assert syphon.archive(archive_dir, [datafile], overwrite=True)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Create the cache file so the build collides with it.
    cache_file.write(rand_string())

    with pytest.raises(FileExistsError) as errinfo:
        syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=incremental,
            overwrite=False,
            post_hash=False,
        )
    # BUG FIX: the error is raised because the *cache* file already exists,
    # so its path — not the data file's — must appear in the message. The
    # original asserted ``datafile``, apparently copy-pasted from the
    # archive-collision test.
    assert str(cache_file) in str(errinfo.value)
    assert_post_hash(False, cache_file, hash_filepath=hash_file)
def test_no_datafiles(self, capsys: CaptureFixture, archive_dir: LocalPath,
                      verbose: bool):
    """Archiving an empty file list is a no-op that reports failure."""
    succeeded = syphon.archive(archive_dir, [], verbose=verbose)
    assert not succeeded
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def __call__(self, *args, **kwargs) -> bool:
    """Run ``syphon.archive`` with the build entry point patched to the shim.

    The monkeypatch context guarantees the original ``build`` is restored
    once the archive call returns.
    """
    with self._monkeypatch.context() as patcher:
        patcher.setattr(syphon.core.build, "build", value=self._build_shim)
        return syphon.archive(*args, **kwargs)
def test_incremental_maintains_data_fidelity_when_new_data_new_and_missing_columns(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    import_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    verbose: bool,
):
    """Incremental build maintains data fidelity when new data

    * has columns not present in the existing data cache.
    * is missing columns found in the existing data cache.
    """
    # First batch lacks the "species" column; second lacks "petalcolor".
    pre_datafiles: List[str] = [
        os.path.join(get_data_path(), "iris_plus_partial-1-of-2-no-species.csv")
    ]
    datafiles: List[str] = [
        os.path.join(get_data_path(),
                     "iris_plus_partial-2-of-2-no-petalcolor.csv")
    ]
    # Fall back to the default hash file location when none is given.
    resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                         if hash_file is None else hash_file)
    assert syphon.archive(archive_dir, pre_datafiles)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))
    # Pre-build
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=False,
        post_hash=True,
        verbose=False,
    )
    # Get the hash of the cache file before our main build.
    pre_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
    # Get the hash of the hash file for easy file change checking.
    pre_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash
    # Main build
    assert syphon.build(
        cache_file,
        *datafiles,
        hash_filepath=hash_file,
        incremental=True,
        overwrite=True,
        post_hash=True,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    post_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
    post_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash
    # Reference file contains the union of both partial data sets.
    expected_frame = DataFrame(
        read_csv(
            os.path.join(
                get_data_path(),
                "iris_plus_partial-new-data-new-and-missing-columns.csv",
            ),
            dtype=str,
            index_col="Index",
        ))
    expected_frame.sort_index(inplace=True)
    # Both the cache and the hash file must have changed.
    assert pre_cache_hash != post_cache_hash
    assert pre_hash_hash != post_hash_hash
    # The hash file's entry for the cache must match the new cache hash.
    with syphon.hash.HashFile(resolved_hashfile) as hashfile:
        for entry in hashfile:
            if os.path.samefile(entry.filepath, str(cache_file)):
                assert post_cache_hash == entry.hash
    actual_frame = DataFrame(
        read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)
    assert_frame_equal(expected_frame, actual_frame, check_exact=True)