def test_build_no_hash(archive_dir: LocalPath, cache_file: LocalPath):
    assert not os.path.exists(cache_file)

    assert syphon.__main__.main(_init_args(archive_dir)) == 0
    assert syphon.__main__.main(_archive_args(archive_dir)) == 0

    arguments = _build_args(archive_dir, cache_file)
    arguments.append("--no-hash")
    assert syphon.__main__.main(arguments) == 0

    assert os.path.exists(cache_file)
    assert not os.path.exists(cache_file.dirpath(syphon.core.check.DEFAULT_FILE))
    assert cache_file.size() > 0
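
# _init_args, _archive_args, and _build_args are helpers defined elsewhere in
# this module. A minimal sketch of the shape _build_args likely takes, assuming
# a "build ARCHIVE_DIR CACHE_FILE" argument layout (hypothetical; the real
# helper may differ):
#
#   def _build_args(archive_dir: LocalPath, cache_file: LocalPath) -> List[str]:
#       return ["build", str(archive_dir), str(cache_file)]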
def assert_post_hash(
    post_hash: bool,
    cache_file: LocalPath,
    hash_filepath: Optional[LocalPath],
    verbose: bool = False,
):
    import syphon.core.check

    resolved_hashfile = (
        cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
        if hash_filepath is None
        else hash_filepath
    )

    if post_hash:
        assert syphon.core.check.check(
            cache_file, hash_filepath=resolved_hashfile, verbose=verbose
        )
    else:
        assert not os.path.exists(resolved_hashfile)
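
# Example usage of assert_post_hash (hypothetical test body, mirroring the
# build calls made by the real tests in this module):
#
#   def test_post_hash_example(
#       archive_dir: LocalPath, cache_file: LocalPath, hash_file: Optional[LocalPath]
#   ):
#       assert syphon.build(
#           cache_file,
#           *get_data_files(archive_dir),
#           hash_filepath=hash_file,
#           incremental=False,
#           overwrite=True,
#           post_hash=True,
#           verbose=False,
#       )
#       assert_post_hash(True, cache_file, hash_filepath=hash_file)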
def test_only_update_hash_file_when_post_hash_true(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    verbose: bool,
):
    datafile: str = os.path.join(get_data_path(), "iris.csv")

    assert syphon.archive(archive_dir, [datafile])
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Seed the cache with random content and record its hash so we can tell
    # whether a build refreshes the hash file.
    cache_file.write(rand_string())

    resolved_hashfile = (
        cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
        if hash_file is None
        else hash_file
    )
    pathlib.Path(resolved_hashfile).touch()
    with syphon.hash.HashFile(resolved_hashfile) as hashfile:
        hashfile.update(syphon.hash.HashEntry(cache_file))

    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # Build with post_hash=False: the cache changes, so the stale hash entry
    # must no longer match.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=False,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # Build with post_hash=True: the hash file is updated to match the new cache.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=True,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
def test_build(
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: LocalPath,
    specify_hashfile: bool,
):
    assert not os.path.exists(cache_file)

    assert syphon.__main__.main(_init_args(archive_dir)) == 0
    assert syphon.__main__.main(_archive_args(archive_dir)) == 0

    arguments = _build_args(archive_dir, cache_file)
    if specify_hashfile:
        arguments.append(str(hash_file))
    assert syphon.__main__.main(arguments) == 0

    assert os.path.exists(cache_file)
    assert os.path.exists(
        hash_file
        if specify_hashfile
        else cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
    )
    # If we're using our own hash file, then the default should not be created.
    if specify_hashfile:
        assert not os.path.exists(cache_file.dirpath(syphon.core.check.DEFAULT_FILE))
    assert cache_file.size() > 0
# Assumed parametrization: request.param is used as a boolean toggle below, so
# this fixture presumably runs with params=[True, False].
@pytest.fixture(params=[True, False])
def schema_file(request: FixtureRequest, archive_dir: LocalPath) -> Optional[LocalPath]:
    return None if request.param else archive_dir.dirpath("schemafile")
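
# The hash_file fixture consumed by the tests in this module is defined
# elsewhere; a minimal sketch, assuming it mirrors schema_file (hypothetical
# name and location):
#
#   @pytest.fixture(params=[True, False])
#   def hash_file(request: FixtureRequest, cache_file: LocalPath) -> Optional[LocalPath]:
#       return None if request.param else cache_file.dirpath("hashfile")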
def test_incremental_maintains_data_fidelity_when_new_data_new_and_missing_columns(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    import_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    verbose: bool,
):
    """Incremental build maintains data fidelity when new data

    * has columns not present in the existing data cache.
    * is missing columns found in the existing data cache.
    """
    pre_datafiles: List[str] = [
        os.path.join(get_data_path(), "iris_plus_partial-1-of-2-no-species.csv")
    ]
    datafiles: List[str] = [
        os.path.join(get_data_path(), "iris_plus_partial-2-of-2-no-petalcolor.csv")
    ]

    resolved_hashfile = (
        cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
        if hash_file is None
        else hash_file
    )

    assert syphon.archive(archive_dir, pre_datafiles)
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Pre-build
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=False,
        post_hash=True,
        verbose=False,
    )
    # Get the hash of the cache file before our main build.
    pre_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
    # Get the hash of the hash file for easy file change checking.
    pre_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

    # Main build
    assert syphon.build(
        cache_file,
        *datafiles,
        hash_filepath=hash_file,
        incremental=True,
        overwrite=True,
        post_hash=True,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)

    post_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
    post_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

    expected_frame = DataFrame(
        read_csv(
            os.path.join(
                get_data_path(),
                "iris_plus_partial-new-data-new-and-missing-columns.csv",
            ),
            dtype=str,
            index_col="Index",
        )
    )
    expected_frame.sort_index(inplace=True)

    assert pre_cache_hash != post_cache_hash
    assert pre_hash_hash != post_hash_hash
    with syphon.hash.HashFile(resolved_hashfile) as hashfile:
        for entry in hashfile:
            if os.path.samefile(entry.filepath, str(cache_file)):
                assert post_cache_hash == entry.hash

    actual_frame = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)

    assert_frame_equal(expected_frame, actual_frame, check_exact=True)