Exemplo n.º 1
0
    def test_raises_fileexistserror_on_existing_archive_file(
            self, archive_params: Tuple[str, SortedDict],
            archive_dir: LocalPath):
        """Archiving must raise FileExistsError when every destination
        file already exists and overwrite is disabled."""
        filename, schema = archive_params

        datafile = os.path.join(get_data_path(), filename)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        expected_df = DataFrame(read_csv(datafile, dtype=str))

        expected_paths: SortedList = _get_expected_paths(
            archive_dir, schema, expected_df, filename)

        # Pre-create each destination file with junk so the archive collides.
        for destination in expected_paths:
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            with open(destination, mode="w") as handle:
                handle.write(rand_string())

        with pytest.raises(FileExistsError, match=os.path.basename(datafile)):
            syphon.archive(archive_dir, [datafile],
                           schema_filepath=schemafile,
                           overwrite=False)

        # Even a failed archive must clean up its lock file.
        lockfile = os.path.join(os.path.dirname(datafile), "#lock")
        assert not os.path.exists(lockfile)
Exemplo n.º 2
0
    def test_raises_indexerror_when_a_schema_column_does_not_exist(
            self, archive_meta_params: Tuple[str, str, SortedDict],
            archive_dir: LocalPath):
        """Archiving must raise IndexError naming a schema column that
        exists in neither the data nor the metadata file."""
        bad_column = "non_existent_column"

        filename, expectedfilename, schema = archive_meta_params

        # Extend a copy of the schema with a column that cannot resolve.
        local_schema = schema.copy()
        local_schema[str(len(local_schema))] = bad_column

        datafile = os.path.join(get_data_path(), filename + ".csv")
        metafile = os.path.join(get_data_path(), filename + ".meta")
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(local_schema, schemafile)

        with pytest.raises(IndexError, match=bad_column):
            syphon.archive(
                archive_dir,
                [datafile],
                meta_files=[metafile],
                schema_filepath=schemafile,
                overwrite=True,
            )

        # The failure must not leave the lock file behind.
        lockfile = os.path.join(os.path.dirname(datafile), "#lock")
        assert not os.path.exists(lockfile)
Exemplo n.º 3
0
    def test_raises_filenotfounderror_when_data_cannot_be_found(
            self, archive_dir: LocalPath):
        """A nonexistent data file aborts the archive with FileNotFoundError."""
        datafile = os.path.join(get_data_path(), "nonexistantfile.csv")

        with pytest.raises(FileNotFoundError, match="data file"):
            syphon.archive(archive_dir, [datafile])

        # No stale lock file may survive the failed run.
        lockfile = os.path.join(os.path.dirname(datafile), "#lock")
        assert not os.path.exists(lockfile)
Exemplo n.º 4
0
    def test_raises_filenotfounderror_when_schema_cannot_be_found(
            self, archive_params: Tuple[str, SortedDict],
            archive_dir: LocalPath):
        """A bogus schema_filepath aborts the archive with FileNotFoundError."""
        filename, _ = archive_params

        datafile = os.path.join(get_data_path(), filename)

        # A random string can never resolve to a real schema file.
        with pytest.raises(FileNotFoundError, match="schema file"):
            syphon.archive(archive_dir, [datafile],
                           schema_filepath=rand_string())

        # No stale lock file may survive the failed run.
        lockfile = os.path.join(os.path.dirname(datafile), "#lock")
        assert not os.path.exists(lockfile)
Exemplo n.º 5
0
    def test_empty_datafile(self, capsys: CaptureFixture,
                            archive_dir: LocalPath, verbose: bool):
        """Archiving an empty CSV fails gracefully without leaving a lock."""
        datafile = os.path.join(get_data_path(), "empty.csv")

        assert not syphon.archive(archive_dir, [datafile], verbose=verbose)
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        lockfile = os.path.join(os.path.dirname(datafile), "#lock")
        assert not os.path.exists(lockfile)
Exemplo n.º 6
0
    def test_without_metadata_with_schema(
        self,
        capsys: CaptureFixture,
        archive_params: Tuple[str, SortedDict],
        archive_dir: LocalPath,
        overwrite: bool,
        verbose: bool,
    ):
        """Archiving a data file against an initialized schema (no metadata)
        must place CSVs at the schema-derived paths and preserve every row.
        """
        filename: str
        schema: SortedDict
        filename, schema = archive_params

        datafile = os.path.join(get_data_path(), filename)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        # Normalize row order so later comparison is order-independent.
        expected_df = DataFrame(read_csv(datafile, dtype=str))
        expected_df.sort_values(list(expected_df.columns), inplace=True)
        expected_df.reset_index(drop=True, inplace=True)

        expected_paths: SortedList = _get_expected_paths(
            archive_dir, schema, expected_df, filename)

        # When exercising overwrite, pre-create each destination with junk
        # content that the archive must replace.
        if overwrite:
            for e in expected_paths:
                os.makedirs(os.path.dirname(e), exist_ok=True)
                with open(e, mode="w") as fd:
                    fd.write(rand_string())

        assert syphon.archive(
            archive_dir,
            [datafile],
            schema_filepath=schemafile,
            overwrite=overwrite,
            verbose=verbose,
        )
        # The archive lock file must be removed on success.
        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))

        # Collect every archived CSV and rebuild a single frame from them.
        actual_frame = DataFrame()
        actual_paths = SortedList()
        for root, _, files in os.walk(archive_dir):
            for f in files:
                if ".csv" in f:
                    filepath: str = os.path.join(root, f)
                    actual_paths.add(filepath)
                    actual_frame = concat([
                        actual_frame,
                        DataFrame(read_csv(filepath, dtype=str))
                    ])

        # Same normalization as the expected frame before comparing.
        actual_frame.sort_values(list(actual_frame.columns), inplace=True)
        actual_frame.reset_index(drop=True, inplace=True)

        assert expected_paths == actual_paths
        assert_frame_equal(expected_df, actual_frame)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemplo n.º 7
0
    def test_with_metadata_without_schema(
        self,
        capsys: CaptureFixture,
        archive_meta_params: Tuple[str, str, SortedDict],
        archive_dir: LocalPath,
        overwrite: bool,
        verbose: bool,
    ):
        """Archiving a data file with a sidecar .meta file and no schema
        must merge metadata into the output and preserve every row.
        """
        filename: str
        expectedfilename: str
        # The schema element of the fixture tuple is deliberately unused.
        filename, expectedfilename, _ = archive_meta_params

        datafile = os.path.join(get_data_path(), filename + ".csv")
        metafile = os.path.join(get_data_path(), filename + ".meta")

        expected_df = DataFrame(
            # Read our dedicated *-combined.csv file instead of the import target.
            read_csv(os.path.join(get_data_path(), expectedfilename),
                     dtype=str))
        # Normalize row order so later comparison is order-independent.
        expected_df.sort_values(list(expected_df.columns), inplace=True)
        expected_df.reset_index(drop=True, inplace=True)

        # An empty schema flattens the expected layout to a single path.
        expected_paths: SortedList = _get_expected_paths(
            archive_dir, SortedDict(), expected_df, filename + ".csv")

        # When exercising overwrite, pre-create each destination with junk
        # content that the archive must replace.
        if overwrite:
            for e in expected_paths:
                os.makedirs(os.path.dirname(e), exist_ok=True)
                with open(e, mode="w") as fd:
                    fd.write(rand_string())

        assert syphon.archive(
            archive_dir,
            [datafile],
            meta_files=[metafile],
            overwrite=overwrite,
            verbose=verbose,
        )
        # The archive lock file must be removed on success.
        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))

        # Collect every archived CSV and rebuild a single frame from them.
        actual_df = DataFrame()
        actual_paths = SortedList()
        for root, _, files in os.walk(archive_dir):
            for f in files:
                if ".csv" in f:
                    filepath: str = os.path.join(root, f)
                    actual_paths.add(filepath)
                    actual_df = concat(
                        [actual_df,
                         DataFrame(read_csv(filepath, dtype=str))])

        # Same normalization as the expected frame before comparing.
        actual_df.sort_values(list(actual_df.columns), inplace=True)
        actual_df.reset_index(drop=True, inplace=True)

        assert expected_paths == actual_paths
        assert_frame_equal(expected_df, actual_df)
Exemplo n.º 8
0
    def test_raises_valueerror_when_metadata_is_inconsistent(
        self,
        archive_meta_params: Tuple[str, str, SortedDict],
        archive_dir: LocalPath,
        import_dir: LocalPath,
    ):
        """Metadata with more than one value per column must make the
        archive fail with a ValueError that names the offending column.

        BUG FIX: the original ``for column in metaframe.columns`` loop
        always left ``column`` bound to a column name, so the subsequent
        ``assert column is not None`` could never detect the "no
        inconsistent column found" case. The search now binds ``column``
        only when an inconsistent column is actually found.
        """
        filename: str
        expectedfilename: str
        schema: SortedDict
        filename, expectedfilename, schema = archive_meta_params

        datafile = os.path.join(get_data_path(), filename + ".csv")
        # Stage the deliberately inconsistent metadata under the name the
        # archive expects (<filename>.meta) inside the import directory.
        bad_metafile = LocalPath(
            os.path.join(get_data_path(), filename + "-inconsistent.meta"))
        metafile = import_dir.join(filename + ".meta")
        bad_metafile.copy(metafile)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        # Find the first metadata column with more than one unique value;
        # its name is expected to appear in the error message.
        metaframe = DataFrame(read_csv(metafile, dtype=str))
        column: Optional[str] = None
        for candidate in metaframe.columns:
            if len(metaframe[candidate].drop_duplicates().values) > 1:
                column = candidate
                break
        del metaframe

        # The fixture data must actually contain an inconsistency.
        assert column is not None
        with pytest.raises(ValueError, match=column):
            syphon.archive(
                archive_dir,
                [datafile],
                meta_files=[metafile],
                schema_filepath=schemafile,
                overwrite=True,
            )

        # The failure must not leave the lock file behind.
        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))
Exemplo n.º 9
0
    def test_incremental_fails_when_check_fails(
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        """An incremental build must return False when the pre-build cache
        check fails, and must leave the existing cache file untouched.

        BUG FIX: the local ``schema = SortedDict(...)`` previously shadowed
        the boolean ``schema`` fixture parameter, so ``if schema:`` was
        always true and both fixture values exercised the same path. The
        dictionary now lives in ``schema_dict`` so the flag is honored.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        schema_dict = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(schema_dict, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Snapshot the cache content we expect to survive the failed build.
        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        LocalPath(datafile).copy(cache_file)
        assert os.path.exists(cache_file)

        # "check" ought to fail when the hash file does not exist.
        assert not syphon.check(cache_file, hash_filepath=hash_file)
        # If "check" fails, then the incremental build fails.
        assert not syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(False, cache_file, hash_filepath=hash_file)

        # The failed build must not have modified the cache file.
        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemplo n.º 10
0
    def test_full_build_with_schema_maintains_data_fidelity(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        overwrite: bool,
        post_hash: bool,
        verbose: bool,
    ):
        """A full (non-incremental) build of a schema-organized archive
        must reproduce the archived data exactly in the cache."""
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        build_schema = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(build_schema, schemafile, overwrite=overwrite)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile,
                              overwrite=overwrite)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        before = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        before.sort_index(inplace=True)

        # When overwriting, seed the cache with junk the build must replace.
        if overwrite:
            cache_file.write(rand_string())

        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=overwrite,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        after = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        after.sort_index(inplace=True)

        assert_frame_equal(before, after, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemplo n.º 11
0
    def test_only_update_hash_file_when_post_hash_true(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        """A build rewrites the hash file only when post_hash is True."""
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(archive_dir, [datafile])
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        cache_file.write(rand_string())

        # Fall back to the default hash file beside the cache when the
        # fixture does not provide one.
        if hash_file is None:
            resolved_hashfile = cache_file.dirpath(
                syphon.core.check.DEFAULT_FILE)
        else:
            resolved_hashfile = hash_file
        pathlib.Path(resolved_hashfile).touch()
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            hashfile.update(syphon.hash.HashEntry(cache_file))

        # The seeded entry matches the junk cache content.
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)

        # post_hash=False: build succeeds but leaves the hash entry stale.
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=False,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)

        # post_hash=True: build refreshes the hash entry.
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
Exemplo n.º 12
0
    def test_build_uses_unmodified_output_path_in_hash_entry(
            self, fs: "TestBuildHashEntryPath.FS", path_type: PathType):
        """The hash entry must record the cache path exactly as passed in."""
        # NOTE: Current working directory is changed if PathType.NONE!
        target: Union[str, LocalPath] = fs.cache(path_type)

        source: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(fs.archive, [source])
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        assert syphon.build(
            target,
            *get_data_files(fs.archive),
            hash_filepath=fs.hashfile,
            incremental=False,
            post_hash=True,
        )

        # Only the first line matters: one build writes one hash entry.
        with fs.hashfile.open(mode="r") as hf:
            first_entry = hf.readline()

        assert str(target) in first_entry
Exemplo n.º 13
0
    def test_incremental_becomes_full_build_when_cache_does_not_exist(
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        """An incremental build silently degrades to a full build when the
        cache file does not yet exist, producing a faithful cache.

        BUG FIX: the local ``schema = SortedDict(...)`` previously shadowed
        the boolean ``schema`` fixture parameter, making ``if schema:``
        always true; the dictionary now lives in ``schema_dict`` so both
        fixture values exercise distinct paths.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        schema_dict = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(schema_dict, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        # Raises a FileExistsError unless a full build is performed.
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemplo n.º 14
0
    def test_raises_fileexistserror_when_cache_exists(
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        incremental: bool,
    ):
        """Building without overwrite must raise FileExistsError when the
        cache file already exists, and must leave the hash file untouched.

        BUG FIX: the message assertion previously sat inside the
        ``pytest.raises`` block after the raising call, so it was
        unreachable dead code. It now runs after the context manager
        exits, where ``errinfo`` is populated.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")

        assert syphon.archive(archive_dir, [datafile], overwrite=True)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Pre-existing cache content triggers the collision.
        cache_file.write(rand_string())

        with pytest.raises(FileExistsError) as errinfo:
            syphon.build(
                cache_file,
                *get_data_files(archive_dir),
                hash_filepath=hash_file,
                incremental=incremental,
                overwrite=False,
                post_hash=False,
            )
        # NOTE(review): the original (dead) assertion expected the data
        # file name in the message — confirm whether syphon.build names
        # the data file or the cache file when it raises.
        assert datafile in str(errinfo.value)
        assert_post_hash(False, cache_file, hash_filepath=hash_file)
Exemplo n.º 15
0
 def test_no_datafiles(self, capsys: CaptureFixture, archive_dir: LocalPath,
                       verbose: bool):
     """Archiving an empty file list is a no-op that reports failure."""
     result = syphon.archive(archive_dir, [], verbose=verbose)
     assert not result
     assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemplo n.º 16
0
 def __call__(self, *args, **kwargs) -> bool:
     """Invoke syphon.archive with syphon.core.build.build replaced by
     this object's build shim for the duration of the call."""
     with self._monkeypatch.context() as patcher:
         patcher.setattr(syphon.core.build, "build", value=self._build_shim)
         return syphon.archive(*args, **kwargs)
Exemplo n.º 17
0
    def test_incremental_maintains_data_fidelity_when_new_data_new_and_missing_columns(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        import_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        """Incremental build maintains data fidelity when new data

        * has columns not present in the existing data cache.

        * is missing columns found in the existing data cache.
        """

        # First half of the split fixture: lacks the "species" column.
        pre_datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-1-of-2-no-species.csv")
        ]
        # Second half: lacks the "petalcolor" column instead.
        datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-2-of-2-no-petalcolor.csv")
        ]

        # Fall back to the default hash file beside the cache if none given.
        resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                             if hash_file is None else hash_file)

        assert syphon.archive(archive_dir, pre_datafiles)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Pre-build
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=False,
            post_hash=True,
            verbose=False,
        )
        # Get the hash of the cache file before our main build.
        pre_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        # Get the hash of the hash file for easy file change checking.
        pre_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        # Main build
        assert syphon.build(
            cache_file,
            *datafiles,
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        # Hashes after the incremental build, for change detection below.
        post_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        post_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        # The pre-merged fixture describes the expected combined result.
        expected_frame = DataFrame(
            read_csv(
                os.path.join(
                    get_data_path(),
                    "iris_plus_partial-new-data-new-and-missing-columns.csv",
                ),
                dtype=str,
                index_col="Index",
            ))
        expected_frame.sort_index(inplace=True)

        # Both the cache and the hash file must have changed.
        assert pre_cache_hash != post_cache_hash
        assert pre_hash_hash != post_hash_hash

        # The hash file's entry for the cache must match its new content.
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            for entry in hashfile:
                if os.path.samefile(entry.filepath, str(cache_file)):
                    assert post_cache_hash == entry.hash

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)