Example #1
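Asserts that syphon.archive raises an IndexError naming a schema column that is missing from the data, and that the #lock file is removed even though archiving fails.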
    def test_raises_indexerror_when_a_schema_column_does_not_exist(
            self, archive_meta_params: Tuple[str, str, SortedDict],
            archive_dir: LocalPath):
        bad_column = "non_existent_column"

        filename: str
        expectedfilename: str
        schema: SortedDict
        filename, expectedfilename, schema = archive_meta_params

        # Add a bad column.
        local_schema = schema.copy()
        local_schema["%d" % len(local_schema)] = bad_column

        datafile = os.path.join(get_data_path(), filename + ".csv")
        metafile = os.path.join(get_data_path(), filename + ".meta")
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(local_schema, schemafile)

        with pytest.raises(IndexError, match=bad_column):
            syphon.archive(
                archive_dir,
                [datafile],
                meta_files=[metafile],
                schema_filepath=schemafile,
                overwrite=True,
            )

        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))
Example #2
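Asserts that syphon.archive raises a FileExistsError when a destination file already exists and overwrite=False, and that the #lock file is still cleaned up.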
    def test_raises_fileexistserror_on_existing_archive_file(
            self, archive_params: Tuple[str, SortedDict],
            archive_dir: LocalPath):
        filename: str
        schema: SortedDict
        filename, schema = archive_params

        datafile = os.path.join(get_data_path(), filename)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        expected_df = DataFrame(read_csv(datafile, dtype=str))

        expected_paths: SortedList = _get_expected_paths(
            archive_dir, schema, expected_df, filename)

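        # Pre-create every expected destination file so the archive call collides.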
        for e in expected_paths:
            os.makedirs(os.path.dirname(e), exist_ok=True)
            with open(e, mode="w") as f:
                f.write(rand_string())

        with pytest.raises(FileExistsError, match=os.path.basename(datafile)):
            syphon.archive(archive_dir, [datafile],
                           schema_filepath=schemafile,
                           overwrite=False)

        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))
Example #3
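Archives a file using a schema but no metadata, then walks the archive tree to confirm the stored CSVs match the source frame row for row.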
    def test_without_metadata_with_schema(
        self,
        capsys: CaptureFixture,
        archive_params: Tuple[str, SortedDict],
        archive_dir: LocalPath,
        overwrite: bool,
        verbose: bool,
    ):
        filename: str
        schema: SortedDict
        filename, schema = archive_params

        datafile = os.path.join(get_data_path(), filename)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        expected_df = DataFrame(read_csv(datafile, dtype=str))
        expected_df.sort_values(list(expected_df.columns), inplace=True)
        expected_df.reset_index(drop=True, inplace=True)

        expected_paths: SortedList = _get_expected_paths(
            archive_dir, schema, expected_df, filename)

        if overwrite:
            for e in expected_paths:
                os.makedirs(os.path.dirname(e), exist_ok=True)
                with open(e, mode="w") as fd:
                    fd.write(rand_string())

        assert syphon.archive(
            archive_dir,
            [datafile],
            schema_filepath=schemafile,
            overwrite=overwrite,
            verbose=verbose,
        )
        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))

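        # Walk the archive tree and stitch every stored CSV back into one frame.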
        actual_frame = DataFrame()
        actual_paths = SortedList()
        for root, _, files in os.walk(archive_dir):
            for f in files:
                if ".csv" in f:
                    filepath: str = os.path.join(root, f)
                    actual_paths.add(filepath)
                    actual_frame = concat([
                        actual_frame,
                        DataFrame(read_csv(filepath, dtype=str))
                    ])

        actual_frame.sort_values(list(actual_frame.columns), inplace=True)
        actual_frame.reset_index(drop=True, inplace=True)

        assert expected_paths == actual_paths
        assert_frame_equal(expected_df, actual_frame)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #4
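Asserts that syphon.init refuses to clobber an existing schema file when overwrite=False.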
def test_init_fileexistserror(archive_dir: LocalPath,
                              init_schema_fixture: SortedDict):
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

    with open(schemafile, mode="w") as f:
        f.write("content")

    with pytest.raises(FileExistsError):
        syphon.init(init_schema_fixture, schemafile, overwrite=False)
Example #5
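Writes a schema with syphon.init and reads it back, confirming the stored JSON round-trips to the original SortedDict.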
def test_init(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    init_schema_fixture: SortedDict,
    overwrite: bool,
    verbose: bool,
):
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

    syphon.init(init_schema_fixture, schemafile, overwrite, verbose)

    with open(schemafile, "r") as f:
        actual = SortedDict(loads(f.read()))

    assert actual == init_schema_fixture

    assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #6
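Confirms that an incremental build is rejected when syphon.check fails (here because no hash file exists) and that the cache file is left untouched.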
    def test_incremental_fails_when_check_fails(
        self,
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        local_schema = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(local_schema, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        LocalPath(datafile).copy(cache_file)
        assert os.path.exists(cache_file)

        # "check" ought to fail when the hash file does not exist.
        assert not syphon.check(cache_file, hash_filepath=hash_file)
        # If "check" fails, then the incremental build fails.
        assert not syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(False, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #7
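Runs a full (non-incremental) build after archiving and confirms the cache reproduces the source data exactly.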
    def test_full_build_with_schema_maintains_data_fidelity(
        self,
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        overwrite: bool,
        post_hash: bool,
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        schema = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile, overwrite=overwrite)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile,
                              overwrite=overwrite)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

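        # When overwriting, seed the cache with junk for the build to replace.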
        if overwrite:
            cache_file.write(rand_string())

        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=overwrite,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #8
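Confirms that a requested incremental build falls back to a full build when the cache file does not exist yet.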
    def test_incremental_becomes_full_build_when_cache_does_not_exist(
        self,
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        local_schema = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(local_schema, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        # Raises a FileExistsError unless a full build is performed.
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #9
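Asserts that syphon.archive raises a ValueError naming the offending column when a metadata file holds inconsistent values.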
    def test_raises_valueerror_when_metadata_is_inconsistent(
        self,
        archive_meta_params: Tuple[str, str, SortedDict],
        archive_dir: LocalPath,
        import_dir: LocalPath,
    ):
        filename: str
        expectedfilename: str
        schema: SortedDict
        filename, expectedfilename, schema = archive_meta_params

        datafile = os.path.join(get_data_path(), filename + ".csv")
        bad_metafile = LocalPath(
            os.path.join(get_data_path(), filename + "-inconsistent.meta"))
        metafile = import_dir.join(filename + ".meta")
        bad_metafile.copy(metafile)
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        syphon.init(schema, schemafile)

        # Find the column that will be in the message.
        metaframe = DataFrame(read_csv(metafile, dtype=str))
        column: Optional[str] = None
        for candidate in metaframe.columns:
            if len(metaframe[candidate].drop_duplicates().values) > 1:
                column = candidate
                break
        del metaframe

        assert column is not None
        with pytest.raises(ValueError, match=column):
            syphon.archive(
                archive_dir,
                [datafile],
                meta_files=[metafile],
                schema_filepath=schemafile,
                overwrite=True,
            )

        assert not os.path.exists(
            os.path.join(os.path.dirname(datafile), "#lock"))
Example #10
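Archives six related files one at a time, verifying after each pass that the cache matches the expected combined frame and that syphon.check validates it against the hash file.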
    def test_increment_without_metadata_with_schema(
        self,
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
        schema_file: Optional[LocalPath],
        verbose: bool,
    ):
        # List of (expected frame filename, data filename) tuples
        targets: List[Tuple[str, str]] = [
            ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6-combined.csv"),
            ("iris-part-1-2.csv", "iris-part-2-of-6-combined.csv"),
            ("iris-part-1-2-3.csv", "iris-part-3-of-6-combined.csv"),
            ("iris-part-1-2-3-4.csv", "iris-part-4-of-6-combined.csv"),
            ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6-combined.csv"),
            ("iris_plus.csv", "iris-part-6-of-6-combined.csv"),
        ]

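        # When no hash file is given, syphon falls back to DEFAULT_HASH_FILE
        # beside the cache file.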
        expected_hashfile = (
            LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE)
            if archive_fixture.hash_file is None
            else archive_fixture.hash_file
        )
        assert not os.path.exists(expected_hashfile)
        assert not os.path.exists(archive_fixture.cache_file)
        assert len(archive_dir.listdir()) == 0

        expected_schemafile = (archive_dir.join(syphon.schema.DEFAULT_FILE)
                               if schema_file is None else schema_file)
        assert not os.path.exists(expected_schemafile)
        syphon.init(SortedDict({
            "0": "PetalColor",
            "1": "Species"
        }), expected_schemafile)
        assert os.path.exists(expected_schemafile)

        for expected_frame_filename, data_filename in targets:
            assert archive_fixture(
                archive_dir,
                [os.path.join(get_data_path(), data_filename)],
                schema_filepath=schema_file,
                cache_filepath=archive_fixture.cache_file,
                hash_filepath=archive_fixture.hash_file,
                verbose=verbose,
            )
            assert_captured_outerr(capsys.readouterr(), verbose, False)

            expected_frame = DataFrame(
                read_csv(
                    os.path.join(get_data_path(), expected_frame_filename),
                    dtype=str,
                    index_col="Index",
                ))
            expected_frame.sort_index(inplace=True)
            actual_frame = DataFrame(
                read_csv(str(archive_fixture.cache_file),
                         dtype=str,
                         index_col="Index"))
            actual_frame.sort_index(inplace=True)
            assert_captured_outerr(capsys.readouterr(), False, False)

            assert_frame_equal(expected_frame, actual_frame)
            assert os.path.exists(expected_hashfile)
            assert syphon.check(
                archive_fixture.cache_file,
                hash_filepath=expected_hashfile,
                verbose=verbose,
            )
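
Taken together, the examples above walk a single pipeline: syphon.init writes a schema, syphon.archive sorts data files into an archive tree, syphon.build flattens the archive into a cache file, and syphon.check verifies the cache against its hash. The sketch below strings those calls together in one place. It is a minimal illustration assembled only from calls shown in the tests; the paths, the one-column schema, and the inline CSV walk (standing in for the tests' get_data_files helper) are assumptions, not part of syphon's API.

import os

from sortedcontainers import SortedDict

import syphon

# Illustrative locations; any writable paths work.
archive_dir = "/tmp/my-archive"
cache_file = "/tmp/my-cache.csv"

os.makedirs(archive_dir, exist_ok=True)
schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

# 1. Write a schema that files data away by its "Name" column.
syphon.init(SortedDict({"0": "Name"}), schemafile)

# 2. Sort a source CSV (path assumed) into the archive tree.
assert syphon.archive(archive_dir, ["iris.csv"], schema_filepath=schemafile)

# 3. Collect the archived CSVs; the tests use a get_data_files helper for this.
archived = [
    os.path.join(root, name)
    for root, _, files in os.walk(archive_dir)
    for name in files
    if name.endswith(".csv")
]

# 4. Flatten the archive into one cache file and record its hash alongside it.
assert syphon.build(cache_file, *archived, incremental=False, post_hash=True)

# 5. Verify the cache against the recorded hash (default hash file location).
assert syphon.check(cache_file)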