Example #1
    def test_overwrite(self, client, created_entities, experiment_run, s3_bucket):
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)

        dataset_version = dataset.create_version(__file__)
        experiment_run.log_dataset_version('train', dataset_version)

        new_dataset_version = dataset.create_version("conftest.py")
        experiment_run.log_dataset_version('train', new_dataset_version, overwrite=True)

        retrieved_dataset_version = experiment_run.get_dataset_version('train')
        path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
        assert path.endswith("conftest.py")
Example #2
    def test_reincarnation(self, client, created_datasets):
        """Consecutive identical versions are assigned the same ID."""
        dataset = client.set_dataset(type="local")
        created_datasets.append(dataset)

        version1 = dataset.create_version(path=__file__)
        version2 = dataset.create_version(path=__file__)
        assert version1.id == version2.id

        versions = dataset.get_all_versions()
        assert len(versions) == 1

        version = dataset.get_latest_version(ascending=True)
        assert version.id == version1.id
Example #3
    def test_mngd_ver_to_sibling_dir(self, dataset):
        """Download to sibling directory works as expected."""
        child_dirname = "child"
        os.mkdir(child_dirname)
        sibling_dirname = "sibling"
        os.mkdir(sibling_dirname)
        filename = "tiny1.bin"
        FILE_CONTENTS = os.urandom(2**16)

        with utils.chdir(child_dirname):
            with open(filename, 'wb') as f:
                f.write(FILE_CONTENTS)
            blob_path = "data"

            dataset_blob = verta.dataset.Path(filename,
                                              enable_mdb_versioning=True)
            dataset_blob = dataset.create_version(dataset_blob).get_content()

            # download to sibling dir
            download_to_path = os.path.join("..", sibling_dirname, filename)
            filepath = dataset_blob.download(filename, download_to_path)
            assert os.path.isfile(filepath)
            assert filepath == os.path.abspath(download_to_path)
            with open(filepath, 'rb') as f:
                assert f.read() == FILE_CONTENTS
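`utils.chdir` is a test-suite helper, not part of the standard library or the verta package; a plausible sketch (assumed implementation) is a context manager that switches the working directory and restores it on exit:

import contextlib
import os


@contextlib.contextmanager
def chdir(path):
    """Assumed helper: temporarily change the working directory."""
    original = os.getcwd()
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(original)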
Example #4
    def test_concat(self, dataset):
        s3 = pytest.importorskip("boto3").client('s3')

        bucket1 = "verta-starter"
        key1 = "models/model.pkl"
        bucket2 = "verta-versioned-bucket"
        key2 = "tiny-files/tiny2.bin"

        # create dir for reference files
        reference_dir = "reference"
        filepath1 = os.path.join(reference_dir, bucket1, key1)
        pathlib2.Path(filepath1).parent.mkdir(parents=True, exist_ok=True)
        filepath2 = os.path.join(reference_dir, bucket2, key2)
        pathlib2.Path(filepath2).parent.mkdir(parents=True, exist_ok=True)

        # download files directly from S3 for reference
        s3.download_file(bucket1, key1, filepath1)
        s3.download_file(bucket2, key2, filepath2)

        # create and concatenate datasets
        dataset1 = verta.dataset.S3(
            "s3://{}/{}".format(bucket1, key1),
            enable_mdb_versioning=True,
        )
        dataset2 = verta.dataset.S3(
            "s3://{}/{}".format(bucket2, key2),
            enable_mdb_versioning=True,
        )
        dataset_blob = dataset1 + dataset2
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        dirpath = dataset_blob.download()
        assert_dirs_match(dirpath, reference_dir)
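`assert_dirs_match` is another helper from the test suite rather than the verta API. A minimal sketch of what it might do (assumed implementation), comparing two directory trees file by file:

import filecmp
import os


def assert_dirs_match(dirpath, reference_dir):
    """Assumed helper: recursively check that two directory trees hold the same files and bytes."""
    comparison = filecmp.dircmp(dirpath, reference_dir)
    assert not comparison.left_only and not comparison.right_only
    for filename in comparison.common_files:
        assert filecmp.cmp(os.path.join(dirpath, filename),
                           os.path.join(reference_dir, filename),
                           shallow=False)
    for subdir in comparison.common_dirs:
        assert_dirs_match(os.path.join(dirpath, subdir),
                          os.path.join(reference_dir, subdir))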
Example #5
    def test_mngd_ver_folder(self, dataset):
        reference_dir = "reference/"
        dirname = "tiny-files/"
        os.mkdir(dirname)
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(dirname, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        blob_path = "data"
        dataset_blob = verta.dataset.Path(dirname, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()
        shutil.move(dirname, reference_dir)  # move sources to avoid collision

        # download to implicit path
        dirpath = dataset_blob.download(dirname)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        assert_dirs_match(dirpath, reference_dir)

        # download to implicit path without collision
        dirpath2 = dataset_blob.download(dirname)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        assert_dirs_match(dirpath2, reference_dir)

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset_blob.download(dirname, dirpath)
        assert dirpath3 == dirpath
        assert_dirs_match(dirpath3, reference_dir)
        assert os.path.getmtime(dirpath) > last_updated
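The managed-versioning examples take a `dataset` fixture instead of building one inline. Its definition is not shown here; a rough, hypothetical sketch consistent with the other examples (the real fixture may pass a dataset type or use a newer client API):

import pytest


@pytest.fixture
def dataset(client, created_entities):
    """Hypothetical fixture: a fresh dataset registered for cleanup after the test."""
    dataset = client.set_dataset()  # assumed call; the actual fixture may differ
    created_entities.append(dataset)
    return dataset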
Example #6
    def test_base_path(self, dataset):
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        # three .file files in tiny-files/
        for filename in ["tiny{}.file".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        sub_dir = "bin/"
        os.mkdir(os.path.join(reference_dir, sub_dir))
        # three .bin files in tiny-files/bin/
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, sub_dir, filename),
                      'wb') as f:
                f.write(os.urandom(2**16))

        # log & get dataset blob
        blob_path = "data"
        dataset_blob = verta.dataset.Path(
            reference_dir,
            base_path=reference_dir,
            enable_mdb_versioning=True,
        )
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        # `reference_dir` was dropped as base path, so KeyError
        with pytest.raises(KeyError):
            dataset_blob.download(reference_dir)

        dirpath = dataset_blob.download()
        assert os.path.abspath(dirpath) != os.path.abspath(reference_dir)
        assert_dirs_match(dirpath, reference_dir)
Example #7
    def test_mngd_ver_file(self, dataset):
        filename = "tiny1.bin"
        FILE_CONTENTS = os.urandom(2**16)
        with open(filename, 'wb') as f:
            f.write(FILE_CONTENTS)
        blob_path = "data"

        dataset_blob = verta.dataset.Path(filename, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()
        os.remove(filename)  # delete for first download test

        # download to implicit path
        filepath = dataset_blob.download(filename)
        assert os.path.isfile(filepath)
        assert filepath == os.path.abspath(filename)
        with open(filepath, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to implicit path without collision
        filepath2 = dataset_blob.download(filename)
        assert os.path.isfile(filepath2)
        assert filepath2 != filepath
        with open(filepath2, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(filepath)
        filepath3 = dataset_blob.download(filename, filepath)
        assert filepath3 == filepath
        with open(filepath3, 'rb') as f:
            assert f.read() == FILE_CONTENTS
        assert os.path.getmtime(filepath) > last_updated
Example #8
    def test_download_all(self, dataset):
        s3 = pytest.importorskip("boto3").client('s3')

        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)

        # get files' contents directly from S3 for reference
        reference_dir = "reference/"
        for s3_obj in s3.list_objects_v2(Bucket=bucket,
                                         Prefix=dirname)['Contents']:
            key = s3_obj['Key']
            filepath = os.path.join(reference_dir, bucket, key)
            pathlib2.Path(filepath).parent.mkdir(
                parents=True, exist_ok=True)  # create parent dirs

            s3.download_file(bucket, key, filepath)

        # log & get dataset blob
        dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        dirpath = dataset_blob.download()
        assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

        assert os.path.isdir(dirpath)
        assert_dirs_match(dirpath, reference_dir)
Example #9
    def test_creation_from_scratch(self, client, created_datasets):
        dataset = client.set_dataset(type="local")
        created_datasets.append(dataset)

        version = dataset.create_version(__file__)
        assert version._dataset_type == _DatasetService.DatasetTypeEnum.PATH
        assert version.id
Example #10
    def test_filesystem_dataset_version_creation(self, client, created_entities):
        dir_name, _ = self.create_dir_with_files(num_files=3)
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)
        dataset_version = dataset.create_version(dir_name)

        assert len(dataset_version.dataset_version_info.dataset_part_infos) == 3
        shutil.rmtree(dir_name)
Example #11
    def test_log_dataset_version_diff_workspaces(self, client, organization, created_entities, experiment_run):
        dataset = client.set_dataset(type="local", workspace=organization.name)
        created_entities.append(dataset)

        dataset_version = dataset.create_version(__file__)
        experiment_run.log_dataset_version('train', dataset_version)

        retrieved_dataset_version = experiment_run.get_dataset_version('train')
        assert retrieved_dataset_version.id == dataset_version.id
Example #12
    def test_creation_by_id(self, client, created_entities):
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)

        version = dataset.create_version(__file__)
        assert version.id

        same_version = client.get_dataset_version(id=version.id)
        assert version.id == same_version.id
Example #13
    def test_get_versions(self, client, created_entities):
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)

        version1 = dataset.create_version(path=__file__)
        assert version1.id

        version2 = dataset.create_version(path=pytest.__file__)
        assert version2.id

        versions = dataset.get_all_versions()
        assert len(versions) == 2

        dataset_version1 = client.get_dataset_version(id=version1.id)
        assert dataset_version1.id == version1.id

        version = dataset.get_latest_version(ascending=True)
        assert version.id == version1.id
Example #14
    def test_get_latest_printing(self, client, created_entities, capsys):
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)

        version = dataset.create_version(path=__file__)
        dataset.get_latest_version(ascending=True)

        captured = capsys.readouterr()
        assert "got existing dataset version: {}".format(version.id) in captured.out
Example #15
    def test_creation_by_id(self, client, created_datasets):
        dataset = client.set_dataset(type="local")
        created_datasets.append(dataset)

        version = dataset.create_version(__file__)
        assert version._dataset_type == _DatasetService.DatasetTypeEnum.PATH
        assert version.id

        same_version = client.get_dataset_version(id=version.id)
        assert version.id == same_version.id
Example #16
    def test_log_dataset_version(self, client, created_entities, experiment_run):
        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)

        dataset_version = dataset.create_version(__file__)
        experiment_run.log_dataset_version('train', dataset_version)

        retrieved_dataset_version = experiment_run.get_dataset_version('train')
        path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
        assert path.endswith(__file__)
Example #17
    def test_rdbms_version_creation(self, client, created_entities):
        dataset = client.set_dataset(type="postgres")
        created_entities.append(dataset)
        dataset_version = dataset.create_version(query="SELECT * FROM ner-table",
                                                 db_connection_str="localhost:6543",
                                                 num_records=100)

        assert dataset_version.dataset_version_info.query == "SELECT * FROM ner-table"
        assert dataset_version.dataset_version_info.data_source_uri == "localhost:6543"
        assert dataset_version.dataset_version_info.num_records == 100
Example #18
    def test_s3_dataset_version_creation(self, client, s3_bucket, created_entities):
        botocore = pytest.importorskip("botocore")

        try:
            dataset = client.set_dataset(type="s3")
            created_entities.append(dataset)
            dataset_version = dataset.create_version(s3_bucket)

            assert len(dataset_version.dataset_version_info.dataset_part_infos) >= 1
        except botocore.exceptions.ClientError:
            pytest.skip("insufficient AWS credentials")
Example #19
    def test_tags_is_list_of_str(self, client, created_entities, tags):
        dataset = client.set_dataset(tags=tags)
        created_entities.append(dataset)
        version = dataset.create_version("conftest.py", tags=tags)

        endpoint = "{}://{}/api/v1/modeldb/dataset-version/getDatasetVersionTags".format(
            client._conn.scheme,
            client._conn.socket,
        )
        response = verta._internal_utils._utils.make_request("GET", endpoint, client._conn, params={'id': version.id})
        verta._internal_utils._utils.raise_for_http_error(response)
        assert response.json().get('tags', []) == tags
Example #20
    def test_log_dataset_version(self, client, created_datasets,
                                 experiment_run):
        dataset = client.set_dataset(type="local")
        created_datasets.append(dataset)
        assert dataset._dataset_type == _DatasetService.DatasetTypeEnum.PATH

        dataset_version = dataset.create_version(__file__)
        experiment_run.log_dataset_version('train', dataset_version)

        retrieved_dataset_version = experiment_run.get_dataset_version('train')
        path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
        assert path.endswith(__file__)
Example #21
    def test_log_dataset_version_diff_workspaces_no_access_error(
            self, client_2, created_entities, experiment_run):
        dataset = client_2.set_dataset(type="local")
        created_entities.append(dataset)

        dataset_version = dataset.create_version(__file__)

        with pytest.raises(requests.HTTPError) as excinfo:
            experiment_run.log_dataset_version('train', dataset_version)

        excinfo_value = str(excinfo.value).strip()
        assert "403" in excinfo_value
Example #22
    def test_local_file(self, client, created_entities):
        filepath = "conftest.py"

        dataset = client.set_dataset(type="local")
        created_entities.append(dataset)
        version = dataset.create_version(filepath)

        retrieved = dataset.get_latest_version()
        assert version.id == retrieved.id  # of course, but just to be sure

        base_path = os.path.abspath(filepath)
        self.assert_base_path(version, base_path)
        self.assert_base_path(retrieved, base_path)
Example #23
    def test_not_to_s3_dir(self, dataset):
        """If the user specifies "s3://", things shouldn't go into an "s3:" dir."""
        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)
        blob_path = "data"

        # log & get dataset blob
        dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        dirpath = dataset_blob.download("s3://")
        assert "s3:" not in pathlib2.Path(dirpath).parts
Example #24
    def test_local_dir(self, client, created_datasets):
        dirpath = "."

        dataset = client.set_dataset(type="local")
        created_datasets.append(dataset)
        version = dataset.create_version(dirpath)

        retrieved = dataset.get_latest_version()
        assert version.id == retrieved.id  # of course, but just to be sure

        base_path = os.path.abspath(dirpath)
        self.assert_base_path(version, base_path)
        self.assert_base_path(retrieved, base_path)
Example #25
    def test_s3_bucket(self, client, created_entities):
        bucket_name = "verta-starter"

        botocore = pytest.importorskip("botocore")
        try:
            dataset = client.set_dataset(type="s3")
            created_entities.append(dataset)
            version = dataset.create_version(bucket_name)
        except botocore.exceptions.ClientError:
            pytest.skip("insufficient AWS credentials")

        retrieved = dataset.get_latest_version()
        assert version.id == retrieved.id  # of course, but just to be sure

        self.assert_base_path(version, bucket_name)
        self.assert_base_path(retrieved, bucket_name)
Example #26
    def test_big_query_dataset_version_creation(self, client, bq_query, bq_location, created_entities):
        google = pytest.importorskip("google")
        bigquery = pytest.importorskip("google.cloud.bigquery")

        try:
            query_job = bigquery.Client().query(
                bq_query,
                # Location must match that of the dataset(s) referenced in the query.
                location=bq_location,
            )
            dataset = client.set_dataset(type="big query")
            created_entities.append(dataset)
            dataset_version = dataset.create_version(job_id=query_job.job_id, location=bq_location)

            assert dataset_version.dataset_version_info.query == bq_query
        except google.auth.exceptions.GoogleAuthError:
            pytest.skip("insufficient GCP credentials")
Example #27
    def test_mngd_ver_folder(self, dataset):
        s3 = pytest.importorskip("boto3").client('s3')

        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)
        blob_path = "data"

        # get files' contents directly from S3 for reference
        reference_dir = "reference/"
        for s3_obj in s3.list_objects_v2(Bucket=bucket,
                                         Prefix=dirname)['Contents']:
            key = s3_obj['Key']
            filepath = os.path.join(reference_dir, key)
            pathlib2.Path(filepath).parent.mkdir(
                parents=True, exist_ok=True)  # create parent dirs

            s3.download_file(bucket, key, filepath)

        # Since we're retrieving files with the S3 prefix `dirname`, the downloaded filetree won't
        # start with `dirname`, so we have to go deeper for `reference_dir` to account for that.
        reference_dir = os.path.join(reference_dir, dirname)

        # log & get dataset blob
        dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        # download to implicit path
        dirpath = dataset_blob.download(s3_folder)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        assert_dirs_match(dirpath, reference_dir)

        # download to implicit path without collision
        dirpath2 = dataset_blob.download(s3_folder)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        assert_dirs_match(dirpath2, reference_dir)

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset_blob.download(s3_folder, dirpath)
        assert dirpath3 == dirpath
        assert_dirs_match(dirpath3, reference_dir)
        assert os.path.getmtime(dirpath) > last_updated
Example #28
    def test_download_all(self, dataset):
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        # log & get dataset blob
        blob_path = "data"
        dataset_blob = verta.dataset.Path(reference_dir, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        dirpath = dataset_blob.download()
        assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

        # uploaded filetree was recreated within `DEFAULT_DOWNLOAD_DIR`
        destination_dir = os.path.join(_dataset.DEFAULT_DOWNLOAD_DIR, reference_dir)
        assert os.path.isdir(destination_dir)
        assert_dirs_match(destination_dir, reference_dir)
Example #29
    def test_mngd_ver_file(self, dataset):
        s3 = pytest.importorskip("boto3").client('s3')

        filename = "tiny1.bin"
        bucket = "verta-versioned-bucket"
        key = "tiny-files/{}".format(filename)
        s3_key = "s3://{}/{}".format(bucket, key)
        blob_path = "data"

        # get file contents directly from S3 for reference
        s3.download_file(bucket, key, filename)
        with open(filename, 'rb') as f:
            FILE_CONTENTS = f.read()
        os.remove(filename)

        # log & get dataset blob
        dataset_blob = verta.dataset.S3(s3_key, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        # download to implicit path
        filepath = dataset_blob.download(s3_key)
        assert os.path.isfile(filepath)
        assert filepath == os.path.abspath(filename)
        with open(filepath, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to implicit path without collision
        filepath2 = dataset_blob.download(s3_key)
        assert os.path.isfile(filepath2)
        assert filepath2 != filepath
        with open(filepath2, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(filepath)
        filepath3 = dataset_blob.download(s3_key, filepath)
        assert filepath3 == filepath
        with open(filepath3, 'rb') as f:
            assert f.read() == FILE_CONTENTS
        assert os.path.getmtime(filepath) > last_updated
Example #30
    def test_concat(self, dataset):
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        # two .file files in tiny-files/
        for filename in ["tiny{}.file".format(i) for i in range(2)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        # create and concatenate datasets
        dataset1 = verta.dataset.Path(
            "tiny-files/tiny0.file",
            enable_mdb_versioning=True,
        )
        dataset2 = verta.dataset.Path(
            "tiny-files/tiny1.file",
            enable_mdb_versioning=True,
        )
        dataset_blob = dataset1 + dataset2
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        dirpath = dataset_blob.download()
        dirpath = os.path.join(dirpath, reference_dir)  # "tiny-files/" nested in new dir
        assert_dirs_match(dirpath, reference_dir)