Пример #1
0
    def download(self, cloud_path):
        """Download the cityscapes zip archives that are missing locally.

        Every name in ``ZIPFILES`` is looked up under ``self.root``. Names
        that already exist as local files are skipped; the others are
        downloaded from ``const.GCS_BUCKET``.

        Note:
            The current implementation assumes a GCS cloud path.
            Should we keep this method here if we want to support other cloud
            storage system?

        Args:
            cloud_path (str): cloud path of the dataset. Not read by this
                method: object keys are built from ``CITYSCAPES_GCS_PATH``.
        """
        Path(self.root).mkdir(parents=True, exist_ok=True)

        # Built on first use and shared by all downloads, so that no client
        # is created per file, nor at all when every archive is present.
        client = None
        for zip_name in ZIPFILES:
            localfile = os.path.join(self.root, zip_name)
            if os.path.isfile(localfile):
                # TODO: Check file hash to verify file integrity
                logger.debug(f"File {localfile} exists. Skip download.")
                continue
            if client is None:
                client = GCSClient()
            object_key = os.path.join(CITYSCAPES_GCS_PATH, zip_name)

            logger.debug(
                f"Downloading file {localfile} from gs://{const.GCS_BUCKET}/"
                f"{object_key}")
            client.download(const.GCS_BUCKET, object_key, localfile)
Пример #2
0
 def __init__(self, cloud_path, prefix, *, suffix=DEFAULT_SUFFIX):
     """Set up the staging directory, the GCS client and the local writer.

     Args:
         cloud_path (str): cloud path, stored as ``self.cloud_path``.
         prefix (str): forwarded to ``LocalEstimatorWriter``.
         suffix (str): forwarded to ``LocalEstimatorWriter``.
     """
     # Keep the TemporaryDirectory object referenced. Taking only ``.name``
     # lets the object be collected right away, which removes the directory
     # and leaves whatever is recreated at that path without any cleanup.
     self._tempdir_handle = tempfile.TemporaryDirectory()
     self._tempdir = self._tempdir_handle.name
     self._client = GCSClient()
     self.cloud_path = cloud_path
     self._writer = LocalEstimatorWriter(self._tempdir,
                                         prefix,
                                         create_dir=True,
                                         suffix=suffix)
Пример #3
0
def test_MD5_hex():
    """``GCSClient._md5_hex(md5_hash)`` is expected to return ``"69b7"``."""
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    # ``datasetinsights.io.gcs.Client`` is swapped for a factory that hands
    # out the mock while the client is built and used.
    with patch("datasetinsights.io.gcs.Client", client_factory):
        hex_digest = GCSClient()._md5_hex(md5_hash)
    assert hex_digest == "69b7"
Пример #4
0
def test_gcs_client_download_folder_url(mock_download_folder, mock_is_file):
    """With the is-file mock returning False, a URL download must reach the
    folder-download mock with the bucket, ``base_key`` and ``local_path``.

    The two mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    mock_is_file.return_value = False
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        gcs.download(local_path=local_path, url=base_url)
        expected_bucket = fake_storage_client.get_bucket()
        mock_download_folder.assert_called_with(
            expected_bucket, base_key, local_path)
Пример #5
0
    def download(self, cloud_path):
        """Fetch the nyu_v2 archive and unpack it under ``self.root``.

        Each step is skipped when its result is already on disk: the
        download when the zip file exists, the extraction when the unzip
        directory exists.

        The directory structure of the downloaded data is
        |--self.root
           |--nyudepth
               |--nyu_data.zip
               |--data
                   |--nyu2_test.csv
                   |--nyu2_test
                         |--00000_colors.png
                         |--00000_depth.png ...
                         |--01448_colors.png
                         |--01448_depth.png
                   |--nyu2_train.csv
                   |--nyu2_train
                         |--basement_0001a_out
                              |--1.jpg
                              |--1.png ...
                              |--281.jpg
                              |--281.png
                         ...
                         |--study_room_0005b_out
                              |--1.jpg
                              |--1.png ...
                              |--133.jpg
                              |--133.png
        Args:
            cloud_path (str): cloud path of the dataset
        """
        archive = os.path.join(self.root, ZIPFILE)
        extracted_dir = os.path.join(self.root, UNZIP_NAME)

        # Step 1: make sure the archive is available locally.
        if not os.path.isfile(archive):
            gcs = GCSClient()
            remote_key = os.path.join(NYU_GCS_PATH, ZIPFILE)

            logger.debug(
                f"Downloading file {archive} from gs://{const.GCS_BUCKET}/"
                f"{remote_key}")
            gcs.download(
                local_path=self.root,
                bucket=const.GCS_BUCKET,
                key=remote_key,
            )
        else:
            logger.debug(f"File {archive} exists. Skip download.")

        # Step 2: unpack it unless the target directory is already there.
        if not os.path.isdir(extracted_dir):
            with ZipFile(archive, "r") as opened_archive:
                opened_archive.extractall(self.root)
                logger.debug(f"Unzip file from {archive}")
        else:
            logger.debug(f"File {extracted_dir} exists. Skip unzip.")
Пример #6
0
def test_download_validate(mock_checksum, mock_download_blob):
    """``_download_validate`` must call both the checksum mock and the
    blob-download mock with the blob and ``local_path``.
    """
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        blob = MagicMock()
        gcs._download_validate(blob, local_path)
    mock_checksum.assert_called_with(blob, local_path)
    mock_download_blob.assert_called_with(blob, local_path)
Пример #7
0
def test_download_folder(mock_download_validate):
    """``_download_folder`` must hand the blob listed by the bucket to the
    download-validate mock together with ``local_path``.
    """
    blob_key = "path/to" + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        blob = MagicMock()
        blob.name = blob_key
        bucket = MagicMock()
        # The bucket lists exactly one blob.
        bucket.list_blobs.return_value = [blob]
        gcs._download_folder(bucket, blob_key, local_path)
    mock_download_validate.assert_called_with(blob, local_path)
Пример #8
0
def test_gcs_client_download_file_bucket_key(mock_download_file, mock_is_file):
    """With the is-file mock returning True, a bucket/key download must
    reach the file-download mock with the bucket, the key and ``local_path``.
    """
    mock_is_file.return_value = True
    blob_key = base_key + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        gcs.download(key=blob_key, bucket=bucket_name, local_path=local_path)
        expected_bucket = fake_storage_client.get_bucket()
        mock_download_file.assert_called_with(
            expected_bucket, blob_key, local_path)
Пример #9
0
def test_gcs_client_upload_file_bucket_key(mock_isdir, mock_upload_file):
    """With the is-dir mock returning False, a bucket/key upload must reach
    the file-upload mock with the bucket, ``base_key`` and the local file.
    """
    mock_isdir.return_value = False
    source_file = local_path + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        gcs.upload(key=base_key, bucket=bucket_name, local_path=source_file)
        expected_bucket = fake_storage_client.get_bucket()
        mock_upload_file.assert_called_with(
            local_path=source_file,
            key=base_key,
            bucket=expected_bucket,
        )
Пример #10
0
def load_from_gcs(estimator, full_cloud_path):
    """Restore an estimator from a checkpoint file stored on GCS.

    The checkpoint is downloaded into a temporary directory and passed to
    ``estimator.load``; the directory is discarded when loading is done.

    Args:
        estimator (datasetinsights.estimators.Estimator):
            datasetinsights estimator object.
        full_cloud_path (str): full path to the checkpoint file

    """
    checkpoint_name = os.path.basename(full_cloud_path)
    with tempfile.TemporaryDirectory() as staging_dir:
        local_checkpoint = os.path.join(staging_dir, checkpoint_name)
        logger.debug(
            f"Downloading estimator from {full_cloud_path} "
            f"to {local_checkpoint}")
        GCSClient().download(local_path=staging_dir, url=full_cloud_path)
        estimator.load(local_checkpoint)
Пример #11
0
 def download(self, version="v1.0-mini"):
     """Download and unpack the requested nuscenes dataset version.

     Args:
         version (str): ``"v1.0-mini"`` or ``"v1.0-trainval"``.

     Returns:
         For ``"v1.0-mini"``, whatever ``self._download_mini()`` returns.
         For ``"v1.0-trainval"``, ``self.root``, where the tar files were
         extracted.

     Raises:
         ValueError: if ``version`` is not one of the supported values.
     """
     supported_versions = ("v1.0-mini", "v1.0-trainval")
     if version not in supported_versions:
         # Reject bad input before any client is created, and name the
         # offending value instead of raising an empty ValueError.
         raise ValueError(
             f"Unsupported nuscenes version {version!r}; "
             f"expected one of {supported_versions}.")

     self.cloud_client = GCSClient()
     if version == "v1.0-mini":
         return self._download_mini()

     meta_local_path, trainval_tars = self._download_trainval_tars()
     with tarfile.open(meta_local_path) as t:
         t.extractall(self.root)
     # The first data tar is extracted straight into the root and removed.
     first_data_blob = trainval_tars[0]
     with tarfile.open(first_data_blob) as t:
         logger.info(f"extracting files from {first_data_blob}")
         t.extractall(self.root)
     os.remove(first_data_blob)
     # The remaining tars go through a temporary directory and are merged
     # into the root by ``_merge_data_blobs``.
     for tar_path in trainval_tars[1:]:
         with tarfile.open(tar_path) as t:
             logger.info(f"extracting files from {tar_path}")
             with tempfile.TemporaryDirectory() as tmpdirname:
                 t.extractall(tmpdirname)
                 self._merge_data_blobs(blob=tmpdirname,
                                        canonical_dir=self.root)
     return self.root
Пример #12
0
 def download(self, cloud_path=KITTI_GCS_PATH):
     """Download the kitti archives and unpack them below ``self.root``.

     The left-image archive is extracted as a whole. From the calibration
     and label archives only selected sub-directories are unpacked, via
     ``self._unzip2dir``.

     Args:
         cloud_path (str): not read by this method.
     """
     logger.info("downloading kitti dataset from cloud storage")
     # todo is currently only downloading left color images
     gcs = GCSClient()
     self._download_sample_indices_file(cloud_client=gcs)
     downloaded = self.download_kitti_zips(cloud_client=gcs)
     calib_zip, left_images_zip, labels_zip = downloaded
     with zipfile.ZipFile(left_images_zip, "r") as images_archive:
         images_archive.extractall(self.root)

     # (archive, split, folder) triples, unpacked in this order.
     partial_extractions = (
         (calib_zip, "testing", "calib"),
         (calib_zip, "training", "calib"),
         (labels_zip, "training", "label_2"),
     )
     for archive, split_dir, folder in partial_extractions:
         self._unzip2dir(
             zip_path=archive,
             src=os.path.join(split_dir, folder),
             dst=os.path.join(self.root, split_dir, folder),
         )
Пример #13
0
def test_is_file():
    """``_is_file`` must look the key up with ``get_blob`` and report True
    when the returned blob carries that same key as its name.
    """
    blob_key = base_key + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        bucket = MagicMock()
        blob = MagicMock()
        bucket.get_blob.return_value = blob
        blob.name = blob_key
        outcome = gcs._is_file(bucket, blob_key)
    bucket.get_blob.assert_called_with(blob_key)
    expected = True
    assert outcome == expected
Пример #14
0
def test_download_file(mock_download_validate):
    """``_download_file`` must fetch the blob by key and pass it to the
    download-validate mock with ``local_path + file_name`` as destination.
    """
    blob_key = base_key + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        bucket = MagicMock()
        blob = MagicMock()
        bucket.get_blob.return_value = blob
        blob.name = blob_key
        gcs._download_file(bucket, blob_key, local_path)
    bucket.get_blob.assert_called_with(blob_key)
    expected_destination = local_path + file_name
    mock_download_validate.assert_called_with(blob, expected_destination)
Пример #15
0
def test_download_blob(mock_isdir):
    """``_download_blob`` must call ``blob.download_to_filename`` with
    ``local_path`` when the is-dir mock returns True.
    """
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        mock_isdir.return_value = True
        gcs = GCSClient()
        blob = MagicMock()
        blob.name = base_key + file_name
        download_spy = MagicMock()
        blob.download_to_filename = download_spy

        gcs._download_blob(blob, local_path)
    download_spy.assert_called_with(local_path)
Пример #16
0
def test_gcs_parse():
    """``_parse`` must split a ``gs://`` URL into bucket and path, and raise
    ``ValueError`` for a URL with another scheme.
    """
    expected = ("some_bucket_name", "some/cloud/path")
    good_url = "gs://some_bucket_name/some/cloud/path"
    bad_url = "s3://path/to/bad/url"

    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()

        bucket, path = gcs._parse(good_url)
        assert (bucket, path) == expected

        with pytest.raises(ValueError, match=r"Specified destination prefix:"):
            gcs._parse(bad_url)
Пример #17
0
def test_upload_file():
    """``_upload_file`` must create a blob on the bucket and call its
    ``upload_from_filename`` with the local file.
    """
    source_file = local_path + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        bucket = MagicMock()
        blob = MagicMock()
        fake_storage_client.get_bucket.return_value = bucket
        bucket.blob.return_value = blob

        gcs._upload_file(key=base_key, bucket=bucket, local_path=source_file)
    blob.upload_from_filename.assert_called_with(source_file)
Пример #18
0
def test_checksum(mock_checksum, mock_md5_hex):
    """``_checksum`` must call the checksum mock with the local file, the
    hex digest returned by the md5-hex mock, and ``algorithm="MD5"``.
    """
    mock_md5_hex.return_value = md5_hash_hex
    target_file = local_path + file_name
    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        bucket = MagicMock()
        blob = MagicMock()
        fake_storage_client.get_bucket.return_value = bucket
        bucket.get_blob.return_value = blob
        blob.md5_hash = md5_hash
        gcs._checksum(blob, target_file)
    mock_checksum.assert_called_with(
        target_file, md5_hash_hex, algorithm="MD5")
Пример #19
0
class GCSDatasetDownloader(DatasetDownloader, protocol="gs://"):
    """Dataset downloader for sources on GCS.

    The class is declared with ``protocol="gs://"`` and delegates the
    actual transfer to a ``GCSClient``.
    """
    def __init__(self, **kwargs):
        """Create the ``GCSClient`` used for downloads.

        Args:
            **kwargs: accepted but not read by this initializer.
        """
        self.client = GCSClient()

    def download(self, source_uri=None, output=None, **kwargs):
        """Download ``source_uri`` into ``output`` through the GCS client.

        Args:
            source_uri: This is the downloader-uri that indicates where on
                GCS the dataset should be downloaded from.
                The expected source-uri follows these patterns
                gs://bucket/folder or gs://bucket/folder/data.zip

            output: This is the path to the directory
                where the download will store the dataset.

            **kwargs: accepted but not read by this method.
        """
        self.client.download(url=source_uri, local_path=output)
Пример #20
0
class GCSEstimatorWriter:
    """Writes (saves) estimator checkpoints on GCS.

    Checkpoints are first written into a local temporary directory by a
    ``LocalEstimatorWriter`` and then uploaded with a ``GCSClient``.

    Args:
        cloud_path (str): GCS cloud path (e.g. gs://bucket/path/to/directory)
        prefix (str): filename prefix of the checkpoint files
        suffix (str): filename suffix of the checkpoint files

    """
    def __init__(self, cloud_path, prefix, *, suffix=DEFAULT_SUFFIX):
        # Keep the TemporaryDirectory object referenced. Taking only
        # ``.name`` lets the object be collected right away, which removes
        # the directory and leaves whatever is recreated at that path
        # without any cleanup.
        self._tempdir_handle = tempfile.TemporaryDirectory()
        self._tempdir = self._tempdir_handle.name
        self._client = GCSClient()
        self._bucket, self._gcs_path = gcs_bucket_and_path(cloud_path)
        self._writer = LocalEstimatorWriter(self._tempdir,
                                            prefix,
                                            create_dir=True,
                                            suffix=suffix)

    def save(self, estimator, epoch=None):
        """Save estimator to checkpoint files on GCS.

        The checkpoint is written locally first, then uploaded under the
        same file name below the configured GCS path.

        Args:
            estimator (datasetinsights.estimators.Estimator):
                datasetinsights estimator object.
            epoch (int): the current epoch number. Default: None

        Returns:
            Full GCS cloud path to the saved checkpoint file.

        """
        path = self._writer.save(estimator, epoch)
        filename = os.path.basename(path)
        object_key = os.path.join(self._gcs_path, filename)

        full_cloud_path = f"gs://{self._bucket}/{object_key}"

        logger.debug(f"Copying estimator from {path} to {full_cloud_path}")
        self._client.upload(path, self._bucket, object_key)

        return full_cloud_path
Пример #21
0
 def download(self, cloud_path=COCO_GCS_PATH):
     """Download and unpack the annotation and image archives if missing.

     An archive is downloaded from ``const.GCS_BUCKET`` and extracted into
     ``self.root`` only when its local zip file does not exist yet.

     Args:
         cloud_path (str): prefix of the object keys of both archives.
     """
     Path(self.root).mkdir(parents=True, exist_ok=True)
     gcs = GCSClient()

     def fetch_and_unpack(object_key, localfile):
         # Download one archive, then extract it into the dataset root.
         gcs.download(
             bucket_name=const.GCS_BUCKET,
             object_key=object_key,
             localfile=localfile,
         )
         with zipfile.ZipFile(localfile, "r") as archive:
             archive.extractall(self.root)

     remote_annotations = f"{cloud_path}/annotations_trainval2017.zip"
     local_annotations = self._get_local_annotations_zip()
     logger.info("checking for local copy of data")
     if not os.path.exists(local_annotations):
         logger.info("no annotations zip file found, will download.")
         fetch_and_unpack(remote_annotations, local_annotations)

     local_images = self._get_local_images_zip()
     remote_images = f"{cloud_path}/{self.split}2017.zip"
     if not os.path.exists(local_images):
         logger.info(f"no zip file for images for {self.split} "
                     f"found, will download")
         fetch_and_unpack(remote_images, local_images)
Пример #22
0
def test_gcs_client_warpper():
    """``download`` and ``upload`` must reach the bucket and blob mocks with
    the given bucket name, object key and local file.
    """
    bucket_name = "fake_bucket"
    object_key = "path/to/object"
    localfile = "path/to/local/file"

    fake_storage_client = MagicMock()
    client_factory = MagicMock(return_value=fake_storage_client)
    with patch("datasetinsights.io.gcs.Client", client_factory):
        gcs = GCSClient()
        bucket = MagicMock()
        blob = MagicMock()
        fake_storage_client.get_bucket.return_value = bucket
        bucket.blob.return_value = blob

        # Download path.
        gcs.download(bucket_name, object_key, localfile)
        fake_storage_client.get_bucket.assert_called_with(bucket_name)
        bucket.blob.assert_called_with(object_key)
        blob.download_to_filename.assert_called_with(localfile)

        # Upload path.
        gcs.upload(localfile, bucket_name, object_key)
        blob.upload_from_filename.assert_called_with(localfile)
Пример #23
0
    def download(self):
        """Download dataset from GCS and extract the label and split zips.

        The label archive (``self.LABEL_ZIP``) is handled first, then the
        tfexamples archive registered for ``self.split`` in
        ``self.SPLITS_ZIP``.
        """
        source_url = f"gs://{const.GCS_BUCKET}/{self.GCS_PATH}"
        gcs = GCSClient()

        # NOTE(review): the same URL is downloaded before each extraction
        # and the archive names are opened as given (not joined with
        # ``self.root``) -- confirm that both are intended.
        def download_then_extract(archive):
            gcs.download(url=source_url, local_path=self.root)
            with zipfile.ZipFile(archive, "r") as opened_archive:
                opened_archive.extractall(self.root)

        # download label file
        download_then_extract(self.LABEL_ZIP)

        # download tfexamples for a dataset split
        download_then_extract(self.SPLITS_ZIP.get(self.split))
Пример #24
0
def test_checksum_error(mock_checksum, mock_remove):
    """A failing checksum must raise ``ChecksumError`` and trigger the
    remove mock exactly once.

    The two mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    local_file_path = local_path + file_name
    mocked_gcs_client = MagicMock()
    with patch(
            "datasetinsights.io.gcs.Client",
            MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_gcs_client.get_bucket = MagicMock(return_value=mocked_bucket)
        mocked_bucket.get_blob = MagicMock(return_value=mocked_blob)
        mocked_blob.md5_hash = md5_hash
        client._MD5_hex = MagicMock(return_value=md5_hash_hex)
        # First call: the checksum mock does not raise.
        client._checksum(mocked_blob, local_file_path)

        # Second call: the checksum mock raises ChecksumError.
        mock_checksum.side_effect = ChecksumError
        with pytest.raises(ChecksumError):
            client._checksum(mocked_blob, local_file_path)
        # This check has to sit outside the ``raises`` block: a statement
        # placed after the raising call inside it is never executed.
        mock_remove.assert_called_once()
Пример #25
0
 def __init__(self, **kwargs):
     """Initialize the GCS downloader by creating its ``GCSClient``.

     Args:
         **kwargs: accepted but not read by this initializer.
     """
     self.client = GCSClient()