def download(self, cloud_path):
    """Download cityscapes dataset

    Note: The current implementation assumes a GCS cloud path.
    Should we keep this method here if we want to support other
    cloud storage systems?

    Args:
        cloud_path (str): cloud path of the dataset
    """
    path = Path(self.root)
    path.mkdir(parents=True, exist_ok=True)
    client = GCSClient()
    for zipfile in ZIPFILES:
        localfile = os.path.join(self.root, zipfile)
        if os.path.isfile(localfile):
            # TODO: Check file hash to verify file integrity
            logger.debug(f"File {localfile} exists. Skip download.")
            continue
        object_key = os.path.join(CITYSCAPES_GCS_PATH, zipfile)
        logger.debug(
            f"Downloading file {localfile} from gs://{const.GCS_BUCKET}/"
            f"{object_key}"
        )
        client.download(const.GCS_BUCKET, object_key, localfile)

def __init__(self, cloud_path, prefix, *, suffix=DEFAULT_SUFFIX):
    self._tempdir = tempfile.TemporaryDirectory().name
    self._client = GCSClient()
    self.cloud_path = cloud_path
    self._writer = LocalEstimatorWriter(
        self._tempdir, prefix, create_dir=True, suffix=suffix
    )

def test_MD5_hex():
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        actual_result = client._md5_hex(md5_hash)
        expected_result = "69b7"
        assert actual_result == expected_result

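# test_MD5_hex above exercises GCSClient._md5_hex. GCS reports blob.md5_hash
# as a base64-encoded digest, while local checksums are usually hex strings,
# so the helper plausibly converts between the two. A minimal standalone
# sketch (function name and implementation are assumptions inferred from the
# test, in which md5_hash decodes to bytes with hex representation "69b7"):
import base64
import binascii


def md5_hex(b64_md5_hash):
    """Convert a base64-encoded MD5 digest to a hex string."""
    return binascii.hexlify(base64.b64decode(b64_md5_hash)).decode("utf-8")
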
def test_gcs_client_download_folder_url(mock_download_folder, mock_is_file):
    mocked_gcs_client = MagicMock()
    mock_is_file.return_value = False
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        client.download(local_path=local_path, url=base_url)
        mock_download_folder.assert_called_with(
            mocked_gcs_client.get_bucket(), base_key, local_path
        )

def download(self, cloud_path): """Download nyu_v2 dataset The directory structure of the downloaded data is |--self.root |--nyudepth |--nyu_data.zip |--data |--nyu2_test.csv |--nyu2_test |--00000_colors.png |--00000_depth.png ... |--01448_colors.png |--01448_depth.png |--nyu2_train.csv |--nyu2_train |--basement_0001a_out |--1.jpg |--1.png ... |--281.jpg |--281.png ... |--study_room_0005b_out |--1.jpg |--1.png ... |--133.jpg |--133.png Args: cloud_path (str): cloud path of the dataset """ zip_file = os.path.join(self.root, ZIPFILE) unzip_dir = os.path.join(self.root, UNZIP_NAME) if os.path.isfile(zip_file): logger.debug(f"File {zip_file} exists. Skip download.") else: client = GCSClient() object_key = os.path.join(NYU_GCS_PATH, ZIPFILE) logger.debug( f"Downloading file {zip_file} from gs://{const.GCS_BUCKET}/" f"{object_key}") client.download( local_path=self.root, bucket=const.GCS_BUCKET, key=object_key, ) if os.path.isdir(unzip_dir): logger.debug(f"File {unzip_dir} exists. Skip unzip.") else: # unzip the file with ZipFile(zip_file, "r") as zip_ref: zip_ref.extractall(self.root) logger.debug(f"Unzip file from {zip_file}")
def test_download_validate(mock_checksum, mock_download_blob):
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_blob = MagicMock()
        client._download_validate(mocked_blob, local_path)
        mock_checksum.assert_called_with(mocked_blob, local_path)
        mock_download_blob.assert_called_with(mocked_blob, local_path)

def test_download_folder(mock_download_validate):
    object_key = "path/to" + file_name
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_blob = MagicMock()
        mocked_bucket = MagicMock()
        mocked_bucket.list_blobs = MagicMock(return_value=[mocked_blob])
        mocked_blob.name = object_key
        client._download_folder(mocked_bucket, object_key, local_path)
        mock_download_validate.assert_called_with(mocked_blob, local_path)

def test_gcs_client_download_file_bucket_key(mock_download_file, mock_is_file):
    mocked_gcs_client = MagicMock()
    mock_is_file.return_value = True
    object_key = base_key + file_name
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        client.download(
            local_path=local_path, bucket=bucket_name, key=object_key
        )
        mock_download_file.assert_called_with(
            mocked_gcs_client.get_bucket(), object_key, local_path
        )

def test_gcs_client_upload_file_bucket_key(mock_isdir, mock_upload_file):
    localfile = local_path + file_name
    mocked_gcs_client = MagicMock()
    mock_isdir.return_value = False
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        client.upload(local_path=localfile, bucket=bucket_name, key=base_key)
        mock_upload_file.assert_called_with(
            bucket=mocked_gcs_client.get_bucket(),
            key=base_key,
            local_path=localfile,
        )

def load_from_gcs(estimator, full_cloud_path):
    """Load estimator from checkpoint files on GCS.

    Args:
        estimator (datasetinsights.estimators.Estimator): datasetinsights
            estimator object.
        full_cloud_path: full path to the checkpoint file
    """
    filename = os.path.basename(full_cloud_path)
    with tempfile.TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, filename)
        logger.debug(f"Downloading estimator from {full_cloud_path} to {path}")
        client = GCSClient()
        client.download(local_path=temp_dir, url=full_cloud_path)
        estimator.load(path)

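# Hypothetical usage of load_from_gcs (the estimator construction and the
# checkpoint path below are illustrative assumptions, not real resources):
#
#   estimator = SomeEstimator(config=config)  # any datasetinsights Estimator
#   load_from_gcs(
#       estimator, "gs://my-bucket/checkpoints/SomeEstimator.estimator"
#   )
#   # estimator now holds the weights stored in the checkpoint file
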
def download(self, version="v1.0-mini"):
    """Download the specified nuscenes dataset version to /data/sets/nuscenes."""
    self.cloud_client = GCSClient()
    if version == "v1.0-mini":
        return self._download_mini()
    elif version == "v1.0-trainval":
        meta_local_path, trainval_tars = self._download_trainval_tars()
        with tarfile.open(meta_local_path) as t:
            t.extractall(self.root)
        # extract the first data blob directly into the canonical directory
        first_data_blob = trainval_tars[0]
        with tarfile.open(first_data_blob) as t:
            logger.info(f"extracting files from {first_data_blob}")
            t.extractall(self.root)
        os.remove(first_data_blob)
        # extract remaining blobs to a temp dir, then merge into the root
        for tar_path in trainval_tars[1:]:
            with tarfile.open(tar_path) as t:
                logger.info(f"extracting files from {tar_path}")
                with tempfile.TemporaryDirectory() as tmpdirname:
                    t.extractall(tmpdirname)
                    self._merge_data_blobs(
                        blob=tmpdirname, canonical_dir=self.root
                    )
        return self.root
    else:
        raise ValueError(f"Unsupported nuscenes version: {version}")

def download(self, cloud_path=KITTI_GCS_PATH):
    logger.info("downloading kitti dataset from cloud storage")
    # TODO: currently only downloads left color images
    cloud_client = GCSClient()
    self._download_sample_indices_file(cloud_client=cloud_client)
    calib_zip, left_images_zip, labels_zip = self.download_kitti_zips(
        cloud_client=cloud_client
    )
    with zipfile.ZipFile(left_images_zip, "r") as zip_ref:
        zip_ref.extractall(self.root)
    testing_dir = os.path.join(self.root, "testing")
    training_dir = os.path.join(self.root, "training")
    self._unzip2dir(
        zip_path=calib_zip,
        src=os.path.join("testing", "calib"),
        dst=os.path.join(testing_dir, "calib"),
    )
    self._unzip2dir(
        zip_path=calib_zip,
        src=os.path.join("training", "calib"),
        dst=os.path.join(training_dir, "calib"),
    )
    self._unzip2dir(
        zip_path=labels_zip,
        src=os.path.join("training", "label_2"),
        dst=os.path.join(training_dir, "label_2"),
    )

def test_is_file():
    object_key = base_key + file_name
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_bucket.get_blob = MagicMock(return_value=mocked_blob)
        mocked_blob.name = object_key
        actual_result = client._is_file(mocked_bucket, object_key)
        mocked_bucket.get_blob.assert_called_with(object_key)
        expected_result = True
        assert actual_result == expected_result

def test_download_file(mock_download_validate):
    object_key = base_key + file_name
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_bucket.get_blob = MagicMock(return_value=mocked_blob)
        mocked_blob.name = object_key
        client._download_file(mocked_bucket, object_key, local_path)
        mocked_bucket.get_blob.assert_called_with(object_key)
        mock_download_validate.assert_called_with(
            mocked_blob, local_path + file_name
        )

def test_download_blob(mock_isdir):
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        mock_isdir.return_value = True
        object_key = base_key + file_name
        client = GCSClient()
        mocked_blob = MagicMock()
        mocked_blob.name = object_key
        mocked_download_blob = MagicMock()
        mocked_blob.download_to_filename = mocked_download_blob
        client._download_blob(mocked_blob, local_path)
        mocked_blob.download_to_filename.assert_called_with(local_path)

def test_gcs_parse():
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        th_bucket = "some_bucket_name"
        th_path = "some/cloud/path"
        url = "gs://some_bucket_name/some/cloud/path"
        bucket, path = client._parse(url)
        assert (bucket, path) == (th_bucket, th_path)

        bad_url = "s3://path/to/bad/url"
        with pytest.raises(ValueError, match=r"Specified destination prefix:"):
            client._parse(bad_url)

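# A minimal sketch consistent with test_gcs_parse above; the real
# GCSClient._parse may differ. The function name and the exact error message
# are assumptions (the test only requires the message to start with
# "Specified destination prefix:"):
def parse_gcs_url(url):
    """Split a gs://bucket/key url into a (bucket, key) pair."""
    prefix = "gs://"
    if not url.startswith(prefix):
        raise ValueError(
            f"Specified destination prefix: {url} does not start with {prefix}"
        )
    bucket, _, key = url[len(prefix):].partition("/")
    return bucket, key
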
def test_upload_file():
    localfile = local_path + file_name
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_gcs_client.get_bucket = MagicMock(return_value=mocked_bucket)
        mocked_bucket.blob = MagicMock(return_value=mocked_blob)
        mocked_blob.upload_from_filename = MagicMock()
        client._upload_file(
            local_path=localfile, bucket=mocked_bucket, key=base_key
        )
        mocked_blob.upload_from_filename.assert_called_with(localfile)

def test_checksum(mock_checksum, mock_md5_hex):
    local_file_path = local_path + file_name
    mocked_gcs_client = MagicMock()
    mock_md5_hex.return_value = md5_hash_hex
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_gcs_client.get_bucket = MagicMock(return_value=mocked_bucket)
        mocked_bucket.get_blob = MagicMock(return_value=mocked_blob)
        mocked_blob.md5_hash = md5_hash
        client._checksum(mocked_blob, local_file_path)
        mock_checksum.assert_called_with(
            local_file_path, md5_hash_hex, algorithm="MD5"
        )

class GCSDatasetDownloader(DatasetDownloader, protocol="gs://"):
    """This class is used to download data from GCS."""

    def __init__(self, **kwargs):
        """Initializes a GCSDatasetDownloader."""
        self.client = GCSClient()

    def download(self, source_uri=None, output=None, **kwargs):
        """
        Args:
            source_uri: This is the downloader-uri that indicates where on
                GCS the dataset should be downloaded from. The expected
                source-uri follows these patterns:
                gs://bucket/folder or gs://bucket/folder/data.zip

            output: This is the path to the directory where the download
                will store the dataset.
        """
        self.client.download(local_path=output, url=source_uri)

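# Hypothetical usage of GCSDatasetDownloader (bucket and paths are made up
# for illustration). Because the class registers itself under the "gs://"
# protocol, it is typically resolved from the source-uri by a downloader
# factory, but direct construction works the same way:
#
#   downloader = GCSDatasetDownloader()
#   downloader.download(
#       source_uri="gs://my-bucket/datasets/data.zip", output="/tmp/data"
#   )
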
class GCSEstimatorWriter:
    """Writes (saves) estimator checkpoints on GCS.

    Args:
        cloud_path (str): GCS cloud path (e.g. gs://bucket/path/to/directory)
        prefix (str): filename prefix of the checkpoint files
        suffix (str): filename suffix of the checkpoint files
    """

    def __init__(self, cloud_path, prefix, *, suffix=DEFAULT_SUFFIX):
        self._tempdir = tempfile.TemporaryDirectory().name
        self._client = GCSClient()
        self._bucket, self._gcs_path = gcs_bucket_and_path(cloud_path)
        self._writer = LocalEstimatorWriter(
            self._tempdir, prefix, create_dir=True, suffix=suffix
        )

    def save(self, estimator, epoch=None):
        """Save estimator to checkpoint files on GCS.

        Args:
            estimator (datasetinsights.estimators.Estimator): datasetinsights
                estimator object.
            epoch (int): the current epoch number. Default: None

        Returns:
            Full GCS cloud path to the saved checkpoint file.
        """
        path = self._writer.save(estimator, epoch)
        filename = os.path.basename(path)
        object_key = os.path.join(self._gcs_path, filename)
        full_cloud_path = f"gs://{self._bucket}/{object_key}"
        logger.debug(f"Copying estimator from {path} to {full_cloud_path}")
        self._client.upload(path, self._bucket, object_key)
        return full_cloud_path

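# Hypothetical save/restore round trip with GCSEstimatorWriter and
# load_from_gcs (cloud path, prefix, and estimator are illustrative
# assumptions):
#
#   writer = GCSEstimatorWriter("gs://my-bucket/checkpoints", prefix="FasterRCNN")
#   full_cloud_path = writer.save(estimator, epoch=5)
#   load_from_gcs(estimator, full_cloud_path)
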
def download(self, cloud_path=COCO_GCS_PATH):
    path = Path(self.root)
    path.mkdir(parents=True, exist_ok=True)
    client = GCSClient()

    annotations_zip_gcs = f"{cloud_path}/annotations_trainval2017.zip"
    annotations_zip_2017 = self._get_local_annotations_zip()
    logger.info("checking for local copy of data")
    if not os.path.exists(annotations_zip_2017):
        logger.info("no annotations zip file found, will download")
        client.download(
            bucket_name=const.GCS_BUCKET,
            object_key=annotations_zip_gcs,
            localfile=annotations_zip_2017,
        )
        with zipfile.ZipFile(annotations_zip_2017, "r") as zip_dir:
            zip_dir.extractall(self.root)

    images_local = self._get_local_images_zip()
    images_gcs = f"{cloud_path}/{self.split}2017.zip"
    if not os.path.exists(images_local):
        logger.info(
            f"no zip file for images for {self.split} found, will download"
        )
        client.download(
            bucket_name=const.GCS_BUCKET,
            object_key=images_gcs,
            localfile=images_local,
        )
        with zipfile.ZipFile(images_local, "r") as zip_dir:
            zip_dir.extractall(self.root)

def test_gcs_client_wrapper():
    bucket_name = "fake_bucket"
    object_key = "path/to/object"
    localfile = "path/to/local/file"
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_gcs_client.get_bucket = MagicMock(return_value=mocked_bucket)
        mocked_bucket.blob = MagicMock(return_value=mocked_blob)

        mocked_blob.download_to_filename = MagicMock()
        client.download(bucket_name, object_key, localfile)
        mocked_gcs_client.get_bucket.assert_called_with(bucket_name)
        mocked_bucket.blob.assert_called_with(object_key)
        mocked_blob.download_to_filename.assert_called_with(localfile)

        mocked_blob.upload_from_filename = MagicMock()
        client.upload(localfile, bucket_name, object_key)
        mocked_blob.upload_from_filename.assert_called_with(localfile)

def download(self): """Download dataset from GCS """ cloud_path = f"gs://{const.GCS_BUCKET}/{self.GCS_PATH}" client = GCSClient() # download label file label_zip = self.LABEL_ZIP client.download(url=cloud_path, local_path=self.root) with zipfile.ZipFile(label_zip, "r") as zip_dir: zip_dir.extractall(self.root) # download tfexamples for a dataset split tfexamples_zip = self.SPLITS_ZIP.get(self.split) client.download(url=cloud_path, local_path=self.root) with zipfile.ZipFile(tfexamples_zip, "r") as zip_dir: zip_dir.extractall(self.root)
def test_checksum_error(mock_checksum, mock_remove):
    local_file_path = local_path + file_name
    mocked_gcs_client = MagicMock()
    with patch(
        "datasetinsights.io.gcs.Client",
        MagicMock(return_value=mocked_gcs_client),
    ):
        client = GCSClient()
        mocked_bucket = MagicMock()
        mocked_blob = MagicMock()
        mocked_gcs_client.get_bucket = MagicMock(return_value=mocked_bucket)
        mocked_bucket.get_blob = MagicMock(return_value=mocked_blob)
        mocked_blob.md5_hash = md5_hash
        client._md5_hex = MagicMock(return_value=md5_hash_hex)
        client._checksum(mocked_blob, local_file_path)

        mock_checksum.side_effect = ChecksumError
        with pytest.raises(ChecksumError):
            client._checksum(mocked_blob, local_file_path)
        mock_remove.assert_called_once()

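# Pieced together from test_checksum and test_checksum_error above,
# GCSClient._checksum plausibly converts blob.md5_hash to hex, validates the
# downloaded file against it, and removes the file on a mismatch before
# re-raising. A sketch under those assumptions (the real method body may
# differ; validate_checksum and ChecksumError are the objects the tests
# patch and reference):
#
#   def _checksum(self, blob, filename):
#       expected_hex = self._md5_hex(blob.md5_hash)
#       try:
#           validate_checksum(filename, expected_hex, algorithm="MD5")
#       except ChecksumError:
#           logger.info(f"Checksum mismatch. Deleting {filename}.")
#           os.remove(filename)
#           raise
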