def __init__(self, cluster_metadata: MasterURLIdentifier) -> None:
  """Initializes the DataprocClusterManager with properties required
  to interface with the Dataproc ClusterControllerClient.
  """
  self.cluster_metadata = cluster_metadata
  if self.cluster_metadata.region == 'global':
    # The global region is unsupported as it will be eventually deprecated.
    raise ValueError('Clusters in the global region are not supported.')
  if not self.cluster_metadata.region:
    # Fall back to a sensible default region when none was supplied.
    _LOGGER.warning(
        'No region information was detected, defaulting Dataproc cluster '
        'region to: us-central1.')
    self.cluster_metadata.region = 'us-central1'
  if not self.cluster_metadata.cluster_name:
    self.cluster_metadata.cluster_name = (
        ie.current_env().clusters.default_cluster_name)
  # The client endpoint is regional, so it must be built after the region
  # default above has been applied.
  endpoint = f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
  self._cluster_client = dataproc_v1.ClusterControllerClient(
      client_options={'api_endpoint': endpoint})
  # Reuse the master url/dashboard of an already-known cluster, if any.
  known_urls = ie.current_env().clusters.master_urls.inverse
  if self.cluster_metadata in known_urls:
    self.master_url = known_urls[self.cluster_metadata]
    self.dashboard = ie.current_env().clusters.master_urls_to_dashboards[
        self.master_url]
  else:
    self.master_url = None
    self.dashboard = None
  self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
  self._staging_directory = None
def cleanup(self):
  """Removes the cache directory and resets the saved pcoder registry."""
  cache_dir = self._cache_dir
  if cache_dir.startswith('gs://'):
    # GCS-backed caches are deleted through the GCS file system directly.
    from apache_beam.io.gcp import gcsfilesystem
    from apache_beam.options.pipeline_options import PipelineOptions
    gcs_fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    gcs_fs.delete([cache_dir + '/full/'])
  elif filesystems.FileSystems.exists(cache_dir):
    filesystems.FileSystems.delete([cache_dir])
  self._saved_pcoders = {}
def test_split(self):
  """split() returns (head, tail) for gs:// paths and rejects others."""
  fs = gcsfilesystem.GCSFileSystem()
  cases = [
      ('gs://foo/bar/baz', ('gs://foo/bar', 'baz')),
      ('gs://foo/', ('gs://foo', '')),
      ('gs://foo', ('gs://foo', '')),
  ]
  for path, expected in cases:
    self.assertEqual(expected, fs.split(path))
  # Paths without the gs:// prefix are invalid.
  with self.assertRaises(ValueError):
    fs.split('/no/gcs/prefix')
def test_open(self, mock_gcsio):
  """open() delegates to GcsIO.open in binary-read mode with a mime type."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock

  fs = gcsfilesystem.GCSFileSystem()
  _ = fs.open('gs://bucket/from1', 'application/octet-stream')

  gcsio_mock.open.assert_called_once_with(
      'gs://bucket/from1', 'rb', mime_type='application/octet-stream')
def test_copy_tree(self, mock_gcsio):
  """Copying directory paths (trailing slash) uses GcsIO.copytree."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock

  fs = gcsfilesystem.GCSFileSystem()
  fs.copy(['gs://bucket1/'], ['gs://bucket2/'])

  gcsio_mock.copytree.assert_called_once_with('gs://bucket1/', 'gs://bucket2/')
def test_copy_file(self, mock_gcsio):
  """Copying single file paths uses GcsIO.copy."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock

  fs = gcsfilesystem.GCSFileSystem()
  fs.copy(['gs://bucket/from1'], ['gs://bucket/to1'])

  gcsio_mock.copy.assert_called_once_with(
      'gs://bucket/from1', 'gs://bucket/to1')
def __init__(self, cluster_metadata: ClusterMetadata) -> None:
  """Initializes the DataprocClusterManager with properties required
  to interface with the Dataproc ClusterControllerClient.
  """
  self.cluster_metadata = cluster_metadata
  # Pipelines whose jobs are executed on the cluster.
  self.pipelines = set()
  # The Dataproc API endpoint is regional.
  endpoint = f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
  self._cluster_client = dataproc_v1.ClusterControllerClient(
      client_options={'api_endpoint': endpoint})
  self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
  self._staging_directory = None
def save_np_image(self, np_image, destination):
  """Encodes a numpy image as JPEG and writes it to a GCS destination.

  Args:
    np_image: numpy array holding the image pixels
      (assumes a layout PIL.Image.fromarray accepts — TODO confirm).
    destination: gs:// path the JPEG bytes are written to.
  """
  import io
  import PIL.Image
  final_image = PIL.Image.fromarray(np_image)
  buffer = io.BytesIO()
  final_image.save(buffer, format='JPEG')
  result_bytes = buffer.getvalue()
  from apache_beam.io.gcp import gcsfilesystem
  file_system = gcsfilesystem.GCSFileSystem()
  # 'out' instead of 'file' to avoid shadowing the builtin.
  out = file_system.create(destination, 'image/jpeg')
  try:
    out.write(result_bytes)
  finally:
    # Always release the GCS file handle, even if the write fails.
    out.close()
def test_delete(self, mock_gcsio):
  """Deleting multiple files goes through GcsIO.delete_batch."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  paths = [
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ]

  fs = gcsfilesystem.GCSFileSystem()
  fs.delete(paths)

  gcsio_mock.delete_batch.assert_called()
def test_match_multiples_error(self, mock_gcsio):
  """A failing glob surfaces as a BeamIOError with per-path details."""
  # Prepare mocks.
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  exception = IOError('Failed')
  gcsio_mock.size_of_files_in_glob.side_effect = exception
  expected_results = {'gs://bucket/': exception}

  file_system = gcsfilesystem.GCSFileSystem()
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
  with self.assertRaisesRegex(BeamIOError,
                              r'^Match operation failed') as error:
    file_system.match(['gs://bucket/'])

  self.assertEqual(error.exception.exception_details, expected_results)
  gcsio_mock.size_of_files_in_glob.assert_called_once_with(
      'gs://bucket/*', None)
def test_match_multiples_limit(self, mock_gcsio):
  """match() honors the per-pattern result limit."""
  gcsio_mock = mock.MagicMock()
  limit = 1
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  gcsio_mock.size_of_files_in_glob.return_value = {'gs://bucket/file1': 1}
  expected = {FileMetadata('gs://bucket/file1', 1)}

  fs = gcsfilesystem.GCSFileSystem()
  result = fs.match(['gs://bucket/'], [limit])[0]

  self.assertEqual(set(result.metadata_list), expected)
  self.assertEqual(len(result.metadata_list), limit)
  gcsio_mock.size_of_files_in_glob.assert_called_once_with(
      'gs://bucket/*', 1)
def test_join(self):
  """join() normalizes separators between base path and components."""
  fs = gcsfilesystem.GCSFileSystem()
  expected = 'gs://bucket/path/to/file'
  cases = [
      ('gs://bucket/path', ('to', 'file')),
      ('gs://bucket/path', ('to/file', )),
      ('gs://bucket/path', ('/to/file', )),
      ('gs://bucket/path/', ('to', 'file')),
      ('gs://bucket/path/', ('to/file', )),
      ('gs://bucket/path/', ('/to/file', )),
  ]
  for base, parts in cases:
    self.assertEqual(expected, fs.join(base, *parts))
  # A base without the gs:// prefix is invalid.
  with self.assertRaises(ValueError):
    fs.join('/bucket/path/', '/to/file')
def test_match_multiple_patterns(self, mock_gcsio):
  """Each pattern passed to match() yields its own result list."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  gcsio_mock.size_of_files_in_glob.side_effect = [
      {'gs://bucket/file1': 1},
      {'gs://bucket/file2': 2},
  ]
  expected = [
      [FileMetadata('gs://bucket/file1', 1)],
      [FileMetadata('gs://bucket/file2', 2)],
  ]

  fs = gcsfilesystem.GCSFileSystem()
  results = fs.match(['gs://bucket/file1*', 'gs://bucket/file2*'])

  self.assertEqual([r.metadata_list for r in results], expected)
def test_rename_error(self, mock_gcsio):
  """If the delete half of a rename fails, the error maps each pair."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  sources = [
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ]
  destinations = [
      'gs://bucket/to1',
      'gs://bucket/to2',
      'gs://bucket/to3',
  ]
  exception = IOError('Failed')
  # Copies succeed, deletes fail for every source.
  gcsio_mock.delete_batch.side_effect = [[(f, exception) for f in sources]]
  gcsio_mock.copy_batch.side_effect = [[
      ('gs://bucket/from1', 'gs://bucket/to1', None),
      ('gs://bucket/from2', 'gs://bucket/to2', None),
      ('gs://bucket/from3', 'gs://bucket/to3', None),
  ]]
  expected_results = {
      pair: exception
      for pair in zip(sources, destinations)
  }

  # Issue batch rename.
  fs = gcsfilesystem.GCSFileSystem()
  with self.assertRaises(BeamIOError) as error:
    fs.rename(sources, destinations)

  self.assertTrue(
      error.exception.message.startswith('Rename operation failed'))
  self.assertEqual(error.exception.exception_details, expected_results)
  gcsio_mock.copy_batch.assert_called_once_with([
      ('gs://bucket/from1', 'gs://bucket/to1'),
      ('gs://bucket/from2', 'gs://bucket/to2'),
      ('gs://bucket/from3', 'gs://bucket/to3'),
  ])
  gcsio_mock.delete_batch.assert_called_once_with([
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ])
def test_delete_error(self, mock_gcsio):
  """A failing batch delete raises BeamIOError with per-file details."""
  # Prepare mocks.
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  exception = IOError('Failed')
  gcsio_mock.delete_batch.side_effect = exception
  files = [
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ]
  expected_results = {f: exception for f in files}

  # Issue batch delete.
  file_system = gcsfilesystem.GCSFileSystem()
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
  with self.assertRaisesRegex(BeamIOError,
                              r'^Delete operation failed') as error:
    file_system.delete(files)
  self.assertEqual(error.exception.exception_details, expected_results)
  gcsio_mock.delete_batch.assert_called()
def test_rename(self, mock_gcsio):
  """rename() is implemented as a batch copy followed by a batch delete."""
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  sources = [
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ]
  destinations = [
      'gs://bucket/to1',
      'gs://bucket/to2',
      'gs://bucket/to3',
  ]
  # Both halves of the rename succeed for every path.
  gcsio_mock.copy_batch.side_effect = [[
      ('gs://bucket/from1', 'gs://bucket/to1', None),
      ('gs://bucket/from2', 'gs://bucket/to2', None),
      ('gs://bucket/from3', 'gs://bucket/to3', None),
  ]]
  gcsio_mock.delete_batch.side_effect = [[
      ('gs://bucket/from1', None),
      ('gs://bucket/from2', None),
      ('gs://bucket/from3', None),
  ]]

  fs = gcsfilesystem.GCSFileSystem()
  fs.rename(sources, destinations)

  gcsio_mock.copy_batch.assert_called_once_with([
      ('gs://bucket/from1', 'gs://bucket/to1'),
      ('gs://bucket/from2', 'gs://bucket/to2'),
      ('gs://bucket/from3', 'gs://bucket/to3'),
  ])
  gcsio_mock.delete_batch.assert_called_once_with([
      'gs://bucket/from1',
      'gs://bucket/from2',
      'gs://bucket/from3',
  ])
def test_copy_file_error(self, mock_gcsio):
  """A failing file copy raises BeamIOError keyed by (source, dest)."""
  # Prepare mocks.
  gcsio_mock = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
  sources = ['gs://bucket/from1']
  destinations = ['gs://bucket/to1']
  exception = IOError('Failed')
  gcsio_mock.copy.side_effect = exception
  expected_results = {(s, d): exception
                      for s, d in zip(sources, destinations)}

  # Issue batch copy.
  file_system = gcsfilesystem.GCSFileSystem()
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
  with self.assertRaisesRegex(BeamIOError,
                              r'^Copy operation failed') as error:
    file_system.copy(sources, destinations)
  self.assertEqual(error.exception.exception_details, expected_results)
  gcsio_mock.copy.assert_called_once_with(
      'gs://bucket/from1', 'gs://bucket/to1')
def test_scheme(self):
  """scheme() is 'gs' both on an instance and on the class."""
  fs = gcsfilesystem.GCSFileSystem()
  self.assertEqual('gs', fs.scheme())
  self.assertEqual('gs', gcsfilesystem.GCSFileSystem.scheme())
def setUp(self):
  """Creates a fresh GCSFileSystem with default pipeline options."""
  options = PipelineOptions()
  self.fs = gcsfilesystem.GCSFileSystem(pipeline_options=options)