Пример #1
0
  def __init__(self, cluster_metadata: MasterURLIdentifier) -> None:
    """Initializes the DataprocClusterManager with properties required
    to interface with the Dataproc ClusterControllerClient.

    Args:
      cluster_metadata: identifies the project/region/cluster this manager
        controls; missing fields are filled in with defaults below.

    Raises:
      ValueError: if the metadata targets the unsupported 'global' region.
    """
    self.cluster_metadata = cluster_metadata
    if self.cluster_metadata.region == 'global':
      # The global region is unsupported as it will be eventually deprecated.
      raise ValueError('Clusters in the global region are not supported.')
    elif not self.cluster_metadata.region:
      # Fall back to a default region rather than failing when none is given.
      _LOGGER.warning(
          'No region information was detected, defaulting Dataproc cluster '
          'region to: us-central1.')
      self.cluster_metadata.region = 'us-central1'

    # Fall back to the interactive environment's default cluster name when
    # the caller did not provide one.
    if not self.cluster_metadata.cluster_name:
      self.cluster_metadata.cluster_name = ie.current_env(
      ).clusters.default_cluster_name

    # The Dataproc API endpoint is regional; build it from the (possibly
    # defaulted) region chosen above.
    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint': \
            f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })

    # Reuse an existing master URL / dashboard if this metadata is already
    # tracked by the interactive environment; otherwise start untracked.
    if self.cluster_metadata in ie.current_env().clusters.master_urls.inverse:
      self.master_url = ie.current_env().clusters.master_urls.inverse[
          self.cluster_metadata]
      self.dashboard = ie.current_env().clusters.master_urls_to_dashboards[
          self.master_url]
    else:
      self.master_url = None
      self.dashboard = None

    # GCS filesystem used for staging; the staging directory is assigned
    # lazily elsewhere.
    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
Пример #2
0
 def cleanup(self):
     """Deletes the cache directory (GCS or local) and resets saved pcoders."""
     cache_dir = self._cache_dir
     if cache_dir.startswith('gs://'):
         # GCS paths need the GCS-specific filesystem; import lazily so
         # non-GCS runs never touch the gcp extras.
         from apache_beam.io.gcp import gcsfilesystem
         from apache_beam.options.pipeline_options import PipelineOptions
         gcs_fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
         gcs_fs.delete([cache_dir + '/full/'])
     elif filesystems.FileSystems.exists(cache_dir):
         filesystems.FileSystems.delete([cache_dir])
     self._saved_pcoders = {}
Пример #3
0
    def test_split(self):
        """split() yields (head, tail) for gs:// paths and rejects others."""
        fs = gcsfilesystem.GCSFileSystem()
        cases = [
            ('gs://foo/bar/baz', ('gs://foo/bar', 'baz')),
            ('gs://foo/', ('gs://foo', '')),
            ('gs://foo', ('gs://foo', '')),
        ]
        for path, expected in cases:
            self.assertEqual(expected, fs.split(path))

        # Paths without the gs:// scheme are invalid for this filesystem.
        with self.assertRaises(ValueError):
            fs.split('/no/gcs/prefix')
Пример #4
0
    def test_open(self, mock_gcsio):
        """open() delegates to GcsIO.open in binary-read mode with the
        requested mime type."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        path, mime = 'gs://bucket/from1', 'application/octet-stream'

        fs = gcsfilesystem.GCSFileSystem()
        _ = fs.open(path, mime)

        gcsio_mock.open.assert_called_once_with(path, 'rb', mime_type=mime)
Пример #5
0
    def test_copy_tree(self, mock_gcsio):
        """Copying a directory-like (trailing-slash) path delegates to
        GcsIO.copytree."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        src, dst = 'gs://bucket1/', 'gs://bucket2/'

        fs = gcsfilesystem.GCSFileSystem()
        fs.copy([src], [dst])

        gcsio_mock.copytree.assert_called_once_with(src, dst)
Пример #6
0
    def test_copy_file(self, mock_gcsio):
        """Copying a single object delegates to GcsIO.copy."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        src, dst = 'gs://bucket/from1', 'gs://bucket/to1'

        fs = gcsfilesystem.GCSFileSystem()
        fs.copy([src], [dst])

        gcsio_mock.copy.assert_called_once_with(src, dst)
Пример #7
0
 def __init__(self, cluster_metadata: ClusterMetadata) -> None:
   """Initializes the DataprocClusterManager with properties required
   to interface with the Dataproc ClusterControllerClient.
   """
   self.cluster_metadata = cluster_metadata
   # Pipelines whose jobs are executed on the cluster.
   self.pipelines = set()
   # The Dataproc API endpoint is regional.
   endpoint = f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
   self._cluster_client = dataproc_v1.ClusterControllerClient(
       client_options={'api_endpoint': endpoint})
   # GCS filesystem for staging; the staging directory is assigned lazily.
   self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
   self._staging_directory = None
Пример #8
0
    def save_np_image(self, np_image, destination):
        """Encodes *np_image* as JPEG and writes it to *destination*.

        Args:
          np_image: array accepted by PIL.Image.fromarray — assumes an
            image-shaped uint8 array (TODO confirm against callers).
          destination: target path (e.g. a gs:// URL) for the JPEG bytes.
        """
        import io

        import PIL.Image

        from apache_beam.io.gcp import gcsfilesystem

        final_image = PIL.Image.fromarray(np_image)
        # Render the JPEG into memory first so the remote write is one shot.
        final_image_bytes = io.BytesIO()
        final_image.save(final_image_bytes, format='JPEG')
        result_bytes = final_image_bytes.getvalue()

        file_system = gcsfilesystem.GCSFileSystem()
        file = file_system.create(destination, 'image/jpeg')
        try:
            file.write(result_bytes)
        finally:
            # Close even if the write fails so the handle is not leaked.
            file.close()
Пример #9
0
    def test_delete(self, mock_gcsio):
        """Deleting several paths goes through a single batch call."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        files = ['gs://bucket/from%d' % i for i in (1, 2, 3)]

        fs = gcsfilesystem.GCSFileSystem()
        fs.delete(files)
        gcsio_mock.delete_batch.assert_called()
Пример #10
0
    def test_match_multiples_error(self, mock_gcsio):
        """match() wraps glob failures into a BeamIOError whose details map
        each pattern to the underlying exception."""
        # Prepare mocks.
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        exception = IOError('Failed')
        gcsio_mock.size_of_files_in_glob.side_effect = exception
        expected_results = {'gs://bucket/': exception}

        file_system = gcsfilesystem.GCSFileSystem()
        # assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
        # use assertRaisesRegex instead.
        with self.assertRaisesRegex(BeamIOError,
                                    r'^Match operation failed') as error:
            file_system.match(['gs://bucket/'])
        self.assertEqual(error.exception.exception_details, expected_results)
        gcsio_mock.size_of_files_in_glob.assert_called_once_with(
            'gs://bucket/*', None)
Пример #11
0
 def test_match_multiples_limit(self, mock_gcsio):
     """A match limit caps the returned metadata and is forwarded to the
     underlying glob call."""
     gcsio_mock = mock.MagicMock()
     gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
     limit = 1
     gcsio_mock.size_of_files_in_glob.return_value = {'gs://bucket/file1': 1}
     expected = {FileMetadata('gs://bucket/file1', 1)}

     fs = gcsfilesystem.GCSFileSystem()
     result = fs.match(['gs://bucket/'], [limit])[0]

     self.assertEqual(set(result.metadata_list), expected)
     self.assertEqual(len(result.metadata_list), limit)
     gcsio_mock.size_of_files_in_glob.assert_called_once_with(
         'gs://bucket/*', 1)
Пример #12
0
 def test_join(self):
     """join() normalizes stray slashes into one gs:// path and rejects
     non-GCS base paths."""
     fs = gcsfilesystem.GCSFileSystem()
     expected = 'gs://bucket/path/to/file'
     # Every combination of trailing/leading slash should normalize the same.
     for base in ('gs://bucket/path', 'gs://bucket/path/'):
         for parts in (('to', 'file'), ('to/file',), ('/to/file',)):
             self.assertEqual(expected, fs.join(base, *parts))
     with self.assertRaises(ValueError):
         fs.join('/bucket/path/', '/to/file')
Пример #13
0
 def test_match_multiple_patterns(self, mock_gcsio):
   """Each input pattern yields its own MatchResult, in order."""
   gcsio_mock = mock.MagicMock()
   gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
   globs = [
       {'gs://bucket/file1': 1},
       {'gs://bucket/file2': 2},
   ]
   gcsio_mock.size_of_files_in_glob.side_effect = globs
   expected_results = [
       [FileMetadata(path, size) for path, size in glob.items()]
       for glob in globs
   ]

   fs = gcsfilesystem.GCSFileSystem()
   result = fs.match(['gs://bucket/file1*', 'gs://bucket/file2*'])

   self.assertEqual([mr.metadata_list for mr in result], expected_results)
Пример #14
0
    def test_rename_error(self, mock_gcsio):
        """rename() surfaces delete-phase failures as a BeamIOError keyed
        by (source, destination) pairs."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        sources = ['gs://bucket/from%d' % i for i in (1, 2, 3)]
        destinations = ['gs://bucket/to%d' % i for i in (1, 2, 3)]
        failure = IOError('Failed')
        # Copies succeed (error slot None), but every source delete fails.
        gcsio_mock.copy_batch.side_effect = [[
            (src, dst, None) for src, dst in zip(sources, destinations)
        ]]
        gcsio_mock.delete_batch.side_effect = [[
            (src, failure) for src in sources
        ]]

        expected_results = {
            pair: failure for pair in zip(sources, destinations)
        }

        # Issue batch rename.
        file_system = gcsfilesystem.GCSFileSystem()
        with self.assertRaises(BeamIOError) as error:
            file_system.rename(sources, destinations)
        self.assertTrue(
            error.exception.message.startswith('Rename operation failed'))
        self.assertEqual(error.exception.exception_details, expected_results)

        gcsio_mock.copy_batch.assert_called_once_with(
            list(zip(sources, destinations)))
        gcsio_mock.delete_batch.assert_called_once_with(sources)
Пример #15
0
    def test_delete_error(self, mock_gcsio):
        """delete() wraps a batch-level failure into BeamIOError with
        per-file exception details."""
        # Prepare mocks.
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        exception = IOError('Failed')
        gcsio_mock.delete_batch.side_effect = exception
        files = [
            'gs://bucket/from1',
            'gs://bucket/from2',
            'gs://bucket/from3',
        ]
        expected_results = {f: exception for f in files}

        # Issue batch delete.
        file_system = gcsfilesystem.GCSFileSystem()
        # assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
        # use assertRaisesRegex instead.
        with self.assertRaisesRegex(BeamIOError,
                                    r'^Delete operation failed') as error:
            file_system.delete(files)
        self.assertEqual(error.exception.exception_details, expected_results)
        gcsio_mock.delete_batch.assert_called()
Пример #16
0
    def test_rename(self, mock_gcsio):
        """A successful rename copies each source, then deletes it."""
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        sources = ['gs://bucket/from%d' % i for i in (1, 2, 3)]
        destinations = ['gs://bucket/to%d' % i for i in (1, 2, 3)]
        # Both batch phases report per-file success (error slot is None).
        gcsio_mock.copy_batch.side_effect = [[
            (src, dst, None) for src, dst in zip(sources, destinations)
        ]]
        gcsio_mock.delete_batch.side_effect = [[
            (src, None) for src in sources
        ]]

        # Issue batch rename.
        file_system = gcsfilesystem.GCSFileSystem()
        file_system.rename(sources, destinations)

        gcsio_mock.copy_batch.assert_called_once_with(
            list(zip(sources, destinations)))
        gcsio_mock.delete_batch.assert_called_once_with(sources)
Пример #17
0
    def test_copy_file_error(self, mock_gcsio):
        """copy() reports a single-file failure as BeamIOError keyed by
        the (source, destination) pair."""
        # Prepare mocks.
        gcsio_mock = mock.MagicMock()
        gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
        sources = ['gs://bucket/from1']
        destinations = ['gs://bucket/to1']

        exception = IOError('Failed')
        gcsio_mock.copy.side_effect = exception

        expected_results = {(s, d): exception
                            for s, d in zip(sources, destinations)}

        # Issue batch copy.
        file_system = gcsfilesystem.GCSFileSystem()
        # assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
        # use assertRaisesRegex instead.
        with self.assertRaisesRegex(BeamIOError,
                                    r'^Copy operation failed') as error:
            file_system.copy(sources, destinations)
        self.assertEqual(error.exception.exception_details, expected_results)

        gcsio_mock.copy.assert_called_once_with('gs://bucket/from1',
                                                'gs://bucket/to1')
Пример #18
0
 def test_scheme(self):
     """scheme() reports 'gs' on both the instance and the class."""
     fs = gcsfilesystem.GCSFileSystem()
     self.assertEqual('gs', fs.scheme())
     self.assertEqual('gs', gcsfilesystem.GCSFileSystem.scheme())
Пример #19
0
 def setUp(self):
     """Builds a GCSFileSystem backed by default pipeline options."""
     options = PipelineOptions()
     self.fs = gcsfilesystem.GCSFileSystem(pipeline_options=options)