def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
  """
  Tests that an exception is thrown when cleanup attempts to delete
  a cluster that does not exist.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.cleanup)
    self.assertTrue('Cluster does not exist' in context_manager.output[0])
def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
  """
  Tests that an exception is thrown when the exception is not handled by
  any other case under cleanup.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(MockException, cluster_manager.cleanup)
    self.assertTrue('Failed to delete cluster' in context_manager.output[0])
def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
  """
  Tests that an exception is thrown when a user specifies a region
  that does not exist.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    self.assertTrue('Invalid region provided' in context_manager.output[0])
def test_create_cluster_other_exception(self, mock_cluster_client):
  """
  Tests that an exception is thrown when the exception is not handled by
  any other case under _create_cluster.
  """
  cluster_metadata = ClusterMetadata(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(MockException, cluster_manager.create_cluster, {})
    self.assertTrue('Unable to create cluster' in context_manager.output[0])
def test_create_cluster_permission_denied(self, mock_cluster_client):
  """
  Tests that an exception is thrown when a user is trying to write to
  a project while having insufficient permissions.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])
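# Illustrative sketch, not part of the test class: the tests above receive
# mocked Dataproc clients through patch decorators that sit outside this
# excerpt. The wiring below is an assumption for illustration only, showing
# how a mocked client can be made to raise the errors these tests assert on.
# MockException mirrors the stand-in exception class the tests reference.
from unittest import mock


class MockException(Exception):
  pass


def _example_failing_cluster_client():
  # A MagicMock standing in for the Dataproc cluster client held by a
  # DataprocClusterManager; create_cluster fails with an unhandled error,
  # which the manager is expected to log and re-raise.
  client = mock.MagicMock()
  client.create_cluster.side_effect = MockException('mock error')
  return client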
def test_clusters_cleanup_otherwise(self, mock_cleanup):
  clusters = ie.current_env().clusters
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(
      options=PipelineOptions(
          project=project,
          region=region,
      ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[str(
      id(p))] = DataprocClusterManager(cluster_metadata)
  clusters.dataproc_cluster_managers[str(id(p))].master_url = 'test_url'
  clusters.cleanup(p)
def test_clusters_describe(self):
  clusters = ib.Clusters()
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(
      options=PipelineOptions(
          project=project,
          region=region,
      ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[p] = DataprocClusterManager(
      cluster_metadata)
  self.assertEqual(
      'test-project',
      clusters.describe()[None]['cluster_metadata'].project_id)
def test_cleanup_permission_denied(self, mock_cluster_client, mock_cleanup):
  """
  Tests that an exception is thrown when a user is trying to delete
  a project that they have insufficient permissions for.
  """
  cluster_metadata = ClusterMetadata(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.cleanup)
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])
def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
  env = ie.InteractiveEnvironment()
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(
      options=PipelineOptions(
          project=project,
          region=region,
      ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  env.clusters.dataproc_cluster_managers[str(
      id(p))] = DataprocClusterManager(cluster_metadata)
  env._tracked_user_pipelines.add_user_pipeline(p)
  env.cleanup()
  self.assertEqual(env.clusters.dataproc_cluster_managers, {})
def test_reuse_a_cluster_for_a_known_pipeline(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  p = beam.Pipeline(runner=runner, options=options)
  meta = ClusterMetadata(project_id='test-project', region='test-region')
  dcm = DataprocClusterManager(meta)
  # Configure the clusters so that the pipeline is known.
  clusters.pipelines[p] = dcm
  runner.configure_for_flink(p, options)

  # A known cluster is reused.
  tuned_meta = clusters.cluster_metadata(p)
  self.assertIs(tuned_meta, meta)
def test_create_but_reuse_a_known_cluster(self):
  known_meta = ClusterMetadata(
      project_id='test-project', region='test-region')
  known_dcm = DataprocClusterManager(known_meta)
  known_meta.master_url = 'test-url'
  self.clusters.set_default_cluster(known_meta)
  self.clusters.dataproc_cluster_managers[known_meta] = known_dcm
  self.clusters.master_urls[known_meta.master_url] = known_meta

  # Use an equivalent meta as the identifier to create a cluster.
  cid_meta = ClusterMetadata(
      project_id=known_meta.project_id,
      region=known_meta.region,
      cluster_name=known_meta.cluster_name)
  dcm = self.clusters.create(cid_meta)
  # The known cluster manager is returned.
  self.assertIs(dcm, known_dcm)

  # Then use an equivalent master_url as the identifier.
  cid_master_url = known_meta.master_url
  dcm = self.clusters.create(cid_master_url)
  self.assertIs(dcm, known_dcm)
def _create_dataproc_cluster_if_applicable(self, user_pipeline):
  """ Creates a Dataproc cluster if the provided user_pipeline is running
  FlinkRunner and no flink_master_url was provided as an option. A cluster
  is not created when a flink_master_url is detected.

  Example pipeline options to enable automatic Dataproc cluster creation:
    options = PipelineOptions([
        '--runner=FlinkRunner',
        '--project=my-project',
        '--region=my-region',
        '--environment_type=DOCKER'
    ])

  Example pipeline options to skip automatic Dataproc cluster creation:
    options = PipelineOptions([
        '--runner=FlinkRunner',
        '--flink_master=example.internal:41979',
        '--environment_type=DOCKER'
    ])
  """
  from apache_beam.runners.portability.flink_runner import FlinkRunner
  from apache_beam.options.pipeline_options import FlinkRunnerOptions
  flink_master = user_pipeline.options.view_as(
      FlinkRunnerOptions).flink_master
  clusters = ie.current_env().clusters
  # Only consider this logic when both of the following conditions apply.
  if isinstance(self._underlying_runner,
                FlinkRunner) and clusters.dataproc_cluster_managers.get(
                    str(id(user_pipeline)), None) is None:
    if flink_master == '[auto]':
      # The above condition is True when the user has not provided a
      # flink_master.
      if ie.current_env()._is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
      project_id = (user_pipeline.options.view_as(GoogleCloudOptions).project)
      region = (user_pipeline.options.view_as(GoogleCloudOptions).region)
      cluster_name = ie.current_env().clusters.default_cluster_name
      cluster_metadata = MasterURLIdentifier(
          project_id=project_id, region=region, cluster_name=cluster_name)
    else:
      cluster_metadata = clusters.master_urls.inverse.get(flink_master, None)
    # else noop, no need to log anything because we allow a master_url
    # (not managed by us) provided by the user.
    if cluster_metadata:
      # Create the cluster_manager and populate dicts in the clusters
      # instance if the pipeline is not already mapped to an existing
      # cluster_manager.
      cluster_manager = DataprocClusterManager(cluster_metadata)
      cluster_manager.create_flink_cluster()
      clusters.master_urls[cluster_manager.master_url] = cluster_metadata
      clusters.dataproc_cluster_managers[str(
          id(user_pipeline))] = cluster_manager
      clusters.master_urls_to_pipelines[
          cluster_manager.master_url].append(str(id(user_pipeline)))
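# Illustrative sketch, not part of the class above: FlinkRunnerOptions
# defaults flink_master to '[auto]' when the user does not pass one, which is
# the sentinel _create_dataproc_cluster_if_applicable checks to decide
# whether to provision a Dataproc-hosted Flink cluster.
def _example_flink_master_detection():
  from apache_beam.options.pipeline_options import FlinkRunnerOptions
  from apache_beam.options.pipeline_options import PipelineOptions

  # No flink_master given: the default sentinel triggers cluster creation.
  auto_opts = PipelineOptions(['--runner=FlinkRunner'])
  assert auto_opts.view_as(FlinkRunnerOptions).flink_master == '[auto]'

  # An explicit flink_master means the user manages their own cluster, so
  # no Dataproc cluster is created.
  manual_opts = PipelineOptions(
      ['--runner=FlinkRunner', '--flink_master=example.internal:41979'])
  assert manual_opts.view_as(
      FlinkRunnerOptions).flink_master == 'example.internal:41979'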