def test_set_default_cluster(self):
  clusters = ie.current_env().clusters
  master_url = 'test-url'
  cluster_name = 'test-cluster'
  project = 'test-project'
  region = 'test-region'
  pipelines = ['pid']
  dashboard = 'test-dashboard'
  cluster_id = obfuscate(project, region, cluster_name)
  ie.current_env().inspector._clusters = {
      cluster_id: {
          'cluster_name': cluster_name,
          'project': project,
          'region': region,
          'master_url': master_url,
          'dashboard': dashboard,
          'pipelines': pipelines
      }
  }
  clusters.master_urls[master_url] = MasterURLIdentifier(
      project, region, cluster_name)
  clusters.set_default_cluster(
      ie.current_env().inspector.get_cluster_master_url(cluster_id))
  self.assertEqual(
      MasterURLIdentifier(project, region, cluster_name),
      clusters.default_cluster_metadata)

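# Note on the fixture above: the inspector keys each cluster by an obfuscated
# id derived from (project, region, cluster_name), so the id can be recomputed
# from the metadata alone. A minimal sketch using the same test fixtures:
#
#   cluster_id = obfuscate('test-project', 'test-region', 'test-cluster')
#   master_url = ie.current_env().inspector.get_cluster_master_url(cluster_id)
#   # master_url == 'test-url'
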
def test_clusters_cleanup_skip_on_duplicate(self, mock_master_url):
  clusters = ib.Clusters()
  project = 'test-project'
  region = 'test-region'
  p1 = beam.Pipeline(options=PipelineOptions(
      project=project,
      region=region,
  ))
  p2 = beam.Pipeline(options=PipelineOptions(
      project=project,
      region=region,
  ))
  cluster_metadata_1 = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[str(
      id(p1))] = DataprocClusterManager(cluster_metadata_1)
  clusters.dataproc_cluster_managers[str(id(p1))].master_url = 'test_url'
  clusters.master_urls_to_pipelines['test_url'].append(str(id(p1)))
  cluster_metadata_2 = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[str(
      id(p2))] = DataprocClusterManager(cluster_metadata_2)
  clusters.dataproc_cluster_managers[str(id(p2))].master_url = 'test_url'
  clusters.master_urls_to_pipelines['test_url'].append(str(id(p2)))
  from apache_beam.runners.interactive.interactive_beam import _LOGGER
  with self.assertLogs(_LOGGER, level='WARNING') as context_manager:
    clusters.cleanup(p1)
    self.assertTrue('skipping deletion' in context_manager.output[0])

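# Context for the test above: master_urls_to_pipelines maps one master_url to
# every pipeline sharing that cluster, so cleanup(p1) must not delete a
# cluster that p2 still uses; it logs a warning and skips deletion instead.
# Sketch of the bookkeeping exercised here:
#
#   clusters.master_urls_to_pipelines['test_url']
#   # -> [str(id(p1)), str(id(p2))]
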
def test_delete_cluster(self):
  clusters = ie.current_env().clusters

  class MockClusterManager:
    master_url = 'test-url'

    def cleanup(self):
      pass

  master_url = 'test-url'
  cluster_name = 'test-cluster'
  project = 'test-project'
  region = 'test-region'
  metadata = MasterURLIdentifier(project, region, cluster_name)
  p = beam.Pipeline(ir.InteractiveRunner())
  ie.current_env()._tracked_user_pipelines.add_user_pipeline(p)
  clusters.master_urls[master_url] = metadata
  clusters.master_urls_to_dashboards[master_url] = 'test-dashboard'
  clusters.dataproc_cluster_managers[str(id(p))] = MockClusterManager()
  clusters.master_urls_to_pipelines[master_url] = [str(id(p))]
  cluster_id = obfuscate(project, region, cluster_name)
  ie.current_env().inspector._clusters[cluster_id] = {
      'master_url': master_url, 'pipelines': [str(id(p))]
  }
  clusters.delete_cluster(
      ie.current_env().inspector.get_cluster_master_url(cluster_id))
  self.assertEqual(clusters.master_urls, {})
  self.assertEqual(clusters.master_urls_to_pipelines, {})

def test_list_clusters(self):
  master_url = 'test-url'
  cluster_name = 'test-cluster'
  project = 'test-project'
  region = 'test-region'
  pipelines = ['pid']
  dashboard = 'test-dashboard'
  ie.current_env().clusters.master_urls[master_url] = MasterURLIdentifier(
      project, region, cluster_name)
  ie.current_env().clusters.master_urls_to_pipelines[master_url] = pipelines
  ie.current_env().clusters.master_urls_to_dashboards[master_url] = dashboard
  ins = inspector.InteractiveEnvironmentInspector()
  cluster_id = obfuscate(project, region, cluster_name)
  self.assertEqual(
      {
          cluster_id: {
              'cluster_name': cluster_name,
              'project': project,
              'region': region,
              'master_url': master_url,
              'dashboard': dashboard,
              'pipelines': pipelines
          }
      },
      json.loads(ins.list_clusters()))

def test_get_staging_location_exception(self, mock_cluster_client):
  """
  Test to catch when an error is raised inside get_staging_location.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project',
      region='test-region',
      cluster_name='test-cluster')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  with self.assertRaises(MockException):
    cluster_manager.get_staging_location(cluster_metadata)

def test_get_staging_location(self, mock_cluster_client, mock_list):
  """
  Test to receive a mock staging location successfully under
  get_staging_location.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project',
      region='test-region',
      cluster_name='test-cluster')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  self.assertEqual(
      cluster_manager.get_staging_location(cluster_metadata),
      'gs://test-bucket/google-cloud-dataproc-metainfo/')

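# The expected value above follows the Dataproc convention of staging cluster
# metadata under a 'google-cloud-dataproc-metainfo' prefix in the cluster's
# staging bucket; 'test-bucket' is supplied by the mocked client in this test,
# not by a real bucket.
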
def test_create_cluster_default_already_exists(self, mock_cluster_client):
  """
  Tests that no exception is thrown when a cluster already exists, but is
  using ie.current_env().clusters.default_cluster_name.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='INFO') as context_manager:
    cluster_manager.create_cluster({})
    self.assertTrue('already exists' in context_manager.output[0])

def test_create_cluster_permission_denied(self, mock_cluster_client):
  """
  Tests that an exception is thrown when a user is trying to write to a
  project while having insufficient permissions.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])

def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
  """
  Tests that an exception is thrown when a user specifies a region that
  does not exist.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    self.assertTrue('Invalid region provided' in context_manager.output[0])

def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
  """
  Tests that an exception is thrown when cleanup attempts to delete a
  cluster that does not exist.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, cluster_manager.cleanup)
    self.assertTrue('Cluster does not exist' in context_manager.output[0])

def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
  """
  Tests that an exception is thrown when the exception is not handled by
  any other case under cleanup.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(MockException, cluster_manager.cleanup)
    self.assertTrue('Failed to delete cluster' in context_manager.output[0])

def test_get_master_url_and_dashboard(self, mock_parse_method):
  """
  Tests that get_master_url_and_dashboard detects the line containing the
  unique substring which identifies the master_url and application id of
  the Flink master.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  cluster_manager._fs = MockFileSystem()
  master_url, dashboard = cluster_manager.get_master_url_and_dashboard(
      cluster_metadata, 'test-staging-bucket')
  self.assertEqual(master_url, 'test-master-url')
  self.assertEqual(dashboard, 'test-dashboard-link')

def test_clusters_cleanup_otherwise(self, mock_cleanup):
  clusters = ie.current_env().clusters
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(options=PipelineOptions(
      project=project,
      region=region,
  ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[str(
      id(p))] = DataprocClusterManager(cluster_metadata)
  clusters.dataproc_cluster_managers[str(id(p))].master_url = 'test_url'
  clusters.cleanup(p)

def test_get_cluster_details_permission_denied(self, mock_cluster_client):
  """
  Tests that an exception is thrown when a user is trying to get
  information for a project without sufficient permissions to do so.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(
      _LOGGER, level='ERROR') as context_manager, self.assertRaises(
          ValueError):
    cluster_manager.get_cluster_details(cluster_metadata)
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])

def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
  env = ie.InteractiveEnvironment()
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(options=PipelineOptions(
      project=project,
      region=region,
  ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  env.clusters.dataproc_cluster_managers[str(
      id(p))] = DataprocClusterManager(cluster_metadata)
  env._tracked_user_pipelines.add_user_pipeline(p)
  env.cleanup()
  self.assertEqual(env.clusters.dataproc_cluster_managers, {})

def test_clusters_describe(self):
  clusters = ib.Clusters()
  project = 'test-project'
  region = 'test-region'
  p = beam.Pipeline(options=PipelineOptions(
      project=project,
      region=region,
  ))
  cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
  clusters.dataproc_cluster_managers[str(
      id(p))] = DataprocClusterManager(cluster_metadata)
  self.assertEqual(
      'test-project',
      clusters.describe()[str(id(p))]['cluster_metadata'].project_id)

def test_parse_master_url_and_dashboard(self, mock_cluster_details):
  """
  Tests that parse_master_url_and_dashboard properly parses the input
  string and produces a mock master_url and mock dashboard link.
  """
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project', region='test-region')
  cluster_manager = DataprocClusterManager(cluster_metadata)
  line = 'test-line Found Web Interface test-master-url' \
      ' of application \'test-app-id\'.\n'
  master_url, dashboard = cluster_manager.parse_master_url_and_dashboard(
      cluster_metadata, line)
  self.assertEqual('test-master-url', master_url)
  self.assertEqual(
      'test-resource-manager/gateway/default/yarn/proxy/test-app-id/',
      dashboard)

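# The parser above keys off the unique "Found Web Interface ... of
# application '...'" substring written to the staged startup logs when the
# Flink YARN application starts. Sketch of the extraction, using the test
# fixtures rather than a real cluster:
#
#   line = "... Found Web Interface test-master-url of application 'test-app-id'."
#   # -> master_url: 'test-master-url'
#   # -> dashboard:  '<resource-manager>/gateway/default/yarn/proxy/test-app-id/'
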
def test_get_master_url_no_flink_master_and_master_url_exists(self, m_env):
  clusters = ib.Clusters()
  m_env().clusters = clusters
  from apache_beam.runners.portability.flink_runner import FlinkRunner
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  p = beam.Pipeline(options=PipelineOptions(
      project='test-project',
      region='test-region',
  ))
  cluster_name = clusters.default_cluster_name
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project',
      region='test-region',
      cluster_name=cluster_name)
  clusters.master_urls['test-url'] = cluster_metadata
  clusters.master_urls_to_dashboards['test-url'] = 'test-dashboard'
  flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
  self.assertEqual(
      clusters.describe(p)['cluster_metadata'].project_id, 'test-project')
  self.assertEqual(flink_master, clusters.describe(p)['master_url'])

def _get_dataproc_cluster_master_url_if_applicable(
    self, user_pipeline: beam.Pipeline) -> str:
  """
  Creates a Dataproc cluster if the provided user_pipeline is running
  FlinkRunner and no flink_master was provided as a pipeline option. A
  cluster is not created when a flink_master is detected.

  Example pipeline options to enable automatic Dataproc cluster creation:
    options = PipelineOptions([
        '--runner=FlinkRunner',
        '--project=my-project',
        '--region=my-region',
        '--environment_type=DOCKER'
    ])

  Example pipeline options to skip automatic Dataproc cluster creation:
    options = PipelineOptions([
        '--runner=FlinkRunner',
        '--flink_master=example.internal:41979',
        '--environment_type=DOCKER'
    ])
  """
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
  from apache_beam.runners.portability.flink_runner import FlinkRunner
  flink_master = user_pipeline.options.view_as(FlinkRunnerOptions).flink_master
  clusters = ie.current_env().clusters
  # Only consider this logic when both of the conditions below apply: the
  # underlying runner is FlinkRunner and no cluster manager is mapped to
  # the pipeline yet.
  if isinstance(self._underlying_runner,
                FlinkRunner) and clusters.dataproc_cluster_managers.get(
                    str(id(user_pipeline)), None) is None:
    if flink_master == '[auto]':
      # The condition above is True when the user has not provided a
      # flink_master.
      if ie.current_env()._is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
      project_id = (user_pipeline.options.view_as(GoogleCloudOptions).project)
      region = (user_pipeline.options.view_as(GoogleCloudOptions).region)
      if not project_id:
        # When a Google Cloud project is not specified, try to set the
        # cluster_metadata to the default value set from the
        # 'Manage Clusters' JupyterLab extension. If a value has not been
        # specified, this value defaults to None.
        cluster_metadata = ie.current_env().clusters.default_cluster_metadata
      else:
        cluster_name = ie.current_env().clusters.default_cluster_name
        cluster_metadata = MasterURLIdentifier(
            project_id=project_id, region=region, cluster_name=cluster_name)
    else:
      cluster_metadata = clusters.master_urls.get(flink_master, None)
      # Otherwise this is a noop: a master_url provided by the user (and
      # not managed by us) requires no logging.
    if cluster_metadata:
      # Create the cluster_manager and populate the dicts in the clusters
      # instance if the pipeline is not already mapped to an existing
      # cluster_manager.
      cluster_manager = DataprocClusterManager(cluster_metadata)
      cluster_manager.create_flink_cluster()
      clusters.master_urls[cluster_manager.master_url] = cluster_metadata
      clusters.dataproc_cluster_managers[str(
          id(user_pipeline))] = cluster_manager
      clusters.master_urls_to_pipelines[cluster_manager.master_url].append(
          str(id(user_pipeline)))
      clusters.master_urls_to_dashboards[
          cluster_manager.master_url] = cluster_manager.dashboard
      return cluster_manager.master_url

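# A minimal usage sketch for the method above (illustrative only; names such
# as 'my-project' and 'my-region' are placeholders, not real resources):
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#   from apache_beam.runners.portability.flink_runner import FlinkRunner
#
#   options = PipelineOptions([
#       '--project=my-project',
#       '--region=my-region',
#       '--environment_type=DOCKER'
#   ])
#   p = beam.Pipeline(
#       interactive_runner.InteractiveRunner(underlying_runner=FlinkRunner()),
#       options=options)
#   # flink_master stays '[auto]', so this method creates a Dataproc-backed
#   # Flink cluster for p and returns its master_url.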