def main(project_id, zone, cluster_name, bucket_name,
         pyspark_file=None, create_new_cluster=True, global_region=True):

    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(
                dataproc_cluster_client, project_id, zone, region,
                cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(
            project_id, bucket_name, spark_filename, spark_file)

        list_clusters_with_details(
            dataproc_cluster_client, project_id, region)

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(
                dataproc_cluster_client, project_id, region, cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(
            dataproc_job_client, project_id, region, cluster_name,
            bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)

        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(
                dataproc_cluster_client, project_id, region, cluster_name)
            spark_file.close()
def main(project_id, region):
    if region == "global":
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
    else:
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region)
        )
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(client_transport)

    list_clusters(dataproc_cluster_client, project_id, region)
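# A minimal sketch of the list_clusters helper invoked above. This helper is
# not shown in the snippet, so the body below is an assumption: it simply
# iterates over the clusters in the given project and region and prints each
# cluster's name and status.
def list_clusters(dataproc_cluster_client, project_id, region):
    """Prints the name and status of every cluster in the project/region."""
    for cluster in dataproc_cluster_client.list_clusters(project_id, region):
        print('{} - {}'.format(cluster.cluster_name, cluster.status.state))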
def test_delete_cluster(self):
    # Setup Expected Response
    expected_response = {}
    expected_response = empty_pb2.Empty(**expected_response)
    operation = operations_pb2.Operation(
        name='operations/test_delete_cluster', done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    client = dataproc_v1.ClusterControllerClient(channel=channel)

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'

    response = client.delete_cluster(project_id, region, cluster_name)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = clusters_pb2.DeleteClusterRequest(
        project_id=project_id, region=region, cluster_name=cluster_name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def post(self, request):
    print("List of clusters initiated ......")
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json"
    project_id = "deepak-cloud-trail"
    zone = request.POST["zone"]
    region = get_region_from_zone(zone)
    zone_uri = \
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
            project_id, zone)
    client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region)))
    dataproc_client = dataproc_v1.ClusterControllerClient(client_transport)
    cluster_name = request.POST["cluster_name"]
    cluster_data = {
        'project_id': project_id,
        'cluster_name': cluster_name,
        'config': {
            'gce_cluster_config': {
                'zone_uri': zone_uri
            },
            'master_config': {
                'num_instances': 1,
                'machine_type_uri': 'n1-standard-1'
            },
            'worker_config': {
                'num_instances': 2,
                'machine_type_uri': 'n1-standard-1'
            }
        }
    }
    cluster = dataproc_client.create_cluster(project_id, region, cluster_data)
def __init__(self, cluster_metadata: MasterURLIdentifier) -> None:
    """Initializes the DataprocClusterManager with properties required to
    interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    if self.cluster_metadata.region == 'global':
        # The global region is unsupported as it will be eventually deprecated.
        raise ValueError('Clusters in the global region are not supported.')
    elif not self.cluster_metadata.region:
        _LOGGER.warning(
            'No region information was detected, defaulting Dataproc cluster '
            'region to: us-central1.')
        self.cluster_metadata.region = 'us-central1'

    if not self.cluster_metadata.cluster_name:
        self.cluster_metadata.cluster_name = ie.current_env(
        ).clusters.default_cluster_name

    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint':
                f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })

    if self.cluster_metadata in ie.current_env().clusters.master_urls.inverse:
        self.master_url = ie.current_env().clusters.master_urls.inverse[
            self.cluster_metadata]
        self.dashboard = ie.current_env().clusters.master_urls_to_dashboards[
            self.master_url]
    else:
        self.master_url = None
        self.dashboard = None

    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
def create_cluster(project_id, region, cluster_name, create_buckets=None):
    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{region}-dataproc.googleapis.com:443"
        })

    staging_bucket = None
    tmp_bucket = None
    if create_buckets:
        staging_bucket = create_bucket_if_not_exists(
            project_id, region, f'{cluster_name}-staging')
        tmp_bucket = create_bucket_if_not_exists(
            project_id, region, f'{cluster_name}-tmp')

    # Create the cluster config.
    cluster = create_dataproc_config(
        project_id, cluster_name, region, staging_bucket, tmp_bucket)

    # Create the cluster.
    operation = cluster_client.create_cluster(request={
        "project_id": project_id,
        "region": region,
        "cluster": cluster
    })
    result = operation.result()

    # Output a success message.
    print(f"Cluster created successfully: {result.cluster_name}")
def post(request):
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })
    create_cluster(cluster_client, "bootcamp-bdmlv", "europe-west2-a",
                   "europe-west2", "hive")
def test_list_clusters(self):
    project_id = os.environ["PROJECT_ID"]
    client = dataproc_v1.ClusterControllerClient()

    project_id_2 = project_id
    region = "global"
    response = client.list_clusters(project_id_2, region)
def setup_teardown():
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(STAGING_BUCKET)
    blob = bucket.blob(JOB_FILE_NAME)
    blob.upload_from_string(SORT_CODE)

    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={"api_endpoint": "{}-dataproc.googleapis.com:443".format(REGION)}
    )

    # The quickstart sample deletes the cluster, but if the test fails
    # before cluster deletion occurs, it can be manually deleted here.
    clusters = cluster_client.list_clusters(
        request={"project_id": PROJECT_ID, "region": REGION}
    )

    for cluster in clusters:
        if cluster.cluster_name == CLUSTER_NAME:
            cluster_client.delete_cluster(
                request={
                    "project_id": PROJECT_ID,
                    "region": REGION,
                    "cluster_name": CLUSTER_NAME,
                }
            )

    blob.delete()
    bucket.delete()
def setup_and_teardown_cluster():
    try:
        # Create cluster using cluster client
        cluster_client = dataproc.ClusterControllerClient(
            client_options={
                "api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"
            })

        operation = cluster_client.create_cluster(
            project_id=PROJECT_ID,
            region=CLUSTER_REGION,
            cluster=CLUSTER_CONFIG)

        # Wait for cluster to provision
        operation.result()

        yield
    finally:
        try:
            # Delete cluster
            operation = cluster_client.delete_cluster(
                project_id=PROJECT_ID,
                region=CLUSTER_REGION,
                cluster_name=DATAPROC_CLUSTER)
            operation.result()
        except NotFound:
            print("Cluster already deleted")
def test_create_cluster(self):
    # Setup Expected Response
    project_id_2 = 'projectId2939242356'
    cluster_name = 'clusterName-1018081872'
    cluster_uuid = 'clusterUuid-1017854240'
    expected_response = {
        'project_id': project_id_2,
        'cluster_name': cluster_name,
        'cluster_uuid': cluster_uuid
    }
    expected_response = clusters_pb2.Cluster(**expected_response)
    operation = operations_pb2.Operation(
        name='operations/test_create_cluster', done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    client = dataproc_v1.ClusterControllerClient(channel=channel)

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster = {}

    response = client.create_cluster(project_id, region, cluster)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = clusters_pb2.CreateClusterRequest(
        project_id=project_id, region=region, cluster=cluster)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_get_cluster(self):
    # Setup Expected Response
    project_id_2 = 'projectId2939242356'
    cluster_name_2 = 'clusterName2875867491'
    cluster_uuid = 'clusterUuid-1017854240'
    expected_response = {
        'project_id': project_id_2,
        'cluster_name': cluster_name_2,
        'cluster_uuid': cluster_uuid
    }
    expected_response = clusters_pb2.Cluster(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'

    response = client.get_cluster(project_id, region, cluster_name)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = clusters_pb2.GetClusterRequest(
        project_id=project_id, region=region, cluster_name=cluster_name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_diagnose_cluster(self):
    # Setup Expected Response
    expected_response = {}
    expected_response = empty_pb2.Empty(**expected_response)
    operation = operations_pb2.Operation(
        name='operations/test_diagnose_cluster', done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'

    response = client.diagnose_cluster(project_id, region, cluster_name)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = clusters_pb2.DiagnoseClusterRequest(
        project_id=project_id, region=region, cluster_name=cluster_name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_list_clusters(self):
    # Setup Expected Response
    next_page_token = ''
    clusters_element = {}
    clusters = [clusters_element]
    expected_response = {
        'next_page_token': next_page_token,
        'clusters': clusters
    }
    expected_response = clusters_pb2.ListClustersResponse(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_clusters(project_id, region)
    resources = list(paged_list_response)
    assert len(resources) == 1
    assert expected_response.clusters[0] == resources[0]

    assert len(channel.requests) == 1
    expected_request = clusters_pb2.ListClustersRequest(
        project_id=project_id, region=region)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_update_cluster_exception(self):
    # Setup Response
    error = status_pb2.Status()
    operation = operations_pb2.Operation(
        name='operations/test_update_cluster_exception', done=True)
    operation.error.CopyFrom(error)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'
    cluster = {}
    update_mask = {}

    response = client.update_cluster(project_id, region, cluster_name,
                                     cluster, update_mask)
    exception = response.exception()
    assert exception.errors[0] == error
def create_cluster(project_id, region, cluster_name):
    """This sample walks a user through creating a Cloud Dataproc cluster
    using the Python client library.

    Args:
        project_id (string): Project to use for creating resources.
        region (string): Region where the resources should live.
        cluster_name (string): Name to use for creating a cluster.
    """

    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )

    # Create the cluster config.
    cluster = {
        "project_id": project_id,
        "cluster_name": cluster_name,
        "config": {
            "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-1"},
            "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-1"},
        },
    }

    # Create the cluster.
    operation = cluster_client.create_cluster(
        request={"project_id": project_id, "region": region, "cluster": cluster}
    )
    result = operation.result()

    # Output a success message.
    print(f"Cluster created successfully: {result.cluster_name}")
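# A hedged usage sketch for the sample above. The project, region, and cluster
# name values are illustrative assumptions, not values from the original
# sample; create_cluster blocks on operation.result() until the cluster is
# ready, so this call returns only after provisioning completes.
if __name__ == "__main__":
    create_cluster(
        project_id="my-example-project",   # assumed placeholder project
        region="us-central1",              # assumed placeholder region
        cluster_name="example-cluster",    # assumed placeholder cluster name
    )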
def teardown():
    yield

    client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
        })
    # Client library function
    client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME)
def post(request):
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })
    operation = cluster_client.delete_cluster(
        "bootcamp-bdmlv", "europe-west2", "hive")
    return operation.result()
def dataproc_cluster_client(zone):
    """Lazily create a Dataproc ClusterController client to set up or
    tear down Dataproc clusters.
    """
    region = get_region_from_zone(zone)
    client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address="{}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.ClusterControllerClient(client_transport)
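# A minimal usage sketch for the factory above, assuming the older
# google-cloud-dataproc API used throughout these snippets, where
# list_clusters accepts positional project_id and region arguments. The zone
# and project values are illustrative assumptions.
client = dataproc_cluster_client("us-central1-a")
for cluster in client.list_clusters("my-example-project", "us-central1"):
    print(cluster.cluster_name)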
def test_list_clusters_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    client = dataproc_v1.ClusterControllerClient(channel=channel)

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_clusters(project_id, region)
    with pytest.raises(CustomException):
        list(paged_list_response)
def hello_pubsub(event, context):
    cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address='us-central1-dataproc.googleapis.com:443')
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)

    project_id = ''
    region = 'us-central1'

    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']
    print(cluster_name + " dataproc cluster created")
def teardown():
    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
        })
    # Client library function
    operation = cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME)
    # Wait for cluster to delete
    operation.result()
def test_get_cluster_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    client = dataproc_v1.ClusterControllerClient(channel=channel)

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'

    with pytest.raises(CustomException):
        client.get_cluster(project_id, region, cluster_name)
def dataproc_cluster_client(self):
    """Lazily create a Dataproc ClusterController client to set up or
    tear down Dataproc clusters.
    """
    if self._dataproc_cluster_client is None:
        client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(self._region))
        self._dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
    return self._dataproc_cluster_client
def main(event, context):
    '''
    Triggered by a change to a Cloud Storage bucket.
    :param event:
    :param context:
    :return:
    '''
    # Variables
    PROJECT_ID = 'sas-ivnard'
    CLUSTER_NAME = 'score-spark-demo'
    BUCKET_NAME = 'network-spark-migrate'
    REGION = 'europe-west6'
    ZONE = 'europe-west6-b'
    PIP_PACKAGES = "PyYAML==5.3.1 numpy==1.19.4 pandas==1.1.4 pyspark==3.0.1"
    JOB_ID = 'Batch_Model_Score'

    logging.basicConfig(
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{REGION}-dataproc.googleapis.com:443"
        })

    # Because several files are uploaded in a loop, this function is triggered
    # n times, where n is the number of files uploaded, so we filter on the
    # file name. Note: this needs more investigation; it is sufficient for
    # demo purposes.
    if event[
            'name'] == 'data/ML-MATT-CompetitionQT1920_val_processed.parquet/_SUCCESS':
        logging.info("A new scoring process is starting...")
        logging.info(f"Creating cluster {CLUSTER_NAME}...")
        cluster = create_cluster(PROJECT_ID, CLUSTER_NAME, BUCKET_NAME, REGION,
                                 ZONE, PIP_PACKAGES)

        logging.info(f"Submitting job {JOB_ID}...")
        cluster.add_done_callback(lambda _: submit_score_job(
            PROJECT_ID, CLUSTER_NAME, REGION, JOB_ID))

        while check_job_state(PROJECT_ID, REGION, JOB_ID) != 'state.done':
            logging.info(f"Job {JOB_ID} is running...")
            time.sleep(5)
        logging.info(f"Job {JOB_ID} is done!")

        logging.info(f"Deleting cluster {CLUSTER_NAME}...")
        delete_cluster(PROJECT_ID, CLUSTER_NAME, REGION)
        while check_if_cluster(PROJECT_ID, CLUSTER_NAME, REGION) is not None:
            logging.info(f"Deleting {CLUSTER_NAME}...")
            time.sleep(2)
        logging.info(f"Cluster {CLUSTER_NAME} deleted!")
def __init__(self, cluster_metadata: ClusterMetadata) -> None:
    """Initializes the DataprocClusterManager with properties required to
    interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    # Pipelines whose jobs are executed on the cluster.
    self.pipelines = set()
    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint':
                f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })
    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
def test_list_clusters_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_clusters(project_id, region)
    with pytest.raises(CustomException):
        list(paged_list_response)
def delete_cluster():
    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format('europe-west1')
    })

    # Delete the cluster.
    operation = cluster_client.delete_cluster('big-data-architecture-ricardo',
                                              'europe-west1', 'dataproc-bda')
    result = operation.result()

    # Output a success message.
    return 'Cluster deleted successfully'
def teardown():
    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{REGION}-dataproc.googleapis.com:443"
        })
    # Client library function
    operation = cluster_client.delete_cluster(
        request={
            "project_id": PROJECT_ID,
            "region": REGION,
            "cluster_name": CLUSTER_NAME,
        })
    # Wait for cluster to delete
    operation.result()
def test_get_cluster_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.ClusterControllerClient()

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    cluster_name = 'clusterName-1018081872'

    with pytest.raises(CustomException):
        client.get_cluster(project_id, region, cluster_name)