def post(self, request): print("List of clusters initiated ......") os.environ[ "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json" project_id = "deepak-cloud-trail" zone = request.POST["zone"] region = get_region_from_zone(zone) zone_uri = \ 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( project_id, zone) client_transport = ( cluster_controller_grpc_transport.ClusterControllerGrpcTransport( address='{}-dataproc.googleapis.com:443'.format(region))) dataproc_client = dataproc_v1.ClusterControllerClient(client_transport) cluster_name = request.POST["cluster_name"] cluster_data = { 'project_id': project_id, 'cluster_name': cluster_name, 'config': { 'gce_cluster_config': { 'zone_uri': zone_uri }, 'master_config': { 'num_instances': 1, 'machine_type_uri': 'n1-standard-1' }, 'worker_config': { 'num_instances': 2, 'machine_type_uri': 'n1-standard-1' } } } cluster = dataproc_client.create_cluster(project_id, region, cluster_data)
def main(project_id, zone, cluster_name, bucket_name,
         pyspark_file=None, create_new_cluster=True, global_region=True):
    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id,
                                   region)

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(dataproc_cluster_client, project_id,
                                   region, cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        spark_file.close()

def dataproc_cluster_client(zone):
    """Lazily create a Dataproc ClusterController client to set up or tear
    down dataproc clusters."""
    region = get_region_from_zone(zone)
    client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region)))
    return dataproc_v1.ClusterControllerClient(client_transport)

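
# A minimal usage sketch for the helper above (not part of the original
# module): it assumes get_region_from_zone() is defined alongside it and that
# application default credentials are configured. The zone and project id are
# placeholders.
client = dataproc_cluster_client("us-central1-b")
for cluster in client.list_clusters("my-project-id", "us-central1"):
    print(cluster.cluster_name)
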
def hello_pubsub(event, context):
    cluster_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='us-central1-dataproc.googleapis.com:443'))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)
    project_id = ''
    region = 'us-central1'
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']
    print(cluster_name + " dataproc cluster created")

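
# A hedged local-test sketch for hello_pubsub() above: it builds a fake
# Pub/Sub event whose decoded payload mimics the Cloud Logging entry shape the
# function reads (resource.labels.cluster_name). The cluster name is a
# placeholder.
import base64
import json

fake_log_entry = {'resource': {'labels': {'cluster_name': 'example-cluster'}}}
fake_event = {
    'data': base64.b64encode(
        json.dumps(fake_log_entry).encode('utf-8')).decode('utf-8')
}
hello_pubsub(fake_event, context=None)
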
def dataproc_cluster_client(self):
    """Lazily create a Dataproc ClusterController client to set up or tear
    down dataproc clusters."""
    if self._dataproc_cluster_client is None:
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(
                    self._region)))
        self._dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
    return self._dataproc_cluster_client

def main(project_id, region):
    if region == "global":
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
    else:
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)

    list_clusters(dataproc_cluster_client, project_id, region)

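
# A possible command-line entry point for main() above, sketched after the
# public Dataproc list_clusters sample; the argparse help strings are
# illustrative.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Lists Dataproc clusters in a project and region.")
    parser.add_argument("project_id", help="Project ID to list clusters from")
    parser.add_argument("region", help="Region, e.g. us-central1, or 'global'")
    args = parser.parse_args()
    main(args.project_id, args.region)
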
def post(self, request): print("List of clusters initiated ......") os.environ[ "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json" region = request.POST["region"] project_id = "deepak-cloud-trail" client_transport = ( cluster_controller_grpc_transport.ClusterControllerGrpcTransport( address='{}-dataproc.googleapis.com:443'.format(region))) dataproc_client = dataproc_v1.ClusterControllerClient(client_transport) list_clusters = dataproc_client.list_clusters(project_id, region) print(list_clusters) for cluster in list_clusters: print("$$$$$$$$", cluster.cluster_name)
def set_cluster_clients():
    global dataproc_cluster_client, dataproc_job_client

    if not dataproc_cluster_client or not dataproc_job_client:
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

    return dataproc_cluster_client, dataproc_job_client

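
# A sketch of the module-level state set_cluster_clients() above relies on;
# the GCP_REGION constant name, the None-initialized globals, and the example
# region are assumptions inferred from how the function reads them.
GCP_REGION = "GCP_REGION"          # name of the env var holding the region
dataproc_cluster_client = None     # populated lazily on first call
dataproc_job_client = None

os.environ[GCP_REGION] = "us-central1"  # example region
cluster_client, job_client = set_cluster_clients()
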
def update_firewall_rule(event, context):
    import base64
    import json

    from google.cloud import dataproc_v1
    from google.cloud.dataproc_v1.gapic.transports import cluster_controller_grpc_transport
    from googleapiclient.discovery import build

    project_id = 'playground'
    firewall = 'test2'
    region = 'us-central1'
    zone = 'us-central1-c'
    ip = []
    cluster_name = ''
    network = 'global/networks/test'

    compute = build('compute', 'v1')
    cluster_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='us-central1-dataproc.googleapis.com:443'))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)

    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']

    cluster = dataproc_cluster_client.get_cluster(project_id, region,
                                                  cluster_name)
    master_nodes = list(cluster.config.master_config.instance_names)
    worker_nodes = list(cluster.config.worker_config.instance_names)

    result = compute.instances().list(project=project_id, zone=zone).execute()
    for instance in result["items"]:
        if instance['name'] in master_nodes or instance['name'] in worker_nodes:
            ip.append(
                instance['networkInterfaces'][0]['accessConfigs'][0]['natIP'])

    firewall_body = {
        'sourceRanges': ip,
        'allowed': [{'IPProtocol': 'tcp', 'ports': ['22']}],
        'network': network
    }
    request = compute.firewalls().update(project=project_id,
                                         firewall=firewall,
                                         body=firewall_body)
    response = request.execute()

def __init__(
    self,
    transport=None,
    channel=None,
    credentials=None,
    client_config=None,
    client_info=None,
    client_options=None,
):
    """Constructor.

    Args:
        transport (Union[~.ClusterControllerGrpcTransport,
                Callable[[~.Credentials, type], ~.ClusterControllerGrpcTransport]):
            A transport instance, responsible for actually making the API
            calls. The default transport uses the gRPC protocol. This
            argument may also be a callable which returns a transport
            instance. Callables will be sent the credentials as the first
            argument and the default transport class as the second argument.
        channel (grpc.Channel): DEPRECATED. A ``Channel`` instance through
            which to make calls. This argument is mutually exclusive with
            ``credentials``; providing both will raise an exception.
        credentials (google.auth.credentials.Credentials): The authorization
            credentials to attach to requests. These credentials identify
            this application to the service. If none are specified, the
            client will attempt to ascertain the credentials from the
            environment. This argument is mutually exclusive with providing
            a transport instance to ``transport``; doing so will raise an
            exception.
        client_config (dict): DEPRECATED. A dictionary of call options for
            each method. If not specified, the default configuration is used.
        client_info (google.api_core.gapic_v1.client_info.ClientInfo): The
            client info used to send a user-agent string along with API
            requests. If ``None``, then default info will be used. Generally,
            you only need to set this if you're developing your own client
            library.
        client_options (Union[dict, google.api_core.client_options.ClientOptions]):
            Client options used to set user options on the client. API
            Endpoint should be set through client_options.
    """
    # Raise deprecation warnings for things we want to go away.
    if client_config is not None:
        warnings.warn(
            "The `client_config` argument is deprecated.",
            PendingDeprecationWarning,
            stacklevel=2,
        )
    else:
        client_config = cluster_controller_client_config.config

    if channel:
        warnings.warn(
            "The `channel` argument is deprecated; use "
            "`transport` instead.",
            PendingDeprecationWarning,
            stacklevel=2,
        )

    api_endpoint = self.SERVICE_ADDRESS
    if client_options:
        if type(client_options) == dict:
            client_options = google.api_core.client_options.from_dict(
                client_options)
        if client_options.api_endpoint:
            api_endpoint = client_options.api_endpoint

    # Instantiate the transport.
    # The transport is responsible for handling serialization and
    # deserialization and actually sending data to the service.
    if transport:
        if callable(transport):
            self.transport = transport(
                credentials=credentials,
                default_class=cluster_controller_grpc_transport.
                ClusterControllerGrpcTransport,
                address=api_endpoint,
            )
        else:
            if credentials:
                raise ValueError(
                    "Received both a transport instance and "
                    "credentials; these are mutually exclusive.")
            self.transport = transport
    else:
        self.transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address=api_endpoint, channel=channel, credentials=credentials)

    if client_info is None:
        client_info = google.api_core.gapic_v1.client_info.ClientInfo(
            gapic_version=_GAPIC_LIBRARY_VERSION)
    else:
        client_info.gapic_version = _GAPIC_LIBRARY_VERSION
    self._client_info = client_info

    # Parse out the default settings for retry and timeout for each RPC
    # from the client configuration.
    # (Ordinarily, these are the defaults specified in the `*_config.py`
    # file next to this one.)
    self._method_configs = google.api_core.gapic_v1.config.parse_method_configs(
        client_config["interfaces"][self._INTERFACE_NAME])

    # Save a dictionary of cached API call functions.
    # These are the actual callables which invoke the proper
    # transport methods, wrapped with `wrap_method` to add retry,
    # timeout, and the like.
    self._inner_api_calls = {}

def dataproc_get_client(region):
    transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address="{0}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.ClusterControllerClient(transport)

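
# A short usage sketch for dataproc_get_client() above; the project id and
# region are placeholders.
client = dataproc_get_client("us-central1")
for cluster in client.list_clusters("my-project-id", "us-central1"):
    print(cluster.cluster_name)
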
import base64
import json

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1.gapic.transports import cluster_controller_grpc_transport
from googleapiclient.discovery import build

project_id = 'playground-s-11-b2c3df'
firewall = 'test2'
region = 'us-central1'
zone = 'us-central1-c'
ip = []
cluster_name = 'cluster-ec21'
network = 'global/networks/test'

compute = build('compute', 'v1')
cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
    address='us-central1-dataproc.googleapis.com:443')
dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
    cluster_transport)

cluster = dataproc_cluster_client.get_cluster(project_id, region,
                                              cluster_name)
master_nodes = list(cluster.config.master_config.instance_names)
worker_nodes = list(cluster.config.worker_config.instance_names)

result = compute.instances().list(project=project_id, zone=zone).execute()
for instance in result["items"]:
    if instance['name'] in master_nodes or instance['name'] in worker_nodes:
        ip.append(
            instance['networkInterfaces'][0]['accessConfigs'][0]['natIP'])

firewall_body = {
    'sourceRanges': ip,
    'allowed': [{'IPProtocol': 'tcp', 'ports': ['22']}],
    'network': network
}

def trigger_dataproc_jobs(message, context):
    """Entry point for the Cloud Function.

    Captures a Pub/Sub message from the configured source topic and constructs
    a Dataproc Inline Workflow request to run the jobs specified in the
    message request.

    message: the Pub/Sub message
    context: the Cloud Function context information
    """
    if not 'data' in message:
        print("no data in the Pubsub message, nothing to do...")
        return

    event = json.loads(base64.b64decode(message['data']).decode('utf-8'))

    if not "jobs" in event.keys():
        print("jobs property not present in the event, no work to be done...")
        return

    # initialize needed GCP clients
    wf_client_transport = (
        workflow_template_service_grpc_transport.
        WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region_id)))
    dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
        wf_client_transport)
    dp_client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region_id)))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        dp_client_transport)
    storage_client = storage.Client()

    # retrieves the cloud function configuration from storage
    config = retrieve_configuration(storage_client)

    # build parent region path for dataproc api requests
    parent = dataproc_workflow_client.region_path(project_id, region_id)

    # extract event parameters
    zone = event.get('zone', zone_id)
    job_name = event.get('job_name', 'dataproc-workflow-test')
    template_name = "projects/{}/regions/{}/workflowTemplates/{}".format(
        project_id, region_id, job_name)
    cluster_name = 'cluster-' + job_name
    cluster_init_actions = event.get('cluster_init_actions', [])
    request_id = event.get('request_id', template_name.replace('/', '_'))
    job_labels = event.get('labels', {})
    job_labels['job_name'] = job_name
    job_labels['request_id'] = request_id
    req_metadata = event.get('metadata', {})

    # let's check if there is another cluster with the same labels already
    # running; randomizing the wait time improves the chances of catching
    # duplicated requests
    time.sleep(random.randint(1, 5))
    for cluster in dataproc_cluster_client.list_clusters(
            project_id, region_id,
            'labels.job_name = {} AND labels.request_id = {}'.format(
                job_name, request_id)):
        print(
            "workflow instance already running for same pair job_name and request_id ({},{}), exiting"
            .format(job_name, request_id))
        return

    if not isinstance(cluster_init_actions, list):
        print("cluster initialization actions should be a list")
        return

    # check the function's configuration for an entry matching the job name
    # in the execution request
    cluster_config = None
    if not job_name in config.keys():
        # if no particular configuration exists, use the default one
        cluster_config = config['default_cluster_config']
    else:
        cluster_config = config[job_name]

    cluster_config['labels'] = {**cluster_config['labels'], **job_labels}
    cluster_config['cluster_name'] = cluster_name
    cluster_config['config']['gce_cluster_config']['metadata'] = {
        **cluster_config['config']['gce_cluster_config']['metadata'],
        **req_metadata
    }
    cluster_config['config']['gce_cluster_config']['zone_uri'] = zone
    cluster_config['config']['initialization_actions'] = cluster_config[
        'config']['initialization_actions'] + cluster_init_actions
    for action in cluster_config['config']['initialization_actions']:
        if 'execution_timeout' in action:
            timeout = Duration(seconds=action['execution_timeout'])
            action['execution_timeout'] = timeout

    # creates the inline template request
    inline_template = {
        'name': template_name,
        'placement': {
            'managed_cluster': cluster_config
        },
        'jobs': event['jobs']
    }

    # sends the request to instantiate the workflow inlined template
    response = dataproc_workflow_client.instantiate_inline_workflow_template(
        parent,
        inline_template,
        request_id=request_id,
        metadata=[('job_name', job_name)])

    # captures the operation name for the execution's metadata along with
    # other request parameters
    metadata = {
        'operation_name': response.operation.name,
        'template_name': template_name,
        'cluster_name': cluster_name
    }
    print('workflow instance created, request id {}, operation\'s name: {}'.
          format(request_id, metadata['operation_name']))

    # Sets the future to be called when the workflow execution completes.
    # This partial function gets populated with local information, normally
    # not present at the callback execution time, to enrich logging and event
    # results propagation.
    response.add_done_callback(partial(execution_callback, metadata=metadata))
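
# A hedged example of the kind of Pub/Sub payload trigger_dataproc_jobs()
# above expects. Only the 'jobs' key is required by the function; the other
# keys fall back to defaults. The bucket URI, job names, and labels are
# placeholders.
import base64
import json

example_event = {
    'job_name': 'daily-aggregation',
    'request_id': 'daily-aggregation-2020-01-01',
    'labels': {'team': 'data-eng'},
    'jobs': [{
        'step_id': 'aggregate',
        'pyspark_job': {
            'main_python_file_uri': 'gs://my-bucket/jobs/aggregate.py'
        }
    }]
}
message = {
    'data': base64.b64encode(
        json.dumps(example_event).encode('utf-8')).decode('utf-8')
}
trigger_dataproc_jobs(message, context=None)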