def main(project_id, zone, cluster_name, bucket_name,
         pyspark_file=None, create_new_cluster=True, global_region=True):
    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id,
                                   region)

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(dataproc_cluster_client, project_id,
                                   region, cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket,
                                 job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        spark_file.close()
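# Hedged usage sketch for the main() function above: a command-line entry
# point wired up with argparse. The flag names mirror main()'s parameters but
# are assumptions for illustration, not part of the original sample.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Submit a PySpark job to a Dataproc cluster.')
    parser.add_argument('--project_id', required=True, help='GCP project ID.')
    parser.add_argument('--zone', required=True,
                        help='Compute Engine zone, e.g. us-central1-a.')
    parser.add_argument('--cluster_name', required=True,
                        help='Dataproc cluster name.')
    parser.add_argument('--gcs_bucket', required=True,
                        help='GCS bucket for staging and job output.')
    parser.add_argument('--pyspark_file', default=None,
                        help='Optional local PySpark file to run.')
    parser.add_argument('--global_region', action='store_true',
                        help='Use the global Dataproc endpoint.')
    args = parser.parse_args()

    main(args.project_id, args.zone, args.cluster_name, args.gcs_bucket,
         pyspark_file=args.pyspark_file, global_region=args.global_region)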
def test_submit_job(self):
    # Setup Expected Response
    driver_output_resource_uri = "driverOutputResourceUri-542229086"
    driver_control_files_uri = "driverControlFilesUri207057643"
    job_uuid = "jobUuid-1615012099"
    expected_response = {
        "driver_output_resource_uri": driver_output_resource_uri,
        "driver_control_files_uri": driver_control_files_uri,
        "job_uuid": job_uuid,
    }
    expected_response = jobs_pb2.Job(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup Request
    project_id = "projectId-1969970175"
    region = "region-934795532"
    job = {}

    response = client.submit_job(project_id, region, job)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.SubmitJobRequest(
        project_id=project_id, region=region, job=job)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_cancel_job(self):
    # Setup Expected Response
    driver_output_resource_uri = 'driverOutputResourceUri-542229086'
    driver_control_files_uri = 'driverControlFilesUri207057643'
    expected_response = {
        'driver_output_resource_uri': driver_output_resource_uri,
        'driver_control_files_uri': driver_control_files_uri
    }
    expected_response = jobs_pb2.Job(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job_id = 'jobId-1154752291'

    response = client.cancel_job(project_id, region, job_id)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.CancelJobRequest(
        project_id=project_id, region=region, job_id=job_id)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_list_jobs(self):
    # Setup Expected Response
    next_page_token = ''
    jobs_element = {}
    jobs = [jobs_element]
    expected_response = {'next_page_token': next_page_token, 'jobs': jobs}
    expected_response = jobs_pb2.ListJobsResponse(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_jobs(project_id, region)
    resources = list(paged_list_response)
    assert len(resources) == 1
    assert expected_response.jobs[0] == resources[0]

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.ListJobsRequest(
        project_id=project_id, region=region)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def http_request(request):
    """Responds to any HTTP request.

    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using `make_response
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    region = "europe-west1"
    project = "big-data-keepcoding"
    cluster_name = "kc-airbnb-cluster"

    create_cluster(project, region, cluster_name)

    job_transport = (
        job_controller_grpc_transport.JobControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region)))
    dataproc_job_client = dataproc.JobControllerClient(job_transport)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/load_data.sql")
    wait_for_job(dataproc_job_client, project, region, job_id)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/compute_recommendations.sql")
    wait_for_job(dataproc_job_client, project, region, job_id)

    send_message()
    delete_cluster(project, region, cluster_name)

    return 'OK'
def test_list_jobs(self):
    # Setup Expected Response
    next_page_token = ''
    jobs_element = {}
    jobs = [jobs_element]
    expected_response = {'next_page_token': next_page_token, 'jobs': jobs}
    expected_response = jobs_pb2.ListJobsResponse(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    client = dataproc_v1.JobControllerClient(channel=channel)

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_jobs(project_id, region)
    resources = list(paged_list_response)
    assert len(resources) == 1
    assert expected_response.jobs[0] == resources[0]

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.ListJobsRequest(
        project_id=project_id, region=region)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def submit_train_job(project_id, cluster_name, region, job_id):
    '''
    Submit a batch train job.

    :param project_id: The name of the project to use for creating resources.
    :param cluster_name: The name of the cluster
    :param region: The name of the region
    :param job_id: The name of the job
    :return: None
    '''
    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    # Create the job config. Equivalent gcloud command:
    # gcloud dataproc jobs submit pyspark gs://network-spark-migrate/model/train.py
    #   --cluster train-spark-demo --region europe-west6
    #   --files=gs://network-spark-migrate/model/demo-config.yml -- --configfile ./demo-config.yml
    job = {
        'reference': {
            'project_id': project_id,
            'job_id': job_id
        },
        'placement': {
            'cluster_name': cluster_name
        },
        'pyspark_job': {
            'main_python_file_uri': 'gs://network-spark-migrate/model/train.py',
            'file_uris': ['gs://network-spark-migrate/model/demo-config.yml'],
            'args': ['--configfile', './demo-config.yml']
        }
    }

    job_client.submit_job(
        request={"project_id": project_id, "region": region, "job": job}
    )
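# Illustrative call to submit_train_job() above; the cluster name and region
# come from the gcloud command quoted in its comment, while the project and
# job IDs are placeholders.
submit_train_job(
    project_id='my-project',          # placeholder project ID
    cluster_name='train-spark-demo',
    region='europe-west6',
    job_id='train-job-001',           # placeholder job ID
)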
def test_submit_job(self):
    # Setup Expected Response
    driver_output_resource_uri = 'driverOutputResourceUri-542229086'
    driver_control_files_uri = 'driverControlFilesUri207057643'
    expected_response = {
        'driver_output_resource_uri': driver_output_resource_uri,
        'driver_control_files_uri': driver_control_files_uri
    }
    expected_response = jobs_pb2.Job(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    client = dataproc_v1.JobControllerClient(channel=channel)

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job = {}

    response = client.submit_job(project_id, region, job)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.SubmitJobRequest(
        project_id=project_id, region=region, job=job)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def __init__(
    self,
    cluster_name: str,
    staging_location: str,
    region: str,
    project_id: str,
):
    """
    Initialize a Dataproc job controller client, used internally for job
    submission and result retrieval.

    Args:
        cluster_name (str): Dataproc cluster name.
        staging_location (str): GCS directory for the storage of files
            generated by the launcher, such as the pyspark scripts.
        region (str): Dataproc cluster region.
        project_id (str): GCP project id for the dataproc cluster.
    """
    self.cluster_name = cluster_name
    scheme, self.staging_bucket, self.remote_path, _, _, _ = urlparse(
        staging_location)
    if scheme != "gs":
        raise ValueError(
            "Only GCS staging location is supported for DataprocLauncher.")
    self.project_id = project_id
    self.region = region
    self.job_client = dataproc_v1.JobControllerClient(
        client_options={
            "api_endpoint": f"{region}-dataproc.googleapis.com:443"
        })
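# A minimal construction sketch for this launcher; the class name is inferred
# from the ValueError message above ("DataprocLauncher"), and all argument
# values are placeholders. Note the staging location must use the gs:// scheme.
launcher = DataprocLauncher(
    cluster_name='my-dataproc-cluster',
    staging_location='gs://my-staging-bucket/artifacts',
    region='us-central1',
    project_id='my-project',
)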
def test_spark_streaming_from_pubsublite(
        subscription: Subscription,
        dataproc_cluster: dataproc_v1.Cluster) -> None:
    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(client_options={
        "api_endpoint": "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
    })

    # Create the job config.
    job = {
        # Use the subscription prefix and the first four alphanumeric
        # characters of the UUID as job ID
        "reference": {"job_id": subscription.name.split("/")[-1][:-28]},
        "placement": {"cluster_name": dataproc_cluster.cluster_name},
        "pyspark_job": {
            "main_python_file_uri": pyfile(
                "spark_streaming_from_pubsublite_example.py"),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {"spark.master": "yarn"},
            "logging_config": {
                "driver_log_levels": {"root": LoggingConfig.Level.INFO}
            },
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--subscription_id={SUBSCRIPTION_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "read-" + UUID,
        })
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
    output = (storage.Client().get_bucket(matches.group(1)).blob(
        f"{matches.group(2)}.000000000").download_as_text())

    assert "Batch: 0\n" in output
def test_clean():
    """Tests clean.py by submitting it to a Dataproc cluster"""
    # Submit job to Dataproc cluster
    job_client = dataproc.JobControllerClient(
        client_options={
            "api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"
        })
    operation = job_client.submit_job_as_operation(
        project_id=PROJECT_ID, region=CLUSTER_REGION, job=DATAPROC_JOB)

    # Wait for job to complete
    result = operation.result()

    # Get job output
    output_location = result.driver_output_resource_uri + ".000000000"
    blob = get_blob_from_path(output_location)
    out = blob.download_as_string().decode("utf-8")

    # trip duration
    assert not is_in_table(r"\d*.\d* s", out)
    assert not is_in_table(r"\d*.\d* min", out)
    assert not is_in_table(r"\d*.\d* h", out)

    # station latitude & longitude
    assert not is_in_table(r"\d+" + "\u00B0" + r"\d+\'\d+\"", out)
    assert is_in_table(r"\d*.\d*", out)

    # gender
    assert not is_in_table("M", out)
    assert not is_in_table("m", out)
    assert not is_in_table("male", out)
    assert not is_in_table("MALE", out)
    assert not is_in_table("F", out)
    assert not is_in_table("f", out)
    assert not is_in_table("female", out)
    assert not is_in_table("FEMALE", out)
    assert not is_in_table("U", out)
    assert not is_in_table("u", out)
    assert not is_in_table("unknown", out)
    assert not is_in_table("UNKNOWN", out)
    assert is_in_table("Male", out)
    assert is_in_table("Female", out)

    # customer plan
    assert not is_in_table("subscriber", out)
    assert not is_in_table("SUBSCRIBER", out)
    assert not is_in_table("sub", out)
    assert not is_in_table("customer", out)
    assert not is_in_table("CUSTOMER", out)
    assert not is_in_table("cust", out)
    assert is_in_table("Subscriber", out)
    assert is_in_table("Customer", out)
def dataproc_job_client(self):
    """ Lazily obtain a GCP Dataproc JobController client """
    if self._dataproc_job_client is None:
        job_transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(self._region))
        self._dataproc_job_client = dataproc_v1.JobControllerClient(
            job_transport)
    return self._dataproc_job_client
def test_list_jobs_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    client = dataproc_v1.JobControllerClient(channel=channel)

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_jobs(project_id, region)
    with pytest.raises(CustomException):
        list(paged_list_response)
def post(request):
    job_client = dataproc.JobControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })
    job = request.get_json()

    job_response = job_client.submit_job('bootcamp-bdmlv', 'europe-west2',
                                         job)
    job_id = job_response.reference.job_id

    return f'Submitted job "{job_id}".'
def test_submit_job_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    client = dataproc_v1.JobControllerClient(channel=channel)

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job = {}

    with pytest.raises(CustomException):
        client.submit_job(project_id, region, job)
def __init__(self, bucket, zone, cluster, project_id, platform,
             job_path='jobs-root', use_cloud_engine_credentials=False):
    self.__bucket = bucket
    self.__jobs_path = job_path
    self.__zone = zone
    self.__cluster = cluster
    self.__project_id = project_id
    self.__region = None
    self.__cluster_uuid = None
    self.__platform = platform

    if self.__platform == 'GCP':
        if self.__zone == 'global':
            self.__region = self.__zone
        else:
            self.__region = self.get_region_from_zone(self.__zone)

        credentials = None
        if use_cloud_engine_credentials:
            credentials = compute_engine.Credentials()

        if cluster is None and job_path is None:
            self._cloudml = discovery.build('ml', 'v1',
                                            credentials=credentials)
        else:
            if self.__zone == 'global':
                self._dataproc_job_client = dataproc_v1.JobControllerClient(
                    credentials=credentials)
            else:
                job_transport = (
                    job_controller_grpc_transport.JobControllerGrpcTransport(
                        address='{}-dataproc.googleapis.com:443'.format(
                            self.__region),
                        credentials=credentials))
                self._dataproc_job_client = dataproc_v1.JobControllerClient(
                    job_transport)
    else:
        self._session = boto3.Session()
        self._sm_session = sagemaker.Session()
        if not use_cloud_engine_credentials:
            self._role = sagemaker.get_execution_role()
        else:
            self._role = use_cloud_engine_credentials
def test_list_jobs_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'

    paged_list_response = client.list_jobs(project_id, region)
    with pytest.raises(CustomException):
        list(paged_list_response)
def test_spark_streaming_to_pubsublite(topic: Topic) -> None:
    from google.cloud.dataproc_v1.types import LoggingConfig

    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(
        client_options={
            "api_endpoint": "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
        }
    )

    # Create the job config.
    job = {
        "placement": {"cluster_name": CLUSTER_ID},
        "pyspark_job": {
            "main_python_file_uri": pyfile("spark_streaming_to_pubsublite_example.py"),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {"spark.master": "yarn"},
            "logging_config": {"driver_log_levels": {"root": LoggingConfig.Level.INFO}},
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--topic_id={TOPIC_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "write-" + UUID,
        }
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_text()
    )

    assert "Committed 1 messages for epochId" in output
def submit_job():
    job_transport = (
        job_controller_grpc_transport.JobControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format('europe-west1')))

    job_details = {
        'placement': {
            'cluster_name': 'dataproc-bda'
        },
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(
                'bda5-keepcoding-ricardo1', 'scripts/query_lat.txt')
        }
    }

    dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    result = dataproc_job_client.submit_job(
        project_id='big-data-architecture-ricardo',
        region='europe-west1',
        job=job_details)

    job_id = result.reference.job_id
    print('Submitted job ID {}.'.format(job_id))
def test_delete_job_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup request
    project_id = "projectId-1969970175"
    region = "region-934795532"
    job_id = "jobId-1154752291"

    with pytest.raises(CustomException):
        client.delete_job(project_id, region, job_id)
def test_submit_job_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job = {}

    with pytest.raises(CustomException):
        client.submit_job(project_id, region, job)
def test_delete_job(self):
    channel = ChannelStub()
    client = dataproc_v1.JobControllerClient(channel=channel)

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job_id = 'jobId-1154752291'

    client.delete_job(project_id, region, job_id)

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.DeleteJobRequest(
        project_id=project_id, region=region, job_id=job_id)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def set_cluster_clients():
    global dataproc_cluster_client, dataproc_job_client
    if not dataproc_cluster_client or not dataproc_job_client:
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_cluster_client, dataproc_job_client
def execute(self, context: bigflow.JobContext):
    logger.info("Run job %r", self.id)
    job_internal_id = self._generate_internal_jobid(context)
    client_options = {
        'api_endpoint': f"{self.gcp_region}-dataproc.googleapis.com:443"
    }
    storage_client = storage.Client(project=self.gcp_project_id)
    dataproc_job_client = dataproc_v1.JobControllerClient(
        client_options=client_options)

    driver_script = self._prepare_driver_script(context)

    logger.info("Prepare and upload python package...")
    bucket = storage_client.get_bucket(self.bucket_id)
    egg_local_path = str(
        bigflow.build.reflect.build_egg(self._project_pkg_path))
    egg_path = _upload_egg(egg_local_path, bucket, job_internal_id)
    driver_path = f"{job_internal_id}/{self.driver_filename}"
    _upload_driver_script(driver_script, bucket, driver_path)

    with self._with_temp_cluster(job_internal_id) as cluster_name:
        job = _submit_single_pyspark_job(
            dataproc_job_client=dataproc_job_client,
            project_id=self.gcp_project_id,
            region=self.gcp_region,
            cluster_name=cluster_name,
            bucket_id=self.bucket_id,
            jar_file_uris=self.jar_file_uris,
            driver_path=driver_path,
            egg_path=egg_path,
            properties=self._prepare_pyspark_properties(context),
        )
        try:
            _wait_for_job_to_finish(dataproc_job_client, self.gcp_project_id,
                                    self.gcp_region, job)
        finally:
            _print_job_output_log(storage_client, dataproc_job_client,
                                  self.gcp_project_id, self.gcp_region, job)

    logger.info("Job %r finished", self.id)
def loaded():
    if request.method == 'POST':
        # Get folder
        fileArr = []
        folder = []
        files = request.files.getlist('file')
        for f in files:
            fileArr.append(f.filename)
        folder = fileArr[0].split("/")

        # Dataproc API
        transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address='us-west1-dataproc.googleapis.com:443')
        project_id = 'imperial-sphere-273422'
        region = 'us-west1'

        # Define job arguments:
        job_args = []
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/' + folder[0])
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/new')

        job_client = dataproc_v1.JobControllerClient(transport)

        # Create Hadoop job
        hadoop_job = dataproc_v1.types.HadoopJob(
            jar_file_uris=[
                'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/JAR/invertedindex.jar'
            ],
            main_class='InvertedIndex',
            args=job_args)

        # Define remote cluster to send the job to
        job_placement = dataproc_v1.types.JobPlacement()
        job_placement.cluster_name = 'cluster-f010'

        # Define job configuration
        main_job = dataproc_v1.types.Job(hadoop_job=hadoop_job,
                                         placement=job_placement)

        # Send job
        result = job_client.submit_job(project_id, region, main_job)
        job_id = result.reference.job_id

        # Wait for job to complete or error out.
        while True:
            job = job_client.get_job(project_id, region, job_id)
            if job.status.State.Name(job.status.state) == 'DONE':
                return render_template("loaded.html")

    return render_template("loaded.html")
def check_job_state(project_id, region, job_id):
    '''
    Check the job state.

    :param project_id: The name of the project to use for creating resources.
    :param region: The name of the region
    :param job_id: The name of the job
    :return: job_state: A string with the job state
    '''
    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    job_instance = job_client.get_job(
        request={"project_id": project_id, "region": region, "job_id": job_id}
    )
    job_state = str.lower(str(job_instance.status.state))
    return job_state
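# Hedged polling sketch built on check_job_state() above. It assumes the
# stringified enum renders as "state.<name>" (e.g. "state.done" after
# lower-casing); the endswith check avoids matching "state.setup_done". The
# poll interval and argument values are placeholders.
import time

def wait_until_terminal(project_id, region, job_id, poll_seconds=10):
    while True:
        state = check_job_state(project_id, region, job_id)
        if state.endswith(('.done', '.error', '.cancelled')):
            return state
        time.sleep(poll_seconds)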
def test_delete_job(self):
    channel = ChannelStub()
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.JobControllerClient()

    # Setup Request
    project_id = 'projectId-1969970175'
    region = 'region-934795532'
    job_id = 'jobId-1154752291'

    client.delete_job(project_id, region, job_id)

    assert len(channel.requests) == 1
    expected_request = jobs_pb2.DeleteJobRequest(
        project_id=project_id, region=region, job_id=job_id)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def submit_job(project_id, region, cluster_name):
    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    # Create the job config. 'main_jar_file_uri' can also be a
    # Google Cloud Storage URL.
    job = {
        'placement': {
            'cluster_name': cluster_name
        },
        'spark_job': {
            'main_class': 'org.apache.spark.examples.SparkPi',
            'jar_file_uris': [
                'file:///usr/lib/spark/examples/jars/spark-examples.jar'],
            'args': ['1000']
        }
    }

    operation = job_client.submit_job_as_operation(
        request={"project_id": project_id, "region": region, "job": job}
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_string()
    )

    print(f"Job finished successfully: {output}")
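# Example invocation of submit_job() above; all three values are placeholders.
# On success the wrapper downloads and prints the SparkPi driver output.
submit_job('my-project', 'us-central1', 'my-cluster')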
# Tail of a job-polling helper (the enclosing function definition is not part
# of this snippet):
        if job.status.State.Name(job.status.state) == 'ERROR':
            raise Exception(job.status.details)
        elif job.status.State.Name(job.status.state) == 'DONE':
            print('Job finished.')
            return job


# ToDo
project = 'enter project id'
region = 'enter region of cluster'
cluster_name = 'enter cluster name'
bucket_name = 'enter bucket name'

job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
    address='{}-dataproc.googleapis.com:443'.format(region)))
dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

# GUI Code
import PySimpleGUI as sg

sg.change_look_and_feel('Light Blue 2')

layout = [[sg.Text('Select Offline to train and Online to Predict')],
          [sg.Text('Mode', size=(15, 1)),
           sg.Drop(values=('Offline', 'Online'), auto_size_text=True)],
          [sg.Text('Enter data path')],
          [sg.Text('File Path:', size=(8, 1)), sg.Input(), sg.FileBrowse()],
          [sg.Text('Enter data table name')],
          [sg.Text('TableName:', size=(8, 1)),
def test_spark_streaming_from_pubsublite(subscription: Subscription) -> None:
    from google.cloud.dataproc_v1.types import LoggingConfig

    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(
        client_options={
            "api_endpoint": "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
        }
    )

    # Create the job config.
    job = {
        "placement": {"cluster_name": CLUSTER_ID},
        "pyspark_job": {
            "main_python_file_uri": pyfile(
                "spark_streaming_from_pubsublite_example.py"
            ),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {"spark.master": "yarn"},
            "logging_config": {"driver_log_levels": {"root": LoggingConfig.Level.INFO}},
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--subscription_id={SUBSCRIPTION_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "read-" + UUID,
        }
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_text()
    )

    assert "Batch: 0\n" in output
    assert (
        "+--------------------+---------+------+----+------+"
        + "--------------------+--------------------+----------+\n"
        + "| subscription|partition|offset| key| data"
        + "| publish_timestamp| event_timestamp|attributes|\n"
        + "+--------------------+---------+------+----+------+"
        + "--------------------+--------------------+----------+\n"
        + "|projects/10126164...| 0| 0|[34]|353534"
        + "|2021-09-15 21:55:...|2021-09-15 00:04:...| []|\n"
        in output
    )