def dataproc_submit(
        self, job_params: SparkJobParameters
) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
    local_job_id = str(uuid.uuid4())
    main_file_uri = self._stage_file(job_params.get_main_file_path(),
                                     local_job_id)
    job_config: Dict[str, Any] = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
        "labels": {
            self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()
        },
    }

    # Add job hash to labels only for the stream ingestion job
    if isinstance(job_params, StreamIngestionJobParameters):
        job_config["labels"][
            self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

    if job_params.get_class_name():
        job_config.update({
            "spark_job": {
                "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                "main_class": job_params.get_class_name(),
                "args": job_params.get_arguments(),
                "properties": {"spark.yarn.user.classpath.first": "true"},
            }
        })
    else:
        job_config.update({
            "pyspark_job": {
                "main_python_file_uri": main_file_uri,
                "jar_file_uris": self.EXTERNAL_JARS,
                "args": job_params.get_arguments(),
            }
        })

    job = self.job_client.submit_job(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        })
    refresh_fn = partial(
        self.job_client.get_job,
        project_id=self.project_id,
        region=self.region,
        job_id=job.reference.job_id,
    )
    cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)
    return job, refresh_fn, cancel_fn
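# Hypothetical usage sketch (not part of the original listing): the tuple
# returned by dataproc_submit can be polled via refresh_fn until the Dataproc
# job reaches a terminal state, or aborted via cancel_fn. `launcher`,
# `job_params`, and the 10-second interval are assumptions; JobStatus comes
# from the google-cloud-dataproc v2 client.
import time

from google.cloud.dataproc_v1 import JobStatus

TERMINAL_STATES = {
    JobStatus.State.DONE,
    JobStatus.State.ERROR,
    JobStatus.State.CANCELLED,
}

job, refresh_fn, cancel_fn = launcher.dataproc_submit(job_params)
while refresh_fn().status.state not in TERMINAL_STATES:
    time.sleep(10)  # re-fetch the job from the Jobs API and check again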
def dataproc_submit(self, job_params: SparkJobParameters) -> Operation:
    local_job_id = str(uuid.uuid4())
    main_file_uri = self._stage_files(job_params.get_main_file_path(),
                                      local_job_id)
    job_config: Dict[str, Any] = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
    }

    if job_params.get_class_name():
        job_config.update({
            "spark_job": {
                "jar_file_uris": [main_file_uri],
                "main_class": job_params.get_class_name(),
                "args": job_params.get_arguments(),
            }
        })
    else:
        job_config.update({
            "pyspark_job": {
                "main_python_file_uri": main_file_uri,
                "args": job_params.get_arguments(),
            }
        })

    return self.job_client.submit_job_as_operation(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        })
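# Hypothetical usage sketch (not part of the original listing):
# submit_job_as_operation returns a google.api_core.operation.Operation, so a
# caller can block until the Dataproc job finishes and read the final Job
# proto. `launcher` and `job_params` are assumed to exist.
operation = launcher.dataproc_submit(job_params)
completed_job = operation.result()  # blocks until the job ends; raises on failure
print(completed_job.reference.job_id, completed_job.status.state)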
def spark_submit(self,
                 job_params: SparkJobParameters,
                 ui_port: Optional[int] = None) -> subprocess.Popen:
    submission_cmd = [
        self.spark_submit_script_path,
        "--master",
        self.master_url,
        "--name",
        job_params.get_name(),
    ]

    if job_params.get_class_name():
        submission_cmd.extend(["--class", job_params.get_class_name()])

    if ui_port:
        submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

    # Workaround for https://github.com/apache/spark/pull/26552
    # Fix running spark job with bigquery connector (w/ shadowing) on JDK 9+
    submission_cmd.extend([
        "--conf",
        "spark.executor.extraJavaOptions="
        "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
        "--conf",
        "spark.driver.extraJavaOptions="
        "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
        "--conf",
        "spark.sql.session.timeZone=UTC",  # ignore local timezone
        "--packages",
        f"com.google.cloud.spark:spark-bigquery-with-dependencies_{self.BQ_CONNECTOR_VERSION}",
        "--jars",
        "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar,"
        "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar,"
        "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar",
        "--conf",
        "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
        "--conf",
        "spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    ])

    if self.additional_options is not None:
        for option, value in self.additional_options.items():
            submission_cmd.extend(["--conf", f'"{option}"="{value}"'])

    submission_cmd.append(job_params.get_main_file_path())
    submission_cmd.extend(job_params.get_arguments())

    return subprocess.Popen(submission_cmd)
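# Hypothetical usage sketch (not part of the original listing): spark_submit
# only starts the spark-submit subprocess, so the caller decides whether to
# wait for completion and how to treat a non-zero exit code. `launcher` and
# `job_params` are assumed to exist.
process = launcher.spark_submit(job_params, ui_port=4040)
exit_code = process.wait()
if exit_code != 0:
    raise RuntimeError(f"spark-submit exited with code {exit_code}")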
def spark_submit(self, job_params: SparkJobParameters) -> subprocess.Popen:
    submission_cmd = [
        self.spark_submit_script_path,
        "--master",
        self.master_url,
        "--name",
        job_params.get_name(),
    ]

    if job_params.get_class_name():
        submission_cmd.extend(["--class", job_params.get_class_name()])

    submission_cmd.append(job_params.get_main_file_path())
    submission_cmd.extend(job_params.get_arguments())

    return subprocess.Popen(submission_cmd)
def dataproc_submit(self, job_params: SparkJobParameters) -> Operation:
    local_job_id = str(uuid.uuid4())
    pyspark_gcs = self._stage_files(job_params.get_main_file_path(),
                                    local_job_id)
    job_config = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
        "pyspark_job": {
            "main_python_file_uri": pyspark_gcs,
            "args": job_params.get_arguments(),
        },
    }

    return self.job_client.submit_job_as_operation(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        })
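# A minimal sketch of what the _stage_files helper used above might do (its
# real implementation is not shown in this listing), assuming a
# google.cloud.storage client plus hypothetical self.staging_bucket /
# self.staging_prefix attributes on the launcher.
import os
from urllib.parse import urlparse

from google.cloud import storage

def _stage_files(self, file_path: str, job_id: str) -> str:
    if urlparse(file_path).scheme == "gs":
        return file_path  # already on GCS, nothing to upload
    blob_name = f"{self.staging_prefix}/{job_id}/{os.path.basename(file_path)}"
    bucket = storage.Client().bucket(self.staging_bucket)
    bucket.blob(blob_name).upload_from_filename(file_path)
    return f"gs://{self.staging_bucket}/{blob_name}"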
def spark_submit(self,
                 job_params: SparkJobParameters,
                 ui_port: Optional[int] = None) -> subprocess.Popen:
    submission_cmd = [
        self.spark_submit_script_path,
        "--master",
        self.master_url,
        "--name",
        job_params.get_name(),
    ]

    if job_params.get_class_name():
        submission_cmd.extend(["--class", job_params.get_class_name()])

    if ui_port:
        submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

    if job_params.get_extra_options():
        submission_cmd.extend(job_params.get_extra_options().split(" "))

    submission_cmd.append(job_params.get_main_file_path())
    submission_cmd.extend(job_params.get_arguments())

    return subprocess.Popen(submission_cmd)
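# Hypothetical illustration (not part of the original listing):
# get_extra_options() is assumed to hold a single space-separated string of
# extra spark-submit flags, which the naive split(" ") above turns into argv
# entries; option values that themselves contain spaces would be split
# incorrectly.
extra = "--conf spark.driver.memory=2g --conf spark.sql.shuffle.partitions=100"
assert extra.split(" ") == [
    "--conf", "spark.driver.memory=2g",
    "--conf", "spark.sql.shuffle.partitions=100",
]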
def dataproc_submit(
        self, job_params: SparkJobParameters, extra_properties: Dict[str, str]
) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
    local_job_id = str(uuid.uuid4())
    main_file_uri = self._stage_file(job_params.get_main_file_path(),
                                     local_job_id)
    job_config: Dict[str, Any] = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
        "labels": {
            self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()
        },
    }
    maven_package_properties = {
        "spark.jars.packages": ",".join(job_params.get_extra_packages())
    }
    common_properties = {
        "spark.executor.instances": self.executor_instances,
        "spark.executor.cores": self.executor_cores,
        "spark.executor.memory": self.executor_memory,
    }

    if isinstance(job_params, StreamIngestionJobParameters):
        job_config["labels"][self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
            job_params.get_feature_table_name())
        # Add job hash to labels only for the stream ingestion job
        job_config["labels"][
            self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

    if isinstance(job_params, BatchIngestionJobParameters):
        job_config["labels"][self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
            job_params.get_feature_table_name())

    if job_params.get_class_name():
        scala_job_properties = {
            "spark.yarn.user.classpath.first": "true",
            "spark.executor.instances": self.executor_instances,
            "spark.executor.cores": self.executor_cores,
            "spark.executor.memory": self.executor_memory,
            "spark.pyspark.driver.python": "python3.7",
            "spark.pyspark.python": "python3.7",
        }
        job_config.update({
            "spark_job": {
                "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                "main_class": job_params.get_class_name(),
                "args": job_params.get_arguments(),
                "properties": {
                    **scala_job_properties,
                    **common_properties,
                    **maven_package_properties,
                    **extra_properties,
                },
            }
        })
    else:
        job_config.update({
            "pyspark_job": {
                "main_python_file_uri": main_file_uri,
                "jar_file_uris": self.EXTERNAL_JARS,
                "args": job_params.get_arguments(),
                "properties": {
                    **common_properties,
                    **maven_package_properties,
                    **extra_properties,
                },
            }
        })

    job = self.job_client.submit_job(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        })
    refresh_fn = partial(
        self.job_client.get_job,
        project_id=self.project_id,
        region=self.region,
        job_id=job.reference.job_id,
    )
    cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)
    return job, refresh_fn, cancel_fn
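# A minimal sketch of the dataproc_cancel helper bound into cancel_fn above
# (its real implementation is not shown in this listing), assuming the
# google-cloud-dataproc v2 JobControllerClient stored on self.job_client.
def dataproc_cancel(self, job_id: str) -> None:
    self.job_client.cancel_job(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job_id": job_id,
        })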