Example #1
    def _run_spark_submit(self, application, jars):
        assert_airflow_package_installed()
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
        from airflow.exceptions import AirflowException

        # task_env = get_cloud_config(Clouds.local)
        spark_local_config = SparkLocalEngineConfig()
        _config = self.config
        deploy = self.deploy

        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=spark_local_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(self.task.get_py_files()),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )

        log_buffer = StringIO()
        with log_buffer as lb:
            dbnd_log_handler = self._capture_submit_log(spark, lb)
            try:
                spark.submit(application=application)
            except AirflowException as ex:
                return_code = self._get_spark_return_code_from_exception(ex)
                if return_code != "0":
                    error_snippets = parse_spark_log_safe(
                        log_buffer.getvalue().split(os.linesep))
                    raise failed_to_run_spark_script(
                        self,
                        spark._build_spark_submit_command(
                            application=application),
                        application,
                        return_code,
                        error_snippets,
                    )
                else:
                    raise failed_spark_status(ex)
            finally:
                spark.log.handlers = [
                    h for h in spark.log.handlers if h is not dbnd_log_handler
                ]
Example #2
    def _run_spark_submit(self, file, jars):
        """
        Request Body	Description	Type
            file	File containing the application to run (required)	path
            proxyUser	User ID to impersonate when running the job	string
            className	Application Java or Spark main class	string
            args	Command line arguments for the application	list of strings
            jars	Jar files to be used in this session	list of strings
            pyFiles	Python files to be used in this session	list of strings
            files	Other files to be used in this session	list of strings
            driverMemory	Amount of memory to use for the driver process	string
            driverCores	Number of cores to use for the driver process	int
            executorMemory	Amount of memory to use for each executor process	string
            executorCores	Number of cores to use for each executor	int
            numExecutors	Number of executors to launch for this session	int
            archives	Archives to be used in this session	list of strings
            queue	The name of the YARN queue to which the job should be submitted	string
            name	Name of this session	string
            conf	Spark configuration properties	Map of key=val
        :param task:
        :return:
        """
        task = self.task  # type: SparkTask
        _config = task.spark_config

        deploy = self.deploy
        data = dict(
            conf=_config.conf,
            file=deploy.sync(file),
            className=task.main_class,
            name=self.job.job_id,
            args=list_of_strings(task.application_args()),
            files=deploy.sync_files(_config.files),
            pyFiles=deploy.sync_files(self.task.get_py_files()),
            jars=deploy.sync_files(jars),
            executorCores=_config.executor_cores,
            executorMemory=_config.executor_memory,
            driverMemory=_config.driver_memory,
            driverCores=_config.executor_cores,  # note: driver cores taken from executor_cores
            proxyUser=_config.proxy_user,
            queue=_config.queue,
            archives=_config.archives,
            numExecutors=_config.num_executors,
        )
        data = {k: v for k, v in six.iteritems(data) if v is not None}
        livy_endpoint = self.get_livy_endpoint()
        logger.info("Connecting to: %s", livy_endpoint)
        livy_config = self.task_run.task.spark_engine
        livy = LivyBatchClient.from_endpoint(
            livy_endpoint,
            status_code_retries=livy_config.retry_on_status_error,
            ignore_ssl_errors=self.get_livy_ignore_ssl_errors(),
        )
        batch = livy.post_batch(data)
        self._run_hook(batch, self.livy_config.job_submitted_hook)
        livy.track_batch_progress(
            batch["id"], status_reporter=self._report_livy_batch_status)
Example #3
    def run_pyspark(self, pyspark_script):
        # should be reimplemented using SparkSubmitHook (maybe from airflow)
        # note that config jars are not supported.
        if not self.databricks_config.cluster_id:
            spark_submit_parameters = [self.sync(pyspark_script)] + (
                list_of_strings(self.task.application_args()))
            databricks_json = self._create_spark_submit_json(
                spark_submit_parameters)
        else:
            pyspark_script = self.sync(pyspark_script)
            parameters = [
                self._dbfs_scheme_to_local(e)
                for e in list_of_strings(self.task.application_args())
            ]
            databricks_json = self._create_pyspark_submit_json(
                python_file=pyspark_script, parameters=parameters)

        return self._run_spark_submit(databricks_json)
Example #4
    def _task_banner(self, banner, verbosity):
        b = banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.application_args()))
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s" % self)
Example #5
    def _add_spark_info(self):
        b = self.banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.task.application_args())
            )
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s" % self.task)
Example #6
    def _get_job_builder(self, job_type):
        job_builder = self.cluster_hook.create_job_template(
            self.task.task_id,
            self.dataproc.cluster,
            job_type=job_type,
            properties=self.config.conf,
        )
        # set_job_name gives the job a "unique" name
        job_builder.set_job_name(self.job.job_name)
        job_builder.add_args(list_of_strings(self.task.application_args()))
        job_builder.add_file_uris(self.deploy.sync_files(self.config.files))
        return job_builder
Example #7
    def run_spark(self, main_class):
        jars_list = []
        jars = self.config.jars
        if jars:
            jars_list = ["--jars"] + jars
        # should be reimplemented using SparkSubmitHook (maybe from airflow)
        spark_submit_parameters = [
            "--class",
            main_class,
            self.sync(self.config.main_jar),
        ] + (list_of_strings(self.task.application_args()) + jars_list)
        databricks_json = self._create_spark_submit_json(spark_submit_parameters)
        return self._run_spark_submit(databricks_json)
Example #8
    def run_spark(self, main_class):
        spark_cmd_line = CmdLineBuilder()
        spark_cmd_line.add("/usr/lib/spark/bin/spark-submit", "--class", main_class)
        spark_cmd_line.extend(self.config_to_command_line())

        # application jar
        spark_cmd_line.add(self.deploy.sync(self.config.main_jar))
        # add user side args
        spark_cmd_line.extend(list_of_strings(self.task.application_args()))

        cmd = SparkCommand.create(
            cmdline=spark_cmd_line.get_cmd_line(safe_curly_brackets=True),
            language="command_line",
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)
Example #9
    def _run_spark_submit(self, file, jars):
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

        _config = self.config
        deploy = self.deploy
        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=self.emr_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(_config.py_files),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )

        step_id = self.emr_cluster.run_spark_submit_step(
            name=self.job.job_id,
            spark_submit_command=spark._build_spark_submit_command(
                application=deploy.sync(file)
            ),
        )
        self.task_run.set_external_resource_urls(
            self.emr_cluster.get_emr_logs_dict(self.spark_application_logs)
        )
        self.emr_cluster.wait_for_step_completion(
            step_id, status_reporter=self._report_step_status
        )
Example #10
    def run_pyspark(self, pyspark_script):
        # should be reimplemented using SparkSubmitHook (maybe from airflow)
        # note that config jars are not supported.

        arguments = list2cmdline_safe(list_of_strings(
            self.task.application_args()),
                                      safe_curly_brackets=True)

        cmd = SparkCommand.create(
            script_location=self.deploy.sync(pyspark_script),
            language="python",
            user_program_arguments=arguments,
            arguments=list2cmdline_safe(self.config_to_command_line(),
                                        safe_curly_brackets=True),
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)

        return True
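
The list2cmdline_safe helper used above is not shown on this page. A plausible sketch, assuming it wraps subprocess.list2cmdline and neutralizes curly braces when safe_curly_brackets=True so the rendered command survives later template expansion (this is an assumption about the helper, not the dbnd implementation):

# Hypothetical helper: quote a list of arguments into one command-line string
# and optionally double curly braces so "{...}" stays literal if the string is
# later pushed through str.format / Jinja-style templating.
import subprocess


def list2cmdline_safe(args, safe_curly_brackets=False):
    cmdline = subprocess.list2cmdline(args)
    if safe_curly_brackets:
        cmdline = cmdline.replace("{", "{{").replace("}", "}}")
    return cmdline


print(list2cmdline_safe(["--date", "{{ ds }}"], safe_curly_brackets=True))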
Example #11
    def _run_spark_submit(self, application, jars):
        # task_env = get_cloud_config(Clouds.local)
        spark_local_config = SparkLocalEngineConfig()
        _config = self.config
        deploy = self.deploy

        AIRFLOW_ON = is_airflow_enabled()

        if AIRFLOW_ON:
            from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
            from airflow.exceptions import AirflowException as SparkException
        else:
            from dbnd_spark._vendor.airflow.spark_hook import (
                SparkException,
                SparkSubmitHook,
            )

        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=spark_local_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(self.task.get_py_files()),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )
        if not AIRFLOW_ON:
            # If there's no Airflow then there's no Connection so we
            # take conn information from spark config
            spark.set_connection(spark_local_config.conn_uri)

        log_buffer = StringIO()
        with log_buffer as lb:
            dbnd_log_handler = self._capture_submit_log(spark, lb)
            try:
                # We use str because we can accept Target objects (in case of JAR files)
                # or str objects (path to pyspark script)
                spark.submit(application=str(application))
            except SparkException as ex:
                return_code = self._get_spark_return_code_from_exception(ex)
                if return_code != "0":
                    error_snippets = parse_spark_log_safe(
                        log_buffer.getvalue().split(os.linesep))
                    raise failed_to_run_spark_script(
                        self,
                        spark._build_spark_submit_command(
                            application=application),
                        application,
                        return_code,
                        error_snippets,
                    )
                else:
                    raise failed_spark_status(ex)
            finally:
                spark.log.handlers = [
                    h for h in spark.log.handlers if h is not dbnd_log_handler
                ]
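
Every example above routes user arguments through list_of_strings before they reach spark-submit, Livy, Databricks, Dataproc, or Qubole. The helper itself is not listed here; a minimal sketch of what it plausibly does (an assumption, not the dbnd source):

# Hypothetical sketch: accept None, a single value, or an iterable and return
# a list in which every element has been coerced to str, so the submit payload
# always carries plain strings.
def list_of_strings(value):
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    try:
        return [str(item) for item in value]
    except TypeError:
        # not iterable: wrap the single value
        return [str(value)]


assert list_of_strings(None) == []
assert list_of_strings(["--input", 42]) == ["--input", "42"]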