    def test_resolve_spark_submit_env_vars_standalone_cluster_mode(self):

        def env_vars_exception_in_standalone_cluster_mode():
            # Given
            hook = SparkSubmitHook(conn_id='spark_standalone_cluster',
                                   env_vars={"bar": "foo"})

            # When
            hook._build_spark_submit_command(self._spark_job_file)

        # Then
        self.assertRaises(AirflowException,
                          env_vars_exception_in_standalone_cluster_mode)
    def test_resolve_spark_submit_env_vars_standalone_client_mode(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_standalone_cluster_client_mode',
                               env_vars={"bar": "foo"})

        # When
        hook._build_spark_submit_command(self._spark_job_file)

        # Then
        self.assertEqual(hook._env, {"bar": "foo"})
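These tests resolve connection ids like spark_standalone_cluster and spark_k8s_cluster from Airflow's metadata database; the fixtures themselves are not part of this listing. A minimal sketch of a setUp that would register them, with hosts and extras back-filled from the values the tests assert (everything else is an assumption):

    def setUp(self):
        from airflow import models
        from airflow.utils import db

        # Hypothetical fixtures: hosts and extras are inferred from the
        # expectations asserted in the tests in this listing.
        db.merge_conn(models.Connection(
            conn_id='spark_standalone_cluster', conn_type='spark',
            host='spark://spark-standalone-master:6066',
            extra='{"spark-home": "/path/to/spark_home", "deploy-mode": "cluster"}'))
        db.merge_conn(models.Connection(
            conn_id='spark_k8s_cluster', conn_type='spark',
            host='k8s://https://k8s-master',
            extra='{"spark-home": "/opt/spark", "deploy-mode": "cluster", '
                  '"namespace": "mynamespace"}'))
        # The remaining conn_ids used below (spark_yarn_cluster,
        # spark_default_mesos, spark_binary_set, spark_home_set, ...)
        # would be registered the same way.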
    def test_resolve_connection_spark_k8s_cluster_ns_conf(self):
        # Given we specify the config option directly
        conf = {
            'spark.kubernetes.namespace': 'airflow',
        }
        hook = SparkSubmitHook(conn_id='spark_k8s_cluster', conf=conf)

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {
            "spark_home": "/opt/spark",
            "queue": None,
            "spark_binary": "spark-submit",
            "master": "k8s://https://k8s-master",
            "deploy_mode": "cluster",
            "namespace": "airflow"
        }
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "k8s://https://k8s-master")
        self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
        self.assertEqual(dict_cmd["--conf"],
                         "spark.kubernetes.namespace=airflow")
    def test_build_spark_submit_command(self):
        # Given
        hook = SparkSubmitHook(**self._config)

        # When
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        expected_build_cmd = [
            'spark-submit',
            '--master', 'yarn',
            '--conf', 'parquet.compression=SNAPPY',
            '--files', 'hive-site.xml',
            '--py-files', 'sample_library.py',
            '--jars', 'parquet.jar',
            '--packages', 'com.databricks:spark-avro_2.11:3.2.0',
            '--exclude-packages', 'org.bad.dependency:1.0.0',
            '--repositories', 'http://myrepo.org',
            '--num-executors', '10',
            '--total-executor-cores', '4',
            '--executor-cores', '4',
            '--executor-memory', '22g',
            '--driver-memory', '3g',
            '--keytab', 'privileged_user.keytab',
            '--principal', 'user/[email protected]',
            '--name', 'spark-job',
            '--class', 'com.foo.bar.AppMain',
            '--verbose',
            'test_application.py',
            '-f', 'foo',
            '--bar', 'bar',
            '--with-spaces', 'args should keep embdedded spaces',
            'baz'
        ]
        self.assertEqual(expected_build_cmd, cmd)
Example #7
    def _run_spark_submit(self, application, jars):
        assert_airflow_package_installed()
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
        from airflow.exceptions import AirflowException

        # task_env = get_cloud_config(Clouds.local)
        spark_local_config = SparkLocalEngineConfig()
        _config = self.config
        deploy = self.deploy

        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=spark_local_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(self.task.get_py_files()),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )

        log_buffer = StringIO()
        with log_buffer as lb:
            dbnd_log_handler = self._capture_submit_log(spark, lb)
            try:
                spark.submit(application=application)
            except AirflowException as ex:
                return_code = self._get_spark_return_code_from_exception(ex)
                if return_code != "0":
                    error_snippets = parse_spark_log_safe(
                        log_buffer.getvalue().split(os.linesep))
                    raise failed_to_run_spark_script(
                        self,
                        spark._build_spark_submit_command(
                            application=application),
                        application,
                        return_code,
                        error_snippets,
                    )
                else:
                    raise failed_spark_status(ex)
            finally:
                spark.log.handlers = [
                    h for h in spark.log.handlers if h is not dbnd_log_handler
                ]
    def test_resolve_spark_submit_env_vars_k8s(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_k8s_cluster',
                               env_vars={"bar": "foo"})

        # When
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        self.assertEqual(cmd[4], "spark.kubernetes.driverEnv.bar=foo")
    def test_resolve_spark_submit_env_vars_yarn(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_yarn_cluster',
                               env_vars={"bar": "foo"})

        # When
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        self.assertEqual(cmd[4], "spark.yarn.appMasterEnv.bar=foo")
        self.assertEqual(hook._env, {"bar": "foo"})
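Taken together, the env_vars tests pin down four behaviors: on Kubernetes the variables become spark.kubernetes.driverEnv.* conf entries; on YARN they become spark.yarn.appMasterEnv.* conf entries and are also kept in hook._env; in standalone client mode they are only stored in hook._env; and in standalone cluster mode building the command raises AirflowException. A hedged sketch of the branching this implies inside _build_spark_submit_command (attribute names such as _is_yarn and _is_kubernetes are assumptions; only the conf key prefixes are asserted by the tests):

        # Sketch only: forward env_vars according to the resolved master.
        if self._env_vars and (self._is_kubernetes or self._is_yarn):
            if self._is_yarn:
                tmpl = "spark.yarn.appMasterEnv.{}={}"
                self._env = self._env_vars  # also exported for the submit process
            else:
                tmpl = "spark.kubernetes.driverEnv.{}={}"
            for key in self._env_vars:
                connection_cmd += ["--conf",
                                   tmpl.format(key, str(self._env_vars[key]))]
        elif self._env_vars and self._connection['deploy_mode'] != "cluster":
            self._env = self._env_vars  # applied when Popen launches spark-submit
        elif self._env_vars and self._connection['deploy_mode'] == "cluster":
            raise AirflowException(
                "SparkSubmitHook env_vars is not supported in standalone-cluster mode.")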
    def test_resolve_connection_spark_standalone_cluster_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_standalone_cluster')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        expected_spark_connection = {"master": "spark://spark-standalone-master:6066",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": "cluster",
                                     "queue": None,
                                     "spark_home": "/path/to/spark_home"}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(cmd[0], '/path/to/spark_home/bin/spark-submit')
    def test_resolve_connection_spark_binary_set_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_binary_set')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        expected_spark_connection = {"master": "yarn",
                                     "spark_binary": "custom-spark-submit",
                                     "deploy_mode": None,
                                     "queue": None,
                                     "spark_home": None}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(cmd[0], 'custom-spark-submit')
    def test_resolve_connection_mesos_default_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_default_mesos')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {"master": "mesos://host:5050",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": None,
                                     "queue": None,
                                     "spark_home": None}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "mesos://host:5050")
    def test_resolve_connection_spark_home_set_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_home_set')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        expected_spark_connection = {"master": "yarn://yarn-master",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": None,
                                     "queue": None,
                                     "spark_home": "/opt/myspark",
                                     "namespace": 'default'}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(cmd[0], '/opt/myspark/bin/spark-submit')
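The spark_home assertions above (cmd[0] of '/path/to/spark_home/bin/spark-submit' and '/opt/myspark/bin/spark-submit') imply that the hook prefixes the binary with <spark_home>/bin whenever spark-home is set on the connection, and otherwise relies on the binary being on PATH. A sketch of that prefixing, assuming the resolved connection dict used in these tests:

        import os

        # Sketch: build argv[0] from the resolved connection.
        if connection['spark_home']:
            cmd_start = [os.path.join(connection['spark_home'], 'bin',
                                      connection['spark_binary'])]
        else:
            cmd_start = [connection['spark_binary']]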
Example #15
    def test_resolve_connection_spark_binary_default_value(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_default')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        expected_spark_connection = {"master": "yarn",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": None,
                                     "queue": 'root.default',
                                     "spark_home": None,
                                     "namespace": 'default'}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(cmd[0], 'spark-submit')
Example #16
    def test_resolve_connection_yarn_default(self):
        # Given
        hook = SparkSubmitHook(conn_id='')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {"master": "yarn",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": None,
                                     "queue": None,
                                     "spark_home": None,
                                     "namespace": 'default'}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "yarn")
    def test_resolve_connection_spark_yarn_cluster_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_yarn_cluster')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {"master": "yarn://yarn-master",
                                     "spark_binary": "spark-submit",
                                     "deploy_mode": "cluster",
                                     "queue": "root.etl",
                                     "spark_home": None}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "yarn://yarn-master")
        self.assertEqual(dict_cmd["--queue"], "root.etl")
        self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
Example #19
    def test_resolve_connection_spark_k8s_cluster_connection(self):
        # Given
        hook = SparkSubmitHook(conn_id='spark_k8s_cluster')

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_spark_submit_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {"spark_home": "/opt/spark",
                                     "queue": None,
                                     "spark_binary": "spark-submit",
                                     "master": "k8s://https://k8s-master",
                                     "deploy_mode": "cluster",
                                     "namespace": "mynamespace"}
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "k8s://https://k8s-master")
        self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
Example #21
    def _run_spark_submit(self, file, jars):
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

        _config = self.config
        deploy = self.deploy
        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=self.emr_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(_config.py_files),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )

        step_id = self.emr_cluster.run_spark_submit_step(
            name=self.job.job_id,
            spark_submit_command=spark._build_spark_submit_command(
                application=deploy.sync(file)
            ),
        )
        self.task_run.set_external_resource_urls(
            self.emr_cluster.get_emr_logs_dict(self.spark_application_logs)
        )
        self.emr_cluster.wait_for_step_completion(
            step_id, status_reporter=self._report_step_status
        )
Example #22
    def _run_spark_submit(self, application, jars):
        # task_env = get_cloud_config(Clouds.local)
        spark_local_config = SparkLocalEngineConfig()
        _config = self.config
        deploy = self.deploy

        AIRFLOW_ON = is_airflow_enabled()

        if AIRFLOW_ON:
            from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
            from airflow.exceptions import AirflowException as SparkException
        else:
            from dbnd_spark._vendor.airflow.spark_hook import (
                SparkException,
                SparkSubmitHook,
            )

        spark = SparkSubmitHook(
            conf=_config.conf,
            conn_id=spark_local_config.conn_id,
            name=self.job.job_id,
            application_args=list_of_strings(self.task.application_args()),
            java_class=self.task.main_class,
            files=deploy.arg_files(_config.files),
            py_files=deploy.arg_files(self.task.get_py_files()),
            driver_class_path=_config.driver_class_path,
            jars=deploy.arg_files(jars),
            packages=_config.packages,
            exclude_packages=_config.exclude_packages,
            repositories=_config.repositories,
            total_executor_cores=_config.total_executor_cores,
            executor_cores=_config.executor_cores,
            executor_memory=_config.executor_memory,
            driver_memory=_config.driver_memory,
            keytab=_config.keytab,
            principal=_config.principal,
            num_executors=_config.num_executors,
            env_vars=self._get_env_vars(),
            verbose=_config.verbose,
        )
        if not AIRFLOW_ON:
            # If there's no Airflow then there's no Connection so we
            # take conn information from spark config
            spark.set_connection(spark_local_config.conn_uri)

        log_buffer = StringIO()
        with log_buffer as lb:
            dbnd_log_handler = self._capture_submit_log(spark, lb)
            try:
                # We use str because we can accept Target objects (in case of JAR files)
                # or str objects (path to pyspark script)
                spark.submit(application=str(application))
            except SparkException as ex:
                return_code = self._get_spark_return_code_from_exception(ex)
                if return_code != "0":
                    error_snippets = parse_spark_log_safe(
                        log_buffer.getvalue().split(os.linesep))
                    raise failed_to_run_spark_script(
                        self,
                        spark._build_spark_submit_command(
                            application=application),
                        application,
                        return_code,
                        error_snippets,
                    )
                else:
                    raise failed_spark_status(ex)
            finally:
                spark.log.handlers = [
                    h for h in spark.log.handlers if h is not dbnd_log_handler
                ]