# Nested helper (defined inside the enclosing test method, which is why it has no
# ``self`` parameter of its own): building the command with env_vars in standalone
# cluster mode is expected to fail.
def env_vars_exception_in_standalone_cluster_mode():
    # Given
    hook = SparkSubmitHook(conn_id='spark_standalone_cluster',
                           env_vars={"bar": "foo"})
    # When
    hook._build_spark_submit_command(self._spark_job_file)
def test_resolve_spark_submit_env_vars_standalone_client_mode(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_standalone_cluster_client_mode',
                           env_vars={"bar": "foo"})
    # When
    hook._build_spark_submit_command(self._spark_job_file)
    # Then
    self.assertEqual(hook._env, {"bar": "foo"})
def test_resolve_connection_spark_k8s_cluster_ns_conf(self):
    # Given we specify the config option directly
    conf = {
        'spark.kubernetes.namespace': 'airflow',
    }
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster', conf=conf)
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "airflow"
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(dict_cmd["--master"], "k8s://https://k8s-master")
    self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
    self.assertEqual(dict_cmd["--conf"], "spark.kubernetes.namespace=airflow")
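# Several tests in this class call ``self.cmd_args_to_dict(cmd)``, a helper that is
# not shown in this excerpt. A minimal sketch of what such a helper could look like,
# assuming every "--flag" in the spark-submit command is immediately followed by its
# value (the name and exact behaviour are assumptions, not the verified original):
@staticmethod
def cmd_args_to_dict(list_cmd):
    """Turn ['--master', 'yarn', '--queue', 'root.etl', ...] into a flag -> value dict."""
    return_dict = {}
    for pos, arg in enumerate(list_cmd):
        if arg.startswith("--") and pos + 1 < len(list_cmd):
            return_dict[arg] = list_cmd[pos + 1]
    return return_dict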
def test_build_spark_submit_command(self):
    # Given
    hook = SparkSubmitHook(**self._config)
    # When
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    expected_build_cmd = [
        'spark-submit',
        '--master', 'yarn',
        '--conf', 'parquet.compression=SNAPPY',
        '--files', 'hive-site.xml',
        '--py-files', 'sample_library.py',
        '--jars', 'parquet.jar',
        '--packages', 'com.databricks:spark-avro_2.11:3.2.0',
        '--exclude-packages', 'org.bad.dependency:1.0.0',
        '--repositories', 'http://myrepo.org',
        '--num-executors', '10',
        '--total-executor-cores', '4',
        '--executor-cores', '4',
        '--executor-memory', '22g',
        '--driver-memory', '3g',
        '--keytab', 'privileged_user.keytab',
        '--principal', 'user/[email protected]',
        '--name', 'spark-job',
        '--class', 'com.foo.bar.AppMain',
        '--verbose',
        'test_application.py',
        '-f', 'foo',
        '--bar', 'bar',
        '--with-spaces', 'args should keep embdedded spaces',
        'baz',
    ]
    self.assertEqual(expected_build_cmd, cmd)
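# ``self._config`` is defined elsewhere in the test class (typically in setUp()) and is
# not part of this excerpt. The dictionary below is only a sketch reconstructed from
# the expected command above; the parameter names mirror the SparkSubmitHook keyword
# arguments used later in this file, and ``'conn_id': 'spark_default'`` is inferred
# from the default-connection test, not confirmed by the original fixture:
_config = {
    'conf': {'parquet.compression': 'SNAPPY'},
    'conn_id': 'spark_default',  # assumed: resolves to master 'yarn'
    'files': 'hive-site.xml',
    'py_files': 'sample_library.py',
    'jars': 'parquet.jar',
    'packages': 'com.databricks:spark-avro_2.11:3.2.0',
    'exclude_packages': 'org.bad.dependency:1.0.0',
    'repositories': 'http://myrepo.org',
    'num_executors': 10,
    'total_executor_cores': 4,
    'executor_cores': 4,
    'executor_memory': '22g',
    'driver_memory': '3g',
    'keytab': 'privileged_user.keytab',
    'principal': 'user/[email protected]',
    'name': 'spark-job',
    'java_class': 'com.foo.bar.AppMain',
    'verbose': True,
    'application_args': [
        '-f', 'foo',
        '--bar', 'bar',
        '--with-spaces', 'args should keep embdedded spaces',
        'baz',
    ],
}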
def _run_spark_submit(self, application, jars):
    assert_airflow_package_installed()
    from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
    from airflow.exceptions import AirflowException

    # task_env = get_cloud_config(Clouds.local)
    spark_local_config = SparkLocalEngineConfig()
    _config = self.config
    deploy = self.deploy
    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=spark_local_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(self.task.get_py_files()),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )

    log_buffer = StringIO()
    with log_buffer as lb:
        dbnd_log_handler = self._capture_submit_log(spark, lb)
        try:
            spark.submit(application=application)
        except AirflowException as ex:
            return_code = self._get_spark_return_code_from_exception(ex)
            if return_code != "0":
                error_snippets = parse_spark_log_safe(
                    log_buffer.getvalue().split(os.linesep)
                )
                raise failed_to_run_spark_script(
                    self,
                    spark._build_spark_submit_command(application=application),
                    application,
                    return_code,
                    error_snippets,
                )
            else:
                raise failed_spark_status(ex)
        finally:
            spark.log.handlers = [
                h for h in spark.log.handlers if not dbnd_log_handler
            ]
def test_resolve_spark_submit_env_vars_k8s(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster',
                           env_vars={"bar": "foo"})
    # When
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then (cmd[4] is the value of the first --conf flag:
    # ['spark-submit', '--master', <master>, '--conf', <env conf>, ...])
    self.assertEqual(cmd[4], "spark.kubernetes.driverEnv.bar=foo")
def test_resolve_spark_submit_env_vars_yarn(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster',
                           env_vars={"bar": "foo"})
    # When
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    self.assertEqual(cmd[4], "spark.yarn.appMasterEnv.bar=foo")
    self.assertEqual(hook._env, {"bar": "foo"})
def test_resolve_connection_spark_standalone_cluster_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_standalone_cluster')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    expected_spark_connection = {
        "master": "spark://spark-standalone-master:6066",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": None,
        "spark_home": "/path/to/spark_home"
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(cmd[0], '/path/to/spark_home/bin/spark-submit')
def test_resolve_connection_spark_binary_set_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_binary_set')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    expected_spark_connection = {
        "master": "yarn",
        "spark_binary": "custom-spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(cmd[0], 'custom-spark-submit')
def test_resolve_connection_mesos_default_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_default_mesos')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "master": "mesos://host:5050",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(dict_cmd["--master"], "mesos://host:5050")
def test_resolve_connection_spark_home_set_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_home_set')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    expected_spark_connection = {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": "/opt/myspark",
        "namespace": 'default'
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(cmd[0], '/opt/myspark/bin/spark-submit')
def test_resolve_connection_spark_binary_default_value(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_default')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    expected_spark_connection = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": 'root.default',
        "spark_home": None,
        "namespace": 'default'
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(cmd[0], 'spark-submit')
def test_resolve_connection_yarn_default(self):
    # Given
    hook = SparkSubmitHook(conn_id='')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": 'default'
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(dict_cmd["--master"], "yarn")
def test_resolve_connection_spark_yarn_cluster_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": "root.etl",
        "spark_home": None
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(dict_cmd["--master"], "yarn://yarn-master")
    self.assertEqual(dict_cmd["--queue"], "root.etl")
    self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
def test_resolve_connection_spark_k8s_cluster_connection(self):
    # Given
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster')
    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)
    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "mynamespace"
    }
    self.assertEqual(connection, expected_spark_connection)
    self.assertEqual(dict_cmd["--master"], "k8s://https://k8s-master")
    self.assertEqual(dict_cmd["--deploy-mode"], "cluster")
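# The connection ids referenced in these tests (spark_yarn_cluster, spark_k8s_cluster,
# spark_standalone_cluster, spark_home_set, ...) have to exist in the Airflow metadata
# DB before the tests run; that setup is not part of this excerpt. A minimal sketch of
# how a setUp() could register two of them, assuming airflow.models.Connection and
# airflow.utils.db.merge_conn are used; host and extra values are inferred from the
# expected connection dictionaries above, not taken from the original fixture:
from airflow.models import Connection
from airflow.utils import db

db.merge_conn(
    Connection(
        conn_id='spark_yarn_cluster',
        conn_type='spark',
        host='yarn://yarn-master',
        extra='{"queue": "root.etl", "deploy-mode": "cluster"}',
    )
)
db.merge_conn(
    Connection(
        conn_id='spark_k8s_cluster',
        conn_type='spark',
        host='k8s://https://k8s-master',
        extra='{"spark-home": "/opt/spark", "deploy-mode": "cluster", '
              '"namespace": "mynamespace"}',
    )
)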
def _run_spark_submit(self, file, jars):
    from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

    _config = self.config
    deploy = self.deploy
    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=self.emr_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(_config.py_files),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )

    step_id = self.emr_cluster.run_spark_submit_step(
        name=self.job.job_id,
        spark_submit_command=spark._build_spark_submit_command(
            application=deploy.sync(file)
        ),
    )
    self.task_run.set_external_resource_urls(
        self.emr_cluster.get_emr_logs_dict(self.spark_application_logs)
    )
    self.emr_cluster.wait_for_step_completion(
        step_id, status_reporter=self._report_step_status
    )
def _run_spark_submit(self, application, jars):
    # task_env = get_cloud_config(Clouds.local)
    spark_local_config = SparkLocalEngineConfig()
    _config = self.config
    deploy = self.deploy

    AIRFLOW_ON = is_airflow_enabled()
    if AIRFLOW_ON:
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
        from airflow.exceptions import AirflowException as SparkException
    else:
        from dbnd_spark._vendor.airflow.spark_hook import (
            SparkException,
            SparkSubmitHook,
        )

    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=spark_local_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(self.task.get_py_files()),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )
    if not AIRFLOW_ON:
        # If there's no Airflow then there's no Connection, so we
        # take conn information from the spark config
        spark.set_connection(spark_local_config.conn_uri)

    log_buffer = StringIO()
    with log_buffer as lb:
        dbnd_log_handler = self._capture_submit_log(spark, lb)
        try:
            # We use str because we can accept Target objects (in case of JAR files)
            # or str objects (path to pyspark script)
            spark.submit(application=str(application))
        except SparkException as ex:
            return_code = self._get_spark_return_code_from_exception(ex)
            if return_code != "0":
                error_snippets = parse_spark_log_safe(
                    log_buffer.getvalue().split(os.linesep)
                )
                raise failed_to_run_spark_script(
                    self,
                    spark._build_spark_submit_command(application=application),
                    application,
                    return_code,
                    error_snippets,
                )
            else:
                raise failed_spark_status(ex)
        finally:
            spark.log.handlers = [
                h for h in spark.log.handlers if not dbnd_log_handler
            ]