def test_resolve_connection(self):
    """Resolve several connection ids and check the resulting master/queue/deploy-mode.

    NOTE(review): a method with this same name is defined again later in this
    file (the 4-tuple variant), so this earlier definition is shadowed inside
    the class body — confirm which version is intended to survive.
    """
    # An unknown/empty conn_id falls back to the plain yarn master.
    hook = SparkSubmitHook(conn_id='')
    self.assertEqual(hook._resolve_connection(), ('yarn', None, None))
    command = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master yarn" in command

    # The stock 'spark_default' connection also targets yarn, plus a queue.
    hook = SparkSubmitHook(conn_id='spark_default')
    self.assertEqual(hook._resolve_connection(), ('yarn', 'root.default', None))
    command = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master yarn" in command
    assert "--queue root.default" in command

    # A mesos master is taken from the connection host/port.
    hook = SparkSubmitHook(conn_id='spark_default_mesos')
    self.assertEqual(hook._resolve_connection(), ('mesos://host:5050', None, None))
    command = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master mesos://host:5050" in command

    # Queue and deploy mode are picked up from the connection extras.
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    self.assertEqual(hook._resolve_connection(),
                     ('yarn://yarn-master', 'root.etl', 'cluster'))
    command = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master yarn://yarn-master" in command
    assert "--queue root.etl" in command
    assert "--deploy-mode cluster" in command
def test_resolve_connection_spark_k8s_cluster_ns_conf(self):
    """An explicit spark.kubernetes.namespace conf entry must win and appear on the CLI."""
    # Given: the namespace is supplied directly via the conf dict.
    conf = {
        'spark.kubernetes.namespace': 'airflow',
    }
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster', conf=conf)

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then: the conf-supplied namespace overrides whatever the connection carries.
    self.assertEqual(resolved, {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "airflow",
    })
    self.assertEqual(args["--master"], "k8s://https://k8s-master")
    self.assertEqual(args["--deploy-mode"], "cluster")
    self.assertEqual(args["--conf"], "spark.kubernetes.namespace=airflow")
def test_resolve_connection_spark_home_not_set_connection(self):
    """Without spark_home in the connection, the bare 'spark-submit' binary is used."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_home_not_set')

    # When
    resolved = hook._resolve_connection()
    command = hook._build_command(self._spark_job_file)

    # Then: no spark_home -> executable is unqualified.
    self.assertSequenceEqual(resolved, ('yarn://yarn-master', None, None, None))
    self.assertEqual(command[0], 'spark-submit')
def test_resolve_connection_yarn_default(self):
    """An empty conn_id resolves to the default yarn master (tuple-style API).

    NOTE(review): this name is redefined later in the file (dict-style API),
    so this definition is shadowed — verify which one should remain.
    """
    # Given
    hook = SparkSubmitHook(conn_id='')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_command(self._spark_job_file))

    # Then
    self.assertSequenceEqual(resolved, ('yarn', None, None, None))
    self.assertEqual(args["--master"], "yarn")
def test_resolve_connection_mesos_default_connection(self):
    """The mesos default connection resolves to its host:port master (tuple-style API).

    NOTE(review): this name is redefined later in the file (dict-style API),
    so this definition is shadowed — verify which one should remain.
    """
    # Given
    hook = SparkSubmitHook(conn_id='spark_default_mesos')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_command(self._spark_job_file))

    # Then
    self.assertSequenceEqual(resolved, ('mesos://host:5050', None, None, None))
    self.assertEqual(args["--master"], "mesos://host:5050")
def test_resolve_connection(self):
    """Walk through every sample connection and verify the resolved 4-tuple and CLI."""
    # Unknown conn_id -> plain yarn master.
    hook = SparkSubmitHook(conn_id='')
    self.assertEqual(hook._resolve_connection(), ('yarn', None, None, None))
    assert "--master yarn" in ' '.join(hook._build_command(self._spark_job_file))

    # Standard yarn connection carries the default queue.
    hook = SparkSubmitHook(conn_id='spark_default')
    self.assertEqual(hook._resolve_connection(),
                     ('yarn', 'root.default', None, None))
    joined = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master yarn" in joined
    assert "--queue root.default" in joined

    # Mesos master taken from the connection host/port.
    hook = SparkSubmitHook(conn_id='spark_default_mesos')
    self.assertEqual(hook._resolve_connection(),
                     ('mesos://host:5050', None, None, None))
    joined = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master mesos://host:5050" in joined

    # Queue and deploy mode come from the connection extras.
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    self.assertEqual(hook._resolve_connection(),
                     ('yarn://yarn-master', 'root.etl', 'cluster', None))
    joined = ' '.join(hook._build_command(self._spark_job_file))
    assert "--master yarn://yarn-master" in joined
    assert "--queue root.etl" in joined
    assert "--deploy-mode cluster" in joined

    # spark_home set -> the binary path is fully qualified.
    hook = SparkSubmitHook(conn_id='spark_home_set')
    self.assertEqual(hook._resolve_connection(),
                     ('yarn://yarn-master', None, None, '/opt/myspark'))
    joined = ' '.join(hook._build_command(self._spark_job_file))
    assert joined.startswith('/opt/myspark/bin/spark-submit')

    # spark_home absent -> bare 'spark-submit' from PATH.
    hook = SparkSubmitHook(conn_id='spark_home_not_set')
    self.assertEqual(hook._resolve_connection(),
                     ('yarn://yarn-master', None, None, None))
    joined = ' '.join(hook._build_command(self._spark_job_file))
    assert joined.startswith('spark-submit')
def test_resolve_connection_spark_yarn_cluster_connection(self):
    """Yarn cluster connection yields master, queue, and deploy mode (tuple-style API).

    NOTE(review): this name is redefined later in the file (dict-style API),
    so this definition is shadowed — verify which one should remain.
    """
    # Given
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_command(self._spark_job_file))

    # Then
    self.assertSequenceEqual(
        resolved, ('yarn://yarn-master', 'root.etl', 'cluster', None))
    self.assertEqual(args["--master"], "yarn://yarn-master")
    self.assertEqual(args["--queue"], "root.etl")
    self.assertEqual(args["--deploy-mode"], "cluster")
def test_resolve_connection_spark_binary_set_connection(self):
    """A custom spark_binary in the connection becomes the command executable."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_binary_set')

    # When
    resolved = hook._resolve_connection()
    command = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    self.assertEqual(resolved, {
        "master": "yarn",
        "spark_binary": "custom-spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
    })
    self.assertEqual(command[0], 'custom-spark-submit')
def test_resolve_connection_spark_standalone_cluster_connection(self):
    """Standalone-cluster connection: master, deploy mode, and spark_home-qualified binary."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_standalone_cluster')

    # When
    resolved = hook._resolve_connection()
    command = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    self.assertEqual(resolved, {
        "master": "spark://spark-standalone-master:6066",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": None,
        "spark_home": "/path/to/spark_home",
    })
    self.assertEqual(command[0], '/path/to/spark_home/bin/spark-submit')
def test_resolve_connection_mesos_default_connection(self):
    """The mesos default connection resolves to its host:port master (dict-style API)."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_default_mesos')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "master": "mesos://host:5050",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
    })
    self.assertEqual(args["--master"], "mesos://host:5050")
def test_resolve_connection_spark_home_set_connection(self):
    """spark_home in the connection prefixes the spark-submit executable path."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_home_set')

    # When
    resolved = hook._resolve_connection()
    command = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    self.assertEqual(resolved, {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": "/opt/myspark",
        "namespace": 'default',
    })
    self.assertEqual(command[0], '/opt/myspark/bin/spark-submit')
def test_resolve_connection_spark_binary_default_value(self):
    """Without an explicit spark_binary the hook defaults to 'spark-submit'."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_default')

    # When
    resolved = hook._resolve_connection()
    command = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    self.assertEqual(resolved, {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": 'root.default',
        "spark_home": None,
        "namespace": 'default',
    })
    self.assertEqual(command[0], 'spark-submit')
def test_resolve_connection_yarn_default(self):
    """An empty conn_id falls back to the default yarn master (dict-style API).

    NOTE(review): an identical method with this name appears twice in this
    file; only the last definition is executed — remove the duplicate.
    """
    # Given
    hook = SparkSubmitHook(conn_id='')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": 'default',
    })
    self.assertEqual(args["--master"], "yarn")
def test_resolve_connection_yarn_default(self):
    """An empty conn_id falls back to the default yarn master (dict-style API).

    NOTE(review): an identical method with this name appears twice in this
    file; only the last definition is executed — remove the duplicate.
    """
    # Given
    hook = SparkSubmitHook(conn_id='')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": 'default',
    })
    self.assertEqual(args["--master"], "yarn")
def test_resolve_connection_spark_yarn_cluster_connection(self):
    """Yarn cluster connection resolves master, queue, and deploy mode (dict-style API)."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": "root.etl",
        "spark_home": None,
    })
    self.assertEqual(args["--master"], "yarn://yarn-master")
    self.assertEqual(args["--queue"], "root.etl")
    self.assertEqual(args["--deploy-mode"], "cluster")
def test_resolve_connection_spark_k8s_cluster_connection(self):
    """K8s cluster connection carries namespace from the connection itself.

    NOTE(review): an identical method with this name appears twice in this
    file; only the last definition is executed — remove the duplicate.
    """
    # Given
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "mynamespace",
    })
    self.assertEqual(args["--master"], "k8s://https://k8s-master")
    self.assertEqual(args["--deploy-mode"], "cluster")
def test_resolve_connection_spark_k8s_cluster_connection(self):
    """K8s cluster connection carries namespace from the connection itself.

    NOTE(review): an identical method with this name appears twice in this
    file; only the last definition is executed — remove the duplicate.
    """
    # Given
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster')

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_spark_submit_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "mynamespace",
    })
    self.assertEqual(args["--master"], "k8s://https://k8s-master")
    self.assertEqual(args["--deploy-mode"], "cluster")
def test_resolve_connection_mesos_cluster_env_connection(self):
    """A connection defined only via an AIRFLOW_CONN_* environment variable resolves correctly.

    NOTE(review): the environment variable is not cleaned up afterwards and
    may leak into other tests — consider tearDown/patch.dict cleanup.
    """
    # Given: register a mesos connection purely through the environment.
    conn_name = self.gen_conn_name(10)
    env_key = "AIRFLOW_CONN_SPARK_{}".format(conn_name.upper())
    os.environ[env_key] = "mesos://mesos-master:5050"
    hook = SparkSubmitHook(conn_id='spark_{}'.format(conn_name))

    # When
    resolved = hook._resolve_connection()
    args = self.cmd_args_to_dict(hook._build_command(self._spark_job_file))

    # Then
    self.assertEqual(resolved, {
        "master": "mesos://mesos-master:5050",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
    })
    self.assertEqual(args["--master"], "mesos://mesos-master:5050")