def test_exec_failure(self, db_mock_class):
        """
        Test the execute function in case where the run failed.
        """
        run = {
            'notebook_params': NOTEBOOK_PARAMS,
            'notebook_task': NOTEBOOK_TASK,
            'jar_params': JAR_PARAMS
        }
        op = DatabricksRunNowOperator(task_id=TASK_ID, job_id=JOB_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.run_now.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED', '')

        with self.assertRaises(AirflowException):
            op.execute(None)

        expected = databricks_operator._deep_string_coerce({
            'notebook_params': NOTEBOOK_PARAMS,
            'notebook_task': NOTEBOOK_TASK,
            'jar_params': JAR_PARAMS,
            'job_id': JOB_ID
        })
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID,
            retry_limit=op.databricks_retry_limit,
            retry_delay=op.databricks_retry_delay)
        db_mock.run_now.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)

    def test_on_kill(self, db_mock_class):
        run = {
            'notebook_params': NOTEBOOK_PARAMS,
            'notebook_task': NOTEBOOK_TASK,
            'jar_params': JAR_PARAMS
        }
        op = DatabricksRunNowOperator(task_id=TASK_ID, job_id=JOB_ID, json=run)
        db_mock = db_mock_class.return_value
        op.run_id = RUN_ID

        op.on_kill()
        db_mock.cancel_run.assert_called_once_with(RUN_ID)

    def test_init_with_templating(self):
        json = {
            'notebook_params': NOTEBOOK_PARAMS,
            'jar_params': TEMPLATED_JAR_PARAMS
        }

        dag = DAG('test', start_date=datetime.now())
        op = DatabricksRunNowOperator(dag=dag, task_id=TASK_ID, job_id=JOB_ID, json=json)
        op.json = op.render_template('json', op.json, {'ds': DATE})
        expected = databricks_operator._deep_string_coerce({
            'notebook_params': NOTEBOOK_PARAMS,
            'jar_params': RENDERED_TEMPLATED_JAR_PARAMS,
            'job_id': JOB_ID
        })
        self.assertDictEqual(expected, op.json)

    def test_init_with_bad_type(self):
        json = {'test': datetime.now()}
        # Looks a bit weird since we have to escape regex reserved symbols.
        exception_message = r'Type \<(type|class) \'datetime.datetime\'\> used ' + \
                            r'for parameter json\[test\] is not a number or a string'
        with self.assertRaisesRegexp(AirflowException, exception_message):
            DatabricksRunNowOperator(task_id=TASK_ID, job_id=JOB_ID, json=json)

    def test_init_with_merging(self):
        """
        Test the initializer when json and other named parameters are both
        provided. The named parameters should override top level keys in the
        json dict.
        """
        override_notebook_params = {'workers': 999}
        json = {'notebook_params': NOTEBOOK_PARAMS, 'jar_params': JAR_PARAMS}

        op = DatabricksRunNowOperator(task_id=TASK_ID,
                                      json=json,
                                      job_id=JOB_ID,
                                      notebook_params=override_notebook_params,
                                      python_params=PYTHON_PARAMS,
                                      spark_submit_params=SPARK_SUBMIT_PARAMS)

        expected = databricks_operator._deep_string_coerce({
            'notebook_params': override_notebook_params,
            'jar_params': JAR_PARAMS,
            'python_params': PYTHON_PARAMS,
            'spark_submit_params': SPARK_SUBMIT_PARAMS,
            'job_id': JOB_ID
        })

        self.assertDictEqual(expected, op.json)

    def test_init_with_json(self):
        """
        Test the initializer with json data.
        """
        json = {
            'notebook_params': NOTEBOOK_PARAMS,
            'jar_params': JAR_PARAMS,
            'python_params': PYTHON_PARAMS,
            'spark_submit_params': SPARK_SUBMIT_PARAMS
        }
        op = DatabricksRunNowOperator(task_id=TASK_ID,
                                      job_id=JOB_ID,
                                      json=json)

        expected = databricks_operator._deep_string_coerce({
            'notebook_params': NOTEBOOK_PARAMS,
            'jar_params': JAR_PARAMS,
            'python_params': PYTHON_PARAMS,
            'spark_submit_params': SPARK_SUBMIT_PARAMS,
            'job_id': JOB_ID
        })

        self.assertDictEqual(expected, op.json)

    def test_init_with_named_parameters(self):
        """
        Test the initializer with the named parameters.
        """
        op = DatabricksRunNowOperator(job_id=JOB_ID, task_id=TASK_ID)
        expected = databricks_operator._deep_string_coerce({'job_id': 42})

        self.assertDictEqual(expected, op.json)
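
# A minimal sketch (assumption: not the actual Airflow implementation) of the
# coercion behavior the tests above rely on: _deep_string_coerce recursively
# converts numbers nested anywhere in the structure to strings, which is why the
# literal 42 above compares equal to the coerced operator json.
def _deep_string_coerce_sketch(content):
    if isinstance(content, (int, float)):
        return str(content)
    if isinstance(content, list):
        return [_deep_string_coerce_sketch(item) for item in content]
    if isinstance(content, dict):
        return {key: _deep_string_coerce_sketch(value)
                for key, value in content.items()}
    return content


assert _deep_string_coerce_sketch({'job_id': 42}) == {'job_id': '42'}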
Example #11
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/foo',
    }
}

script_job_config = {
    "name": "My_cool_task",
    "new_cluster": {
        "spark_version": "7.3.x-scala2.12",
        "num_workers": 1,
        "node_type_id": "Standard_D3_v2"
    },
    "spark_python_task": {
        "python_file": "dbfs:/my_job.py"
    }
}

submit_run_databricks_from_notebook = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_notebook",
    json=notebook_job_config,
    dag=dag)

submit_run_databricks_from_script = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_script",
    json=script_job_config,
    dag=dag)

run_now_databricks = DatabricksRunNowOperator(task_id="run_now_databricks",
                                              job_id=3,
                                              dag=dag)
Example #12
    'retries': 1,
    'retry_delay': timedelta(minutes=2)
}

with DAG('databricks_coins_dag',
         start_date=datetime(2021, 5, 8),
         schedule_interval='@monthly',
         catchup=False,
         default_args=default_args) as dag:

    daily_coins_notebook = DatabricksRunNowOperator(
        task_id='daily_coins_notebook',
        # name of the Databricks connection in Airflow
        databricks_conn_id='databricks',
        # job ID in Databricks which is parameterized with the notebook
        job_id=3,
        json={
            "notebook_params": {
                # Databricks notebook path
                'inPath': '/Users/XXXXXX/DailyCoinTreatment'
            }
        })

    monthly_coins_notebook = DatabricksRunNowOperator(
        task_id='monthly_coins_notebook',
        # name of the Databricks connection in Airflow
        databricks_conn_id='databricks',
        # job ID in Databricks which is parameterized with the notebook
        job_id=2,
        json={
            "notebook_params": {
                'inPath':
Example #13
import airflow
from datetime import datetime
from airflow import DAG
from airflow.contrib.operators.databricks_operator import DatabricksRunNowOperator

args = {
    'owner': 'airflow',
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='databricks_trigger_job',
          default_args=args,
          schedule_interval='0 12 * * *',
          start_date=datetime(2017, 7, 15),
          catchup=False)

job_run = DatabricksRunNowOperator(task_id='job_task', dag=dag, job_id=17160)

job_run
Example #14
        }
    },
    'query': "SELECT * FROM table_four"
}]
with DAG(dag_id='adb_pipeline',
         default_args=args,
         start_date=datetime(2019, 1, 1),
         schedule_interval='30 4 * * *',
         catchup=False) as dag:

    t1 = DummyOperator(task_id='kick_off_dag')

    t2 = S3KeySensor(task_id='check_for_file',
                     bucket_key='globetelecom/copy_*',
                     poke_interval=45,
                     timeout=600,
                     wildcard_match=True,
                     bucket_name=BUCKET,
                     aws_conn_id=S3_CONN_ID)

    for job in job_info:
        spark = DatabricksRunNowOperator(task_id=job['job_id'],
                                         job_id=job['job_id'],
                                         json=job['config'])

        query = PostgresOperator(
            task_id='post_{0}_query'.format(job['job_id']),
            sql=job['query'],
            postgres_conn_id='prod_postgres')
        t1 >> t2 >> spark >> query
import airflow
from airflow import DAG
from airflow.contrib.operators.databricks_operator import DatabricksRunNowOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import timedelta, datetime

# default arguments
args = {
    'owner': 'Daniel',
    'depends_on_past': False,
    'databricks_conn_id': 'my_databricks_conn'
}

with DAG(dag_id='testJobs',
         default_args=args,
         start_date=airflow.utils.dates.days_ago(1)) as dag:

    first_task = DummyOperator(task_id='first_task')

    notebook_1_task = DatabricksRunNowOperator(
        task_id='notebook_1',
        job_id=1,
        json={"notebook_params": {
            "param1": "AnyParam"
        }},
        do_xcom_push=True)

    first_task >> notebook_1_task
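
    # Hedged sketch, not part of the original example: with do_xcom_push=True the
    # operator pushes Databricks run metadata to XCom (keys 'run_id' and
    # 'run_page_url' in recent Airflow versions), so a downstream task could read
    # it back. Assumes: from airflow.operators.python_operator import PythonOperator
    def _print_run_id(**context):
        run_id = context['ti'].xcom_pull(task_ids='notebook_1', key='run_id')
        print('Databricks run_id: {}'.format(run_id))

    show_run_id = PythonOperator(task_id='show_run_id',
                                 python_callable=_print_run_id,
                                 provide_context=True)

    notebook_1_task >> show_run_id
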
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.databricks_operator import DatabricksRunNowOperator

args = {
    "owner": "airflow",
    "start_date": airflow.utils.dates.days_ago(2),
    "depends_on_past": True,
}

dag = DAG(
    dag_id="databricks_af_operator",
    default_args=args,
    schedule_interval="@daily",
)

t1 = BashOperator(
    task_id="t1_bash_operator",
    bash_command="echo Airflow execution date is {{ ds }} ",
    dag=dag,
)

json = {"notebook_params": {"airflow_param": "NeilAF", "dt": "{{ ds }}"}}
# by default uses the connection databricks_conn_id
t2 = DatabricksRunNowOperator(task_id="notebook_run",
                              json=json,
                              job_id=21598,
                              dag=dag)

t1 >> t2
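
# Hedged sketch, not part of the original example: to use a different Airflow
# connection instead of the default databricks_conn_id, pass it explicitly
# (assumes a connection named 'my_databricks_conn' has been created in Airflow).
t3 = DatabricksRunNowOperator(task_id="notebook_run_custom_conn",
                              databricks_conn_id="my_databricks_conn",
                              json=json,
                              job_id=21598,
                              dag=dag)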
Example #17
# Rather than construct our task in one block of key-value parameters, we'll use the named parameters
# of DatabricksSubmitRunOperator to initialize the operator.
# Again, this will create a new cluster for the duration of the task.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    new_cluster=cluster_spec,
    spark_jar_task={'main_class_name': 'com.example.ProcessData'},
    libraries=[{
        'jar': 'dbfs:/lib/etl-0.1.jar'
    }])
# The 'libraries' argument allows you to attach libraries to the cluster that will be instantiated
# to execute the task.

# Finally, we have a python script that we wish to execute on an existing cluster, on successful completion
# of both of the first two tasks.

# In this case, the job is already defined in the Databricks workspace and `job_id` is used to identify it.
# Arguments can be passed to the job using `notebook_params`, `python_params` or `spark_submit_params`
# as appropriate.
# The association with the existing cluster would be part of that existing job's definition and configurable through
# the Jobs UI in the Databricks workspace.
spark_python_task = DatabricksRunNowOperator(task_id='spark_python_task',
                                             dag=dag,
                                             job_id=1337,
                                             python_params=['country', 'DK'])
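
# As the comment above notes, arguments can also be passed with notebook_params or
# spark_submit_params. A hedged sketch (the task id and parameter values here are
# illustrative only, not part of the original example):
spark_submit_variant = DatabricksRunNowOperator(
    task_id='spark_submit_variant',
    dag=dag,
    job_id=1337,
    spark_submit_params=['--class', 'com.example.ProcessData'])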

# Define the order in which these jobs must run
notebook_task.set_downstream(spark_python_task)
spark_jar_task.set_downstream(spark_python_task)
    'depends_on_past': False,
    'databricks_conn_id': 'adb_workspace'
}

# DAG with Context Manager
# refer to https://airflow.apache.org/docs/stable/concepts.html?highlight=connection#context-manager
# trigger workflow daily at 04:30 am
with DAG(dag_id='adb_pipeline',
         default_args=args,
         start_date=airflow.utils.dates.days_ago(1),
         schedule_interval='30 4 * * *') as dag:

    # job 1 definition and configurable through the Jobs UI in the Databricks workspace
    notebook_1_task = DatabricksRunNowOperator(
        task_id='notebook_1',
        job_id=1,
        json={"notebook_params": {
            'inPath': '/bronze/uber'
        }})

    # Arguments can be passed to the job using `notebook_params`, `python_params` or `spark_submit_params`
    json_2 = {"notebook_params": {'inPath': '/bronze/kaggle'}}

    notebook_2_task = DatabricksRunNowOperator(task_id='notebook_2',
                                               job_id=2,
                                               json=json_2)

    # input parameters for job 3
    json_3 = {
        "notebook_params": {
            'bronzePath': '/bronze/',
            'silverPath': '/silver'
Example #19
args = {
    'owner': 'pluralsightws165',
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0)
}

dag = DAG(dag_id='ETL_newyork_taxi_operator',
          default_args=args,
          schedule_interval='@daily')
cluster_id = None

with dag:
    # Example of triggering pre-defined Databricks jobs by job_id, passing
    # notebook_params as a named parameter rather than via json.

    mount_storage_accounts = DatabricksRunNowOperator(
        task_id='mount_storage_accounts_task', job_id=100, notebook_params={})

    copy_data_to_delta_lake = DatabricksRunNowOperator(
        task_id='copy_data_to_delta_lake_task', job_id=117, notebook_params={})

    run_etl_yellow_taxi = DatabricksRunNowOperator(
        task_id='yellow_taxi_etl_task', job_id=108, notebook_params={})

    run_etl_green_taxi = DatabricksRunNowOperator(
        task_id='green_taxi_etl_task', job_id=123, notebook_params={})

    run_etl_fhv_taxi = DatabricksRunNowOperator(task_id='fhv_taxi_etl_task',
                                                job_id=133,
                                                notebook_params={})

    un_mount_storage = DatabricksRunNowOperator(