def test_exec_success(self, db_mock_class):
    """
    Test the execute function in case where the run is successful.
    """
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    db_mock.submit_run.return_value = 1
    db_mock.get_run_state.return_value = RunState('TERMINATED', 'SUCCESS', '')

    op.execute(None)

    expected = op._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID
    })
    db_mock_class.assert_called_once_with(
        DEFAULT_CONN_ID,
        retry_limit=op.databricks_retry_limit)
    db_mock.submit_run.assert_called_once_with(expected)
    db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
    db_mock.get_run_state.assert_called_once_with(RUN_ID)
    self.assertEqual(RUN_ID, op.run_id)
def test_exec_failure(self, db_mock_class):
    """
    Test the execute function in case where the run failed.
    """
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    db_mock.submit_run.return_value = 1
    db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED', '')

    with self.assertRaises(AirflowException):
        op.execute(None)

    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    db_mock_class.assert_called_once_with(
        DEFAULT_CONN_ID,
        retry_limit=op.databricks_retry_limit,
        retry_delay=op.databricks_retry_delay)
    db_mock.submit_run.assert_called_once_with(expected)
    db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
    db_mock.get_run_state.assert_called_once_with(RUN_ID)
    self.assertEqual(RUN_ID, op.run_id)
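# The two execute tests above pin down a contract: submit_run() returns a run
# id which the operator stores, get_run_page_url() is fetched once, and
# get_run_state() is polled until a terminal state, raising AirflowException
# on failure. A minimal sketch of an execute() satisfying them follows; the
# get_hook() wiring, polling_period_seconds attribute, and log wording are
# assumptions, not the shipped implementation.
import time

from airflow.exceptions import AirflowException


def execute(self, context):
    hook = self.get_hook()
    self.run_id = hook.submit_run(self.json)
    run_page_url = hook.get_run_page_url(self.run_id)
    self.log.info('View run status, Spark UI, and logs at %s', run_page_url)
    while True:
        run_state = hook.get_run_state(self.run_id)
        if run_state.is_terminal:
            if run_state.is_successful:
                return
            raise AirflowException(
                '{t} failed with terminal state: {s}'.format(
                    t=self.task_id, s=run_state))
        time.sleep(self.polling_period_seconds)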
def test_init_with_named_parameters(self):
    """
    Test the initializer with the named parameters.
    """
    op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                     new_cluster=NEW_CLUSTER,
                                     notebook_task=NOTEBOOK_TASK)
    expected = op._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID
    })
    self.assertDictEqual(expected, op.json)
def test_on_kill(self, db_mock_class):
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    op.run_id = RUN_ID

    op.on_kill()

    db_mock.cancel_run.assert_called_once_with(RUN_ID)
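# The assertion above implies on_kill() simply forwards the stored run_id to
# the hook's cancel_run(). A minimal sketch consistent with the test; the
# get_hook() wiring and the log message are assumptions.
def on_kill(self):
    hook = self.get_hook()
    hook.cancel_run(self.run_id)
    self.log.info('Task %s was requested to be cancelled; run_id: %s',
                  self.task_id, self.run_id)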
def test_init_with_json(self):
    """
    Test the initializer with json data.
    """
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
    expected = op._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_templating(self):
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': TEMPLATED_NOTEBOOK_TASK,
    }
    dag = DAG('test', start_date=datetime.now())
    op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
    op.json = op.render_template('json', op.json, {'ds': DATE})
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_templating(self):
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': TEMPLATED_NOTEBOOK_TASK,
    }
    dag = DAG('test', start_date=datetime.now())
    op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
    op.render_template_fields(context={'ds': DATE})
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def databricks_operator_with_env(self, dag):
    databricks_cluster_params = {
        "spark_version": "6.5.x-scala2.11",
        "node_type_id": "m5a.large",
        "aws_attributes": {
            "availability": "SPOT_WITH_FALLBACK",
            "ebs_volume_count": 1,
            "ebs_volume_type": "GENERAL_PURPOSE_SSD",
            "ebs_volume_size": 100,
        },
        "spark_env_vars": {"DBND__VERBOSE": "True"},
        "num_workers": 1,
    }
    databricks_task_params = {
        "name": "generate report",
        "new_cluster": databricks_cluster_params,
        "libraries": [{"pypi": {"package": "dbnd"}}],
        "max_retries": 1,
        "spark_python_task": {
            "python_file": "s3://databricks/scripts/databricks_report.py"
        },
    }
    return DatabricksSubmitRunOperator(task_id="databricks_task",
                                       json=databricks_task_params)
def test_init_with_bad_type(self):
    json = {'test': datetime.now()}
    # Looks a bit weird since we have to escape regex reserved symbols.
    exception_message = r'Type \<(type|class) \'datetime.datetime\'\> used ' + \
                        r'for parameter json\[test\] is not a number or a string'
    with self.assertRaisesRegex(AirflowException, exception_message):
        DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
def test_deep_string_coerce(self):
    op = DatabricksSubmitRunOperator(task_id='test')
    test_json = {
        'test_int': 1,
        'test_float': 1.0,
        'test_dict': {'key': 'value'},
        'test_list': [1, 1.0, 'a', 'b'],
        'test_tuple': (1, 1.0, 'a', 'b')
    }
    expected = {
        'test_int': '1',
        'test_float': '1.0',
        'test_dict': {'key': 'value'},
        'test_list': ['1', '1.0', 'a', 'b'],
        'test_tuple': ['1', '1.0', 'a', 'b']
    }
    self.assertDictEqual(op._deep_string_coerce(test_json), expected)
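# Together with test_init_with_bad_type above, this test pins down the
# coercion contract: strings pass through, numbers become strings, lists,
# tuples and dicts are walked recursively, and anything else raises. A
# minimal module-level sketch satisfying both tests (the shipped Airflow
# helper may differ in details):
from airflow.exceptions import AirflowException


def _deep_string_coerce(content, json_path='json'):
    """Recursively coerce numbers to strings; reject unsupported types."""
    if isinstance(content, str):
        return content
    elif isinstance(content, (int, float)):
        # The Databricks API tolerates string-typed numbers, so coerce
        # uniformly rather than guessing which fields are numeric.
        return str(content)
    elif isinstance(content, (list, tuple)):
        return [_deep_string_coerce(e, '{0}[{1}]'.format(json_path, i))
                for i, e in enumerate(content)]
    elif isinstance(content, dict):
        return {k: _deep_string_coerce(v, '{0}[{1}]'.format(json_path, k))
                for k, v in content.items()}
    else:
        raise AirflowException(
            'Type {0} used for parameter {1} is not a number or a string'
            .format(type(content), json_path))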
def test_init_with_merging(self):
    """
    Test the initializer when json and other named parameters are both
    provided. The named parameters should override top level keys in the
    json dict.
    """
    override_new_cluster = {'workers': 999}
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                     json=json,
                                     new_cluster=override_new_cluster)
    expected = op._deep_string_coerce({
        'new_cluster': override_new_cluster,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_specified_run_name(self):
    """
    Test the initializer with a specified run_name.
    """
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': RUN_NAME
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': RUN_NAME
    })
    self.assertDictEqual(expected, op.json)
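# The four initializer tests above describe the same constructor behaviour:
# named parameters overwrite top-level keys of the supplied json, task_id is
# used as run_name only when none was given, and the result is passed through
# the string coercion helper. A sketch under those assumptions (parameter
# list trimmed to the ones exercised here):
def __init__(self, json=None, new_cluster=None, notebook_task=None,
             run_name=None, **kwargs):
    super().__init__(**kwargs)
    self.json = json or {}
    if new_cluster is not None:
        self.json['new_cluster'] = new_cluster      # named args win over json
    if notebook_task is not None:
        self.json['notebook_task'] = notebook_task
    if run_name is not None:
        self.json['run_name'] = run_name
    elif 'run_name' not in self.json:
        self.json['run_name'] = self.task_id        # default run_name
    self.json = _deep_string_coerce(self.json)      # see sketch above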
def Workflow_0(config):
    if config.fabric == "azdb":
        workflow_id = ""
        workflow_version = "latest"
        workflow_jar = "s3://abinitio-spark-redshift-testing/prophecy/jars//latest/workflow.jar"
        prophecy_libs_jar = "s3://abinitio-spark-redshift-testing/prophecy/jars/libs/version/prophecy-libs-assembly-1.0.jar"
    workflow = DatabricksSubmitRunOperator(
        task_id="Workflow_0",
        new_cluster="Small",
        spark_jar_task={
            "main_class_name": "main",
            "parameters": ["-C", "fabricName=" + config.fabric]
        },
        databricks_conn_id=config.connection_id,
        # 'libraries' takes a list of dicts; a single dict with a repeated
        # "jar" key would silently keep only the last entry.
        libraries=[{"jar": workflow_jar}, {"jar": prophecy_libs_jar}])
    return workflow, workflow
def prophecy_workflow1(config) -> BaseOperator:
    workflow_id = "381"
    workflow_version = "latest"
    return DatabricksSubmitRunOperator(
        task_id='prophecy_workflow1',
        new_cluster=config.fabric['job_sizes'][config.job_size],
        spark_jar_task={
            'main_class_name': 'Main',
            'parameters': ['-C', 'fabricName=' + (config.fabric['name'])]
        },
        databricks_conn_id=config.connId,
        libraries=[{
            'jar': 'dbfs:/FileStore/jars/prophecy/management/app/dp/%s/%s/workflow.jar'
                   % (workflow_id, workflow_version)
        }, {
            'jar': 'dbfs:/FileStore/jars/prophecy/management/app/dp/prophecy-libs/a9ca779efa7418f84186228725e35b0063acf006/prophecy-libs.jar'
        }])
    'node_type_id': 'r3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 8
}

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/PrepareData',
    },
}

# Example of using the JSON parameter to initialize the operator.
notebook_task = DatabricksSubmitRunOperator(
    task_id='notebook_task',
    dag=dag,
    json=notebook_task_params)

# Example of using the named parameters of DatabricksSubmitRunOperator
# to initialize the operator.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    new_cluster=new_cluster,
    spark_jar_task={
        'main_class_name': 'com.example.ProcessData'
    },
    libraries=[
        {
            'jar': 'dbfs:/lib/etl-0.1.jar'
        }
    ])
"param2": "123" }, 'notebook_path': '/Users/[email protected]/airflow/airflow_test', }, } # notebook_task_params2 = { # 'new_cluster': new_cluster, # 'notebook_task': {'base_parameters':{"retailer_name":context['dag_run'].conf.get('retailer_name')}, # 'notebook_path': '/Users/[email protected]/airflow/airflow_test_2', # }, # } notebook_task = DatabricksSubmitRunOperator(task_id='Run-notebook-1', dag=dag, json=notebook_task_params) notebook_task2 = DatabricksSubmitRunOperator( task_id='Run-notebook-2', dag=dag, json={ 'new_cluster': new_cluster, 'notebook_task': { 'base_parameters': { "retailer_name": '{{ dag_run.conf["retailer_name"] if dag_run else "" }}', "cat": '{{ dag_run.conf["cat"] if dag_run else "" }}', "fam": '{{ dag_run.conf["fam"] if dag_run else "" }}' }, 'notebook_path':
        'notebook_path': '/Users/[email protected]/PrepareData',
        'base_parameters': {
            'output_path': '/mnt/path/to/output'
        }
    },
}
# The above block of key-value parameters is equivalent to the 'new cluster'
# and 'notebook task' objects supplied to the Databricks Runs Submit API.
# More info here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#runs-submit
# and here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#newcluster
# and here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#notebooktask

# We'll feed all of our parameters to the DatabricksSubmitRunOperator via its
# `json` parameter.
notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            dag=dag,
                                            json=notebook_task_params)

# Our second task, which is independent of the first, executes a Spark JAR
# (i.e. compiled Scala code). Rather than construct our task in one block of
# key-value parameters, we'll use the named parameters of
# DatabricksSubmitRunOperator to initialize the operator.
# Again, this will create a new cluster for the duration of the task.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    new_cluster=cluster_spec,
    spark_jar_task={'main_class_name': 'com.example.ProcessData'},
    libraries=[{
        'jar': 'dbfs:/lib/etl-0.1.jar'
    }])
# The 'libraries' argument allows you to attach libraries to the cluster that
# will be instantiated
def create_dag(dag_id, description, conf, date):
    default_args = {
        'owner': 'airflow',
        'email': conf.get('dag_email').split(','),
        'email_on_failure': conf.get('dag_email_on_failure'),
        'email_on_retry': conf.get('dag_email_on_retry'),
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
        'depends_on_past': conf.get('dag_depends_on_past'),
    }
    dag = DAG(
        dag_id=dag_id,
        description=description,
        schedule_interval=conf.get('dag_schedule_interval'),
        template_searchpath=[conf.get('sql_path'), conf.get('email_path')],
        default_args=default_args,
        start_date=datetime(*map(int, conf.get('dag_start_date').split(','))),
        catchup=conf.get('dag_catchup', True))
    with dag:
        misc_search_expr = download_search_expr = None
        search_exprs = [file_spec.get('search_expr')
                        for file_spec in conf.get('file_specs').values()]
        if None in search_exprs:
            misc_search_expr = '^(?!{})'.format(
                '|'.join([s for s in search_exprs if s is not None]))
        else:
            download_search_expr = '({})'.format(
                '|'.join([s for s in search_exprs if s is not None]))
        download = FTPSearchOperator(
            task_id='ftp_download',
            ftp_conn_id=conf.get('ftp_conn_id'),
            local_filepath=conf.get('data_path'),
            remote_filepath=conf.get('remote_inbound_path'),
            search_expr=download_search_expr,
            min_date="{{ execution_date }}",
            max_date="{{ next_execution_date }}",
            ftp_conn_type=conf.get('ftp_conn_type'))
        remove_tmp_files = PythonOperator(
            task_id='remove_local_files',
            provide_context=True,
            python_callable=remove_local_files,
            op_kwargs={"download_directory": conf.get('data_path'),
                       "file_list_location": download.task_id},
            trigger_rule='none_failed')
        for filename, file_spec in conf.get('file_specs').items():
            file_spec = {**{k: v for k, v in conf.items()
                            if k in ('search_expr', 'gpg_decrypt', 'unzip',
                                     'import', 'output_date_format')},
                         **file_spec}
            date_str = date.strftime(file_spec.get('output_date_format', '%Y-%m-%d'))
            input_s3_dir = "s3://{}/{}".format(
                conf.get('s3_bucket'),
                parse_directory_pattern(
                    file_spec['directory_pattern'], date_str, 'csv').lstrip("/"))
            check_for_files = PythonOperator(
                task_id='check_for_{}_files'.format(filename),
                provide_context=True,
                python_callable=skip_if_no_files,
                op_kwargs={"search_expr": file_spec.get('search_expr') or misc_search_expr})
            file_list_xcom_location = check_for_files.task_id
            if file_spec.get('unzip'):
                unzip_files = UnzipOperator(
                    task_id='unzip_{}_files'.format(filename),
                    file_list_xcom_location=file_list_xcom_location)
                file_list_xcom_location = unzip_files.task_id
            else:
                unzip_files = DummyOperator(task_id='unzip_{}_files'.format(filename))
            if file_spec.get('gpg_decrypt'):
                decrypt = CryptographyOperator(
                    task_id='decrypt_{}_files'.format(filename),
                    crypto_conn_id=conf.get('crypt_conn'),
                    file_list_xcom_location=file_list_xcom_location,
                    output_directory=conf.get('data_path'),
                    remove_encrypted=True,
                    operation='decrypt')
                file_list_xcom_location = decrypt.task_id
            else:
                decrypt = DummyOperator(task_id='decrypt_{}_files'.format(filename))
            save_to_s3 = LocalToS3Operator(
                task_id='save_{}_files_to_s3'.format(filename),
                s3_conn_id=conf.get('aws_connection_id'),
                s3_bucket=conf.get('s3_bucket'),
                s3_prefix=input_s3_dir,
                file_list_xcom_location=file_list_xcom_location)
            if file_spec.get('import'):
                import_file = DatabricksSubmitRunOperator(
                    task_id='import_{}_file'.format(filename),
                    job_id='{}dynamic_workflow_file_import'.format(
                        conf.get('databricks_job_prefix')),
                    polling_period_seconds=60 * 3,
                    notebook_params={
                        "config_path": file_spec['config_path'],
                        "file_date": date_str,
                        "file_path": "{}/{}".format(
                            input_s3_dir,
                            str(file_spec.get('unzipped_search_expr',
                                              file_spec['search_expr'])).replace(".*", "*"))})
                save_to_s3 >> import_file
            download >> check_for_files >> unzip_files >> decrypt >> save_to_s3 >> remove_tmp_files
    return dag
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='databricks_create_job',
          default_args=args,
          schedule_interval=None)

test_cluster = {
    # The API expects the programmatic runtime key, not the UI label
    # '6.5 (includes Apache Spark 2.4.5, Scala 2.11)'.
    'spark_version': '6.5.x-scala2.11',
    'node_type_id': 'm5.large',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 1
}

notebook_task_params = {
    'new_cluster': test_cluster,  # the Runs Submit API key is 'new_cluster'
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/Data Lake PoC',
    },
}

notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            dag=dag,
                                            json=notebook_task_params)
notebook_task
import tempfile
from pathlib import Path

import httplib2
from googleapiclient import discovery
from oauth2client import file

from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator

API_SERVICE_NAME = 'webmasters'
API_VERSION = 'v3'

with tempfile.NamedTemporaryFile() as tmp:
    # Write a serialized OAuth2Credentials blob so oauth2client can load it
    # back from disk.
    tmp.write(str.encode(
        '{"_module": "oauth2client.client", "scopes": ["https://www.googleapis.com/auth/webmasters.readonly"], "token_expiry": "2018-09-10T13:58:19Z", "id_token": null, "user_agent": null, "access_token": "ya29.GlsUBt4GD-bZZgOQZyOxMo28F14c4dGb4fqBP-zwoqGtCf1JNGC_u_F6Ya7WzIq9A8dGH_w3cOotGocTG2YyqWUV2Zn8oDr7TdH0ukh6PLqbww1bqhD7dlHvucq4", "token_uri": "https://www.googleapis.com/oauth2/v3/token", "invalid": false, "token_response": {"access_token": "ya29.GlsUBt4GD-bZZgOQZyOxMo28F14c4dGb4fqBP-zwoqGtCf1JNGC_u_F6Ya7WzIq9A8dGH_w3cOotGocTG2YyqWUV2Zn8oDr7TdH0ukh6PLqbww1bqhD7dlHvucq4", "scope": "https://www.googleapis.com/auth/webmasters.readonly", "expires_in": 3600, "token_type": "Bearer"}, "client_id": "551103279375-v7dc84rm7ba3hr7gr9h477ag1q20pm77.apps.googleusercontent.com", "token_info_uri": "https://www.googleapis.com/oauth2/v3/tokeninfo", "client_secret": "oGrW2HJ4jiJ-ttsu-Ij-_OPd", "revoke_uri": "https://accounts.google.com/o/oauth2/revoke", "_class": "OAuth2Credentials", "refresh_token": "1/q8xG064hS-vnIFKhmZpyjM_HtCdrJ9Q7SckX8fbsgZM", "id_token_jwt": null}'
    ))
    tmp.flush()
    print(tmp.name)
    storage = file.Storage(tmp.name)
    credentials = storage.get()
    http = credentials.authorize(http=httplib2.Http())
    service = discovery.build(API_SERVICE_NAME, API_VERSION, http=http,
                              cache_discovery=False)
    print(credentials)
    print(http)
    print(service)

# Hypothetical task_id; a run configuration (json / new_cluster, etc.) still
# needs to be supplied before this operator can actually run.
task = DatabricksSubmitRunOperator(task_id='task')
notebook_job_config = {
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/foo',
    }
}

script_job_config = {
    "name": "My_cool_task",
    "new_cluster": {
        "spark_version": "7.3.x-scala2.12",
        "num_workers": 1,
        "node_type_id": "Standard_D3_v2"
    },
    "spark_python_task": {
        "python_file": "dbfs:/my_job.py"
    }
}

submit_run_databricks_from_notebook = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_notebook",
    json=notebook_job_config,
    dag=dag)

submit_run_databricks_from_script = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_script",
    json=script_job_config,
    dag=dag)

run_now_databricks = DatabricksRunNowOperator(task_id="run_now_databricks",
                                              job_id=3,
                                              dag=dag)
default_args = {
    'end_date': None,
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('DE_dag', default_args=default_args, schedule_interval='@daily')

DE_sh = BashOperator(
    task_id='DE_Equipment_Trader_SCRAPE',
    bash_command="python3 /home/ec2-user/DE/equipment_trader.py ",
    queue="pipeline2",
    dag=dag)

notebook_task_params = {
    'existing_cluster_id': '0128-230140-huts317',  # cluster id of MIG Cluster 2
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/DE_sales_inventory'
    }
}

DE_notebook_task = DatabricksSubmitRunOperator(
    task_id='DE_sales_inventory',
    dag=dag,
    queue='pipeline2',
    json=notebook_task_params)

DE_sh.set_downstream(DE_notebook_task)
notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'base_parameters': {
            "retailer_name": retailer_name,
            "version": version,
            "category": category,
            "family": family,
            'store_group_id': store_group_id
        },
        'notebook_path': '/Users/[email protected]/CPGAI_modeling/01_read_data',
    },
}

notebook_task = DatabricksSubmitRunOperator(
    task_id='Read-data-and-build-high-bucket-models',
    dag=dag,
    json=notebook_task_params)

notebook_task2 = DatabricksSubmitRunOperator(
    task_id='Run-high-low-and-final-ranking',
    dag=dag,
    json={
        'new_cluster': new_cluster2,
        'notebook_task': {
            'base_parameters': {
                "retailer_name": retailer_name,
                "version": version,
                "category": category,
                "family": family,
                'store_group_id': store_group_id
            },
args = {
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 29),
    'schedule_interval': None,  # None (not the string 'none') disables scheduling
}

dag = DAG(dag_id='yelp_databricks_operator', default_args=args)

s3_mount_params = {
    'existing_cluster_id': '0122-135412-chair803',
    'notebook_task': {
        'notebook_path': '/Yelp Analytics/Mount_yelp_data_from_s3',
    },
}

notebook_task1 = DatabricksSubmitRunOperator(task_id='mount_s3_notebook_task',
                                             dag=dag,
                                             json=s3_mount_params)

preprocess_biz_params = {
    'existing_cluster_id': '0122-135412-chair803',
    'notebook_task': {
        'notebook_path': '/Yelp Analytics/Pre_Processing_Business_Data',
    },
}

notebook_task2 = DatabricksSubmitRunOperator(
    task_id='preprocess_biz_notebook_task',
    dag=dag,
    json=preprocess_biz_params)

preprocess_checkin_params = {
"ebs_volume_size": 100 }, 'num_workers': 1 } notebook_spark_load_data_params = { 'new_cluster': new_cluster, 'notebook_task': { 'notebook_path': '/Users/[email protected]/spark-load-data', }, } '''Load data to AWS''' spark_load_data = DatabricksSubmitRunOperator( task_id='run_spark_load_data_to_aws', new_cluster=new_cluster, notebook_task={ 'notebook_path': '/Users/[email protected]/spark-load-data' }, do_xcom_push=True, dag=dag) notebook_spark_daily_calculations_params = { 'new_cluster': new_cluster, 'notebook_task': { 'notebook_path': '/Users/[email protected]/spark-calculate-data', }, } '''Load data to AWS''' spark_daily_calculations = DatabricksSubmitRunOperator( task_id='run_spark_daily_calculations', do_xcom_push=True, dag=dag,
args = {
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2)
}

new_cluster = {
    'spark_version': '6.0.x-scala2.11',
    'node_type_id': 'i3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 2
}

dag = DAG(dag_id='example_databricks_operator',
          default_args=args,
          schedule_interval='@daily')

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Repos/[email protected]/airflow_demo/src/example_job.py'
    }
}

notebook_task = DatabricksSubmitRunOperator(
    task_id='Airflow_',
    databricks_conn_id='databricks_default',
    dag=dag,
    json=notebook_task_params)
        'instance_profile_arn': 'arn:aws:iam::00000000000:instance-profile/de-instance-profile'
    },
    'num_workers': 1
}

# set path to repartition.py file in Databricks catalog
notebook_params = {
    'new_cluster': etl_cluster,
    'notebook_task': {
        'notebook_path': '/path_to_file_in_databricks/repartition'
    }
}

run_process_data = DatabricksSubmitRunOperator(task_id='process_data',
                                               json=notebook_params,
                                               retries=2,
                                               dag=dag)

### V1 with PythonOperator which executes run_add_partitions func from athena.py
run_repair_partition = PythonOperator(
    task_id="repair_partition",
    dag=dag,
    python_callable=run_add_partitions,
    execution_timeout=timedelta(minutes=10),
    provide_context=True,
)

### V2 with AWSAthenaOperator
run_repair_partition = AWSAthenaOperator(
    task_id='repair_partition',
    query='MSCK REPAIR TABLE amplitude_feed',
new_cluster = {
    'spark_version': '2.1.0-db3-scala2.11',
    'node_type_id': 'r3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 8
}

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/PrepareData',
    },
}

# Example of using the JSON parameter to initialize the operator.
notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            json=notebook_task_params)

# Example of using the named parameters of DatabricksSubmitRunOperator
# to initialize the operator.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    new_cluster=new_cluster,
    spark_jar_task={'main_class_name': 'com.example.ProcessData'},
    libraries=[{
        'jar': 'dbfs:/lib/etl-0.1.jar'
    }])

notebook_task >> spark_jar_task
default_args = {
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 8),
    'end_date': None,
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('WGO_dag', default_args=default_args, schedule_interval='@daily')

WGO_sh = BashOperator(
    task_id='WGO_RV_Trader_SCRAPE',
    bash_command="python3 /home/ec2-user/WGO/rv_trader_mobile_scrape.py ",
    queue="pipeline2",
    dag=dag)

notebook_task_params = {
    'existing_cluster_id': '0128-230140-huts317',  # cluster id of MIG Cluster 2
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/WGO_inventory_analysis'
    }
}

WGO_notebook_task = DatabricksSubmitRunOperator(task_id='WGO_notebook_task',
                                                dag=dag,
                                                queue='pipeline2',
                                                json=notebook_task_params)

WGO_sh.set_downstream(WGO_notebook_task)
from datetime import timedelta
from os import environ

from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator
from airflow.utils.dates import days_ago

dag = DAG(dag_id='raw-to-parquet',
          default_args=None,
          schedule_interval=None,
          start_date=days_ago(2),
          catchup=False)

spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    existing_cluster_id='1234',
    spark_jar_task={
        'main_class_name': 'com.example.ProcessData'
    },
    libraries=[
        {
            'jar': 'dbfs:/lib/etl-0.1.jar'
        }
    ])
default_args = {
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('amazon_notebook_dag',
          default_args=default_args,
          schedule_interval='30 12 * * sun,mon,tue,wed,thu,fri,sat')

'''
amazon_scrape = BashOperator(
    task_id='amazon_scrape',
    bash_command="/home/ec2-user/SHELL/AMAZON2.sh ",
    email_on_failure=True,
    email=email_list,
    queue='pipeline9',
    dag=dag)
'''

notebook_task_params = {
    'existing_cluster_id': '0128-230140-huts317',  # cluster id of MIG Cluster 2
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/amazon_analysis_xbyte'
    }
}

notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            email_on_failure=True,
                                            email=email_list,
                                            dag=dag,
                                            queue='pipeline9',
                                            json=notebook_task_params)