def test_exec_failure(self, db_mock_class):
        """
        Test the execute function in case where the run failed.
        """
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED', '')

        with self.assertRaises(AirflowException):
            op.execute(None)

        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID,
        })
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID,
            retry_limit=op.databricks_retry_limit,
            retry_delay=op.databricks_retry_delay)
        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)
    def test_exec_success(self, db_mock_class):
        """
        Test the execute function in case where the run is successful.
        """
        run = {
          'new_cluster': NEW_CLUSTER,
          'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'SUCCESS', '')

        op.execute(None)

        expected = op._deep_string_coerce({
          'new_cluster': NEW_CLUSTER,
          'notebook_task': NOTEBOOK_TASK,
          'run_name': TASK_ID
        })
        db_mock_class.assert_called_once_with(
                DEFAULT_CONN_ID,
                retry_limit=op.databricks_retry_limit)
        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)
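The db_mock_class argument in the tests above is supplied by a mock patch on the Databricks hook; a minimal sketch of that wiring, assuming the contrib-era patch target and a unittest-style test class (both assumptions, not shown in the snippet):

import unittest
from unittest import mock

# Assumed patch target; the exact module path depends on the Airflow version in use.
@mock.patch('airflow.contrib.operators.databricks_operator.DatabricksHook')
class DatabricksSubmitRunOperatorTest(unittest.TestCase):
    def test_exec_success(self, db_mock_class):
        # The class-level decorator patches DatabricksHook for every test_* method
        # and appends the mocked class as the extra argument seen in the tests above.
        ...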
Example #3
    def test_exec_success(self, db_mock_class):
        """
        Test the execute function in case where the run is successful.
        """
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'SUCCESS',
                                                      '')

        op.execute(None)

        expected = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID
        }
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID, retry_limit=op.databricks_retry_limit)
        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)
    def test_exec_failure(self, db_mock_class):
        """
        Test the execute function in case where the run failed.
        """
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED',
                                                      '')

        with self.assertRaises(AirflowException):
            op.execute(None)

        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID,
        })
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID,
            retry_limit=op.databricks_retry_limit,
            retry_delay=op.databricks_retry_delay)
        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)
 def test_init_with_named_parameters(self):
     """
     Test the initializer with the named parameters.
     """
     op = DatabricksSubmitRunOperator(task_id=TASK_ID, new_cluster=NEW_CLUSTER, notebook_task=NOTEBOOK_TASK)
     expected = op._deep_string_coerce({
       'new_cluster': NEW_CLUSTER,
       'notebook_task': NOTEBOOK_TASK,
       'run_name': TASK_ID
     })
     self.assertDictEqual(expected, op.json)
    def test_on_kill(self, db_mock_class):
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        op.run_id = RUN_ID

        op.on_kill()

        db_mock.cancel_run.assert_called_once_with(RUN_ID)
 def test_init_with_json(self):
     """
     Test the initializer with json data.
     """
     json = {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK}
     op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
     expected = op._deep_string_coerce({
         'new_cluster': NEW_CLUSTER,
         'notebook_task': NOTEBOOK_TASK,
         'run_name': TASK_ID
     })
     self.assertDictEqual(expected, op.json)
 def test_init_with_templating(self):
     json = {
         'new_cluster': NEW_CLUSTER,
         'notebook_task': TEMPLATED_NOTEBOOK_TASK,
     }
     dag = DAG('test', start_date=datetime.now())
     op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
     op.json = op.render_template('json', op.json, {'ds': DATE})
     expected = databricks_operator._deep_string_coerce({
         'new_cluster': NEW_CLUSTER,
         'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
         'run_name': TASK_ID,
     })
     self.assertDictEqual(expected, op.json)
 def test_init_with_templating(self):
     json = {
         'new_cluster': NEW_CLUSTER,
         'notebook_task': TEMPLATED_NOTEBOOK_TASK,
     }
     dag = DAG('test', start_date=datetime.now())
     op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
     op.render_template_fields(context={'ds': DATE})
     expected = databricks_operator._deep_string_coerce({
         'new_cluster': NEW_CLUSTER,
         'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
         'run_name': TASK_ID,
     })
     self.assertDictEqual(expected, op.json)
 def test_init_with_json(self):
     """
     Test the initializer with json data.
     """
     json = {
       'new_cluster': NEW_CLUSTER,
       'notebook_task': NOTEBOOK_TASK
     }
     op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
     expected = op._deep_string_coerce({
       'new_cluster': NEW_CLUSTER,
       'notebook_task': NOTEBOOK_TASK,
       'run_name': TASK_ID
     })
     self.assertDictEqual(expected, op.json)
Example #13
    def databricks_operator_with_env(self, dag):
        databricks_cluster_params = {
            "spark_version": "6.5.x-scala2.11",
            "node_type_id": "m5a.large",
            "aws_attributes": {
                "availability": "SPOT_WITH_FALLBACK",
                "ebs_volume_count": 1,
                "ebs_volume_type": "GENERAL_PURPOSE_SSD",
                "ebs_volume_size": 100,
            },
            "spark_env_vars": {
                "DBND__VERBOSE": "True"
            },
            "num_workers": 1,
        }

        databricks_task_params = {
            "name": "generate rport",
            "new_cluster": databricks_cluster_params,
            "libraries": [{
                "pypi": {
                    "package": "dbnd"
                }
            }],
            "max_retries": 1,
            "spark_python_task": {
                "python_file": "s3://databricks/scripts/databricks_report.py"
            },
        }

        return DatabricksSubmitRunOperator(task_id="databricks_task",
                                           json=databricks_task_params)
 def test_init_with_bad_type(self):
     json = {'test': datetime.now()}
     # Looks a bit weird since we have to escape regex reserved symbols.
     exception_message = r'Type \<(type|class) \'datetime.datetime\'\> used ' + \
                         r'for parameter json\[test\] is not a number or a string'
     with self.assertRaisesRegexp(AirflowException, exception_message):
         DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
 def test_deep_string_coerce(self):
     op = DatabricksSubmitRunOperator(task_id='test')
     test_json = {
         'test_int': 1,
         'test_float': 1.0,
         'test_dict': {'key': 'value'},
         'test_list': [1, 1.0, 'a', 'b'],
         'test_tuple': (1, 1.0, 'a', 'b')
     }
     expected = {
         'test_int': '1',
         'test_float': '1.0',
         'test_dict': {'key': 'value'},
         'test_list': ['1', '1.0', 'a', 'b'],
         'test_tuple': ['1', '1.0', 'a', 'b']
     }
     self.assertDictEqual(op._deep_string_coerce(test_json), expected)
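The expectations above pin down the coercion behaviour; a rough sketch of what a helper like _deep_string_coerce might do, inferred from this test rather than taken from the operator source:

def deep_string_coerce(content, json_path='json'):
    # Strings pass through; ints and floats become strings; lists/tuples and
    # dicts are coerced recursively; anything else is rejected, matching
    # test_init_with_bad_type above.
    if isinstance(content, str):
        return content
    elif isinstance(content, (int, float)):
        return str(content)
    elif isinstance(content, (list, tuple)):
        return [deep_string_coerce(e, '{}[{}]'.format(json_path, i))
                for i, e in enumerate(content)]
    elif isinstance(content, dict):
        return {k: deep_string_coerce(v, '{}[{}]'.format(json_path, k))
                for k, v in content.items()}
    raise AirflowException('Type {} used for parameter {} is not a number or a string'
                           .format(type(content), json_path))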
 def test_init_with_merging(self):
     """
     Test the initializer when json and other named parameters are both
     provided. The named parameters should override top level keys in the
     json dict.
     """
     override_new_cluster = {'workers': 999}
     json = {
       'new_cluster': NEW_CLUSTER,
       'notebook_task': NOTEBOOK_TASK,
     }
     op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json, new_cluster=override_new_cluster)
     expected = op._deep_string_coerce({
       'new_cluster': override_new_cluster,
       'notebook_task': NOTEBOOK_TASK,
       'run_name': TASK_ID,
     })
     self.assertDictEqual(expected, op.json)
 def test_init_with_specified_run_name(self):
     """
     Test the initializer with a specified run_name.
     """
     json = {
         'new_cluster': NEW_CLUSTER,
         'notebook_task': NOTEBOOK_TASK,
         'run_name': RUN_NAME
     }
     op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
     expected = databricks_operator._deep_string_coerce({
         'new_cluster': NEW_CLUSTER,
         'notebook_task': NOTEBOOK_TASK,
         'run_name': RUN_NAME
     })
     self.assertDictEqual(expected, op.json)
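Taken together, the initializer tests above imply a simple merge rule; a hedged sketch of that rule (an inference from the tests, not the operator's actual __init__):

def build_submit_json(task_id, json=None, **named_params):
    # Named parameters override matching top-level keys in `json`
    # (test_init_with_merging), and run_name falls back to the task_id unless
    # the caller supplies one (test_init_with_specified_run_name).
    merged = dict(json or {})
    merged.update({k: v for k, v in named_params.items() if v is not None})
    merged.setdefault('run_name', task_id)
    return merged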
Example #20
def Workflow_0(config):
    if config.fabric == "azdb":
        workflow_id = ""
        workflow_version = "latest"
        workflow_jar = "s3://abinitio-spark-redshift-testing/prophecy/jars//latest/workflow.jar"
        prophecy_libs_jar = "s3://abinitio-spark-redshift-testing/prophecy/jars/libs/version/prophecy-libs-assembly-1.0.jar"
        workflow = DatabricksSubmitRunOperator(
            task_id="Workflow_0",
            new_cluster="Small",
            spark_jar_task={
                "main_class_name": "main",
                "parameters": ["-C", "fabricName=" + config.fabric]
            },
            databricks_conn_id=config.connection_id,
            libraries=[{
                "jar": workflow_jar
            }, {
                "jar": prophecy_libs_jar
            }])
        return workflow, workflow
Example #21
def prophecy_workflow1(config) -> BaseOperator:
    workflow_id = "381"
    workflow_version = "latest"
    return DatabricksSubmitRunOperator(
        task_id='prophecy_workflow1',
        new_cluster=config.fabric['job_sizes'][config.job_size],
        spark_jar_task={
            'main_class_name': 'Main',
            'parameters': ['-C', 'fabricName=' + (config.fabric['name'])]
        },
        databricks_conn_id=config.connId,
        libraries=[{
            'jar':
            'dbfs:/FileStore/jars/prophecy/management/app/dp/%s/%s/workflow.jar'
            % (workflow_id, workflow_version)
        }, {
            'jar':
            'dbfs:/FileStore/jars/prophecy/management/app/dp/prophecy-libs/a9ca779efa7418f84186228725e35b0063acf006/prophecy-libs.jar'
        }])
Example #22
    'node_type_id': 'r3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 8
}

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/PrepareData',
    },
}
# Example of using the JSON parameter to initialize the operator.
notebook_task = DatabricksSubmitRunOperator(
    task_id='notebook_task',
    dag=dag,
    json=notebook_task_params)

# Example of using the named parameters of DatabricksSubmitRunOperator
# to initialize the operator.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    new_cluster=new_cluster,
    spark_jar_task={
        'main_class_name': 'com.example.ProcessData'
    },
    libraries=[
        {
            'jar': 'dbfs:/lib/etl-0.1.jar'
        }
Example #23
            "param2": "123"
        },
        'notebook_path':
        '/Users/[email protected]/airflow/airflow_test',
    },
}

# notebook_task_params2 = {
#     'new_cluster': new_cluster,
#     'notebook_task': {'base_parameters':{"retailer_name":context['dag_run'].conf.get('retailer_name')},
#     'notebook_path': '/Users/[email protected]/airflow/airflow_test_2',
#   },
# }

notebook_task = DatabricksSubmitRunOperator(task_id='Run-notebook-1',
                                            dag=dag,
                                            json=notebook_task_params)

notebook_task2 = DatabricksSubmitRunOperator(
    task_id='Run-notebook-2',
    dag=dag,
    json={
        'new_cluster': new_cluster,
        'notebook_task': {
            'base_parameters': {
                "retailer_name":
                '{{ dag_run.conf["retailer_name"] if dag_run else "" }}',
                "cat": '{{ dag_run.conf["cat"] if dag_run else "" }}',
                "fam": '{{ dag_run.conf["fam"] if dag_run else "" }}'
            },
            'notebook_path':
Example #24
        'notebook_path': '/Users/[email protected]/PrepareData',
        'base_parameters': {
            'output_path': '/mnt/path/to/output'
        }
    },
}

# The above block of key-value parameters is equivalent to the 'new cluster' and 'notebook task' objects
# supplied to the Databricks Runs Submit API.
# More info here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#runs-submit
# and here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#newcluster
# and here: https://docs.databricks.com/dev-tools/api/latest/jobs.html#notebooktask

# We'll feed all of our parameters to the DatabricksSubmitRunOperator via its `json` parameter.
notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            dag=dag,
                                            json=notebook_task_params)

# Our second task, which is independent of the first, executes a spark JAR (i.e. compiled Scala code).
# Rather than construct our task in one block of key-value parameters, we'll use the named parameters
# of DatabricksSubmitRunOperator to initialize the operator.
# Again, this will create a new cluster for the duration of the task.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    dag=dag,
    new_cluster=cluster_spec,
    spark_jar_task={'main_class_name': 'com.example.ProcessData'},
    libraries=[{
        'jar': 'dbfs:/lib/etl-0.1.jar'
    }])
# The 'libraries' argument allows you to attach libraries to the cluster that will be instantiated
Example #25
def create_dag(dag_id, description, conf, date):
    default_args = {
        'owner': 'airflow',
        'email': conf.get('dag_email').split(','),
        'email_on_failure': conf.get('dag_email_on_failure'),
        'email_on_retry': conf.get('dag_email_on_retry'),
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
        'depends_on_past': conf.get('dag_depends_on_past'),
    }

    dag = DAG(
        dag_id=dag_id,
        description=description,
        schedule_interval=conf.get('dag_schedule_interval'),
        template_searchpath=[conf.get('sql_path'), conf.get('email_path')],
        default_args=default_args,
        start_date=datetime(*map(int, conf.get('dag_start_date').split(','))),
        catchup=conf.get('dag_catchup', True))

    with dag:
        misc_search_expr = download_search_expr = None
        search_exprs = [file_spec.get('search_expr') for file_spec in conf.get('file_specs').values()]
        if None in search_exprs:
            misc_search_expr = '^(?!{})'.format('|'.join([s for s in search_exprs if s is not None]))
        else:
            download_search_expr = '({})'.format('|'.join([s for s in search_exprs if s is not None]))

        download = FTPSearchOperator(
            task_id='ftp_download',
            ftp_conn_id=conf.get('ftp_conn_id'),
            local_filepath=conf.get('data_path'),
            remote_filepath=conf.get('remote_inbound_path'),
            search_expr=download_search_expr,
            min_date="{{ execution_date }}",
            max_date="{{ next_execution_date }}",
            ftp_conn_type=conf.get('ftp_conn_type'))

        remove_tmp_files = PythonOperator(
            task_id='remove_local_files',
            provide_context=True,
            python_callable=remove_local_files,
            op_kwargs={"download_directory": conf.get('data_path'),
                       "file_list_location": download.task_id},
            trigger_rule='none_failed')

        for filename, file_spec in conf.get('file_specs').items():
            file_spec = {**{k: v for k, v in conf.items()
                            if k in ('search_expr', 'gpg_decrypt', 'unzip', 'import',
                                     'output_date_format')},
                         **file_spec}
            date_str = date.strftime(file_spec.get('output_date_format', '%Y-%m-%d'))
            input_s3_dir = "s3://{}/{}".format(conf.get('s3_bucket'),
                                               parse_directory_pattern(file_spec['directory_pattern'],
                                                                       date_str, 'csv').lstrip("/"))

            check_for_files = PythonOperator(
                task_id='check_for_{}_files'.format(filename),
                provide_context=True,
                python_callable=skip_if_no_files,
                op_kwargs={"search_expr": file_spec.get('search_expr') or misc_search_expr})
            file_list_xcom_location = check_for_files.task_id

            if file_spec.get('unzip'):
                unzip_files = UnzipOperator(task_id='unzip_{}_files'.format(filename),
                                            file_list_xcom_location=file_list_xcom_location)
                file_list_xcom_location = unzip_files.task_id
            else:
                unzip_files = DummyOperator(task_id='unzip_{}_files'.format(filename))

            if file_spec.get('gpg_decrypt'):
                decrypt = CryptographyOperator(
                    task_id='decrypt_{}_files'.format(filename),
                    crypto_conn_id=conf.get('crypt_conn'),
                    file_list_xcom_location=file_list_xcom_location,
                    output_directory=conf.get('data_path'),
                    remove_encrypted=True,
                    operation='decrypt')
                file_list_xcom_location = decrypt.task_id
            else:
                decrypt = DummyOperator(task_id='decrypt_{}_files'.format(filename))

            save_to_s3 = LocalToS3Operator(
                task_id='save_{}_files_to_s3'.format(filename),
                s3_conn_id=conf.get('aws_connection_id'),
                s3_bucket=conf.get('s3_bucket'),
                s3_prefix=input_s3_dir,
                file_list_xcom_location=file_list_xcom_location)

            if file_spec.get('import'):
                import_file = DatabricksSubmitRunOperator(
                    task_id='import_{}_file'.format(filename),
                    job_id='{}dynamic_workflow_file_import'.format(conf.get('databricks_job_prefix')),
                    polling_period_seconds=60 * 3,
                    notebook_params={"config_path": file_spec['config_path'],
                                     "file_date": date_str,
                                     "file_path": "{}/{}".format(
                                         input_s3_dir, str(file_spec.get('unzipped_search_expr',
                                                                         file_spec['search_expr'])).replace(".*", "*"))}
                )

                save_to_s3 >> import_file

            download >> check_for_files >> unzip_files >> decrypt >> save_to_s3 >> remove_tmp_files
    return dag
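A factory like create_dag is usually driven from module level so the scheduler discovers each generated DAG; a minimal sketch of such a driver, where load_all_configs and the dag_id pattern are hypothetical placeholders:

# Hypothetical driver: build one DAG per configuration entry and expose it at
# module scope so Airflow's DagBag can pick it up.
for conf_name, conf in load_all_configs().items():  # load_all_configs is assumed, not shown above
    dag_id = 'ftp_ingest_{}'.format(conf_name)
    globals()[dag_id] = create_dag(
        dag_id=dag_id,
        description='FTP ingest for {}'.format(conf_name),
        conf=conf,
        date=datetime.now())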
Example #26
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1)
}

dag = DAG(dag_id='databricks_create_job',
          default_args=args,
          schedule_interval=None)

test_cluster = {
    'spark_version': '6.5.x-scala2.11',  # runtime "6.5 (includes Apache Spark 2.4.5, Scala 2.11)"
    'node_type_id': 'm5.large',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 1
}

notebook_task_params = {
    'new_cluster': test_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/Data Lake PoC',
    },
}

notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            dag=dag,
                                            json=notebook_task_params)

notebook_task
Example #27
import tempfile
from oauth2client import file
from googleapiclient import discovery
from pathlib import Path
import httplib2
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator

API_SERVICE_NAME = 'webmasters'
API_VERSION = 'v3'
with tempfile.NamedTemporaryFile() as tmp:
    tmp.write(
        str.encode(
            '{"_module": "oauth2client.client", "scopes": ["https://www.googleapis.com/auth/webmasters.readonly"], "token_expiry": "2018-09-10T13:58:19Z", "id_token": null, "user_agent": null, "access_token": "ya29.GlsUBt4GD-bZZgOQZyOxMo28F14c4dGb4fqBP-zwoqGtCf1JNGC_u_F6Ya7WzIq9A8dGH_w3cOotGocTG2YyqWUV2Zn8oDr7TdH0ukh6PLqbww1bqhD7dlHvucq4", "token_uri": "https://www.googleapis.com/oauth2/v3/token", "invalid": false, "token_response": {"access_token": "ya29.GlsUBt4GD-bZZgOQZyOxMo28F14c4dGb4fqBP-zwoqGtCf1JNGC_u_F6Ya7WzIq9A8dGH_w3cOotGocTG2YyqWUV2Zn8oDr7TdH0ukh6PLqbww1bqhD7dlHvucq4", "scope": "https://www.googleapis.com/auth/webmasters.readonly", "expires_in": 3600, "token_type": "Bearer"}, "client_id": "551103279375-v7dc84rm7ba3hr7gr9h477ag1q20pm77.apps.googleusercontent.com", "token_info_uri": "https://www.googleapis.com/oauth2/v3/tokeninfo", "client_secret": "oGrW2HJ4jiJ-ttsu-Ij-_OPd", "revoke_uri": "https://accounts.google.com/o/oauth2/revoke", "_class": "OAuth2Credentials", "refresh_token": "1/q8xG064hS-vnIFKhmZpyjM_HtCdrJ9Q7SckX8fbsgZM", "id_token_jwt": null}'
        ))
    tmp.flush()
    print(tmp.name)
    storage = file.Storage(tmp.name)
    credentials = storage.get()
    http = credentials.authorize(http=httplib2.Http())
    service = discovery.build(API_SERVICE_NAME,
                              API_VERSION,
                              http=http,
                              cache_discovery=False)
    print(credentials)
    print(http)
    print(service)

    task = DatabricksSubmitRunOperator(task)
Example #28
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/foo',
    }
}

script_job_config = {
    "name": "My_cool_task",
    "new_cluster": {
        "spark_version": "7.3.x-scala2.12",
        "num_workers": 1,
        "node_type_id": "Standard_D3_v2"
    },
    "spark_python_task": {
        "python_file": "dbfs:/my_job.py"
    }
}

submit_run_databricks_from_notebook = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_notebook",
    json=notebook_job_config,
    dag=dag)

submit_run_databricks_from_script = DatabricksSubmitRunOperator(
    task_id="submit_run_databricks_from_script",
    json=script_job_config,
    dag=dag)

run_now_databricks = DatabricksRunNowOperator(task_id="run_now_databricks",
                                              job_id=3,
                                              dag=dag)
Example #29
File: DE_dag.py  Project: MIG-Data/dags
    'end_date': None,
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('DE_dag', default_args=default_args, schedule_interval='@daily')

DE_sh = BashOperator(
    task_id='DE_Equipment_Trader_SCRAPE',
    bash_command="python3 /home/ec2-user/DE/equipment_trader.py ",
    queue="pipeline2",
    dag=dag)


notebook_task_params = {
        'existing_cluster_id': '0128-230140-huts317', # cluster id of MIG Cluster 2
        'notebook_task': {
                'notebook_path': '/Users/[email protected]/DE_sales_inventory'
                }
        }
DE_notebook_task = DatabricksSubmitRunOperator(
    task_id='DE_sales_inventory',
    dag=dag,
    queue='pipeline2',
    json=notebook_task_params)
DE_sh.set_downstream(DE_notebook_task)
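set_downstream is the method form of Airflow's bitshift dependency syntax; the same ordering could equivalently be written as:

DE_sh >> DE_notebook_task  # equivalent to DE_sh.set_downstream(DE_notebook_task)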
Example #30
    'new_cluster': new_cluster,
    'notebook_task': {
        'base_parameters': {
            "retailer_name": retailer_name,
            "version": version,
            "categroy": category,
            "family": family,
            'store_group_id': store_group_id
        },
        'notebook_path':
        '/Users/[email protected]/CPGAI_modeling/01_read_data',
    },
}

notebook_task = DatabricksSubmitRunOperator(
    task_id='Read-data-and-build-high-bucket-models',
    dag=dag,
    json=notebook_task_params)

notebook_task2 = DatabricksSubmitRunOperator(
    task_id='Run-high-low-and-final-ranking',
    dag=dag,
    json={
        'new_cluster': new_cluster2,
        'notebook_task': {
            'base_parameters': {
                "retailer_name": retailer_name,
                "version": version,
                "categroy": category,
                "family": family,
                'store_group_id': store_group_id
            },
Example #31
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 29),
    'schedule_interval': 'none',
}

dag = DAG(dag_id='yelp_databricks_operator', default_args=args)

s3_mount_params = {
    'existing_cluster_id': '0122-135412-chair803',
    'notebook_task': {
        'notebook_path': '/Yelp Analytics/Mount_yelp_data_from_s3',
    },
}

notebook_task1 = DatabricksSubmitRunOperator(task_id='mount_s3_notebook_task',
                                             dag=dag,
                                             json=s3_mount_params)

preprocess_biz_params = {
    'existing_cluster_id': '0122-135412-chair803',
    'notebook_task': {
        'notebook_path': '/Yelp Analytics/Pre_Processing_Business_Data',
    },
}

notebook_task2 = DatabricksSubmitRunOperator(
    task_id='preprocess_biz_notebook_task',
    dag=dag,
    json=preprocess_biz_params)

preprocess_checkin_params = {
Example #32
        "ebs_volume_size": 100
    },
    'num_workers': 1
}

notebook_spark_load_data_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/spark-load-data',
    },
}
'''Load data to AWS'''
spark_load_data = DatabricksSubmitRunOperator(
    task_id='run_spark_load_data_to_aws',
    new_cluster=new_cluster,
    notebook_task={
        'notebook_path': '/Users/[email protected]/spark-load-data'
    },
    do_xcom_push=True,
    dag=dag)

notebook_spark_daily_calculations_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/spark-calculate-data',
    },
}
'''Run daily Spark calculations'''
spark_daily_calculations = DatabricksSubmitRunOperator(
    task_id='run_spark_daily_calculations',
    do_xcom_push=True,
    dag=dag,
Example #33
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2)
}
new_cluster = {
    'spark_version': '6.0.x-scala2.11',
    'node_type_id': 'i3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 2
}

dag = DAG(dag_id='example_databricks_operator',
          default_args=args,
          schedule_interval='@daily')

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path':
        '/Repos/[email protected]/airflow_demo/src/example_job.py'
    }
}

notebook_task = DatabricksSubmitRunOperator(
    task_id='Airflow_',
    databricks_conn_id='databricks_default',
    dag=dag,
    json=notebook_task_params)
Example #34
        'instance_profile_arn':
        'arn:aws:iam::00000000000:instance-profile/de-instance-profile'
    },
    'num_workers': 1
}

# set the path to the repartition.py notebook in the Databricks workspace
notebook_params = {
    'new_cluster': etl_cluster,
    'notebook_task': {
        'notebook_path': '/path_to_file_in_databricks/repartition'
    }
}

run_process_data = DatabricksSubmitRunOperator(task_id='process_data',
                                               json=notebook_params,
                                               retries=2,
                                               dag=dag)

### V1 with PythonOperator which executes run_add_partitions func from athena.py
run_repair_partition = PythonOperator(
    task_id="repair_partition",
    dag=dag,
    python_callable=run_add_partitions,
    execution_timeout=timedelta(minutes=10),
    provide_context=True,
)

### V2 with AWSAthenaOperator
run_repair_partition = AWSAthenaOperator(
    task_id='repair_partition',
    query='MSCK REPAIR TABLE amplitude_feed',
Example #35
    new_cluster = {
        'spark_version': '2.1.0-db3-scala2.11',
        'node_type_id': 'r3.xlarge',
        'aws_attributes': {
            'availability': 'ON_DEMAND'
        },
        'num_workers': 8
    }

    notebook_task_params = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/PrepareData',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                                json=notebook_task_params)

    # Example of using the named parameters of DatabricksSubmitRunOperator
    # to initialize the operator.
    spark_jar_task = DatabricksSubmitRunOperator(
        task_id='spark_jar_task',
        new_cluster=new_cluster,
        spark_jar_task={'main_class_name': 'com.example.ProcessData'},
        libraries=[{
            'jar': 'dbfs:/lib/etl-0.1.jar'
        }])

    notebook_task >> spark_jar_task
Example #36
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 8),
    'end_date': None,
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('WGO_dag', default_args=default_args, schedule_interval='@daily')

WGO_sh = BashOperator(
    task_id='WGO_RV_Trader_SCRAPE',
    bash_command="python3 /home/ec2-user/WGO/rv_trader_mobile_scrape.py ",
    queue="pipeline2",
    dag=dag)

notebook_task_params = {
    'existing_cluster_id':
    '0128-230140-huts317',  # cluster id of MIG Cluster 2
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/WGO_inventory_analysis'
    }
}
WGO_notebook_task = DatabricksSubmitRunOperator(task_id='WGO_notebook_task',
                                                dag=dag,
                                                queue='pipeline2',
                                                json=notebook_task_params)
WGO_sh.set_downstream(WGO_notebook_task)
Example #37
from os import environ

from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from airflow.utils.dates import days_ago


from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator

dag = DAG(dag_id='raw-to-parquet', default_args=None, schedule_interval=None,
          start_date=days_ago(2), catchup=False)

spark_jar_task = DatabricksSubmitRunOperator(
  task_id='spark_jar_task',
  dag=dag,
  existing_cluster_id='1234',
  spark_jar_task={
    'main_class_name': 'com.example.ProcessData'
  },
  libraries=[
    {
      'jar': 'dbfs:/lib/etl-0.1.jar'
    }
  ]
)
Example #38
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('amazon_notebook_dag',
          default_args=default_args,
          schedule_interval='30 12 * * sun,mon,tue,wed,thu,fri,sat ')
'''
amazon_scrape = BashOperator(
    task_id='amazon_scrape',
    bash_command="/home/ec2-user/SHELL/AMAZON2.sh ",
	email_on_failure = True,
	email = email_list,
    queue='pipeline9',
    dag=dag)
'''

notebook_task_params = {
    'existing_cluster_id':
    '0128-230140-huts317',  # cluster id of MIG Cluster 2
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/amazon_analysis_xbyte'
    }
}
notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                            email_on_failure=True,
                                            email=email_list,
                                            dag=dag,
                                            queue='pipeline9',
                                            json=notebook_task_params)