def test_execute(self, conn_id='spark_default'):
    operator = SparkSubmitOperator(task_id='spark_submit_job',
                                   dag=self.dag,
                                   **self._config)

    self.assertEqual(conn_id, operator._conn_id)
    self.assertEqual(self._config['application'], operator._application)
    self.assertEqual(self._config['conf'], operator._conf)
    self.assertEqual(self._config['files'], operator._files)
    self.assertEqual(self._config['py_files'], operator._py_files)
    self.assertEqual(self._config['jars'], operator._jars)
    self.assertEqual(self._config['executor_cores'], operator._executor_cores)
    self.assertEqual(self._config['executor_memory'], operator._executor_memory)
    self.assertEqual(self._config['keytab'], operator._keytab)
    self.assertEqual(self._config['principal'], operator._principal)
    self.assertEqual(self._config['name'], operator._name)
    self.assertEqual(self._config['num_executors'], operator._num_executors)
    self.assertEqual(self._config['verbose'], operator._verbose)
    self.assertEqual(self._config['java_class'], operator._java_class)
    self.assertEqual(self._config['driver_memory'], operator._driver_memory)
    self.assertEqual(self._config['application_args'], operator._application_args)
def _get_test_dag(self):
    with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
        op1 = SparkSubmitOperator(task_id='op1')
        op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
        op3 = S3ListOperator(task_id='op3', bucket='foo')
        op4 = EmrCreateJobFlowOperator(task_id='op4')
        op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
        op6 = FileToWasbOperator(task_id='op6',
                                 container_name='foo',
                                 blob_name='foo',
                                 file_path='foo')
        op7 = EmailOperator(task_id='op7',
                            subject='foo',
                            to='foo',
                            html_content='foo')
        op8 = S3CopyObjectOperator(task_id='op8',
                                   dest_bucket_key='foo',
                                   source_bucket_key='foo')
        op9 = BranchPythonOperator(task_id='op9', python_callable=print)
        op10 = PythonOperator(task_id='op10', python_callable=range)

        op1 >> [op2, op3, op4]
        op2 >> [op5, op6]
        op6 >> [op7, op8, op9]
        op3 >> [op7, op8]
        op8 >> [op9, op10]

    return dag
def test_spark_dag(mock_subproc_popen):
    # Hack to get around having a Connection
    os.environ["AIRFLOW_CONN_SPARK"] = "something"

    dag = DAG(
        dag_id="spark_dag",
        default_args=default_args,
        schedule_interval=None,
    )

    # pylint: disable=unused-variable
    clean_data = SparkSubmitOperator(
        task_id="run_spark",
        application="some_path.py",
        conn_id="SPARK",
        dag=dag,
    )

    pipeline = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    execute_pipeline(pipeline)  # , instance=instance,)

    assert mock_subproc_popen.call_args_list[0][0] == (
        ["spark-submit", "--master", "", "--name", "airflow-spark", "some_path.py"],
    )
def test_render_template(self):
    # Given
    operator = SparkSubmitOperator(task_id='spark_submit_job',
                                   dag=self.dag,
                                   **self._config)
    ti = TaskInstance(operator, DEFAULT_DATE)

    # When
    ti.render_templates()

    # Then
    expected_application_args = [
        '-f', 'foo',
        '--bar', 'bar',
        '--start', (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
        '--end', DEFAULT_DATE.strftime("%Y-%m-%d"),
        '--with-spaces', 'args should keep embdedded spaces',
    ]
    expected_name = 'spark_submit_job'
    self.assertListEqual(expected_application_args,
                         getattr(operator, '_application_args'))
    self.assertEqual(expected_name, getattr(operator, '_name'))
def spark_submit_operator(self, dag):
    operator = SparkSubmitOperator(
        task_id="spark_submit_task",
        application="script.py",
        application_args=["input.csv", "output.csv"],
        dag=dag,
    )
    track_dag(dag)
    return operator
def sparkOperator(file, task_id, executor_cores=5, num_executors=10, **kwargs):
    return SparkSubmitOperator(
        application='/home/airflow/airflow-apps/dlpredictor/{}'.format(file),
        application_args=[],
        conn_id='spark_default',
        executor_memory='32G',
        conf={'spark.driver.maxResultSize': '8g'},
        driver_memory='32G',
        executor_cores=executor_cores,
        num_executors=num_executors,
        task_id=task_id,
        dag=dag,
        **kwargs)
def sparkOperator(file, task_id, **kwargs):
    return SparkSubmitOperator(
        application='/home/airflow/airflow/din_model/pipeline/{}'.format(file),
        application_args=['/home/airflow/airflow/din_model/config.yml'],
        conn_id='spark_default',
        executor_memory='32G',
        conf={'spark.driver.maxResultSize': '4g'},
        driver_memory='32G',
        executor_cores=5,
        num_executors=20,
        task_id=task_id,
        dag=dag,
        **kwargs)
def execute(self, context):
    dag = DAG(self.task_id)
    self.log.info("SPARK: Executing %s", self.task_id)
    # master and deploy mode are not SparkSubmitOperator keyword arguments;
    # they come from the Spark connection, so self.master / self.deploy_mode
    # are not forwarded here. Only valid operator arguments are passed.
    _config = {
        'application': self.application,
        'executor_cores': self.executor_cores,
        'executor_memory': self.executor_memory,
    }
    # Run the inner operator so the job is actually submitted.
    SparkSubmitOperator(task_id=self.task_id, dag=dag, **_config).execute(context)
def spark_submit_operator(self, dag):
    operator = SparkSubmitOperator(
        task_id="spark_submit_task",
        application="script.py",
        application_args=["input.csv", "output.csv"],
        dag=dag,
    )
    env = {
        "AIRFLOW_CTX_DAG_ID": "test_dag",
        "AIRFLOW_CTX_TASK_ID": "spark_submit_task",
        "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
        "AIRFLOW_CTX_TRY_NUMBER": "1",
    }
    add_tracking_to_submit_task(env, operator)
    return operator
def spark_submit_operator(self, dag):
    operator = SparkSubmitOperator(
        task_id="spark_submit_task",
        application="script.py",
        application_args=["input.csv", "output.csv"],
        dag=dag,
    )
    env = {
        "AIRFLOW_CTX_DAG_ID": "test_dag",
        "AIRFLOW_CTX_TASK_ID": "spark_submit_task",
        "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
        "AIRFLOW_CTX_TRY_NUMBER": "1",
        "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
    }
    with wrap_operator_with_tracking_info(env, operator):
        return operator
def sparkOperator(file, task_id, **kwargs):
    return SparkSubmitOperator(
        application='/home/airflow/airflow-apps/lookalike-model/lookalike_model/application/pipeline/{}'
        .format(file),
        application_args=[
            '/home/airflow/airflow-apps/lookalike-model/lookalike_model/application/pipeline/config.yml'
        ],
        conn_id='spark_default',
        executor_memory='8G',
        conf={
            'spark.driver.maxResultSize': '5g',
            'spark.hadoop.hive.exec.dynamic.partition': True,
            'spark.hadoop.hive.exec.dynamic.partition.mode': 'nonstrict'
        },
        driver_memory='8G',
        executor_cores=5,
        num_executors=20,
        task_id=task_id,
        dag=dag,
        **kwargs)
def test_render_template(self):
    # Given
    operator = SparkSubmitOperator(task_id='spark_submit_job',
                                   dag=self.dag,
                                   **self._config)
    ti = TaskInstance(operator, DEFAULT_DATE)

    # When
    ti.render_templates()

    # Then
    expected_application_args = [
        u'-f foo',
        u'--bar bar',
        u'--start %s' % (DEFAULT_DATE - datetime.timedelta(days=1)).strftime("%Y-%m-%d"),
        u'--end %s' % DEFAULT_DATE.strftime("%Y-%m-%d"),
    ]
    expected_name = "spark_submit_job"
    self.assertListEqual(sorted(expected_application_args),
                         sorted(getattr(operator, '_application_args')))
    self.assertEqual(expected_name, getattr(operator, '_name'))
def transform(self, subdag: nx.DiGraph, parent_fragment: DAGFragment) -> DAGFragment:
    subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
    first_root = subdag_roots[0].task_id
    task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

    TestSubDagTransformer1.op1 = SparkSubmitOperator(
        task_id=f"t{task_id_prefix}p1", dag=self.dag)
    TestSubDagTransformer1.op2 = EmrAddStepsOperator(
        task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
    TestSubDagTransformer1.op3 = S3ListOperator(
        task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
    TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
        task_id=f"t{task_id_prefix}p4", dag=self.dag)
    TestSubDagTransformer1.op5 = DummyOperator(
        task_id=f"t{task_id_prefix}p5", dag=self.dag)

    TestSubDagTransformer1.op1 >> [
        TestSubDagTransformer1.op2, TestSubDagTransformer1.op3
    ] >> TestSubDagTransformer1.op4

    return DAGFragment(
        [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'start_date': datetime.now() - timedelta(minutes=20),
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
    'dagrun_timeout': timedelta(minutes=5)
}

with DAG('batch_pipeline',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    # Define the pipeline tasks: print the date, announce the job,
    # then run the Spark batch job.
    t1 = BashOperator(task_id='print_current_date', bash_command='date')
    t2 = BashOperator(
        task_id='print_job_started',
        bash_command='echo "******* *** *** Spark Batch Job Has Started ********************"')

    flat_obs = SparkSubmitOperator(application=entry_point,
                                   verbose=True,
                                   task_id='flat_obs',
                                   conn_id='spark_default')

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> flat_obs >> t3
def first_function_execute(**context):
    print("HELLO ")


def second_function_execute(**context):
    print("Is it me you looking for")


default_args = {
    "owner": "airflow",
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "depends_on_past": False,
    "start_date": datetime(2021, 1, 1),
}

with DAG(dag_id="spark2",
         schedule_interval="@once",
         default_args=default_args,
         catchup=False) as f:

    first_f = PythonOperator(task_id="first",
                             python_callable=first_function_execute,
                             provide_context=True,
                             op_kwargs={"name": "Soumil Shah"})

    spark_submit_task1 = SparkSubmitOperator(task_id='spark_submit_job',
                                             conn_id='spark_default')

    first_f >> spark_submit_task1
# Schedule spark jobs via airflow.
# Import the operator.
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator

# Set the paths for our files.
entry_point = os.path.join(os.environ["AIRFLOW_HOME"], "scripts", "clean_ratings.py")
dependency_path = os.path.join(os.environ["AIRFLOW_HOME"], "dependencies", "pydiaper.zip")

with DAG('data_pipeline',
         start_date=datetime(2019, 6, 25),
         schedule_interval='@daily') as dag:
    # Define task clean_data, running a cleaning job.
    clean_data = SparkSubmitOperator(application=entry_point,
                                     py_files=dependency_path,
                                     task_id='clean_data',
                                     conn_id='spark_default')

# Deploy pipeline.
spark_args = {"py_files": dependency_path, "conn_id": "spark_default"}

# Define ingest, clean and transform jobs (config, clean_path and
# transform_path are assumed to be defined elsewhere).
with dag:
    ingest = BashOperator(task_id='Ingest_data',
                          bash_command='tap-marketing-api | target-csv --config %s' % config)
    clean = SparkSubmitOperator(application=clean_path,
                                task_id='clean_data',
                                **spark_args)
    insight = SparkSubmitOperator(application=transform_path,
                                  task_id='show_report',
                                  **spark_args)

    # Set triggering sequence.
    ingest >> clean >> insight
def test_execute(self):
    # Given / When
    conn_id = 'spark_default'
    operator = SparkSubmitOperator(task_id='spark_submit_job',
                                   spark_binary="sparky",
                                   dag=self.dag,
                                   **self._config)

    # Then expected results
    expected_dict = {
        'conf': {
            'parquet.compression': 'SNAPPY'
        },
        'files': 'hive-site.xml',
        'py_files': 'sample_library.py',
        'archives': 'sample_archive.zip#SAMPLE',
        'driver_class_path': 'parquet.jar',
        'jars': 'parquet.jar',
        'packages': 'com.databricks:spark-avro_2.11:3.2.0',
        'exclude_packages': 'org.bad.dependency:1.0.0',
        'repositories': 'http://myrepo.org',
        'total_executor_cores': 4,
        'executor_cores': 4,
        'executor_memory': '22g',
        'keytab': 'privileged_user.keytab',
        'principal': 'user/[email protected]',
        'proxy_user': '******',
        'name': '{{ task_instance.task_id }}',
        'num_executors': 10,
        'verbose': True,
        'application': 'test_application.py',
        'driver_memory': '3g',
        'java_class': 'com.foo.bar.AppMain',
        'application_args': [
            '-f', 'foo',
            '--bar', 'bar',
            '--start', '{{ macros.ds_add(ds, -1)}}',
            '--end', '{{ ds }}',
            '--with-spaces', 'args should keep embdedded spaces',
        ],
        'spark_binary': 'sparky'
    }

    self.assertEqual(conn_id, operator._conn_id)
    self.assertEqual(expected_dict['application'], operator._application)
    self.assertEqual(expected_dict['conf'], operator._conf)
    self.assertEqual(expected_dict['files'], operator._files)
    self.assertEqual(expected_dict['py_files'], operator._py_files)
    self.assertEqual(expected_dict['archives'], operator._archives)
    self.assertEqual(expected_dict['driver_class_path'], operator._driver_class_path)
    self.assertEqual(expected_dict['jars'], operator._jars)
    self.assertEqual(expected_dict['packages'], operator._packages)
    self.assertEqual(expected_dict['exclude_packages'], operator._exclude_packages)
    self.assertEqual(expected_dict['repositories'], operator._repositories)
    self.assertEqual(expected_dict['total_executor_cores'], operator._total_executor_cores)
    self.assertEqual(expected_dict['executor_cores'], operator._executor_cores)
    self.assertEqual(expected_dict['executor_memory'], operator._executor_memory)
    self.assertEqual(expected_dict['keytab'], operator._keytab)
    self.assertEqual(expected_dict['principal'], operator._principal)
    self.assertEqual(expected_dict['proxy_user'], operator._proxy_user)
    self.assertEqual(expected_dict['name'], operator._name)
    self.assertEqual(expected_dict['num_executors'], operator._num_executors)
    self.assertEqual(expected_dict['verbose'], operator._verbose)
    self.assertEqual(expected_dict['java_class'], operator._java_class)
    self.assertEqual(expected_dict['driver_memory'], operator._driver_memory)
    self.assertEqual(expected_dict['application_args'], operator._application_args)
    self.assertEqual(expected_dict['spark_binary'], operator._spark_binary)
# Step 2: Move the sku data csv file to HDFS storage
move_to_hdfs = BashOperator(task_id="move_to_hdfs",
                            bash_command="""
        hdfs dfs -mkdir -p /dim_sku && \
        hdfs dfs -put -f $AIRFLOW_HOME/dags/files/sku_data.csv /dim_sku
    """)

# Step 3: Create a hive table on our sku_data
creating_sku_table = HiveOperator(task_id="creating_sku_table",
                                  hive_cli_conn_id="hive_conn",
                                  hql="""
        CREATE EXTERNAL TABLE IF NOT EXISTS dim_sku(
            asin STRING,
            title STRING,
            price DOUBLE,
            brand STRING
            )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '|'
        STORED AS TEXTFILE
    """)

processing_sku_data = SparkSubmitOperator(
    task_id="processing_sku_data",
    conn_id="spark_conn",
    application="/usr/local/airflow/dags/scripts/dim_sku_processing.py",
    verbose=False)

unzip_file_store_as_csv >> move_to_hdfs >> creating_sku_table >> processing_sku_data
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 6, 22),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Note: schedule_interval is a DAG argument, not a default_arg,
# so it is passed to DAG() directly.
with DAG("HELLO",
         catchup=False,
         schedule_interval=None,
         default_args=default_args) as dag:

    t1 = BashOperator(
        task_id='print_date',
        bash_command='date',
    )

    t2 = SparkSubmitOperator(
        task_id="run_spark_job",
        application=f"{os.environ['AIRFLOW__CORE__DAGS_FOLDER']}/find_pi.py",
    )

    t2 >> t1
spark_config = {
    'conn_id': 'spark_local',
    'java_class': 'com.spark.airflow.test_spark_airflow',
    'application': '/Users/ravimuthyala/AirflowSparkTestCode/sparkairflowtest_2.12-0.1.jar',
    'jars': '/Users/ravimuthyala/AirflowSparkTestCode/postgresql-42.2.12.jar',
    'application_args': ["/Users/ravimuthyala/AirflowSparkTestCode/receipts.csv"],
    'driver_memory': '1g',
    'executor_cores': 1,
    'num_executors': 1,
    'executor_memory': '1g'
}

spark_submit_operator = SparkSubmitOperator(task_id='Spark_Scala_Submit_Job',
                                            dag=dag,
                                            **spark_config)

emailNotify = EmailOperator(task_id='email_notification',
                            to='*****@*****.**',
                            subject='Spark Submit Job Alert',
                            html_content='Airflow Spark Submit Job Done',
                            dag=dag)

t1Failed = EmailOperator(dag=dag,
                         trigger_rule=TriggerRule.ONE_FAILED,
                         task_id="SparkJobFailed",
                         to=["*****@*****.**"],
                         subject="Spark job Failed",
                         html_content='<h3>Spark job has failed</h3>')
from airflow.models import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from datetime import datetime

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 1)
}

dag = DAG('stackoverflow_stats',
          default_args=default_args,
          schedule_interval='@daily')

SparkSubmitOperator(
    task_id='get-stats',
    application="/usr/local/airflow/jobs/stats.py",
    dag=dag,
    run_as_user='******',
    application_args=['--date', '{{ ds }}'],
    name='Stats DAG for {{ ds }}',
    num_executors=2,
    executor_memory='2g'
)
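# A minimal sketch, not taken from any of the examples above: several of these
# DAGs assume a Spark connection named 'spark_default'. Besides the admin UI
# (Admin -> Connections), Airflow also picks connections up from
# AIRFLOW_CONN_<CONN_ID> environment variables in URI form, the same trick the
# dagster test earlier uses for its 'SPARK' connection. The URI value below is
# a placeholder for a standalone master and would be replaced with your own
# master (or configured on the connection in the UI).
import os

os.environ["AIRFLOW_CONN_SPARK_DEFAULT"] = "spark://spark-master:7077"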
    'spark.hadoop.fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'spark.hadoop.fs.s3a.access.key': os.environ.get('AWS_ACCESS_KEY_ID', ''),
    'spark.hadoop.fs.s3a.secret.key': os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(os.environ.get('AWS_SERVER', ''),
                                                   os.environ.get('AWS_PORT', '')),
    'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
    'spark.hadoop.fs.s3a.path.style.access': 'true',
    'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem'
}

spark = SparkSubmitOperator(task_id='fetch_csv_from_s3_and_update_postgres',
                            dag=dag,
                            conf=spark_conf,
                            application='{spark_dir}/s3topostgres.py'.format(
                                spark_dir=SPARK_DIRECTORY),
                            application_args=['-f', FILE, '-t', TABLE])

check = CheckOperator(task_id='check_demo_contains_data',
                      conn_id='local_pg',
                      sql='SELECT COUNT(*) FROM {table}'.format(table=TABLE),
                      dag=dag)

spark >> check
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'din_model_integration',
    default_args=default_args,
    schedule_interval=None,
)

clean = SparkSubmitOperator(
    application='/home/wei2/airflow/din_model/pipeline/main_clean.py',
    application_args=['/home/wei2/airflow/din_model/config.yml'],
    conn_id='spark_default',
    executor_memory='16G',
    driver_memory='16G',
    executor_cores=5,
    num_executors=20,
    task_id='din_clean',
    dag=dag,
)

process = SparkSubmitOperator(
    application='/home/wei2/airflow/din_model/pipeline/main_processing.py',
    application_args=['/home/wei2/airflow/din_model/config.yml'],
    conn_id='spark_default',
    executor_memory='16G',
    driver_memory='16G',
    executor_cores=5,
    num_executors=20,
    task_id='din_processing',
    dag=dag,
)
    'log.kafka.topic': 'druid-kafka-proxy',
    'log.kafka.create': 'true',
    'log.kafka.servers': 'soctxadev01.gsoc.verizon.com:6667',
    'log.rowcase.headers': 'cs_username:lower,vzid:lower,x_cs_auth_domain:upper,x_exception_id:lower,sc_filter_result:uppper,cs_referer:lower,sc_status:upper,s_action:upper,cs_method:upper,rs_content_type:lower,cs_uri_scheme:lower,cs_host:lower,cs_uri_path:lower,cs_uri_query:lower,cs_uri_extension:lower,cs_user_agent:lower,x_bluecoat_application_name:lower,x_bluecoat_application_operation:lower,cs_categories:lower,cs_auth_group:lower',
    'log.partition.by.date': 'true'
}.items() + [v.split("=") for v in OTHER_PARAM_OVERRIDES.split(",")])

spark_submit_task = SparkSubmitOperator(
    task_id='spark_submit_job',
    conn_id='spark_default',
    java_class='com.verizon.gsoc.datasources.phoenix.Phoenix',
    application=EXECUTABLE_PATH,
    # application_args=[' '.join(['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()])],
    application_args=['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()],
    total_executor_cores='1',
    executor_cores='1',
    executor_memory='2g',
    num_executors='2',
    name='spark-airflow-phoenix',
    verbose=True,
    driver_memory='1g',
    xcom_push='true',
    conf=config,
    dag=dag,
)


def print_hello():
    return 'Finally it worked!!!!' + str(datetime.now().strftime("%m%d%Y-%H%M"))


def print_check():
    return 'Finally it worked!!!!' + str(datetime.now().strftime("%m%d%Y-%H%M"))
    task_id='load_dim_visa_type',
    dag=dag,
    table="dim_visa_type",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="i94project",
    s3_key="dicts/visa_type.csv",
    copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1")
)

# process us airport in spark
process_dim_us_airport = SparkSubmitOperator(
    application=r"/usr/local/airflow/plugins/helpers/spark_dispatch.py",
    application_args=[r"process_airport",                                  # command
                      r"s3a://i94project/dicts",                           # dictionaries
                      r"s3a://i94project/stage/input/airport-codes.csv",   # input_path
                      r"s3a://i94project/stage/output/airport.parquet"],   # output_path
    task_id="process_dim_us_airport",
    packages=spark_packages,
    dag=dag
)

# load dim_us_airport table
load_dim_us_airport = StageToRedshiftOperator(
    task_id='load_dim_us_airport',
    dag=dag,
    table="dim_us_airport",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="i94project",
    s3_key="stage/output/airport.parquet",
    # copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1")
          max_active_runs=1)

start_operator = DummyOperator(task_id='begin_execution', dag=dag)

download_git_data = PythonOperator(task_id="download_git_data",
                                   python_callable=download_data,
                                   dag=dag,
                                   provide_context=True)

spark_config = {
    'conn_id': config.get('HOST', 'SPARK_CONN'),
    'application': config.get('HOST', 'SPARK_APP')
}

spark_process = SparkSubmitOperator(task_id="spark_submit",
                                    dag=dag,
                                    **spark_config)

del_json_task = BashOperator(
    task_id="delete_old_data",
    bash_command='rm -r ' + home_dir +
    '/"{{ (execution_date - macros.timedelta(days=3)).strftime("%Y-%m-%d") }}"',
)

del_crc_task = BashOperator(
    task_id="delete_crc_data",
    bash_command='find ' + home_dir +
    '/git_{{ (execution_date - macros.timedelta(days=2)).strftime("%Y-%m-%d") }}.parquet/ -name "*.crc" -exec rm \'{}\' \;',
)

del_suc_task = BashOperator(
    task_id='add_partition_title_basics_table',
    hql=hiveSQL_add_partition_title_basics,
    hive_cli_conn_id='beeline',
    dag=dag)

dummy_op = DummyOperator(task_id='dummy', dag=dag)

pyspark_top_tvseries = SparkSubmitOperator(
    task_id='pyspark_write_top_tvseries_to_final',
    conn_id='spark',
    application='/home/airflow/airflow/python/pyspark_top_tvseries.py',
    total_executor_cores='2',
    executor_cores='2',
    executor_memory='2g',
    num_executors='2',
    name='spark_calculate_top_tvseries',
    verbose=True,
    application_args=[
        '--year', '{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}',
        '--month', '{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}',
        '--day', '{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}',
        '--hdfs_source_dir', '/user/hadoop/imdb',
        '--hdfs_target_dir', '/user/hadoop/imdb_final/top_tvseries',
        '--hdfs_target_format', 'csv'
    ],
    dag=dag)

create_table_for_top_tvseries = HiveOperator(
    task_id='create_top_tvseries_external_table',
    hql=hiveSQL_create_top_tvseries_external_table,
    hive_cli_conn_id='beeline',
    dag=dag)
DEFAULT_DATE = timezone.datetime(2017, 1, 1)

srcDir = os.getcwd() + '/dags/repo/examples/hello_2.11-1.0.jar'

args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

dag = DAG('sparkjob',
          default_args=args,
          schedule_interval='@monthly',
          dagrun_timeout=timedelta(minutes=60))

spark_task = BashOperator(
    task_id='spark_java_bash',
    bash_command='spark-submit --class {{ params.class }} {{ params.jar }}',
    params={
        'class': 'hello',
        'jar': srcDir
    },
    dag=dag)

# Note: master and deploy mode are configured on the Spark connection, not as
# SparkSubmitOperator keyword arguments, so they are left out of _config here.
_config = {
    'application': srcDir,
    'executor_cores': 1,
    'executor_memory': '1G'
}

operator = SparkSubmitOperator(task_id='spark_submit_op_job',
                               dag=dag,
                               java_class='hello',
                               **_config)

operator >> spark_task
          schedule_interval=None,
          default_args=default_args,
          user_defined_macros=conf.__dict__)

# Define an empty task as the starting task.
start = DummyOperator(task_id='start', queue='script', dag=dag)

# Use SparkSubmitOperator to define the task that runs the Spark job.
# All job-related parameters and configuration are defined in the _config dict.
# tk-dev-emr-airflow-spark is the EMR Spark connection, which must be defined
# in the Airflow admin web UI.
# application_args can be used to pass custom arguments to the PySpark script.
_config = {
    'name': '{{ ti.task_id }}',
    'application': '/server/airflow/dags/testoperator/wordcount.py',
    'executor_cores': 2,
    'executor_memory': '12g',
    'application_args': [
        '-fid', '{{ ti.job_id }}',
    ]
}

spark_task1 = SparkSubmitOperator(task_id='spark_task1',
                                  conn_id='tk_dev_dw_spark',
                                  queue='script',
                                  dag=dag,
                                  **_config)

# Define dependencies between tasks.
start >> spark_task1
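# Hypothetical sketch of how the wordcount.py script above might consume the
# '-fid' value forwarded through application_args; the real script is not
# shown in the source, so the argument handling here is an assumption.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-fid', dest='fid', help='Airflow job id forwarded by the DAG')
args = parser.parse_args()
print(args.fid)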
saving_rates = BashOperator(task_id="saving_rates",
                            bash_command="""
        hdfs dfs -mkdir -p /forex && \
        hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
    """)

creating_forex_rates_table = HiveOperator(
    task_id="creating_forex_rates_table",
    hive_cli_conn_id="hive_conn",
    hql="""
        CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates(
            base STRING,
            last_update DATE,
            eur DOUBLE,
            usd DOUBLE,
            nzd DOUBLE,
            gbp DOUBLE,
            jpy DOUBLE,
            cad DOUBLE
            )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS TEXTFILE
    """)

forex_processing = SparkSubmitOperator(
    task_id="forex_processing",
    conn_id="spark_conn",
    application="/usr/local/airflow/dags/scripts/forex_processing.py",
    verbose=False)
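# The snippet above ends without wiring the tasks together; by analogy with the
# dim_sku pipeline earlier, the intended ordering is presumably:
saving_rates >> creating_forex_rates_table >> forex_processing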