def test_execute_uses_the_emr_config_to_create_a_cluster_and_returns_job_id(self):
    with patch('boto3.client', self.boto3_client_mock):
        operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default'
        )
        self.assertEqual(operator.execute(None), 'j-8989898989')

def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()
    self.operator = EmrCreateJobFlowOperator(
        task_id='test_task',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=self._config,
        region_name='ap-southeast-2',
        dag=DAG('test_dag_id', default_args=args))

def create_emr_job_flow(**kwargs):
    # Required params to be changed:
    # - aws_account_id
    # - aws_region
    # - ec2_key_pair
    # - ec2_subnet_id
    emr_settings = EmrSettings(
        aws_account_id="????????????",
        aws_region="us-east-1",
        ec2_key_pair="???????????",
        ec2_subnet_id="????????????",
        cluster_name=f"Ifood Data Architect Test | {kwargs['ds']}",
        master_instance_type="m5.4xlarge",
        master_instance_count=1,
        core_instance_type="m5.4xlarge",
        core_instance_count=1,
        core_instance_market="ON_DEMAND",
        task_instance_type="c5.2xlarge",
        task_instance_count=1,
        task_instance_market="SPOT",
        step_concurrency_level=4)

    job_flow_id = EmrCreateJobFlowOperator(
        task_id="create_cluster_emr_job_task",
        aws_conn_id="aws_default",
        region_name=emr_settings.aws_region,
        job_flow_overrides=emr_settings.crete_job_flow_overrides(),
        dag=dag,
    ).execute(kwargs)

    kwargs["ti"].xcom_push(key="job_flow_id", value=job_flow_id)

def _get_test_dag(self):
    with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
        op1 = SparkSubmitOperator(task_id='op1')
        op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
        op3 = S3ListOperator(task_id='op3', bucket='foo')
        op4 = EmrCreateJobFlowOperator(task_id='op4')
        op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
        op6 = FileToWasbOperator(task_id='op6', container_name='foo',
                                 blob_name='foo', file_path='foo')
        op7 = EmailOperator(task_id='op7', subject='foo', to='foo',
                            html_content='foo')
        op8 = S3CopyObjectOperator(task_id='op8', dest_bucket_key='foo',
                                   source_bucket_key='foo')
        op9 = BranchPythonOperator(task_id='op9', python_callable=print)
        op10 = PythonOperator(task_id='op10', python_callable=range)

        op1 >> [op2, op3, op4]
        op2 >> [op5, op6]
        op6 >> [op7, op8, op9]
        op3 >> [op7, op8]
        op8 >> [op9, op10]

    return dag

def run_emr_job(current_dag, cluster_name, task_gen_name, aws_connection,
                emr_connection, script_location, library_location,
                region='us-east-1'):
    """
    Creates the EMR cluster, runs the step and terminates the cluster when it is completed

    current_dag: DAG that is created by the user
    cluster_name: Name given to the cluster by the user
    task_gen_name: A general name for the task being done. This is used to name different tasks
    aws_connection: Connection to AWS for account credentials
    emr_connection: Name of the Airflow connection storing EMR configuration details
    script_location: S3 location of the script to be run
    library_location: S3 location of the library being used to run spark-submit
    region: AWS region where the cluster is being created
    """
    # Name of the new cluster being created
    job_flow_overrides = {'Name': cluster_name}

    # Name of the task creating the cluster
    create_cluster_task_name = task_gen_name + "_create_cluster"

    # Task that creates the cluster
    cluster_creator = EmrCreateJobFlowOperator(
        task_id=create_cluster_task_name,
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_connection,
        emr_conn_id=emr_connection,
        dag=current_dag)

    # script-runner.jar file location is region specific
    script_runner_jar = 's3://' + region + '.elasticmapreduce/libs/script-runner/script-runner.jar'

    # Step description
    step_definition = [{
        'Name': task_gen_name,
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': script_runner_jar,
            'Args': [script_location, library_location, '']
        }
    }]

    # Task that terminates the cluster
    cluster_remover = EmrTerminateJobFlowOperator(
        task_id=task_gen_name + "_remove_cluster",
        job_flow_id="{{ task_instance.xcom_pull('" + create_cluster_task_name + "', key='return_value') }}",
        aws_conn_id=aws_connection,
        dag=current_dag)

    # Add the step and step checker tasks
    add_step_to_emr(cluster_creator, task_gen_name, step_definition,
                    cluster_remover, create_cluster_task_name,
                    aws_connection, current_dag)

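# The add_step_to_emr helper called above is not shown in this snippet. A minimal
# sketch of what it might look like, assuming it wires an EmrAddStepsOperator and an
# EmrStepSensor between the cluster creator and the cluster remover (the function
# body below is an assumption, not taken from the source):
def add_step_to_emr(cluster_creator, task_gen_name, step_definition,
                    cluster_remover, create_cluster_task_name,
                    aws_connection, current_dag):
    # Task that submits the step to the running cluster
    step_adder = EmrAddStepsOperator(
        task_id=task_gen_name + "_add_step",
        job_flow_id="{{ task_instance.xcom_pull('" + create_cluster_task_name + "', key='return_value') }}",
        aws_conn_id=aws_connection,
        steps=step_definition,
        dag=current_dag)

    # Sensor that waits for the step to finish before the cluster is terminated
    step_checker = EmrStepSensor(
        task_id=task_gen_name + "_watch_step",
        job_flow_id="{{ task_instance.xcom_pull('" + create_cluster_task_name + "', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull('" + task_gen_name + "_add_step', key='return_value')[0] }}",
        aws_conn_id=aws_connection,
        dag=current_dag)

    cluster_creator >> step_adder >> step_checker >> cluster_remover
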
def create_dag():
    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:
        create_cluster_op = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        add_steps_to_cluster_op = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[{
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
                    'Args': ['10'],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            }])

        monitor_cluster_op = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        monitor_step_op = EmrStepSensor(
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        terminate_cluster_op = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        create_cluster_op >> monitor_cluster_op >> handle_failure_op
        create_cluster_op >> add_steps_to_cluster_op >> monitor_step_op >> terminate_cluster_op

    return dag

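# TemplatedEmrAddStepsOperator is a custom operator whose definition is not shown in
# this snippet. A plausible minimal sketch, assuming its only purpose is to make the
# `steps` argument Jinja-templated (the class body is an assumption, not source code):
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator


class TemplatedEmrAddStepsOperator(EmrAddStepsOperator):
    # Extend the templated fields so values inside `steps` are rendered as templates.
    template_fields = ['job_flow_id', 'steps']
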
def print_hello():
    print('Hello world!')
    try:
        print("eafds")
        create_emr = EmrCreateJobFlowOperator(
            task_id='create_job_flow',
            aws_conn_id='aws_default',
            dag=dag)
        return create_emr
    except AirflowException as ae:
        print(ae.message)

def execute(self, context):
    if self.environment not in ["dev", "prod"]:
        logging.error(f"Can't recognise deployment environment '{self.environment}'. \n"
                      "Review the environment variable 'DEPLOYMENT_ENVIRONMENT'")
        raise ValueError(f"self.environment = os.environ['DEPLOYMENT_ENVIRONMENT'] --> {self.environment}")

    # Check if development/local environment
    if self.environment == 'dev':
        logging.info("EMR cluster running from development environment")

        # Get user AWS name
        client = boto3.client('sts')
        username = client.get_caller_identity()['Arn'].split(":", 5)[5].split("/", 1)[1].lower()

        # Create zipped archive of the local airflow repository
        airflow_repo_path = '/home/vagrant/uk_dm_airflow'
        zip_local_path = '/tmp/latest'
        shutil.make_archive(base_name='/tmp/latest', format='zip', root_dir=airflow_repo_path)
        logging.info(f"Zipped file location: {zip_local_path}")

        # Upload zipped airflow repository to user's s3 bucket
        hook = S3_hook.S3Hook(aws_conn_id=self.aws_conn_id)
        hook.load_file(f"{zip_local_path}.zip", f'{username}/spark_local/latest.zip',
                       bucket_name='grp-ds-users', replace=True)
        logging.info(f"Airflow repo uploaded to user bucket. User: '******'")

        # Upload local bootstrap file to user s3 buckets
        bootstrap_path = self.bootstrap_path
        hook.load_file(bootstrap_path, f'{username}/spark_local/bootstrap.sh',
                       bucket_name='grp-ds-users', replace=True)

        self.override_emr_template(username)
        return EmrCreateJobFlowOperator.execute(self, context)  # Create cluster and return jobflow_id

    # Output the edited EMR template.
    self.job_flow_overrides['BootstrapActions'][0]['ScriptBootstrapAction']['Args'] = [
        f'{self.environment}', self.install_packages_on_emr]
    logging.info(self.job_flow_overrides)
    return EmrCreateJobFlowOperator.execute(self, context)  # Returns the jobflow id

def get_create_job_flow_operator(
    job_flow_name,
    job_flow_overrides,
    aws_conn_id,
    emr_conn_id,
):
    cc_index = _get_cc_index_template()
    job_flow_overrides["Steps"][0]["HadoopJarStep"]["Args"][-1] = cc_index
    print(job_flow_overrides)
    return EmrCreateJobFlowOperator(
        task_id=_get_job_flow_creator_task_id(job_flow_name),
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_conn_id,
        emr_conn_id=emr_conn_id,
    )

def setUp(self):
    configuration.load_test_config()
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE
    }

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()
    self.operator = EmrCreateJobFlowOperator(
        task_id='test_task',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=self._config,
        dag=DAG('test_dag_id', default_args=args)
    )

def transform(self, subdag: nx.DiGraph, parent_fragment: DAGFragment) -> DAGFragment:
    subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
    first_root = subdag_roots[0].task_id
    task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

    TestSubDagTransformer1.op1 = SparkSubmitOperator(
        task_id=f"t{task_id_prefix}p1", dag=self.dag)
    TestSubDagTransformer1.op2 = EmrAddStepsOperator(
        task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
    TestSubDagTransformer1.op3 = S3ListOperator(
        task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
    TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
        task_id=f"t{task_id_prefix}p4", dag=self.dag)
    TestSubDagTransformer1.op5 = DummyOperator(
        task_id=f"t{task_id_prefix}p5", dag=self.dag)

    TestSubDagTransformer1.op1 >> [
        TestSubDagTransformer1.op2,
        TestSubDagTransformer1.op3
    ] >> TestSubDagTransformer1.op4

    return DAGFragment(
        [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])

    file_content = content_object.get()["Body"].read().decode("utf-8")
    return json.loads(file_content)


with DAG(
    dag_id=DAG_ID,
    description="Run multiple Spark jobs with Amazon EMR",
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    tags=["emr", "spark", "pyspark"],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",

class TestEmrCreateJobFlowOperator(unittest.TestCase):
    # When
    _config = {
        'Name': 'test_job_flow',
        'ReleaseLabel': '5.11.0',
        'Steps': [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    '{{ macros.ds_add(ds, -1) }}',
                    '{{ ds }}'
                ]
            }
        }]
    }

    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )

    def test_init(self):
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')
        self.assertEqual(self.operator.emr_conn_id, 'emr_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                        DEFAULT_DATE.strftime("%Y-%m-%d"),
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_execute_returns_job_id(self):
        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), 'j-8989898989')

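# RUN_JOB_FLOW_SUCCESS_RETURN is defined elsewhere in the test module. Based on the
# assertions above it is a canned boto3 run_job_flow response; a minimal sketch,
# assuming only the fields the operator inspects (status code and job flow id):
RUN_JOB_FLOW_SUCCESS_RETURN = {
    'ResponseMetadata': {
        'HTTPStatusCode': 200
    },
    'JobFlowId': 'j-8989898989'
}
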
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10']
    }
}]

JOB_FLOW_OVERRIDES = {'Name': 'PiCalc', 'Steps': SPARK_TEST_STEPS}

dag = DAG('emr_job_flow_automatic_steps_dag',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

job_flow_creator.set_downstream(job_sensor)

def create_job_flow_file(job_var):
    client = boto3.client('s3')
    client.put_object(Body=job_var, Bucket='vivek-mathew', Key='job-flow.txt')


dag = DAG(dag_id='Emr',
          schedule_interval=schedule,
          default_args=args,
          catchup=False)

# @TODO : Add a task for getting the latest AMI
create_cluster = EmrCreateJobFlowOperator(task_id="create_cluster",
                                          aws_conn_id='aws_default',
                                          emr_conn_id='test_emr',
                                          dag=dag)

create_job_flow_variable = PythonOperator(
    task_id="set_jobflow_var",
    python_callable=set_job_flow_var,
    op_args=[
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"
    ],
    dag=dag)

create_job_flow_file = PythonOperator(
    task_id="create_job_flow_file",
    python_callable=create_job_flow_file,
    op_args=[
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"

        'region': PARAMS['REGION'],
        'aws_access_key': PARAMS['aws_access_key'],
        'aws_secret': PARAMS['aws_secret'],
        'bucket': PARAMS['RAW_DATA_BUCKET'],
        'file_path': PARAMS['PYTHON_APPS']
    },
    dag=dag)

cluster_creator = EmrCreateJobFlowOperator(task_id='create_immigration_job',
                                           job_flow_overrides=JOB_FLOW,
                                           aws_conn_id='aws_default',
                                           emr_conn_id='emr_default',
                                           region_name=PARAMS['REGION'],
                                           dag=dag)

add_transform_step_task = EmrAddStepsOperatorV2(
    task_id='add_transform_step',
    job_flow_id="{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=TRANSFORM_IMMIGRATION_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_immigration_transform_task = EmrStepSensor(
    task_id='watch_immigration_transform',
    job_flow_id=

from airflow.utils.dates import days_ago

from emr_job_flow_with_sensor import EmrJobFlowWithSensor
from emr_step_with_sensor import EmrStepWithSensor

# the job flow step configuration as described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Client.run_job_flow
step_conf = {}
job_conf = {}

dag = DAG(
    dag_id='spark_job',
    default_args={
        'owner': 'airflow',
        'start_date': days_ago(1)
    }
)

job = EmrJobFlowWithSensor(
    task_id='job_and_retry',
    job_flow=EmrCreateJobFlowOperator(
        task_id='job',
        job_flow_overrides=job_conf
    ),
    sensor=EmrJobFlowSensor(
        task_id='sensor',
        job_flow_id=''
    ),
    dag=dag
)

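# job_conf and step_conf are left empty in the snippet above. For illustration only,
# a minimal job_flow_overrides dict accepted by run_job_flow could look like this
# (all values are assumptions, not taken from the source):
job_conf = {
    'Name': 'spark_job_cluster',
    'ReleaseLabel': 'emr-5.29.0',
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Master node',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
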
    task_id='transfer_brazil_data_file',
    python_callable=transfer_brazil_data_file,
    dag=dag)

# Verify whether US data file exists
transfer_usa_data_file_task = PythonOperator(
    task_id='transfer_usa_data_file',
    python_callable=transfer_usa_data_file,
    op_kwargs={
        'bucket': 'covid19-lake',
        'prefix': 'enigma-aggregation/json/us_states'
    },
    dag=dag)

# Create an EMR JobFlow
spin_up_emr_cluster_task = EmrCreateJobFlowOperator(
    task_id='spin_up_emr_cluster',
    job_flow_overrides=emr_settings,
    dag=dag)

# Add steps to an existing EMR JobFlow
add_pipeline_to_emr_cluster_task = EmrAddStepsOperator(
    task_id='add_pipeline_to_emr_cluster',
    job_flow_id="{{task_instance.xcom_pull('spin_up_emr_cluster', "
                + " key='return_value')}}",
    steps=covid19_pipeline,
    dag=dag
)

# Wait step to be completed
watch_pipeline_step_task = EmrStepSensor(
    task_id='watch_pipeline_step',
    job_flow_id="{{task_instance.xcom_pull(" \
                " 'spin_up_emr_cluster'," \

    logging.info('checking that data exists in s3')
    source_s3 = S3Hook(aws_conn_id='aws_default')
    keys = source_s3.list_keys(bucket_name='dendsparktut',  # TODO
                               prefix='raw_data/')  # TODO
    logging.info('keys {}'.format(keys))


check_data_exists_task = PythonOperator(task_id='check_data_exists',
                                        python_callable=check_data_exists,
                                        provide_context=False,
                                        dag=dag)

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id=

# Load pyspark script file into S3
script_to_s3 = LoadFileIntoS3Operator(
    dag=dag,
    task_id="script_to_s3",
    airflow_folder=config['S3']['airflow_folder'],
    filename=config['S3']['local_script'],
    s3_key=config['S3']['s3_script'],
    bucket_name=config['S3']['BUCKET_NAME'],
    aws_credentials_id="aws_credentials")

# Create an EMR cluster
create_emr_cluster = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_credentials",
    emr_conn_id="emr_default",
    dag=dag,
)

# Add your steps to the EMR cluster
step_adder = EmrAddStepsOperator(
    task_id="add_steps",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_credentials",
    steps=SPARK_STEPS,
    params={
        "BUCKET_NAME": config['S3']['BUCKET_NAME'],
        "s3_script": config['S3']['s3_script'],
        "s3_clean": config['S3']['s3_clean'],

key="", load_sas=False, provide_context=True) # read local script files and transfer to S3 load_script = StageToS3Operator(task_id="load_script_to_S3", mode="scripts", filename=local_scripts, bucket_name=bucket_name, prefix="scripts", key="") # Create an EMR cluster create_emr_cluster = EmrCreateJobFlowOperator( task_id="create_emr_cluster", job_flow_overrides=get_job_flow_overrides(job_flow_overrides), aws_conn_id="aws_default", emr_conn_id="emr_default") # Add steps to the EMR cluster # Step 1 = ETL Pipeline # Step 2 = Data Quality Test step_adder = EmrAddStepsOperator( task_id="add_steps", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEPS, params={ "bucket_name": bucket_name, "s3_etl_script": s3_etl_script,
        },  # todo is it required?
        {
            "Name": "Hadoop"
        },
    ],
    "VisibleToAllUsers": True,  # todo not found
    "JobFlowRole": "EMR_EC2_DefaultRole",  # todo called InstanceProfile?
    "ServiceRole": "EMR_DefaultRole",
}

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    job_flow_overrides=default_emr_settings,
    dag=dag,
    region_name="us-east-1")

extract_step_task = EmrAddStepsOperator(
    task_id='extract_step',  # 'add_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=[{
        "Name": "Step1 Preprocess",
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            # todo https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
            "Args": [

    'email_on_retry': True
}

with DAG(
    dag_id='flight_delays_emr',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='@once',
) as dag:
    start_operator = DummyOperator(task_id='begin_execution', dag=dag)
    end_operator = DummyOperator(task_id='stop_execution', dag=dag)

    with open('emr_job_flow.json', 'r') as fp:
        job_flow = json.load(fp)

    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=job_flow,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_default'
    )

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_credentials'
    )

    # define the DAG structure, in terms of the created operators
    start_operator >> cluster_creator >> job_sensor >> end_operator

def handle_failure_task():
    raise AirflowException('Marking DAG as failed due to an upstream failure!')


with DAG(
    dag_id='example_emr_job_flow_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    max_active_runs=1,
    schedule_interval=None,
    params=get_config('emr')
) as dag:
    create_cluster_op = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides={'Name': 'PiCalc'},
        aws_conn_id=get_config('emr')['aws_conn_id'],
        emr_conn_id=get_config('emr')['emr_conn_id']
    )

    add_steps_to_cluster_op = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        aws_conn_id=get_config('emr')['aws_conn_id'],
        steps=[
            {
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': '{{ params.hadoop_jar_path }}',
                    'Args': [
                        '10'

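# get_config is a project helper that is not shown in this snippet. A minimal sketch
# of one way it could work, assuming it reads a named section from a local JSON file
# (the file name and structure are assumptions, not taken from the source):
import json


def get_config(section):
    # Return one section (e.g. 'emr') of the project configuration as a dict
    with open('config.json') as fp:
        return json.load(fp)[section]
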
check_emr_database = BranchPythonOperator(
    task_id='check_emr_database',
    provide_context=True,
    python_callable=check_emr_database,
    retries=1,
    dag=dag,
)

skip_emr_database_creation = DummyOperator(
    task_id="skip_emr_database_creation",
    trigger_rule=TriggerRule.NONE_FAILED,
    dag=dag,
)

create_emr_database_cluster = EmrCreateJobFlowOperator(
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)

create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'KeepJobFlowAliveWhenNoSteps': True
}

dag = DAG(
    'emr_job_flow_manual_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",

class TestEmrCreateJobFlowOperator(unittest.TestCase):
    # When
    _config = {
        'Name': 'test_job_flow',
        'ReleaseLabel': '5.11.0',
        'Steps': [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    '{{ macros.ds_add(ds, -1) }}',
                    '{{ ds }}'
                ]
            }
        }]
    }

    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id', default_args=args))

    def test_init(self):
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')
        self.assertEqual(self.operator.emr_conn_id, 'emr_default')
        self.assertEqual(self.operator.region_name, 'ap-southeast-2')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                        DEFAULT_DATE.strftime("%Y-%m-%d"),
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_execute_returns_job_id(self):
        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), 'j-8989898989')

]

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'Steps': SPARK_TEST_STEPS
}

dag = DAG(
    'emr_job_flow_automatic_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag
)

job_flow_creator.set_downstream(job_sensor)

start_operator = DummyOperator(task_id="Begin_execution", dag=dag) # Empty out the analytics bucket - otherwise we aggregate results from successive runs bucket_name = BUCKET_NAME + "/" + S3_ANALYTICS_BUCKET empty_bucket = BashOperator( task_id="empty_bucket", bash_command="aws s3 rm s3://{} --recursive".format( bucket_name), dag=dag, ) # Create EMR instance create_EMR_instance = EmrCreateJobFlowOperator( task_id="create_EMR_cluster", job_flow_overrides=JOB_FLOW_OVERRIDES, aws_conn_id="aws_default", emr_conn_id="emr_default", dag=dag ) # Add your steps to the EMR cluster EMR_step_adder = EmrAddStepsOperator( task_id="EMR_step_adder", job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEPS, params={ # these params are used to provide the parameters for the steps JSON above "bucket_name": BUCKET_NAME, "s3_data": S3_DATA_BUCKET, "s3_script_bucket": S3_SCRIPT_BUCKET, "s3_output": S3_ANALYTICS_BUCKET,
SCRAPERS
'''
scrapers_dummy = DummyOperator(task_id='scrapers_dummy', dag=dag)

'''
STAGING LAYER PRE-PROCESSING (Spark consume Sources save to Parquet)
'''
manifold_emr_creator = EmrCreateJobFlowOperator(
    task_id='create_manifold_emr',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_credentials',
    emr_conn_id='emr_credentials',
)

manifold_emr_job_sensor = EmrJobFlowSensor(
    task_id='check_emr_completion',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_manifold_emr', key='return_value') }}",
    aws_conn_id='aws_credentials',
)

'''
STAGING TABLE CREATION
'''

dag = DAG('EMR_TEST_1',
          default_args=DEFAULT_ARGS,
          catchup=False,
          schedule_interval="0 1 * * *")

with dag:
    file_sensor = S3KeySensor(task_id='file_sensor',
                              poke_interval=600,
                              timeout=1000,
                              soft_fail=False,
                              bucket_name='ds-afarrell',
                              bucket_key='manybla.txt')

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_benchmarks_connection')

    run_some_pyspark = EmrAddStepsOperator(
        task_id='run_some_pyspark',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=EMR_STEP_1)

    output_file_sensor = S3KeySensor(
        task_id='output_file_sensor',
        poke_interval=600,
        timeout=1000,
        soft_fail=False,

    delimiter='',
    aws_conn_id='aws_default')

processed_tweet_data_quality = S3DataQualityOperator(
    task_id='Processed_tweet_data_quality_check',
    dag=dag,
    bucket=bucket_etl,
    prefix='{}/{}'.format(tweet_stat_key, exec_date_partitioned),
    delimiter='',
    aws_conn_id='aws_default')

# Create EMR job flow and monitor it
job_flow_creator = EmrCreateJobFlowOperator(
    task_id='Create_emr_job_flow',
    dag=dag,
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
)

job_sensor = EmrJobFlowSensor(
    task_id='Check_emr_job_flow',
    dag=dag,
    job_flow_id="{{ task_instance.xcom_pull(task_ids='Create_emr_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
)

# Define task orders
raw_tweet_data_quality >> job_flow_creator
job_flow_creator >> job_sensor

            'spark-submit',
            '--deploy-mode', 'cluster',
            's3://<s3-bucket>/jobs/movies-analytics.py',
            '-i', 's3://<s3-bucket>/data',
            '-o', 's3://<s3-bucket>/results'
        ]
    }
}]

JOB_FLOW_OVERRIDES = {"Name": "MoviesAnalytics"}

with DAG(dag_id='emr_job_movies_dag',
         default_args=DEFAULT_ARGS,
         dagrun_timeout=timedelta(hours=2),
         schedule_interval=None) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='movie_analytics_job',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='wait_for_analytics_completion',
        job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id=

"Key": "Owner", "Value": "Data Analytics Team" }, ], } with DAG( dag_id=DAG_ID, description="Run built-in Spark app on Amazon EMR", default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2), start_date=days_ago(1), schedule_interval=None, tags=["emr", "spark"], ) as dag: cluster_creator = EmrCreateJobFlowOperator( task_id="create_job_flow", job_flow_overrides=JOB_FLOW_OVERRIDES) step_adder = EmrAddStepsOperator( task_id="add_steps", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEPS, ) step_checker = EmrStepSensor( task_id="watch_step", job_flow_id= "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",