import unittest
from unittest.mock import MagicMock, call, patch

from airflow.exceptions import AirflowException
# On older Airflow versions this import lives at airflow.contrib.sensors.emr_step_sensor
from airflow.providers.amazon.aws.sensors.emr import EmrStepSensor


class TestEmrStepSensor(unittest.TestCase):
    def setUp(self):
        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=0,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)

    def test_step_completed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_COMPLETED_RETURN,
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.sensor.execute(None)

            self.assertEqual(self.emr_client_mock.describe_step.call_count, 2)
            calls = [
                call(ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N'),
                call(ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N'),
            ]
            self.emr_client_mock.describe_step.assert_has_calls(calls)

    def test_step_cancelled(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_CANCELLED_RETURN,
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_failed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_FAILED_RETURN,
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_interrupted(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_INTERRUPTED_RETURN,
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)
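The DESCRIBE_JOB_STEP_*_RETURN fixtures consumed above are defined elsewhere in the test module; a minimal sketch of their shape, abbreviated to the Step.Status.State field the sensor actually inspects (the real fixtures also carry ResponseMetadata and other describe_step response keys):

# Abbreviated fixture sketch: boto3 EMR describe_step responses, reduced to the
# state field EmrStepSensor polls. Only the state values differ between them.
DESCRIBE_JOB_STEP_RUNNING_RETURN = {'Step': {'Status': {'State': 'RUNNING'}}}
DESCRIBE_JOB_STEP_COMPLETED_RETURN = {'Step': {'Status': {'State': 'COMPLETED'}}}
DESCRIBE_JOB_STEP_CANCELLED_RETURN = {'Step': {'Status': {'State': 'CANCELLED'}}}
DESCRIBE_JOB_STEP_FAILED_RETURN = {'Step': {'Status': {'State': 'FAILED'}}}
DESCRIBE_JOB_STEP_INTERRUPTED_RETURN = {'Step': {'Status': {'State': 'INTERRUPTED'}}}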
def emr_step_task_group(script_name, cluster_id, aws_conn_id, dag):
    step = [{
        'Name': f'Run {script_name}.py',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                '--deploy-mode', 'client',
                '--py-files', f'{file_schema}{bucket_name}scripts/utils.zip',
                f'{file_schema}{bucket_name}scripts/{script_name}.py',
                '--config-path', f'{file_schema}{bucket_name}scripts/config.yaml',
            ],
        },
    }]

    add_step = EmrAddStepsOperator(
        task_id='add_step',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        steps=step,
        dag=dag,
    )
    wait_step_completion = EmrStepSensor(
        task_id='wait_step_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        step_id=f"{{{{ ti.xcom_pull(task_ids='run_{script_name}.add_step')[0] }}}}",
        dag=dag,
    )
    add_step.set_downstream(wait_step_completion)
    return add_step, wait_step_completion
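The function reads file_schema and bucket_name from module scope, and its sensor pulls XCom from 'run_{script_name}.add_step', so the returned pair is evidently meant to sit inside a TaskGroup whose group_id is run_<script_name>. A minimal sketch of those globals, with illustrative values (the real values are not shown in the excerpt):

# Assumed module-level globals referenced by emr_step_task_group (values illustrative)
file_schema = 's3://'            # URI scheme prefix for EMR-accessible storage
bucket_name = 'my-data-bucket/'  # bucket plus trailing slash ahead of the scripts/ prefix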
def Workflow_0(config):
    if config.fabric == "emr":
        workflow_id = "9"
        workflow_version = "latest"
        workflow_jar = f"s3://{config.s3Bucket}/prophecy/jars/9/latest/workflow.jar"
        prophecy_libs_jar = f"{config.prophecyLibsJar}"
        executor_memory = "1g"
        executor_cores = "4"
        num_executors = "6"
        driver_memory = "1g"
        driver_cores = "2"
        job_flow_id = config.cluster_id
        spark_steps = [{
            "Name": "Compute_Step",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "spark-submit",
                    "--executor-memory", executor_memory,
                    "--executor-cores", executor_cores,
                    "--num-executors", num_executors,
                    "--driver-memory", driver_memory,
                    "--driver-cores", driver_cores,
                    "--conf", "spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--conf", "spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--deploy-mode", "cluster",
                    "--class", "Main",
                    "--jars", workflow_jar,
                    prophecy_libs_jar,
                    "-C", "A=B",
                    "-C", "fabricName=" + config.fabric,
                ],
            },
        }]
        step_adder = EmrAddStepsOperator(
            task_id="Workflow_0",
            job_flow_id=job_flow_id,
            aws_conn_id="aws_default_pankaj",
            steps=spark_steps,
            trigger_rule="all_success",
        )
        step_checker = EmrStepSensor(
            task_id="Workflow_0WatchSteps",
            job_flow_id=job_flow_id,
            step_id="{{ task_instance.xcom_pull(task_ids='Workflow_0', key='return_value')[0] }}",
            aws_conn_id="aws_default_pankaj",
        )
        step_adder >> step_checker
        return step_adder, step_checker
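Workflow_0 only touches a handful of attributes on its config argument; a hypothetical minimal stand-in, with field names inferred from the attribute accesses above and placeholder values:

from dataclasses import dataclass

# Hypothetical config container; only the fields Workflow_0 actually reads.
@dataclass
class Config:
    fabric: str = "emr"
    s3Bucket: str = "my-bucket"                       # illustrative
    prophecyLibsJar: str = "s3://my-bucket/libs.jar"  # illustrative
    cluster_id: str = "j-XXXXXXXXXXXXX"               # supply a real EMR cluster id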
# [START howto_operator_emr_manual_steps_tasks]
cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_STEPS,
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
)

cluster_creator >> step_adder >> step_checker >> cluster_remover
# [END howto_operator_emr_manual_steps_tasks]
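This example assumes SPARK_STEPS and JOB_FLOW_OVERRIDES globals defined earlier in the DAG file; one possible shape, following the usual command-runner step pattern and a minimal transient-cluster config (all values illustrative):

# Assumed globals for the example above; a sketch, not the canonical definitions.
SPARK_STEPS = [{
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
    },
}]

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'ReleaseLabel': 'emr-5.29.0',  # illustrative release label
    'Instances': {
        # Keep the cluster up so the separately added steps can run;
        # the remove_cluster task terminates it afterwards.
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}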
# [START howto_operator_emr_manual_steps_tasks]
cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id=cluster_creator.output,
    steps=SPARK_STEPS,
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=cluster_creator.output,
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=cluster_creator.output,
)

step_adder >> step_checker >> cluster_remover
# [END howto_operator_emr_manual_steps_tasks]

# Task dependencies created via `XComArgs`:
#   cluster_creator >> step_adder
#   cluster_creator >> step_checker
#   cluster_creator >> cluster_remover
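Worth noting: cluster_creator.output is the XComArg for the task's default return_value XCom, so the job_flow_id wiring here renders to the same value as the explicit Jinja pull in the previous example:

# Interchangeable ways to reference create_job_flow's pushed job flow id:
job_flow_id = cluster_creator.output
# job_flow_id = "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}"

Passing the XComArg is also what creates the implicit cluster_creator dependencies listed in the trailing comments.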
    )
    entity_task_list.append(render_spark_step)

    # add spark step to emr
    add_step = EmrAddStepsOperator(
        task_id='add_step_{}'.format(key),
        job_flow_id=job_flow_id,
        aws_conn_id='aws_default',
        steps="{{ task_instance.xcom_pull(task_ids='" + 'render_spark_step_{}'.format(key) + "', key='return_value') }}",
    )
    entity_task_list.append(add_step)

    # wait for the step to complete
    watch_step = EmrStepSensor(
        task_id='watch_step_{}'.format(key),
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='" + 'add_step_{}'.format(key) + "', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )
    entity_task_list.append(watch_step)

    # if file has sql quality steps, then create additional tasks in DAG
    if file['sql_step_args']:
        # must drop duplicates in table as spark jdbc does not support upsert
        # https://issues.apache.org/jira/browse/SPARK-19335
        drop_table_duplicates = PostgresOperator(
            task_id='drop_table_duplicates_{table}'.format(table=file['sql_step_args']['db_table']),
            postgres_conn_id='postgres_default',
            sql="""
                DELETE FROM {table} t1
                USING {table} t2
                WHERE
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=cluster_conf)

add_step_task = EmrAddStepsOperator(
    task_id='My_first_job',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='my_aws_conn',
    steps=my_first_job)

watch_prev_step_task = EmrStepSensor(
    task_id='watch_prev_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    # EmrAddStepsOperator pushes a list of step ids, so index the first one
    step_id="{{ task_instance.xcom_pull(task_ids='My_first_job', key='return_value')[0] }}",
    aws_conn_id='aws_default')

terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id='terminate_job_flow',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    trigger_rule="all_done")

# dependencies
check_data_exists_task >> create_job_flow_task
create_job_flow_task >> add_step_task
}]

step_1 = EmrAddStepsOperator(
    task_id='step_1',
    aws_conn_id='aws_default',
    steps=add_scala_steps('jar-location', 'class-name', 'CONTINUE',
                          "{{ dag_run.conf['date'] }}"),
    dag=dag,
)

sensor_1 = EmrStepSensor(
    task_id='sensor_1',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_1', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag,
)

step_2 = EmrAddStepsOperator(
    task_id='step_2',
    aws_conn_id='aws_default',
    steps=add_scala_steps('jar-location', 'class-name', 'CONTINUE',
                          "{{ dag_run.conf['date'] }}",
                          "{{ dag_run.conf['date'] }}"),
    dag=dag,
)

sensor_2 = EmrStepSensor(
    task_id='sensor_daily_parse_message',
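The add_scala_steps helper is not shown in this excerpt; a hypothetical sketch inferred from its call sites (jar location, main class, failure action, then a variable number of job arguments):

# Hypothetical helper, reconstructed from the call sites above.
def add_scala_steps(jar_location, class_name, action_on_failure, *job_args):
    # Build a single EMR step that spark-submits a Scala main class via command-runner.
    return [{
        'Name': f'Run {class_name}',
        'ActionOnFailure': action_on_failure,
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['spark-submit', '--deploy-mode', 'cluster',
                     '--class', class_name, jar_location, *job_args],
        },
    }]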
task_id="create_cluster", job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES, aws_conn_id="aws_default", emr_conn_id="emr_default", ) add_step_load_raw_data = EmrAddStepsOperator( task_id="add_step_load_raw_data", job_flow_id=create_cluster.output, aws_conn_id="aws_default", steps=SparkSteps.LOAD_RAW_DATA, ) wait_for_step_load_raw_data = EmrStepSensor( task_id="wait_for_step_load_raw_data", job_flow_id=create_cluster.output, step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}", aws_conn_id="aws_default", ) add_step_transform = EmrAddStepsOperator( task_id="add_step_transform", job_flow_id=create_cluster.output, aws_conn_id="aws_default", steps=SparkSteps.TRANSFORM, ) wait_for_step_transform = EmrStepSensor( task_id="wait_for_step_transform", job_flow_id=create_cluster.output, step_id="{{ task_instance.xcom_pull(task_ids='add_step_transform', key='return_value')[0] }}", aws_conn_id="aws_default",
}]

stage1_adder = EmrAddStepsOperator(
    task_id="add_stage_1",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEP_1,
    dag=dag,
)

stage1_checker = EmrStepSensor(
    task_id="watch_stage1",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_stage_1', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

sensor_stage2_key = S3KeySensor(
    task_id="s3_sensor_stage2_key",
    bucket_name="{{ dag_run.conf['metaBucketName'] }}",
    bucket_key="{{ dag_run.conf['instanceId'] }}/step_2_adv_unmatched_enc_kc_kp/_SUCCESS",
)

SPARK_STEP_2 = [{
    "Name": "adv-mr-pid-stage2",
    "ActionOnFailure": "TERMINATE_JOB_FLOW",
    aws_conn_id='aws_default',
    steps=step1,
    dag=dag)

emr_step_2 = EmrAddStepsOperator(
    task_id='emr_step2',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step2,
    dag=dag)

emr_step_sensor = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='emr_step2', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

stop_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='stop_emr1',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

create_emr_cluster >> emr_step_1 >> emr_step_2 >> emr_step_sensor >> stop_emr_cluster