def _add_benchmarks(self, task_group):
    with TaskGroup(task_group, prefix_group_id=True, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks(task_group).get_benchmarks()
        chain(*benchmark_tasks)
    return benchmarks
def test_chain_not_support_type(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [op1, op2] = [DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 3)]
    with self.assertRaises(TypeError):
        chain([op1, op2], 1)  # noqa
def test_chain_different_length_iterable(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [op1, op2, op3, op4, op5] = [DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 6)]
    with self.assertRaises(AirflowException):
        chain([op1, op2], [op3, op4, op5])
def test_chain(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [op1, op2, op3, op4, op5, op6] = [DummyOperator(task_id=f't{i}', dag=dag) for i in range(1, 7)]
    chain(op1, [op2, op3], [op4, op5], op6)

    assert {op2, op3} == set(op1.get_direct_relatives(upstream=False))
    assert [op4] == op2.get_direct_relatives(upstream=False)
    assert [op5] == op3.get_direct_relatives(upstream=False)
    assert {op4, op5} == set(op6.get_direct_relatives(upstream=True))
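# For reference (not part of the test): given the assertions above, the
# chain(op1, [op2, op3], [op4, op5], op6) call is equivalent to wiring the
# dependencies explicitly with the bitshift operators:
#
#   op1 >> [op2, op3]
#   op2 >> op4
#   op3 >> op5
#   [op4, op5] >> op6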
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
    if self.config.cleanup_on_success:
        cleanup_cluster = installer.get_cleanup_task()
        install_cluster >> benchmarks >> cleanup_cluster
    else:
        install_cluster >> benchmarks
def test_chain(self):
    dag = DAG(dag_id='test_chain', start_date=datetime.now())
    [op1, op2, op3, op4, op5, op6] = [
        DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 7)
    ]
    chain(op1, [op2, op3], [op4, op5], op6)

    self.assertCountEqual([op2, op3], op1.get_direct_relatives(upstream=False))
    self.assertEqual([op4], op2.get_direct_relatives(upstream=False))
    self.assertEqual([op5], op3.get_direct_relatives(upstream=False))
    self.assertCountEqual([op4, op5], op6.get_direct_relatives(upstream=True))
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    connect_to_platform = self._get_platform_connector().get_task()
    final_status = final_dag_status.get_task(self.dag)
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
    if self.config.cleanup_on_success:
        cleanup_cluster = installer.get_cleanup_task()
        install_cluster >> connect_to_platform >> benchmarks >> cleanup_cluster >> final_status
    else:
        install_cluster >> connect_to_platform >> benchmarks
def build(self):
    installer = self._get_openshift_installer()
    initialize_cluster = installer.initialize_cluster_task()
    connect_to_platform = self._get_platform_connector().get_task()
    with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
        utils_tasks = self._get_scale_ci_diagnosis().get_utils()
        chain(*utils_tasks)
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
    initialize_cluster >> connect_to_platform >> benchmarks >> utils
def build(self):
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    connect_to_platform = self._get_platform_connector().get_task()
    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)
    rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation()
    if self.config.cleanup_on_success:
        cleanup_cluster = installer.get_cleanup_task()
        install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks >> cleanup_cluster
    else:
        install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks
# [END howto_operator_gcp_transfer_wait_operation]

# [START howto_operator_gcp_transfer_cancel_operation]
cancel_operation = CloudDataTransferServiceCancelOperationOperator(
    task_id="cancel_operation",
    operation_name="{{task_instance.xcom_pull("
    "'wait_for_second_operation_to_start', key='sensed_operations')[0]['name']}}",
)
# [END howto_operator_gcp_transfer_cancel_operation]

# [START howto_operator_gcp_transfer_delete_job]
delete_transfer_from_aws_job = CloudDataTransferServiceDeleteJobOperator(
    task_id="delete_transfer_from_aws_job",
    job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}",
    project_id=GCP_PROJECT_ID,
)
# [END howto_operator_gcp_transfer_delete_job]

chain(
    create_transfer_job_from_aws,
    wait_for_operation_to_start,
    pause_operation,
    list_operations,
    get_operation,
    resume_operation,
    wait_for_operation_to_end,
    cancel_operation,
    delete_transfer_from_aws_job,
)
"sourceFormat": "DATASTORE_BACKUP", "compression": "NONE", "csvOptions": { "skipLeadingRows": 1 }, }, }, ) # [END howto_operator_create_external_table_multiple_types] read_data_from_gcs_multiple_types = BigQueryInsertJobOperator( task_id="execute_query", configuration={ "query": { "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`", "useLegacySql": False, } }, ) chain( # Firestore export_database_to_gcs, # BigQuery create_dataset, create_external_table_multiple_types, read_data_from_gcs_multiple_types, delete_dataset, )
    dag_id=DAG_ID,
    default_args=default_args,
    description="An example DAG demonstrating lineage emission within an Airflow DAG.",
    schedule_interval=None,
    start_date=days_ago(1),
    catchup=False,
    dagrun_timeout=timedelta(minutes=5),
    tags=["datahub demo"],
) as dag:
    emit_lineage_task = DatahubEmitterOperator(
        task_id="emit_lineage",
        datahub_conn_id="datahub_rest",
        mces=[
            builder.make_lineage_mce(
                upstream_urns=[
                    builder.make_dataset_urn("glue", "mydb.tableA"),
                    builder.make_dataset_urn("glue", "mydb.tableB"),
                ],
                downstream_urn=builder.make_dataset_urn("glue", "mydb.tableC"),
            )
        ],
    )

    get_airflow_cfg_operator = PythonOperator(task_id="get_airflow_cfg_task", python_callable=print_airflow_cfg)
    get_print_env_vars_operator = PythonOperator(task_id="get_print_env_vars_task", python_callable=print_env_vars)

    chain(emit_lineage_task, get_airflow_cfg_operator, get_print_env_vars_operator)
        'type': 'video',
        'fields': 'items/id/videoId',
    },
    google_api_response_via_xcom='video_ids_response',
    s3_destination_key=f'{S3_BUCKET_NAME}/youtube_search.json',
    s3_overwrite=True,
)
# [END howto_transfer_google_api_youtube_search_to_s3]

task_transform_video_ids = transform_video_ids()

# [START howto_transfer_google_api_youtube_list_to_s3]
task_video_data_to_s3 = GoogleApiToS3Operator(
    task_id='video_data_to_s3',
    google_api_service_name='youtube',
    google_api_service_version='v3',
    google_api_endpoint_path='youtube.videos.list',
    google_api_endpoint_params={
        'part': YOUTUBE_VIDEO_PARTS,
        'maxResults': 50,
        'fields': YOUTUBE_VIDEO_FIELDS,
    },
    google_api_endpoint_params_via_xcom='video_ids',
    s3_destination_key=f'{S3_BUCKET_NAME}/youtube_videos.json',
    s3_overwrite=True,
)
# [END howto_transfer_google_api_youtube_list_to_s3]

chain(task_video_ids_to_s3, task_transform_video_ids, task_video_data_to_s3)
    zone=GCE_ZONE,
    resource_id=GCE_INSTANCE,
    body={'machineType': f'zones/{GCE_ZONE}/machineTypes/{GCE_SHORT_MACHINE_TYPE_NAME}'},
    task_id='gcp_compute_set_machine_type',
)
# [END howto_operator_gce_set_machine_type]

# Duplicate set machine type for idempotence testing
# [START howto_operator_gce_set_machine_type_no_project_id]
gce_set_machine_type2 = ComputeEngineSetMachineTypeOperator(
    zone=GCE_ZONE,
    resource_id=GCE_INSTANCE,
    body={'machineType': f'zones/{GCE_ZONE}/machineTypes/{GCE_SHORT_MACHINE_TYPE_NAME}'},
    task_id='gcp_compute_set_machine_type2',
)
# [END howto_operator_gce_set_machine_type_no_project_id]

chain(
    gce_instance_start,
    gce_instance_start2,
    gce_instance_stop,
    gce_instance_stop2,
    gce_set_machine_type,
    gce_set_machine_type2,
)
)

# [START howto_operator_emr_add_steps]
step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id=cluster_creator.output,
    steps=SPARK_STEPS,
)
# [END howto_operator_emr_add_steps]

# [START howto_sensor_emr_step_sensor]
step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=cluster_creator.output,
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
)
# [END howto_sensor_emr_step_sensor]

# [START howto_operator_emr_terminate_job_flow]
cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=cluster_creator.output,
)
# [END howto_operator_emr_terminate_job_flow]

chain(
    step_adder,
    step_checker,
    cluster_remover,
)
)
# [END howto_operator_glue_crawler]

# [START howto_sensor_glue_crawler]
wait_for_crawl = GlueCrawlerSensor(task_id='wait_for_crawl', crawler_name=GLUE_CRAWLER_NAME)
# [END howto_sensor_glue_crawler]

# [START howto_operator_glue]
job_name = 'example_glue_job'
submit_glue_job = GlueJobOperator(
    task_id='submit_glue_job',
    job_name=job_name,
    wait_for_completion=False,
    script_location=f's3://{GLUE_EXAMPLE_S3_BUCKET}/etl_script.py',
    s3_bucket=GLUE_EXAMPLE_S3_BUCKET,
    iam_role_name=GLUE_CRAWLER_ROLE.split('/')[-1],
    create_job_kwargs={'GlueVersion': '3.0', 'NumberOfWorkers': 2, 'WorkerType': 'G.1X'},
)
# [END howto_operator_glue]

# [START howto_sensor_glue]
wait_for_job = GlueJobSensor(
    task_id='wait_for_job',
    job_name=job_name,
    # Job ID extracted from previous Glue Job Operator task
    run_id=submit_glue_job.output,
)
# [END howto_sensor_glue]

chain(setup_upload_artifacts_to_s3, crawl_s3, wait_for_crawl, submit_glue_job, wait_for_job)
SNS_TOPIC_ARN = getenv("SNS_TOPIC_ARN", "arn:aws:sns:<region>:<account number>:MyTopic")
RDS_DB_IDENTIFIER = getenv("RDS_DB_IDENTIFIER", "database-identifier")

with DAG(
    dag_id='example_rds_event',
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    tags=['example'],
    catchup=False,
) as dag:
    # [START howto_operator_rds_create_event_subscription]
    create_subscription = RdsCreateEventSubscriptionOperator(
        task_id='create_subscription',
        subscription_name=SUBSCRIPTION_NAME,
        sns_topic_arn=SNS_TOPIC_ARN,
        source_type='db-instance',
        source_ids=[RDS_DB_IDENTIFIER],
        event_categories=['availability'],
    )
    # [END howto_operator_rds_create_event_subscription]

    # [START howto_operator_rds_delete_event_subscription]
    delete_subscription = RdsDeleteEventSubscriptionOperator(
        task_id='delete_subscription',
        subscription_name=SUBSCRIPTION_NAME,
    )
    # [END howto_operator_rds_delete_event_subscription]

    chain(create_subscription, delete_subscription)
delete_task = DmsDeleteTaskOperator(
    task_id='delete_task',
    replication_task_arn=create_task.output,
    trigger_rule='all_done',
)
# [END howto_operator_dms_delete_task]

delete_db_instance = RdsDeleteDbInstanceOperator(
    task_id='delete_db_instance',
    db_instance_identifier=RDS_INSTANCE_NAME,
    rds_kwargs={
        "SkipFinalSnapshot": True,
    },
    trigger_rule='all_done',
)

chain(
    create_db_instance,
    create_sample_table(),
    create_dms_assets(),
    create_task,
    start_task,
    describe_tasks,
    await_task_start,
    stop_task,
    await_task_stop,
    delete_task,
    delete_dms_assets(),
    delete_db_instance,
)
with DAG(
    dag_id='example_ec2',
    start_date=datetime(2021, 1, 1),
    tags=['example'],
    catchup=False,
) as dag:
    # [START howto_operator_ec2_start_instance]
    start_instance = EC2StartInstanceOperator(
        task_id="ec2_start_instance",
        instance_id=INSTANCE_ID,
    )
    # [END howto_operator_ec2_start_instance]

    # [START howto_sensor_ec2_instance_state]
    instance_state = EC2InstanceStateSensor(
        task_id="ec2_instance_state",
        instance_id=INSTANCE_ID,
        target_state="running",
    )
    # [END howto_sensor_ec2_instance_state]

    # [START howto_operator_ec2_stop_instance]
    stop_instance = EC2StopInstanceOperator(
        task_id="ec2_stop_instance",
        instance_id=INSTANCE_ID,
    )
    # [END howto_operator_ec2_stop_instance]

    chain(start_instance, instance_state, stop_instance)
    source=SOURCE_NAME,
    flow_name=FLOW_NAME,
    source_field="LastModifiedDate",
    filter_date="3000-01-01",  # Future date, so no records to dump
)
# [END howto_operator_appflow_run_after]

# [START howto_operator_appflow_shortcircuit]
campaign_dump_short_circuit = AppflowRecordsShortCircuitOperator(
    task_id="campaign_dump_short_circuit",
    flow_name=FLOW_NAME,
    appflow_run_task_id="campaign_dump_after",  # Should shortcircuit, no records expected
)
# [END howto_operator_appflow_shortcircuit]

should_be_skipped = BashOperator(
    task_id="should_be_skipped",
    bash_command="echo 1",
)

chain(
    campaign_dump,
    campaign_dump_full,
    campaign_dump_daily,
    campaign_dump_before,
    campaign_dump_after,
    campaign_dump_short_circuit,
    should_be_skipped,
)
    },
}
# [END howto_operator_emr_eks_config]

with DAG(
    dag_id='example_emr_eks',
    start_date=datetime(2021, 1, 1),
    tags=['example'],
    catchup=False,
) as dag:
    # [START howto_operator_emr_container]
    job_starter = EmrContainerOperator(
        task_id="start_job",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        release_label="emr-6.3.0-latest",
        job_driver=JOB_DRIVER_ARG,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        name="pi.py",
        wait_for_completion=False,
    )
    # [END howto_operator_emr_container]

    # [START howto_sensor_emr_container]
    job_waiter = EmrContainerSensor(
        task_id="job_waiter",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        job_id=str(job_starter.output),
    )
    # [END howto_sensor_emr_container]

    chain(job_starter, job_waiter)
update_queue = CloudTasksQueueUpdateOperator(
    task_queue=Queue(stackdriver_logging_config=dict(sampling_ratio=1)),
    location=LOCATION,
    queue_name=QUEUE_ID,
    update_mask={"paths": ["stackdriver_logging_config.sampling_ratio"]},
    task_id="update_queue",
)

list_queue = CloudTasksQueuesListOperator(location=LOCATION, task_id="list_queue")

chain(
    create_queue,
    update_queue,
    pause_queue,
    resume_queue,
    purge_queue,
    get_queue,
    list_queue,
    delete_queue,
)

# Tasks operations
create_task = CloudTasksTaskCreateOperator(
    location=LOCATION,
    queue_name=QUEUE_ID,
    task=TASK,
    task_name=TASK_NAME,
    retry=Retry(maximum=10.0),
    timeout=5,
    task_id="create_task_to_run",
)
configuration={ "query": { "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`", "useLegacySql": False, } }, ) delete_bucket = GCSDeleteBucketOperator( task_id="delete_bucket", bucket_name=GCS_BUCKET, ) delete_dataset = BigQueryDeleteDatasetOperator( task_id="delete_dataset", project_id=GCP_PROJECT_ID, dataset_id=DATASET_NAME, delete_contents=True, ) chain( create_bucket, create_dataset, create_table, run_operator, load_csv, read_data_from_gcs_many_chunks, delete_bucket, delete_dataset, )
args = {"owner": "airflow", "start_date": START_DATE} for dag_no in range(1, DAG_COUNT + 1): dag = DAG( dag_id=safe_dag_id("__".join([ DAG_PREFIX, f"SHAPE={SHAPE.name.lower()}", f"DAGS_COUNT={dag_no}_of_{DAG_COUNT}", f"TASKS_COUNT=${TASKS_COUNT}", f"START_DATE=${START_DATE_ENV}", f"SCHEDULE_INTERVAL=${SCHEDULE_INTERVAL_ENV}", ])), is_paused_upon_creation=False, default_args=args, schedule_interval=SCHEDULE_INTERVAL, ) tasks = [ BashOperator(task_id="__".join(["tasks", f"{i}_of_{TASKS_COUNT}"]), bash_command='echo test"', dag=dag) for i in range(1, TASKS_COUNT + 1) ] if SHAPE == DagShape.NO_STRUCTURE: # Do nothing pass elif SHAPE == DagShape.LINEAR: chain(*tasks) globals()[f"dag_{dag_no}"] = dag
    python_callable=load_users_data)
load_tweets_data = PythonOperator(task_id="load_tweets_data", python_callable=load_tweets_data)
load_mentions_data = PythonOperator(task_id="load_mentions_data", python_callable=load_mentions_data)
load_hashtags_data = PythonOperator(task_id="load_hashtags_data", python_callable=load_hashtags_data)
load_urls_data = PythonOperator(task_id="load_urls_data", python_callable=load_urls_data)
load_medias_data = PythonOperator(task_id="load_medias_data", python_callable=load_medias_data)
load_scores_data = PythonOperator(task_id="load_scores_data", python_callable=load_scores_data)

chain(
    parse_tweets_data,
    score_users,
    [transform_users_data, transform_tweets_data, transform_entities_data, transform_scores_data],
    [
        load_users_data,
        load_tweets_data,
        [load_mentions_data, load_hashtags_data, load_medias_data, load_urls_data],
        load_scores_data,
    ],
)
# [START howto_operator_get_build_trigger]
get_build_trigger = CloudBuildGetBuildTriggerOperator(
    task_id="get_build_trigger",
    project_id=GCP_PROJECT_ID,
    trigger_id=create_build_trigger.output['id'],
)
# [END howto_operator_get_build_trigger]

# [START howto_operator_delete_build_trigger]
delete_build_trigger = CloudBuildDeleteBuildTriggerOperator(
    task_id="delete_build_trigger",
    project_id=GCP_PROJECT_ID,
    trigger_id=create_build_trigger.output['id'],
)
# [END howto_operator_delete_build_trigger]

# [START howto_operator_list_build_triggers]
list_build_triggers = CloudBuildListBuildTriggersOperator(
    task_id="list_build_triggers",
    project_id=GCP_PROJECT_ID,
    location="global",
    page_size=5,
)
# [END howto_operator_list_build_triggers]

chain(
    create_build_trigger,
    run_build_trigger,
    update_build_trigger,
    get_build_trigger,
    delete_build_trigger,
    list_build_triggers,
)
from airflow.utils import dates

args = {
    'owner': 'airflow',
}

dag = DAG(
    dag_id='example_short_circuit_operator',
    default_args=args,
    start_date=dates.days_ago(2),
    tags=['example'],
)

cond_true = ShortCircuitOperator(
    task_id='condition_is_True',
    python_callable=lambda: True,
    dag=dag,
)

cond_false = ShortCircuitOperator(
    task_id='condition_is_False',
    python_callable=lambda: False,
    dag=dag,
)

ds_true = [DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]]
ds_false = [DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]]

chain(cond_true, *ds_true)
chain(cond_false, *ds_false)
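# Note (illustrative sketch, not part of the example): because ds_true unpacks to
# (true_1, true_2), chain(cond_true, *ds_true) builds the linear dependency
# cond_true >> true_1 >> true_2; the cond_false branch is wired the same way with
# false_1 and false_2.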
update_tag_template = BashOperator(task_id="update_tag_template", bash_command="echo update_tag_template")
update_tag_template_field = BashOperator(
    task_id="update_tag_template_field", bash_command="echo update_tag_template_field"
)

# Create
create_tasks = [
    create_entry_group,
    create_entry_gcs,
    create_tag_template,
    create_tag_template_field,
    create_tag,
]
chain(*create_tasks)

create_entry_group >> delete_entry_group
create_entry_group >> create_entry_group_result
create_entry_group >> create_entry_group_result2

create_entry_gcs >> delete_entry
create_entry_gcs >> create_entry_gcs_result
create_entry_gcs >> create_entry_gcs_result2

create_tag_template >> delete_tag_template_field
create_tag_template >> create_tag_template_result
create_tag_template >> create_tag_template_result2

create_tag_template_field >> delete_tag_template_field
create_tag_template_field >> create_tag_template_field_result
# [END howto_transfer_dynamodb_to_s3]

# [START howto_transfer_dynamodb_to_s3_segmented]
# Segmenting allows the transfer to be parallelized into {segment} number of parallel tasks.
backup_db_segment_1 = DynamoDBToS3Operator(
    task_id='backup-1',
    dynamodb_table_name=TABLE_NAME,
    s3_bucket_name=BUCKET_NAME,
    # Max output file size in bytes. If the Table is too large, multiple files will be created.
    file_size=1000,
    dynamodb_scan_kwargs={
        "TotalSegments": 2,
        "Segment": 0,
    },
)

backup_db_segment_2 = DynamoDBToS3Operator(
    task_id="backup-2",
    dynamodb_table_name=TABLE_NAME,
    s3_bucket_name=BUCKET_NAME,
    # Max output file size in bytes. If the Table is too large, multiple files will be created.
    file_size=1000,
    dynamodb_scan_kwargs={
        "TotalSegments": 2,
        "Segment": 1,
    },
)
# [END howto_transfer_dynamodb_to_s3_segmented]

chain(backup_db, [backup_db_segment_1, backup_db_segment_2])
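# Note (illustrative sketch, not part of the example): a single task followed by a list
# fans out, so the chain(...) call above is equivalent to
# backup_db >> [backup_db_segment_1, backup_db_segment_2], running both segmented
# backups in parallel after the unsegmented backup_db task.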
# [START howto_operator_s3_delete_objects]
delete_objects = S3DeleteObjectsOperator(
    task_id="s3_delete_objects",
    bucket=BUCKET_NAME_2,
    keys=KEY_2,
)
# [END howto_operator_s3_delete_objects]

# [START howto_operator_s3_delete_bucket]
delete_bucket = S3DeleteBucketOperator(
    task_id='s3_delete_bucket', bucket_name=BUCKET_NAME, force_delete=True
)
# [END howto_operator_s3_delete_bucket]

chain(
    create_bucket,
    put_tagging,
    get_tagging,
    delete_tagging,
    create_object,
    list_prefixes,
    list_keys,
    [sensor_one_key, sensor_two_keys, sensor_key_with_function],
    copy_object,
    transforms_file,
    sensor_keys_unchanged,
    delete_objects,
    delete_bucket,
)