start_date = datetime.datetime.utcnow()

dag = DAG(
    "lesson3.exercise3",
    start_date=start_date,
)

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,
    ),
    task_id=trips_task_id,
    dag=dag,
)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_credentials",
    transfer_options={'deleteObjectsUniqueInSink': True},
    dag=dag,
)

# Spark job reads GCS JSON and writes GCS Parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name='crash_report_parquet',
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Socorro_Crash_Reports_to_Parquet",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py",
        py_args=[
            "--date", "{{ ds_nodash }}",
            "--source-gcs-path", "gs://{}/v1/crash_report".format(gcs_data_bucket),
            "--dest-gcs-path", "gs://{}/{}".format(gcs_data_bucket, dataset),
        ],
        idle_delete_ttl='14400',
        num_workers=8,
        worker_machine_type='n1-standard-8',
        aws_conn_id=read_aws_conn_id,
        gcp_conn_id=gcp_conn_id))

bq_gcp_conn_id = 'google_cloud_derived_datasets'
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)

gke_location = "us-central1-a"
gke_cluster_name = "bq-load-gke-1"
"to": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}", "doc-type": "first_shutdown", "read-mode": "aligned", "input-partition-multiplier": "4" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) first_shutdown_summary_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="first_shutdown_summary_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="first_shutdown_summary", dataset_version="v4", gke_cluster_name="bq-load-gke-1", bigquery_dataset="telemetry_derived", cluster_by=["sample_id"], drop=["submission_date"], rename={"submission_date_s3": "submission_date"}, replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"], ), task_id="first_shutdown_summary_bigquery_load", dag=dag) first_shutdown_summary >> first_shutdown_summary_bigquery_load
PROJECT_VERSION = '1.0'
PROJECT_NAME = 'post-collector'

# MAIN DAGS
# interval = "0 3 */1 * *"
interval = "*/10 * * * *"
DAG_ID = 'post_collector'

start_date = datetime.strptime(Variable.get("post_collector_start_date"), "%Y-%m-%d %H:%M:%S")
emails = Variable.get('support_email_list').split(',')

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': emails,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=2)
}

with DAG(dag_id=DAG_ID,
         default_args=default_args,
         schedule_interval=interval,
         start_date=start_date) as dag:

    main_subdags_id = 'all_process'
    process_keywords_dag = SubDagOperator(
        task_id=main_subdags_id,
        subdag=all_process(
            "{0}.{1}".format(DAG_ID, main_subdags_id),
            start_date,
            interval,
            default_args),
        depends_on_past=True,
        dag=dag
    )
begin_execution = DummyOperator(task_id='begin_execution', dag=dag)
end_execution = DummyOperator(task_id='end_execution', dag=dag)

staging_events_task_id = "staging_events_subdag"
staging_events_subdag_task = SubDagOperator(
    subdag=stg_subdag(
        "a_song_plays_hourly",
        staging_events_task_id,
        "aws_credentials",
        "redshift",
        "staging_events",
        sql_statements.CREATE_TABLE_STAGING_EVENTS,
        s3_prefix="s3:/",
        s3_bucket="udacity-dend",
        s3_key="log_data/{execution_date.year}/{execution_date.month}",
        s3_jsonpath_file="log_json_path.json",
        sw_delete_stages=sw_delete_stages,
        partition_year="{execution_date.year}",
        partition_month="{execution_date.month}",
        start_date=start_date,
    ),
    task_id=staging_events_task_id,
    dag=dag,
)

staging_songs_task_id = "staging_songs_subdag"
staging_songs_subdag_task = SubDagOperator(
    subdag=stg_subdag(
        "a_song_plays_hourly",
        staging_songs_task_id,
src="/airflow/dags/spark-scripts/generate_show_comments.py", dst="spark-jobs/generate_show_comments.py", bucket=gcs_netflix_bucket, google_cloud_storage_conn_id=gcp_conn, dag=dag) catalog_task_id = "show_catalog_subdag" catalog_path = "catalog/clean/catalog.parquet" download_catalog_show_subdag = SubDagOperator(subdag=catalog_show_to_gcs( "content_review", catalog_task_id, kaggle_bucket="shivamb/netflix-shows", kaggle_local_destination_path="/airflow/datasources/catalog/csv", gcp_conn_id=gcp_conn, gcs_bucket=gcs_netflix_bucket, gcs_raw_destination_path="catalog/raw/catalog.csv", gcs_clean_destination_path=catalog_path, cluster_name=cluster_name, spark_code_path="gs://" + gcs_netflix_bucket + "/spark-jobs/clean_netflix_catalog.py", region=region, start_date=start_date), task_id=catalog_task_id, dag=dag) consume_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/consume_reddit_comments.py" reddit_destination_path = "gs://" + gcs_netflix_bucket + "/comments/raw/comments.parquet" gcp_netflix_catalog_path = "gs://" + gcs_netflix_bucket + "/" + catalog_path consume_show_comment_to_datalake = DataProcPySparkOperator( task_id='consume_show_comment_to_datalake', main=consume_show_comments_job_path,
def nested_subdags():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

    # master:
    #   A -> opSubdag_0
    #     master.opSubdag_0:
    #       -> opSubDag_A
    #          master.opSubdag_0.opSubdag_A:
    #            -> subdag_A.task
    #       -> opSubdag_B
    #          master.opSubdag_0.opSubdag_B:
    #            -> subdag_B.task
    #   A -> opSubdag_1
    #     master.opSubdag_1:
    #       -> opSubdag_C
    #          master.opSubdag_1.opSubdag_C:
    #            -> subdag_C.task
    #       -> opSubDag_D
    #          master.opSubdag_1.opSubdag_D:
    #            -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG('master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG('master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG('master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            return subdag_C

        def subdag_D():
            subdag_D = DAG('master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20190910',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):
    """Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job that transfers the AWS S3 parquet data into a GCS bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 Bucket
    :param str dataset_gcs_bucket: destination GCS Bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param str p2b_resume: allow resume support. Defaults to False.
    :param bool reprocess: enable dataset reprocessing. Defaults to False.
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location to override defaults
    :param List[str] cluster_by: top level fields to cluster by when creating destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination table
    :param List[str] replace: top level field replacement expressions

    :return: airflow.models.DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset, dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
            task_id='s3_to_gcs',
            s3_bucket=dataset_s3_bucket,
            gcs_bucket=gcs_buckets['transfer'],
            description=_objects_prefix,
            aws_conn_id=aws_conn_id,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            object_conditions=gcstj_object_conditions,
            transfer_options=gcstj_transfer_options,
        )

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
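For reference, a minimal sketch (not from the source) of what the default _objects_prefix evaluates to, reusing the format expression above; the dataset name and version are chosen purely for illustration:

# Illustrative only: reproduces the default-prefix logic from load_to_bigquery
# for an assumed dataset/version; Airflow renders {{ds_nodash}} at run time.
dataset = "sync_summary"
dataset_version = "v2"
date_submission_col = "submission_date_s3"
ds_type = "ds_nodash"

_objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                               dataset_version,
                                               date_submission_col,
                                               ds_type)
print(_objects_prefix)  # sync_summary/v2/submission_date_s3={{ds_nodash}}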
register_status(main_summary, "Main Summary", "A summary view of main pings.")

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio", options={
        "input_bucket": "{{ task.__class__.private_output_bucket }}",
bgbb_fit_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="bgbb_fit_dataproc",
        cluster_name="bgbb-fit-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=3,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={
            "PIP_PACKAGES": "git+https://github.com/wcbeard/bgbb_airflow.git"
        },
        python_driver_code="gs://{}/jobs/bgbb_runner.py".format(params.artifact_bucket),
        py_args=[
            "bgbb_fit",
            "--submission-date", "{{ next_ds }}",
            "--model-win", "90",
            "--start-params", "[0.387, 0.912, 0.102, 1.504]",
            "--sample-ids", "[42]",
            "--sample-fraction", "1.0",
            "--penalizer-coef", "0.01",
            "--source", "bigquery",
            "--view-materialization-project",
            params.project_id if params.is_dev else "moz-fx-data-shared-prod",
            "--view-materialization-dataset", "analysis",
            "--bucket-protocol", "gs",
            "--bucket", params.output_bucket,
            "--prefix", "bgbb/params/v1",
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
ltv_daily = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="ltv-daily",
        cluster_name="ltv-daily-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=5,
        worker_machine_type="n1-standard-8",
        optional_components=["ANACONDA"],
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"},
        python_driver_code="gs://{}/jobs/ltv_daily.py".format(params.artifact_bucket),
        py_args=[
            "--submission-date", "{{ ds }}",
            "--prediction-days", "364",
            "--project-id", "moz-fx-data-shared-prod",
            "--source-qualified-table-id", "moz-fx-data-shared-prod.search.search_rfm",
            "--dataset-id", "analysis",
            "--intermediate-table-id", "ltv_daily_temporary_search_rfm_day",
            "--model-input-table-id", "ltv_daily_model_perf",
            "--model-output-table-id", "ltv_daily",
            "--temporary-gcs-bucket", params.storage_bucket,
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
format_type="json", format_style="auto") load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, target_table="songplays", redshift_conn_id="redshift", select_sql_stmt=SqlQueries.songplay_table_insert, ) dim_task_id = "load_data_into_dimension_tables" load_dimension_subdag_task = SubDagOperator( subdag=load_to_dimension_tables_dag("sparkify_pipeline_3", dim_task_id, "redshift", SqlQueries, start_date=start_date), task_id=dim_task_id, dag=dag) dq_checks = [{ 'check_sql': "SELECT COUNT(*) FROM songplays;", 'test_expr': "{} < 1" }, { 'check_sql': "SELECT COUNT(*) FROM users WHERE userid is NULL;", 'test_expr': "{} >= 1" }] run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift",
    provide_context=True,
    python_callable=generate_search_terms,
    dag=dag,
)

email_links = EmailOperator(
    task_id="email_best_links",
    to="*****@*****.**",
    subject="Latest popular links",
    html_content="Check out the latest!!",
    files=["{}/latest_links.txt".format(RAW_TWEET_DIR)],
    dag=dag,
)

sub = SubDagOperator(subdag=subdag,
                     task_id="insert_and_id_pop",
                     trigger_rule="one_success",
                     dag=dag)

clear_latest = BashOperator(
    bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR),
    task_id="clear_latest",
    dag=dag,
)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r"\W+", "", term)
    simple_search = PythonOperator(
        task_id="search_{}_twitter".format(term_without_punctuation),
        provide_context=True,
def repeat_dag(context, dag_run_obj):
    rq = context['params']['rq']
    if rq.QueueSize() > 0:
        return dag_run_obj

# @TODO find a way to make these separate tasks. Difficult because they
#       can't be pickled, therefore they can't be returned via a task.
Session, _ = db_connect('pdsdi_dev')
session = Session()
rq = RedisQueue('DI_ReadyQueue')

process_operator = SubDagOperator(subdag=process_subdag('di_process',
                                                        'di_checksum',
                                                        session=session,
                                                        archiveID=archiveID,
                                                        n_procs=5,
                                                        rq=rq),
                                  task_id='di_checksum',
                                  dag=dag)

loop_operator = TriggerDagRunOperator(task_id='loop',
                                      provide_context=True,
                                      params={'rq': rq},
                                      trigger_dag_id='di_process',
                                      python_callable=repeat_dag,
                                      dag=dag)

process_operator >> loop_operator
    for i in range(2):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag

with DAG(
    dag_id=DAG_NAME,
    start_date=datetime(2019, 1, 1),
    max_active_runs=1,
    default_args=DEFAULT_TASK_ARGS,
    schedule_interval=timedelta(minutes=1),
) as dag:

    start = DummyOperator(task_id='start')

    section_1 = SubDagOperator(
        task_id='section-1',
        subdag=subdag(DAG_NAME, 'section-1', DEFAULT_TASK_ARGS),
        default_args=DEFAULT_TASK_ARGS,
    )

    some_other_task = DummyOperator(task_id='some-other-task')

    start >> section_1 >> some_other_task  # pylint: disable=W0104
# start_date = datetime.datetime.utcnow()
start_date = datetime.datetime(2018, 1, 1, 0, 0, 0, 0)
end_date = datetime.datetime(2018, 6, 1, 0, 0, 0, 0)

dag = DAG("lesson3.exercise3",
          start_date=start_date,
          end_date=end_date,
          schedule_interval="@monthly")

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        parent_dag_name="lesson3.exercise3",
        task_id=trips_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials_redshift",
        table="trips",
        create_sql_stmt=sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="udac-data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date),
    task_id=trips_task_id,
    dag=dag)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        parent_dag_name="lesson3.exercise3",
        task_id=stations_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials_redshift",
        table="stations",
        create_sql_stmt=sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udacity-dend",
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="songplays",
    sql_source=SqlQueries.songplay_table_insert)

# Create & load dimension tables

# artists table
load_artists_dimension_table_task_id = "artists_subdag"
load_artists_dimension_table = SubDagOperator(
    subdag=load_dim_table_dag(parent_dag_name=parent_task_id,
                              task_id=load_artists_dimension_table_task_id,
                              redshift_conn_id="redshift",
                              table="artists",
                              create_sql_stmt=CreateTables.create_artists,
                              select_stmt=SqlQueries.artist_table_insert,
                              append_rows=False,
                              start_date=start_date),
    task_id=load_artists_dimension_table_task_id,
    dag=dag,
)

# songs table
load_songs_dimension_table_task_id = "songs_subdag"
load_songs_dimension_table = SubDagOperator(
    subdag=load_dim_table_dag(parent_dag_name=parent_task_id,
                              task_id=load_songs_dimension_table_task_id,
                              redshift_conn_id="redshift",
                              table="songs",
                              create_sql_stmt=CreateTables.create_songs,
                              select_stmt=SqlQueries.song_table_insert,
"com.mozilla.telemetry.views.SyncView", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) sync_view_bigquery_load = SubDagOperator(subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="sync_view_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="sync_summary", dataset_version="v2", gke_cluster_name="bq-load-gke-1", bigquery_dataset="telemetry_derived", ), task_id="sync_view_bigquery_load", dag=dag) sync_events_view = EMRSparkOperator( task_id="sync_events_view", job_name="Sync Events View", execution_timeout=timedelta(hours=10), instance_count=1, email=['*****@*****.**'], env=tbv_envvar( "com.mozilla.telemetry.views.SyncEventView", {
####################################################################################
# Task created by instantiating the DummyOperator
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Task to create, insert from s3_bucket/udacity-dend/song_data and check the staging songs table
staging_songs_task_id = "staging_songs_subdag"
staging_songs_task = SubDagOperator(
    subdag=get_s3_to_redshift_subdag(
        "ETL_Sparkify_0",       # name of parent dag
        staging_songs_task_id,  # task_id
        "redshift",             # redshift_conn_id
        "aws_credential",       # aws_credentials_id
        create_tbl=CreateTables.staging_songs_table_create,
        target_table="staging_songs",
        sql_row=SqlQueries.has_rows,
        s3_bucket="udacity-dend",
        s3_key="song_data",
        custom=" json 'auto' compupdate off region 'us-west-2'",
        start_date=datetime.datetime(2018, 11, 1, 0, 0, 0, 0),
    ),
    task_id=staging_songs_task_id,
    depends_on_past=True,
    dag=dag)

# Task to create, insert from s3_bucket/udacity-dend/log_data and check the staging events table
staging_events_task_id = "staging_events_subdag"
staging_events_task = SubDagOperator(
    subdag=get_s3_to_redshift_subdag(
        "ETL_Sparkify_0",        # name of parent dag
        staging_events_task_id,  # task_id
        "redshift",              # redshift_conn_id
def nested_subdag_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'nested_cycle'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

    # cycle:
    #   A -> opSubdag_0
    #     cycle.opSubdag_0:
    #       -> opSubDag_A
    #          cycle.opSubdag_0.opSubdag_A:
    #            -> subdag_A.task
    #       -> opSubdag_B
    #          cycle.opSubdag_0.opSubdag_B:
    #            -> subdag_B.task
    #   A -> opSubdag_1
    #     cycle.opSubdag_1:
    #       -> opSubdag_C
    #          cycle.opSubdag_1.opSubdag_C:
    #            -> subdag_C.task -> subdag_C.task  >Invalid Loop<
    #       -> opSubDag_D
    #          cycle.opSubdag_1.opSubdag_D:
    #            -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG('nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG('nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            opSubdag_C_task = DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            # introduce a loop in opSubdag_C
            opSubdag_C_task.set_downstream(opSubdag_C_task)
            return subdag_C

        def subdag_D():
            subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="prerelease_aggregates",
        cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=10,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
        },
        additional_metadata={
            "PIP_PACKAGES": "git+https://github.com/mozilla/python_mozaggregator.git"
        },
        python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
            artifact_bucket
        ),
        py_args=[
            "aggregator",
            "--date", "{{ ds_nodash }}",
            "--channels", "nightly,aurora,beta",
            "--postgres-db", "telemetry",
            "--postgres-user", "root",
            "--postgres-pass", "{{ var.value.mozaggregator_postgres_pass }}",
            "--postgres-host", "{{ var.value.mozaggregator_postgres_host }}",
            "--postgres-ro-host", "{{ var.value.mozaggregator_postgres_ro_host }}",
            "--num-partitions", str(10 * 32),
        ] + (
            ["--source", "bigquery", "--project-id", "moz-fx-data-shared-prod"]
            if not EXPORT_TO_AVRO
            else [
                "--source", "avro",
                "--avro-prefix",
                "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
            ]
        ),
        gcp_conn_id=gcp_conn.gcp_conn_id,
        service_account=client_email,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)
    dag=dag6,
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(
    dag_id='test_scheduled_queued_tasks',
    start_date=DEFAULT_DATE,
    end_date=DEFAULT_DATE,
    default_args=default_args)
dag8_task1 = PythonOperator(
    # use delayed_fail because otherwise LocalExecutor will have a chance to
    # complete the task
    python_callable=delayed_fail,
    task_id='test_queued_task',
    dag=dag8,
    pool='test_queued_pool')
dag = DAG(
    dag_id='cm_load',
    default_args=args,
    schedule_interval=schedule_interval,
    start_date=datetime(2017, 1, 1),
    max_active_runs=1,
    # concurrency=1,
    catchup=False,
    dagrun_timeout=timedelta(minutes=24 * 60))  # DAG run times out after 1 day of running

sub_dag_extract_network_externals_task = SubDagOperator(
    subdag=extract_network_externals('cm_load', 'extract_network_externals',
                                     start_date=dag.start_date,
                                     schedule_interval=dag.schedule_interval),
    task_id='extract_network_externals',
    dag=dag,
)

sub_dag_cm_load_house_keeping_task = SubDagOperator(
    subdag=run_house_keeping_tasks('cm_load', 'cm_load_house_keeping',
                                   start_date=dag.start_date,
                                   schedule_interval=dag.schedule_interval),
    task_id='cm_load_house_keeping',
    dag=dag,
)

sub_dag_parse_and_import_eri_3g4g_cm_files = SubDagOperator(
    subdag=parse_and_import_eri_3g4g('cm_load',
task_id="clients_daily_keyed_histogram_aggregates", project_id=project_id, source_dataset_id=dataset_id, sample_size=PERCENT_RELEASE_WINDOWS_SAMPLING, overwrite=False, probe_type="keyed_histogram", get_logs=False, dag=dag, ) clients_histogram_aggregates = SubDagOperator( subdag=histogram_aggregates_subdag( GLAM_DAG, GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG, default_args, dag.schedule_interval, dataset_id, ), task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG, executor=get_default_executor(), dag=dag, ) histogram_percentiles = bigquery_etl_query( task_id="histogram_percentiles", destination_table="histogram_percentiles_v1", dataset_id=dataset_id, project_id=project_id, owner="*****@*****.**", date_partition_parameter=None, arguments=("--replace", ), dag=dag,
def create_subdag_2(parent_dag_id, subdag_name, schedule_interval):
    with DAG(dag_id='{}.{}'.format(parent_dag_id, subdag_name),
             schedule_interval=schedule_interval,
             catchup=False,
             default_args=default_args) as subdag:
        task = BashOperator(task_id='task',
                            bash_command='echo "Sub-DAG 2 executed !!"')
    return subdag

with DAG(dag_id='08_subdags',
         schedule_interval='*/10 * * * *',
         catchup=False,
         default_args=default_args) as dag:

    sub_dag_1_name = 'sub_dag_1'
    sub_dag_1_task = SubDagOperator(
        subdag=create_subdag_1(dag.dag_id, sub_dag_1_name, dag.schedule_interval),
        task_id=sub_dag_1_name)

    foo = DummyOperator(task_id='foo')

    sub_dag_1_task >> foo

    sub_dag_2_name = 'sub_dag_2'
    sub_dag_2_task = SubDagOperator(
        subdag=create_subdag_2(dag.dag_id, sub_dag_2_name, dag.schedule_interval),
        task_id=sub_dag_2_name)

    foo >> sub_dag_2_task
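create_subdag_1 is referenced above but defined outside this excerpt; a minimal sketch, assuming it mirrors create_subdag_2 (the bash_command text is hypothetical), would look like:

# Hypothetical sketch of create_subdag_1, mirroring create_subdag_2 above.
# Only the '{parent_dag_id}.{subdag_name}' dag_id convention is taken from the
# shown code; the task body is an assumption.
def create_subdag_1(parent_dag_id, subdag_name, schedule_interval):
    with DAG(dag_id='{}.{}'.format(parent_dag_id, subdag_name),
             schedule_interval=schedule_interval,
             catchup=False,
             default_args=default_args) as subdag:
        BashOperator(task_id='task',
                     bash_command='echo "Sub-DAG 1 executed !!"')
    return subdag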
    'retry_delay': datetime.timedelta(minutes=10),
    'schedule_interval': '0 1 * * *',
}

dag_name = 'bq_events_to_amplitude'

with models.DAG(dag_name, default_args=default_args) as dag:
    fenix_task_id = 'fenix_amplitude_export'
    SubDagOperator(
        subdag=export_to_amplitude(
            dag_name=fenix_task_id,
            parent_dag_name=dag_name,
            default_args=default_args,
            project='moz-fx-data-derived-datasets',
            dataset='telemetry',
            table_or_view='fenix_events_v1',
            s3_prefix='fenix',
        ),
        task_id=fenix_task_id)

    fennec_ios_task_id = 'fennec_ios_amplitude_export'
    fennec_ios_args = default_args.copy()
    fennec_ios_args["start_date"] = datetime.datetime(2019, 12, 2)
    SubDagOperator(
        subdag=export_to_amplitude(
            dag_name=fennec_ios_task_id,
            parent_dag_name=dag_name,
            default_args=fennec_ios_args,
            project='moz-fx-data-shared-prod',
            dataset='telemetry',
            table_or_view='fennec_ios_events_v1',
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert)

load_user_dimension_table_task_id = 'Load_user_dim_table'
load_user_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_user_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
        table="users",
        sql_query=SqlQueries.user_table_insert,
    ),
    task_id=load_user_dimension_table_task_id,
    dag=dag,
)

load_song_dimension_table_task_id = 'Load_song_dim_table'
load_song_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_song_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
crash_report_parquet = SubDagOperator(
    task_id="hardware_report",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="hardware_report",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Firefox_Hardware_Report",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/hardware_report.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={
            'PIP_PACKAGES': "google-cloud-bigquery==1.21.0 python_moztelemetry==0.10.2 boto3==1.9.87 click==6.7 click_datetime==0.2 requests-toolbelt==0.8.0 requests==2.20.1 typing==3.6.4"
        },
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        py_args=[
            "--start_date", DS_WEEKLY,
            "--bucket", "telemetry-public-analysis-2",
            "--spark-provider", "dataproc",
        ],
        idle_delete_ttl='14400',
        num_workers=15,
        worker_machine_type='n1-standard-4',
        gcp_conn_id=gcp_conn_id))
"input_event_response_coalesced_ms_main_above_2500", "input_event_response_coalesced_ms_content_above_150", "input_event_response_coalesced_ms_content_above_250", "input_event_response_coalesced_ms_content_above_2500", "ghost_windows_main_above_1", "ghost_windows_content_above_1", ] main_summary_export = SubDagOperator(subdag=export_to_parquet( table= "moz-fx-data-shared-prod:telemetry_derived.main_summary_v4${{ds_nodash}}", static_partitions=["submission_date_s3={{ds_nodash}}"], arguments=[ "--partition-by=sample_id", "--replace='{{ds_nodash}}' AS submission_date", "--maps-from-entries", ] + main_summary_bigint_columns, parent_dag_name=dag.dag_id, dag_name="main_summary_export", default_args=default_args, num_workers=40), task_id="main_summary_export", executor=get_default_executor(), dag=dag) clients_daily_export = SubDagOperator( subdag=export_to_parquet( table= "moz-fx-data-shared-prod:telemetry_derived.clients_daily_v6${{ds_nodash}}", static_partitions=["submission_date_s3={{ds_nodash}}"], arguments=[ # restore legacy schema
    instance_count=10,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

churn_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="churn_bigquery_load",
    dag=dag)

churn_v2 = MozDatabricksSubmitRunOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
modules_with_missing_symbols = SubDagOperator(
    task_id="modules_with_missing_symbols",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        image_version="1.5",
        dag_name="modules_with_missing_symbols",
        default_args=default_args,
        cluster_name="modules-with-missing-symbols-{{ ds }}",
        job_name="modules-with-missing-symbols",
        python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)},
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
            "spark-env:AWS_ACCESS_KEY_ID": ses_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key,
        },
        py_args=[
            "--run-on-days", "0",  # run monday
            "--date", "{{ ds }}",
        ],
        idle_delete_ttl="14400",
        num_workers=2,
        worker_machine_type="n1-standard-4",
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        storage_bucket=params.storage_bucket,
    ),
)
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.views.SyncView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"}),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="sync_view_bigquery_load",
    dag=dag)

sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar("com.mozilla.telemetry.views.SyncEventView", {
        "from": "{{ ds_nodash }}",