# =============================================================================
# Task definitions
# =============================================================================
dag = DAG('sparkify_ELT',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    s3_key="log_data",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    json_path="s3://udacity-dend/log_json_path.json",
    region="us-west-2",
    overwrite=True)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    s3_key="song_data",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_songs",
    s3_bucket="udacity-dend",
    json_path="auto",
    description='Inserts data into the fact and dimension tables in Redshift',
    schedule_interval='@hourly',
    default_args=default_args)

# Operators
start_operator = DummyOperator(
    task_id='start_insert_tables',
    dag=dag)

stage_events = StageToRedshiftOperator(
    task_id='staging_events',
    dag=dag,
    create_table_sql=create_tables.staging_events,
    s3_bucket='udacity-dend',
    s3_key='log_data',
    schema='public',
    table='staging_events',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    copy_options=["JSON 'auto ignorecase'"])

stage_songs = StageToRedshiftOperator(
    task_id='staging_songs',
    dag=dag,
    create_table_sql=create_tables.staging_songs,
    s3_bucket='udacity-dend',
    s3_key='song_data',
    schema='public',
    table='staging_songs',
    redshift_conn_id='redshift',
dag = DAG('udaci_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@daily')

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql="create_tables.sql")

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    json_path="s3://udacity-dend/log_json_path.json",
    file_type='json',
    redshift_conn_id='redshift',
    aws_conn_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    json_path="auto",
    file_type='json',
    redshift_conn_id='redshift',
    aws_conn_id="aws_credentials",
    s3_bucket="udacity-dend",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.drop_staging_tables)

pres_staging_table_create = PostgresOperator(
    task_id="pres_staging_table_create",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_staging_table_create)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    provide_context=True,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key="{{ execution_date.year }}_{{ ds[5:7] }}/T{{ execution_date.year }}{{ ds[5:7] }}PDPI_BNFT",
    header=True)

pres_fact_table_insert = PostgresOperator(
    task_id="pres_fact_table_insert",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_fact_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
# dag = DAG('udac_example_dag',
#           default_args=default_args,
#           description='Load and transform data in Redshift with Airflow',
#           schedule_interval='0 * * * *'
#           )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table_name='staging_events',
    redshift_conn_id='redshift',
    s3_bucket='udacity-dend',
    s3_key='log_data/2018/11/{ds}-events.json',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-west-2',
    provide_context=True)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table_name='staging_songs',
    redshift_conn_id='redshift',
    s3_bucket='udacity-dend',
    s3_key='song_data/',
    aws_credentials={
        'key': AWS_KEY,
def stage_dim_s3_to_redshift(
        parent_dag_name,
        child_dag_name,
        start_date,
        end_date,
        schedule_interval,
        redshift_conn_id,
        s3_data,
        create_sql,
        table,
        s3_bucket,
        s3_key,
        iam_role,
        region,
        file_format,
        *args, **kwargs):
    """
    Subdag used to create a dimension table, copy data from S3 into the
    Redshift dimension table and, lastly, perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in the `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define the subdag ID
    start_date -- DAG start date
    end_date -- DAG end date
    schedule_interval -- DAG schedule interval (e.g. '@monthly', '@weekly')
    redshift_conn_id -- Redshift connection ID (str)
    s3_data -- S3 data set name (str)
    create_sql -- Create dimension table query, with a placeholder for the table name (str)
    table -- Dimension table name (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_key -- AWS S3 bucket data directory/file (str)
    iam_role -- IAM role used by the Redshift COPY command (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files (currently only 'JSON' or 'CSV') (str)
    """
    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    start_task = DummyOperator(task_id=f'{table}', dag=dag)

    create_task = CreatedTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql.format(table),
        table=table,
        provide_context=True
    )

    copy_task = StageToRedshiftOperator(
        task_id=f'staging_{table}_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        iam_role=iam_role,
        s3_data=s3_data,
        region=region,
        file_format=file_format,
        provide_context=True
    )

    check_task = DataQualityOperator(
        task_id=f'data_quality_check_{table}',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        provide_context=True
    )

    start_task >> create_task
    create_task >> copy_task
    copy_task >> check_task

    return dag
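# Usage sketch (illustrative, not from the source project): one way the factory
# above could be attached to a parent DAG through Airflow's SubDagOperator. The
# parent `main_dag` object, the user-dimension arguments, the
# CREATE_USERS_TABLE_SQL template and the IAM_ROLE_ARN variable are assumptions
# made for this example.
from airflow.operators.subdag_operator import SubDagOperator

stage_users_dim_subdag = SubDagOperator(
    task_id='stage_users_dim_subdag',
    dag=main_dag,  # assumed parent DAG object defined in main_dag.py
    subdag=stage_dim_s3_to_redshift(
        parent_dag_name=main_dag.dag_id,
        child_dag_name='stage_users_dim_subdag',   # must match the task_id above
        start_date=main_dag.start_date,
        end_date=main_dag.end_date,
        schedule_interval=main_dag.schedule_interval,
        redshift_conn_id='redshift',
        s3_data='user_data',                       # assumed S3 data set name
        create_sql=CREATE_USERS_TABLE_SQL,         # assumed CREATE TABLE template with a {} placeholder
        table='users',
        s3_bucket='udacity-dend',
        s3_key='user_data',
        iam_role=IAM_ROLE_ARN,                     # assumed IAM role for the COPY command
        region='us-west-2',
        file_format='JSON',
    ),
)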
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_redshift_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift"
)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    table="events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
    file_format="JSON",
    execution_date=start_date
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    provide_context=True,
    table="songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    'catchup_by_default': False,
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@once',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_categories_to_redshift = StageToRedshiftOperator(
    task_id='stage_categories',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    table='staging_category',
    s3_bucket='podcast-project',
    s3_key='categories.csv',
    dag=dag)

stage_podcast_to_redshift = StageToRedshiftOperator(
    task_id='stage_podcast',
    redshift_conn_id='redshift',
    table='staging_podcast',
    s3_bucket='podcast-project',
    s3_key='podcast.csv',
    dag=dag)

stage_reviews_to_redshift = StageToRedshiftOperator(
    task_id='stage_reviews',
    redshift_conn_id='redshift',
drop_all_tables = PostgresOperator(task_id="drop_all_tables",
                                   dag=dag,
                                   postgres_conn_id="my_redshift_conn",
                                   sql=sql_queries_cloud.drop_all_tables)

create_all_tables_if_not_exist = PostgresOperator(
    task_id="create_all_tables_if_not_exist",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.create_all_tables)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key="2019_12/T201912PDPI_BNFT",
    header=True)

gp_prac_staging_table_populate = StageToRedshiftOperator(
    task_id="gp_prac_staging_table_populate",
    dag=dag,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="gp_pracs_staging_table",
    s3_bucket="prescribing-data",
    s3_key="2019_11/T201911ADDR_BNFT",
    header=False)
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          # schedule_interval='@once'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    target_table="staging_events",
    sql_table_create=SqlQueries.staging_events_table_create,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    json_file="s3://udacity-dend/log_json_path.json",
    region="us-west-2")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    target_table="staging_songs",
    sql_table_create=SqlQueries.staging_songs_table_create,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data",
}

dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data from S3 to Redshift with Airflow',
    schedule_interval='@hourly',
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    aws_credentials_id="aws_credentials",
    iam_role="Redshift_Read_S3",
    redshift_conn_id="redshift",
    s3_json_structure_path="s3://udacity-redshift/log_paths.json",
    s3_data_path="s3://udacity-dend/log_data",
    table='staging_logs')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    aws_credentials_id="aws_credentials",
    iam_role="Redshift_Read_S3",
    redshift_conn_id="redshift",
    s3_json_structure_path="s3://udacity-redshift/song_paths.json",
    s3_data_path="s3://udacity-dend/song_data",
    table='staging_songs')
create_staging_events_table = PostgresOperator(
    task_id="create_staging_events_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_tables.CREATE_staging_events_TABLE_SQL)

create_staging_songs_table = PostgresOperator(
    task_id="create_staging_songs_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_tables.CREATE_staging_songs_TABLE_SQL)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data/2018/11/2018-11-01-events.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data/A/B/C/TRABCEI128F424C983.json")

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag,
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          schedule_interval='0 * * * *',
          description='Load and transform data in Redshift with Airflow')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='public.staging_events',
    s3_bucket='udacity-dend',
    s3_key="log_data",
    redshift_conn_id='redshift',
    aws_credentials_id="aws_credentials",
    json="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='public.songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    json='s3://udacity-dend/song_data',
    s3_bucket='udacity-dend',
    s3_key='song_data')
dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval=None  # '0 * * * *'
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data/",
    # "log_data/{execution_date.year}/{execution_date.month}/{ds}-events.json",
    # "log_data/2018/11/2018-11-12-events.json",
    aws_region="us-west-2",
    json="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_songs",
    s3_bucket="udacity-dend",
    s3_key="song_data/",
    aws_region="us-west-2",
dag = DAG('udacity_music_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly',
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Stage_events
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_events',
    s3_src_bucket='udacity-dend',
    s3_src_pattern='log_data',
    jsonpaths='s3://udacity-dend/log_json_path.json',
    data_format='json'
)

# Stage_songs
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    provide_context=True,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_song',
    s3_src_bucket='udacity-dend',
    s3_src_pattern='song_data/A/A/A',
# DAG initialization
dag = DAG('airflow_de_project',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Call the StageToRedshift custom operator to load data into the staging events table
stage_log_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_connection_id='redshift',
    table_name='staging_events',
    aws_credential_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log-data/{execution_date.year}/{execution_date.month}',
    json_path="s3://udacity-dend/log_json_path.json",
    dag=dag
)

# Call the StageToRedshift custom operator to load data into the songs staging table
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    redshift_connection_id='redshift',
    table_name='staging_songs',
    aws_credential_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='song-data/A/A',
    json_path="auto",
    dag=dag
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 sql='create_tables.sql',
                                 postgres_conn_id='redshift')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log_data/',
    aws_region='us-west-2',
    jsonpath='log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='staging_songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='song_data',
    aws_region='us-west-2')
}

dag = DAG('load_songs_and_events',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          max_active_runs=1,
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log-data/{execution_date.year}/{execution_date.month}/{execution_date.year}-{execution_date.month}-{execution_date.day}-events.json",
    format_json=Variable.get('json_event_format',
                             default_var=default_json_event_format))

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data/",
    format_json=Variable.get('json_song_format',
    'kpi_item': 'utilisateur',
    'S3_key': 'Utilisateur'
}, {
    'kpi_item': 'enseigne',
    'S3_key': 'enseignes'
}, {
    'kpi_item': 'magasin',
    'S3_key': 'magasin'
}]

for kpi_item in kpi_items:
    stage_to_redshift = StageToRedshiftOperator(
        task_id=f"stage_{kpi_item['kpi_item']}_to_redshift",
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table=f"staging_{kpi_item['kpi_item']}",
        S3_bucket="darties",
        S3_key=kpi_item['S3_key'],
        delimiter=",",
        formatting="JSON 'auto'")
    stage_to_redshifts.append(stage_to_redshift)

### Build and load dimensions
milestone_1 = DummyOperator(task_id='milestone_1', dag=dag)

build_dimension_tables = []
dimension_items = ["temps", "famille_produit"]
for dimension_item in dimension_items:
    build_dimension_table = BuildDimensionOperator(
        task_id=f"build_{dimension_item}_dimension_table",
    'sparkify-etl-dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    catchup=False,
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    json_path='s3://udacity-dend/log_json_path.json',
    sql=SqlQueries.staging_table_copy,
    provide_context=True,
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_songs',
    s3_bucket='udacity-dend',
    s3_key='song_data',
    json_path='auto',
stage_immigration_to_redshift = StageParquetToRedshiftOperator(
    task_id='Stage_immigration',
    dag=dag,
    table='staging_immigration',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key='capstone_immigration/immigration_parquet',  # s3 does not support wildcard such as *
    iam_role_arn=AWS_IAM_ROLE_ARN)

stage_states_to_redshift = StageToRedshiftOperator(
    task_id='Stage_states',
    dag=dag,
    table='states',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key='capstone_immigration/states')

stage_airport_code_to_redshift = StageToRedshiftOperator(
    task_id='Stage_airport_code',
    dag=dag,
    table='airport_code',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key='capstone_immigration/airport_code')

stage_countries_to_redshift = StageToRedshiftOperator(
    task_id='Stage_countries',
dag = DAG('capstone_main_dag',
          default_args=default_args,
          start_date=datetime.datetime.now() - datetime.timedelta(days=1),
          description='Load and transform data in Redshift with Airflow',
          schedule_interval=None)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_immigr_to_redshift = StageToRedshiftOperator(
    task_id='Stage_immigr',
    dag=dag,
    table_name='staging_immigr',
    redshift_conn_id='redshift',
    s3_bucket='capstone-bucket-immigr',
    s3_key='staging_immigr.csv',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-east-1',
    provide_context=True)

stage_demo_to_redshift = StageToRedshiftOperator(
    task_id='Stage_demo',
    dag=dag,
    table_name='staging_demo',
    redshift_conn_id='redshift',
    s3_bucket='capstone-bucket-demo',
    s3_key='demo',
    aws_credentials={
    'catchup': False
}

dag = DAG('dag_s3_to_redshift',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    target_table_name="staging_events",
    s3_data_path="s3://udacity-dend/log_data",
    json_schema="s3://udacity-dend/log_json_path.json",
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    target_table_name="staging_songs",
    s3_data_path="s3://udacity-dend/song_data",
    json_schema="auto",
)
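# Illustrative sketch (not part of any project above): the core of a custom
# StageToRedshiftOperator like the ones these DAGs import. It borrows the
# parameter names of the snippet directly above (target_table_name,
# s3_data_path, json_schema) and assumes standard AwsHook/PostgresHook
# connections; the real operators in each project differ in signature and options.
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class StageToRedshiftOperator(BaseOperator):
    copy_sql = """
        COPY {table}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{access_key}'
        SECRET_ACCESS_KEY '{secret_key}'
        FORMAT AS JSON '{json_schema}'
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", aws_credentials_id="",
                 target_table_name="", s3_data_path="", json_schema="auto",
                 *args, **kwargs):
        super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.target_table_name = target_table_name
        self.s3_data_path = s3_data_path
        self.json_schema = json_schema

    def execute(self, context):
        # Fetch AWS credentials, empty the staging table, then COPY from S3.
        credentials = AwsHook(self.aws_credentials_id).get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        redshift.run("TRUNCATE {}".format(self.target_table_name))
        redshift.run(StageToRedshiftOperator.copy_sql.format(
            table=self.target_table_name,
            s3_path=self.s3_data_path,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
            json_schema=self.json_schema))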
        'data/transformed_citibike_data.csv',
        'bucket_name': 'ud-covid-citibike',
        'key': 'citibike'
    },
    dag=dag)

table_creation = PostgresOperator(task_id='tables_creation',
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql='/create_tables.sql')

s3_dates_to_redshift = StageToRedshiftOperator(
    task_id='s3_dates_to_redshift',
    dag=dag,
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket='ud-covid-citibike',
    s3_key='dates',
    table="dates",
    file_format='CSV')

s3_bike_to_redshift = StageToRedshiftOperator(
    task_id='s3_bike_to_redshift',
    dag=dag,
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket='ud-covid-citibike',
    s3_key='citibike',
    table="bike",
    file_format='CSV')
             0)  # end time for debugging, so the DAG runs 10 times
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create tables in Redshift to store S3 data
create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    # 'log_data' loads all events; to restrict to the execution year use
    # 'log_data/{{ execution_date.year }}' -- S3 does not support wildcards such as *
    s3_key='log_data',
    json_format='s3://udacity-dend/log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='staging_songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    # load a small portion of song data with 'song_data/A/A/A'
    s3_key='song_data',
    json_path='auto')
def stage_fact_s3_to_redshift(
        parent_dag_name,
        child_dag_name,
        start_date,
        end_date,
        schedule_interval,
        redshift_conn_id,
        degree_list,
        s3_data,
        create_sql,
        s3_bucket,
        s3_key,
        iam_role,
        region,
        file_format,
        extra_copy_parameters='',
        *args, **kwargs):
    """
    Subdag used to create a staging table, copy data from S3 to the staging
    table in Redshift and, lastly, perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in the `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define the subdag ID
    start_date -- DAG start date
    end_date -- DAG end date
    schedule_interval -- DAG schedule interval (e.g. '@monthly', '@weekly', etc.)
    redshift_conn_id -- Redshift connection ID (str)
    degree_list -- List of degree names (list)
    s3_data -- S3 data name used to format the staging table name (str)
    create_sql -- SQL used to create the staging table, with a placeholder for the table name (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_key -- AWS S3 bucket data directory/file (str)
    iam_role -- IAM role used by the Redshift COPY command (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files (currently only: 'JSON' or 'CSV') (str)
    extra_copy_parameters -- Additional parameters appended to the COPY command (str)
    """
    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    for degree in degree_list:
        table = f'{degree}_{s3_data}'
        error_table = f'{table}_errors'

        start_task = DummyOperator(task_id=f'{degree}', dag=dag)

        create_task = CreatedTableOperator(
            task_id=f'create_{table}_table',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            create_sql=create_sql.format(table),
            table=table,
            provide_context=True
        )

        copy_task = StageToRedshiftOperator(
            task_id=f'staging_{table}_table',
            dag=dag,
            table=table,
            redshift_conn_id=redshift_conn_id,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            iam_role=iam_role,
            s3_data=s3_data,
            degree=degree,
            region=region,
            file_format=file_format,
            extra_copy_parameters=extra_copy_parameters,
            provide_context=True
        )

        # push count to XCom for stl count comparison
        count_check_task = DataQualityOperator(
            task_id=f'data_quality_check_{table}',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            table=table,
            provide_context=True
        )

        check_stl_branch = STLCheckOperator(
            task_id=f'stl_check_{table}',
            dag=dag,
            table=table,
            error_table=error_table,
            redshift_conn_id=redshift_conn_id
        )

        staging_success_task = PythonOperator(
            task_id=f'staging_success_check_{table}',
            python_callable=staging_success_check,
            op_kwargs={'redshift_conn_id': redshift_conn_id,
                       'table': table,
                       'error_table': error_table},
            dag=dag,
            provide_context=True
        )

        start_task >> create_task
        create_task >> copy_task
        copy_task >> [check_stl_branch, count_check_task]
        check_stl_branch >> staging_success_task

    return dag
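# Illustrative sketch (not from the source): a possible implementation of the
# `staging_success_check` callable referenced by the PythonOperator above,
# assuming a PostgresHook connection. The real project's callable may apply
# different success criteria.
from airflow.hooks.postgres_hook import PostgresHook


def staging_success_check(redshift_conn_id, table, error_table, **context):
    """Fail the staging run if any rows were diverted to the error table."""
    redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
    loaded = redshift.get_first("SELECT COUNT(*) FROM {}".format(table))[0]
    errored = redshift.get_first("SELECT COUNT(*) FROM {}".format(error_table))[0]
    if errored > 0:
        raise ValueError(
            "Staging of {} finished with {} rejected rows (see {}); "
            "{} rows loaded.".format(table, errored, error_table, loaded))
    return loaded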
# Instantiate DAG
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

# 1. Dummy task - no functionality
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# 2. Copy log files to staging table in Redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region='us-west-2',
    json_path="s3://udacity-dend/log_json_path.json")

# 3. Copy song files to staging table in Redshift
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_songs',
    s3_bucket="udacity-dend",
    s3_key="song_data",
    region='us-west-2',
dag = DAG('udac_sparkify_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval="@daily",
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=Variable.get('s3_bucket'),
    s3_key=Variable.get('s3_key_log_data'),
    log_json_path=Variable.get('s3_key_log_data_json_path'),
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=Variable.get('s3_bucket'),
    s3_key=Variable.get('s3_key_song_data'),
    redshift_conn_id='redshift',
    dag=dag
)

"""
connecting to S3
connecting to redshift
running the StageToRedshiftOperator operator
"""
stage_events_to_redshift = StageToRedshiftOperator(
    task_id="stage_events_to_redshift",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    json="s3://udacity-dend/log_json_path.json"
)

"""
connecting to S3
connecting to redshift
running the StageToRedshiftOperator operator
"""
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id="stage_songs_to_redshift",
    dag=dag,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_redshift_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift"
)

logging.info('Starting staging to redshift')

stage_covid_to_redshift = StageToRedshiftOperator(
    task_id='Stage_covid',
    dag=dag,
    table="staging_covid",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_s3_connection",
    s3_bucket="udacity-data-lake",
    s3_key="covid19/staging",
    region="us-west-2",
    extra_params="delimiter ';'"
)

load_covid_cases_fact_table = LoadFactOperator(
    task_id='Load_covid_cases_fact_table',
    dag=dag,
    table='fact_covid_cases',
    redshift_conn_id="redshift",
    load_sql_stmt=SqlQueries.covid_cases_insert
)

load_dim_location_table = LoadDimensionOperator(