sql='/sql/fact_reviews.sql', postgres_conn_id='redshift') process_fact_reviews.set_upstream( [process_dim_times, process_dim_users, process_dim_business]) process_fk = PostgresOperator(dag=dag, task_id='process_foreign_keys', sql='/sql/dim_fk.sql', postgres_conn_id='redshift') process_fk.set_upstream([process_fact_tips, process_fact_reviews]) run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks', dag=dag, redshift_conn_id='redshift', queries=({ "table": "dim_times", "where": "day IS NULL", "result": 0 }, { "table": "fact_review", "where": "user_id IS NULL", "result": 0 }, { "table": "fact_review", "result": 6685900 })) run_quality_checks.set_upstream(process_fk) end_operator = DummyOperator(dag=dag, task_id='end_operator') end_operator.set_upstream(run_quality_checks)
create_table_fact_airbnb.set_upstream(load_table_dim_reviews) load_table_fact_airbnb_austin_la.set_upstream(create_table_fact_airbnb) ## RUN DATA QUALITY CHECKS TO ENSURE Recors have been moved correctly through platforms without any errors run_quality_checks = DataQualityOperator(task_id='Run_DATA_QUALITY_CHECKS', dag=dag, provide_context=True, redshift_conn_id='redshift', tables=[ 'DIM_HOSTS', 'DIM_REVIEWS', 'DIM_CALENDARS', 'DIM_PROPERTIES', 'FACT_AIRBNB_AUSTIN_LA' ]) run_quality_checks.set_upstream(load_table_fact_airbnb_austin_la) end_operator = DummyOperator(task_id='END_TASK', dag=dag) end_operator.set_upstream(run_quality_checks) ''' start_operator >> [create_staging_listings_table, create_staging_calendar_table, create_staging_reviews_table] >> [stage_listings_to_redshift, stage_calendars_to_redshift, stage_reviews_to_redshift] >> MID_operator MID_operator >> [create_table_dim_hosts, create_table_dim_properties, create_table_dim_reviews, create_table_dim_calendar] >> [load_table_dim_hosts,
# Populate the time dimension, truncating any previous contents first.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table_target='time',
    dag=dag,
    redshift_connection_id='redshift',
    query=SqlQueries.time_table_insert,
    truncate_before=True,
)

# Validate every warehouse table once all loads have finished.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_connection_id='redshift',
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
)

# Terminal no-op marking a successful run.
end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG shape: staging fans out from start, the fact load waits on both
# staging tasks, the four dimension loads fan out from the fact load,
# and the quality checks gate the end of the pipeline.
dimension_loads = [
    load_song_dimension_table,
    load_user_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> dimension_loads
dimension_loads >> run_quality_checks
run_quality_checks >> end_operator
dag=dag, query=SqlQueries.calendars_table_insert, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", operation="insert", table="DIM_CALENDARS") load_dim_calendars_table.set_upstream(create_dim_calendars_table) create_load_fact_airbnb_amst_table = LoadFactOperator( task_id='Create_Load_FACT_AIRBNB_AMST_Table', dag=dag, query=SqlQueries.CREATE_LOAD_FACT_AIRBNB_AMST, redshift_conn_id="redshift", aws_credentials_id="aws_credentials") create_load_fact_airbnb_amst_table.set_upstream(load_dim_hosts_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_reviews_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_properties_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_calendars_table) ##RUN DATA QULAITY CHECKS TO ENSURE THAT RECORDS HAD BEEN MOVED CORRECTLY THROUGH PLATFORMS WITHOUT ANY ERRORS run_quality_checks = DataQualityOperator(task_id='Run_DATA_QUALITY_Checks', dag=dag, redshift_conn_id="redshift") run_quality_checks.set_upstream(create_load_fact_airbnb_amst_table) ##DUMMY OPERATOR to indicate that the DAG has run successfully - DAG end_operator = DummyOperator(task_id='END_OPERATOR', dag=dag) end_operator.set_upstream(run_quality_checks)