sql_statement=f"SELECT COUNT(*) FROM {table_name_staging_game_match}", result_to_assert=0, should_assert_for_equality=False, ) validator_fact_game_match = DataQualityValidator( sql_statement=f"SELECT COUNT(*) FROM {table_name_fact_game_match}", result_to_assert=0, should_assert_for_equality=False, ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', redshift_conn_id=AWS_REDSHIFT_CONN_ID, data_quality_validations=[ validator_staging_game_match, validator_fact_game_match ], dag=dag ) # OPERATORS start_operator = DummyOperator( task_id="Begin_Execution", dag=dag, ) # fetch_external_summoner_data_to_s3_task = DummyOperator( # task_id="Fetch_External_Summoner_To_S3_Data_Task", # dag=dag, # ) fetch_external_champion_to_s3_data_task = FetchAndStageChampionsExternalData(
, dag=dag , task_id='Load_Special_Dimension_Table' ) city_dim_table = LoadDimensionOperator(table='city_dim' , sql_query=SqlQueries.city_dim_insert , redshift_conn_id=redshift_conn_id , dag=dag , task_id='Load_City_Dimension_Table' ) # Check Data Quality run_quality_checks = DataQualityOperator(tables=['candidate_fact' , 'city_fact' , 'candidate_dim' , 'student_dim' , 'special_dim' , 'city_dim'] , redshift_conn_id=redshift_conn_id , dag=dag , task_id='Checking_Data_Quality' ) # Ending Operator end_operator = DummyOperator(task_id='Stop_Execution') ################################# # DAG: Start loading data into S3 start_operator >> brazil_to_s3 start_operator >> enem_to_s3 # DAG: Load data from S3 into Redshift stage tables
select_sql=SqlQueries.insert_artists_table, mode='truncate') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', select_sql=SqlQueries.insert_time_table, mode='truncate') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, check_stmts=[{ 'sql': 'SELECT COUNT(*) FROM songplays;', 'op': 'gt', 'val': 0 }, { 'sql': 'SELECT COUNT(*) FROM songplays WHERE songid IS NULL;', 'op': 'eq', 'val': 0 }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependencies start_operator >> create_table_staging_songs start_operator >> create_table_staging_events start_operator >> create_table_songplays start_operator >> create_table_artists start_operator >> create_table_songs start_operator >> create_table_users
redshift_conn_id='redshift', query_list=dim_tables_insert_queries, query_type='insert', dag=dag, ) insert_facts_tables = RedshifQueriesOperator( task_id='insert_facts_tables', redshift_conn_id='redshift', query_list=facts_tables_insert_queries, query_type='insert', dag=dag, ) run_quality_checks = DataQualityOperator(task_id='run_quality_checks', redshift_conn_id='redshift', tables_list=tables_list, dag=dag) clean_demography_csv >> copy_demography_to_S3 clean_airports_csv >> copy_airports_to_S3 clean_immigration_parquet >> copy_immigration_to_S3 [copy_demography_to_S3, copy_airports_to_S3, copy_immigration_to_S3 ] >> drop_redshift_tables >> create_redshift_tables >> [ stage_demography_to_redshift, stage_airports_to_redshift, stage_immigration_to_redshift ] >> insert_dim_tables >> insert_facts_tables >> run_quality_checks
drop_tables = SQLFileOperator(dag=dag, task_id='drop_tables', query_file='drop_tables.sql', message='Dropping tables', should_run=should_run) create_tables = SQLFileOperator(dag=dag, task_id='create_tables', query_file='create_tables.sql', message='Creating tables', should_run=should_run) copy_immigration_table = CSVToTableOperator(dag=dag, task_id='copy_immigration_table', should_run=should_run) copy_airport_codes_table = CSVToTableOperator( dag=dag, task_id='copy_airport_codes_table', should_run=should_run) copy_global_temperatures_table = CSVToTableOperator( dag=dag, task_id='copy_global_temperatures_table', should_run=should_run) copy_us_cities_table = CSVToTableOperator(dag=dag, task_id='copy_us_cities_table', should_run=should_run) quality_checks_task = DataQualityOperator(dag=dag, task_id='data_quality_checks', quality_checks=quality_checks, should_run=should_run) (start_operator >> drop_tables >> create_tables >> [ copy_immigration_table, copy_airport_codes_table, copy_global_temperatures_table, copy_us_cities_table ] >> quality_checks_task >> end_operator)
dag=dag, table="d_port", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket=bucket, s3_key='immigration_processed_files/port/I94_Port.csv', file_format='csv', create_stmt=SqlQueries.create_table_d_port) # Check loaded data not null run_quality_checks_airports = DataQualityOperator( task_id="run_quality_checks_airports", dag=dag, redshift_conn_id="redshift", dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM staging_airport WHERE ident is null", 'expected_result': 0 }]) run_quality_checks_us_cities_demo = DataQualityOperator( task_id="run_quality_checks_us_cities_demo", dag=dag, redshift_conn_id="redshift", dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM staging_us_cities_demographics WHERE city is null", 'expected_result': 0 }])
redshift_conn_id="redshift", sql_query=SqlQueries.time_table_insert, mode="truncate-insert") # Task to perform quality checks on the data uploaded in Redshift run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, qa_check_list=[{ 'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM users", 'expected_result': 104 }, { 'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songplays WHERE playid is null AND userid IS NULL", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0 }], redshift_conn_id="redshift") end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # order of execution for the dag start_operator >> create_table
load_time_dimension_table = LoadDimensionOperator( task_id=load_time_dimension_table_task_id, redshift_conn_id="redshift", table="time", aws_credentials_id="aws_credentials", start_date= datetime(2018, 5, 1), sql_source=SqlQueries.time_table_insert, dag=dag ) run_quality_checks = DataQualityOperator( task_id=run_data_quality_checks_task_id, redshift_conn_id="redshift", table="time", dag=dag, provide_context=True, aws_credentials_id="aws_credentials", tables=["staging_events", "users", 'staging_songs', "songs", "artists", "time"] ) end_operator = DummyOperator(task_id=stop_execution_task_id, dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table
sql_load_data=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table="time", redshift_conn_id="redshift", sql_load_data=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", dq_query = 'SELECT COUNT(*) FROM public.songplays WHERE userid IS NULL', expected_result = 0 ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> stage_events_to_redshift create_tables >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table
insert_mode='truncate' ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', sql_query=SqlQueries.time_table_insert, insert_mode='truncate' ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', sql_query='select count(*) from songplays where userid is null;', expected_result=0 ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # # Task ordering for the DAG tasks # start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table
table="songplays", data_source=SqlQueries.songplay_table_insert, dag=dag) dim_tables_and_sources = [ ("users", SqlQueries.user_table_insert), ("songs", SqlQueries.song_table_insert), ("artists", SqlQueries.artist_table_insert), ("time", SqlQueries.time_table_insert), ] load_dimension_tables = LoadDimensionOperator(task_id='load_dim_tables', redshift_conn_id="redshift", tables=dim_tables_and_sources, dag=dag) run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks', redshift_conn_id="redshift", tables=dim_tables_and_sources, dag=dag) end_operator = DummyOperator(task_id='stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_dimension_tables load_dimension_tables >> run_quality_checks run_quality_checks >> end_operator
final_table=dim_artists_table_name, dql_sql=SqlQueries.artist_table_insert, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID, final_table=dim_time_table_name, dql_sql=SqlQueries.time_table_insert, dag=dag) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID, data_quality_validations=[ validator_stage_events, validator_stage_songs, validator_songplays, validator_songs, validator_artists, validator_events, validator_time, validator_users ], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependency setup start_operator >> db_setup_task db_setup_task >> [ stage_events_s3_to_redshift_and_validate_task, stage_songs_s3_to_redshift_and_validate_task, ] >> load_songplays_table_task
table='artists', select_sql=SqlQueries.artist_table_insert, append_insert=True, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id='redshift', table='time', select_sql=SqlQueries.time_table_insert, append_insert=True, dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id='redshift', sql=SqlQueries.check_count, expected_result=0, dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Operator Dependencies start_operator >> create_tables create_tables >> stage_events_to_redshift >> load_songplays_table create_tables >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table >> run_quality_checks load_songplays_table >> load_song_dimension_table >> run_quality_checks load_songplays_table >> load_artist_dimension_table >> run_quality_checks load_songplays_table >> load_time_dimension_table >> run_quality_checks run_quality_checks >> end_operator
load_visitor_arrival_fact_table = LoadFactOperator( task_id='Load_fact_visitor_arrival_table', dag=dag, redshift_conn_id='redshift', table='fact_visitor_arrival', select_query=SqlQueries.visitor_arrival_table_insert, truncate_insert=True ) run_has_rows_quality_checks = DataQualityOperator( task_id='Run_has_rows_data_quality_checks', dag=dag, redshift_conn_id='redshift', checks=[ {"test": "SELECT COUNT(*) FROM dim_port", "not_equals": "0"}, {"test": "SELECT COUNT(*) FROM dim_us_city", "not_equals": "0"}, {"test": "SELECT COUNT(*) FROM dim_us_state", "not_equals": "0"}, {"test": "SELECT COUNT(*) FROM dim_country", "not_equals": "0"}, {"test": "SELECT COUNT(*) FROM dim_date", "not_equals": "0"}, {"test": "SELECT COUNT(*) FROM fact_visitor_arrival", "not_equals": "0"} ] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Execution order start_operator >> [ stage_airport_codes_to_redshift, stage_countries_mapping_to_redshift, stage_travel_mode_mapping_to_redshift, stage_us_cities_to_redshift,
catchup=False ) start_data_to_redshift_operation = DummyOperator( task_id='Begin_Migrating_Data_To_Staging_Tables', dag=dag) end_data_to_redshift_operation = DummyOperator( task_id='Begin_Loading_Data_To_Fact_Dimension_Tables', dag=dag) end_of_tasks = DummyOperator( task_id='End_Of_Execution', dag=dag) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, conn_id='redshift', target_tables=["dim_airport_table", "dim_demographic_table", "dim_visitor_table", "fact_city_data_table"], ) for data_type, table_name in staging_tables.items(): stage_data_to_redshift = S3ToRedshiftOperator( task_id='Stage_' + data_type, dag=dag, table=table_name, drop_table=True, s3_bucket='udend-data', s3_folder=data_type, aws_connection_id='aws_credentials', redshift_connection_id='redshift', create_query=globals()[table_name], copy_options="json 'auto'"
table='artists', redshift_conn_id='redshift', sql_create=SqlQueries.artist_table_create, sql_select=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', redshift_conn_id='redshift', sql_create=SqlQueries.time_table_create, sql_select=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=['songplays', 'users', 'artists', 'songs'], sql_check=SqlQueries.check) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #---#---# Define Task Dependencies #---#---# start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_time_dimension_table load_songplays_table >> load_user_dimension_table
table='lounges', redshift_conn_id='redshift', sql=SqlQueries.lounges_table_insert) load_fact_ratings_table = LoadFactOperator( task_id='Load_fact_ratings_fact_table', dag=dag, redshift_conn_id='redshift', sql=SqlQueries.fact_ratings_table_insert) ensure_data_load_in_dims = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', test_cases=[ (SqlQueries.airlines_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), (SqlQueries.lounges_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), ]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_airlines_to_redshift start_operator >> stage_airports_to_redshift start_operator >> stage_lounges_to_redshift start_operator >> stage_seats_to_redshift stage_airlines_to_redshift >> load_airlines_dimension_table >> load_aircrafts_dimension_table >> load_passengers_dimension_table stage_airports_to_redshift >> load_airports_dimension_table >> load_lounges_dimension_table >> load_passengers_dimension_table
dag=dag) load_dim_foreign_table = LoadToRedshiftOperator( task_id='Load_Dim_Foreign_Table', aws_credentials_id='aws_credentials', redshift_conn_id='redshift', s3_bucket='my-bucket', s3_key='dim_foreign_table', table='dim_foreign_table', mode='truncate', dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_Data_Quality_Checks', redshift_conn_id='redshift', tables=[ 'fact_table', 'dim_state_table', 'dim_visa_table', 'dim_foreign_table' ], dag=dag) end_operator = DummyOperator(task_id='Stop_Execution', dag=dag) start_operator >> load_fact_table load_fact_table >> [ load_dim_state_table, load_dim_visa_table, load_dim_foreign_table ] [load_dim_state_table, load_dim_visa_table, load_dim_foreign_table ] >> run_quality_checks
) dim_operators = [ StageToRedshiftOperator( task_id=f"Copy_{dim_table_name}_dim_table", dag=dag, table=dim_table_name, conn_id=REDSHIFT_CONN_ID, aws_credentials_id=AWS_CREDENTIALS_ID, s3_bucket=INPUT_BUCKET, s3_key=f"{dim_table_name}.csv/*.csv", file_format="CSV", provide_context=True, ) for dim_table_name in dim_table_names ] run_quality_checks = DataQualityOperator( task_id="Run_data_quality_checks", dag=dag, conn_id=REDSHIFT_CONN_ID, tables=dim_table_names + [fact_table_name], ) end_operator = DummyOperator(task_id="Stop_execution", dag=dag) start_operator >> [process_sale, process_postcode] process_sale >> create_sale_tables >> dim_operators[:-1] + [stage_sale_table] process_postcode >> create_postcode_tables >> dim_operators[-1] dim_operators + [stage_sale_table] >> run_quality_checks run_quality_checks >> end_operator
load_covid19_fact_table = LoadTableOperator( task_id='load_covid19_fact_table', dag=dag, table='fact.us_covid_19', select_sql=SqlLoad.insert_covid_table, mode='truncate') load_accident_fact_table = LoadTableOperator( task_id='load_accident_fact_table', dag=dag, table='fact.us_accidents', select_sql=SqlLoad.insert_accident_table, mode='truncate') run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, check_stmts=data_quality_args) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependencies start_operator >> [ create_staging_schema, create_dim_schema, create_fact_schema ] [create_staging_schema, create_dim_schema, create_fact_schema ] >> schema_created schema_created >> [ create_table_staging_covid_19, create_table_staging_us_demographics,
parent_dag_name=dag_name, task_id=load_time_dimension_table_task_id, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", table="users", start_date= datetime(2018, 5, 1), sql_query=SqlQueries.time_table_insert, ), task_id=load_time_dimension_table_task_id, dag=dag ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, aws_credentials_id="aws_credentials", redshift_conn_id='redshift', tables=["songplay", "users", "song", "artist", "time"] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Setting tasks dependencies start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks
task_id='Load_i94visitors_fact', dag=dag, redshift_conn_id="redshift", table_query=SqlQueries.visitors_fact_insert) load_dates_dim = LoadDimensionOperator(task_id='Load_dates_dim', dag=dag, redshift_conn_id="redshift", table_query=SqlQueries.dates_dim_insert, table="dates_dim", truncate_flag='Y') run_quality_check = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", check_query= "select count(1) from public.i94visitors_fact where reasonforvisit is null", expected_count=10000) run_intg_check = IntegrityCheckOperator( task_id='Run_data_integrity_check', dag=dag, redshift_conn_id="redshift", check_query= "select count(airportid) from public.i94visitors_fact where airportid not in (select airportid from airports_dim)", table_name="i94visitors_fact") end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_i94vistors
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id='redshift', table='artists', sql=SqlQueries.artists_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', sql=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', table_list=['songplays', 'artists', 'songs', 'users', 'time']) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> table_created_check table_created_check >> [stage_events_to_redshift, stage_songs_to_redshift] stage_events_to_redshift >> [ load_songplays_fact_table, load_users_dimension_table ] stage_songs_to_redshift >> [ load_songplays_fact_table, load_artists_dimension_table, load_songs_dimension_table
"bc", "temperature", "presure", "humidity", "clouds", "wind_speed", "wind_deg", "measure_date", "weather_id", ], ) run_quality_checks = DataQualityOperator( task_id="Run_data_quality_checks", dag=dag, conn_id="postgres", table= None, # If table is none, means that quality checks are done to all tables ) end_operator = DummyOperator(task_id="Stop_execution", dag=dag) # Level 1 start_operator >> create_location_table start_operator >> create_time_table start_operator >> create_weather_table start_operator >> create_staging_table # Level 2 create_location_table >> create_measures_table create_location_table >> populate_location_table
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id='redshift', sql_query=SqlQueries.artist_table_insert, table_name="artists", append_only=False) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', sql_query=SqlQueries.time_table_insert, table_name="time", append_only=False) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=["artists", "songplays", "songs", "time", "users"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Task dependencies start_operator >> stage_events_to_redshift >> load_songplays_table start_operator >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table >> run_quality_checks load_songplays_table >> load_song_dimension_table >> run_quality_checks load_songplays_table >> load_artist_dimension_table >> run_quality_checks load_songplays_table >> load_time_dimension_table >> run_quality_checks run_quality_checks >> end_operator
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id="redshift", table="artists", sql_columns=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql_columns=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table_collection=["songplays", "users", "songs", "artists", "time"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_staging_songs_table >> transient_operator start_operator >> create_staging_events_table >> transient_operator start_operator >> create_songplays_table >> transient_operator start_operator >> create_artists_table >> transient_operator start_operator >> create_songs_table >> transient_operator start_operator >> create_users_table >> transient_operator start_operator >> create_time_table >> transient_operator transient_operator >> stage_events_to_redshift >> load_songplays_table transient_operator >> stage_songs_to_redshift >> load_songplays_table
load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', redshift_conn_id="redshift", destination_table="artists", sql_query=SqlQueries.artist_table_insert, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id="redshift", destination_table="time", sql_query=SqlQueries.time_table_insert, dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id="redshift", target_table="time", dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_time_dimension_table >> run_quality_checks load_artist_dimension_table >> run_quality_checks load_song_dimension_table >> run_quality_checks
) copy_business_reviews_toRedshift = StageToRedshiftOperator( task_id='copy_business_reviews_toRedshift', dag=dag, table="business_reviews", s3_bucket=Variable.get("yelp_bucket"), s3_key='output/business_reviews.json', redshift_conn_id="redshift", aws_credentials_id="aws_credentials", is_json=True ) data_quality_task = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table="business_reviews", ) end_operator = DummyOperator(task_id='end_of_execution', dag=dag) copy_start_operator = DummyOperator(task_id='copy_start_operator', dag=dag) start_operator >> state_dim_task >> business_reviews_fact_task start_operator >> category_dim_task >> business_reviews_fact_task start_operator >> ambience_dim_task >> business_reviews_fact_task start_operator >> user_dim_task >> business_reviews_fact_task # business_reviews_fact_task >> delete_from_s3_task business_reviews_fact_task >> create_tables_task
insert_query=SqlQueries.rides_table_insert, task_id='Load_rides_facts_table', dag=dag) load_stations_dimension_table = LoadDimensionOperator( redshift_conn_id="redshift", table="stations", create_query=SqlQueries.create_stations, insert_query=SqlQueries.stations_table_insert, task_id='Load_stations_dim_table', dag=dag) load_weather_dimension_table = LoadDimensionOperator( redshift_conn_id="redshift", table="weather", create_query=SqlQueries.create_weather, insert_query=SqlQueries.weather_table_insert, task_id='Load_weather_dim_table', dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id="redshift", dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_weather_raw_to_redshift, stage_bikes_raw_to_redshift ] >> wait_operator >> [ load_rides_facts_table, load_stations_dimension_table, load_weather_dimension_table ] >> run_quality_checks >> end_operator
task_id='Load_time_dim_table', dag=dag, table='time', redshift_conn_id='redshift', dimension_query=SqlQueries.time_table_insert, delect_or_append= 'delete' # can either be 'append' to append data or 'delete' to truncate table and then add data. ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, table='songplays', column='userid', redshift_conn_id='redshift', dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0 }], ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table