def sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval, *args, **kwargs):
    """Build a SubDAG containing one data-quality check task per warehouse table.

    Args:
        parent_dag_name: Name of the parent DAG; combined with
            ``child_dag_name`` to form the SubDAG id (``parent.child``).
        child_dag_name: Name of this SubDAG within the parent.
        start_date: ``start_date`` passed through to the DAG.
        schedule_interval: ``schedule_interval`` passed through to the DAG.
        *args, **kwargs: Accepted for SubDAG-factory compatibility; unused.

    Returns:
        The constructed DAG with five unconnected DataQualityOperator tasks.
    """
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        start_date=start_date,
        schedule_interval=schedule_interval,
    )

    # (task_id suffix, table name, key column, columns that must not be NULL).
    # Replaces five copy-pasted DataQualityOperator instantiations with one
    # data-driven loop; the original task_ids, tables and column lists are
    # reproduced exactly.  Note '"time"' is quoted because TIME is a reserved
    # word in Redshift.
    check_specs = [
        ('songplays', 'songplays', 'playid', ['playid', 'start_time', 'userid']),
        ('user', 'users', 'userid', ['userid']),
        ('song', 'songs', 'songid', ['songid']),
        ('artist', 'artists', 'artistid', ['artistid']),
        ('time', '"time"', 'start_time', ['start_time']),
    ]
    for suffix, table, column, not_null_columns in check_specs:
        # Tasks register themselves on the DAG; the original never wired
        # dependencies between them, so no return values are kept.
        DataQualityOperator(
            task_id=f'Run_data_check_{suffix}',
            dag=dag,
            table=table,
            column=column,
            is_null=not_null_columns,
            redshift_conn_id='redshift_conn_id',
        )

    return dag
def stage_s3_to_redshift_dag(
    parent_dag_name: str,
    task_id: str,
    redshift_conn_id: str = "",
    aws_credentials_id: str = "",
    target_table: str = "",
    s3_bucket: Optional[str] = None,
    s3_key: Optional[str] = None,
    json_path: Optional[str] = None,
    ignore_headers: Optional[int] = None,
    delimiter: Optional[str] = None,
    default_args: Optional[Dict[str, Any]] = None,
    *args,
    **kwargs,
):
    """Build a SubDAG that stages S3 data into Redshift and then checks it.

    Args:
        parent_dag_name: Parent DAG name; combined with ``task_id`` for the
            SubDAG id.
        task_id: This SubDAG's task id within the parent.
        redshift_conn_id: Airflow connection id for Redshift.
        aws_credentials_id: Airflow connection id holding AWS credentials.
        target_table: Redshift table the staged data is copied into.
        s3_bucket: Source S3 bucket.
        s3_key: Source S3 key (may be templated).
        json_path: Optional JSONPaths file for the COPY command.
        ignore_headers: Optional number of header rows to skip.
        delimiter: Optional field delimiter.
        default_args: Default task arguments forwarded to the DAG.
        *args, **kwargs: Forwarded to the staging operator (and ``kwargs``
            also to the DAG constructor, preserving original behavior).

    Returns:
        The constructed DAG: stage task >> data-quality check task.
    """
    # BUG FIX: the default used to be the `dict` *type* itself (a mutable-ish
    # class object), which would be handed to DAG(default_args=dict) verbatim.
    # Use the None-sentinel idiom and substitute an empty dict per call.
    if default_args is None:
        default_args = {}

    dag = DAG(dag_id=f"{parent_dag_name}.{task_id}", default_args=default_args, **kwargs)

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id=f"{parent_dag_name}.Stage_events",
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        json_path=json_path,
        ignore_headers=ignore_headers,
        delimiter=delimiter,
        dag=dag,
        *args,
        **kwargs)

    # Assert the target table received at least one row (count != 0).
    validation_songplays = DataQualityValidator(
        sql_statement=f"SELECT COUNT(*) FROM {target_table}",
        result_to_assert=0,
        should_assert_for_equality=False,
    )

    check_data_task = DataQualityOperator(
        task_id=f"{parent_dag_name}.Data_Quality_Check",
        redshift_conn_id=redshift_conn_id,
        data_quality_validations=[validation_songplays],
        dag=dag,
    )

    stage_events_to_redshift >> check_data_task

    return dag
# NOTE(review): fragment of a DAG script — the call closed by the leading ')' and the
# definitions of dag/start_operator/process_* are elsewhere.  The comprehension builds one
# staging task per dimension table; 'task >> list + [task]' relies on list-based bitshift
# dependency support — confirm the installed Airflow version accepts it.
) dim_operators = [ StageToRedshiftOperator( task_id=f"Copy_{dim_table_name}_dim_table", dag=dag, table=dim_table_name, conn_id=REDSHIFT_CONN_ID, aws_credentials_id=AWS_CREDENTIALS_ID, s3_bucket=INPUT_BUCKET, s3_key=f"{dim_table_name}.csv/*.csv", file_format="CSV", provide_context=True, ) for dim_table_name in dim_table_names ] run_quality_checks = DataQualityOperator( task_id="Run_data_quality_checks", dag=dag, conn_id=REDSHIFT_CONN_ID, tables=dim_table_names + [fact_table_name], ) end_operator = DummyOperator(task_id="Stop_execution", dag=dag) start_operator >> [process_sale, process_postcode] process_sale >> create_sale_tables >> dim_operators[:-1] + [stage_sale_table] process_postcode >> create_postcode_tables >> dim_operators[-1] dim_operators + [stage_sale_table] >> run_quality_checks run_quality_checks >> end_operator
# NOTE(review): fragment — stages a Parquet date dimension and wires quality checks; the
# trailing dependency list is cut off mid-expression and continues elsewhere.  The
# commented-out CSV copy_options appear to be leftovers from an earlier load format.
load_dim_date = StageToRedshiftOperator( task_id='load_dim_date', dag=dag, table="dim_date", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="i94project", s3_key="stage/output/date.parquet", # copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1") copy_options=('FORMAT AS PARQUET',) ) checks = SqlChecks() run_quality_checks = DataQualityOperator( task_id='run_data_quality_checks', dag=dag, redshift_conn_id="redshift", dq_checks=checks.load_dq_checks() ) end_operator = DummyOperator(task_id='end_execution', dag=dag) # task dependencies start_operator >> [load_dim_arrival_type, load_dim_visa_mode, load_dim_visa_type, process_fact_immigration, process_dim_us_state, process_dim_us_airport, process_dim_country] process_fact_immigration >> [load_dim_date,
# NOTE(review): fragment — the enclosing fact-table operator call starts elsewhere.
# DataQualityOperator receives the same (table, insert-sql) tuple list as
# LoadDimensionOperator; confirm it expects tuples rather than bare table names.
table="songplays", data_source=SqlQueries.songplay_table_insert, dag=dag) dim_tables_and_sources = [ ("users", SqlQueries.user_table_insert), ("songs", SqlQueries.song_table_insert), ("artists", SqlQueries.artist_table_insert), ("time", SqlQueries.time_table_insert), ] load_dimension_tables = LoadDimensionOperator(task_id='load_dim_tables', redshift_conn_id="redshift", tables=dim_tables_and_sources, dag=dag) run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks', redshift_conn_id="redshift", tables=dim_tables_and_sources, dag=dag) end_operator = DummyOperator(task_id='stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_dimension_tables load_dimension_tables >> run_quality_checks run_quality_checks >> end_operator
# NOTE(review): fragment — quality checks assert songplays is non-empty ('gt' 0) and has
# no NULL songid ('eq' 0); the create_table_* fan-out continues past this fragment.
select_sql=SqlQueries.insert_artists_table, mode='truncate') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', select_sql=SqlQueries.insert_time_table, mode='truncate') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, check_stmts=[{ 'sql': 'SELECT COUNT(*) FROM songplays;', 'op': 'gt', 'val': 0 }, { 'sql': 'SELECT COUNT(*) FROM songplays WHERE songid IS NULL;', 'op': 'eq', 'val': 0 }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependencies start_operator >> create_table_staging_songs start_operator >> create_table_staging_events start_operator >> create_table_songplays start_operator >> create_table_artists start_operator >> create_table_songs start_operator >> create_table_users
# NOTE(review): fragment — append=False implies truncate-then-insert semantics for the
# dimension loads; all four dim loads converge on a single quality-check task.
redshift_conn_id='redshift', table='artists', sql=SqlQueries.artist_table_insert, append=False) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', sql=SqlQueries.time_table_insert, append=False) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=['songplays', 'users', 'songs', 'artists', 'time']) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #Task dependencies start_operator >> create_tables_task create_tables_task >> stage_events_to_redshift >> load_songplays_table create_tables_task >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_artist_dimension_table >> run_quality_checks load_songplays_table >> load_user_dimension_table >> run_quality_checks load_songplays_table >> load_song_dimension_table >> run_quality_checks load_songplays_table >> load_time_dimension_table >> run_quality_checks
# NOTE(review): fragment — 'RedshifQueriesOperator' is missing a 't' (likely the custom
# class really is spelled this way; verify before renaming).  The chained list >> task
# bitshift dependencies require Airflow's list-composition support.
redshift_conn_id='redshift', query_list=dim_tables_insert_queries, query_type='insert', dag=dag, ) insert_facts_tables = RedshifQueriesOperator( task_id='insert_facts_tables', redshift_conn_id='redshift', query_list=facts_tables_insert_queries, query_type='insert', dag=dag, ) run_quality_checks = DataQualityOperator(task_id='run_quality_checks', redshift_conn_id='redshift', tables_list=tables_list, dag=dag) clean_demography_csv >> copy_demography_to_S3 clean_airports_csv >> copy_airports_to_S3 clean_immigration_parquet >> copy_immigration_to_S3 [copy_demography_to_S3, copy_airports_to_S3, copy_immigration_to_S3 ] >> drop_redshift_tables >> create_redshift_tables >> [ stage_demography_to_redshift, stage_airports_to_redshift, stage_immigration_to_redshift ] >> insert_dim_tables >> insert_facts_tables >> run_quality_checks
# NOTE(review): fragment — leading comma belongs to a call opened elsewhere.
# end_operator is created WITHOUT dag=dag; unless a DAG context manager is active it
# will not be attached to this DAG — confirm and add dag=dag if needed.
, dag=dag , task_id='Load_Special_Dimension_Table' ) city_dim_table = LoadDimensionOperator(table='city_dim' , sql_query=SqlQueries.city_dim_insert , redshift_conn_id=redshift_conn_id , dag=dag , task_id='Load_City_Dimension_Table' ) # Check Data Quality run_quality_checks = DataQualityOperator(tables=['candidate_fact' , 'city_fact' , 'candidate_dim' , 'student_dim' , 'special_dim' , 'city_dim'] , redshift_conn_id=redshift_conn_id , dag=dag , task_id='Checking_Data_Quality' ) # Ending Operator end_operator = DummyOperator(task_id='Stop_Execution') ################################# # DAG: Start loading data into S3 start_operator >> brazil_to_s3 start_operator >> enem_to_s3 # DAG: Load data from S3 into Redshift stage tables
# NOTE(review): fragment — loads two fact tables in truncate mode, then a quality-check
# task driven by data_quality_args (defined elsewhere); schema-creation fan-out/fan-in
# continues past the end of this fragment.
load_covid19_fact_table = LoadTableOperator( task_id='load_covid19_fact_table', dag=dag, table='fact.us_covid_19', select_sql=SqlLoad.insert_covid_table, mode='truncate') load_accident_fact_table = LoadTableOperator( task_id='load_accident_fact_table', dag=dag, table='fact.us_accidents', select_sql=SqlLoad.insert_accident_table, mode='truncate') run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, check_stmts=data_quality_args) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependencies start_operator >> [ create_staging_schema, create_dim_schema, create_fact_schema ] [create_staging_schema, create_dim_schema, create_fact_schema ] >> schema_created schema_created >> [ create_table_staging_covid_19, create_table_staging_us_demographics,
# NOTE(review): fragment — the quality check pairs a NULL-count query with
# expected_count=10000, which reads as "expect 10000 NULL reasonforvisit rows"; that
# looks inverted (0 is the usual expectation) — confirm DataQualityOperator semantics.
task_id='Load_i94visitors_fact', dag=dag, redshift_conn_id="redshift", table_query=SqlQueries.visitors_fact_insert) load_dates_dim = LoadDimensionOperator(task_id='Load_dates_dim', dag=dag, redshift_conn_id="redshift", table_query=SqlQueries.dates_dim_insert, table="dates_dim", truncate_flag='Y') run_quality_check = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", check_query= "select count(1) from public.i94visitors_fact where reasonforvisit is null", expected_count=10000) run_intg_check = IntegrityCheckOperator( task_id='Run_data_integrity_check', dag=dag, redshift_conn_id="redshift", check_query= "select count(airportid) from public.i94visitors_fact where airportid not in (select airportid from airports_dim)", table_name="i94visitors_fact") end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_i94vistors
# NOTE(review): fragment — quality check covers songplays/users/artists/songs but omits
# 'time' even though a time dimension is loaded above; confirm that omission is intended.
table='artists', redshift_conn_id='redshift', sql_create=SqlQueries.artist_table_create, sql_select=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', redshift_conn_id='redshift', sql_create=SqlQueries.time_table_create, sql_select=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=['songplays', 'users', 'artists', 'songs'], sql_check=SqlQueries.check) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #---#---# Define Task Dependencies #---#---# start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_time_dimension_table load_songplays_table >> load_user_dimension_table
# NOTE(review): fragment — fact table loads first, then the three dimension loads fan
# out from it and fan back in to a single quality check over all four tables.
dag=dag) load_dim_foreign_table = LoadToRedshiftOperator( task_id='Load_Dim_Foreign_Table', aws_credentials_id='aws_credentials', redshift_conn_id='redshift', s3_bucket='my-bucket', s3_key='dim_foreign_table', table='dim_foreign_table', mode='truncate', dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_Data_Quality_Checks', redshift_conn_id='redshift', tables=[ 'fact_table', 'dim_state_table', 'dim_visa_table', 'dim_foreign_table' ], dag=dag) end_operator = DummyOperator(task_id='Stop_Execution', dag=dag) start_operator >> load_fact_table load_fact_table >> [ load_dim_state_table, load_dim_visa_table, load_dim_foreign_table ] [load_dim_state_table, load_dim_visa_table, load_dim_foreign_table ] >> run_quality_checks
# NOTE(review): fragment — the artists load passes append=True while the time load omits
# append entirely (falls back to the operator default); dependencies are wired with
# set_downstream/set_upstream rather than the >> bitshift style used elsewhere.
target_table="artists", sql=SqlQueries.artist_table_insert, append = True ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', target_table="time", sql=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=[ 'songplays' ,'songs','artists', 'users','time'] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator.set_downstream([stage_events_to_redshift, stage_songs_to_redshift]) load_songplays_table.set_upstream( [stage_events_to_redshift, stage_songs_to_redshift]) load_songplays_table.set_downstream([load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,load_time_dimension_table]) run_quality_checks.set_upstream( [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,load_time_dimension_table])
# NOTE(review): fragment — the single quality check only asserts songplays.userid has no
# NULLs; the dependency wiring after load_songplays_table continues past this fragment.
sql_load_data=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table="time", redshift_conn_id="redshift", sql_load_data=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", dq_query = 'SELECT COUNT(*) FROM public.songplays WHERE userid IS NULL', expected_result = 0 ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> stage_events_to_redshift create_tables >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table
# NOTE(review): fragment — DataQualityOperator is given BOTH table="time" and a tables=[...]
# list; one of the two is almost certainly redundant or ignored by the operator — confirm
# which parameter the custom operator actually reads.
load_time_dimension_table = LoadDimensionOperator( task_id=load_time_dimension_table_task_id, redshift_conn_id="redshift", table="time", aws_credentials_id="aws_credentials", start_date= datetime(2018, 5, 1), sql_source=SqlQueries.time_table_insert, dag=dag ) run_quality_checks = DataQualityOperator( task_id=run_data_quality_checks_task_id, redshift_conn_id="redshift", table="time", dag=dag, provide_context=True, aws_credentials_id="aws_credentials", tables=["staging_events", "users", 'staging_songs', "songs", "artists", "time"] ) end_operator = DummyOperator(task_id=stop_execution_task_id, dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table
# NOTE(review): fragment — the 'SELECT COUNT(*) FROM users' check asserts an exact row
# count of 104, which is brittle (breaks on any new data load); the NULL checks are fine.
redshift_conn_id="redshift", sql_query=SqlQueries.time_table_insert, mode="truncate-insert") # Task to perform quality checks on the data uploaded in Redshift run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, qa_check_list=[{ 'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM users", 'expected_result': 104 }, { 'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songplays WHERE playid is null AND userid IS NULL", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0 }], redshift_conn_id="redshift") end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # order of execution for the dag start_operator >> create_table
# NOTE(review): fragment — the leading column list ("presure" is a typo preserved from the
# upstream schema; do not fix without renaming the column) closes a call opened elsewhere.
# table=None deliberately means "check every table" per the inline comment.
"bc", "temperature", "presure", "humidity", "clouds", "wind_speed", "wind_deg", "measure_date", "weather_id", ], ) run_quality_checks = DataQualityOperator( task_id="Run_data_quality_checks", dag=dag, conn_id="postgres", table= None, # If table is none, means that quality checks are done to all tables ) end_operator = DummyOperator(task_id="Stop_execution", dag=dag) # Level 1 start_operator >> create_location_table start_operator >> create_time_table start_operator >> create_weather_table start_operator >> create_staging_table # Level 2 create_location_table >> create_measures_table create_location_table >> populate_location_table
# NOTE(review): fragment — the nested call (a SubDagOperator factory, presumably) passes
# table="users" together with sql_query=SqlQueries.time_table_insert under a time-dim
# task_id; that users/time mismatch looks like a copy-paste error — verify against the
# sibling dimension tasks.
parent_dag_name=dag_name, task_id=load_time_dimension_table_task_id, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", table="users", start_date= datetime(2018, 5, 1), sql_query=SqlQueries.time_table_insert, ), task_id=load_time_dimension_table_task_id, dag=dag ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, aws_credentials_id="aws_credentials", redshift_conn_id='redshift', tables=["songplay", "users", "song", "artist", "time"] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Setting tasks dependencies start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks
# NOTE(review): fragment — stages a JSON fact table from an S3 bucket named in an Airflow
# Variable, then runs a quality check on it; the commented-out delete_from_s3_task line is
# dead code kept as a breadcrumb for a disabled cleanup step.
) copy_business_reviews_toRedshift = StageToRedshiftOperator( task_id='copy_business_reviews_toRedshift', dag=dag, table="business_reviews", s3_bucket=Variable.get("yelp_bucket"), s3_key='output/business_reviews.json', redshift_conn_id="redshift", aws_credentials_id="aws_credentials", is_json=True ) data_quality_task = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table="business_reviews", ) end_operator = DummyOperator(task_id='end_of_execution', dag=dag) copy_start_operator = DummyOperator(task_id='copy_start_operator', dag=dag) start_operator >> state_dim_task >> business_reviews_fact_task start_operator >> category_dim_task >> business_reviews_fact_task start_operator >> ambience_dim_task >> business_reviews_fact_task start_operator >> user_dim_task >> business_reviews_fact_task # business_reviews_fact_task >> delete_from_s3_task business_reviews_fact_task >> create_tables_task
# NOTE(review): fragment — every task carries a should_run flag (presumably a dry-run
# switch handled inside the custom operators); drop >> create >> four parallel CSV copies
# >> quality checks, all in one parenthesized dependency chain.
drop_tables = SQLFileOperator(dag=dag, task_id='drop_tables', query_file='drop_tables.sql', message='Dropping tables', should_run=should_run) create_tables = SQLFileOperator(dag=dag, task_id='create_tables', query_file='create_tables.sql', message='Creating tables', should_run=should_run) copy_immigration_table = CSVToTableOperator(dag=dag, task_id='copy_immigration_table', should_run=should_run) copy_airport_codes_table = CSVToTableOperator( dag=dag, task_id='copy_airport_codes_table', should_run=should_run) copy_global_temperatures_table = CSVToTableOperator( dag=dag, task_id='copy_global_temperatures_table', should_run=should_run) copy_us_cities_table = CSVToTableOperator(dag=dag, task_id='copy_us_cities_table', should_run=should_run) quality_checks_task = DataQualityOperator(dag=dag, task_id='data_quality_checks', quality_checks=quality_checks, should_run=should_run) (start_operator >> drop_tables >> create_tables >> [ copy_immigration_table, copy_airport_codes_table, copy_global_temperatures_table, copy_us_cities_table ] >> quality_checks_task >> end_operator)
# NOTE(review): fragment — a for-loop materializes one S3-to-Redshift staging task per
# entry in staging_tables; the loop body's operator call is cut off mid-arguments.
# globals()[table_name] fetches the CREATE statement by name — fragile; consider a dict.
catchup=False ) start_data_to_redshift_operation = DummyOperator( task_id='Begin_Migrating_Data_To_Staging_Tables', dag=dag) end_data_to_redshift_operation = DummyOperator( task_id='Begin_Loading_Data_To_Fact_Dimension_Tables', dag=dag) end_of_tasks = DummyOperator( task_id='End_Of_Execution', dag=dag) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, conn_id='redshift', target_tables=["dim_airport_table", "dim_demographic_table", "dim_visitor_table", "fact_city_data_table"], ) for data_type, table_name in staging_tables.items(): stage_data_to_redshift = S3ToRedshiftOperator( task_id='Stage_' + data_type, dag=dag, table=table_name, drop_table=True, s3_bucket='udend-data', s3_folder=data_type, aws_connection_id='aws_credentials', redshift_connection_id='redshift', create_query=globals()[table_name], copy_options="json 'auto'"
# NOTE(review): fragment — the quality check receives only target_table="time" although
# all four dimension loads feed into it; confirm the other tables are checked elsewhere
# or widen the target.
load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', redshift_conn_id="redshift", destination_table="artists", sql_query=SqlQueries.artist_table_insert, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id="redshift", destination_table="time", sql_query=SqlQueries.time_table_insert, dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id="redshift", target_table="time", dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_time_dimension_table >> run_quality_checks load_artist_dimension_table >> run_quality_checks load_song_dimension_table >> run_quality_checks
# NOTE(review): fragment — test_cases lists (SqlQueries.aircrafts_count_test, operator.gt, 0)
# THREE times; the duplicates look like copy-paste slips that should probably be the
# airports/seats/passengers count tests — verify against SqlQueries before changing.
table='lounges', redshift_conn_id='redshift', sql=SqlQueries.lounges_table_insert) load_fact_ratings_table = LoadFactOperator( task_id='Load_fact_ratings_fact_table', dag=dag, redshift_conn_id='redshift', sql=SqlQueries.fact_ratings_table_insert) ensure_data_load_in_dims = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', test_cases=[ (SqlQueries.airlines_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), (SqlQueries.lounges_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), (SqlQueries.aircrafts_count_test, operator.gt, 0), ]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_airlines_to_redshift start_operator >> stage_airports_to_redshift start_operator >> stage_lounges_to_redshift start_operator >> stage_seats_to_redshift stage_airlines_to_redshift >> load_airlines_dimension_table >> load_aircrafts_dimension_table >> load_passengers_dimension_table stage_airports_to_redshift >> load_airports_dimension_table >> load_lounges_dimension_table >> load_passengers_dimension_table
# NOTE(review): fragment — DataQualityOperator here receives no table/check arguments at
# all; presumably the custom operator carries built-in defaults — confirm.  The single
# chained expression relies on Airflow's list >> task composition.
insert_query=SqlQueries.rides_table_insert, task_id='Load_rides_facts_table', dag=dag) load_stations_dimension_table = LoadDimensionOperator( redshift_conn_id="redshift", table="stations", create_query=SqlQueries.create_stations, insert_query=SqlQueries.stations_table_insert, task_id='Load_stations_dim_table', dag=dag) load_weather_dimension_table = LoadDimensionOperator( redshift_conn_id="redshift", table="weather", create_query=SqlQueries.create_weather, insert_query=SqlQueries.weather_table_insert, task_id='Load_weather_dim_table', dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id="redshift", dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_weather_raw_to_redshift, stage_bikes_raw_to_redshift ] >> wait_operator >> [ load_rides_facts_table, load_stations_dimension_table, load_weather_dimension_table ] >> run_quality_checks >> end_operator
# NOTE(review): fragment — quality checking is driven by eight validator objects defined
# elsewhere; parameter name 'dql_sql' (data-query-language?) is unusual but consistent
# across both dimension loads, so it is presumably the custom operator's real keyword.
final_table=dim_artists_table_name, dql_sql=SqlQueries.artist_table_insert, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID, final_table=dim_time_table_name, dql_sql=SqlQueries.time_table_insert, dag=dag) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID, data_quality_validations=[ validator_stage_events, validator_stage_songs, validator_songplays, validator_songs, validator_artists, validator_events, validator_time, validator_users ], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG dependency setup start_operator >> db_setup_task db_setup_task >> [ stage_events_s3_to_redshift_and_validate_task, stage_songs_s3_to_redshift_and_validate_task, ] >> load_songplays_table_task
# NOTE(review): fragment — both dimension loads use update_strategy="overwrite"; the
# trailing dependency list after load_songplays_table is cut off mid-expression.
redshift_conn_id="redshift", table="artists", sql=SqlQueries.artist_table_insert, update_strategy="overwrite") load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql=SqlQueries.time_table_insert, update_strategy="overwrite") run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=["songplays", "artists", "songs", "users", "time"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Tasks dependencies start_operator >> create_tables >> [ stage_events_to_redshift, stage_songs_to_redshift ] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table
# NOTE(review): fragment — one NULL-count quality check per staging table, each with its
# own task; the opening of the d_port staging call sits before this fragment.
dag=dag, table="d_port", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket=bucket, s3_key='immigration_processed_files/port/I94_Port.csv', file_format='csv', create_stmt=SqlQueries.create_table_d_port) # Check loaded data not null run_quality_checks_airports = DataQualityOperator( task_id="run_quality_checks_airports", dag=dag, redshift_conn_id="redshift", dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM staging_airport WHERE ident is null", 'expected_result': 0 }]) run_quality_checks_us_cities_demo = DataQualityOperator( task_id="run_quality_checks_us_cities_demo", dag=dag, redshift_conn_id="redshift", dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM staging_us_cities_demographics WHERE city is null", 'expected_result': 0 }])
# NOTE(review): fragment — note the two s3_path values differ in trailing slash
# (equipments.json vs notifications.json/); confirm both resolve as intended in the
# custom staging operator.  PythonOperator callables are defined elsewhere in the file.
s3_path='s3://capstone-project-janga/equipments.json', table='equipments' ) stage_notifications_to_redshift = StageToRedshiftOperator( task_id='Stage_notifications', dag=dag, redshift_conn_id='redshift', aws_credentials_id='aws_credentials', s3_path='s3://capstone-project-janga/notifications.json/', table='notifications' ) test_equipments_and_notifications_table = DataQualityOperator( task_id='test_equipments_and_notifications_table', dag=dag, redshift_conn_id='redshift', tables=['equipments', 'notifications', ] ) insert_locations = PythonOperator( task_id='Insert_locations', dag=dag, python_callable=create_location_table ) insert_equipment_notifications = PythonOperator( task_id='Insert_equipment_notifications', dag=dag, python_callable=create_equipment_notification_table )
# NOTE(review): fragment — the quality check asserts songplays has no NULL userid; the
# task-ordering section is cut off after the first two dimension-load dependencies.
insert_mode='truncate' ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', sql_query=SqlQueries.time_table_insert, insert_mode='truncate' ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', sql_query='select count(*) from songplays where userid is null;', expected_result=0 ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # # Task ordering for the DAG tasks # start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table