Exemplo n.º 1
0
                                        sql='/sql/fact_reviews.sql',
                                        postgres_conn_id='redshift')
# process_fact_reviews waits for all three of these dimension tasks.
process_fact_reviews << [
    process_dim_times,
    process_dim_users,
    process_dim_business,
]

# Foreign-key step: a PostgresOperator pointed at '/sql/dim_fk.sql' on the
# 'redshift' connection. It is scheduled after both fact tasks (tips and
# reviews).
process_fk = PostgresOperator(
    task_id='process_foreign_keys',
    sql='/sql/dim_fk.sql',
    postgres_conn_id='redshift',
    dag=dag,
)
process_fk << [process_fact_tips, process_fact_reviews]

# Data-quality task, scheduled after the foreign-key step. Each entry of
# `queries` names a table, an optional "where" filter and an expected "result".
# NOTE(review): how DataQualityOperator interprets these keys is not visible
# here (presumably a row count compared against "result") -- confirm.
# NOTE(review): the last entry expects exactly 6685900 for fact_review; verify
# that a pinned value is still intended if the source data can change.
run_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    redshift_conn_id='redshift',
    queries=(
        {"table": "dim_times", "where": "day IS NULL", "result": 0},
        {"table": "fact_review", "where": "user_id IS NULL", "result": 0},
        {"table": "fact_review", "result": 6685900},
    ),
    dag=dag,
)
run_quality_checks << process_fk

# No-op marker task placed downstream of the data-quality checks.
end_operator = DummyOperator(task_id='end_operator', dag=dag)
end_operator << run_quality_checks
# The fact-table creation task runs after the reviews dimension load, and the
# fact-table load task runs after the creation task.
create_table_fact_airbnb << load_table_dim_reviews

load_table_fact_airbnb_austin_la << create_table_fact_airbnb

# Run data quality checks to ensure records have been moved correctly through
# the platforms without any errors. The checks cover the four dimension tables
# and the fact table, and run after the fact-table load.
run_quality_checks = DataQualityOperator(
    task_id='Run_DATA_QUALITY_CHECKS',
    redshift_conn_id='redshift',
    provide_context=True,
    tables=[
        'DIM_HOSTS',
        'DIM_REVIEWS',
        'DIM_CALENDARS',
        'DIM_PROPERTIES',
        'FACT_AIRBNB_AUSTIN_LA',
    ],
    dag=dag,
)

run_quality_checks << load_table_fact_airbnb_austin_la

# No-op marker task placed downstream of the data-quality checks.
end_operator = DummyOperator(dag=dag, task_id='END_TASK')

end_operator << run_quality_checks
'''
start_operator >> [create_staging_listings_table, 
				   create_staging_calendar_table,
				   create_staging_reviews_table] >> [stage_listings_to_redshift, 
				   									stage_calendars_to_redshift,
				   									stage_reviews_to_redshift] >> MID_operator

MID_operator >> [create_table_dim_hosts, 
				create_table_dim_properties,
				create_table_dim_reviews,
				create_table_dim_calendar] >> [load_table_dim_hosts,
# Dimension load targeting the 'time' table with SqlQueries.time_table_insert
# over the 'redshift' connection.
# NOTE(review): truncate_before=True presumably empties the target table
# before inserting -- confirm against LoadDimensionOperator.
load_time_dimension_table = LoadDimensionOperator(
    dag=dag,
    task_id='Load_time_dim_table',
    redshift_connection_id='redshift',
    table_target='time',
    query=SqlQueries.time_table_insert,
    truncate_before=True,
)

# Data-quality task covering the fact table (songplays) and the four
# dimension tables, using the 'redshift' connection.
run_quality_checks = DataQualityOperator(
    dag=dag,
    task_id='Run_data_quality_checks',
    redshift_connection_id='redshift',
    tables=[
        'songplays',
        'users',
        'songs',
        'artists',
        'time',
    ],
)

# No-op marker task that sits at the end of the dependency chain below.
end_operator = DummyOperator(dag=dag, task_id='Stop_execution')

# Dependency graph:
#   start -> staging (events, songs) -> songplays -> dimension loads
#         -> quality checks -> end
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
load_songplays_table << [stage_events_to_redshift, stage_songs_to_redshift]
load_songplays_table >> [
    load_song_dimension_table,
    load_user_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]
run_quality_checks << [
    load_song_dimension_table,
    load_user_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]
end_operator << run_quality_checks
Exemplo n.º 4
0
    dag=dag,
    query=SqlQueries.calendars_table_insert,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    operation="insert",
    table="DIM_CALENDARS")
# The calendars dimension load runs after its table-creation task.
load_dim_calendars_table << create_dim_calendars_table

# Fact task driven by SqlQueries.CREATE_LOAD_FACT_AIRBNB_AMST, using the
# 'redshift' connection and the 'aws_credentials' credentials id.
create_load_fact_airbnb_amst_table = LoadFactOperator(
    task_id='Create_Load_FACT_AIRBNB_AMST_Table',
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    query=SqlQueries.CREATE_LOAD_FACT_AIRBNB_AMST,
    dag=dag,
)
# The fact task waits for all four dimension loads.
create_load_fact_airbnb_amst_table << [
    load_dim_hosts_table,
    load_dim_reviews_table,
    load_dim_properties_table,
    load_dim_calendars_table,
]

# Run data quality checks to ensure that records have been moved correctly
# through the platforms without any errors. Scheduled after the fact task.
# NOTE(review): no tables or queries are passed here; the operator presumably
# falls back to its own defaults -- confirm against DataQualityOperator.
run_quality_checks = DataQualityOperator(
    redshift_conn_id="redshift",
    task_id='Run_DATA_QUALITY_Checks',
    dag=dag,
)
run_quality_checks << create_load_fact_airbnb_amst_table

# Dummy operator marking the end of the DAG; it runs only after the
# data-quality checks.
end_operator = DummyOperator(dag=dag, task_id='END_OPERATOR')

end_operator << run_quality_checks