#
# Data quality check on the Trips table
#
check_trips = HasRowsOperator(
    task_id="check_trips_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

#
# We use the FactsCalculatorOperator to create a facts table in Redshift. The fact column is
# `tripduration` and the groupby_column is `bikeid`.
#
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts_trips",
    dag=dag,
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="trips_facts",
    fact_column="tripduration",
    groupby_column="bikeid"
)

#
# Task ordering for the DAG tasks
#
copy_trips_task >> check_trips
check_trips >> calculate_facts

# ------------------------ PART 2: Custom Operator --------------------------------------------
# This is the solution code for the custom operator: facts_calculator
import logging
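# A minimal sketch of the facts_calculator operator, continuing from the import
# above. The Airflow 1.x import paths match the ones used later in this file;
# the MAX/MIN/AVG aggregate set in the SQL template is an assumption inferred
# from the fact_column/groupby_column parameters, not confirmed by the original.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FactsCalculatorOperator(BaseOperator):
    # Assumed shape of the generated facts table: one row per group with
    # max/min/average of the fact column
    facts_sql_template = """
        DROP TABLE IF EXISTS {destination_table};
        CREATE TABLE {destination_table} AS
        SELECT
            {groupby_column},
            MAX({fact_column}) AS max_{fact_column},
            MIN({fact_column}) AS min_{fact_column},
            AVG({fact_column}) AS average_{fact_column}
        FROM {origin_table}
        GROUP BY {groupby_column};
    """

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="",
                 origin_table="",
                 destination_table="",
                 fact_column="",
                 groupby_column="",
                 *args, **kwargs):
        super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.origin_table = origin_table
        self.destination_table = destination_table
        self.fact_column = fact_column
        self.groupby_column = groupby_column

    def execute(self, context):
        # Render the SQL template with the operator's parameters and run it
        # against Redshift through the Postgres hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        facts_sql = FactsCalculatorOperator.facts_sql_template.format(
            destination_table=self.destination_table,
            groupby_column=self.groupby_column,
            fact_column=self.fact_column,
            origin_table=self.origin_table
        )
        logging.info(f"Creating facts table {self.destination_table}")
        redshift.run(facts_sql)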
copy_trips_task = S3ToRedshiftOperator(
    task_id="copy_trips",  # assumed task_id: the opening of this call was truncated
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
)

#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    table='trips',
    redshift_conn_id='redshift'
)

#
# TODO: Use the FactsCalculatorOperator to create a facts table in Redshift. The fact column should
# be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts",  # assumed task_id: required by Airflow but missing here
    dag=dag,                    # added so the task is attached to the DAG
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="fact_table",
    fact_column="tripduration",
    groupby_column="bikeid"
)

#
# TODO: Define task ordering for the DAG tasks you defined
#
copy_trips_task >> check_trips
check_trips >> calculate_facts
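# A minimal sketch of the HasRowsOperator used for the data quality check,
# assuming the same Airflow 1.x hook and operator base as the facts calculator;
# it fails the task when the target table returns no rows.
import logging

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        # Fail loudly if the query returned nothing or the table is empty
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Data quality check failed: {self.table} returned no results")
        if records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} contained 0 rows")
        logging.info(f"Data quality check on table {self.table} passed with {records[0][0]} records")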
#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id='redshift',
    table='trips'
)

#
# TODO: Use the FactsCalculatorOperator to create a facts table in Redshift. The fact column should
# be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(
    task_id='calculate_facts_table',
    dag=dag,
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trips_facts',
    fact_column='tripduration',
    groupby_column='bikeid'
)

#
# TODO: Define task ordering for the DAG tasks you defined
#
copy_trips_task >> check_trips
check_trips >> calculate_facts

##################### Facts calculator #####################
import logging

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
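# The operator body that followed these imports presumably mirrors the Part 2
# sketch above. For the parameters used in calculate_facts, the assumed
# facts_sql_template would render to SQL along these lines:
#
#   DROP TABLE IF EXISTS trips_facts;
#   CREATE TABLE trips_facts AS
#   SELECT
#       bikeid,
#       MAX(tripduration) AS max_tripduration,
#       MIN(tripduration) AS min_tripduration,
#       AVG(tripduration) AS average_tripduration
#   FROM trips
#   GROUP BY bikeid;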
# create the trips table; the opening of this call was truncated in the
# original, so the operator and its arguments are an assumed reconstruction
createTripsTable = PostgresOperator(
    task_id='createTripsTable',  # assumed task_id
    postgres_conn_id='redshift',
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,  # assumed: CREATE TABLE DDL for trips
    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(
    task_id='checkDataQuality',
    redshift_conn_id='redshift',
    table='trips',
    dag=customDag)

# create fact table
createFactTable = FactsCalculatorOperator(
    task_id='createFactTable',
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trip_facts',
    fact_column='tripduration',
    groupby_column='bikeid',
    dag=customDag)  # dag added so the task is attached to customDag

# set task dependencies
createTripsTable >> loadTripData >> checkDataQuality >> createFactTable
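# A minimal sketch of the S3ToRedshiftOperator used by the load task, assuming
# the Airflow 1.x AwsHook/PostgresHook pattern used elsewhere in this file;
# the COPY template and the truncate option mirror the parameters passed above,
# but the exact SQL is an assumption.
import logging

from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class S3ToRedshiftOperator(BaseOperator):
    copy_sql_template = """
        COPY {table}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{access_key}'
        SECRET_ACCESS_KEY '{secret_key}'
        IGNOREHEADER {ignore_headers}
        DELIMITER '{delimiter}'
    """

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="",
                 aws_credentials_id="",
                 table="",
                 s3_bucket="",
                 s3_key="",
                 delimiter=",",
                 ignore_headers=1,
                 truncate=False,
                 *args, **kwargs):
        super(S3ToRedshiftOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.delimiter = delimiter
        self.ignore_headers = ignore_headers
        self.truncate = truncate

    def execute(self, context):
        # Fetch AWS credentials and a Redshift connection from Airflow
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if self.truncate:
            # Optionally clear the target table before loading
            redshift.run(f"TRUNCATE {self.table}")
        s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
        logging.info(f"Copying data from {s3_path} to Redshift table {self.table}")
        redshift.run(S3ToRedshiftOperator.copy_sql_template.format(
            table=self.table,
            s3_path=s3_path,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
            ignore_headers=self.ignore_headers,
            delimiter=self.delimiter
        ))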