# --- Example 1 ---
#
# Data quality check: HasRowsOperator fails the run if the Trips table is empty.
#
check_trips = HasRowsOperator(
    task_id="check_trips_data",
    redshift_conn_id="redshift",
    table="trips",
    dag=dag,
)

#
# Build a facts table in Redshift with FactsCalculatorOperator:
# aggregate the `tripduration` fact column, grouped by `bikeid`.
#
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts_trips",
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="trips_facts",
    fact_column="tripduration",
    groupby_column="bikeid",
    dag=dag,
)

#
# Task ordering: load first, then quality-check, then compute facts.
#
copy_trips_task >> check_trips >> calculate_facts

# ------------------------PART 2: Custome Operator--------------------------------------------

# This is the solution code for the Custom Operator: facts_calculator

import logging
# --- Example 2 ---
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
# Data quality check: fail the DAG run if the Trips table has no rows.
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    redshift_conn_id='redshift',
    table='trips',
    provide_context=True,
    dag=dag,
)

#
# Create a Facts table in Redshift: aggregate the `tripduration` fact column
# grouped by `bikeid`.
#
calculate_facts = FactsCalculatorOperator(
    # Fix: Airflow requires every operator to have a unique task_id, and
    # the task must be attached to the DAG like the sibling tasks above.
    task_id="calculate_facts_trips",
    dag=dag,
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="fact_table",
    fact_column="tripduration",
    groupby_column="bikeid")

#
# Task ordering: load, quality-check, then compute facts.
#
copy_trips_task >> check_trips >> calculate_facts
#
# Data quality check on the Trips table: HasRowsOperator fails if it is empty.
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    redshift_conn_id='redshift',
    table='trips',
    dag=dag,
)

#
# Use FactsCalculatorOperator to create a Facts table in Redshift.
# Fact column is `tripduration`, grouped by `bikeid`.
#
calculate_facts = FactsCalculatorOperator(task_id='calculate_facts_table',
                                          dag=dag,
                                          redshift_conn_id='redshift',
                                          origin_table='trips',
                                          destination_table='trips_facts',
                                          # Fix: keyword was misspelled `fact_columns`;
                                          # every other call in this file (and the
                                          # operator's signature) uses `fact_column`.
                                          fact_column='tripduration',
                                          groupby_column='bikeid')

#
# Task ordering: copy, then check quality, then build the facts table.
#
copy_trips_task >> check_trips >> calculate_facts

##################### Facts calculator #####################
import logging

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
                                    dag=customDag)

# Copy the Divvy trips CSV from S3 into the Redshift `trips` table
# (appends rather than truncating first).
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    dag=customDag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    truncate=False,
)

# Data quality gate: HasRowsOperator fails the run if `trips` is empty.
checkDataQuality = HasRowsOperator(
    task_id='checkDataQuality',
    dag=customDag,
    redshift_conn_id='redshift',
    table='trips',
)

# Create the fact table: aggregate `tripduration` grouped by `bikeid`.
createFactTable = FactsCalculatorOperator(task_id='createFactTable',
                                          # Fix: attach the task to the DAG —
                                          # every other task in this section
                                          # passes dag=customDag explicitly.
                                          dag=customDag,
                                          redshift_conn_id='redshift',
                                          origin_table='trips',
                                          destination_table='trip_facts',
                                          fact_column='tripduration',
                                          # Fix: keyword was `group_by_column`;
                                          # the operator (and every other call
                                          # in this file) uses `groupby_column`.
                                          groupby_column='bikeid')

# Task dependencies: create table -> load -> quality check -> fact table.
createTripsTable >> loadTripData
loadTripData >> checkDataQuality
checkDataQuality >> createFactTable