Example #1
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #
    check_trips = HasRowsOperator(
        task_id="check_trips_data",
        dag=dag,
        redshift_conn_id="redshift",
        table="trips"
    )

    check_stations = HasRowsOperator(
        task_id="check_stations_data",
        dag=dag,
        redshift_conn_id="redshift",
        table="stations"
    )

    create_task >> copy_task
    #
    # TODO: Use DAG ordering to place the check task
    #
    copy_task >> check_trips
    copy_task >> check_stations

    return dag
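
How a factory like get_s3_to_redshift_dag is attached to a parent DAG is only visible in the truncated snippets of Examples #6 and #8 below, so here is a minimal sketch of that wiring, assuming Airflow 1.x and illustrative names taken from the examples (lesson3.exercise3, udac-data-pipelines, the sql_statements helper module). Note that the subdag's dag_id must be "<parent_dag_id>.<task_id>", which is why the factory receives parent_dag_name and task_id.

import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

import sql_statements  # exercise helper module; CREATE_TRIPS_TABLE_SQL is an assumed analogue of CREATE_STATIONS_TABLE_SQL

start_date = datetime.datetime(2018, 1, 1)

dag = DAG("lesson3.exercise3", start_date=start_date)

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",       # parent_dag_name: must match the parent dag_id
        trips_task_id,             # task_id: becomes the ".<task_id>" suffix of the subdag
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,     # forwarded into DAG(**kwargs) by the factory
    ),
    task_id=trips_task_id,
    dag=dag,
)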
Example #2
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    check_task = HasRowsOperator(task_id=f"check_{table}_data",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    create_task >> copy_task
    copy_task >> check_task
    return dag
Example #3
def load_dim_table_dag(parent_dag_name, task_id, redshift_conn_id, table,
                       create_sql_stmt, select_stmt, append_rows, *args,
                       **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    insert_sql = """
        INSERT INTO {}
        {}
        ;
    """

    #create_dimtable_task = DummyOperator(task_id="create_{}_table".format(table),  dag=dag)
    create_dimtable_task = PostgresOperator(task_id=f"create_{table}_table",
                                            dag=dag,
                                            postgres_conn_id=redshift_conn_id,
                                            sql=create_sql_stmt)

    #insert_to_table = DummyOperator(task_id="insert_into_{}".format(table),  dag=dag)
    #insert_to_table = PostgresOperator(
    #    task_id=f"insert_into_{table}",
    #    dag=dag,
    #    postgres_conn_id=redshift_conn_id,
    #    sql=insert_sql.format(
    #        table,
    #        select_stmt
    #    )
    #)
    insert_to_table = LoadDimensionOperator(task_id=f"insert_into_{table}",
                                            dag=dag,
                                            redshift_conn_id="redshift",
                                            table=table,
                                            sql_source=select_stmt,
                                            append_rows=append_rows)

    #check_task = DummyOperator(task_id="check_{}_data".format(table),  dag=dag)
    check_task = HasRowsOperator(task_id=f"check_{table}_data",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    create_dimtable_task >> insert_to_table
    insert_to_table >> check_task

    return dag
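
LoadDimensionOperator in Example #3 is a custom plugin whose source is not included in these snippets. The following is a hedged sketch of what such an operator might look like, inferring the truncate-vs-append behaviour from the append_rows argument and assuming Airflow 1.x plugin conventions; the project's actual implementation may differ.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class LoadDimensionOperator(BaseOperator):
    """Sketch: run a SELECT into a dimension table, optionally truncating first."""

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", sql_source="",
                 append_rows=False, *args, **kwargs):
        super(LoadDimensionOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.sql_source = sql_source
        self.append_rows = append_rows

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if not self.append_rows:
            # Truncate-and-load: clear the dimension table before inserting
            self.log.info(f"Clearing data from dimension table {self.table}")
            redshift.run(f"DELETE FROM {self.table}")
        self.log.info(f"Inserting data into dimension table {self.table}")
        redshift.run(f"INSERT INTO {self.table} {self.sql_source}")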
Example #4
def get_s3_to_redshift_subdag(parent_dag_name, task_id, redshift_conn_id,
                              aws_credentials_id, create_tbl, target_table,
                              sql_row, s3_bucket, s3_key, custom, *args,
                              **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    templated_command = """
    echo "**************** {{ task.owner }}, {{ task.task_id }}"
    echo "**************** The execution date : {{ ds }}"
    echo "**************** {{ task_instance_key_str }} is running"
    """

    info_task = BashOperator(
        task_id=f"Info_about_{parent_dag_name}.{task_id}",
        dag=dag,
        depends_on_past=False,
        bash_command=templated_command,
    )

    copy_task = StageToRedshiftOperator(
        task_id=f"copy_{target_table}_from_s3_to_redshift",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        create_tbl=create_tbl,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        custom=custom)

    check_staging = HasRowsOperator(
        task_id=f"check_{target_table}_rows",
        dag=dag,
        sql_row=sql_row,
        redshift_conn_id=redshift_conn_id,
        target_table=target_table,
    )

    info_task >> copy_task
    copy_task >> check_staging
    return dag
Example #5
    ingest_fact_transaction_table = IngestOperator(
        task_id='ingest_fact_transaction_table',
        schema='public',
        table='fact_transaction',
        create_sql=CreateSQL.create_fact_transaction,
        insert_sql=(InsertSQL.insert_fact_transaction.format(
            tstamp='{{ ts_nodash }}')),
        has_rows=(
            f"{{{{ {xcom_template.format(id='extract_transaction_data')} }}}}"
        ),
    )

    data_check_dim_country = HasRowsOperator(
        task_id='data_check_dim_country',
        schema='public',
        table='dim_country',
        date_filter='{{ ds }}',
        has_rows=(
            f"{{{{ {xcom_template.format(id='extract_country_data')} }}}}"),
    )

    data_check_dim_date = HasRowsOperator(
        task_id='data_check_dim_date',
        schema='public',
        table='dim_date',
        date_filter='{{ ds }}',
        has_rows=(
            f"{{{{ {xcom_template.format(id='extract_transaction_data')} }}}}"
        ),
    )

    data_check_dim_fx_rate = HasRowsOperator(
Example #6
        "stations",
        sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
        start_date=start_date,
    ),
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Consolidate check_trips and check_stations into a single check in the subdag
#       as we did with the create and copy in the demo
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

check_stations = HasRowsOperator(task_id="check_stations_data",
                                 dag=dag,
                                 redshift_conn_id="redshift",
                                 table="stations")

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL)

#
# TODO: Reorder the Graph once you have moved the checks
#
Example #7
#
# 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#    a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#       skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

copy_trips_task = S3ToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    task_id='copy_trips_data_from_s3_to_redshift',
    dag=dag)

check_trips_task = HasRowsOperator(redshift_conn_id='redshift',
                                   table='trips',
                                   task_id='check_trips_table_data',
                                   dag=dag)

calculate_facts_task = FactsCalculatorOperator(redshift_conn_id='redshift',
                                               origin_table='trips',
                                               destination_table='trips_facts',
                                               fact_column='tripduration',
                                               groupby_column='bikeid',
                                               task_id='calculate_facts_trips',
                                               dag=dag)

copy_trips_task >> check_trips_task
check_trips_task >> calculate_facts_task
Example #8
        sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
        start_date=start_date,
    ),
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Consolidate check_trips and check_stations into a single check in the subdag
#       as we did with the create and copy in the demo
#
check_trips = HasRowsOperator(
    task_id="check_trips_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

check_stations = HasRowsOperator(
    task_id="check_stations_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="stations"
)

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL
)
Example #9
        **kwargs
    )

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    check_rows = HasRowsOperator(
        task_id=f"check_{table}_rows",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table
    )

    create_task >> copy_task >> check_rows

    return dag
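
Every example above relies on a custom HasRowsOperator plugin, but none of the snippets show its definition. Below is a minimal sketch of the simple variant that takes only redshift_conn_id and table (as in Examples #1, #2, #3, #6, #7, #8 and #9); the variants in Examples #4 and #5 clearly accept extra arguments (sql_row, schema, date_filter, has_rows) that this sketch does not cover, so treat it as an assumption rather than the projects' actual code.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    """Sketch of a data-quality check: fail the task if the table has no rows."""

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Data quality check failed: {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(f"Data quality check failed: {self.table} contains 0 rows")
        self.log.info(f"Data quality check on {self.table} passed with {num_records} records")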