Example #1
def create_dag(
        dag_id=DAG_ID,
        args=DAG_DEFAULT_ARGS,
        concurrency=CONCURRENCY,
        max_active_runs=CONCURRENCY,
        postgres_conn_id=DB_CONN_ID,
):
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=max_active_runs,
        catchup=False,
        schedule_interval=None,
    )

    with dag:
        start_task = ops.get_log_operator(dag, dag.dag_id, 'Starting')
        run_task = operators.get_smithsonian_sub_provider_update_operator(
            dag,
            postgres_conn_id
        )
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')

        start_task >> run_task >> end_task

    return dag
Example #2
def create_dag(
        dag_id=DAG_ID,
        args=DAG_DEFAULT_ARGS,
        concurrency=CONCURRENCY,
        max_active_runs=CONCURRENCY,
        schedule_cron=SCHEDULE_CRON,
        postgres_conn_id=DB_CONN_ID,
):
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=max_active_runs,
        schedule_interval=schedule_cron,
        catchup=False
    )
    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        update_image_view = operators.update_image_view(
            dag, postgres_conn_id
        )
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> update_image_view >> end_task

    return dag
Example #3
def create_id_partitioned_cleaner_dag(
    dag_id=DAG_ID,
    prefix_length=PREFIX_LENGTH,
    postgres_conn_id=DB_CONN_ID,
    start_date=datetime(1970, 1, 1),
    concurrency=CONCURRENCY,
    default_args=config.DAG_DEFAULT_ARGS,
):
    args = deepcopy(default_args)
    args.update(start_date=start_date)
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=concurrency,
        schedule_interval=None,
        start_date=start_date,
        catchup=False,
    )
    hex_prefixes = pg_cleaner.hex_counter(prefix_length)
    with dag:
        cleaner_list = [
            _get_pg_cleaner_operator(dag, prefix, postgres_conn_id)
            for prefix in hex_prefixes
        ]
        start_task = operator_util.get_log_operator(dag, dag.dag_id, "Started")
        end_task = operator_util.get_log_operator(dag, dag.dag_id, "Ended")
        start_task >> cleaner_list >> end_task
    return dag
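The cleaner tasks above are partitioned by identifier prefix, one task per hexadecimal prefix of length prefix_length. The implementation of pg_cleaner.hex_counter is not shown on this page; the following is only a minimal sketch of what such a generator might look like, assuming it simply yields every lowercase hex string of the given length:

from itertools import product

def hex_counter(length):
    # Hypothetical stand-in for pg_cleaner.hex_counter: yield every
    # lowercase hexadecimal string of the given length, e.g.
    # '00', '01', ..., 'ff' for length=2.
    for digits in product('0123456789abcdef', repeat=length):
        yield ''.join(digits)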
Example #4
def create_dag(
    dag_id=DAG_ID,
    args=DAG_DEFAULT_ARGS,
    concurrency=CONCURRENCY,
    max_active_runs=CONCURRENCY,
    postgres_conn_id=DB_CONN_ID,
):
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=max_active_runs,
        catchup=False,
        schedule_interval=None,
    )

    with dag:
        start_task = ops.get_log_operator(dag, dag.dag_id, 'Starting')
        run_task_list = [
            operators.get_image_expiration_operator(dag, postgres_conn_id,
                                                    provider)
            for provider in sql.OLDEST_PER_PROVIDER
        ]
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')

        start_task >> run_task_list >> end_task

    return dag
Example #5
def create_dag():
    dag = DAG(dag_id=DAG_ID,
              default_args=DAG_DEFAULT_ARGS,
              start_date=datetime(2020, 1, 15),
              schedule_interval="0 16 15 * *",
              catchup=False)

    with dag:
        start_task = get_log_operator(dag, DAG_ID, "Starting")
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, "Finished")

        start_task >> run_task >> end_task

    return dag
Example #6
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        start_date=datetime(1970, 1, 1),
        schedule_interval='@daily',
        catchup=False,
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> run_task >> end_task

    return dag
Example #7
def create_dag():
    dag = DAG(dag_id=DAG_ID,
              default_args=DAG_DEFAULT_ARGS,
              start_date=datetime(2020, 1, 15),
              schedule_interval="0 16 15 * *",
              catchup=False)

    with dag:
        start_task = get_log_operator(dag, DAG_ID, "Starting")
        create_dir_task = get_creator_operator(dag)
        sync_tsvs_task = get_syncer_operator(dag)
        clean_tsvs_task = get_cleaner_operator(dag)
        empty_dir_task = get_deleter_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, "Finished")

        (start_task >> create_dir_task >> sync_tsvs_task >> clean_tsvs_task >>
         empty_dir_task >> end_task)
    return dag
Example #8
def create_dag(source,
               script_location,
               dag_id,
               crontab_str=None,
               default_args=DAG_DEFAULT_ARGS):

    dag = DAG(dag_id=dag_id,
              default_args=default_args,
              schedule_interval=crontab_str,
              catchup=False)

    with dag:
        start_task = get_log_operator(dag, source, 'starting')
        run_task = get_runner_operator(dag, source, script_location)
        end_task = get_log_operator(dag, source, 'finished')

        start_task >> run_task >> end_task

    return dag
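This factory is generic over the provider: it only needs a source name, the path of the script to run, a DAG id, and an optional cron string. A hedged usage sketch follows; the source name and script path are placeholders, not values taken from the repository:

# Hypothetical values; only the call pattern matters here.
example_workflow = create_dag(
    source='example_source',
    script_location='/path/to/provider_scripts/example_source.py',
    dag_id='example_source_workflow',
    crontab_str='0 16 * * *',  # leave as None for an unscheduled DAG
)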
Example #9
def create_dag(
    dag_id=DAG_ID,
    args=DAG_DEFAULT_ARGS,
    concurrency=CONCURRENCY,
    max_active_runs=CONCURRENCY,
    schedule_cron=SCHEDULE_CRON,
    postgres_conn_id=DB_CONN_ID,
):
    dag = DAG(dag_id=dag_id,
              default_args=args,
              concurrency=concurrency,
              max_active_runs=max_active_runs,
              schedule_interval=schedule_cron,
              catchup=False)
    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        drop_relations = operators.drop_image_popularity_relations(
            dag,
            postgres_conn_id,
        )
        drop_functions = operators.drop_image_popularity_functions(
            dag,
            postgres_conn_id,
        )
        create_metrics = operators.create_image_popularity_metrics(
            dag, postgres_conn_id)
        update_metrics = operators.update_image_popularity_metrics(
            dag, postgres_conn_id)
        create_percentile = operators.create_image_popularity_percentile(
            dag, postgres_conn_id)
        create_constants = operators.create_image_popularity_constants(
            dag, postgres_conn_id)
        create_popularity = operators.create_image_standardized_popularity(
            dag, postgres_conn_id)
        create_image_view = operators.create_image_view(dag, postgres_conn_id)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        (start_task >> [drop_relations, drop_functions] >> create_metrics >>
         [update_metrics, create_percentile] >> create_constants >>
         create_popularity >> create_image_view >> end_task)

    return dag
Example #10
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        # It is important that we don't run the Flickr job in parallel;
        # Otherwise, we might blow through the rate limit
        concurrency=1,
        max_active_runs=1,
        # Flickr has a few images which claim to be uploaded at Unix
        # Timestamp 0 (1 Jan 1970)
        start_date=datetime(1970, 1, 1),
        schedule_interval='@daily',
        catchup=False,
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> run_task >> end_task

    return dag
Example #11
def create_day_partitioned_ingestion_dag(
    dag_id,
    main_function,
    reingestion_day_list_list,
    start_date=datetime(1970, 1, 1),
    concurrency=1,
    default_args=conf.DAG_DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=23),
    ingestion_task_timeout=timedelta(hours=2)
):
    """
    Given a `main_function` and `reingestion_day_list_list`, this
    factory method instantiates a DAG that will run the given
    `main_function`, parameterized by a number of dates, whose
    calculation is described below.

    Required Arguments:

    dag_id:                     string giving a unique id of the DAG to
                                be created.
    main_function:              python function to be run. The
                                function must take a single parameter
                                (date) which will be a string of the
                                form 'YYYY-MM-DD'.
    reingestion_day_list_list:  list of lists of integers. It gives the
                                set of days before the current execution
                                date of the DAG for which the
                                `main_function` should be run, and
                                describes how the calls to the function
                                should be prioritized.

    Optional Arguments:

    start_date:              datetime.datetime giving the
                             first valid execution_date of the DAG.
    concurrency:             integer that sets the number of tasks which
                             can run simultaneously for this DAG. It's
                             important to keep the rate limits of the
                             Provider API in mind when setting this
                             parameter.
    default_args:            dictionary which is passed to the
                             airflow.dag.DAG __init__ method.
    dagrun_timeout:          datetime.timedelta giving the total amount
                             of time a given dagrun may take.
    ingestion_task_timeout:  datetime.timedelta giving the amount of
                             time a call to the `main_function` is
                             allowed to take.

    Calculation of ingestion dates:

    The `reingestion_day_list_list` should have the form
        [
            [int, ..., int],
            [int, ..., int],
            ...,
            [int, ..., int]
        ]
    It's not necessary for the inner lists to be the same length. The
    DAG instantiated by this factory method will first run the
    `main_function` for the current execution_date, then for the current
    date minus the number of days given by integers in the first list
    (in an arbitrary order, and possibly in parallel if so configured),
    then for the dates calculated from the second list, and so on.  For
    example, given the `reingestion_day_list_list`
        [
            [1, 2, 3],
            [8, 13, 18],
            [28, 38, 48]
        ],
    and assuming the current execution date is 2020-01-01, the
    instantiated dag will run the `main_function` with the parameters
        [
            ['2020-01-01'],
            ['2019-12-31', '2019-12-30', '2019-12-29'],
            ['2019-12-24', '2019-12-19', '2019-12-14'],
            ['2019-12-04', '2019-11-24', '2019-11-14']
        ].
    The order of the inner lists gives the order in which sets of dates
    may be run.  The order within the inner lists is not relevant.  The
    size of the inner lists does *not* set the number of simultaneous
    executions of the `main_function` allowed; that is set by the
    `concurrency` parameter.
    """
    args = deepcopy(default_args)
    args.update(start_date=start_date)
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=concurrency,
        dagrun_timeout=dagrun_timeout,
        schedule_interval='@daily',
        start_date=start_date,
        catchup=False,
    )
    with dag:
        ingest_operator_list_list = _build_ingest_operator_list_list(
            reingestion_day_list_list,
            dag,
            main_function,
            ingestion_task_timeout
        )
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')
        for i in range(len(ingest_operator_list_list) - 1):
            wait_operator = ops.get_wait_till_done_operator(
                dag,
                f'wait_L{i}'
            )
            cross_downstream(
                ingest_operator_list_list[i],
                [
                    wait_operator,
                    end_task
                ]
            )
            wait_operator >> ingest_operator_list_list[i + 1]
        ingest_operator_list_list[-1] >> end_task

    return dag
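The date arithmetic the docstring describes can be sketched independently of Airflow: each integer in reingestion_day_list_list is a number of days subtracted from the execution date. The helper below is hypothetical (it is not part of the factory), shown only to make the calculation concrete:

from datetime import datetime, timedelta

def compute_reingestion_dates(execution_date_str, reingestion_day_list_list):
    # The execution date itself comes first, then one list of dates per
    # inner list of day offsets, mirroring the docstring's example.
    execution_date = datetime.strptime(execution_date_str, '%Y-%m-%d')
    date_lists = [[execution_date_str]]
    for day_list in reingestion_day_list_list:
        date_lists.append([
            (execution_date - timedelta(days=days)).strftime('%Y-%m-%d')
            for days in day_list
        ])
    return date_lists

# compute_reingestion_dates('2020-01-01', [[1, 2, 3], [8, 13, 18], [28, 38, 48]])
# returns the nested date lists shown in the docstring above.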
Example #12
def create_provider_api_workflow(
        dag_id,
        main_function,
        default_args=conf.DAG_DEFAULT_ARGS,
        start_date=datetime(1970, 1, 1),
        concurrency=1,
        schedule_string='@daily',
        dated=True,
        day_shift=0,
        dagrun_timeout=timedelta(minutes=30),
):
    """
    This factory method instantiates a DAG that will run the given
    `main_function`.

    Required Arguments:

    dag_id:         string giving a unique id of the DAG to be created.
    main_function:  python function to be run. If the optional argument
                    `dated` is True, then the function must take a
                    single parameter (date) which will be a string of
                    the form 'YYYY-MM-DD'. Otherwise, the function
                    should take no arguments.

    Optional Arguments:

    default_args:     dictionary which is passed to the airflow.dag.DAG
                      __init__ method.
    start_date:       datetime.datetime giving the first valid execution
                      date of the DAG.
    concurrency:      integer that sets the number of tasks which can
                      run simultaneously for this DAG, and the number of
                      dagruns of this DAG which can be run in parallel.
                      It's important to keep the rate limits of the
                      Provider API in mind when setting this parameter.
    schedule_string:  string giving the schedule on which the DAG should
                      be run.  Passed to the airflow.dag.DAG __init__
                      method.
    dated:            boolean giving whether the `main_function` takes a
                      string parameter giving a date (i.e., the date for
                      which data should be ingested).
    day_shift:        integer giving the number of days before the
                      current execution date the `main_function` should
                      be run (if `dated=True`).
    dagrun_timeout:   datetime.timedelta giving the total amount of time
                      a given dagrun may take.
    """
    args = deepcopy(default_args)
    args.update(start_date=start_date)
    print(args)
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=concurrency,
        dagrun_timeout=dagrun_timeout,
        start_date=start_date,
        schedule_interval=schedule_string,
        catchup=False,
    )

    with dag:
        start_task = ops.get_log_operator(dag, dag.dag_id, 'Starting')
        if dated:
            run_task = ops.get_dated_main_runner_operator(
                dag,
                main_function,
                dagrun_timeout,
                day_shift=day_shift
            )
        else:
            run_task = ops.get_main_runner_operator(dag, main_function)
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')

        start_task >> run_task >> end_task

    return dag
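A provider module would typically call this factory at import time and expose the returned DAG as a module-level variable so the Airflow scheduler can discover it. A minimal sketch follows, with a hypothetical ingest_example_records function standing in for a real provider's main function:

from datetime import datetime

def ingest_example_records(date):
    # Placeholder main function: fetch and store one day's worth of
    # records for the hypothetical provider.
    print(f'Ingesting example provider records for {date}')

example_provider_workflow = create_provider_api_workflow(
    dag_id='example_provider_workflow',
    main_function=ingest_example_records,
    start_date=datetime(2020, 1, 1),
    schedule_string='@daily',
    dated=True,
)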