Example #1
    # [START load_function]
    def load(**kwargs):
        ti = kwargs['ti']
        total_value_string = ti.xcom_pull(task_ids='transform', key='total_order_value')
        total_order_value = json.loads(total_value_string)

        print(total_order_value)
    # [END load_function]

    # [START main_flow]
    extract_task = PythonOperator(
        task_id='extract',
        python_callable=extract,
    )
    extract_task.doc_md = """\
#### Extract task
A simple Extract task to get data ready for the rest of the data pipeline.
In this case, getting data is simulated by reading from a hardcoded JSON string.
This data is then put into xcom, so that it can be processed by the next task.
"""

    transform_task = PythonOperator(
        task_id='transform',
        python_callable=transform,
    )
    transform_task.doc_md = """\
#### Transform task
A simple Transform task which takes in the collection of order data from xcom
and computes the total order value.
This computed value is then put into xcom, so that it can be processed by the next task.
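
The excerpt above is cut off mid-docstring, and the `extract` and `transform` callables it wires up are not shown. A minimal sketch of what they might look like, assuming the XCom keys `order_data` and `total_order_value` implied by `load`:

import json

def extract(**kwargs):
    # Simulate extraction by hardcoding a JSON string and pushing it to XCom.
    ti = kwargs['ti']
    data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}'
    ti.xcom_push('order_data', data_string)

def transform(**kwargs):
    # Pull the raw order data, sum the values, and push the total back to XCom
    # as a JSON string, which is what load() expects to json.loads().
    ti = kwargs['ti']
    extract_data_string = ti.xcom_pull(task_ids='extract', key='order_data')
    order_data = json.loads(extract_data_string)
    total_order_value = sum(order_data.values())
    ti.xcom_push('total_order_value', json.dumps(total_order_value))
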
Example #2
        dag_id,
        default_args=default_args(),
        schedule_interval="10 * * * *",
        start_date=datetime(2021, 1, 1,
                            tzinfo=pendulum.timezone("Asia/Tokyo")),
) as dag:
    dag.doc_md = __doc__

    start = DummyOperator(task_id="start")

    a = PythonOperator(
        task_id="a",
        params={},
        python_callable=task_sample,
    )
    a.doc_md = task_sample.__doc__

    b = BranchPythonOperator(
        task_id="b",
        params={},
        python_callable=task_branch,
    )

    c = DummyOperator(task_id="c")

    d = DummyOperator(task_id="d")

    e = ShortCircuitOperator(
        task_id="e",
        params={},
        trigger_rule="none_failed",
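
The callables `task_sample` and `task_branch` are defined elsewhere in the source. A `BranchPythonOperator` callable must return the task_id (or list of task_ids) to follow; a minimal sketch, assuming `b` chooses between the `c` and `d` tasks above with made-up selection logic:

import random

def task_branch():
    # Return the task_id of the downstream task to run; the other branch is skipped.
    return "c" if random.random() < 0.5 else "d"
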
Example #3
         echo There are $NUM_TO_PROCESS files to process.
         test $NUM_TO_PROCESS -gt 0
         '''),
     params={'product': product},
 )
 # Thanks https://stackoverflow.com/questions/48580341/how-to-add-manual-tasks-in-an-apache-airflow-dag
 manual_sign_off = PythonOperator(
     task_id=f"manual_sign_off_{product}",
     python_callable=task_to_fail,
     retries=1,
     retry_delay=TIMEOUT,
 )
 manual_sign_off.doc_md = dedent("""
         ## Instructions
         Perform some manual checks that the number of COGs to be generated seems to be about right.
         
         You can also do spot checks that files don't already exist in S3.
         
         Once you're happy, mark this job as **Success** for the DAG to continue running.
     """)
 submit_task_id = f'submit_cog_convert_job_{product}'
 submit_bulk_cog_convert = SSHOperator(
     task_id=submit_task_id,
     command=dedent(COMMON + """
         cd {{work_dir}}
         mkdir out
         
         qsub <<EOF
         #!/bin/bash
         #PBS -l wd,walltime=5:00:00,mem=190GB,ncpus=48,jobfs=1GB
         #PBS -P {{params.project}}
         #PBS -q {{params.queue}}
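
`task_to_fail`, referenced by `manual_sign_off`, is not shown; per the linked Stack Overflow pattern it simply fails, so that a person can follow the doc_md instructions and mark the task instance as Success by hand. A minimal sketch under that assumption:

from airflow.exceptions import AirflowException

def task_to_fail():
    # Always fail: the DAG only continues once an operator has done the manual
    # checks described in doc_md and marked this task instance as Success.
    raise AirflowException("Awaiting manual sign-off; mark as Success to continue.")
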
Example #4
    df.to_sql(f'clean_{tablename}', con, if_exists='replace', index=False)


for table in tables:
    clean_data = PythonOperator(
        task_id=f'clean_data_{table}',
        python_callable=clean_data_df,
        op_kwargs={'tablename': table},
        dag=dag,
    )
    load_data >> clean_data

# [START documentation]
dag.doc_md = __doc__

load_data.doc_md = """\
#### Load Data
This task loads data from the csv files in the data directory (set via the
DATA_DIR environment variable) into the database Airflow creates.
"""

read_data.doc_md = """\
#### Read Data 
This task does nothing. It demonstrates how to use the SQLite operator.
"""

clean_data.doc_md = """\
#### Clean Data 
This task removes a column with pandas. It demonstrates how to alter data 
and write it back into the same table.
"""
Example #5
re_parse_authors_data = PythonOperator(
    task_id='re_parse_authors',
    dag=dag,
    provide_context=True,
    python_callable=helpers.load_authors,
    op_kwargs={
        'aws_credentials_id': 'aws_credentials',
        'redshift_connection_id': 'redshift',
        's3_credentials_id': 's3_credentials',
        'region': 'us-east-1',
        'bucket': 'arxiv-etl',
        'file_name': 'staging/authors/authors-parsed.json'
    },
)
re_parse_authors_data.doc_md = """
# Parses the data from S3 locally, re-formats it so that Redshift COPY can load it easily, then saves it back to S3
"""

stage_authors_to_redshift = StageFromS3ToRedshiftOperator(
    task_id='stage_authors',
    dag=dag,
    provide_context=True,
    table="staging.authors",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="arxiv-etl",
    s3_key="staging/authors/authors_parsed.csv",
    region="us-east-1",
    file_type="csv")
stage_authors_to_redshift.doc_md = """
Example #6
    dag=math_dag
)

t2 = PythonOperator(
    task_id="subtraction_task",
    python_callable=sub_nos,
    depends_on_past=False,
    retries=3,
    dag=math_dag
)

square_task = PythonOperator(
    task_id="square_task",
    python_callable=square_no,
    depends_on_past=True,
    retries=3,
    dag=math_dag
)

math_dag.doc_md = __doc__

t1.doc_md = """\
#### Addition Task Documentation
A simple task to add two numbers
![miztiik-success-green](https://img.shields.io/badge/Miztiik:Automation:Airflow:Level-300-blue)
"""

# Configure Task Dependencies
t1 >> t2
t1 >> square_task
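
The callables behind these tasks are not included in the excerpt. A minimal sketch of what they might look like, with the addition callable's name and all operands assumed:

def add_nos(a=3, b=4):
    # Body of the addition task documented above (name is an assumption).
    print(f"{a} + {b} = {a + b}")

def sub_nos(a=9, b=4):
    # Body of subtraction_task.
    print(f"{a} - {b} = {a - b}")

def square_no(a=5):
    # Body of square_task; with depends_on_past=True it only runs if its
    # previous scheduled run succeeded.
    print(f"{a} squared = {a * a}")
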
Example #7
    task_id='get_listings',
    python_callable=get_listings,
    dag=dag,
)

t2 = PythonOperator(
    task_id='send_email',
    provide_context=True,
    python_callable=send_email,
    dag=dag,
)

# noinspection PyStatementEffect
t1 >> t2

# Documentation
dag.doc_md = f"""
#### DAG Documentation
{dag.description}
"""

t1.doc_md = """
#### Task Documentation
Retrieves and stores Zoopla data
"""

t2.doc_md = """
#### Task Documentation
Sends email notification when new data is available
"""
Example #8
    dag=dag,
)

t2 = PythonOperator(
    task_id='Fetch_Data_and_Create_CSV',
    python_callable=task2,
    retries=3,
    dag=dag,
)

t3 = PythonOperator(
    task_id='Upload_Big_Query',
    python_callable=task3,
    retries=3,
    dag=dag,
)

dag.doc_md = __doc__

t1.doc_md = """\
            #### Task 1 : Install Requirements.
            Installs the requirements listed in requirements.txt.
            """

t2.doc_md = """\
            #### Task 2 : Fetch data from API & create a local csv.
            The API provides the state-wise change in Covid-19 cases every day.
            """

t1 >> t2 >> t3
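
Only `t1` and `t2` carry doc_md in this snippet; a matching docstring for the BigQuery upload task could look like this (wording assumed):

t3.doc_md = """\
            #### Task 3 : Upload the csv to BigQuery.
            Loads the state-wise csv created by Task 2 into a BigQuery table
            """
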
Example #9
        file.columns = file.columns.map(lambda x: x.replace('(', '').replace(
            ')', ''))  # strip the parenthesis characters from the column names
        engine = PostgresHook(
            postgres_conn_id='postgres_local').get_sqlalchemy_engine()
        file.to_sql('airflow_stg_mining_po',
                    con=engine,
                    index=True,
                    if_exists='replace',
                    schema='beeline')

    # read the file and write it into a staging table in the target database
    process_file = PythonOperator(task_id='process_file',
                                  provide_context=True,
                                  python_callable=process_xls_file)

    process_file.doc_md = """\
        #### Task Documentation
        You can document your task using the attributes `doc_md` (markdown),
        `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml`, which get
        rendered in the UI's Task Instance Details page.
        ![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
        """

    # update the target table
    update_target_table = PostgresOperator(task_id='update_target_table',
                                           sql='''
            insert into beeline.airflow_mining_po 
                select * from beeline.airflow_stg_mining_po
            on conflict do nothing;
        ''',
                                           postgres_conn_id='postgres_local',
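
Note that `on conflict do nothing` in `update_target_table` only deduplicates if `beeline.airflow_mining_po` carries a primary key or unique constraint to conflict on. If the target table is not created elsewhere, a task along these lines could precede the upsert (the column list is a placeholder):

    create_target_table = PostgresOperator(
        task_id='create_target_table',
        postgres_conn_id='postgres_local',
        sql='''
            create table if not exists beeline.airflow_mining_po (
                report_date date,
                site_id     text,
                volume      numeric,
                primary key (report_date, site_id)  -- conflict target for the upsert
            );
        ''')
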
Example #10
                           python_callable=createlog,
                           dag=dag)

ExtracttoDF = PythonOperator(task_id='sqlite_to_df',
                             python_callable=getdf,
                             dag=dag)

LoadTask = PythonOperator(task_id='Destinationdb',
                          python_callable=createdb,
                          dag=dag)

UpsertTask = PythonOperator(task_id='Destinationdb_Upsert',
                            python_callable=updatedb,
                            dag=dag)

dag.doc_md = __doc__

ExtracttoDF.doc_md = """\
Extract data from source DB
"""

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

[ExtracttoDF, CreateLog] >> LoadTask >> UpsertTask
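
`templated_command` is built here but never attached to a task in this excerpt. Jinja only renders it when it is assigned to a templated field; a typical hookup, with the task_id and `my_param` value assumed:

from airflow.operators.bash_operator import BashOperator

templated_task = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},  # value is an assumption
    dag=dag,
)
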
Example #11
    """
    message_task(SQL_CONN_STRING, KEY_WORDS, FREQ)


with DAG(
    'create_postgres_db',
    description="Creates Postgres DB for tweets if it doesn't already exist",
    schedule_interval="@once",
    default_args=default_args
) as create_pgdb_dag:

    create_db = PythonOperator(
        task_id='create_db', python_callable=create_postgres_db,
        dag=create_pgdb_dag
    )
    create_db.doc_md = """\
    #### CREATE PGDB
    Creates a database in Postgres for the transformed tweet data, \
    if one does not already exist
    """

    create_db


with DAG(
    'tweetl_dag',
    description='Performs ETL round and triggers slackbot',
    schedule_interval=timedelta(seconds=FREQ),
    catchup=False,
    default_args=default_args
) as tweetl_dag:
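
`create_postgres_db`, `message_task`, and the ETL callables referenced by these two DAGs live elsewhere in the project. A minimal sketch of `create_postgres_db`, assuming a psycopg2 connection and a database named `tweets` (both assumptions); `CREATE DATABASE` cannot run inside a transaction, so autocommit is switched on and existence is checked first:

import psycopg2

def create_postgres_db():
    # Connect to the maintenance database and create the tweets DB only if
    # it is not already there.
    conn = psycopg2.connect(host='localhost', dbname='postgres',
                            user='airflow', password='airflow')  # connection details are assumptions
    conn.autocommit = True
    with conn.cursor() as cur:
        cur.execute("SELECT 1 FROM pg_database WHERE datname = 'tweets'")
        if cur.fetchone() is None:
            cur.execute("CREATE DATABASE tweets")
    conn.close()
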