Example #1
    # [START upload_sheet_to_gcs]
    upload_sheet_to_gcs = GoogleSheetsToGCSOperator(
        task_id="upload_sheet_to_gcs",
        destination_bucket=GCS_BUCKET,
        spreadsheet_id=SPREADSHEET_ID,
    )
    # [END upload_sheet_to_gcs]

    # [START create_spreadsheet]
    create_spreadsheet = GoogleSheetsCreateSpreadsheetOperator(
        task_id="create_spreadsheet", spreadsheet=SPREADSHEET)
    # [END create_spreadsheet]

    # [START print_spreadsheet_url]
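    # Echo the spreadsheet_url value that create_spreadsheet pushes to XCom (read through its .output XComArg).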
    print_spreadsheet_url = BashOperator(
        task_id="print_spreadsheet_url",
        bash_command=f"echo {create_spreadsheet.output['spreadsheet_url']}",
    )
    # [END print_spreadsheet_url]

    # [START upload_gcs_to_sheet]
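    # upload_sheet_to_gcs pushes the list of uploaded GCS object names to XCom; [0] selects the first one.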
    upload_gcs_to_sheet = GCSToGoogleSheetsOperator(
        task_id="upload_gcs_to_sheet",
        bucket_name=GCS_BUCKET,
        object_name="{{ task_instance.xcom_pull('upload_sheet_to_gcs')[0] }}",
        spreadsheet_id=NEW_SPREADSHEET_ID,
    )
    # [END upload_gcs_to_sheet]

    create_spreadsheet >> print_spreadsheet_url
    upload_sheet_to_gcs >> upload_gcs_to_sheet
    "retry_delay": timedelta(minutes=5),
}


def _wait_for_file():
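    """Sensor callable: return True once the wait file exists on disk."""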
    return os.path.exists("/opt/airflow/data/wait.txt")


with DAG(
        "08_sensor",
        default_args=default_args,
        description="A simple tutorial DAG",
        schedule_interval=timedelta(days=1),
) as dag:
    t1 = BashOperator(
        task_id="touch_file_1",
        bash_command="touch /opt/airflow/data/1.txt",
    )

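    # Re-run _wait_for_file every 10 seconds (poke mode) until it returns True or the 6000-second timeout expires.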
    wait = PythonSensor(
        task_id="wait_for_file",
        python_callable=_wait_for_file,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    t3 = BashOperator(
        task_id="touch_file_3",
        depends_on_past=True,
        bash_command="touch /opt/airflow/data/2.txt",
Example #3
        schedule_interval='@once',
        start_date=START_DATE,
        catchup=False,
        tags=["example"],
) as build_dag:

    # [START howto_operator_create_build_from_storage]
    create_build_from_storage = CloudBuildCreateBuildOperator(
        task_id="create_build_from_storage",
        project_id=GCP_PROJECT_ID,
        build=create_build_from_storage_body)
    # [END howto_operator_create_build_from_storage]

    # [START howto_operator_create_build_from_storage_result]
    create_build_from_storage_result = BashOperator(
        bash_command=f"echo { create_build_from_storage.output['results'] }",
        task_id="create_build_from_storage_result",
    )
    # [END howto_operator_create_build_from_storage_result]

    # [START howto_operator_create_build_from_repo]
    create_build_from_repo = CloudBuildCreateBuildOperator(
        task_id="create_build_from_repo",
        project_id=GCP_PROJECT_ID,
        build=create_build_from_repo_body)
    # [END howto_operator_create_build_from_repo]

    # [START howto_operator_create_build_from_repo_result]
    create_build_from_repo_result = BashOperator(
        bash_command=f"echo { create_build_from_repo.output['results'] }",
        task_id="create_build_from_repo_result",
    )
Example #4
dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          start_date=days_ago(2),
          dagrun_timeout=timedelta(minutes=60),
          tags=['example'])

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# [START howto_operator_bash]
run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag,
)
# [END howto_operator_bash]

run_this >> run_this_last

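# Create three independent tasks (runme_0..runme_2), each running upstream of run_after_loop.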
for i in range(3):
    task = BashOperator(
        task_id='runme_' + str(i),
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag,
    )
    task >> run_this

# [START howto_operator_bash_template]
Example #5
) as dag:

    # [START howto_operator_video_intelligence_detect_labels]
    detect_video_label = CloudVideoIntelligenceDetectVideoLabelsOperator(
        input_uri=INPUT_URI,
        output_uri=None,
        video_context=None,
        timeout=5,
        task_id="detect_video_label",
    )
    # [END howto_operator_video_intelligence_detect_labels]

    # [START howto_operator_video_intelligence_detect_labels_result]
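    # Pull the detection response from XCom and echo the entity of the first shot label annotation.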
    detect_video_label_result = BashOperator(
        bash_command="echo {{ task_instance.xcom_pull('detect_video_label')"
        "['annotationResults'][0]['shotLabelAnnotations'][0]['entity']}}",
        task_id="detect_video_label_result",
    )
    # [END howto_operator_video_intelligence_detect_labels_result]

    # [START howto_operator_video_intelligence_detect_explicit_content]
    detect_video_explicit_content = CloudVideoIntelligenceDetectVideoExplicitContentOperator(
        input_uri=INPUT_URI,
        output_uri=None,
        video_context=None,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id="detect_video_explicit_content",
    )
    # [END howto_operator_video_intelligence_detect_explicit_content]
Example #6

@task
def print_value(value, ts=None):
    """Dummy function"""
    log.info("The knights of Ni say: %s (at %s)", value, ts)


with DAG(
        dag_id='example_xcom_args',
        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
        catchup=False,
        schedule_interval=None,
        tags=['example'],
) as dag:
    print_value(generate_value())

with DAG(
        "example_xcom_args_with_operators",
        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
        catchup=False,
        schedule_interval=None,
        tags=['example'],
) as dag2:
    bash_op1 = BashOperator(task_id="c", bash_command="echo c")
    bash_op2 = BashOperator(task_id="d", bash_command="echo c")
    xcom_args_a = print_value("first!")
    xcom_args_b = print_value("second!")

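    # TaskFlow-decorated tasks return XComArgs, which can be chained with classic operators via >>.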
    bash_op1 >> xcom_args_a >> xcom_args_b >> bash_op2
Example #7
        schedule_interval=None,
        tags=['example'],
) as dag:
    create_bucket1 = GCSCreateBucketOperator(task_id="create_bucket1",
                                             bucket_name=BUCKET_1,
                                             project_id=PROJECT_ID)

    create_bucket2 = GCSCreateBucketOperator(task_id="create_bucket2",
                                             bucket_name=BUCKET_2,
                                             project_id=PROJECT_ID)

    list_buckets = GCSListObjectsOperator(task_id="list_buckets",
                                          bucket=BUCKET_1)

    list_buckets_result = BashOperator(
        task_id="list_buckets_result",
        bash_command="echo \"{{ task_instance.xcom_pull('list_buckets') }}\"",
    )

    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=BUCKET_FILE_LOCATION,
        bucket=BUCKET_1,
    )

    transform_file = GCSFileTransformOperator(
        task_id="transform_file",
        source_bucket=BUCKET_1,
        source_object=BUCKET_FILE_LOCATION,
        transform_script=["python", PATH_TO_TRANSFORM_SCRIPT])
    # [START howto_operator_gcs_bucket_create_acl_entry_task]
Example #8
with DAG(
        "example_passing_params_via_test_command",
        schedule_interval='*/1 * * * *',
        start_date=days_ago(1),
        dagrun_timeout=timedelta(minutes=4),
        tags=['example'],
) as dag:

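    # params.foo is expected from the CLI (the DAG is meant to be exercised via the task test command);
    # params.miff is supplied on the tasks below.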
    my_templated_command = dedent("""
        echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
        echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
    """)

    run_this = PythonOperator(
        task_id='run_this',
        python_callable=my_py_command,
        params={"miff": "agg"},
    )

    also_run_this = BashOperator(
        task_id='also_run_this',
        bash_command=my_templated_command,
        params={"miff": "agg"},
    )

    env_var_test_task = PythonOperator(task_id='env_var_test_task',
                                       python_callable=print_env_vars)

    run_this >> also_run_this
Example #9
        date = str(ds)
        prices = prices_json['bpi'][date]
        df = pd.DataFrame(views_json['items'])
        df[['bitcoin_price_index']] = prices

        # convert DF to CSV and store in /tmp/home/bitcoin
        df.to_csv(f"/tmp/home/bitcoin/viewsAndFiles_{ds_nodash}.csv",
                  index=False)


with DAG(dag_id="bitcoin-views-and-price-pipeline",
         start_date=airflow.utils.dates.days_ago(5),
         schedule_interval="@daily") as dag:

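    # Fetch one day of Wikipedia pageviews for "Bitcoin"; ds / ds_nodash are templated to the run's logical date.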
    fetch_bitcoin_views = BashOperator(
        task_id="fetch_daily_bitcoin_views",
        bash_command=
        "curl -o /tmp/views_{{ ds }}.json -L 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Bitcoin/daily/{{ ds_nodash }}00/{{ ds_nodash }}00'"
    )

    fetch_bitcoin_prices = BashOperator(
        task_id="fetch_daily_bitcoin_prices",
        bash_command=
        "curl -o /tmp/prices_{{ ds }}.json -L 'https://api.coindesk.com/v1/bpi/historical/close.json?start={{ ds }}&end={{ ds }}'"
    )

    save_to_csv = PythonOperator(task_id="save_to_csv",
                                 python_callable=_save_to_csv,
                                 provide_context=True)

    [fetch_bitcoin_views, fetch_bitcoin_prices] >> save_to_csv
Example #10
        "example_gcp_cloud_build",
        default_args=dict(start_date=dates.days_ago(1)),
        schedule_interval='@once',
        tags=['example'],
) as dag:
    # [START howto_operator_create_build_from_storage]
    create_build_from_storage = CloudBuildCreateBuildOperator(
        task_id="create_build_from_storage",
        project_id=GCP_PROJECT_ID,
        body=create_build_from_storage_body)
    # [END howto_operator_create_build_from_storage]

    # [START howto_operator_create_build_from_storage_result]
    create_build_from_storage_result = BashOperator(
        bash_command=
        "echo '{{ task_instance.xcom_pull('create_build_from_storage')['images'][0] }}'",
        task_id="create_build_from_storage_result",
    )
    # [END howto_operator_create_build_from_storage_result]

    create_build_from_repo = CloudBuildCreateBuildOperator(
        task_id="create_build_from_repo",
        project_id=GCP_PROJECT_ID,
        body=create_build_from_repo_body)

    create_build_from_repo_result = BashOperator(
        bash_command=
        "echo '{{ task_instance.xcom_pull('create_build_from_repo')['images'][0] }}'",
        task_id="create_build_from_repo_result",
    )
Example #11
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator

dag = DAG(
    dag_id="09_no_catchup",
    schedule_interval="@daily",
    start_date=dt.datetime(year=2019, month=1, day=1),
    end_date=dt.datetime(year=2019, month=1, day=5),
    catchup=False,
)

fetch_events = BashOperator(
    task_id="fetch_events",
    bash_command=("mkdir -p /data/events && "
                  "curl -o /data/events/{{ds}}.json "
                  "http://events_api:5000/events?"
                  "start_date={{ds}}&"
                  "end_date={{next_ds}}"),
    dag=dag,
)


def _calculate_stats(**context):
    """Calculates event statistics."""
    input_path = context["templates_dict"]["input_path"]
    output_path = context["templates_dict"]["output_path"]

    events = pd.read_json(input_path)
    stats = events.groupby(["date", "user"]).size().reset_index()

    Path(output_path).parent.mkdir(exist_ok=True)
Example #12
) as dag:

    with open(dag.params["region_cfg"], 'r') as stream:
        regions = yaml.safe_load(stream)

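    # Build a per-run output directory from the timestamp of the DAG's most recent execution date.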
    last_exec_date = dag.get_latest_execution_date()

    if last_exec_date is None:
        last_exec_date = datetime.datetime(year=1970, month=1, day=1)

    unique_id = str(round(last_exec_date.timestamp()))
    directory_output = WORKING_DIR + "/data/exports/whole-genome-clades/" + unique_id + "/"

    mk_dir_task = BashOperator(
        task_id='make_directory',
        bash_command='mkdir -p {{params.directory_output}}',
        params={"directory_output": directory_output},
        dag=dag,
    )

    clades = [
        "B.1.2", "B.1.596", "B.1", "B.1.1.519", "B.1.243", "B.1.234",
        "B.1.526.1", "B.1.1", "B.1.526.2", "B.1.575", "R.1", "B.1.1.7",
        "B.1.429", "B.1.427", "B.1.351", "P.1", "B.1.526", "P.2", "B.1.525",
        "B.1.617", "B.1.617.1", "B.1.617.2"
    ]

    for clade in clades:

        params = {}

        params[
Example #13
        task_id='is_forex_currencies_file_available',
        fs_conn_id='forex_path',
        filepath='forex_currencies.csv',
        poke_interval=5,
        timeout=20
    )

    downloading_rates = PythonOperator(
        task_id='downloading_rates',
        python_callable=_download_rates
    )

    saving_rates = BashOperator(
        task_id='saving_rates',
        bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
        """
    )

    creating_forex_rates_table = HiveOperator(
        task_id="creating_forex_rates_table",
        hive_cli_conn_id="hive_default",
        hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates(
                base STRING,
                last_update DATE,
                eur DOUBLE,
                usd DOUBLE,
                nzd DOUBLE,
                gbp DOUBLE,
Example #14
            location=location,
        )

        # [START howto_operator_bigquery_get_data]
        get_data = BigQueryGetDataOperator(
            task_id="get_data",
            dataset_id=DATASET_NAME,
            table_id=TABLE_1,
            max_results=10,
            selected_fields="value,name",
            location=location,
        )
        # [END howto_operator_bigquery_get_data]

        get_data_result = BashOperator(
            task_id="get_data_result",
            bash_command=f"echo {get_data.output}",
        )

        # [START howto_operator_bigquery_check]
        check_count = BigQueryCheckOperator(
            task_id="check_count",
            sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
            use_legacy_sql=False,
            location=location,
        )
        # [END howto_operator_bigquery_check]

        # [START howto_operator_bigquery_value_check]
        check_value = BigQueryValueCheckOperator(
            task_id="check_value",
            sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
Example #15
with DAG('pipeline',
         start_date=datetime(2022, 3, 28),
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    is_csv_available = FileSensor(task_id='is_csv_available',
                                  fs_conn_id='path',
                                  filepath="owid-covid-data.csv",
                                  poke_interval=5,
                                  timeout=20)

    push_to_hive = BashOperator(task_id="push_to_hive",
                                bash_command="""
            hdfs dfs -mkdir -p /covidData && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/owid-covid-data.csv /covidData
        """)

    create_hive_table = HiveOperator(task_id='create_hive_table',
                                     hive_cli_conn_id='hive_conn',
                                     hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS cov_data(
                iso_code STRING,
                continent STRING,
                location STRING,
                `date` STRING,
                total_cases BIGINT,
                new_cases BIGINT,
                new_cases_smoothed FLOAT,
                total_deaths BIGINT,
Example #16
    "email": "*****@*****.**",
    "start_date": datetime(2021, 3, 12, 17),
    "depends_on_past": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "email_on_retry": False,
    "email_on_failure": IS_PROD
}

with DAG("gojek-assignment",
         default_args=DEFAULT_ARGS,
         schedule_interval="0 22 * * *",
         max_active_runs=1,
         catchup=True,
         dagrun_timeout=timedelta(minutes=90)) as dag:
    q1_job = BashOperator(task_id="gojek-assignment_q1",
                          bash_command="python q1.py",
                          dag=dag,
                          execution_timeout=timedelta(minutes=60),
                          retry_delay=timedelta(minutes=10),
                          retries=2)

    q2_job = BashOperator(task_id="gojek-assignment_q2",
                          bash_command="python q2.py",
                          dag=dag,
                          execution_timeout=timedelta(minutes=60),
                          retry_delay=timedelta(minutes=10),
                          retries=2)

    q1_job >> q2_job
Example #17
    )
    # [END howto_operator_gcp_pubsub_create_subscription]

    # [START howto_operator_gcp_pubsub_pull_message_with_sensor]
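    # The subscription name is pulled at render time from the XCom pushed by the subscribe_task.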
    subscription = "{{ task_instance.xcom_pull('subscribe_task') }}"

    pull_messages = PubSubPullSensor(
        task_id="pull_messages",
        ack_messages=True,
        project_id=GCP_PROJECT_ID,
        subscription=subscription,
    )
    # [END howto_operator_gcp_pubsub_pull_message_with_sensor]

    # [START howto_operator_gcp_pubsub_pull_messages_result]
    pull_messages_result = BashOperator(task_id="pull_messages_result", bash_command=echo_cmd)
    # [END howto_operator_gcp_pubsub_pull_messages_result]

    # [START howto_operator_gcp_pubsub_publish]
    publish_task = PubSubPublishMessageOperator(
        task_id="publish_task",
        project_id=GCP_PROJECT_ID,
        topic=TOPIC_FOR_SENSOR_DAG,
        messages=[MESSAGE] * 10,
    )
    # [END howto_operator_gcp_pubsub_publish]

    # [START howto_operator_gcp_pubsub_unsubscribe]
    unsubscribe_task = PubSubDeleteSubscriptionOperator(
        task_id="unsubscribe_task",
        project_id=GCP_PROJECT_ID,
Example #18
from datetime import datetime
from pathlib import Path

import pandas as pd
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator

dag = DAG(dag_id="01_unscheduled",
          start_date=datetime(2019, 1, 1),
          schedule_interval=None)

fetch_events = BashOperator(
    task_id="fetch_events",
    bash_command=("mkdir -p /data/events && "
                  "curl -o /data/events.json http://events_api:5000/events"),
    dag=dag,
)


def _calculate_stats(input_path, output_path):
    """Calculates event statistics."""

    Path(output_path).parent.mkdir(exist_ok=True)

    events = pd.read_json(input_path)
    stats = events.groupby(["date", "user"]).size().reset_index()

    stats.to_csv(output_path, index=False)

Example #19
    'example_gcp_vision_annotate_image', default_args=default_args, schedule_interval=None
) as dag_annotate_image:
    # ############################## #
    # ### Annotate image example ### #
    # ############################## #

    # [START howto_operator_vision_annotate_image]
    annotate_image = CloudVisionImageAnnotateOperator(
        request=annotate_image_request, retry=Retry(maximum=10.0), timeout=5, task_id='annotate_image'
    )
    # [END howto_operator_vision_annotate_image]

    # [START howto_operator_vision_annotate_image_result]
    annotate_image_result = BashOperator(
        bash_command="echo {{ task_instance.xcom_pull('annotate_image')"
        "['logoAnnotations'][0]['description'] }}",
        task_id='annotate_image_result',
    )
    # [END howto_operator_vision_annotate_image_result]

    # [START howto_operator_vision_detect_text]
    detect_text = CloudVisionDetectTextOperator(
        image=DETECT_IMAGE,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id="detect_text",
        language_hints="en",
        web_detection_params={'include_geo_results': True},
    )
    # [END howto_operator_vision_detect_text]
Example #20
        schedule_interval='@once',
        tags=['example'],
) as dag:
    create_bucket1 = GCSCreateBucketOperator(task_id="create_bucket1",
                                             bucket_name=BUCKET_1,
                                             project_id=PROJECT_ID)

    create_bucket2 = GCSCreateBucketOperator(task_id="create_bucket2",
                                             bucket_name=BUCKET_2,
                                             project_id=PROJECT_ID)

    list_buckets = GCSListObjectsOperator(task_id="list_buckets",
                                          bucket=BUCKET_1)

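    # str() of the list_buckets.output XComArg renders as a Jinja xcom_pull expression, so the
    # f-string below is templated with the bucket listing at run time.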
    list_buckets_result = BashOperator(
        task_id="list_buckets_result",
        bash_command=f"echo {list_buckets.output}",
    )

    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=BUCKET_FILE_LOCATION,
        bucket=BUCKET_1,
    )

    transform_file = GCSFileTransformOperator(
        task_id="transform_file",
        source_bucket=BUCKET_1,
        source_object=BUCKET_FILE_LOCATION,
        transform_script=["python", PATH_TO_TRANSFORM_SCRIPT],
    )
Example #21
            "name": MODEL_NAME,
        },
    )
    # [END howto_operator_gcp_mlengine_create_model]

    # [START howto_operator_gcp_mlengine_get_model]
    get_model = MLEngineGetModelOperator(
        task_id="get-model",
        project_id=PROJECT_ID,
        model_name=MODEL_NAME,
    )
    # [END howto_operator_gcp_mlengine_get_model]

    # [START howto_operator_gcp_mlengine_print_model]
    get_model_result = BashOperator(
        bash_command="echo \"{{ task_instance.xcom_pull('get-model') }}\"",
        task_id="get-model-result",
    )
    # [END howto_operator_gcp_mlengine_print_model]

    # [START howto_operator_gcp_mlengine_create_version1]
    create_version = MLEngineCreateVersionOperator(
        task_id="create-version",
        project_id=PROJECT_ID,
        model_name=MODEL_NAME,
        version={
            "name": "v1",
            "description": "First-version",
            "deployment_uri": '{}/keras_export/'.format(JOB_DIR),
            "runtime_version": "1.15",
            "machineType": "mls1-c1-m2",
            "framework": "TENSORFLOW",
Example #22
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'schedule_interval': timedelta(1),
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('tutorial', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
rendered in the UI's Task Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

dag.doc_md = __doc__

t2 = BashOperator(task_id='sleep',
                  depends_on_past=False,
                  bash_command='sleep 5',
                  dag=dag)
Example #23
        labels={"foo": "bar"},
        name="airflow-private-image-pod",
        is_delete_operator_pod=True,
        in_cluster=True,
        task_id="task-two",
        get_logs=True,
    )
    # [END howto_operator_k8s_private_image]

    # [START howto_operator_k8s_write_xcom]
    write_xcom = KubernetesPodOperator(
        namespace='default',
        image='alpine',
        cmds=[
            "sh", "-c",
            "mkdir -p /airflow/xcom/;echo '[1,2,3,4]' > /airflow/xcom/return.json"
        ],
        name="write-xcom",
        do_xcom_push=True,
        is_delete_operator_pod=True,
        in_cluster=True,
        task_id="write-xcom",
        get_logs=True,
    )

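    # The pod wrote '[1,2,3,4]' to /airflow/xcom/return.json, so xcom_pull('write-xcom')[0] resolves to 1.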
    pod_task_xcom_result = BashOperator(
        bash_command="echo \"{{ task_instance.xcom_pull('write-xcom')[0] }}\"",
        task_id="pod_task_xcom_result",
    )
    # [END howto_operator_k8s_write_xcom]
Example #24
        con = sqlite3.connect(data_dir / "commit.db")
        with con:
            commits.to_sql(valid_json.stem[:-6] + "_commits", con, if_exists="replace")
            files_changed.to_sql(
                valid_json.stem[:-6] + "_files_changed", con, if_exists="replace"
            )


git_log_etl = DAG("git_log_etl", default_args={"start_date": "2021-01-01"})

clear_data_dir = BashOperator(
    task_id="clear_data_dir",
    bash_command="""
    cd {{ var.value.data_dir }}
    rm -rf *.json
    rm -rf *.csv
    rm -rf *.db
    """,
    dag=git_log_etl,
)

clear_repos_dir = BashOperator(
    task_id="clear_repos_dir",
    bash_command="""
    cd {{ var.value.repos_dir }}
    rm -rf *
    """,
    dag=git_log_etl,
)

git_clone = BashOperator(
Example #25
                                  'processing_tasks.training_model_c'
                              ])
    print(accuracies)


def _is_accurate():
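    """Return the string 'accurate' (e.g. for use as a branching decision)."""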
    return ('accurate')


with DAG('xcom_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    downloading_data = BashOperator(task_id='downloading_data',
                                    bash_command='sleep 3',
                                    do_xcom_push=False)

    with TaskGroup('processing_tasks') as processing_tasks:
        training_model_a = PythonOperator(task_id='training_model_a',
                                          python_callable=_training_model)

        training_model_b = PythonOperator(task_id='training_model_b',
                                          python_callable=_training_model)

        training_model_c = PythonOperator(task_id='training_model_c',
                                          python_callable=_training_model)

    choose_model = PythonOperator(task_id='task_4',
                                  python_callable=_choose_best_model)
Example #26
    tags=['lambda','imageprocessing'])

# arg = json.dumps(kwargs['dag_run'].conf
# print(arg)
face_detection = BranchPythonOperator(
    depends_on_past=False,
    task_id='face_detection',
    python_callable=face_detection,
    provide_context=True,
    dag=dag,
)

# [START howto_operator_bash]
photo_not_meet_requirement = BashOperator(
    task_id='photo_not_meet_requirement',
    bash_command='echo photo_not_meet_requirement',
    dag=dag,
)
check_duplicate = BranchPythonOperator(
    task_id='check_duplicate',
    python_callable=check_duplicate,
    provide_context=True,
    dag=dag,
)
duplicate_face = BashOperator(
    task_id='duplicate_face',
    bash_command='echo duplicate_face',
    dag=dag,
)
failure = BashOperator(
    task_id='failure',
Example #27
    to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D']
    yesterday = date.today() - timedelta(days=1)
    dt = yesterday.strftime("%Y-%m-%d")
    # define where you want to store the tweets csv file in your local directory
    local_dir = "/tmp/"
    # define the location where you want to store in HDFS
    hdfs_dir = " /tmp/"

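    # For each channel, stage yesterday's CSV in HDFS and then load it into the matching Hive partition.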
    for channel in to_channels:

        file_name = "to_" + channel + "_" + yesterday.strftime(
            "%Y-%m-%d") + ".csv"

        load_to_hdfs = BashOperator(
            task_id="put_" + channel + "_to_hdfs",
            bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
            local_dir + file_name + hdfs_dir + channel + "/",
        )

        load_to_hdfs << analyze_tweets

        load_to_hive = HiveOperator(
            task_id="load_" + channel + "_to_hive",
            hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name +
            "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        )
        load_to_hive << load_to_hdfs
        load_to_hive >> hive_to_mysql
Example #28
    upload_sheet_to_gcs = GoogleSheetsToGCSOperator(
        task_id="upload_sheet_to_gcs",
        destination_bucket=GCS_BUCKET,
        spreadsheet_id=SPREADSHEET_ID,
    )
    # [END upload_sheet_to_gcs]

    # [START create_spreadsheet]
    create_spreadsheet = GoogleSheetsCreateSpreadsheet(
        task_id="create_spreadsheet", spreadsheet=SPREADSHEET)
    # [END create_spreadsheet]

    # [START print_spreadsheet_url]
    print_spreadsheet_url = BashOperator(
        task_id="print_spreadsheet_url",
        bash_command=
        "echo {{ task_instance.xcom_pull('create_spreadsheet', key='spreadsheet_url') }}",
    )
    # [END print_spreadsheet_url]

    # [START upload_gcs_to_sheet]
    upload_gcs_to_sheet = GCStoGoogleSheets(
        task_id="upload_gcs_to_sheet",
        bucket_name=GCS_BUCKET,
        object_name="{{ task_instance.xcom_pull('upload_sheet_to_gcs')[0] }}",
        spreadsheet_id=NEW_SPREADSHEET_ID,
    )
    # [END upload_gcs_to_sheet]

    create_spreadsheet >> print_spreadsheet_url
    upload_sheet_to_gcs >> upload_gcs_to_sheet
Example #29
                firstname TEXT NOT NULL,
                lastname TEXT NOT NULL,
                country TEXT NOT NULL,
                username TEXT NOT NULL,
                password TEXT NOT NULL,
                email TEXT NOT NULL PRIMARY KEY
            );
            ''')

    is_api_available = HttpSensor(task_id='is_api_available',
                                  http_conn_id='user_api',
                                  endpoint='api/')

    extracting_users = SimpleHttpOperator(
        task_id='extracting_user',
        http_conn_id='user_api',
        endpoint='api/',
        method='GET',
        response_filter=lambda response: json.loads(response.text),
        log_response=True)

    processing_user = PythonOperator(task_id='processing_user',
                                     python_callable=_processing_user)

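    # Feed sqlite3 meta-commands via echo to import the processed CSV into the users table.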
    storing_user = BashOperator(
        task_id='storing_user',
        bash_command=
        'echo -e ".separator ","\n.import /tmp/processed_user.csv users" | sqlite3 /home/airflow/airflow/airflow.db'
    )

    creating_table >> is_api_available >> extracting_users >> processing_user >> storing_user
Example #30
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval=None)

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
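# set_downstream(run_this_last) is equivalent to run_this >> run_this_last.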
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)