Example #1
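# Fan-out snippet from a datacube/WOFS workflow: dag_utils.queryMapByTileByYear builds one
# "wofs" classification task per tile and year for the requested extent. _params,
# queue_utils and dag_utils are project-specific helpers defined elsewhere in the source file.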
_steps = {
    'wofs': {
        'algorithm': "wofs-wf",
        'version': '1.0',
        'queue': queue_utils.assign_queue(),
    },
}

args = {
    'owner': 'mp.mancipe10',
    'start_date': airflow.utils.dates.days_ago(2),
    'execID': "mp.mancipe10_wofs_paso_3_clasificacion_varios_anhos",
    'product': _params['products'][0]
}

dag = DAG(dag_id=args['execID'],
          default_args=args,
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=120))

wofs = dag_utils.queryMapByTileByYear(lat=_params['lat'],
                                      lon=_params['lon'],
                                      time_ranges=_params['time_ranges'],
                                      product=_params['products'][0],
                                      algorithm=_steps['wofs']['algorithm'],
                                      version=_steps['wofs']['version'],
                                      queue=_steps['wofs']['queue'],
                                      dag=dag,
                                      task_id="wofs")

wofs
Example #2
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

default_args = {
    "start_date": datetime(2020, 4, 15),
}

cleandata_dag = DAG("cleandata",
                    default_args=default_args,
                    schedule_interval="@daily")

# Modify the templated command to handle a
# second argument called filename.
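# {{ ds_nodash }} renders the execution date as YYYYMMDD, and {{ params.filename }} is
# filled from each operator's params dict when the command is templated.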
templated_command = """
  bash cleandata.sh {{ ds_nodash }} {{ params.filename }}
"""

# Modify clean_task to pass the new argument
clean_task = BashOperator(task_id="cleandata_task",
                          bash_command=templated_command,
                          params={
                              "filename": "salesdata.txt",
                          },
                          dag=cleandata_dag)

# Create a new BashOperator clean_task2
clean_task2 = BashOperator(task_id="cleandata_task2",
                           bash_command=templated_command,
                           params={
                               "filename": "supportdata.txt",
                           },
                           dag=cleandata_dag)
Example #3
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This is an example dag for using the Kubernetes Executor.
"""
import os

import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

args = {'owner': 'Airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id='example_kubernetes_executor',
          default_args=args,
          schedule_interval=None)
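
# Pod anti-affinity: keep the worker pod off any node (keyed on kubernetes.io/hostname)
# that already runs a pod labelled app=airflow.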

affinity = {
    'podAntiAffinity': {
        'requiredDuringSchedulingIgnoredDuringExecution': [{
            'topologyKey': 'kubernetes.io/hostname',
            'labelSelector': {
                'matchExpressions': [{
                    'key': 'app',
                    'operator': 'In',
                    'values': ['airflow']
                }]
            }
        }]
    }
}
Example #4
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.models import DAG
from datetime import timedelta

args = {"owner": "Aldo", "start_date": airflow.utils.dates.days_ago(7)}

dag = DAG(
    dag_id="example_bash_lunch_time",
    default_args=args,
    schedule_interval="30 12 * * *",
    dagrun_timeout=timedelta(minutes=60),
)
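
# With schedule_interval "30 12 * * *", the single task below runs every day at 12:30.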

echo = 'echo "It\'s lunch time!"'
echo_task = BashOperator(task_id="lunch_time", bash_command=echo, dag=dag)

if __name__ == "__main__":
    dag.cli()
Example #5
from datetime import datetime

import pendulum

from airflow.models import DAG, Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from airflow.operators.mysql_operator import MySqlOperator

email = Variable.get('email', deserialize_json=True)

local_tz = pendulum.timezone("America/Mexico_City")

cfg = '/usr/local/airflow/dags/templates/sql_scritps'
default_args = {
    'owner': 'sergio',
    'start_date': datetime(2021, 3, 1, tzinfo=local_tz)
}

with DAG('11-load-dimensions.py',
         default_args=default_args,
         schedule_interval=None,
         template_searchpath=cfg,
         catchup=False,
         is_paused_upon_creation=False) as dag:

    start = DummyOperator(task_id='start')
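
    # The .sql filenames below are resolved via template_searchpath (cfg) and rendered as
    # Jinja templates before MySqlOperator runs them against the cool_car connection.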

    truncate_tables = MySqlOperator(task_id='truncate_tables',
                                    mysql_conn_id='cool_car',
                                    sql='00-truncate_tables.sql',
                                    autocommit=True,
                                    dag=dag)

    load_branch = MySqlOperator(task_id='load_branch_office',
                                mysql_conn_id='cool_car',
                                sql='01-dim_branch_office.sql',
                                autocommit=True,
                                dag=dag)
Example #6
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash import BashOperator

DEFAULT_DATE = datetime(2019, 12, 1)

dag = DAG(dag_id='test_dag_under_subdir2',
          start_date=DEFAULT_DATE,
          schedule_interval=None)
task = BashOperator(task_id='task1',
                    bash_command='echo "test dag under sub directory subdir2"',
                    dag=dag)
Example #7
import time
import logging

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

logger = logging.getLogger(__name__)

args = {
    'start_date': days_ago(1),
    'owner': 'airflow',
}

dag = DAG(dag_id='common_target', default_args=args, schedule_interval=None)


def run_this_func(dag_run, **kwargs):
    timeout = dag_run.conf['timeout']
    logger.info("Chunk received: {}".format(timeout))
    time.sleep(timeout)


chunk_handler = PythonOperator(task_id='chunk_handler',
                               provide_context=True,
                               python_callable=run_this_func,
                               dag=dag)
Example #8
session = settings.Session()

default_args = {
    'owner': DAG_OWNER_NAME,
    'depends_on_past': False,
    'email': ALERT_EMAIL_ADDRESSES,
    'email_on_failure': True,
    'email_on_retry': False,
    'start_date': START_DATE,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(DAG_ID,
          default_args=default_args,
          schedule_interval=SCHEDULE_INTERVAL,
          start_date=START_DATE)
if hasattr(dag, 'doc_md'):
    dag.doc_md = __doc__
if hasattr(dag, 'catchup'):
    dag.catchup = False


def print_configuration_function(**context):
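    # Read an optional maxDBEntryAgeInDays override from the triggering dag_run's conf,
    # a pattern typical of metadata-database cleanup/maintenance DAGs.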
    logging.info("Loading Configurations...")
    dag_run_conf = context.get("dag_run").conf
    logging.info("dag_run.conf: " + str(dag_run_conf))
    max_db_entry_age_in_days = None
    if dag_run_conf:
        max_db_entry_age_in_days = dag_run_conf.get("maxDBEntryAgeInDays",
                                                    None)
Example #9
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(hours=1),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# The execution date as YYYY-MM-DD
date = "{{ ds }}"

dag = DAG('etl_daily',
          start_date=datetime(2016, 8, 4),
          schedule_interval="0 9 * * MON-FRI",
          default_args=default_args)
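
# Each PythonOperator below receives the Airflow context (provide_context=True) and
# launches its ingest job on the weekday-morning schedule defined above.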

t1 = PythonOperator(task_id='daily_futures_price_ingest',
                    python_callable=DailyFuturesPriceIngest.launch,
                    dag=dag,
                    provide_context=True)

t2 = PythonOperator(task_id='daily_generic_futures_price_ingest',
                    python_callable=DailyGenericFuturesPriceIngest.launch,
                    dag=dag,
                    provide_context=True)

t3 = PythonOperator(task_id='daily_equity_index_price_ingest',
                    python_callable=DailyEquityIndexPriceIngest.launch,
                    dag=dag,
                    provide_context=True)
Example #10
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os

from airflow.models import DAG
from airflow.providers.microsoft.azure.operators.wasb_delete_blob import WasbDeleteBlobOperator
from airflow.providers.microsoft.azure.transfers.local_to_wasb import LocalFilesystemToWasbOperator
from airflow.utils.dates import days_ago

PATH_TO_UPLOAD_FILE = os.environ.get('AZURE_PATH_TO_UPLOAD_FILE',
                                     'example-text.txt')

with DAG("example_local_to_wasb",
         schedule_interval="@once",
         start_date=days_ago(2)) as dag:
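    # Upload the local file to the mycontainer blob container, then delete the blob again.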
    upload = LocalFilesystemToWasbOperator(task_id="upload_file",
                                           file_path=PATH_TO_UPLOAD_FILE,
                                           container_name="mycontainer",
                                           blob_name='myblob')
    delete = WasbDeleteBlobOperator(task_id="delete_file",
                                    container_name="mycontainer",
                                    blob_name="myblob")
    upload >> delete
Example #11
# SEARCH_PATH = f'{AIRFLOW_HOME}/scripts/sql/'  # development
# RESOURCE_PATH = f'{AIRFLOW_HOME}/resources/'  # development
SEARCH_PATH = f'{AIRFLOW_HOME}/dags/efs/uw211dashboard/scripts/sql/'  # production
RESOURCE_PATH = f'{AIRFLOW_HOME}/dags/efs/uw211dashboard/resources/'  # production

args = {
    'owner': '211dashboard',
    'start_date': datetime(2020, 6, 1),
    'concurrency': 1,
    'retries': 0,
    'depends_on_past': False,
    'catchup': False
}

dag = DAG(dag_id='211dash_manual_update',
          schedule_interval='@once',
          template_searchpath=SEARCH_PATH,
          default_args=args)
''' Define manual update operators. '''
''' 1. Census data operators '''

truncate_core_census_tables = PostgresOperator(
    task_id='truncate_core_census_tables', sql='trnctTbls_census.sql', dag=dag)

transform_census_county_files = PythonOperator(
    task_id='transform_census_county_files',
    python_callable=transform_static_s3,
    op_kwargs={
        'data': 'census_county',
        'filename': 'census_data_by_county.csv',
        'resource_path': RESOURCE_PATH,
        'transformer': transform_census_data,
Example #12
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Example of the LatestOnlyOperator"""

import datetime as dt

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils.dates import days_ago

dag = DAG(dag_id='latest_only',
          schedule_interval=dt.timedelta(hours=4),
          start_date=days_ago(2),
          tags=['example'])

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
task1 = DummyOperator(task_id='task1', dag=dag)

latest_only >> task1
Example #13
import logging
from datetime import timedelta

import airflow.utils.dates
from airflow.models import DAG
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor

# init logger
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1),
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "email": "*****@*****.**",
    "retries": 1,
    "retry_delay": timedelta(minutes=1)
}
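
# Submit a SparkApplication manifest to the ddt-compute namespace, then wait for it to
# finish; the sensor pulls the generated application name from the submit task's XCom.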

with DAG(dag_id="ddt-spark-k8s-operator", schedule_interval="@hourly", default_args=default_args, catchup=False) as dag:
    t1 = SparkKubernetesOperator(
        task_id='stage_1_submit',
        namespace="ddt-compute",
        application_file="SparkApplication_stage_1.yaml",
        kubernetes_conn_id="kubernetes_default",
        do_xcom_push=True
    )
    t2 = SparkKubernetesSensor(
        task_id='stage_1_monitor',
        namespace="ddt-compute",
        application_name="{{ task_instance.xcom_pull(task_ids='stage_1_submit')['metadata']['name'] }}",
        kubernetes_conn_id="kubernetes_default",
    )
    t1 >> t2
Example #14
                                  python_callable=print_context)
        sd1_op_7 = PythonOperator(task_id='sd1_op_7',
                                  provide_context=True,
                                  python_callable=print_context)

        sd1_op_1 >> [sd1_op_2, sd1_op_3]
        sd1_op_6 >> sd1_op_2
        sd1_op_2 >> [sd1_op_4, sd1_op_5]
        sd1_op_5 >> sd1_op_7
        sd1_op_3 >> sd1_op_7

        return dag


with DAG('ShortCircuitPlayground',
         'A playground DAG',
         default_args=default_args) as dag:
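    # Note: the second positional argument to DAG ('A playground DAG') is the DAG's description.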
    op_1 = PythonOperator(task_id='op_1',
                          provide_context=True,
                          python_callable=print_context)
    op_2 = PythonOperator(task_id='op_2',
                          provide_context=True,
                          python_callable=print_context)
    op_3 = PythonOperator(task_id='op_3',
                          provide_context=True,
                          python_callable=print_context)
    op_4 = PythonOperator(task_id='op_4',
                          provide_context=True,
                          python_callable=print_context)
    op_5 = PythonOperator(task_id='op_5',
                          provide_context=True,
                          python_callable=print_context)
Example #15
from datetime import datetime

from airflow.models import DAG

try:
    from airflow.operators.empty import EmptyOperator
except ModuleNotFoundError:
    from airflow.operators.dummy import DummyOperator as EmptyOperator  # type: ignore
from airflow.providers.dbt.cloud.operators.dbt import (
    DbtCloudGetJobRunArtifactOperator,
    DbtCloudRunJobOperator,
)
from airflow.providers.dbt.cloud.sensors.dbt import DbtCloudJobRunSensor
from airflow.utils.edgemodifier import Label
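
# Triggers dbt Cloud job 48617 via the "dbt" connection, polling every 10 seconds with a
# 300-second timeout; the account_id is supplied once through default_args.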

with DAG(
    dag_id="example_dbt_cloud",
    default_args={"dbt_cloud_conn_id": "dbt", "account_id": 39151},
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    begin = EmptyOperator(task_id="begin")
    end = EmptyOperator(task_id="end")

    # [START howto_operator_dbt_cloud_run_job]
    trigger_job_run1 = DbtCloudRunJobOperator(
        task_id="trigger_job_run1",
        job_id=48617,
        check_interval=10,
        timeout=300,
    )
    # [END howto_operator_dbt_cloud_run_job]
Example #16
import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

dag = DAG(
    dag_id="chapter12_task_sla",
    default_args={"email": "*****@*****.**"},
    schedule_interval=datetime.timedelta(hours=12),
    start_date=datetime.datetime(2020, 4, 1),
)
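
# The task below sleeps for about 5 seconds but declares a 1-second SLA, so every run
# records an SLA miss (and, with email configured, notifies the address in default_args).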

sleeptask = BashOperator(
    task_id="sleeptask",
    bash_command="sleep 5",
    sla=datetime.timedelta(seconds=1),
    dag=dag,
)
Example #17
from airflow import utils
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator

dag = DAG(
    dag_id='pattern_parallel_split',
    default_args={
        'start_date': utils.dates.days_ago(1),
    },
    schedule_interval=None,
)
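
# Parallel-split pattern: after the generate_data task the flow fans out into two
# independent conversion tasks that can run concurrently.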

with dag:
    read_input = DummyOperator(task_id='read_input')

    aggregate_data = DummyOperator(task_id='generate_data')

    convert_to_parquet = DummyOperator(task_id='convert_to_parquet')

    convert_to_avro = DummyOperator(task_id='convert_to_avro')

    read_input >> aggregate_data >> [convert_to_parquet, convert_to_avro]
Example #18
from datetime import datetime, timedelta
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
import logging

dag = DAG(
    dag_id='simple_xcom',
    start_date=datetime(2017, 10, 26),
    schedule_interval=None,
)
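
# push_function stores a message in XCom under the key 'the_message';
# pull_function retrieves it later with xcom_pull.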


def push_function(**context):
    msg = 'the_message'
    logging.info("message to push: '%s'" % msg)
    print("message to push: '%s'" % msg)
    task_instance = context['task_instance']
    task_instance.xcom_push(key='the_message', value=msg)


push_task = PythonOperator(
    task_id='push_task',
    python_callable=push_function,
    provide_context=True,
    dag=dag,
)


def pull_function(**kwargs):
    ti = kwargs['ti']
    msg = ti.xcom_pull(task_ids='push_task', key='the_message')
Example #19
import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator

from airflow.hooks.base_hook import BaseHook

args = {
    'owner': 'Airflow',
    #'start_date': airflow.utils.dates.days_ago(2),
    'start_date': datetime.datetime(2019, 11, 1),
}

dag = DAG(
    dag_id='exercise_hooks',
    default_args=args,
    schedule_interval="0 0 * * *",
    dagrun_timeout=datetime.timedelta(minutes=60),
)

get_data = PostgresToGoogleCloudStorageOperator(
    postgres_conn_id="test_connection",
    bucket='test_bucket312312',
    filename="land_registry_price_paid_uk/{{ ds_nodash }}/test_{}.csv",
    sql=
    "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    task_id='get_data',
    dag=dag,
)

get_data
Example #20

def delayed_fail():
    """
    Delayed failure to make sure that processes are running before the error
    is raised.

    TODO handle more directly (without sleeping)
    """
    time.sleep(5)
    raise ValueError('Expected failure.')


# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_backfill_pooled_task_dag', default_args=default_args)
dag1_task1 = DummyOperator(
    task_id='test_backfill_pooled_task',
    dag=dag1,
    pool='test_backfill_pooled_task_pool',
)

# DAG tests depends_on_past dependencies
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
)

# DAG tests that a Dag run that doesn't complete is marked failed
Example #21
def _print_exec_date(execution_date, **context):
    print(execution_date)


def _get_weekday(execution_date, **context):
    return execution_date.strftime("%a")


args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

with DAG(dag_id='dag4_postgres_hook',
         default_args=args,
         schedule_interval=None,
         dagrun_timeout=timedelta(minutes=60)) as dag:

    print_data = PythonOperator(
        task_id='print_data',
        python_callable=_get_data,
    )

    # filename='gdd_data{}_{rundate}.csv'.format(rundate=rundate)
    print(filename)
    copy_data_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id='copy_data_to_gcs',
        sql=sql,
        bucket='gdd_bucket',
        filename='gdd_data{}_/{{ ds }}.json',
        postgres_conn_id='postgres_cursus_db',
Example #22
    def _get_task(self, **kwargs):
        return BaseOperator(task_id='test_task', dag=DAG('test_dag'), **kwargs)
Example #23
import os

from libs import print_stuff

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2)
}

with DAG(
    dag_id='example_kubernetes_executor_config',
    default_args=default_args,
    schedule_interval=None,
    tags=['example'],
) as dag:

    def test_volume_mount():
        """
        Tests whether the volume has been mounted.
        """
        with open('/foo/volume_mount_test.txt', 'w') as foo:
            foo.write('Hello')

        return_code = os.system("cat /foo/volume_mount_test.txt")
        if return_code != 0:
            raise ValueError(f"Error when checking volume mount. Return code {return_code}")

    # You can use annotations on your kubernetes pods!
Example #24
            except:
                print('No report for {}'.format(s['name']))
                pass

    publish_mattermost = MattermostOperator(
        task_id="publish_result",
        mattermost_endpoint=MATTERMOST_ENDPOINT,
        text=message)
    publish_mattermost.execute(dict())


with DAG(
        dag_id=DAG_NAME,
        schedule_interval='0 5 * * *',
        start_date=days_ago(1),
        dagrun_timeout=timedelta(minutes=120),
        tags=['schemas', 'irve', 'consolidation', 'datagouv'],
        default_args=default_args,
) as dag:

    clean_previous_outputs = CleanFolderOperator(
        task_id="clean_previous_outputs", folder_path=TMP_FOLDER + DAG_FOLDER)

    tmp_folder = TMP_FOLDER + DAG_FOLDER + '{{ ds }}' + "/"

    shared_notebooks_params = {
        "msgs": "Ran from Airflow " + '{{ ds }}' + "!",
        "WORKING_DIR": AIRFLOW_DAG_HOME + DAG_FOLDER + 'notebooks/',
        "TMP_FOLDER": tmp_folder,
        "API_KEY": API_KEY,
        "API_URL": API_URL,
Example #25
# See the License for the specific language governing permissions and
# limitations under the License.

import airflow
import random
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

dag = DAG(
    dag_id='example_branch_operator',
    default_args=args,
    schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']
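
# BranchPythonOperator returns the task_id of the branch to follow; here one of the four
# options above is picked at random on every run.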

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
Example #26
import datetime as dt

import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator

dag = DAG(
    dag_id='latest_only',
    schedule_interval=dt.timedelta(hours=4),
    start_date=airflow.utils.dates.days_ago(2),
)

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
task1 = DummyOperator(task_id='task1', dag=dag)

latest_only >> task1
Example #27
###############################################################################

# Your own app's token (LINE channel access token)
token = 'yPz5v7f3XGdcsTCyme2hKXbu58fKDgEriFNPSo/NcMNoZPWVZEwYIOlQ2jqNQeXF080NRsgf/jzbYI/VjlJTl2H1Xc9ZXN7wBHLJH82E6uJsab+TuUAaT2G4TZtH5T+uWycR5QSotn6TQiy/ykra4wdB04t89/1O/w1cDnyilFU='
# Your own LINE user ID
ID = 'U6c8f2685a2918d7afbd819b12c15a848'

########### All of the DAG's default arguments go in here ###########
args = {
    'owner': 'cheating',  # owner of this DAG
    'start_date': airflow.utils.dates.days_ago(0)  # when enabled, start this many days back
}

########### DAG definition ###########
dag = DAG(
    dag_id='Stock',  # name of the DAG
    default_args=args,  # plug in the arguments defined above
    schedule_interval='10 * * * * *')  # how often to run


########### Check the current price ###########
def look_price(stock='3624', bs='>', price=31):
    # First scrape this stock's data from Yahoo
    url = 'https://tw.stock.yahoo.com/q/q?s=' + stock
    list_req = requests.get(url)
    soup = BeautifulSoup(list_req.content, "html.parser")
    getstock = soup.find('b').text
    # Compare the price against the threshold
    if bs == '<':  # greater-than or less-than check
        if float(getstock) < price:
            get = stock + '的價格:' + getstock
            line_bot_api = LineBotApi(token)
Example #28
    def setUp(self):
        self.dagbag = models.DagBag(dag_folder='/dev/null',
                                    include_examples=True)
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG(TEST_DAG_ID, default_args=self.args)
Example #29
def init_dims_sub_dag(parent_dag_name, child_dag_name, start_date,
                      redshift_conn_id):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              start_date=start_date)
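
    # Drop and recreate each dimension table; every drop task is wired to its matching
    # create task at the end of this sub-DAG factory.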

    drop_dim_vehicles_task = PostgresOperator(
        task_id='drop_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLES)

    drop_dim_vehicle_models_task = PostgresOperator(
        task_id='drop_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLE_MODELS)

    drop_dim_rental_zones_task = PostgresOperator(
        task_id='drop_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_RENTAL_ZONES)

    drop_dim_companies_task = PostgresOperator(
        task_id='drop_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_COMPANIES)

    drop_dim_categories_task = PostgresOperator(
        task_id='drop_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_CATEGORIES)

    drop_dim_date_task = PostgresOperator(
        task_id='drop_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_DATE)

    drop_dim_weather_task = PostgresOperator(
        task_id='drop_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_WEATHER)

    create_dim_vehicles_task = PostgresOperator(
        task_id='create_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLES)

    create_dim_vehicle_models_task = PostgresOperator(
        task_id='create_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLE_MODELS)

    create_dim_rental_zones_task = PostgresOperator(
        task_id='create_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_RENTAL_ZONES)

    create_dim_companies_task = PostgresOperator(
        task_id='create_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_COMPANIES)

    create_dim_categories_task = PostgresOperator(
        task_id='create_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_CATEGORIES)

    create_dim_date_task = PostgresOperator(
        task_id='create_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_DATE)

    create_dim_weather_task = PostgresOperator(
        task_id='create_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_WEATHER)

    drop_dim_vehicles_task >> create_dim_vehicles_task
    drop_dim_vehicle_models_task >> create_dim_vehicle_models_task
    drop_dim_rental_zones_task >> create_dim_rental_zones_task
    drop_dim_companies_task >> create_dim_companies_task
    drop_dim_categories_task >> create_dim_categories_task
    drop_dim_date_task >> create_dim_date_task
    drop_dim_weather_task >> create_dim_weather_task

    return dag
Example #30
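# This snippet starts inside the (truncated) execute_notebook callable: it runs the
# GetTweets notebook with papermill, writing the executed copy to /data/notebook-runs
# and passing the execution date in as the 'dt' parameter.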
    pm.execute_notebook(kwargs['notebook'],
                        '/data/notebook-runs/GetTweets-output.ipynb',
                        parameters={'dt': ds},
                        kernel_name='spylon-kernel',
                        progress_bar=False,
                        report_mode=True,
                        start_timeout=60)


dag = DAG('get_tweets',
          default_args={
              'owner': 'data-engineering',
              'depends_on_past': False,
              'email_on_failure': False,
              'email_on_retry': False,
              'retries': 1,
              'retry_delay': timedelta(minutes=5)
          },
          catchup=False,
          start_date=datetime(2020, 10, 25, 22, 0, 0),
          schedule_interval=None,
          max_active_runs=1)

with dag:
    load_tweets = PythonOperator(
        task_id='load_tweets',
        provide_context=True,
        python_callable=execute_notebook,
        op_kwargs={'notebook': '/notebooks/GetTweets.ipynb'},
        dag=dag,
    )