def test_simple(self):
     task = FileSensor(
         task_id="test",
         filepath="etc/hosts",
         fs_conn_id='fs_default',
         _hook=self.hook,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
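These FileSensor test methods rely on self.hook, self.dag and DEFAULT_DATE being defined elsewhere in the test class; a minimal sketch of that scaffolding, assuming the stock fs_default connection exists (class name, dag id and date are illustrative):

import unittest
from datetime import datetime

from airflow import DAG
from airflow.contrib.hooks.fs_hook import FSHook

DEFAULT_DATE = datetime(2015, 1, 1)


class TestFileSensor(unittest.TestCase):
    def setUp(self):
        # hook for the stock 'fs_default' connection (base path '/')
        self.hook = FSHook()
        # throwaway DAG the sensor tasks attach to
        self.dag = DAG('filesensor_test_dag',
                       default_args={'owner': 'airflow',
                                     'start_date': DEFAULT_DATE})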
 def test_default_fs_conn_id(self):
     with tempfile.NamedTemporaryFile() as tmp:
         task = FileSensor(
             task_id="test",
             filepath=tmp.name[1:],
             dag=self.dag,
             timeout=0,
         )
         task._hook = self.hook
         task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                  ignore_ti_state=True)
 def test_wildcard_file(self):
     suffix = '.txt'
     with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
         fileglob = os.path.join(os.path.dirname(tmp.name), '*' + suffix)
         task = FileSensor(
             task_id='test',
             filepath=fileglob,
             fs_conn_id='fs_default',
             dag=self.dag,
             timeout=0,
         )
         task._hook = self.hook
         task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                  ignore_ti_state=True)
    def test_subdirectory_empty(self):
        dir_ = tempfile.mkdtemp()
        tempfile.mkdtemp(dir=dir_)
        task = FileSensor(task_id='test',
                          filepath=dir_,
                          fs_conn_id='fs_default',
                          dag=self.dag,
                          timeout=0,
                          poke_interval=1)
        task._hook = self.hook

        with self.assertRaises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE,
                     end_date=DEFAULT_DATE,
                     ignore_ti_state=True)
        shutil.rmtree(dir_)
 def test_empty_dir(self):
     dir = tempfile.mkdtemp()
     task = FileSensor(task_id="test",
                       filepath=dir[1:],
                       fs_conn_id='fs_default',
                       dag=self.dag,
                       timeout=0,
                       poke_interval=1)
     task._hook = self.hook
     try:
         with self.assertRaises(AirflowSensorTimeout):
             task.run(start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      ignore_ti_state=True)
     finally:
         shutil.rmtree(dir)
 def test_empty_dir(self):
     dir = tempfile.mkdtemp()
     task = FileSensor(
         task_id="test",
         filepath=dir[1:],
         fs_conn_id='fs_default',
         dag=self.dag,
         timeout=0,
     )
     task._hook = self.hook
     try:
         with self.assertRaises(AirflowSensorTimeout):
             task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                      ignore_ti_state=True)
     finally:
         shutil.rmtree(dir)
    def test_subdirectory_not_empty(self):
        suffix = '.txt'
        dir_ = tempfile.mkdtemp()
        subdir = tempfile.mkdtemp(dir=dir_)

        with tempfile.NamedTemporaryFile(suffix=suffix, dir=subdir):
            task = FileSensor(
                task_id='test',
                filepath=dir_,
                fs_conn_id='fs_default',
                dag=self.dag,
                timeout=0,
            )
            task._hook = self.hook
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                     ignore_ti_state=True)
        shutil.rmtree(dir_)
Example #10
 def test_file_in_dir(self):
     dir = tempfile.mkdtemp()
     task = FileSensor(
         task_id="test",
         filepath=dir[1:],
         fs_conn_id='fs_default',
         dag=self.dag,
         timeout=0,
     )
     task._hook = self.hook
     try:
         # `touch` the dir
         open(dir + "/file", "a").close()
         task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                  ignore_ti_state=True)
     finally:
         shutil.rmtree(dir)
Example #12
def validate_task(validators, dag):
    """Validates that all files added in the
    validator exists.

    arguments:
    validators -- list of diactionaries containing both the name of the Sensor
    and the file path
    dag -- Dag where the tasks are added

    returns:
        dummy operator used as join of all sensor tasks"""
    join = DummyOperator(task_id='join_operator', dag=dag)
    for validator in validators:
        sensor = FileSensor(task_id=validator['name'],
                            fs_conn_id='fs_default',
                            filepath=validator['path'],
                            dag=dag)
        sensor >> join
    return join
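A hedged usage sketch for validate_task (the DAG id, sensor names and file paths below are made up for illustration):

from datetime import datetime

from airflow import DAG

example_dag = DAG('validation_example', start_date=datetime(2020, 1, 1))

# one FileSensor per entry, all feeding into the returned join operator
join_task = validate_task(
    validators=[
        {'name': 'wait_for_orders', 'path': 'incoming/orders.csv'},
        {'name': 'wait_for_customers', 'path': 'incoming/customers.csv'},
    ],
    dag=example_dag,
)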
Example #13
def build_process_result_sub_dag(main_dag, default_args):
    s_dag = DAG(
        dag_id="{}.{}".format(main_dag, 'process_result_sub_dag'),
        default_args=default_args,
        schedule_interval='@hourly'
    )
    with s_dag:
        external_dag_sensor = ExternalTaskSensor(
            task_id='external_dag_sensor',
            external_dag_id=dagToCall,
            external_task_id=None,
            execution_date_fn=get_external_dag_execution_date,
            check_existence=True,
            poke_interval=5,
            timeout=120,
            soft_fail=True
        )
        ex_file_sensor = FileSensor(
            task_id="ex_file_sensor",
            filepath=ex_file
        )
        print_external_dag_result = PythonOperator(
            task_id="print_external_dag_result",
            python_callable=_print_external_dag_result,
            provide_context=True
        )
        remove_trigger_file = BashOperator(
            task_id="remove_trigger_file",
            bash_command="rm -f {}".format(path)
        )
        create_finished_file = BashOperator(
            task_id="create_finished_file",
            bash_command="touch " + default_path + "/finished_#{{ ts_nodash }}"
        )

    ex_file_sensor >> external_dag_sensor >> print_external_dag_result >> remove_trigger_file >> create_finished_file
    return s_dag
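A hedged sketch of how this builder might be attached to a parent DAG (parent_dag and default_args are assumed to be defined by the caller):

from airflow.operators.subdag_operator import SubDagOperator

# task_id must match the suffix used inside build_process_result_sub_dag;
# parent_dag and default_args are assumptions, not part of the original snippet
process_result = SubDagOperator(
    task_id='process_result_sub_dag',
    subdag=build_process_result_sub_dag(parent_dag.dag_id, default_args),
    dag=parent_dag,
)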
Example #14
  'retries': 2,
}

with DAG(
    # Define DAG id
    'file_sensor',
    default_args=default_args,
    description='check if a file exists inside a dir',
    tags=['explore-airflow', 'sensor'],
    # To enable/disable backfilling, set the catchup property
    catchup=False,
    # schedule interval every 6 minutes
    schedule_interval='*/6 * * * *'
) as dag:

    file_sensor = FileSensor(
        task_id='file_sensor_task',
        # the file path has been set in admin -> connection, so just need to specify the file name here
        filepath='todo.json',
        # set the fs_conn_id (admin -> connections)
        fs_conn_id='my_file_system',
        # by default mode is set to 'poke', i.e. the sensor checks repeatedly
        mode='poke',
        # wait 300 seconds between checks
        poke_interval=300,
        dag=dag
    )

    last_task = DummyOperator(task_id='last_task')

    file_sensor >> last_task
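The 'my_file_system' connection referenced above has to exist before the sensor can run; a hedged sketch of registering it programmatically (the base path is illustrative; FSHook reads it from the connection's extra field):

import json

from airflow import settings
from airflow.models import Connection

# register the filesystem connection the FileSensor's fs_conn_id points at
session = settings.Session()
session.add(Connection(conn_id='my_file_system',
                       conn_type='fs',
                       extra=json.dumps({'path': '/opt/airflow/data'})))
session.commit()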
    'retry_delay': timedelta(minutes=1)
}

yesterday = date.today() - timedelta(days=1)
dt = yesterday.strftime("%Y-%m-%d")

with DAG(DAG_ID,
         default_args=DAG_DEFAULT_ARGS,
         schedule_interval=DAG_SCHEDULE_INTERVAL) as dag:

    # Initialise a FileSensor to watch for a new incoming file.
    # task_id must be unique
    # poke_interval gives the time in seconds the sensor waits between tries
    waiting_file_task = FileSensor(
        task_id='waiting_file_task',
        fs_conn_id='fs_default',
        filepath='/home/airflow/airflow_files/data.csv',
        poke_interval=15,
        dag=dag)

    # Initialise a PythonOperator to execute the fetching_tweet.py script
    fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task',
                                         python_callable=fetching_tweet.main,
                                         dag=dag)

    # Initialise another PythonOperator to execute the cleaning_tweet.py script
    cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task',
                                         python_callable=cleaning_tweet.main,
                                         dag=dag)

    # Initialise a BashOperator to upload the file into HDFS
    filename = 'data_cleaned.csv'
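The snippet is cut off before the BashOperator announced in the comment above; a hedged sketch modeled on the sibling examples further down (the /tmp source path is an assumption):

from airflow.operators.bash_operator import BashOperator

# upload the cleaned file into HDFS; paths are assumptions based on the other examples
loading_into_hdfs_task = BashOperator(
    task_id='loading_into_hdfs_task',
    bash_command='hadoop fs -put -f /tmp/{} /tmp/'.format(filename),
    dag=dag)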
Example #16
               ]

dag = DAG('confirmed_covid_dag', description='Timeseries covid19',
         default_args={
            'owner':'josef.perez',
            'depends_on_past': False,
            'max_active_runs': 1,
            'start_date': days_ago(1)
         },
         schedule_interval='0 1 * * *',
         catchup=False)

file_sensor_task = FileSensor(dag=dag,
                              task_id="file_sensor",
                              fs_conn_id=FILE_CONNECTION_ID,
                              filepath=FILE_NAME,
                              poke_interval=10,
                              timeout=300
                             )

def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df_original = pd.read_csv(file_path)
    df_processed = df_original.melt(id_vars=KEPT_COLUMNS,
                                    var_name="Date",
                                    value_name="Accumulated"
                                   )
    df_processed.columns = COLUMNS
    df_processed["event_date"] = pd.to_datetime(df_processed["event_date"])
    'owner' : 'avinash',
    'start_date': days_ago(1)
}

dag = DAG(dag_id='my_sample_dag',default_args=args,schedule_interval=None)

def print_file_content(**context):
    hook = FSHook('my_file_system')
    path = os.path.join(hook.get_path(), 'test.txt')
    with open(path, 'r') as fp:
        print(fp.read())
    os.remove(path)


with dag:
    sensing_task = FileSensor(
        task_id='sensing_task',
        filepath='test.txt',
        fs_conn_id='my_file_system',
        poke_interval=10        
    )

    read_file_content_task = PythonOperator(
        task_id='read_file_content_task_id',
        python_callable=print_file_content,
        provide_context=True,
        retries=10,
        retry_delay=timedelta(seconds=1)
    )

sensing_task >> read_file_content_task
Example #18
from airflow.operators.hive_operator import HiveOperator
from datetime import datetime

import fetching_tweet
import cleaning_tweet

default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"}

with DAG(dag_id="twitter_dag",
         schedule_interval="@daily",
         default_args=default_args,
         catchup=False) as dag:

    # checking if tweets are available - FileSensor
    waiting_for_tweets = FileSensor(task_id="waiting_for_tweets",
                                    fs_conn_id="fs_tweet",
                                    filepath="data.csv",
                                    poke_interval=5)

    # fetching tweets
    fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                     python_callable=fetching_tweet.main)

    # cleaning tweets
    cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                     python_callable=cleaning_tweet.main)

    # storing tweets into hdfs
    storing_tweets = BashOperator(
        task_id="storing_tweets",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='my_dag',
    default_args=default_args,
    schedule_interval=None,
)

today = datetime.today().strftime("%m-%d-%Y")
input_img = f"exercise-dataset/daily/{today}/image.jpg"
preproc_img = f"exercise-dataset/daily/{today}/preprocessed.jpg"
prediction = f"exercise-dataset/daily/{today}/result.json"

file_sensor = FileSensor(task_id="wait_data_exists",
                         filepath=input_img,
                         dag=dag)


def preprocess_img():
    img = cv2.imread(input_img, cv2.IMREAD_COLOR)
    resized = cv2.resize(img, (100, 100))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # write the preprocessed image to the path the downstream tasks expect
    cv2.imwrite(preproc_img, gray)


preprocess = PythonOperator(
    task_id='preprocess',
    python_callable=preprocess_img,
    dag=dag,
)
         schedule_interval="@daily",
         default_args=default_args,
         catchup=False) as dag:

    is_forex_rates_available = HttpSensor(
        task_id="is_forex_rates_available",
        method="GET",
        http_conn_id="forex_api",
        endpoint="latest",
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20)

    is_forex_currencies_file_available = FileSensor(
        task_id="is_forex_currencies_file_available",
        fs_conn_id="forex_path",
        filepath="forex_currencies.csv",
        poke_interval=5,
        timeout=20)

    downloading_rates = PythonOperator(task_id="downloading_rates",
                                       python_callable=download_rates)

    saving_rates = BashOperator(task_id="saving_rates",
                                bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
        """)

    creating_forex_rates_table = HiveOperator(
        task_id="creating_forex_rates_table",
        hive_cli_conn_id="hive_conn",
def is_monday(*args, **context):
    execution_date = context['execution_date']
    weekday = execution_date.in_timezone("Europe/London").weekday()
    return 'create_report' if weekday == 0 else 'none'


with DAG(dag_id="invoices_dag",
         schedule_interval="@daily",
         default_args=default_args,
         template_searchpath=[f"{os.environ['AIRFLOW_HOME']}"],
         catchup=False) as dag:
    # This file could come in S3 from our ecommerce application
    is_new_data_available = FileSensor(task_id="is_new_data_available",
                                       fs_conn_id="data_path",
                                       filepath="data.csv",
                                       poke_interval=5,
                                       timeout=20)

    notify_file_failed = SlackWebhookOperator(
        task_id='notify_file_failed',
        http_conn_id='slack_conn',
        webhook_token=slack_token,
        trigger_rule='all_failed',
        message="Error Notification \n"
        "Data was missing! \n "
        "https://www.youtube.com/watch?v=ZDEVut4j7eU",
        username='******',
        icon_url='https://raw.githubusercontent.com/apache/'
        'airflow/master/airflow/www/static/pin_100.png',
        dag=dag)
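The snippet defines is_monday but stops before the branch is wired up; a hedged sketch of one way to attach it (the branch task's id and the 'create_report'/'none' dummies are assumptions consistent with the return values above):

from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator

monday_branch = BranchPythonOperator(task_id='monday_branch',
                                     python_callable=is_monday,
                                     provide_context=True,
                                     dag=dag)
create_report = DummyOperator(task_id='create_report', dag=dag)
none = DummyOperator(task_id='none', dag=dag)

# only build the weekly report on Mondays; otherwise fall through to 'none'
is_new_data_available >> monday_branch >> [create_report, none]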
Example #22
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

word_count_dag = DAG(
    'word-counter',
    default_args=default_args,
    description='Count the frequency of occurrence of a word in a file',
    schedule_interval=timedelta(days=1),
)

local_file_sensor = FileSensor(
    task_id='check_for_local_file',
    filepath='/usr/local/airflow/data/sample.txt',
    dag=word_count_dag,
    timeout=30,
    poke_interval=10,
)

s3_file_sensor = S3KeySensor(
    task_id='check_for_s3_file',
    aws_conn_id='my_conn_S3',
    bucket_name='calculator-api',
    bucket_key='twitter-raw/eia-prod/input.txt',
    wildcard_match=True,
    timeout=30,
    poke_interval=10,
    dag=word_count_dag,
    trigger_rule=TriggerRule.ALL_FAILED,
)
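The counting step itself is not part of the snippet; a hedged sketch of a downstream task that runs once either sensor has found its file (the counting logic and the fallback edge are assumptions):

from airflow.operators.python_operator import PythonOperator
from airflow.utils.trigger_rule import TriggerRule


def count_word_occurrences():
    # placeholder: count how often a word occurs in the local file
    with open('/usr/local/airflow/data/sample.txt') as handle:
        return handle.read().lower().count('airflow')


count_words = PythonOperator(
    task_id='count_words',
    python_callable=count_word_occurrences,
    # fire as soon as one of the two sensors succeeds
    trigger_rule=TriggerRule.ONE_SUCCESS,
    dag=word_count_dag,
)

# the S3 check acts as a fallback, hence its ALL_FAILED trigger rule above
local_file_sensor >> s3_file_sensor
[local_file_sensor, s3_file_sensor] >> count_words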
    #logger.info(f"Rows inserted {len(df.index)}")


dag = DAG('confirmed',
          description='Load COVID confirmed cases',
          default_args={
              'owner': 'grupo.dos',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(1)
          },
          schedule_interval='0 1 * * *',
          catchup=True)

file_sensor_task = FileSensor(
    dag=dag,
    task_id="readfile_sensor",
    fs_conn_id=FILE_CONNECTION_ID,
    filepath=FILE_NAME,
    poke_interval=10,
    timeout=300,
    #provide_context = True
)

etl_operator = PythonOperator(dag=dag,
                              task_id="etl_confirmed",
                              python_callable=etl_process,
                              provide_context=True)

file_sensor_task >> etl_operator
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from datetime import date, timedelta, datetime

import fetching_tweet
import cleaning_tweet

DAG_DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('first_data_pipeline', start_date=datetime(2019, 2, 14), schedule_interval='@daily', default_args=DAG_DEFAULT_ARGS, catchup=False) as dag:
    # Create tasks 
    waiting_file_task = FileSensor(task_id='waiting_file_task', fs_conn_id='fs_default', filepath='./data.csv', poke_interval=5)
    fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task', python_callable=fetching_tweet.main)
    cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task', python_callable=cleaning_tweet.main)
    loading_into_hdfs_task = BashOperator(task_id='loading_into_hdfs_task', bash_command='hadoop fs -put -f /tmp/data_cleaned.csv /tmp/')
    transfer_into_hive_task = HiveOperator(task_id='transfer_into_hive_task', hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets PARTITION(dt='2018-10-01')")
    
    # Connect tasks into DAG
    waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> loading_into_hdfs_task >> transfer_into_hive_task
Example #25
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from dags.process import process_data
from datetime import datetime, timedelta

# Update the default arguments and apply them to the DAG.

default_args = {
    'start_date': datetime(2019, 1, 1),
    'sla': timedelta(minutes=90)
}

dag = DAG(dag_id='etl_update', default_args=default_args)

sensor = FileSensor(task_id='sense_file',
                    filepath='/home/repl/workspace/startprocess.txt',
                    poke_interval=45,
                    dag=dag)

bash_task = BashOperator(task_id='cleanup_tempfiles',
                         bash_command='rm -f /home/repl/*.tmp',
                         dag=dag)

python_task = PythonOperator(task_id='run_processing',
                             python_callable=process_data,
                             provide_context=True,
                             dag=dag)

email_subject = """
  Email report for {{ params.department }} on {{ ds_nodash }}
"""
Example #26

dag = DAG('mainDAG',
          description="Dag to Ingest CSV's",
          default_args={
              'owner': 'MaiBoris',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(5)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

sensor1 = FileSensor(task_id="file_sensor_deaths",
                     dag=dag,
                     filepath='deaths.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
                     timeout=600)

sensor2 = FileSensor(task_id="file_sensor_confirmed",
                     dag=dag,
                     filepath='confirmed.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
                     timeout=600)

sensor3 = FileSensor(task_id="file_sensor_recovered",
                     dag=dag,
                     filepath='recovered.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
Example #27
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor

with DAG(dag_id="file_sensor_consume_new_data",
         start_date=datetime(2020, 12, 1),
         schedule_interval="0 * * * *") as dag:

    # task 1
    get_new_data = FileSensor(
        task_id="get_new_data",
        filepath="../shop123/{{ ds_nodash }}/${hour}/data.json")

    # task 2
    parse_file = DummyOperator(task_id="parse_file")

    # task 3
    check_is_it_ne_customer = DummyOperator(task_id="check_is_it_ne_customer")

    # task 4
    create_new_customer = DummyOperator(task_id="create_new_customer")

    # task 5
    update_existed_customer = DummyOperator(task_id="update_existed_customer")

    # task 6
    get_new_data >> parse_file >> check_is_it_ne_customer >> [
        create_new_customer, update_existed_customer
    ]
Example #28
    # Project start
    starting = BashOperator(task_id='starting',
                            bash_command='echo Start no projeto viagens 2020')

    # create a directory to store the data
    create_folder_viagens = PythonOperator(
        task_id='create_folder_viagens',
        python_callable=_create_folder_viagens)
    # Download the data
    download_file = PythonOperator(task_id='download_file',
                                   python_callable=_download_file)
    # Sensor to watch the .csv files
    sensor_file_csv = FileSensor(task_id='sensor_file_csv',
                                 fs_conn_id='viagens_path',
                                 filepath="*.csv",
                                 poke_interval=5,
                                 timeout=20)

    # Saving the files to HDFS
    saving_hdfs = BashOperator(
        task_id='saving_hdfs',
        bash_command=""" hdfs dfs -mkdir -p /input/viagens && \
            hdfs dfs -put -f /opt/airflow/dags/files/viagens2020/*.csv /input/viagens 
        """)

    # Creating the Hive tables to store the files.
    create_table_hive_pagamentos = HiveOperator(
        task_id='create_table_hive_pagamentos',
        hive_cli_conn_id='conn_hive',
        hql=hql_query_pagamentos)
Example #29
    description="This pipeline is made for taking backup of postgres database and seding to Azure data blob in tar.gz format",
    schedule_interval="*/5 * * * *",
    start_date=datetime(2019, 11, 1),
    catchup=False
) as dag:

    # TASK 1: Take backup of a database on local storage (in tar.gz format).
    create_backup = BashOperator(
        task_id = "Create_backup_of_localdb",
        bash_command="pg_dump -U yudi sample_database -F tar -f ~/workspace/DE-Prac/dags/backup.tar.gz"
    )

    # TASK 2: Sense the backup file, wait until it appears.
    file_sensing_task = FileSensor(
        task_id="sense_the_backup_file",
        filepath="backup.tar.gz",
        fs_conn_id="my_file_system",
        poke_interval=10,
    )

    # TASK 3: Upload it to azure data blob
    def upload_data():
        print("WORKING!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        blob_name="pg_backup"
        blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=container_name, blob_name=blob_name)
        with open("/Users/yudi/workspace/DE-Prac/dags/backup.tar.gz", "rb") as data:
            blob.upload_blob(data)


    upload_to_blob_storage = PythonOperator(
        task_id="upload_to_blob_storage",   
        python_callable=upload_data,
Example #30
# default_args are the default arguments applied to the Dag's tasks
DAG_DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('twitter_dag_v2',
         start_date=datetime(2018, 10, 1),
         schedule_interval="@daily",
         default_args=DAG_DEFAULT_ARGS,
         catchup=False) as dag:
    waiting_file_task = FileSensor(task_id="waiting_file_task",
                                   fs_conn_id="fs_default",
                                   filepath="/usr/local/airflow/data/data.csv",
                                   poke_interval=5)

    fetching_tweet_task = PythonOperator(task_id="fetching_tweet_task",
                                         python_callable=fetching_tweet.main)

    cleaning_tweet_task = PythonOperator(task_id="cleaning_tweet_task",
                                         python_callable=cleaning_tweet.main)

    load_into_hdfs_task = BashOperator(
        task_id="load_into_hdfs_task",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")

    transfer_into_hive_task = HiveOperator(
        task_id="transfer_into_hive_task",
        hql=
Example #31
def branch_callable(**context):
    client_in_db = choice([True, False])
    if client_in_db:
        return 'update_existed_customer'
    return 'create_new_customer'


with DAG(
        dag_id="max_active_dag_run_one_with_xcom_branch_operator",
        start_date=datetime(2020, 12, 1),
        schedule_interval=None,
        #schedule_interval="0 * * * *",
        user_defined_macros={'shop_filepath_macros': shop_filepath_macros},
        max_active_runs=1) as dag:
    # task 1
    get_new_data = FileSensor(task_id="get_new_data", filepath=file_path)

    # task 2
    parse_file = PythonOperator(task_id="parse_file",
                                python_callable=parse_json,
                                provide_context=True,
                                op_kwargs={'file_path': file_path})

    # task 3
    check_is_it_ne_customer = BranchPythonOperator(
        task_id="check_is_it_ne_customer", python_callable=branch_callable)

    # task 4
    create_new_customer = DummyOperator(task_id="create_new_customer")

    # task 5
Example #32
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadCSVWriteToParquet.py '+input+' '+parquet_output,
    dag = dag)

parquet_to_avro = BashOperator(
    task_id='parquet_to_avro',
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadParquetWriteToAvro.py ' + parquet_input + ' '+ avro_output,
    dag = dag)

avro_to_jdbc = BashOperator(
    task_id='avro_to_jdbc',
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadAvroWriteToJdbcPostgres.py ' + avro_input + ' ' + jdbc_target_table,
    dag = dag)

verify_parquet_file =  FileSensor(
        task_id='verify_parquet_file',
        filepath=parquet_output,
        poke_interval=5,
        timeout=20
    )

copy_parquet_file = BashOperator(
    task_id='copy_parquet_file',
    bash_command='cp ' + parquet_output + ' '+ parquet_input,
    dag = dag)

verify_avro_file = FileSensor(
        task_id='verify_avro_file',
        filepath=avro_output,
        poke_interval=5,
        timeout=20
    )
Example #33
#----------- DAG instance
dag = DAG('Transformacion_data_dag',
          description='A new attempt to save a register',
          default_args={
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(2)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

#----- Sensor to detect the confirmed-cases file
file_sensor_task_confirmed = FileSensor(dag=dag,
                                        task_id="file_sensor_confirmed",
                                        fs_conn_id=FILE_CONNECTION_ID,
                                        filepath=FILE_NAME_CONFIRMED,
                                        poke_interval=10,
                                        timeout=300)

#----- Sensor to detect the deaths file
file_sensor_task_deaths = FileSensor(dag=dag,
                                     task_id="file_sensor_deaths",
                                     fs_conn_id=FILE_CONNECTION_ID,
                                     filepath=FILE_NAME_DEATHS,
                                     poke_interval=10,
                                     timeout=300)

#----- Sensor to detect the recovered-cases file
file_sensor_task_recovered = FileSensor(dag=dag,
                                        task_id="file_sensor_recovered",
                                        fs_conn_id=FILE_CONNECTION_ID,
Example #34
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from datetime import datetime, timedelta

# Update the default arguments and apply them to the DAG.

default_args = {
    "start_date": datetime(2019, 1, 1),
    "sla": timedelta(minutes=90)
}

dag = DAG(dag_id="etl_update", default_args=default_args)

sensor = FileSensor(task_id="sense_file",
                    filepath="/home/repl/workspace/startprocess.txt",
                    poke_interval=45,
                    dag=dag)

bash_task = BashOperator(task_id="cleanup_tempfiles",
                         bash_command="rm -f /home/repl/*.tmp",
                         dag=dag)


def process_data():
    pass


python_task = PythonOperator(task_id="run_processing",
                             python_callable=process_data,
                             provide_context=True,
                             dag=dag)
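The snippet ends before any dependencies are declared; a hedged sketch of a plausible ordering for these three tasks:

# wait for the trigger file, clean up temp files, then run the processing step
sensor >> bash_task >> python_task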