def test_simple(self):
    task = FileSensor(
        task_id="test",
        filepath="etc/hosts",
        fs_conn_id='fs_default',
        _hook=self.hook,
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_default_fs_conn_id(self):
    with tempfile.NamedTemporaryFile() as tmp:
        task = FileSensor(
            task_id="test",
            filepath=tmp.name[1:],
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_wildcard_file(self):
    suffix = '.txt'
    with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
        fileglob = os.path.join(os.path.dirname(tmp.name), '*' + suffix)
        task = FileSensor(
            task_id='test',
            filepath=fileglob,
            fs_conn_id='fs_default',
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_subdirectory_empty(self):
    dir_ = tempfile.mkdtemp()
    tempfile.mkdtemp(dir=dir_)
    task = FileSensor(
        task_id='test',
        filepath=dir_,
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
        poke_interval=1,
    )
    task._hook = self.hook
    with self.assertRaises(AirflowSensorTimeout):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    shutil.rmtree(dir_)
def test_empty_dir(self):
    dir = tempfile.mkdtemp()
    task = FileSensor(
        task_id="test",
        filepath=dir[1:],
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
        poke_interval=1,
    )
    task._hook = self.hook
    try:
        with self.assertRaises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    finally:
        shutil.rmtree(dir)
def test_empty_dir(self):
    dir = tempfile.mkdtemp()
    task = FileSensor(
        task_id="test",
        filepath=dir[1:],
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
    )
    task._hook = self.hook
    try:
        with self.assertRaises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    finally:
        shutil.rmtree(dir)
def test_subdirectory_not_empty(self):
    suffix = '.txt'
    dir_ = tempfile.mkdtemp()
    subdir = tempfile.mkdtemp(dir=dir_)
    with tempfile.NamedTemporaryFile(suffix=suffix, dir=subdir):
        task = FileSensor(
            task_id='test',
            filepath=dir_,
            fs_conn_id='fs_default',
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    shutil.rmtree(dir_)
def test_file_in_dir(self):
    dir = tempfile.mkdtemp()
    task = FileSensor(
        task_id="test",
        filepath=dir[1:],
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
    )
    task._hook = self.hook
    try:
        # `touch` a file inside the dir
        open(dir + "/file", "a").close()
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    finally:
        shutil.rmtree(dir)
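The tests above reference self.dag and self.hook without showing where they come from. A minimal sketch of the fixtures they assume follows; the exact DEFAULT_DATE, dag_id, and default_args are illustrative assumptions, not part of the original excerpt.

# Hedged sketch of the test fixtures assumed above; concrete values are assumptions.
import unittest
from datetime import datetime

from airflow import DAG
from airflow.contrib.hooks.fs_hook import FSHook

DEFAULT_DATE = datetime(2015, 1, 1)

class FileSensorTest(unittest.TestCase):
    def setUp(self):
        # FSHook() with no arguments resolves the default 'fs_default' connection
        self.hook = FSHook()
        self.dag = DAG('unit_test_dag',
                       default_args={'owner': 'airflow', 'start_date': DEFAULT_DATE})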
def validate_task(validators, dag):
    """Validate that all files registered in the validators exist.

    Arguments:
    validators -- list of dictionaries containing both the name of the sensor and the file path
    dag -- DAG where the tasks are added

    Returns:
    dummy operator used as the join of all sensor tasks
    """
    join = DummyOperator(task_id='join_operator', dag=dag)
    for validator in validators:
        sensor = FileSensor(task_id=validator['name'],
                            fs_conn_id='fs_default',
                            filepath=validator['path'],
                            dag=dag)
        sensor >> join
    return join
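A short, hypothetical usage sketch for validate_task; the DAG id, sensor names, and file paths below are invented for illustration and are not from the original snippet.

# Hypothetical usage of validate_task; dag_id, names and paths are made up.
from airflow import DAG
from airflow.utils.dates import days_ago

example_dag = DAG('validation_example', start_date=days_ago(1), schedule_interval=None)

validators = [
    {'name': 'wait_for_orders', 'path': '/data/incoming/orders.csv'},
    {'name': 'wait_for_customers', 'path': '/data/incoming/customers.csv'},
]

# The returned join operator can then gate any downstream processing.
join = validate_task(validators, example_dag)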
def build_process_result_sub_dag(main_dag, default_args):
    s_dag = DAG(
        dag_id="{}.{}".format(main_dag, 'process_result_sub_dag'),
        default_args=default_args,
        schedule_interval='@hourly'
    )
    with s_dag:
        external_dag_sensor = ExternalTaskSensor(
            task_id='external_dag_sensor',
            external_dag_id=dagToCall,
            external_task_id=None,
            execution_date_fn=get_external_dag_execution_date,
            check_existence=True,
            poke_interval=5,
            timeout=120,
            soft_fail=True
        )
        ex_file_sensor = FileSensor(
            task_id="ex_file_sensor",
            filepath=ex_file
        )
        print_external_dag_result = PythonOperator(
            task_id="print_external_dag_result",
            python_callable=_print_external_dag_result,
            provide_context=True
        )
        remove_trigger_file = BashOperator(
            task_id="remove_trigger_file",
            bash_command="rm -f {}".format(path)
        )
        create_finished_file = BashOperator(
            task_id="create_finished_file",
            bash_command="touch " + default_path + "/finished_#{{ ts_nodash }}"
        )
        ex_file_sensor >> external_dag_sensor >> print_external_dag_result >> remove_trigger_file >> create_finished_file
    return s_dag
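One plausible way to attach the sub-DAG factory above to its parent is via SubDagOperator; this is a hedged sketch only, and parent_dag (with its default_args) is an assumption standing in for the real parent DAG.

# Hedged sketch: wiring the factory into a parent DAG via SubDagOperator.
# 'parent_dag' and its default_args are assumptions for illustration.
from airflow.operators.subdag_operator import SubDagOperator

process_result = SubDagOperator(
    task_id='process_result_sub_dag',
    subdag=build_process_result_sub_dag(parent_dag.dag_id, parent_dag.default_args),
    dag=parent_dag,
)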
    'retries': 2,
}

with DAG(
    # Define DAG id
    'file_sensor',
    default_args=default_args,
    description='check if a file exists inside a dir',
    tags=['explore-airflow', 'sensor'],
    # To enable/disable backfilling, set the catchup property
    catchup=False,
    # schedule interval: every 6 minutes
    schedule_interval='*/6 * * * *'
) as dag:
    file_sensor = FileSensor(
        task_id='file_sensor_task',
        # the file path has been set in admin -> connections, so just specify the file name here
        filepath='todo.json',
        # set the fs_conn_id (admin -> connections)
        fs_conn_id='my_file_system',
        # by default mode is set to 'poke', which means check repeatedly
        mode='poke',
        # wait 300 seconds between checks
        poke_interval=300,
        dag=dag
    )
    last_task = DummyOperator(task_id='last_task')

    file_sensor >> last_task
    'retry_delay': timedelta(minutes=1)
}

yesterday = date.today() - timedelta(days=1)
dt = yesterday.strftime("%Y-%m-%d")

with DAG(DAG_ID, default_args=DAG_DEFAULT_ARGS, schedule_interval=DAG_SCHEDULE_INTERVAL) as dag:
    # Initialise a FileSensor to watch for a new file arriving.
    # task_id must be unique
    # poke_interval gives the time in seconds the job should wait between tries
    waiting_file_task = FileSensor(
        task_id='waiting_file_task',
        fs_conn_id='fs_default',
        filepath='/home/airflow/airflow_files/data.csv',
        poke_interval=15,
        dag=dag)

    # Initialise a PythonOperator to execute the fetching_tweet.py script
    fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task',
                                         python_callable=fetching_tweet.main,
                                         dag=dag)

    # Initialise another PythonOperator to execute the cleaning_tweet.py script
    cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task',
                                         python_callable=cleaning_tweet.main,
                                         dag=dag)

    # Initialise a BashOperator to upload the file into HDFS
    filename = 'data_cleaned.csv'
]

dag = DAG('confirmed_covid_dag',
          description='Timeseries covid19',
          default_args={
              'owner': 'josef.perez',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(1)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

file_sensor_task = FileSensor(dag=dag,
                              task_id="file_sensor",
                              fs_conn_id=FILE_CONNECTION_ID,
                              filepath=FILE_NAME,
                              poke_interval=10,
                              timeout=300)

def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"

    df_original = pd.read_csv(file_path)
    df_processed = df_original.melt(id_vars=KEPT_COLUMNS,
                                    var_name="Date",
                                    value_name="Accumulated")
    df_processed.columns = COLUMNS
    df_processed["event_date"] = pd.to_datetime(df_processed["event_date"])
    'owner': 'avinash',
    'start_date': days_ago(1)
}

dag = DAG(dag_id='my_sample_dag', default_args=args, schedule_interval=None)

def print_file_content(**context):
    hook = FSHook('my_file_system')
    path = os.path.join(hook.get_path(), 'test.txt')
    with open(path, 'r') as fp:
        print(fp.read())
    os.remove(path)

with dag:
    sensing_task = FileSensor(
        task_id='sensing_task',
        filepath='test.txt',
        fs_conn_id='my_file_system',
        poke_interval=10
    )

    read_file_content_task = PythonOperator(
        task_id='read_file_content_task_id',
        python_callable=print_file_content,
        provide_context=True,
        retries=10,
        retry_delay=timedelta(seconds=1)
    )

    sensing_task >> read_file_content_task
from airflow.operators.hive_operator import HiveOperator
from datetime import datetime

import fetching_tweet
import cleaning_tweet

default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"}

with DAG(dag_id="twitter_dag", schedule_interval="@daily",
         default_args=default_args, catchup=False) as dag:

    # checking if tweets are available - FileSensor
    waiting_for_tweets = FileSensor(task_id="waiting_for_tweets",
                                    fs_conn_id="fs_tweet",
                                    filepath="data.csv",
                                    poke_interval=5)

    # fetching tweets
    fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                     python_callable=fetching_tweet.main)

    # cleaning tweets
    cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                     python_callable=cleaning_tweet.main)

    # storing tweets into hdfs
    storing_tweets = BashOperator(
        task_id="storing_tweets",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='my_dag',
    default_args=default_args,
    schedule_interval=None,
)

today = datetime.today().strftime("%m-%d-%Y")
input_img = f"exercise-dataset/daily/{today}/image.jpg"
preproc_img = f"exercise-dataset/daily/{today}/preprocessed.jpg"
prediction = f"exercise-dataset/daily/{today}/result.json"

file_sensor = FileSensor(task_id="wait_data_exists", filepath=input_img, dag=dag)

def preprocess_img():
    img = cv2.imread(input_img, cv2.IMREAD_COLOR)
    larger = cv2.resize(img, (100, 100))
    gray = cv2.cvtColor(larger, cv2.COLOR_BGR2GRAY)
    # write the result to the preprocessed-image path defined above
    cv2.imwrite(preproc_img, gray)

preprocess = PythonOperator(
    task_id='preprocess',
    python_callable=preprocess_img,
    dag=dag,
)
schedule_interval="@daily", default_args=default_args, catchup=False) as dag: is_forex_rates_available = HttpSensor( task_id="is_forex_rates_available", method="GET", http_conn_id="forex_api", endpoint="latest", response_check=lambda response: "rates" in response.text, poke_interval=5, timeout=20) is_forex_currencies_file_available = FileSensor( task_id="is_forex_currencies_file_available", fs_conn_id="forex_path", filepath="forex_currencies.csv", poke_interval=5, timeout=20) downloading_rates = PythonOperator(task_id="downloading_rates", python_callable=download_rates) saving_rates = BashOperator(task_id="saving_rates", bash_command=""" hdfs dfs -mkdir -p /forex && \ hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex """) creating_forex_rates_table = HiveOperator( task_id="creating_forex_rates_table", hive_cli_conn_id="hive_conn",
def is_monday(*args, **context):
    execution_date = context['execution_date']
    weekday = execution_date.in_timezone("Europe/London").weekday()
    return 'create_report' if weekday == 0 else 'none'

with DAG(dag_id="invoices_dag",
         schedule_interval="@daily",
         default_args=default_args,
         template_searchpath=[f"{os.environ['AIRFLOW_HOME']}"],
         catchup=False) as dag:

    # This file could come in S3 from our ecommerce application
    is_new_data_available = FileSensor(task_id="is_new_data_available",
                                       fs_conn_id="data_path",
                                       filepath="data.csv",
                                       poke_interval=5,
                                       timeout=20)

    notify_file_failed = SlackWebhookOperator(
        task_id='notify_file_failed',
        http_conn_id='slack_conn',
        webhook_token=slack_token,
        trigger_rule='all_failed',
        message="Error Notification \n"
                "Data was missing! \n "
                "https://www.youtube.com/watch?v=ZDEVut4j7eU",
        username='******',
        icon_url='https://raw.githubusercontent.com/apache/'
                 'airflow/master/airflow/www/static/pin_100.png',
        dag=dag)
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

word_count_dag = DAG(
    'word-counter',
    default_args=default_args,
    description='Count the frequency of occurrence of a word in a file',
    schedule_interval=timedelta(days=1),
)

local_file_sensor = FileSensor(
    task_id='check_for_local_file',
    filepath='/usr/local/airflow/data/sample.txt',
    dag=word_count_dag,
    timeout=30,
    poke_interval=10,
)

s3_file_sensor = S3KeySensor(
    task_id='check_for_s3_file',
    aws_conn_id='my_conn_S3',
    bucket_name='calculator-api',
    bucket_key='twitter-raw/eia-prod/input.txt',
    wildcard_match=True,
    timeout=30,
    poke_interval=10,
    dag=word_count_dag,
    trigger_rule=TriggerRule.ALL_FAILED,
)
    #logger.info(f"Rows inserted {len(df.index)}")

dag = DAG('confirmed',
          description='Load COVID confirmed cases',
          default_args={
              'owner': 'grupo.dos',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(1)
          },
          schedule_interval='0 1 * * *',
          catchup=True)

file_sensor_task = FileSensor(
    dag=dag,
    task_id="readfile_sensor",
    fs_conn_id=FILE_CONNECTION_ID,
    filepath=FILE_NAME,
    poke_interval=10,
    timeout=300,
    #provide_context = True
)

etl_operator = PythonOperator(dag=dag,
                              task_id="etl_confirmed",
                              python_callable=etl_process,
                              provide_context=True)

file_sensor_task >> etl_operator
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from datetime import date, timedelta, datetime

import fetching_tweet
import cleaning_tweet

DAG_DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('first_data_pipeline',
         start_date=datetime(2019, 2, 14),
         schedule_interval='@daily',
         default_args=DAG_DEFAULT_ARGS,
         catchup=False) as dag:

    # Create tasks
    waiting_file_task = FileSensor(task_id='waiting_file_task',
                                   fs_conn_id='fs_default',
                                   filepath='./data.csv',
                                   poke_interval=5)
    fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task',
                                         python_callable=fetching_tweet.main)
    cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task',
                                         python_callable=cleaning_tweet.main)
    loading_into_hdfs_task = BashOperator(task_id='loading_into_hdfs_task',
                                          bash_command='hadoop fs -put -f /temp/data_cleaned.csv /tmp/')
    transfer_into_hive_task = HiveOperator(task_id='transfer_into_hive_task',
                                           hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets PARTITION(dt='2018-10-01')")

    # Connect tasks into the DAG
    waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> loading_into_hdfs_task >> transfer_into_hive_task
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from dags.process import process_data
from datetime import datetime, timedelta

# Update the default arguments and apply them to the DAG.
default_args = {
    'start_date': datetime(2019, 1, 1),
    'sla': timedelta(minutes=90)
}

dag = DAG(dag_id='etl_update', default_args=default_args)

sensor = FileSensor(task_id='sense_file',
                    filepath='/home/repl/workspace/startprocess.txt',
                    poke_interval=45,
                    dag=dag)

bash_task = BashOperator(task_id='cleanup_tempfiles',
                         bash_command='rm -f /home/repl/*.tmp',
                         dag=dag)

python_task = PythonOperator(task_id='run_processing',
                             python_callable=process_data,
                             provide_context=True,
                             dag=dag)

email_subject = """
  Email report for {{ params.department }} on {{ ds_nodash }}
"""
dag = DAG('mainDAG',
          description="Dag to Ingest CSV's",
          default_args={
              'owner': 'MaiBoris',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(5)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

sensor1 = FileSensor(task_id="file_sensor_deaths",
                     dag=dag,
                     filepath='deaths.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
                     timeout=600)

sensor2 = FileSensor(task_id="file_sensor_confirmed",
                     dag=dag,
                     filepath='confirmed.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
                     timeout=600)

sensor3 = FileSensor(task_id="file_sensor_recovered",
                     dag=dag,
                     filepath='recovered.csv',
                     fs_conn_id='my_file_system',
                     poke_interval=10,
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor

with DAG(dag_id="file_sensor_consume_new_data",
         start_date=datetime(2020, 12, 1),
         schedule_interval="0 * * * *") as dag:
    # task 1
    get_new_data = FileSensor(
        task_id="get_new_data",
        filepath="../shop123/{{ ds_nodash }}/${hour}/data.json")
    # task 2
    parse_file = DummyOperator(task_id="parse_file")
    # task 3
    check_is_it_ne_customer = DummyOperator(task_id="check_is_it_ne_customer")
    # task 4
    create_new_customer = DummyOperator(task_id="create_new_customer")
    # task 5
    update_existed_customer = DummyOperator(task_id="update_existed_customer")
    # task 6
    get_new_data >> parse_file >> check_is_it_ne_customer >> [
        create_new_customer, update_existed_customer
    ]
# Start the project
starting = BashOperator(task_id='starting',
                        bash_command='echo Start no projeto viagens 2020')

# Create the directory to store the data
create_folder_viagens = PythonOperator(
    task_id='create_folder_viagens',
    python_callable=_create_folder_viagens)

# Download the data
download_file = PythonOperator(task_id='download_file',
                               python_callable=_download_file)

# Sensor to watch for the .csv files
sensor_file_csv = FileSensor(task_id='sensor_file_csv',
                             fs_conn_id='viagens_path',
                             filepath="*.csv",
                             poke_interval=5,
                             timeout=20)

# Save the files into HDFS
saving_hdfs = BashOperator(
    task_id='saving_hdfs',
    bash_command="""
    hdfs dfs -mkdir -p /input/viagens && \
    hdfs dfs -put -f /opt/airflow/dags/files/viagens2020/*.csv /input/viagens
    """)

# Create the Hive tables to store the files.
create_table_hive_pagamentos = HiveOperator(
    task_id='create_table_hive_pagamentos',
    hive_cli_conn_id='conn_hive',
    hql=hql_query_pagamentos)
description="This pipeline is made for taking backup of postgres database and seding to Azure data blob in tar.gz format", schedule_interval="*/5 * * * *", start_date=datetime(2019, 11, 1), catchup=False ) as dag: # TASK 1: Take backup of a database on local storage (in tar.gz format). create_backup = BashOperator( task_id = "Create_backup_of_localdb", bash_command="pg_dump -U yudi sample_database -F tar -f ~/workspace/DE-Prac/dags/backup.tar.gz" ) # TASK 2: Sense the backup file, wait until it appears. file_sensing_task = FileSensor( task_id="sense_the_backup_file", filepath="backup.tar.gz", fs_conn_id="my_file_system", poke_interval=10, ) # TASK 3: Upload it to azure data blob def upload_data(): print("WORKING!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") blob_name="pg_backup" blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=container_name, blob_name=blob_name) with open("/Users/yudi/workspace/DE-Prac/dags/backup.tar.gz", "rb") as data: blob.upload_blob(data) upload_to_blob_storage = PythonOperator( task_id="upload_to_blob_storage", python_callable=upload_data,
# default_args are the default arguments applied to the DAG's tasks
DAG_DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('twitter_dag_v2',
         start_date=datetime(2018, 10, 1),
         schedule_interval="@daily",
         default_args=DAG_DEFAULT_ARGS,
         catchup=False) as dag:

    waiting_file_task = FileSensor(task_id="waiting_file_task",
                                   fs_conn_id="fs_default",
                                   filepath="/usr/local/airflow/data/data.csv",
                                   poke_interval=5)

    fetching_tweet_task = PythonOperator(task_id="fetching_tweet_task",
                                         python_callable=fetching_tweet.main)

    cleaning_tweet_task = PythonOperator(task_id="cleaning_tweet_task",
                                         python_callable=cleaning_tweet.main)

    load_into_hdfs_task = BashOperator(
        task_id="load_into_hdfs_task",
        bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")

    transfer_into_hive_task = HiveOperator(
        task_id="transfer_into_hive_task",
        hql=
def branch_callable(**context):
    client_in_db = choice([True, False])
    if client_in_db:
        return 'update_existed_customer'
    return 'create_new_customer'

with DAG(
        dag_id="max_active_dag_run_one_with_xcom_branch_operator",
        start_date=datetime(2020, 12, 1),
        schedule_interval=None,
        #schedule_interval="0 * * * *",
        user_defined_macros={'shop_filepath_macros': shop_filepath_macros},
        max_active_runs=1) as dag:
    # task 1
    get_new_data = FileSensor(task_id="get_new_data", filepath=file_path)
    # task 2
    parse_file = PythonOperator(task_id="parse_file",
                                python_callable=parse_json,
                                provide_context=True,
                                op_kwargs={'file_path': file_path})
    # task 3
    check_is_it_ne_customer = BranchPythonOperator(
        task_id="check_is_it_ne_customer",
        python_callable=branch_callable)
    # task 4
    create_new_customer = DummyOperator(task_id="create_new_customer")
    # task 5
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadCSVWriteToParquet.py ' + input + ' ' + parquet_output,
    dag=dag)

parquet_to_avro = BashOperator(
    task_id='parquet_to_avro',
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadParquetWriteToAvro.py ' + parquet_input + ' ' + avro_output,
    dag=dag)

avro_to_jdbc = BashOperator(
    task_id='avro_to_jdbc',
    bash_command='cd /home/airflow/pyspark_airflow && spark-submit --py-files pyspark_allsources.zip com/rposam/process/etlpipeline/airflow/ReadAvroWriteToJdbcPostgres.py ' + avro_input + ' ' + jdbc_target_table,
    dag=dag)

verify_parquet_file = FileSensor(
    task_id='verify_parquet_file',
    filepath=parquet_output,
    poke_interval=5,
    timeout=20)

copy_parquet_file = BashOperator(
    task_id='copy_parquet_file',
    bash_command='cp ' + parquet_output + ' ' + parquet_input,
    dag=dag)

verify_avro_file = FileSensor(
    task_id='verify_avro_file',
    filepath=avro_output,
    poke_interval=5,
    timeout=20)
# ----------- DAG instantiation
dag = DAG('Transformacion_data_dag',
          description='A new attempt to save a register',
          default_args={
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(2)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

# ----- Sensor to detect the confirmed-cases file
file_sensor_task_confirmed = FileSensor(dag=dag,
                                        task_id="file_sensor_confirmed",
                                        fs_conn_id=FILE_CONNECTION_ID,
                                        filepath=FILE_NAME_CONFIRMED,
                                        poke_interval=10,
                                        timeout=300)

# ----- Sensor to detect the deaths file
file_sensor_task_deaths = FileSensor(dag=dag,
                                     task_id="file_sensor_deaths",
                                     fs_conn_id=FILE_CONNECTION_ID,
                                     filepath=FILE_NAME_DEATHS,
                                     poke_interval=10,
                                     timeout=300)

# ----- Sensor to detect the recovered file
file_sensor_task_recovered = FileSensor(dag=dag,
                                        task_id="file_sensor_recovered",
                                        fs_conn_id=FILE_CONNECTION_ID,
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from datetime import datetime, timedelta

# Update the default arguments and apply them to the DAG.
default_args = {
    "start_date": datetime(2019, 1, 1),
    "sla": timedelta(minutes=90)
}

dag = DAG(dag_id="etl_update", default_args=default_args)

sensor = FileSensor(task_id="sense_file",
                    filepath="/home/repl/workspace/startprocess.txt",
                    poke_interval=45,
                    dag=dag)

bash_task = BashOperator(task_id="cleanup_tempfiles",
                         bash_command="rm -f /home/repl/*.tmp",
                         dag=dag)

def process_data():
    pass

python_task = PythonOperator(task_id="run_processing",
                             python_callable=process_data,
                             provide_context=True,
                             dag=dag)