) local_file_sensor = FileSensor( task_id='check_for_local_file', filepath='/usr/local/airflow/data/sample.txt', dag=word_count_dag, timeout=30, poke_interval=10, ) s3_file_sensor = S3KeySensor( task_id='check_for_s3_file', aws_conn_id='my_conn_S3', bucket_name='calculator-api', bucket_key='twitter-raw/eia-prod/input.txt', wildcard_match=True, timeout=30, poke_interval=10, dag=word_count_dag, trigger_rule=TriggerRule.ALL_FAILED, ) s3_file_download = PythonOperator( task_id='download_text_file_from_s3', python_callable=download_s3_file, op_args=[ 'calculator-api', 'twitter-raw/eia-prod/input.txt', '/usr/local/airflow/data/sample.txt' ], dag=word_count_dag, )
'retries': 1, 'retry_delay': timedelta(minutes=1), } dag = DAG(dag_id='s3_sensor_123', schedule_interval='@once', default_args=default_args) def download(key, bucket_name, local_file): hook = S3Hook(aws_conn_id='aws_default') data = hook.read_key(key, bucket_name) with open(local_file, 'wb') as file: file.write(data) bucket_key = 'driver-data/s3_sensor.csv' bucket_name = 'wdt-datalake' t0 = S3KeySensor(task_id='s3_sensor', bucket_key=bucket_key, bucket_name=bucket_name, sla=timedelta(minutes=2), dag=dag) t1 = PythonOperator(task_id='download', python_callable=download, op_args=[bucket_key, bucket_name, local_file], dag=dag) t0 >> t1
DAG_NAME, description='Load data from Procon complaints from S3 to Redshift.', start_date=start_date, schedule_interval=datetime.timedelta(hours=1), catchup=False, max_active_runs=1, ) start_operator = DummyOperator(task_id='begin_execution', dag=dag) has_file_to_process = S3KeySensor( task_id='has_file_to_process', dag=dag, bucket_name=S3_BUCKET, bucket_key=f'{S3_KEY}/*.csv', wildcard_match=True, aws_conn_id=AWS_CREDENTIALS, timeout=31, poke_interval=30, soft_fail=True, ) create_procon_stage_table = PostgresOperator( task_id='create_procon_stage_table', dag=dag, postgres_conn_id=REDSHIFT_CONN, sql=[ procon_queries['drop_stage_table'], procon_queries['create_stage_table'] ])
# dag=dag # ) moment = datetime.now() b_name = Variable.get("s3_bucket") source = Variable.get("pinpoint") year = moment.year month = '%02d' % moment.month day = '%02d' % moment.day hr = moment.hour bucket_key_template = f'{source}/{year}/{month}/{day}/ypsource.json' get_new_json = S3KeySensor(task_id="get_new_json", poke_interval=60 * 2, timeout=60 * 60 * 3, bucket_key=bucket_key_template, bucket_name=b_name, wildcard_match=False, aws_conn_id="s3_task", dag=dag) # get_from_S3 = PythonOperator( # task_id='get_from_S3', # python_callable=get_file_from_s3, # dag=dag # ) upload_to_S3_task = PythonOperator( task_id='upload_file_to_S3', python_callable=upload_file_to_S3_with_hook, params={ 'filename': '/home/akorede/Documents/mycsv.csv',
with DAG( "csv2postgres", description="Read CSV files from S3 and load into Postgres db", schedule_interval="0 12 * * *", start_date=days_ago(2), catchup=False, ) as csv2postgres: bucket_name = s3_conn.extra_dejson["bucket_name"] file_key = "s3://{{params.bucket_name}}/{{ds}}/spire2csv__query_{{params.table}}_to_csv__{{ds_nodash}}/{{params.table}}.csv" for table, query in queries.items(): check_for_files = S3KeySensor( task_id=f"check_s3_for_{table}_file", bucket_key=file_key, poke_interval=60, params={ "table": table, "bucket_name": bucket_name }, timeout=60 * 60 * 12, # timeout after 12 hours of waiting for the file ) csv2postgres_task = PythonOperator( task_id=f"{table}_csv_to_db", python_callable=csv_to_postgres, provide_context=True, params={ "table": table, "bucket_name": bucket_name }, op_kwargs={
:return: bool """ with managed_connection() as connection: process_source_dir(connection, OUTPUT_DIR, is_batch_mode=True) return SUCCESS dag = DAG('data_pipeline', default_args=default_args, schedule_interval='@daily') bucket_watcher = S3KeySensor(task_id='bucket_watcher', poke_interval=5, timeout=300, soft_fail=True, wildcard_match=True, bucket_key='*', bucket_name=INCOMING_BUCKET, dag=dag) get_file_keys = PythonOperator(task_id='get_file_keys', provide_context=True, python_callable=_get_file_keys, dag=dag) download_zip_files = PythonOperator(task_id='download_zip_files', provide_context=True, python_callable=_download_zip_files, dag=dag) move_zip_files_to_archive = PythonOperator(
"start_date": datetime(2019, 1, 24), "email": ["*****@*****.**"], "email_on_retry": False, "retry_delay": timedelta(minutes=5), "retries": 2 } dag = DAG('EMR_TEST_1', default_args=DEFAULT_ARGS, catchup=False, schedule_interval="0 1 * * *") with dag: file_sensor = S3KeySensor(task_id='file_sensor', poke_interval=600, timeout=1000, soft_fail=False, bucket_name='ds-afarrell', bucket_key='manybla.txt') create_cluster = EmrCreateJobFlowOperator( task_id='create_cluster', job_flow_overrides=JOB_FLOW_OVERRIDES, aws_conn_id='aws_default', emr_conn_id='emr_benchmarks_connection') run_some_pyspark = EmrAddStepsOperator( task_id='run_some_pyspark', job_flow_id= "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}", aws_conn_id='aws_default', steps=EMR_STEP_1)
from datetime import datetime as dt

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.sensors import S3KeySensor

# NOTE(review): start_date=dt.now() is re-evaluated on every parse of this
# file, and Airflow's documentation recommends a fixed start_date instead.
# Left unchanged because the intended date cannot be determined from here.
with DAG('s3_event') as dag:
    # Wait until any object exists in the bucket.  bucket_key='*' is a glob,
    # so wildcard_match must be enabled; without it the sensor looks for an
    # object whose key is literally "*".
    t1 = S3KeySensor(
        task_id='s3_sensor',
        bucket_name='airflow-input-coke',
        bucket_key='*',
        wildcard_match=True,
        start_date=dt.now(),
        dag=dag)

    # Runs once the sensor succeeds.
    t2 = BashOperator(
        task_id='print_key',
        bash_command='echo "I Win"',
        start_date=dt.now(),
        dag=dag
    )

    t1 >> t2
'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG('dms-cassandraV3', default_args=default_args, schedule_interval='@daily') s3ready = S3KeySensor( task_id='s3_file', poke_interval=0, timeout=15, soft_fail=False, bucket_key= 's3://dms-deploy/flood-monitoring/archive/readings-full-{{ yesterday_ds }}.csv.gz', bucket_name=None, s3_conn_id=Variable.get("s3_connection"), dag=dag) def downloadDatafile(date, credentials): filename = "readings-full-" + date + ".csv.gz" s3 = boto3.resource( "s3", aws_access_key_id=credentials['aws_access_key_id'], aws_secret_access_key=credentials['aws_secret_access_key']) try: print("Downloading File") s3.Bucket("dms-deploy").download_file(
""" dag.doc_md = #### Update DB Update DB takes in the csv downloaded from oura and loads it into postgres database as soon as dag is triggered by file landing in s3 bucket """ dag = DAG(dag_id='oura_pipeline', default_args=default_args, description='ETL', schedule_interval=timedelta(days=1)) sensor = S3KeySensor(task_id='s3_file_check', bucket_key='oura_*', wildcard_match=True, bucket_name='ouraringbackupdata', s3_conn_id='my_conn_S3', timeout=18 * 60 * 60, poke_interval=120, dag=dag) t1 = PythonOperator(task_id='read_csv', provide_context=False, python_callable=read_csv, dag=dag) t2 = PythonOperator(task_id='transform_data', provide_context=False, python_callable=transform, dag=dag) t3 = PythonOperator(task_id='Load_to_postgre',
os.system(f'spark-submit --conf spark.cores.max={max_cores} --executor-memory=3G ' +\ '$sparkf ~/eCommerce/data-processing/ingestion.py') def run_time_window(): ''' spark-submit pyspark script that maintains 24-hour window for the minute-level datatable on PostgreSQL DB ''' os.system(f'spark-submit --conf spark.cores.max=14 --executor-memory=5G ' +\ '$sparkf ~/eCommerce/data-processing/table_time_window.py') new_file_sensor = S3KeySensor( task_id='new_csv_sensor', poke_interval=5, # (seconds); checking file every 5 seconds timeout=30, # timeout in 1 hours bucket_key=f"s3://maxwell-insight/serverpool/*.csv", bucket_name=None, wildcard_match=True, dag=dag) spark_ingestion = PythonOperator(task_id='spark_ingestion', python_callable=run_ingestion, trigger_rule='none_failed', dag=dag) table_time_window = PythonOperator(task_id='table_time_window', python_callable=run_time_window, dag=dag) new_file_sensor >> spark_ingestion >> table_time_window
'retries': 0, # 'retry_delay': timedelta(minutes=5), # 'execution_timeout': timedelta(minutes=10), } dag = DAG( dag_id='deploy_stack_on_file_upload', schedule_interval=schedule, default_args=args, catchup=False, ) file_sensor = S3KeySensor( task_id='s3_key_sensor_task', poke_interval=60 * 1, # seconds timeout=60 * 10, # seconds bucket_key="s3://auto-bench/docker-compose.yml", bucket_name=None, wildcard_match=False, dag=dag) move_file = BashOperator( task_id="move_yml", bash_command= "aws s3 mv s3://auto-bench/docker-compose.yml /home/ec2-user/docker-compose.yml", dag=dag) rm_prev_stack = BashOperator(task_id="rm_prev_stack", bash_command="docker stack rm AutoBench", dag=dag) docker_prune = BashOperator(task_id="docker_prune",
task_id='s3_ingest', bash_command='aws s3 sync s3://air-flow-lightning s3://airflow-lightning-origin', queue='default', dag=dag) # Spark processing operator spark_batch = BashOperator( task_id='spark_batch', bash_command='spark-submit ~/code/Duo-flow/spark.py', queue='default', dag=dag) # S3 file sensor operator which senses the newly creatly file in S3 s3_file_sensor = S3KeySensor( task_id='s3_file_sensor', queue='default' bucket_key='s3://air-flow-lightning-output/lightning_2020output.csv', bucket_name=None, queue='default', dag=dag) # Store to DB operator that stores the result in PostgreSQL store_db = PythonOperator( task_id = 'store_db', provide_context=True, python_callable=store_db, queue='default', dag=dag)
srcDir = '/home/ubuntu/fault-tolerant-airflow/src/spark/'

# Shell command that launches the Spark batch job on the remote host.
cmd = (
    f'ssh [email protected] spark-submit {srcDir}'
    'PDS.py --master ec2-18-235-191-19.compute-1.amazonaws.com --deploy-mode=cluster'
)

# Key of the CSV named after `todays_date_str` (defined outside this view).
objectKey = f's3n://de-yk-bucket/PDS/XETR/DailyAverages/{todays_date_str!s}.csv'

# Sync the Deutsche XETR public dataset into this project's bucket.
s3_ingest_opr = BashOperator(
    dag=dag,
    task_id='s3_ingest',
    bash_command='aws s3 sync s3://deutsche-boerse-xetra-pds s3://de-yk-bucket/PDS/XETR/ ',
)

# Remote batch job that calculates the daily averages of stock prices.
spark_batch_opr = BashOperator(
    dag=dag,
    task_id='spark_batch',
    bash_command=cmd,
)

# Sense the temporarily created CSV in S3.  The bucket comes from the URL, so
# bucket_name is None; soft_fail=True skips instead of failing on timeout.
# NOTE(review): timeout (10 s) is shorter than poke_interval (60 s) -- confirm.
s3_file_sensor_opr = S3KeySensor(
    dag=dag,
    task_id='s3_file_sensor',
    bucket_name=None,
    bucket_key=objectKey,
    poke_interval=60,
    timeout=10,
    soft_fail=True,
)

# Store the calculated daily average prices in PostgreSQL.
store_to_db_opr = PythonOperator(
    dag=dag,
    task_id='store_to_db',
    python_callable=store_to_db,
    provide_context=True,
)

# Linear pipeline: ingest -> batch -> sense output -> store.
s3_ingest_opr >> spark_batch_opr >> s3_file_sensor_opr >> store_to_db_opr
python_callable=print_context, dag=dag) send_file = PythonOperator( task_id='send_file', #trigger_rule='all_success', python_callable=sendFile, op_kwargs={'filename': filename}, dag=dag) s3_chk = S3KeySensor(task_id='s3_chk', s3_conn_id='dev1_s3', depends_on_past=False, poke_interval=2, timeout=15, soft_fail=False, bucket_key='{}input/{}'.format( Variable.get('s3_buckey'), Variable.get('s3_filename')), bucket_name=None, wildcard_match=False, dag=dag) s3_create_project = PythonOperator(task_id='s3_create_project', depends_on_past=False, op_kwargs={ 'project_id': '', 'bucket': filename }, python_callable=create_structure, dag=dag)
############################################################
############################################################
# Create S3KeySensor Task:
# 5.Sense_S3_Source
############################################################
############################################################
# Sensor task that verifies the source data exists on S3.
# aws_conn_id is set up in the Admin panel of the Web UI (the default
# connection provided by Airflow is used here).
# In this case, AWS CLI is getting credentials from ~/.aws, though these
# could be encrypted as variables in Airflow.
# The bucket is expected to come from `source_data_path` itself, since
# bucket_name is None.  trigger_rule="one_failed" makes the task eligible as
# soon as at least one upstream task has failed.
t5 = S3KeySensor(
    dag=dag,
    task_id='5.Sense_S3_Source',
    aws_conn_id="aws_default",
    bucket_name=None,
    bucket_key=source_data_path,
    poke_interval=5,
    timeout=5,
    trigger_rule="one_failed",
)

############################################################
############################################################
# Create BashOperator Task:
# 6.Generate_Copied.hash
############################################################
############################################################
# This BashOperator copies data from S3 to local drive
# using AWS CLI (which needs to be available in OS)
# In this case, AWS CLI is getting credentials from ~/.aws, though these could be
# encrypted as variables in Airflow
# task_id='redshift_table', # dag=dag, # postgres_conn_id='redshift', # sql='sql/create_tables.sql' # ) # s3://udacity-dend/log_data/2018/11/ # log_data/2018/11/2018-11-01-events.json # log_data/{{year}}/{{month}}/{{yyyy-mm-dd}}-events.json # LOG_JSONPATH='s3://udacity-dend/log_json_path.json' s3_file = S3KeySensor( task_id='s3_file_check', bucket_key= 's3://udacity-dend/log_data/{{macros.ds_format(ds,"%Y-%m-%d","%Y")}}/{{macros.ds_format(ds,"%Y-%m-%d","%m")}}/{{ds}}-events.json', bucket_name=None, aws_conn_id='aws_credentials', poke_interval=2, timeout=10, soft_fail=True, dag=dag) stage_events_to_redshift = StageToRedshiftOperator( task_id='staging_events', dag=dag, # end_date=datetime(2018, 11, 30, hour=23), table='staging_events', redshift_conn_id='redshift', aws_credentials_id='aws_credentials', s3_bucket='udacity-dend', copy_sql=SqlQueries.copy_staging_events, params={'log_path': 's3://udacity-dend/log_json_path.json'}, # s3_key='log_data/'
"retries": 1, "retry_delay": timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG("s3_sensor_example", default_args=default_args, schedule_interval=timedelta(minutes=30), catchup=False) s3_sensor = S3KeySensor(task_id='s3_sensor', bucket_key='sensor_test/*', wildcard_match=True, bucket_name='cdn.getsixthman.com', s3_conn_id='sixthman_airflow_s3', dag=dag) t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag) t3 = BashOperator( task_id="templated", bash_command="echo wooooooooo", dag=dag, ) t2.set_upstream(s3_sensor) t3.set_upstream(s3_sensor)
'email_on_failure': True, 'email_on_retry': False, 'retries': 5, 'retry_delay': timedelta(minutes=10) } # run every day at 12:01 am my_dag = DAG('s3_spark_mysql', default_args=default_args, schedule_interval='1 12 * * *') t1 = S3KeySensor( task_id='s3_file_test', poke_interval=30, timeout=10, soft_fail=False, bucket_key=data_uri, #expected file bucket_name=None, wildcard_match=True, dag=my_dag) t2 = BashOperator(task_id='extract_users', depends_on_past=False, bash_command="""$SPARK_HOME/bin/spark-submit \ --packages mysql:mysql-connector-java:5.1.40 \ --master spark://ip-10-0-0-11:7077 \ --executor-memory 6G \ /home/ubuntu/venmo/spark/userinfo.py """ + data_file, dag=my_dag) t3 = BashOperator(task_id='net_spending',
task_id='watch_step_pi', job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}", step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", aws_conn_id=get_config('emr')['aws_conn_id'] ) monitor_step_op_2 = EmrStepSensor( task_id='watch_step_distcp', job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}", step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}", aws_conn_id=get_config('emr')['aws_conn_id'] ) validate_path_exists = S3KeySensor( task_id='validate_pii_exist', bucket_name='{{ params.bucket_name }}', bucket_key='{{ params.bucket_key }}', wildcard_match=True) terminate_cluster_op = EmrTerminateJobFlowOperator( task_id='remove_cluster', job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}", aws_conn_id=get_config('emr')['aws_conn_id'] ) handle_failure_op = PythonOperator( task_id='handle_failure', python_callable=handle_failure_task, trigger_rule=trigger_rule.TriggerRule.ONE_FAILED) create_cluster_op >> monitor_cluster_op >> handle_failure_op >> terminate_cluster_op
def grab_file():
    """Return the contents of ``file-to-watch-3.txt`` from the watched bucket.

    Uses an ``S3Hook`` on the ``my_conn_S3`` connection to look the key up in
    ``superconductive-airflow-bucket`` and returns whatever the key object's
    ``get_contents_as_string()`` yields.
    """
    hook = S3Hook('my_conn_S3')
    s3_object = hook.get_key("file-to-watch-3.txt", 'superconductive-airflow-bucket')
    return s3_object.get_contents_as_string()


dag = DAG(
    's3_connect_dag',
    schedule_interval='@once',
    default_args=default_args,
)

file_processor = PythonOperator(
    dag=dag,
    task_id='grab_file_from_s3',
    python_callable=grab_file,
)

# Wait for any key matching 'file-to-watch-*': poll every 2 minutes for up to
# 18 hours.
file_trigger = S3KeySensor(
    dag=dag,
    task_id='check_s3_for_file_in_s3',
    s3_conn_id='my_conn_S3',
    bucket_name='superconductive-airflow-bucket',
    bucket_key='file-to-watch-*',
    wildcard_match=True,
    poke_interval=120,
    timeout=18 * 60 * 60,
)

# The download runs only after the sensor has seen a matching file.
file_trigger.set_downstream(file_processor)
'email_on_retry': False, 'owner': 'airflow', 'start_date': datetime.now() - timedelta(days=1), } dag = DAG('af-dnaseq-align-wgs', default_args=default_args, schedule_interval=None # schedule_interval='@once' ) sensor = S3KeySensor(task_id='check_s3', bucket_name='secure-east2-test-bucket', s3_host='s3-us-east-2.amazonaws.com', bucket_key='jobs*.json', wildcard_match=True, s3_conn_id='fs_default', timeout=0, poke_interval=0, soft_fail=False, dag=dag) def create_run_jobs(queue_json_file): with TemporaryDirectory() as temp_git_dir: # create jobs job_creation_uuid = str(uuid.uuid4()) ## get job data with open(queue_json_file.name, 'r') as f: f.seek(0) queue_dict = json.loads(f.read())