from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}


def dummy(*args, **kwargs):
    """Dummy function"""
    return "pass"


with DAG(
    dag_id='example_xcom_args',
    default_args=args,
    schedule_interval=None,
    tags=['example'],
) as dag:
    task1 = PythonOperator(
        task_id='task1',
        python_callable=dummy,
    )

    task2 = PythonOperator(
        task_id='task2',
        python_callable=dummy,
        op_kwargs={"dummy": task1.output},
    )
    # get value_2
    pulled_value_2 = ti.xcom_pull(task_ids='push_by_returning')
    if pulled_value_2 != value_2:
        raise ValueError(f'The two values differ {pulled_value_2} and {value_2}')

    # get both value_1 and value_2
    pulled_value_1, pulled_value_2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    if pulled_value_1 != value_1:
        raise ValueError(f'The two values differ {pulled_value_1} and {value_1}')
    if pulled_value_2 != value_2:
        raise ValueError(f'The two values differ {pulled_value_2} and {value_2}')


push1 = PythonOperator(
    task_id='push',
    dag=dag,
    python_callable=push,
)

push2 = PythonOperator(
    task_id='push_by_returning',
    dag=dag,
    python_callable=push_by_returning,
)

pull = PythonOperator(
    task_id='puller',
    dag=dag,
    python_callable=puller,
)
# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_backfill_pooled_task_dag', default_args=default_args)
dag1_task1 = DummyOperator(
    task_id='test_backfill_pooled_task',
    dag=dag1,
    pool='test_backfill_pooled_task_pool',
)

# dag2 has been moved to test_prev_dagrun_dep.py

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(task_id='test_dagrun_fail', dag=dag3, python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(task_id='test_dagrun_succeed',
    default_args=args,
    schedule_interval=None,
    tags=['example'],
)


# [START howto_operator_python]
def print_context(ds, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(
    task_id='print_the_context',
    python_callable=print_context,
    dag=dag,
)
# [END howto_operator_python]


# [START howto_operator_python_kwargs]
def my_sleeping_function(random_base):
    """This is a function that will run within the DAG execution"""
    time.sleep(random_base)


# Generate 5 sleeping tasks, sleeping from 0.0 to 0.4 seconds respectively
for i in range(5):
    task = PythonOperator(
    timeout=60,
    poke_interval=10,
    retries=100,
    mode='poke',
)

transformer_sensor = PythonSensor(
    task_id='transformer_sensor',
    python_callable=_wait_for_file,
    op_kwargs={'path': '{{ var.value.transformer_path }}'},
    timeout=60,
    poke_interval=10,
    retries=100,
    mode='poke',
)

predict = PythonOperator(
    task_id='predict',
    python_callable=_predict,
    op_kwargs={
        'test_data_path': '/opt/airflow/data/raw/{{ ds }}/test.csv',
        'model_path': '{{ var.value.model_path }}',
        'transformer_path': '{{ var.value.transformer_path }}',
        'output_dir': '/opt/airflow/data/predictions/{{ ds }}/',
    },
)

[data_sensor, model_sensor, transformer_sensor] >> predict
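# _wait_for_file (used by the PythonSensor tasks above) is not shown in this
# snippet. A minimal sketch, assuming the poke callable only needs to report
# whether the templated path exists yet:
import os


def _wait_for_file(path: str) -> bool:
    """Return True once the file exists so the sensor stops poking."""
    return os.path.exists(path)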
with DAG(
    dag_id='example_twitter_dag',
    default_args=default_args,
    schedule_interval="@daily",
    tags=['example'],
) as dag:
    # --------------------------------------------------------------------------------
    # This task should call the Twitter API and retrieve tweets from yesterday, from
    # and to the four twitter users (Twitter_A,..,Twitter_D). There should be eight
    # csv output files generated by this task, and the naming convention is
    # direction(from or to)_twitterHandle_date.csv
    # --------------------------------------------------------------------------------
    fetch_tweets = PythonOperator(task_id='fetch_tweets', python_callable=fetchtweets)

    # --------------------------------------------------------------------------------
    # Clean the eight files. In this step you can get rid of or cherry-pick columns
    # and different parts of the text.
    # --------------------------------------------------------------------------------
    clean_tweets = PythonOperator(task_id='clean_tweets', python_callable=cleantweets)

    clean_tweets << fetch_tweets

    # --------------------------------------------------------------------------------
    # In this section you can use a script to analyze the twitter data. Could simply
    # be a sentiment analysis through algorithms like bag of words or something more
    # complicated. You can also take a look at Web Services to do such tasks
        hero_img = x.get('img')
        table += f'''<tr><td class="hero_img_name">{hero_img}</td><td class="hero_name">{hero_name}</td><td class="winrate">{hero_media_winrate}</td></tr>'''
    table += '''</table> </html>'''
    return table


def salva_html(ds, **kwargs):
    ti = kwargs['ti']
    table = ti.xcom_pull(task_ids='gera_html')
    with open('table_winrate.html', 'w') as f:
        f.write(table)


run_coleta_winrate_heroes = PythonOperator(
    task_id='coleta_winrate_heroes',
    python_callable=coleta_winrate_heroes,
)

run_salva_mongo_coleta = PythonOperator(
    task_id='salva_mongo_coleta',
    python_callable=salva_mongo,
    op_kwargs={
        'task_id': 'coleta_winrate_heroes',
        'database': 'dota_col',
        'collection': 'winrate_meta'
    },
)

run_gera_media_winrate = PythonOperator(
    task_id='gera_media_winrate',
    python_callable=gera_media_winrate,
    'retries': 1,
    'retry_delay': timedelta(hours=1),
}

dag = DAG('basic_pipeline', default_args=project_cfg, schedule_interval=timedelta(days=1))


def example_task(_id, **kwargs):
    print("Task {}".format(_id))
    return "completed task {}".format(_id)


task_1 = PythonOperator(
    task_id='task_1',
    provide_context=True,
    python_callable=example_task,
    op_kwargs={'_id': 1},
    dag=dag,
)

task_2 = PythonOperator(
    task_id='task_2',
    provide_context=True,
    python_callable=example_task,
    op_kwargs={'_id': 2},
    dag=dag,
)

task_1 >> task_2
def _fetch_dataset_new():
    print("Fetching data (NEW)...")


with DAG(
    dag_id="03_branching",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="@daily",
) as dag:
    start = DummyOperator(task_id="start")

    pick_branch = BranchPythonOperator(task_id="pick_branch", python_callable=_pick_branch)

    fetch_dataset_old = PythonOperator(task_id="fetch_dataset_old", python_callable=_fetch_dataset_old)
    fetch_dataset_new = PythonOperator(task_id="fetch_dataset_new", python_callable=_fetch_dataset_new)
    fetch_another_dataset = DummyOperator(task_id="fetch_another_dataset")

    join_datasets = DummyOperator(task_id="join_datasets", trigger_rule="none_failed")

    train_model = DummyOperator(task_id="train_model")
    deploy_model = DummyOperator(task_id="deploy_model")

    start >> pick_branch
    pick_branch >> [fetch_dataset_old, fetch_dataset_new]
    [fetch_dataset_old, fetch_dataset_new, fetch_another_dataset] >> join_datasets
    join_datasets >> train_model >> deploy_model
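# _pick_branch (used by the BranchPythonOperator above) is not shown. A minimal
# sketch, assuming the branch is picked by comparing the run's execution date to
# a cutover date; ERP_CHANGE_DATE is a hypothetical name used only for illustration:
import airflow.utils.dates

ERP_CHANGE_DATE = airflow.utils.dates.days_ago(1)


def _pick_branch(**context):
    # BranchPythonOperator follows whichever task_id(s) this callable returns.
    if context["execution_date"] < ERP_CHANGE_DATE:
        return "fetch_dataset_old"
    return "fetch_dataset_new"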
    df.to_csv('dags/postgresqldata.csv')
    print("-------Data Saved------")


def insertElasticsearch():
    es = Elasticsearch()
    df = pd.read_csv('dags/postgresqldata.csv')
    for _, r in df.iterrows():
        doc = r.to_json()
        res = es.index(index="frompostgresql", doc_type="doc", body=doc)
        print(res)


default_args = {
    'owner': 'sbahaddi',
    'start_date': dt.datetime(2021, 3, 25),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG('MyDBdag',
         default_args=default_args,
         schedule_interval='@daily',
         ) as dag:
    getData = PythonOperator(task_id='QueryPostgreSQL', python_callable=queryPostgresql)
    insertData = PythonOperator(
        task_id='InsertDataElasticsearch', python_callable=insertElasticsearch)

    getData >> insertData
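# Only the tail of queryPostgresql appears at the top of this snippet. A minimal
# sketch of the full function, assuming psycopg2 and a local database; the
# connection string and query here are illustrative only, not from the original:
import pandas as pd
import psycopg2


def queryPostgresql():
    conn = psycopg2.connect("dbname=dataengineering host=localhost user=postgres password=postgres")
    df = pd.read_sql("SELECT name, city FROM users", conn)
    df.to_csv('dags/postgresqldata.csv')
    print("-------Data Saved------")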
    Tests whether the volume has been mounted.
    """
    with open('/foo/volume_mount_test.txt', 'w') as foo:
        foo.write('Hello')

    return_code = os.system("cat /foo/volume_mount_test.txt")
    if return_code != 0:
        raise ValueError(
            f"Error when checking volume mount. Return code {return_code}")


# You can use annotations on your kubernetes pods!
start_task = PythonOperator(task_id="start_task",
                            python_callable=print_stuff,
                            executor_config={
                                "KubernetesExecutor": {
                                    "annotations": {
                                        "test": "annotation"
                                    }
                                }
                            })

# You can mount volume or secret to the worker pod
second_task = PythonOperator(
    task_id="four_task",
    python_callable=test_volume_mount,
    executor_config={
        "KubernetesExecutor": {
            "volumes": [
                {
                    "name": "example-kubernetes-test-volume",
                    "hostPath": {
    end_date = Variable.get('narrativedx_end_date',
                            default_var=first_of_month - timedelta(days=1))
    start_date = Variable.get('narrativedx_start_date',
                              default_var=first_of_month - timedelta(days=end_date.day))

    sql = sql.format(start_date=start_date, end_date=end_date, surv=service)

    df = pd.read_sql(sql, ppw_engine)
    df.to_csv(basepath.joinpath(f'NarrativeDX - {service} - {exec_date}.csv'))


queries = []
for service in services:
    delete = PythonOperator(task_id=f'delete_older_{service}_file',
                            python_callable=delete_older_file,
                            op_kwargs={'service': service},
                            dag=dag)

    query = PythonOperator(task_id=f'query_narrativedx_{service}',
                           python_callable=query_narrativedx,
                           op_kwargs={'service': service},
                           dag=dag)

    sftp = SFTPOperator(
        task_id=f'upload_{service}_to_sftp',
        ssh_conn_id='coh_sftp',
        local_filepath=str(
            basepath.joinpath(f'NarrativeDX - {service} - {exec_date}.csv')),
        remote_filepath=f'/sftp/NarrativeDX - {service} - {exec_date}.csv',
        operation='put',
        create_intermediate_dirs=True,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Using a DAG context manager, you don't have to specify the dag property of each task
with DAG(
    'rock_content_item_backfill_example_dag',
    start_date=datetime(2021, 2, 22),
    max_active_runs=1,
    schedule_interval='@once',
    default_args=default_args,
    # catchup=False  # enable if you don't want historical dag runs to run
) as dag:
    t0 = PythonOperator(
        task_id='fetch_and_save_content_items',
        python_callable=fetch_and_save_content_items,  # make sure you don't include the () of the function
        op_kwargs={'client': None})

    t1 = PythonOperator(
        task_id='fetch_and_save_content_items_connections',
        python_callable=fetch_and_save_content_items_connections,  # make sure you don't include the () of the function
        op_kwargs={
            'client': None,
            'do_backfill': True
        })

    t0 >> t1
    'owner': 'airflow',
}


def print_cwd(ds, **kwargs):
    """Print and return the current working directory."""
    print(Path.cwd())
    return str(Path.cwd())


dag = DAG(
    dag_id='tika_bash_operator',
    default_args=args,
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['curl_tika'],
    params={"example_key": "example_value"},
)

run_this = BashOperator(
    task_id='run_curl',
    bash_command='curl -T /opt/airflow/dags/LICENSE http://0.0.0.0:9998/meta',
    dag=dag,
)

run_this0 = PythonOperator(
    task_id='print_the_context',
    python_callable=print_cwd,
    dag=dag,
)

run_this0 >> run_this
        return ['accurate', 'in_accurate']
    return 'in_accurate'


with DAG('xcom_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    downloading_data = BashOperator(task_id='downloading_data',
                                    bash_command='sleep 3',
                                    do_xcom_push=False)

    with TaskGroup('processing_tasks') as processing_tasks:
        training_model_a = PythonOperator(task_id='training_model_a',
                                          python_callable=_training_model)
        training_model_b = PythonOperator(task_id='training_model_b',
                                          python_callable=_training_model)
        training_model_c = PythonOperator(task_id='training_model_c',
                                          python_callable=_training_model)

    choose_model = BranchPythonOperator(task_id='task_4',
                                        python_callable=_choose_best_model)

    accurate = DummyOperator(task_id='accurate')
    in_accurate = DummyOperator(task_id='in_accurate')

    downloading_data >> processing_tasks >> choose_model
    choose_model >> [accurate, in_accurate]
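# _training_model (used by the three training tasks above) is not shown. A
# minimal sketch, assuming each task simply pushes a random accuracy score to
# XCom for _choose_best_model to compare:
from random import uniform


def _training_model(ti):
    accuracy = uniform(0.1, 10.0)
    print(f'model accuracy: {accuracy}')
    ti.xcom_push(key='model_accuracy', value=accuracy)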
def print_hello(**context):
    received_value = context['ti'].xcom_pull(key='random_value')
    print(f'hello, I received the following {str(received_value)}')


def branch_func(**context):
    if random.random() < 0.5:
        return 'say_hi'
    return 'say_hello'


run_this_task = PythonOperator(task_id='run_this',
                               python_callable=push_to_xcom,
                               provide_context=True,
                               retries=10,
                               retry_delay=timedelta(seconds=1),
                               dag=dag)

run_this_task_2 = PythonOperator(task_id='say_hi',
                                 python_callable=print_hi,
                                 provide_context=True,
                                 dag=dag)

run_this_task_3 = PythonOperator(task_id='say_hello',
                                 python_callable=print_hello,
                                 provide_context=True,
                                 dag=dag)

branch_op = BranchPythonOperator(task_id='branch_task',
                                 python_callable=branch_func,
    catchup=False,
    tags=['example666'],
    render_template_as_native_obj=True,
) as dag:

    def extract():
        data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}'
        return json.loads(data_string)

    def transform(order_data):
        print(type(order_data))
        print(order_data)
        total_order_value = 0
        for value in order_data.values():
            total_order_value += value
        return {"total_order_value": total_order_value}

    extract_task = PythonOperator(
        task_id="extract",
        python_callable=extract,
    )

    transform_task = PythonOperator(
        task_id="transform",
        op_kwargs={"order_data": "{{ti.xcom_pull('extract')}}"},
        python_callable=transform,
    )

    extract_task >> transform_task


if __name__ == "__main__":
    dag.cli()
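# Because the DAG above sets render_template_as_native_obj=True, the templated
# op_kwargs value "{{ti.xcom_pull('extract')}}" is rendered with Jinja's
# NativeEnvironment, so transform() receives the original dict rather than its
# string repr. A rough standalone illustration of that difference (plain Jinja2,
# outside Airflow):
from jinja2 import Environment
from jinja2.nativetypes import NativeEnvironment

data = {"1001": 301.27}
print(type(Environment().from_string("{{ d }}").render(d=data)))        # <class 'str'>
print(type(NativeEnvironment().from_string("{{ d }}").render(d=data)))  # <class 'dict'>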
    logging.info(f'Total count is: {error_count}')
    logging.info(f'Error list is: {error_list}')
    return error_count, error_list


# Create Airflow dag
default_args = {"retries": 2, 'retry_delay': timedelta(minutes=5)}

with DAG(dag_id='loganalyzer',
         default_args=default_args,
         description='A simple DAG',
         schedule_interval='0 18 * * 1-5',
         start_date=datetime(2021, 11, 20, hour=18),
         catchup=False) as dag:

    t1 = PythonOperator(task_id='aapl_log_errors',
                        python_callable=analyze_file,
                        op_kwargs={
                            'stock': 'aapl',
                            'log_dir': base_log_folder
                        })

    t2 = PythonOperator(task_id='tsla_log_errors',
                        python_callable=analyze_file,
                        op_kwargs={
                            'stock': 'tsla',
                            'log_dir': base_log_folder
                        })

    t1 >> t2
"retries": 1, "retry_delay": timedelta(minutes=5), } with DAG( dag_id="covid_data_dag", default_args=default_args, description= "DAG to update Covid 19 data daily to push to a Postgres database.", schedule_interval='30 9 * * *', start_date=datetime(2021, 8, 24), ) as dag: # Initiate tasks task_1 = DummyOperator(task_id="Initiate_DAG") task_2 = PythonOperator( task_id="dashboard_update", python_callable=covid_19_dashboard_update, op_kwargs={ "username": config.username, "password": passwords_dict.get('postgres_password'), "database": config.database, "table_name": config.table_name, "columns": config.columns, "geo_ids_url": config.geo_ids_url, }, ) task_1 >> task_2
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.timezone import datetime

DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(start_date=DEFAULT_DATE, owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()


# DAG tests that tasks ignore all dependencies
dag1 = DAG(dag_id='test_run_ignores_all_dependencies',
           default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(task_id='test_run_dependency_task',
                            python_callable=fail,
                            dag=dag1)
dag1_task2 = PythonOperator(task_id='test_run_dependent_task',
                            python_callable=success,
                            dag=dag1)
dag1_task1.set_downstream(dag1_task2)
""" Tests whether the volume has been mounted. """ with open('/foo/volume_mount_test.txt', 'w') as foo: foo.write('Hello') return_code = os.system("cat /foo/volume_mount_test.txt") if return_code != 0: raise ValueError(f"Error when checking volume mount. Return code {return_code}") # You can use annotations on your kubernetes pods! start_task = PythonOperator( task_id="start_task", python_callable=print_stuff, executor_config={ "KubernetesExecutor": { "annotations": {"test": "annotation"} } } ) # You can mount volume or secret to the worker pod second_task = PythonOperator( task_id="four_task", python_callable=test_volume_mount, executor_config={ "KubernetesExecutor": { "volumes": [ { "name": "example-kubernetes-test-volume", "hostPath": {"path": "/tmp/"},
        # Loop through metadata results
        for field, ts_obj in r.json().items():
            device_metadata[field] = ts_obj[0]  # assign the first (latest) value
            print(f'Adding {field}->{ts_obj[0]} to metadata result payload')
        results_metadata[d_id] = device_metadata

    print(json.dumps(results_metadata))
    return json.dumps(results_metadata)


##############################################

flashflood_authenticate_task = PythonOperator(
    task_id='flashflood_authenticate',
    python_callable=aware.flashfloodinfo_authenticate)

flashflood_get_customer = PythonOperator(
    task_id='flashflood_get_customer',
    python_callable=aware.flashflood_get_customer,
    op_kwargs={
        'token':
            "{{task_instance.xcom_pull(task_ids='flashflood_authenticate')}}"
    })

get_aware_devices_task = PythonOperator(
    task_id='get_aware_devices',
    python_callable=aware.get_aware_devices,
    op_kwargs={
        'token':
        total_order_value += value

    total_value = {"total_order_value": total_order_value}
    total_value_json_string = json.dumps(total_value)
    ti.xcom_push('total_order_value', total_value_json_string)


def load(**kwargs):
    ti = kwargs['ti']
    total_value_string = ti.xcom_pull(task_ids='transform', key='total_order_value')
    total_order_value = json.loads(total_value_string)
    print(total_order_value)


extract_task = PythonOperator(
    task_id='extract',
    python_callable=extract,
)
extract_task.doc_md = """\
#### Extract task
A simple Extract task to get data ready for the rest of the data pipeline.
In this case, getting data is simulated by reading from a hardcoded JSON string.
This data is then put into xcom, so that it can be processed by the next task.
"""

transform_task = PythonOperator(
    task_id='transform',
    python_callable=transform,
)
transform_task.doc_md = """\
#### Transform task
A simple Transform task which takes in the collection of order data from xcom
    tempfile = read_s3(kwargs['file'])
    conn = settings.engine.raw_connection()
    try:
        with open(tempfile, 'r') as f:
            cursor = conn.cursor()
            cursor.copy_expert(query, f)
            conn.commit()
    finally:
        conn.close()
        os.remove(tempfile)


with DAG(dag_id=dag_id, schedule_interval=None, catchup=False, start_date=days_ago(1)) as dag:

    pause_dags_t = PythonOperator(
        task_id="pause_dags",
        python_callable=pause_dags
    )

    with TaskGroup(group_id='import') as import_t:
        for x in OBJECTS_TO_IMPORT:
            load_task = PythonOperator(
                task_id=x[1],
                python_callable=load_data,
                op_kwargs={'query': x[0], 'file': x[1]},
                provide_context=True
            )

        load_variable_t = PythonOperator(
            task_id="variable",
            python_callable=importVariable
        )

        load_task_instance_t = PythonOperator(
            'last_played': retorna_localtime(hero['last_played']),
            'played': hero.get('games'),
            'won': hero.get('win'),
            'lost': hero.get('games') - hero.get('win'),
            'winrate': retorna_winrate_player_hero(hero.get('games'), hero.get('win'))
        }
    return player_heroes_data


def salva_mongo(player_id, ds, **kwargs):
    ti = kwargs['ti']
    dict_player_heroes_data = ti.xcom_pull(task_ids=f'Coleta_Dados_Player_{player_id}')
    db_col, db_client = collection_mongo_local('mongodb:27017', 'dota_col', 'winrate_player_heroes')
    db_col.insert_one(dict_player_heroes_data)
    db_client.close()


player_ids = [23724176, 79380838, 79409528, 95777879, 146329338]

for player_id in player_ids:
    run_coleta_winrate_player_heroes = PythonOperator(
        task_id=f'Coleta_Dados_Player_{player_id}',
        python_callable=coleta_winrate_player_heroes,
        op_kwargs={'player_id': player_id}
    )

    run_salva_mongo = PythonOperator(
        task_id=f'salva_mongo_{player_id}',
        python_callable=salva_mongo,
        op_kwargs={'player_id': player_id}
    )

    run_coleta_winrate_player_heroes >> run_salva_mongo
""" able to get context """ ctx = get_current_context() log.info("The knights of Ni say: %s (at %s)", value, ctx['ts']) with DAG( dag_id='example_xcom_args', default_args={'owner': 'airflow'}, start_date=days_ago(2), schedule_interval=None, tags=['example'], ) as dag: task1 = PythonOperator( task_id='generate_value', python_callable=generate_value, ) print_value(task1.output) with DAG( "example_xcom_args_with_operators", default_args={'owner': 'airflow'}, start_date=days_ago(2), schedule_interval=None, tags=['example'], ) as dag2: bash_op1 = BashOperator(task_id="c", bash_command="echo c") bash_op2 = BashOperator(task_id="d", bash_command="echo c") xcom_args_a = print_value("first!") xcom_args_b = print_value("second!")
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Using a DAG context manager, you don't have to specify the dag property of each task
with DAG('rivervalley_rock_people_dag',
         start_date=datetime(2021, 4, 29),
         max_active_runs=1,
         schedule_interval=timedelta(minutes=30),  # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
         default_args=default_args,
         # catchup=False  # enable if you don't want historical dag runs to run
         ) as dag:

    t0 = PythonOperator(
        task_id='fetch_and_save_campuses',
        python_callable=fetch_and_save_campuses,  # make sure you don't include the () of the function
        op_kwargs={'client': 'rivervalley'}
    )

    # generate tasks with a loop. task_id must be unique
    t1 = PythonOperator(
        task_id='fetch_and_save_people',
        python_callable=fetch_and_save_people,  # make sure you don't include the () of the function
        op_kwargs={'do_backfill': False, 'client': 'rivervalley'}
    )

    t0 >> t1
            {
                'topologyKey': 'kubernetes.io/hostname',
                'labelSelector': {
                    'matchExpressions': [{'key': 'app', 'operator': 'In', 'values': ['airflow']}]
                },
            }
        ]
    }
}

tolerations = [{'key': 'dedicated', 'operator': 'Equal', 'value': 'airflow'}]

# You don't have to specify any special KubernetesExecutor configuration if
# you don't want/need to
start_task = PythonOperator(
    task_id="start_task",
    python_callable=print_stuff
)

# Check available libraries in airflow/ci:latest image
one_task = PythonOperator(
    task_id="one_task",
    python_callable=check_installed_libraries,
    executor_config={"KubernetesExecutor": {"image": "apache/airflow:2.0.2-python3.8"}},
)

# List pods in current namespace
two_task = PythonOperator(
    task_id="two_task",
    python_callable=list_pods,
    executor_config={"KubernetesExecutor": {"image": "apache/airflow:2.0.2-python3.8"}},
)
pathlib.Path("/tmp/images").mkdir(parents=True, exist_ok=True) # Download all pictures in launches.json with open("/tmp/launches.json") as f: launches = json.load(f) image_urls = [launch["image"] for launch in launches["results"]] for image_url in image_urls: try: response = requests.get(image_url) image_filename = image_url.split('/')[-1] target_file = f"/tmp/images/{image_filename}" with open(target_file, "wb") as f: f.write(response.content) print(f"Downloaded {image_url} to {target_file}") except request_exceptions.MissingSchema: print(f"{image_url} appears to be an invalid URL.") except requests.exceptions.ConnectionError: print(f"Could not connect to {image_url}.") get_pictures = PythonOperator(task_id="get_pictures", python_callable=_get_pictures, dag=dag) notify = BashOperator( task_id="notify", bash_command='echo "there are now $(ls /tmp/images/ | wc -l) images."', dag=dag) download_launches >> get_pictures >> notify
        })
        thread.daemon = True
        thread.start()

        time.sleep(consumer.RECEIVE_DURATION)
        consumer_client.close()
        thread.join()
    except KeyboardInterrupt:
        print('Stop receiving.')

    print('Consumer2 has stopped receiving, end time is {}.'.format(time.time()))


t1 = PythonOperator(
    task_id='produce_raw_message',
    python_callable=produce_raw_message,
    dag=dag,
)

t2 = PythonOperator(
    task_id='preprocess_raw_message',
    python_callable=preprocess_raw_message,
    dag=dag,
)

t3 = PythonOperator(
    task_id='consume_and_offload_preprocessed_message',
    python_callable=consume_preprocessed_message,
    dag=dag,
)

t1 >> t2 >> t3