def test_return_value(self): bash_operator = BashOperator( bash_command='echo "stdout"', task_id='test_return_value', dag=None ) return_value = bash_operator.execute(context={}) self.assertEqual(return_value, u'stdout')
def test_echo_env_variables(self): """ Test that env variables are exported correctly to the task bash environment. """ now = datetime.utcnow() now = now.replace(tzinfo=timezone.utc) self.dag = DAG( dag_id='bash_op_test', default_args={ 'owner': 'airflow', 'retries': 100, 'start_date': DEFAULT_DATE }, schedule_interval='@daily', dagrun_timeout=timedelta(minutes=60)) self.dag.create_dagrun( run_id='manual__' + DEFAULT_DATE.isoformat(), execution_date=DEFAULT_DATE, start_date=now, state=State.RUNNING, external_trigger=False, ) import tempfile with tempfile.NamedTemporaryFile() as f: fname = f.name t = BashOperator( task_id='echo_env_vars', dag=self.dag, bash_command='echo $AIRFLOW_HOME>> {0};' 'echo $PYTHONPATH>> {0};' 'echo $AIRFLOW_CTX_DAG_ID >> {0};' 'echo $AIRFLOW_CTX_TASK_ID>> {0};' 'echo $AIRFLOW_CTX_EXECUTION_DATE>> {0};' 'echo $AIRFLOW_CTX_DAG_RUN_ID>> {0};'.format(fname) ) original_AIRFLOW_HOME = os.environ['AIRFLOW_HOME'] os.environ['AIRFLOW_HOME'] = 'MY_PATH_TO_AIRFLOW_HOME' t.run(DEFAULT_DATE, DEFAULT_DATE, ignore_first_depends_on_past=True, ignore_ti_state=True) with open(fname, 'r') as fr: output = ''.join(fr.readlines()) self.assertIn('MY_PATH_TO_AIRFLOW_HOME', output) # exported in run_unit_tests.sh as part of PYTHONPATH self.assertIn('tests/test_utils', output) self.assertIn('bash_op_test', output) self.assertIn('echo_env_vars', output) self.assertIn(DEFAULT_DATE.isoformat(), output) self.assertIn('manual__' + DEFAULT_DATE.isoformat(), output) os.environ['AIRFLOW_HOME'] = original_AIRFLOW_HOME
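# A minimal illustrative sketch (an assumption, not part of the test above) of a task
# whose bash command reads the AIRFLOW_CTX_* variables that the test verifies are
# exported into the task environment. The task id is a placeholder; self.dag refers to
# the DAG built inside the test.
echo_ctx = BashOperator(
    task_id='echo_ctx',
    bash_command='echo "dag=$AIRFLOW_CTX_DAG_ID task=$AIRFLOW_CTX_TASK_ID date=$AIRFLOW_CTX_EXECUTION_DATE"',
    dag=self.dag)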
def test_return_value_to_xcom(self): bash_operator = BashOperator( bash_command='echo "stdout"', xcom_push=True, task_id='test_return_value_to_xcom', dag=None ) xcom_return_value = bash_operator.execute(context={}) self.assertEqual(xcom_return_value, u'stdout')
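# A minimal sketch (hypothetical, not part of the test suite above) showing how a
# downstream task could consume the value that xcom_push=True publishes: the bash
# command's final stdout line is pulled back with xcom_pull in a templated command.
# The DAG id and task ids are placeholders.
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago

demo_dag = DAG(dag_id='xcom_push_demo', start_date=days_ago(1), schedule_interval=None)

producer = BashOperator(
    task_id='producer',
    bash_command='echo "stdout"',
    xcom_push=True,
    dag=demo_dag)

consumer = BashOperator(
    task_id='consumer',
    bash_command="echo pulled: {{ ti.xcom_pull(task_ids='producer') }}",
    dag=demo_dag)

producer >> consumer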
def create_sde_tasks(dag, folder, layer, datasd_name, md, path_to_file, sde_to_shp): """Dynamically create SDE Airflow tasks. dag: DAG defined in _dags file. folder: subfolder in the sde folder on S3. layer: layer name. datasd_name: layer name + _datasd. md: name of md file on Seaboard. path_to_file: poseidon path + datasd_name. sde_to_shp: _jobs specific sde_to_shp function """ #: Latest Only Operator for sde layer sde_latest_only = LatestOnlyOperator(task_id='{layer}_latest_only' .format(layer=layer), dag=dag) #: Convert sde table to shapefile format to_shp = PythonOperator( task_id='{layer}_to_shp'.format(layer=layer), python_callable=sde_to_shp, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert shapefile to GeoJSON format to_geojson = BashOperator( task_id='{layer}_to_geojson'.format(layer=layer), bash_command=shp_to_geojson(path_to_file), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert shapefile to TopoJSON format to_topojson = BashOperator( task_id='{layer}_to_topojson'.format(layer=layer), bash_command=shp_to_topojson(path_to_file), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Compress shapefile components to_zip = PythonOperator( task_id='{layer}_shp_to_zip'.format(layer=layer), python_callable=shp_to_zip, op_kwargs={'datasd_name': datasd_name}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Upload shapefile to S3 shp_to_S3 = S3FileTransferOperator( task_id='{layer}_shp_to_S3'.format(layer=layer), source_base_path=conf['prod_data_dir'], source_key='{datasd_name}.zip'.format(datasd_name=datasd_name), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='sde/{folder}/{datasd_name}.zip' .format(folder=folder, datasd_name=datasd_name), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Upload geojson to S3 geojson_to_S3 = S3FileTransferOperator( task_id='{layer}_geojson_to_S3'.format(layer=layer), source_base_path=conf['prod_data_dir'], source_key='{datasd_name}.geojson'.format(datasd_name=datasd_name), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='sde/{folder}/{datasd_name}.geojson' .format(folder=folder, datasd_name=datasd_name), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Upload topojson to S3 topojson_to_S3 = S3FileTransferOperator( task_id='{layer}_topojson_to_S3'.format(layer=layer), source_base_path=conf['prod_data_dir'], source_key='{datasd_name}.topojson'.format(datasd_name=datasd_name), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='sde/{folder}/{datasd_name}.topojson' .format(folder=folder, datasd_name=datasd_name), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Update portal modified date update_md = get_seaboard_update_dag('{md}.md'.format(md=md), dag) if layer not in no_pbf: #: Convert GeoJSON to Geobuf format to_geobuf = PythonOperator( task_id='{layer}_to_geobuf'.format(layer=layer), python_callable=geojson_to_geobuf, op_kwargs={'path_to_file': path_to_file}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert geobuf to gzipped geobuf to_gzip = PythonOperator( 
task_id='{layer}_geobuf_to_gzip'.format(layer=layer), python_callable=geobuf_to_gzip, op_kwargs={'datasd_name': datasd_name}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Upload geobuf to S3 geobuf_to_S3 = S3FileTransferOperator( task_id='{layer}_geobuf_to_S3'.format(layer=layer), source_base_path=conf['prod_data_dir'], source_key='{datasd_name}.pbf'.format(datasd_name=datasd_name), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='sde/{folder}/{datasd_name}.pbf' .format(folder=folder, datasd_name=datasd_name), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, use_gzip=True, dag=dag) #: Conversion to geobuf is triggered after conversion to geojson. to_geobuf.set_upstream(to_geojson) #: Compression to gzip is triggered after conversion to geobuf. to_gzip.set_upstream(to_geobuf) #: geobuf upload to S3 is triggered after compression to gzipped geobuf. geobuf_to_S3.set_upstream(to_gzip) #: Github update depends on shapefile S3 upload success. update_md.set_upstream(geobuf_to_S3) #: Execution rules: #: sde_latest_only must run before shp conversion. to_shp.set_upstream(sde_latest_only) #: Conversion to geojson is triggered after conversion to shp. to_geojson.set_upstream(to_shp) #: Conversion to topojson is triggered after conversion to shapefile. to_topojson.set_upstream(to_shp) #: Compression to zip is triggered after conversion to geojson and topojson. to_zip.set_upstream(to_geojson) to_zip.set_upstream(to_topojson) #: shapefile upload to S3 is triggered after conversion to zip. shp_to_S3.set_upstream(to_zip) #: geojson upload to S3 is triggered after conversion to geojson. geojson_to_S3.set_upstream(to_geojson) #: topojson upload to S3 is triggered after conversion to topojson. topojson_to_S3.set_upstream(to_topojson) #: Github update depends on shapefile S3 upload success. update_md.set_upstream(shp_to_S3) update_md.set_upstream(geojson_to_S3) update_md.set_upstream(topojson_to_S3)
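# Hypothetical invocation sketch for create_sde_tasks, following its docstring; the DAG
# object, layer/folder names, md file name, and the sde_to_shp callable below are
# placeholders rather than values from the original pipeline.
create_sde_tasks(
    dag=some_sde_dag,                                 # DAG defined in the _dags file
    folder='parks',                                   # subfolder in the sde folder on S3
    layer='parks',                                    # layer name
    datasd_name='parks_datasd',                       # layer name + _datasd
    md='parks',                                       # name of the md file on Seaboard
    path_to_file=conf['prod_data_dir'] + '/parks_datasd',
    sde_to_shp=sde_to_shp)                            # _jobs specific sde_to_shp function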
start_date = general.start_date['pd_cfs'] dag = DAG( dag_id='pd_cfs', default_args=args, start_date=start_date, schedule_interval=schedule['pd_cfs']) #: Latest Only Operator for pd_cfs pd_cfs_latest_only = LatestOnlyOperator( task_id='pd_cfs_latest_only', dag=dag) #: Get CFS data from FTP and save to temp folder get_cfs_data = BashOperator( task_id='get_cfs_data', bash_command=get_cfs_data(), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Process CFS data and save result to prod folder process_cfs_data = PythonOperator( task_id='process_cfs_data', python_callable=process_cfs_data, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Upload prod file to S3 cfs_to_S3 = S3FileTransferOperator(
def MakeCommonDag(name='istio_daily_flow_test', schedule_interval='15 9 * * *', monthly=False): """Creates the shared part of the daily/monthly dags.""" common_dag = DAG( name, catchup=False, default_args=default_args, schedule_interval=schedule_interval, ) def AirflowGetVariableOrBaseCase(var, base): try: return Variable.get(var) except KeyError: return base def GenerateTestArgs(**kwargs): """Loads the configuration that will be used for this Iteration.""" conf = kwargs['dag_run'].conf if conf is None: conf = dict() """ Airflow gives the execution date when the job is supposed to be run, however we dont backfill and only need to run one build therefore use the current date instead of the date that is passed in """ # date = kwargs['execution_date'] date = datetime.datetime.now() timestamp = time.mktime(date.timetuple()) # Monthly releases started in Nov 2017 with 0.3.0, so minor is # of months # from Aug 2017. minor_version = (date.year - 2017) * 12 + (date.month - 1) - 7 major_version = AirflowGetVariableOrBaseCase('major_version', 0) # This code gets information about the latest released version so we know # What version number to use for this round. r_minor = int(AirflowGetVariableOrBaseCase('released_version_minor', 0)) r_patch = int(AirflowGetVariableOrBaseCase('released_version_patch', 0)) # If we have already released a monthy for this mounth then bump # The patch number for the remander of the month. if r_minor == minor_version: patch = r_patch + 1 else: patch = 0 # If version is overriden then we should use it otherwise we use it's # default or monthly value. version = conf.get('VERSION') if monthly and not version: version = '{}.{}.{}'.format(major_version, minor_version, patch) default_conf = environment_config.get_airflow_config( version, timestamp, major=major_version, minor=minor_version, patch=patch, date=date.strftime('%Y%m%d'), rc=date.strftime('%H-%M')) config_settings = dict(VERSION=default_conf['VERSION']) config_settings_name = [ 'PROJECT_ID', 'MFEST_URL', 'MFEST_FILE', 'GCS_STAGING_BUCKET', 'SVC_ACCT', 'GITHUB_ORG', 'GITHUB_REPO', 'GCS_GITHUB_PATH', 'TOKEN_FILE', 'GCR_STAGING_DEST', 'GCR_RELEASE_DEST', 'GCS_MONTHLY_RELEASE_PATH', 'DOCKER_HUB', 'GCS_BUILD_BUCKET', 'RELEASE_PROJECT_ID', ] for name in config_settings_name: config_settings[name] = conf.get(name) or default_conf[name] if monthly: config_settings['MFEST_COMMIT'] = conf.get( 'MFEST_COMMIT') or Variable.get('latest_sha') gcs_path = conf.get('GCS_MONTHLY_STAGE_PATH') if not gcs_path: gcs_path = default_conf['GCS_MONTHLY_STAGE_PATH'] else: config_settings['MFEST_COMMIT'] = conf.get( 'MFEST_COMMIT') or default_conf['MFEST_COMMIT'] gcs_path = conf.get('GCS_DAILY_PATH') or default_conf['GCS_DAILY_PATH'] config_settings['GCS_STAGING_PATH'] = gcs_path config_settings['GCS_BUILD_PATH'] = '{}/{}'.format( config_settings['GCS_BUILD_BUCKET'], gcs_path) config_settings['GCS_RELEASE_TOOLS_PATH'] = '{}/release-tools/{}'.format( config_settings['GCS_BUILD_BUCKET'], gcs_path) config_settings['GCS_FULL_STAGING_PATH'] = '{}/{}'.format( config_settings['GCS_STAGING_BUCKET'], gcs_path) config_settings['ISTIO_REPO'] = 'https://github.com/{}/{}.git'.format( config_settings['GITHUB_ORG'], config_settings['GITHUB_REPO']) return config_settings generate_flow_args = PythonOperator( task_id='generate_workflow_args', python_callable=GenerateTestArgs, provide_context=True, dag=common_dag, ) get_git_commit_cmd = """ {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %} git config --global user.name 
"TestRunnerBot" git config --global user.email "*****@*****.**" git clone {{ settings.MFEST_URL }} green-builds || exit 2 pushd green-builds git checkout {{ settings.MFEST_COMMIT }} || exit 5 SHA=`grep {{ settings.GITHUB_ORG }}/{{ settings.GITHUB_REPO }} {{ settings.MFEST_FILE }} | cut -f 6 -d \\"` || exit 3 if [ -z ${SHA} ]; then echo "SHA not found" exit 6 fi popd git clone {{ settings.ISTIO_REPO }} istio-code pushd istio-code/release git checkout ${SHA} || exit 4 gsutil cp *.sh gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/ gsutil cp *.json gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/ popd pushd green-builds git rev-parse HEAD """ get_git_commit = BashOperator( task_id='get_git_commit', bash_command=get_git_commit_cmd, xcom_push=True, dag=common_dag) build_template = """ {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %} {% set m_commit = task_instance.xcom_pull(task_ids='get_git_commit') %} gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.json . gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.sh . chmod u+x * ./start_gcb_build.sh -w -p {{ settings.PROJECT_ID \ }} -r {{ settings.GCR_STAGING_DEST }} -s {{ settings.GCS_BUILD_PATH }} \ -v "{{ settings.VERSION }}" \ -u "{{ settings.MFEST_URL }}" \ -t "{{ m_commit }}" -m "{{ settings.MFEST_FILE }}" \ -a {{ settings.SVC_ACCT }} """ # NOTE: if you add commands to build_template after start_gcb_build.sh then take care to preserve its return value build = BashOperator( task_id='run_cloud_builder', bash_command=build_template, dag=common_dag) test_command = """ cp /home/airflow/gcs/data/githubctl ./githubctl chmod u+x ./githubctl {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %} git config --global user.name "TestRunnerBot" git config --global user.email "*****@*****.**" ls -l ./githubctl ./githubctl \ --token_file="{{ settings.TOKEN_FILE }}" \ --op=dailyRelQual \ --hub=gcr.io/{{ settings.GCR_STAGING_DEST }} \ --gcs_path="{{ settings.GCS_BUILD_PATH }}" \ --tag="{{ settings.VERSION }}" """ run_release_qualification_tests = BashOperator( task_id='run_release_qualification_tests', bash_command=test_command, retries=0, dag=common_dag) copy_files = GoogleCloudStorageCopyOperator( task_id='copy_files_for_release', source_bucket=GetSettingTemplate('GCS_BUILD_BUCKET'), source_object=GetSettingTemplate('GCS_STAGING_PATH'), destination_bucket=GetSettingTemplate('GCS_STAGING_BUCKET'), dag=common_dag, ) generate_flow_args >> get_git_commit >> build run_release_qualification_tests.set_upstream(build) run_release_qualification_tests >> copy_files return common_dag, copy_files
# Access to http://localhost:8080/admin/ ``` # Commands ``` $ airflow list_dags $ airflow list_tasks -t __job_name__ $ airflow run __job_name__ print_date_0 2015-08-01 ``` """ from airflow import DAG from airflow.operators.bash_operator import BashOperator from datetime import datetime, timedelta default_args = { 'owner': 'inazo', 'start_date': datetime(2016, 1, 1) } dag = DAG('__job_name__', default_args=default_args) # tasks t1 = BashOperator(task_id='print_date_0', bash_command='date', dag=dag) t2 = BashOperator(task_id='sleep', bash_command='sleep 5', dag=dag) t3 = BashOperator(task_id='print_date_1', bash_command='date', dag=dag) # schedule t2.set_upstream(t1) t3.set_upstream(t2)
dag = DAG( dag_id='parking_meters', default_args=args, start_date=start_date, schedule_interval=schedule) #: Latest Only Operator for parking meters parking_meters_latest_only = LatestOnlyOperator( task_id='parking_meters_latest_only', dag=dag) #: Downloads all parking files from FTP get_parking_files = BashOperator( task_id='get_parking_files', bash_command=ftp_download_wget(), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Joins downloaded files from ftp to production build_prod_file = PythonOperator( task_id='build_prod_file', python_callable=build_prod_file, provide_context=True, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag)
# ############################## # # ### Annotate image example ### # # ############################## # # [START howto_operator_vision_annotate_image] annotate_image = CloudVisionAnnotateImageOperator( request=annotate_image_request, retry=Retry(maximum=10.0), timeout=5, task_id='annotate_image') # [END howto_operator_vision_annotate_image] # [START howto_operator_vision_annotate_image_result] annotate_image_result = BashOperator( bash_command="echo {{ task_instance.xcom_pull('annotate_image')" "['logoAnnotations'][0]['description'] }}", task_id='annotate_image_result', ) # [END howto_operator_vision_annotate_image_result] # [START howto_operator_vision_detect_text] detect_text = CloudVisionDetectTextOperator( image=DETECT_IMAGE, retry=Retry(maximum=10.0), timeout=5, task_id="detect_text", language_hints="en", web_detection_params={'include_geo_results': True}, ) # [END howto_operator_vision_detect_text]
'owner': 'restic', 'depends_on_past': False, #'start_date': airflow.utils.dates.days_ago(2), 'start_date': datetime(2020, 2, 29), 'retries': 0, } dag = DAG( dag_id='restic_check', schedule_interval=None, dagrun_timeout=timedelta(minutes=60), catchup=False, default_args=args, ) run_this_last = DummyOperator( task_id='run_END', dag=dag, ) run_restic_check = BashOperator( task_id='run_restic_check', bash_command=msys2_bash_invocation("restic-do-check"), dag=dag, ) run_restic_check >> run_this_last if __name__ == "__main__": dag.cli()
"depends_on_past": False, "start_date": datetime.now() - timedelta(days=7), "email": ["*****@*****.**"], "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } with DAG("scheduled_bash_dag", default_args=default_args) as dag: # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) t2 = BashOperator(task_id="sleep", bash_command="sleep 1", retries=3, dag=dag) templated_command = """ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """ t3 = BashOperator( task_id="templated", bash_command=templated_command,
unload_subjects = PythonOperator( dag=dag, task_id="unload_subjects", python_callable=query_to_local, op_kwargs={ 'sqlFilePath': '/usr/local/airflow/dags/scripts/unload_subjects.sql', 'dest': '/tmp/subjects.csv' }) csv_to_s3_stage = PythonOperator(dag=dag, task_id='CSVs_to_S3_stage', python_callable=csv_to_s3_stage) clean_tmp_dir = BashOperator(dag=dag, task_id='clean_tmp_dir', bash_command='rm /tmp/*.csv') with open('/usr/local/airflow/dags/EMR/steps.json') as steps_file: emr_steps = json.load(steps_file) students_average = EmrAddStepsOperator( dag=dag, task_id='students_average', job_flow_id=Variable.get('emr_cluster_id'), aws_conn_id='aws_default', steps=emr_steps, params={'bucket_name': Variable.get('bucket_name')}) step_checker = EmrStepSensor( dag=dag,
"on_failure_callback": slack.task_fail_slack_alert, "retries": 0, } bucket_base_uri = f"gs://{bucket_name}/" bucket_image_storage_url = f"{GCP_STORAGE_BASE}{bucket_name}/images/" dag = DAG( "2-export_images_to_gcs_dataset", default_args=default_args, catchup=False, schedule_interval=None ) create_data_bucket_cmd = f"gsutil ls -b {bucket_base_uri} || gsutil mb {bucket_base_uri}" create_data_bucket = BashOperator( task_id="create_data_bucket", bash_command=create_data_bucket_cmd, provide_context=True, dag=dag ) set_data_bucket_acl_cmd = f"gsutil defacl ch -u AllUsers:R {bucket_base_uri}" set_data_bucket_acl = BashOperator( task_id="set_data_bucket_acl", bash_command=set_data_bucket_acl_cmd, provide_context=True, trigger_rule="all_success", dag=dag, ) export_images_to_gcs_dataset_cmd = f"gsutil -m cp -r {AIRFLOW_IMAGE_FOLDER} {bucket_base_uri}" export_images_to_gcs_dataset = BashOperator( task_id="export_images_to_gcs", bash_command=export_images_to_gcs_dataset_cmd,
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } with DAG('gcp_example', default_args=default_args) as dag: create_bq_dataset_if_not_exist = """ bq ls {0} if [ $? -ne 0 ]; then bq mk {0} fi """.format(BQ_DATASET_NAME) # Create destination dataset. t1 = BashOperator(task_id='create_destination_dataset', bash_command=create_bq_dataset_if_not_exist, dag=dag) # Create a bigquery table from a .csv file located in a GCS bucket # (gs://example-datasets/game_data_condensed.csv). # Store it in our dataset. t2 = GoogleCloudStorageToBigQueryOperator( task_id='gcs_to_bq', bucket='example-datasets', source_objects=['game_data_condensed.csv'], destination_project_dataset_table='{0}.gcp_example_table'.format( BQ_DATASET_NAME), schema_fields=[ { 'name': 'name', 'type': 'string',
schedule_interval='@once', default_args=default_args) t1 = BashOperator( task_id='print_date', bash_command='date', dag=dag) t2 = PythonOperator( task_id='run_job', python_callable=run_job, op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'), retries=1, dag=dag) templated_command = """ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """ t3 = BashOperator( task_id='templated', bash_command=templated_command, params={'my_param': 'Parameter I passed in'}, dag=dag) t2.set_upstream(t1) t3.set_upstream(t1)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG('design_stock', default_args=default_args, schedule_interval="0 0 1 1 *") a1 = "aa.sh " # t1, t2 and t3 are examples of tasks created by instantiating operators t_analysis = BashOperator(task_id='analysis', bash_command=a1, dag=dag) t_scrap_data = BashOperator(task_id='scrap_data', bash_command=a1, dag=dag) t_run_main_PROJECTNAME = BashOperator(task_id='run_main_PROJECTNAME', bash_command=a1, dag=dag) t_clean_data = BashOperator(task_id='clean_data', bash_command=a1, dag=dag) t_download_data = BashOperator(task_id='download_data', bash_command=a1, dag=dag) t_to_hive = BashOperator(task_id='to_hive', bash_command=a1, dag=dag) feature_analysis = SubDagOperator( task_id='feature_analysis', subdag=subdag(DAG_NAME, 'feature_analysis', default_args), dag=dag,
start_date = general.start_date['tsw_integration'] #: Dag spec dag = DAG(dag_id='tsw_integration', default_args=args, start_date=start_date, schedule_interval=schedule) violations_latest_only = LatestOnlyOperator(task_id='violations_latest_only', dag=dag) # VPM Extraction Support Tasks #: Download VPM dump from FTP get_vpm_violations = BashOperator( task_id='get_vpm_violations', bash_command=get_vpm_violations_wget(), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Download VPM dump from FTP #get_vpm_dump = BashOperator( # task_id='get_vpm_dump', # bash_command=ftp_download_wget(), # on_failure_callback=notify, # on_retry_callback=notify, # on_success_callback=notify, # dag=dag) # #
    dag_id,
    default_args=args,
    description="number of identified taxi licence plates per day",
) as dag:

    # 1. starting message on Slack
    slack_at_start = MessageOperator(
        task_id="slack_at_start",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
        username="******",
    )

    # 2. make temp dir
    mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {TMP_PATH}")

    # 3. download the data into temp directory
    download_data = HttpFetchOperator(
        task_id="download",
        endpoint=endpoint,
        http_conn_id=http_conn_id,
        tmp_file=f"{TMP_PATH}/taxi_passages.csv",
        output_type="text",
    )

    create_temp_table = PostgresOperator(
        task_id="create_temp_tables",
        sql=SQL_CREATE_TEMP_TABLE,
        params=dict(base_table=table_id),
    )
    dag=BAKE_OFF_PIPE,
    task_id="{region}_split".format(**locals())
)

freebayes_command = """freebayes -f {{ reference }} --vcf {{ outfile }} --targets {{ region }} {{ opts }} {{ in_bam }}"""

freebayes_operators = {}
for toople in chromosome_split_operators.items():
    region, operator = toople
    outfile = "{WORK_DIR}/{region}.vcf".format(**locals())
    freebayes_by_region = BashOperator(
        bash_command=freebayes_command,
        params={
            'reference': "/path/to/human.fasta",
            'outfile': outfile,
            'region': region,
            'opts': default_args['freebayes'],
            'in_bam': "{WORK_DIR}/{region}.bam".format(**locals())
        },
        dag=BAKE_OFF_PIPE,
        task_id="{region}_freebayes".format(**locals())
    )
    freebayes_operators[region] = freebayes_by_region
    freebayes_by_region.set_upstream(operator)

# now merge
vcf_concat_command = """vcf-concat-parts {{ in_files }} | vcf-sort > {{ outfile }}"""
infiles = []
for toople in freebayes_operators.items():
    region, operator = toople
    infiles.append("{WORK_DIR}/{region}.vcf".format(**locals()))
import json
import pathlib

import airflow
import requests
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

# Initialising the DAG object
dag = DAG(
    dag_id="download_rocket_launches",
    start_date=airflow.utils.dates.days_ago(14),
    schedule_interval=None,
)

download_launches = BashOperator(
    task_id="download_launches",
    bash_command="curl -o /tmp/launches.json 'https://launchlibrary.net/1.4/launch?next=5&mode=verbose'",
    dag=dag,
)


def _get_pictures():
    pathlib.Path("/tmp/images").mkdir(parents=True, exist_ok=True)
    with open("/tmp/launches.json") as f:
        launches = json.load(f)
    image_urls = [launch["rocket"]["imageURL"] for launch in launches["launches"]]
    for image_url in image_urls:
        response = requests.get(image_url)
        image_filename = image_url.split("/")[-1]
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 4, 24),
}

dag = DAG('update_sparksubmit_pyspark', default_args=default_args)  # , schedule_interval=timedelta(0))

pull_git = BashOperator(
    task_id='pull_git',
    bash_command='cd /root/pipeline && git pull',
    dag=dag)

# t1 is an example of a task created by instantiating operators
sparksubmit_pyspark = BashOperator(
    task_id='sparksubmit_pyspark',
    bash_command='spark-submit --master local[*] /root/pipeline/jupyterhub.ml/scripts/pi.py 10',
    dag=dag)

# Setup Airflow DAG
sparksubmit_pyspark.set_upstream(pull_git)
'depends_on_past': False, 'start_date': run_time, 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=30), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG('dadan_DFCF', default_args=default_args, schedule_interval="30 21 * * *") # t1, t2 and t3 are examples of tasks created by instantiating operators task1_command = "/home/davidyu/stock/scripts/davidyu_stock/scripts/analysis/dadan_DFCF/shell/run_dadan_dfcf_weekly_dadan_cnt.sh " task2_command = "/home/davidyu/stock/scripts/davidyu_stock/scripts/analysis/dadan_DFCF/shell/run_dadan_dfcf_today_dadan_weekly_dadan_cnt.sh " t1 = BashOperator(task_id='dadan_dfcf_weekly_dadan_cnt', bash_command=task1_command, dag=dag) t2 = BashOperator(task_id='dadan_dfcf_today_dadan_weekly_dadan_cnt', bash_command=task2_command, dag=dag) t2.set_upstream(t1)
# 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 4, 24), } dag = DAG('undeploy_prediction_pmml', default_args=default_args) # TODO: dockerFileTag and dockerFilePath should be passed in from webhook switch_to_aws = BashOperator( task_id='switch_to_aws', bash_command='sudo kubectl config use-context awsdemo', dag=dag) undeploy_container_aws = BashOperator( task_id='undeploy_container_to_aws', bash_command='sudo kubectl delete prediction-pmml', dag=dag) switch_to_gcp = BashOperator( task_id='switch_to_gcp', bash_command='sudo kubectl config use-context gcpdemo', dag=dag) undeploy_container_gcp = BashOperator( task_id='undeploy_container_gcp', bash_command='sudo kubectl delete prediction-pmml', dag=dag) # Setup Airflow DAG undeploy_container_aws.set_upstream(switch_to_aws) switch_to_gcp.set_upstream(undeploy_container_aws)
from airflow import DAG from airflow.operators.http_operator import SimpleHttpOperator from airflow.sensors.http_sensor import HttpSensor from airflow.operators.bash_operator import BashOperator from airflow.operators.email_operator import EmailOperator from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2020, 3, 15), 'end_date': None, 'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=5) } dag = DAG('TARGET_INVENTORY', default_args=default_args, schedule_interval='@daily') target_sh = BashOperator( task_id='SHR', bash_command= "cd /home/ec2-user/TARGET_2/target/target/spiders && python3 -m scrapy crawl target_data.py ", queue="pipeline9", dag=dag)
# 'on_success_callback': some_other_function, # 'on_retry_callback': another_function, # 'trigger_rule': 'all_success' } dag = DAG( 'tutorial', default_args=default_args, description='A simple tutorial DAG', schedule_interval=timedelta(days=1), ) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator( task_id='print_date', bash_command='date', dag=dag, ) t1.doc_md = """\ #### Task Documentation You can document your task using the attributes `doc_md` (markdown), `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets rendered in the UI's Task Instance Details page. ![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png) """ dag.doc_md = __doc__ t2 = BashOperator( task_id='sleep',
# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. """Used for unit tests""" import airflow from airflow.models import DAG from airflow.operators.bash_operator import BashOperator dag = DAG(dag_id='test_utils', schedule_interval=None) task = BashOperator( task_id='sleeps_forever', dag=dag, bash_command="sleep 10000000000", start_date=airflow.utils.dates.days_ago(2), owner='airflow', )
dag=dag) #: Update portal modified date update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag) #: Execution rules #: dsd_code_enf_latest_only must run before get_code_enf_files get_code_enf_files.set_upstream(dsd_ce_latest_only) for i in fname_list: #: Create fme shell command build_csv_task = BashOperator( task_id='get_' + i, bash_command=get_bash_command(i), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Set Task as Downstream for downloading files build_csv_task.set_upstream(get_code_enf_files) #: Create S3 Upload task s3_task = S3FileTransferOperator( task_id='upload_' + i, source_base_path=conf['prod_data_dir'], source_key=i + '_datasd.csv', dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_key='dsd/' + i + '_datasd.csv',
from airflow import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator from airflow.utils.dates import days_ago def split_execution_date(**kwargs): execution_date = kwargs['execution_date'] print(execution_date, type(execution_date)) args = { 'owner': 'Airflow', 'start_date': days_ago(2), 'provide_context': True, } with DAG('execution_date', schedule_interval='*/5 * * * *', default_args=args, catchup=False) as dag: t0 = BashOperator( task_id='print_execution_date', bash_command='echo {{ ds }} {{ execution_date }} {{ ts }}', ) t1 = PythonOperator( task_id='split_execution_date', python_callable=split_execution_date, op_kwargs={'execution_date': '{{ ds }}'}, ) t0 >> t1
# 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 4, 24), } dag = DAG('update_prediction_pmml', default_args=default_args) #, schedule_interval=timedelta(0)) pull_git = BashOperator( task_id='pull_git', bash_command='cd /root/pipeline && git pull', dag=dag) # TODO: dockerFileTag and dockerFilePath should be passed in from webhook build_image = BashOperator( task_id='build_docker_image', bash_command='sudo docker build -t fluxcapacitor/prediction-pmml /root/pipeline/prediction.ml/pmml/', dag=dag) push_image = BashOperator( task_id='push_docker_image', bash_command='sudo docker push fluxcapacitor/prediction-pmml', dag=dag) #switch_to_aws = BashOperator( # task_id='switch_to_aws', # bash_command='sudo kubectl config use-context awsdemo', # dag=dag) update_container_aws = BashOperator( task_id='update_container_aws', bash_command='kubectl rolling-update prediction-pmml --context=awsdemo --image-pull-policy=Always --image=fluxcapacitor/prediction-pmml',
default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': airflow.utils.dates.days_ago(2), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1), } dag = DAG('openstack_cli', default_args=default_args, schedule_interval=None) # print_date t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag) ## Note that the openrc.sh file needs to be placed on a volume that can be ## accessed by the containers # openstack endpoint list t2 = OpenStackOperator(task_id='endpoint_list_task', openrc_file='/usr/local/airflow/dags/openrc.sh', openstack_command=['openstack', 'endpoint', 'list'], dag=dag) # openstack service list t3 = OpenStackOperator(task_id='service_list_task', openrc_file='/usr/local/airflow/dags/openrc.sh', openstack_command=['openstack', 'service', 'list'], dag=dag)
'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2020, 12, 24), 'end_date': None, 'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=5) } dag = DAG('WHR', default_args=default_args, schedule_interval='1 0,12 * * *') HD_sh = BashOperator( task_id='HD', bash_command= "source /home/ec2-user/Scrapes/.venv/bin/activate && python /home/ec2-user/Scrapes/WHR/PYTHON/Homedepot.py ", queue="pipeline1", dag=dag) HD_LT_sh = BashOperator( task_id='HD_LT', bash_command= "source /home/ec2-user/Scrapes/.venv/bin/activate && python /home/ec2-user/Scrapes/WHR/PYTHON/Homedepot_LT.py ", queue="pipeline1", dag=dag) LOW_sh = BashOperator( task_id='LOW', bash_command= "source /home/ec2-user/Scrapes/.venv/bin/activate && python /home/ec2-user/Scrapes/WHR/PYTHON/Lowes_mb.py ", queue="pipeline1",
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from airflow import DAG from airflow.operators.bash_operator import BashOperator from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2016,10,5,19), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 4, 'retry_delay': timedelta(seconds=0), } dag = DAG('test_retry_handling_job', default_args=default_args, schedule_interval='@once') task1 = BashOperator( task_id='test_retry_handling_op', bash_command='exit 1', dag=dag)
## Subheader Here's a [url](www.airbnb.com) My numbered list: 1. one 1. two My bulleted list: - first - second """ the_task = BashOperator( task_id='the_task', bash_command='echo THE_TASK', dag=dag) the_task.doc_md = """\ # Title Here's a [url](www.airbnb.com) My list: 1. one 1. two My bulleted list: - first - second """
from airflow.operators.bash_operator import BashOperator airflow_home = os.environ.get('AIRFLOW_HOME') default_args = { 'owner': 'airflow', 'start_date': datetime(2018, 6, 16), 'depends_on_past': False, 'retries': 1, 'retry_delay': timedelta(minutes=5) } dag = DAG('bq_stores_sales_processing', schedule_interval='*/10 * * * *', catchup=False, default_args=default_args) bq_store_sales_task = BashOperator(task_id='bq_store_sales_bash', bash_command='bq query --use_legacy_sql=False < ' + airflow_home + '/scripts/store_sales.sql ', dag=dag) #bq_store_sales_task = BigQueryOperator( # task_id='bq_store_sales', # bql="INSERT `retail_demo_warehouse.store_sales` (store_id, store_name, lat_long, transaction_date, total_sales, updated_timestamp) SELECT store.store_id, store.store_name, CONCAT(CAST(store.latitude AS STRING), CONCAT(',', CAST(store.longitude AS STRING))) AS lat_long, sales.transaction_date, sales.store_sales AS total_sales, CURRENT_TIMESTAMP() as updated_timestamp FROM ( SELECT transaction_date, CAST(store_id AS INT64) AS store_id, ROUND(SUM((SELECT SUM(item_price_each * quantity) FROM UNNEST(lineitems))),2) AS store_sales FROM `retail_demo_warehouse.sales_events` GROUP BY transaction_date, store_id ) sales JOIN `retail_demo_warehouse.store` store ON store.store_id = sales.store_id", # sql=None, # destination_dataset_table=False, # bigquery_conn_id='google_cloud_default', # use_legacy_sql=False, # udf_config=False, # dag=dag)
schedule_interval=timedelta(days=1)) #------------------------------------------------------------------------------- # first operator date_operator = BashOperator( task_id='date_task', bash_command='date', dag=dag) #------------------------------------------------------------------------------- # second operator sleep_operator = BashOperator( task_id='sleep_task', depends_on_past=False, bash_command='sleep 5', dag=dag) #------------------------------------------------------------------------------- # third operator def print_hello(): return 'Hello world!' hello_operator = PythonOperator( task_id='hello_task', python_callable=print_hello, dag=dag) #-------------------------------------------------------------------------------
    ContactID , FirstName , MiddleName , LastName , EmailAddress , Phone
    from adventureworks.contact c)
"""

t2 = MySqlOperator(sql=qry_populate_staging,
                   mysql_conn_id='mysql_adventure',
                   task_id='populating_staging',
                   dag=dag)

bash_command = """
spark-submit $AIRFLOW_HOME/dags/spark_jobs/process_etl.py
"""

t3 = BashOperator(task_id='processing_data',
                  depends_on_past=False,
                  bash_command=bash_command,
                  dag=dag)

# t3 = SparkSubmitOperator(
#     task_id='test_spark',
#     application='$AIRFLOW_HOME/dags/spark_jobs/process_etl.py',
#     start_date=datetime(2020, 6, 4)
# )

t1 >> t2 >> t3
# Run a simple PySpark Script pyspark_local_task_one = BashOperator( task_id = "pyspark_local_task_one", bash_command = """spark-submit \ --master {{ params.master }} {{ params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", params = { "master": "local[8]", "filename": "ch02/pyspark_task_one.py", "base_path": "{}/".format(project_home) }, dag=dag ) # Run another simple PySpark Script that depends on the previous one pyspark_local_task_two = BashOperator( task_id = "pyspark_local_task_two", bash_command = """spark-submit \ --master {{ params.master }} {{ params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", params = { "master": "local[8]", "filename": "ch02/pyspark_task_two.py", "base_path": "{}/".format(project_home) }, dag=dag ) # Add the dependency from the second to the first task pyspark_local_task_two.set_upstream(pyspark_local_task_one)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=2) } batchdate = time.strftime("%Y%m%d") filename = "gs://dw-dev-insurance/ivans/current/IE_NCNU_" + batchdate + ".DAT" schedule_interval = "30 19 * * *" with DAG('DAG_GCP_IVANS_IE_LOAD', schedule_interval=schedule_interval, catchup=False, default_args=default_args) as dag: # t1 = BashOperator( # task_id='T1_COPY_TO_GCS', # bash_command='python /home/airflow/gcs/data/GCPDWH/util/transfer_mountpoint_to_gcs.py --config "config.properties" --productconfig "ivans.properties" --env "dev"' # ) t2 = BashOperator( task_id='T2_GCP_LOAD', bash_command= 'python /home/airflow/gcs/data/GCPDWH/ivans/load_ie_segments_to_bq_dataflow.py --config "config.properties" --productconfig "ivans.properties" --env "dev" --separator "|" --stripheader "0" --stripdelim "0" --addaudit "1" --writeDeposition "WRITE_APPEND" --system "IE" --input "gs://dw-dev-insurance/ivans/current/IE_NCNU_20180925.DAT"' ) # t3 = BashOperator( # task_id='T3_GCP_MOVE', # bash_command='gsutil mv gs://dw-dev-insurance/ivans/current/* gs://dw-dev-insurance/ivans/archive/' # ) # t1 >> t2 >> t3 t2
#COUNTRY='PL' dag = DAG('project-workflow',description='Project Workflow DAG', schedule_interval = '*/5 0 * * *', start_date=datetime(2017,7,1), catchup=False) xlsx_to_csv_task = BashOperator( task_id='xlsx_to_csv', bash_command='"$src"/test.sh "$country" 2nd_param_xlsx', env={'src': SRC, 'country': COUNTRY}, dag=dag) merge_command = SRC + '/test.sh ' + COUNTRY + ' 2nd_param_merge' merge_task = BashOperator( task_id='merge', bash_command=merge_command , dag=dag) my_templated_command = """ {{ params.src }}/test.sh {{ params.country}} 2nd_param_cleansing """ cleansing_task = BashOperator( task_id='cleansing', bash_command=my_templated_command, params={'src': SRC, 'country': COUNTRY}, dag=dag) x1_task = BashOperator( task_id='x1', bash_command='sleep 1 && echo [x1 start]', dag=dag)
import datetime

from dateutil.tz import *

from airflow.models.dag import DAG
from airflow.operators.bash_operator import BashOperator

valid_dag = DAG(
    dag_id="ValidDag",
    description="This is a valid test dag",
    start_date=datetime.datetime(2020, 5, 20, 0, 0),
)

task1 = BashOperator(bash_command="echo 1", task_id="Task1", dag=valid_dag)
task2 = BashOperator(bash_command='echo "2"', task_id="Task2", dag=valid_dag)

task1 >> task2
# 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 4, 24), } dag = DAG('deploy_prediction_codegen', default_args=default_args) # TODO: dockerFileTag and dockerFilePath should be passed in from webhook build_image = BashOperator( task_id='build_docker_image', bash_command='sudo docker build -t fluxcapacitor/prediction-codegen /root/pipeline/prediction.ml/codegen/', dag=dag) push_image = BashOperator( task_id='push_docker_image', bash_command='sudo docker push fluxcapacitor/prediction-codegen', dag=dag) switch_to_aws = BashOperator( task_id='switch_to_aws', bash_command='sudo kubectl config use-context awsdemo', dag=dag) deploy_container_aws = BashOperator( task_id='deploy_container_aws', bash_command='sudo kubectl create -f /root/pipeline/prediction.ml/codegen-rc.yaml', dag=dag) switch_to_gcp = BashOperator( task_id='switch_to_gcp', bash_command='sudo kubectl config use-context gcpdemo',
from ''' + modeled_dataset + '''.Patient_SQL_4 union all select PATIENT_ID, CASE_ID, SEX, AGE_YRS, AGE_GROUP, WEIGHT, WEIGHT_UNIT from ''' + modeled_dataset + '''.Patient_SQL_5 union all select PATIENT_ID, CASE_ID, SEX, AGE as AGE_YRS, AGE_GROUP, WEIGHT, WEIGHT_UNIT from ''' + modeled_dataset + '''.Patient where AGE is null or AGE_UNIT = "YR" )''' with models.DAG('faers_workflow', schedule_interval=None, default_args=default_dag_args) as dag: create_staging = BashOperator( task_id='create_staging_dataset', bash_command='bq --location=US mk --dataset ' + staging_dataset) create_modeled = BashOperator( task_id='create_modeled_dataset', bash_command='bq --location=US mk --dataset ' + modeled_dataset) load_demo = BashOperator( task_id='load_demo', bash_command='bq --location=US load --autodetect --skip_leading_rows=1 \ --source_format=CSV ' + staging_dataset + '.Demographic \ "gs://cs327e_project_data/drug_data/demo2018q4.csv"\ primaryid:INT64,caseid:INT64,caseversion:INT64,i_f_code:STRING,i_f_code_num:INT64,event_dt:INT64,\ event_dt_num:DATE,mfr_dt:INT64,mfr_dt_num:DATE,init_fda_dt:INT64,init_fda_dt_num:DATE,fda_dt:INT64,fda_dt_num:DATE,\ rept_cod:STRING,rept_cod_num:INT64,auth_num:STRING,mfr_num:STRING,mfr_sndr:STRING,lit_ref:STRING,age:INT64,\
from airflow.models import DAG from datetime import datetime, timedelta five_days_ago = datetime.combine(datetime.today() - timedelta(5), datetime.min.time()) args = { 'owner': 'airflow', 'start_date': five_days_ago, } dag = DAG(dag_id='perf_dag_2', default_args=args, schedule_interval='@daily', dagrun_timeout=timedelta(minutes=60)) task_1 = BashOperator( task_id='perf_task_1', bash_command='sleep 5; echo "run_id={{ run_id }} | dag_run={{ dag_run }}"', dag=dag) for i in range(2, 5): task = BashOperator(task_id='perf_task_{}'.format(i), bash_command=''' sleep 5; echo "run_id={{ run_id }} | dag_run={{ dag_run }}" ''', dag=dag) task.set_upstream(task_1) if __name__ == "__main__": dag.cli()
] to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D'] yesterday = date.today() - timedelta(days=1) dt = yesterday.strftime("%Y-%m-%d") # define where you want to store the tweets csv file in your local directory local_dir = "/tmp/" # define the location where you want to store in HDFS hdfs_dir = " /tmp/" for channel in to_channels: file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels:
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

THE_HUMAN_GENOME = "/Users/mlyons/genomics/reference/human_g1k_v37.fasta"
BAM_DIR = "/Users/mlyons/genomics/1kg/bam"
BIN_DIR = "/Users/mlyons/genomics/bin"

simple_mapping_pipeline = DAG(dag_id="simple_mapping_pipeline",
                              default_args=default_args,
                              schedule_interval=timedelta(minutes=2))

# figure out some sensor to look for a fastq file to map
fastq_sensor = FastqSensor(directory="/Users/mlyons/genomics/1kg/unprocessed_fastq",
                           dag=simple_mapping_pipeline,
                           task_id='fastq_sensor',
                           poke_interval=60)

# Assign the templated command to the name the operator below expects.
BWA_MEM_COMMAND = """bwa mem {{ path_to_reference_file }} {{ ti.xcom_pull('unmapped_fastq') }} > {{ path_to_output }}/{{ task_instance_key_str }}.sam"""

bwa_mem = BashOperator(bash_command=BWA_MEM_COMMAND,
                       params={'path_to_reference_file': THE_HUMAN_GENOME,
                               'path_to_output': BAM_DIR,
                               'bin': BIN_DIR},
                       dag=simple_mapping_pipeline,
                       task_id='bwa_mem',
                       wait_for_downstream=False)

bwa_mem.set_upstream(fastq_sensor)
'sql': pg_movies_dirs, 'filename': pg_csv_filename }, ) upload_pg_file = LocalFilesystemToGCSOperator( task_id="PG_UPLOAD_FILE", src=pg_csv_filename, dst=GCS_FILENAME.format('movies_directors', pg_base_filename), bucket=BUCKET, ) upload_mysql_file = LocalFilesystemToGCSOperator( task_id="MYSQL_UPLOAD_FILE", src=mysql_csv_filename, dst=GCS_FILENAME.format('movies_directors', mysql_base_filename), bucket=BUCKET, ) # t1, t2 and t3 are examples of tasks created by instantiating operators print_date = BashOperator( task_id='print_date', bash_command='date', ) mysql_poc_pull.set_upstream(print_date) pg_poc_pull.set_upstream(print_date) upload_mysql_file.set_upstream(mysql_poc_pull) upload_pg_file.set_upstream(pg_poc_pull)
def my_py_command(ds, **kwargs): # Print out the "foo" param passed in via # `airflow test example_passing_params_via_test_command run_this <date> # -tp '{"foo":"bar"}'` if kwargs["test_mode"]: print(" 'foo' was passed in via test={} command : kwargs[params][foo] \ = {}".format(kwargs["test_mode"], kwargs["params"]["foo"])) # Print out the value of "miff", passed in below via the Python Operator print(" 'miff' was passed in via task params = {}".format(kwargs["params"]["miff"])) return 1 my_templated_command = """ echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} " echo " 'miff was passed in via BashOperator with value {{ params.miff }} " """ run_this = PythonOperator( task_id='run_this', provide_context=True, python_callable=my_py_command, params={"miff":"agg"}, dag=dag) also_run_this = BashOperator( task_id='also_run_this', bash_command=my_templated_command, params={"miff":"agg"}, dag=dag) also_run_this.set_upstream(run_this)
pod_runtime_info_envs = [ PodRuntimeInfoEnv('MY_POD_NAMESPACE', 'metadata.namespace'), PodRuntimeInfoEnv('MY_POD_NAME', 'metadata.name'), PodRuntimeInfoEnv('MY_POD_IP', 'status.podIP') ] args = {'owner': 'Airflow', 'start_date': airflow.utils.dates.days_ago(2)} # base path returned zip dag path base_path = os.path.split(__file__)[0] plain_txt = read_packaged_file(f"{base_path}/plain_files/plain.txt") with DAG(dag_id=DAG_NAME, default_args=args, schedule_interval='30 0 * * *') as dag: # Use the zip binary, which is only found in this special docker image read_local_file = BashOperator(task_id='read_local_file', bash_command=f"echo {plain_txt}") # Limit resources on this operator/task with node affinity & tolerations spark_batch_job_distributed_mode = KubernetesPodOperator( namespace=os.environ['AIRFLOW__KUBERNETES__NAMESPACE'], name="spark_batch_job_distributed_mode", image=docker_image, image_pull_policy="IfNotPresent", cmds=["/bin/sh", "-c"], arguments=[spark_submit_sh], env_vars=envs, service_account_name="airflow", resources={ 'request_memory': "1024Mi", 'request_cpu': "100m" }, task_id="spark_batch_job_distributed_mode",
from airflow.operators.bash_operator import BashOperator from airflow.models import DAG from datetime import timedelta args = { 'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(3), } dag = DAG( dag_id='perf_dag_1', default_args=args, schedule_interval='@daily', dagrun_timeout=timedelta(minutes=60)) task_1 = BashOperator( task_id='perf_task_1', bash_command='sleep 5; echo "run_id={{ run_id }} | dag_run={{ dag_run }}"', dag=dag) for i in range(2, 5): task = BashOperator( task_id='perf_task_{}'.format(i), bash_command=''' sleep 5; echo "run_id={{ run_id }} | dag_run={{ dag_run }}" ''', dag=dag) task.set_upstream(task_1) if __name__ == "__main__": dag.cli()
dag = DAG(dag_id='impersonation_subdag', default_args=default_args) def print_today(): print('Today is {}'.format(datetime.utcnow())) subdag = DAG('impersonation_subdag.test_subdag_operation', default_args=default_args) PythonOperator( python_callable=print_today, task_id='exec_python_fn', dag=subdag) BashOperator( task_id='exec_bash_operator', bash_command='echo "Running within SubDag"', dag=subdag ) subdag_operator = SubDagOperator(task_id='test_subdag_operation', subdag=subdag, mode='reschedule', poke_interval=1, dag=dag)
from_channels = ['fromTwitter_A', 'fromTwitter_B', 'fromTwitter_C', 'fromTwitter_D'] to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D'] yesterday = date.today() - timedelta(days=1) dt = yesterday.strftime("%Y-%m-%d") # define where you want to store the tweets csv file in your local directory local_dir = "/tmp/" # define the location where you want to store in HDFS hdfs_dir = " /tmp/" for channel in to_channels: file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql)
def get_bash_op(pkg_name, dag, configpath='/home/maloss/config/astgen_javascript_smt.config', cache_dir='/home/maloss/metadata', outdir='/home/maloss/result'): return BashOperator( task_id=get_sanitized_pkgname(pkg_name=pkg_name), execution_timeout=timedelta(hours=2), bash_command='cd /home/maloss/src/ && python main.py astfilter --ignore_dep_version -n %s -c %s -d %s -o %s -l javascript' % (pkg_name, configpath, cache_dir, outdir), dag=dag)
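# Hypothetical usage sketch for the get_bash_op factory above; the package name and the
# DAG object are placeholders for whatever the surrounding loop supplies.
astgen_javascript_task = get_bash_op(pkg_name='lodash', dag=dag)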
# 'execution_timeout': timedelta(seconds=300), # 'on_failure_callback': some_function, # 'on_success_callback': some_other_function, # 'on_retry_callback': another_function, # 'trigger_rule': u'all_success' } dag = DAG( 'tutorial', default_args=default_args, description='A simple tutorial DAG', schedule_interval=timedelta(days=1)) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator( task_id='print_date', bash_command='date', dag=dag) t1.doc_md = """\ #### Task Documentation You can document your task using the attributes `doc_md` (markdown), `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets rendered in the UI's Task Instance Details page. ![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png) """ dag.doc_md = __doc__ t2 = BashOperator( task_id='sleep', depends_on_past=False,
'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG( 'sample', default_args=default_args, schedule_interval=timedelta(1)) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator( task_id='print_date', bash_command='date', dag=dag) t2 = BashOperator( task_id='sleep', bash_command='sleep 5', retries=3, dag=dag) templated_command = """ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """
args = { 'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2) } dag = DAG( dag_id='test_example_bash_operator', default_args=args, schedule_interval='0 0 * * *', dagrun_timeout=timedelta(minutes=60)) cmd = 'ls -l' run_this_last = DummyOperator(task_id='run_this_last', dag=dag) run_this = BashOperator( task_id='run_after_loop', bash_command='echo 1', dag=dag) run_this.set_downstream(run_this_last) for i in range(3): i = str(i) task = BashOperator( task_id='runme_'+i, bash_command='echo "{{ task_instance_key_str }}" && sleep 1', dag=dag) task.set_downstream(run_this) task = BashOperator( task_id='also_run_this', bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"', dag=dag) task.set_downstream(run_this_last)
"""Demo DAG showing a Hello World example.""" import airflow from airflow.models import DAG from airflow.operators.bash_operator import BashOperator args = { "owner": "godatadriven", "start_date": airflow.utils.dates.days_ago(14) } dag = DAG( dag_id="1_hello_dag", default_args=args, schedule_interval="0 0 * * *", description="Demo DAG showing a hello world example.", ) t1 = BashOperator(task_id="sleep_a_bit", bash_command="sleep 5", dag=dag) t2 = BashOperator(task_id="print_date", bash_command="date", dag=dag) t1 >> t2
'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG('test_1', default_args=default_args) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator( task_id='step1', bash_command='echo start', dag=dag) template_command=''' sh step2.sh ''' t2 = BashOperator( task_id='step2', bash_command=template_command, retries=3, dag=dag) t2.set_upstream(t1)
from airflow.operators.python_operator import PythonOperator from airflow.operators.bash_operator import BashOperator def print_hello(): return 'Hello world!' default_args = { 'owner': 'Oliver', 'depends_on_past': False, 'start_date': datetime(2019, 2, 1), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1) } dag = DAG('helloworld', description='hello world example', default_args=default_args, schedule_interval=timedelta(days=1), catchup=False) dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag) python_operator = PythonOperator(task_id='python_task', python_callable=print_hello, dag=dag) bash_script = '/usr/local/airflow/scripts/hello_bash.sh' if path.exists(bash_script): bash_operator = BashOperator(task_id='bash_task', bash_command=f"{bash_script} ", dag=dag) bash_operator.set_upstream(python_operator) dummy_operator >> python_operator
def test_external_task_sensor_fn_multiple_execution_dates(self): bash_command_code = """ {% set s=execution_date.time().second %} echo "second is {{ s }}" if [[ $(( {{ s }} % 60 )) == 1 ]] then exit 1 fi exit 0 """ dag_external_id = TEST_DAG_ID + '_external' dag_external = DAG( dag_external_id, default_args=self.args, schedule_interval=timedelta(seconds=1)) task_external_with_failure = BashOperator( task_id="task_external_with_failure", bash_command=bash_command_code, retries=0, dag=dag_external) task_external_without_failure = DummyOperator( task_id="task_external_without_failure", retries=0, dag=dag_external) task_external_without_failure.run( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + timedelta(seconds=1), ignore_ti_state=True) session = settings.Session() TI = TaskInstance try: task_external_with_failure.run( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + timedelta(seconds=1), ignore_ti_state=True) # The test_with_failure task is excepted to fail # once per minute (the run on the first second of # each minute). except Exception as e: failed_tis = session.query(TI).filter( TI.dag_id == dag_external_id, TI.state == State.FAILED, TI.execution_date == DEFAULT_DATE + timedelta(seconds=1)).all() if len(failed_tis) == 1 and \ failed_tis[0].task_id == 'task_external_with_failure': pass else: raise e dag_id = TEST_DAG_ID dag = DAG( dag_id, default_args=self.args, schedule_interval=timedelta(minutes=1)) task_without_failure = ExternalTaskSensor( task_id='task_without_failure', external_dag_id=dag_external_id, external_task_id='task_external_without_failure', execution_date_fn=lambda dt: [dt + timedelta(seconds=i) for i in range(2)], allowed_states=['success'], retries=0, timeout=1, poke_interval=1, dag=dag) task_with_failure = ExternalTaskSensor( task_id='task_with_failure', external_dag_id=dag_external_id, external_task_id='task_external_with_failure', execution_date_fn=lambda dt: [dt + timedelta(seconds=i) for i in range(2)], allowed_states=['success'], retries=0, timeout=1, poke_interval=1, dag=dag) task_without_failure.run( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with self.assertRaises(AirflowSensorTimeout): task_with_failure.run( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
dag=dag) #: Get sidewalks shapefile from Atlas get_sw_shapefiles = PythonOperator( task_id='get_sidewalk_gis', python_callable=get_sidewalk_gis, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert shp to geojson sidewalks_to_geojson = BashOperator( task_id='sidewalks_to_geojson', bash_command=shp_to_geojson(), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert shp to topojson sidewalks_to_topojson = BashOperator( task_id='sidewalks_to_topojson', bash_command=shp_to_topojson(), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Convert geojson to geobuf sidewalks_to_geobuf = PythonOperator(