'start_date': datetime.now() - timedelta(minutes=1),
    'email': [],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('airflow_practical_exercise',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

MysqlToHive = BashOperator(
    task_id='MysqlToHive',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/MysqlToHive.sh """,
    dag=dag)

csvToHive = BashOperator(
    task_id='csvToHive',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/csvToHive.sh """,
    dag=dag)

ReportingTables1 = BashOperator(
    task_id='ReportingTables1',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/ReportingTables1.sh """,
    dag=dag)
Example #2
    'email_on_retry': True,
    'email': [u'*****@*****.**'],
    'email_on_failure': True,
    'retry_delay': timedelta(seconds=30),
    'owner': 'ct',
    'depends_on_past': True,
    'start_date': one_min_ago,
    'retries': 500
}


dag = DAG('vs', default_args=default_args, schedule_interval='@once')


chem1_pdb_prot1_pdb = BashOperator(
    task_id='chem1_pdb_prot1_pdb', 
    bash_command="(cd /working-directory; virtualScreening.py -l chem1.pdb -o result -p prot1.pdb) ", 
    dag=dag)

chem1_pdb_prot1_pdb_success_mail = EmailOperator(
    task_id="chem1_pdb_prot1_pdb_success_mail", 
    to=[u'*****@*****.**'],  
    subject="chem1_pdb_prot1_pdb success",  
    html_content="chem1_pdb_prot1_pdb success",  
    dag=dag)
                
chem1_pdb_prot1_pdb_success_mail.set_upstream(chem1_pdb_prot1_pdb)
#chem1_pdb_prot1_pdb.set_upstream( )


chem1_pdb_prot2_pdb = BashOperator(
    task_id='chem1_pdb_prot2_pdb', 
Example #3
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.combine(today, time(13, 00, 0)) - timedelta(days=1),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}
TR = TriggerRule
dag = DAG('rehost_ods',
          default_args=default_args,
          schedule_interval=timedelta(days=1))
script_folder = DAGS_FOLDER + '/../scripts/'
t0 = BashOperator(task_id='ods_load_batch_0',
                  bash_command=script_folder +
                  'rehost_ods/ods_load_batch_0.sh;',
                  dag=dag)
t1 = BashOperator(task_id='ods_load_batch_1',
                  bash_command=script_folder +
                  'rehost_ods/ods_load_batch_1.sh;',
                  dag=dag)
t2 = BashOperator(task_id='ods_load_batch_2',
                  bash_command=script_folder +
                  'rehost_ods/ods_load_batch_2.sh;',
                  dag=dag)
t3 = BashOperator(task_id='ods_load_batch_3',
                  bash_command=script_folder +
                  'rehost_ods/ods_load_batch_3.sh;',
                  dag=dag)
t4 = BashOperator(task_id='ods_load_batch_4',
                  bash_command=script_folder +
Example #4
def print_hello_world():
    print('this_should_print_hello_world from python')


# Following are defaults which can be overridden later on
default_args = {
    'owner': 'Jackie G',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 15),
    'email': ['jackies-email'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Helloworld', default_args=default_args)

t1 = BashOperator(
    task_id='hello_from_bash',
    bash_command='echo "Task 1 says hello"',
    dag=dag)

t2 = PythonOperator(
    task_id='hello_from_python',
    python_callable=print_hello_world,
    dag=dag)

t2.set_upstream(t1)
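
# Illustrative sketch (hypothetical task, not part of the original example):
# any key in default_args can be overridden on an individual task, e.g. giving
# one task more retries than the DAG-wide default.
t3 = BashOperator(
    task_id='hello_with_more_retries',
    bash_command='echo "Task 3 says hello"',
    retries=3,  # overrides default_args['retries'] = 1 for this task only
    dag=dag)
t3.set_upstream(t2)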
        create_disposition='CREATE_IF_NEEDED',
        skip_leading_rows=0,
        write_disposition='WRITE_TRUNCATE',  # If the table exists, overwrite it.
        max_bad_records=0)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        # Obviously needs to match the name of cluster created in the prior two Operators.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # This will tear down the cluster even if there are failures in upstream tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    # Delete  gcs files in the timestamped transformed folder.
    delete_transformed_files = BashOperator(
        task_id='delete_transformed_files',
        bash_command="gsutil -m rm -r gs://" + BUCKET +
        "/{{ dag_run.conf['transformed_path'] }}/")

    # If the spark job or BQ Load fails we rename the timestamped raw path to
    # a timestamped failed path.
    move_failed_files = BashOperator(task_id='move_failed_files',
                                     bash_command="gsutil mv gs://" + BUCKET +
                                     "/{{ dag_run.conf['raw_path'] }}/ " +
                                     "gs://" + BUCKET +
                                     "/{{ dag_run.conf['failed_path'] }}/",
                                     trigger_rule=TriggerRule.ONE_FAILED)
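    # Note: TriggerRule.ONE_FAILED fires as soon as any upstream task fails,
    # whereas TriggerRule.ALL_DONE (used for the cluster teardown above) waits
    # for every upstream task to finish regardless of success or failure.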
    # Set the dag property of the first Operators, this will be inherited by downstream Operators.

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)
Example #6
FIND_STATEMENT="find ${BASE_LOG_FOLDER}/*/* -type f -mtime +${MAX_LOG_AGE_IN_DAYS}"
echo "Executing Find Statement: ${FIND_STATEMENT}"
FILES_MARKED_FOR_DELETE=`eval ${FIND_STATEMENT}`
echo "Process will be Deleting the following directories:"
echo "${FILES_MARKED_FOR_DELETE}"
echo "Process will be Deleting `echo "${FILES_MARKED_FOR_DELETE}" | grep -v '^$' | wc -l ` file(s)"     # "grep -v '^$'" - removes empty lines. "wc -l" - Counts the number of lines
echo ""

if [ "${ENABLE_DELETE}" == "true" ];
then
    DELETE_STMT="${FIND_STATEMENT} -delete"
    echo "Executing Delete Statement: ${DELETE_STMT}"
    eval ${DELETE_STMT}
    DELETE_STMT_EXIT_CODE=$?
    if [ "${DELETE_STMT_EXIT_CODE}" != "0" ]; then
        echo "Delete process failed with exit code '${DELETE_STMT_EXIT_CODE}'"
        exit ${DELETE_STMT_EXIT_CODE}
    fi
else
    echo "WARN: You're opted to skip deleting the files!!!"
fi
echo "Finished Running Cleanup Process"
"""

for log_cleanup_id in range(1, NUMBER_OF_WORKERS + 1):

    # Keep the operator under a separate name so the log_cleanup command string
    # is not shadowed inside the loop.
    log_cleanup_op = BashOperator(
        task_id='log_cleanup_' + str(log_cleanup_id),
        bash_command=log_cleanup,
        provide_context=True,
        dag=dag)
          default_args=default_args,
          schedule_interval="0 * * * *",
          start_date=datetime.now() - timedelta(minutes=10))

c = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_parse_process.sh
"""

c1 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_ftp_process.sh
"""

c2 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_post_parse_process.sh
"""

pullFiles = BashOperator(task_id='zteumts--pullFiles',
                         bash_command=c1,
                         dag=dag)

parseFiles = BashOperator(task_id='zteumts--parseFiles',
                          bash_command=c,
                          dag=dag)

moveFilesToDatalake = BashOperator(task_id='zteumts--moveFilesToDataLake',
                                   bash_command=c2,
                                   dag=dag)

parseFiles.set_upstream(pullFiles)
moveFilesToDatalake.set_upstream(parseFiles)
Example #8
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # Retries are disabled here; raise 'retries' above 0 to have failed tasks
    # retried after the delay below.
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

from google.cloud import storage
client = storage.Client()
i = 0
with models.DAG('loop_over_gcs_bucket_files_example',
                schedule_interval=None,
                default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait', trigger_rule=TriggerRule.ONE_SUCCESS)
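    # Note: client.list_blobs() runs each time the scheduler parses this file,
    # so the loop below regenerates one BashOperator per object currently under
    # the bucket prefix.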
    for blob in client.list_blobs('myBucket', prefix='myFolder/mySubfolder'):
        # task_id may only contain alphanumeric characters, underscores, dots and dashes
        bash_cmd = "echo " + str(blob.name)
        i = i + 1
        bash_operator = BashOperator(task_id='bash_operator' + str(i),
                                     bash_command=bash_cmd)
        start.set_downstream(bash_operator)
        bash_operator.set_downstream(wait)

    end = DummyOperator(task_id='end')
    wait >> end
    'start_date': datetime.now() - timedelta(minutes=1),
    'email': [],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('initialization',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

create_db = BashOperator(
    task_id='create_db',
    bash_command=
    """ impala-shell -q "create database if not exists practical_exercise_1;" """,
    dag=dag)

meta_store = BashOperator(task_id='meta_store',
                          bash_command=""" nohup sqoop metastore & """,
                          dag=dag)

sqoop_job = BashOperator(
    task_id='sqoop_job',
    bash_command=
    """ sqoop job --meta-connect jdbc:hsqldb:hsql://localhost:16000/sqoop --create practical_exercise_1.activitylog -- import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/pwd.txt  --table activitylog -m 2 --hive-import --hive-database practical_exercise_1 --hive-table activitylog --incremental append --check-column id --last-value 0 """,
    dag=dag)

user_totaltable = BashOperator(
    task_id='user_totaltable',
Example #10
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime as dt
from datetime import timedelta

# Default DAG parameters
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt(2020, 3, 23),
    'retries': 0
}

dag = DAG('task_example',
          default_args=default_args,
          schedule_interval='30 07 * * *')

run_script = BashOperator(task_id='show',
                          bash_command='aws configure list',
                          dag=dag)
#!/usr/bin/env python
from airflow import DAG
from airflow.operators import BashOperator, PythonOperator
from datetime import datetime, timedelta
__author__ = "Jonathan Hilgart"


default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': datetime(2016, 1, 1),
        'email': ['*****@*****.**'],
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
      }

dag = DAG('weather-bart_data_s3', default_args=default_args, schedule_interval=timedelta(seconds=300))
# run every 5 mins
t1 = BashOperator(
    task_id='weather_current_to_s3',
    bash_command='python ~/./weather_data_current_to_s3.py',
    retries=3,
    dag=dag)
t2 = BashOperator(
    task_id='bart_data_to_s3',
    bash_command='python ~/./bart_to_s3.py',
    retries=3,
    dag=dag)
Example #12
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 9, 13),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('HelloWorld1', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators

t1 = BashOperator(task_id='task_1',
                  bash_command='echo "Hello World from Task 1"',
                  dag=dag)

t2 = BashOperator(task_id='task_2',
                  bash_command='echo "Hello World from Task 2"',
                  dag=dag)

t3 = BashOperator(task_id='task_3',
                  bash_command='echo "Hello World from Task 3"',
                  dag=dag)

t4 = BashOperator(task_id='task_4',
                  bash_command='echo "Hello World from Task 4"',
                  dag=dag)

t2.set_upstream(t1)
Example #13
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))
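
# Note: dagrun_timeout marks a scheduled run of this DAG as failed if it is
# still running after 60 minutes; it does not limit individual tasks.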

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
Example #14
    data_2012 = pd.read_csv('../data/data_2012.csv')
    year2012 = data_2012['id'].value_counts(
        sort=True, ascending=True).to_frame()  # 2012 arrests by id
    print("tasks 2013")
    #data_2013 = pd.read_csv(path + '/data/data_2012.csv')
    data_2013 = pd.read_csv('../data/data_2012.csv')
    year2013 = data_2013['id'].value_counts(
        sort=True, ascending=True).to_frame()  # 2013 arrests by id
    m = pd.concat([year2012, year2013], axis=1)
    m.columns = ['year2012', 'year2013']
    #m.to_csv(path + '/data/mdata.csv')
    m.to_csv('../data/mdata.csv')


# Run the task_* functions with PythonOperator (assumed to be imported above);
# BashOperator does not accept python_callable, and calling the functions
# directly at module level would execute them at DAG-parse time.
t1 = PythonOperator(task_id='read_json_2012',
                    python_callable=task_read12,
                    dag=dag)

t2 = PythonOperator(task_id='read_json_2013',
                    python_callable=task_read13,
                    dag=dag)

t3 = PythonOperator(task_id='merge',
                    python_callable=task_merge,
                    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)
Example #15
today = datetime.today()

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.combine(today, time(20, 00, 0)) - timedelta(days=1),
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}
TR = TriggerRule
dag = DAG('usl_ods_v1',
          default_args=default_args,
          schedule_interval=timedelta(days=1))
script_folder = DAGS_FOLDER + '/../scripts/'
t0 = BashOperator(task_id='usldb_ods_incremental1',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_incremental1.sh;',
                  dag=dag)
t1 = BashOperator(task_id='usldb_ods_incremental2',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_incremental2.sh;',
                  dag=dag)
t2 = BashOperator(task_id='usldb_ods_full_load_all',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_full_load_all.sh;',
                  dag=dag)
t3 = BashOperator(task_id='verify_load',
                  bash_command=script_folder +
                  'ods_archiving/checkDailyLoad.sh usldb_ods;',
                  dag=dag,
                  trigger_rule=TR.ALL_DONE)
t3.set_upstream(t0)
Example #16
#import the required libraries
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators import BashOperator

#defining the default arguments dictionary
args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 12, 2),  #you can change this start_date
    'retries': 1,
    "retry_delay": timedelta(seconds=10),
}

dag = DAG('Assignment_1', default_args=args)

#task1 is to create a directory 'test_dir' inside dags folder
task1 = BashOperator(task_id='create_directory',
                     bash_command='mkdir -p ~/outputs/test_dir',
                     dag=dag)

#task2 is to get the 'shasum' of 'test_dir' directory
task2 = BashOperator(task_id='get_shasum',
                     bash_command='shasum ~/outputs/test_dir',
                     dag=dag)

# Below we set up the operator relationship so that task1 runs before task2
task2.set_upstream(task1)
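
# Note: the bitshift syntax is equivalent to set_upstream/set_downstream; the
# line above could also be written as `task1 >> task2`.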
Example #17
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime

default_args = {
    'owner': 'root',
    'start_date': datetime.today(),
}

dag = DAG('zaim', default_args=default_args, schedule_interval='00 00 * * *')

task1 = BashOperator(
    task_id='retrieve_zaim_data',
    bash_command='python /app/zaim_downloader.py',
    dag=dag)

task2 = BashOperator(
    task_id='update_data',
    bash_command='/app/update_data.sh ',
    dag=dag)

task1.set_downstream(task2)
Example #18
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: ('source_count'
                             if datetime.now().day <= 7
                             and datetime.today().weekday() == 6
                             else 'ignore_not_sunday'),
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(task_id='email_success_' + dag.dag_id,
                      to=email_addr,
                      subject=dag.dag_id + ' [success] on ' +
                      datetime.now().strftime('%Y-%m-%d'),
                      html_content='Congratulation!',
                      trigger_rule='all_success',
                      dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='cd /disk1/source_data_count; ./daily_table_count.sh > out.log ',
    dag=dag)

source_count.set_upstream(branching)
esucc.set_upstream(source_count)

ignore_not_sunday = DummyOperator(task_id='ignore_not_sunday', dag=dag)
ignore_not_sunday.set_upstream(branching)

join = DummyOperator(task_id='join', trigger_rule='all_success', dag=dag)
join << ignore_not_sunday
join << esucc
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators import BashOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('download_hot_topics',
          description='Download hot topics',
          schedule_interval='0 0 * * *',
          start_date=datetime(2019, 9, 4),
          default_args=default_args)

with dag:
    download_topics = BashOperator(
        task_id='download_topics',
        bash_command=
        'python /scripts/download_hot_topics.py --directory /script_output')
Example #20
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.combine(
        today, time(hour, minute)) - timedelta(days=d_delta,
                                               hours=h_delta,
                                               minutes=m_delta),
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'queue': 'maas',
}

dag = DAG('cron_mml',
          default_args=default_args,
          schedule_interval=timedelta(days=d_delta,
                                      hours=h_delta,
                                      minutes=m_delta))


command = 'sudo env PATH=$PATH su -c "{acumen_cron_scripts_dir}/cron-mml.sh" {user}'.format(
    acumen_cron_scripts_dir=task_config['acumen_cron_scripts_dir'],
    user=task_config['user'])

t1 = BashOperator(task_id='cron_mml',
                  bash_command=command, dag=dag)


t2 = alert_task(log=LOG_PATH,
                prefix=task_config['LOG_PREFIX'],
                dag=dag)

t2.set_upstream(t1)
Example #21
    "igoruchoa",
    "POSTGRESQL_USERNAME":
    "******",
    "POSTGRESQL_PASSWORD":
    "******",
    "log4j_setting":
    "-Dlog4j.configuration=file://" + proj_home + "/log4j.properties"
}

######################################
#
## Operators definition
#
######################################
copy_log4j = BashOperator(task_id="copy_log4j",
                          bash_command="cp $PROJ_HOME/log4j.properties .",
                          dag=dag,
                          env={"PROJ_HOME": proj_home})

zip_dependencies = BashOperator(
    task_id="zip_dependencies",
    bash_command=
    "cd $PROJ_HOME; zip -r dependencies.zip etl/ utils/ streaming/",
    dag=dag,
    env={"PROJ_HOME": proj_home})

param = 'etl /BASEA/universities.json universities \'{"Id":"integer", "Name":"string"}\''
universities = BashOperator(task_id="universities.json",
                            bash_command=command_base + " " + param,
                            dag=dag,
                            env=env)
Example #22
    'depends_on_past': False,
    'start_date': datetime(2016, 6, 25),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('main_summary',
          default_args=default_args,
          schedule_interval='@daily',
          max_active_runs=10)
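
# Note: max_active_runs caps how many runs of this DAG may be in flight at
# once, which matters when backfilling or catching up on several days.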

# Make sure all the data for the given day has arrived before running.
t0 = BashOperator(task_id="delayed_start", bash_command="sleep 1800", dag=dag)

t1 = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=10),
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/main_summary_view.sh",
    dag=dag)

# Wait a little while after midnight to start for a given day.
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-southeast2",
        trigger_rule=TriggerRule.ALL_DONE
    )

    # Delete the transformed files written by the Spark job.
    delete_transformed_files = BashOperator(
        task_id="delete_transformed_files",
        bash_command="gsutil -m rm -r " + BUCKET + "/series_data_output/*"
    )

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream(bq_load_series_1)

    bq_load_series_1.set_downstream(bq_load_series_2)

    bq_load_series_2.set_downstream(delete_cluster)
    
    delete_cluster.set_downstream(delete_transformed_files)
#!/usr/bin/env python
from airflow import DAG
from airflow.operators import BashOperator, PythonOperator
from datetime import datetime, timedelta
__author__ = "Jonathan Hilgart"


default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': datetime(2016, 1, 1),
        'email': ['*****@*****.**'],
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
      }

dag = DAG('dag_to_mongo', default_args=default_args,
          schedule_interval='0 12 * * *')  # runs daily at 12:00
t1 = BashOperator(
    task_id='normalize_data',
    bash_command='python ~/./push_bart-weather_to_mongo.py',
    retries=3,
    dag=dag)
Example #25
 
from airflow import models
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator
from airflow.operators import BashOperator
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())
 
 
default_args = {
    'start_date':yesterday
}
 
with models.DAG(
    'dataflow_python_gcp_conn_id',
    schedule_interval=None,
    default_args=default_args) as dag:
    
    bash_nothing = BashOperator(task_id='nothing_2', bash_command='echo nothing')
    
    run_dataflow_python = DataFlowPythonOperator(
        task_id='df-conn-gcp-id-from-json',
        py_file='/home/airflow/gcs/data/wordcount.py',
        options={'runner': 'DataflowRunner',
                 'output': 'gs://staging-bucket-hijo-project/out',
                 'temp_location': 'gs://staging-bucket-hijo-project/teemp',
                 'staging_location': 'gs://staging-bucket-hijo-project/staging',
                 'project': 'hijo-project'},
        gcp_conn_id='cloud-dataflow-hijo-project-from-location')
    bash_nothing >> run_dataflow_python
# [START instantiate_dag]
dag = DAG(
    'tutorial',
    default_args=default_args,
    description='A simple tutorial DAG',
    schedule_interval=timedelta(days=1),
    tags=['example'],
)
# [END instantiate_dag]

# t1, t2 and t3 are examples of tasks created by instantiating operators
# [START basic_task]
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
    bash_command='sleep 5',
    retries=3,
    dag=dag,
)
# [END basic_task]

# [START documentation]
dag.doc_md = __doc__
Example #27
                default_args=default_args,
                schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(task_id='start',
                                         trigger_rule='all_success')

    end = dummy_operator.DummyOperator(task_id='end',
                                       trigger_rule='all_success')

    load_from_gdrive = PythonOperator(task_id='load_to_local_bucket',
                                      provide_context=True,
                                      python_callable=load_to_local_bucket,
                                      dag=dag)

    # copy_to_gcp = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
    # source_bucket='us-central1-microstore21871-37272379-bucket',
    # source_object='data/gcs/*',
    # destination_bucket='baketto1',
    # destination_object='data/',
    # move_object=True
    # )

    copy_to_gcp = BashOperator(task_id='copy_to_gcp',
                               bash_command='''
                gsutil -m cp \
                gs://us-central1-micro-store-218-f06e2a8d-bucket/data/* \
                gs://baketto1/data/
                ''')

    start >> load_from_gdrive >> copy_to_gcp >> end
    #start >> load_from_gdrive >> end
Example #28
    'depends_on_past': False,
    'start_date': datetime(2017, 3, 6),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('normalize_and_predict_bart_weather',
          default_args=default_args,
          schedule_interval='0 14 * * *')  # run at 6 am
t1 = BashOperator(
    task_id='normalize_data',
    bash_command='spark-submit ~/./normalization-bart-weather-data-spark.py',
    retries=3,
    dag=dag)

t2 = BashOperator(
    task_id='generate_predictions',
    bash_command='spark-submit ~/./predict_ridership_ml_spark.py',
    retries=3,
    dag=dag)

t3 = BashOperator(task_id='push_predictions_to_website',
                  bash_command='python ~/./push_predictions_to_website.py',
                  retries=3,
                  dag=dag)

t2.set_upstream(t1)
Example #29
            BIRD, SORTID, BLOCK)

        PROBE = "A1x32-Poly3-5mm-25s-177"
        RIG = "burung32-A32-HST32V"

        dag_id = USER + "_" + BLOCK
        dag = DAG(
            dag_id,
            default_args=default_args,
            schedule_interval='@once',
        )

        make_klusta_dir_task = BashOperator(
            task_id='make_klusta_dir',
            bash_command=make_klustadir_cmd,
            params={'klustadir': KLUSTA_DIR},
            on_success_callback=lambda c: set_perms(c['params']['klustadir'],
                                                    default_args['owner']),
            dag=dag)
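        # Note: on_success_callback is invoked with the task context dict, so
        # c['params']['klustadir'] above is the 'klustadir' value passed via
        # params on this operator.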

        make_kwd_task = BashOperator(
            task_id='make_kwd',
            # pool='RAM',
            bash_command=make_kwd_command,
            env={'PATH': ANACONDA_PATH},
            params={
                'klustadir': KLUSTA_DIR,
                'matfiledir': MATFILE_DIR,
                'probe': PROBE,
                'rig': RIG,
                'omit': OMIT
Example #30
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator', default_args=args)

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep ' + str(i),
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(task_id='also_run_this',
                    bash_command='echo "{{ macros.uuid.uuid1() }}"',
                    dag=dag)
task.set_downstream(run_this_last)