Example #1
    # List the contents of the file_path_sp directory
    file_names_sp = os.listdir(file_path_sp)

    # Build the UL file path
    file_path_ul = task_name_ul + '/' + option + '/bin/'
    # List the contents of the file_path_ul directory
    file_names_ul = os.listdir(file_path_ul)
    # Loop over each SP file name
    for file_name_sp in file_names_sp:
        if os.path.isfile(file_path_sp + file_name_sp):
            task_option_path = '/usr/bin/perl ' + task_name_sp + '/' + option + '/bin/' + file_name_sp + ' '
            task_option_path_ld = '/usr/bin/perl ' + task_name_ld + '/' + option + '/bin/StructuralLoad.pl '
            t = BashOperator(task_id='LD_' + option,
                             bash_command=task_option_path_ld,
                             dag=dag)
            t.set_upstream(branching)
            dummy_follow = BashOperator(task_id='SP_' + option,
                                        bash_command=task_option_path,
                                        dag=dag)
            t.set_downstream(dummy_follow)
            dummy_follow.set_downstream(join)
    # Loop over each UL file name
    for file_name_ul in file_names_ul:
        if os.path.isfile(file_path_ul + file_name_ul):
            task_option_path = '/usr/bin/perl ' + task_name_ul + '/' + option + '/bin/' + file_name_ul + ' '
            dummy_follow = BashOperator(task_id='UL_' + option,
                                        bash_command=task_option_path,
                                        dag=dag)
            dummy_follow.set_upstream(branching)
            dummy_follow.set_downstream(join)
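
A caveat worth noting (my observation, not part of the original snippet): each pass through these loops reuses the same task_ids ('LD_' + option, 'SP_' + option, 'UL_' + option), so if a bin directory contains more than one file the duplicates collide; recent Airflow releases raise an error for a repeated task_id and older ones keep only the first task. A minimal sketch of one way to keep the ids unique, reusing the names from the example above plus a hypothetical file_stem variable:

    # Sketch: fold the file name into the task_id so every file gets its own task;
    # wire the LD -> SP dependencies exactly as in the original loop.
    for file_name_sp in file_names_sp:
        if os.path.isfile(file_path_sp + file_name_sp):
            file_stem = os.path.splitext(file_name_sp)[0]
            sp_task = BashOperator(
                task_id='SP_' + option + '_' + file_stem,
                bash_command='/usr/bin/perl ' + file_path_sp + file_name_sp + ' ',
                dag=dag)
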
Example #2

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': min_10,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


dag = DAG('scrape_cdc', default_args=default_args)

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

t1 = BashOperator(
    task_id='testairflow',
    bash_command=f'python {file_path}',
    dag=dag)

t1.set_downstream(run_this)

if __name__ == "__main__":
    dag.cli()
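
Example #2 calls a print_context function defined above the excerpt. A minimal sketch of what such a callable usually looks like with provide_context=True (an assumption modeled on the stock Airflow tutorial, not the original code):

    from pprint import pprint

    def print_context(ds, **kwargs):
        # provide_context=True passes the task context in as keyword
        # arguments; dump it and return a string for the task log.
        pprint(kwargs)
        print(ds)
        return 'print_context finished for ' + ds
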
Example #3
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='example_bash_operator', default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop', bash_command='echo 1', dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_'+i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
task.set_downstream(run_this_last)
def get_alphanumeric_task_id(a_string):
    # Strip a blob/file name down to characters that are safe in a task_id.
    isalnum = a_string.isalnum()
    #print('Is String Alphanumeric :', isalnum)
    alphanumeric_filter = filter(str.isalnum, a_string)
    alphanumeric_string = "".join(alphanumeric_filter)
    # remove / from the file path
    return alphanumeric_string.replace("/", "__")


with models.DAG(
        'import_ingestion',
        # Run this DAG a single time (@once)
        schedule_interval='@once',
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')

    wait = DummyOperator(task_id='wait', trigger_rule="all_done")

    end = DummyOperator(task_id='end', trigger_rule="all_done")

    for blob in blobs:
        #print(blob.name)
        print_file = BashOperator(task_id='print_file_' +
                                  get_alphanumeric_task_id(blob.name),
                                  bash_command='echo "hello ' + blob.name + '"',
                                  dag=dag)
        start.set_downstream(print_file)
        print_file.set_downstream(wait)

wait >> end
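
The loop above iterates over a blobs collection that is defined above the excerpt. A plausible definition, sketched by mirroring Example #5 below (the bucket and prefix names here are placeholders, not from the original):

    from google.cloud import storage

    client = storage.Client()
    # Placeholder bucket/prefix; the real values sit above the excerpt.
    blobs = client.list_blobs('my-bucket', prefix='my-prefix/')
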
Example #5
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # Retries are disabled for this example; retry_delay is kept for reference
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

from google.cloud import storage
client = storage.Client()
i = 0
with models.DAG('loop_over_gcs_bucket_files_example',
                schedule_interval=None,
                default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait', trigger_rule=TriggerRule.ONE_SUCCESS)
    for blob in client.list_blobs('myBucket', prefix='myFolder/mySubfolder'):
        #task id must only contain alphanumeric chars
        bash_cmd = "echo " + str(blob.name)
        i = i + 1
        bash_operator = BashOperator(task_id='bash_operator' + str(i),
                                     bash_command=bash_cmd)
        start.set_downstream(bash_operator)
        bash_operator.set_downstream(wait)

    end = DummyOperator(task_id='end')
wait >> end
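
Example #5 numbers its tasks with a counter because, as the comment notes, a task_id may only contain a limited character set. An alternative sketch (not in the original) that derives a readable task_id from the blob name, using the same character whitelist as the sanitize helper in Example #16:

    import re

    def blob_to_task_id(blob_name):
        # Replace anything outside Airflow's allowed task_id characters
        # (alphanumerics, dash, dot, underscore) with an underscore.
        return re.sub(r'[^0-9a-zA-Z._-]', '_', blob_name)

    # e.g. task_id='bash_operator_' + blob_to_task_id(blob.name)
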
Example #6
dag = DAG('docker_sample',
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

t3 = DockerOperator(
    api_version='1.19',
    docker_url='tcp://localhost:2375',  # Set your docker URL
    command='/bin/sleep 30',
    image='centos:latest',
    network_mode='bridge',
    task_id='docker_op_tester',
    dag=dag)


t4 = BashOperator(
    task_id='print_hello',
    bash_command='echo "hello world!!!"',
    dag=dag)


t1.set_downstream(t2)
t1.set_downstream(t3)
t3.set_downstream(t4)
with models.DAG(
        'search_console_with_quata',
        # No schedule; trigger this DAG manually
        schedule_interval=None,
        default_args=default_dag_args) as dag:

    # Dummy tasks - proceed only on success
    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait')

    end = DummyOperator(task_id='end')

    for single_date in daterange(start_date, end_date):
        temp_date = single_date.strftime("%Y-%m-%d")
        day_after_single_date = single_date + datetime.timedelta(days=1)
        day_after_single_date = day_after_single_date.strftime("%Y-%m-%d")

        # Notice trigger_rule="all_done"
        bash_run_report_remotly_cmd = (
            'gcloud beta compute --project myProject ssh search-console '
            '--internal-ip --zone us-central1-c --command '
            '"sudo -u omid python /home/omid/search_analytics_api_sample.py '
            'sc-domain:investing.com ' + temp_date + ' ' + day_after_single_date + '"')
        run_report_remotly = BashOperator(
            task_id='run_report_remotly_' + temp_date,
            retries=2,
            retry_delay=datetime.timedelta(minutes=15),
            retry_exponential_backoff=True,
            max_retry_delay=datetime.timedelta(hours=48),
            bash_command=bash_run_report_remotly_cmd,
            trigger_rule="all_done")
        start.set_downstream(run_report_remotly)
        run_report_remotly.set_downstream(wait)

    mv_to_data_lake = BashOperator(
        task_id='mv_to_data_lake',
        bash_command='gcloud beta compute --project gap---all-sites-1245 ssh search-console '
                     '--internal-ip --zone us-central1-c --command '
                     '"sudo -u omid gsutil -m mv -r /tmp/search* '
                     'gs://data_lake_ingestion_us/search_console/"',
        dag=dag)

    load = """bq --location US load --source_format CSV --replace=true --skip_leading_rows 1 --allow_quoted_newlines --quote "" DATA_LAKE_INGESTION_US.search_console_partition gs://data_lake_ingestion_us/search_console/*"""

    load_to_data_lake = BashOperator(task_id='load_to_data_lake', bash_command=load, dag=dag)

wait >> mv_to_data_lake >> load_to_data_lake >> end
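
The loop above relies on a daterange helper defined above the excerpt. A minimal sketch of the usual implementation (an assumption; the original helper is not shown):

    import datetime

    def daterange(start_date, end_date):
        # Yield every date from start_date up to, but not including, end_date.
        for n in range(int((end_date - start_date).days)):
            yield start_date + datetime.timedelta(n)
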
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

MysqlToHive = BashOperator(
    task_id='MysqlToHive',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/MysqlToHive.sh """,
    dag=dag)

csvToHive = BashOperator(
    task_id='csvToHive',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/csvToHive.sh """,
    dag=dag)

ReportingTables1 = BashOperator(
    task_id='ReportingTables1',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/ReportingTables1.sh """,
    dag=dag)

ReportingTables2 = BashOperator(
    task_id='ReportingTables2',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/ReportingTables2.sh  """,
    dag=dag)

MysqlToHive.set_downstream(csvToHive)
csvToHive.set_downstream(ReportingTables1)
ReportingTables1.set_downstream(ReportingTables2)
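
For comparison only (not part of the original example), the same dependencies can be written with the bitshift syntax that several other examples here use; the behavior is identical:

    MysqlToHive >> csvToHive >> ReportingTables1 >> ReportingTables2
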
    dag=dag)

import_sql_hive = BashOperator(
    task_id='import_sql_hive',
    bash_command=
    """sh /home/cloudera/Downloads/practical/import_sql_hive.sh -u root -p /user/cloudera/password.txt -d practical_exercise_1 """,
    dag=dag)

create_csv = BashOperator(
    task_id='create_csv',
    bash_command=
    """python3 /home/cloudera/Downloads/practical/practical_exercise_data_generator.py --create_csv """,
    dag=dag)

import_csv_hive = BashOperator(
    task_id='import_csv_hive',
    bash_command=
    """sh /home/cloudera/Downloads/practical/import_csv_hive.sh -d practical_exercise_1 """,
    dag=dag)

generate_report = BashOperator(
    task_id='generate_report',
    bash_command=
    """sh /home/cloudera/Downloads/practical/generate_report.sh -d practical_exercise_1 """,
    dag=dag)

create_csv.set_downstream(import_csv_hive)
load_data.set_downstream(import_sql_hive)
import_sql_hive.set_downstream(generate_report)
import_csv_hive.set_downstream(generate_report)
            "%Y%m%d") + '.json'
        bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(
            task_id='bash_api_call_GET_DESKTOP_TRAFFIC' +
            single_date.strftime("%Y%m%d"),
            bash_command=bash_cmd)

        bash_cmd2 = """gsutil mv /tmp/file_""" + single_date.strftime(
            "%Y%m%d") + '.json gs://data_lake/similar_web_desktop_traffic/'
        bash_gsutil_mv_files_to_ingestion = BashOperator(
            task_id='bash_gsutil_mv_files_to_ingestion' +
            single_date.strftime("%Y%m%d"),
            bash_command=bash_cmd2)
        #bash_cmd="""ls"""
        #bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(task_id='bash_opr_'+str(item),bash_command=bash_cmd)
        start.set_downstream(bash_api_call_GET_DESKTOP_TRAFFIC)
        bash_api_call_GET_DESKTOP_TRAFFIC.set_downstream(
            bash_gsutil_mv_files_to_ingestion)
        bash_gsutil_mv_files_to_ingestion.set_downstream(wait)

    load_to_bg_GET_DESKTOP_TRAFFIC = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bg_GET_DESKTOP_TRAFFIC',
        source_objects=['*'],
        write_disposition='WRITE_TRUNCATE',  #overwrite?
        create_disposition='CREATE_IF_NEEDED',
        bucket=DST_BUCKET,
        destination_project_dataset_table=dst_table,
        autodetect=True)

    end = DummyOperator(task_id='end')

wait >> load_to_bg_GET_DESKTOP_TRAFFIC >> end
        region="us-east1")

    bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_csv_profeco",
        bucket='gnp-storage',
        source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"],
        destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table",
        autodetect=True,
        source_format="CSV",
        field_delimiter=',',
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-east1",
        trigger_rule=TriggerRule.ALL_DONE)

    unzip_files.dag = dag

    unzip_files.set_downstream(create_cluster)

    create_cluster.set_downstream(PythonOperator)

    PythonOperator.set_downstream([submit_pyspark, bq_load_profeco_data])

    submit_pyspark.set_downstream(delete_cluster)
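
As excerpted, create_cluster.set_downstream(PythonOperator) and PythonOperator.set_downstream([...]) reference the PythonOperator class itself rather than a task instance, which Airflow rejects when setting relationships. Presumably a PythonOperator task defined above the excerpt was meant; a hypothetical sketch with made-up names:

    # Hypothetical task; the real instance (and its callable) sit above the excerpt.
    validate_files = PythonOperator(task_id='validate_files',
                                    python_callable=validate_unzipped_files)

    create_cluster.set_downstream(validate_files)
    validate_files.set_downstream([submit_pyspark, bq_load_profeco_data])
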
    bash_command=" hadoop fs -mkdir -p /user/cloudera/workshop/process/ ",
    dag=dag)

Create_Database = BashOperator(
    task_id='Create_Database',
    bash_command="""impala-shell -q "create database practical_exercise_1;" """,
    dag=dag)

Sqoop_Job = BashOperator(
    task_id='Sqoop_Job',
    bash_command="sqoop job --meta-connect jdbc:hsqldb:hsql://localhost:16000/sqoop --create practical_exercise_1.activitylog -- import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/root_pwd.txt --table activitylog -m 4 --hive-import --hive-database practical_exercise_1 --hive-table activitylog --incremental append --check-column id --last-value 0 ",
    dag=dag)


External_table = BashOperator(
    task_id='External_table',
    bash_command="""hive -e "CREATE EXTERNAL TABLE practical_exercise_1.user_upload_dump ( user_id int, file_name STRING, timestamp int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/user/cloudera/workshop/process/' tblproperties ('skip.header.line.count'='1');" """ ,
    dag=dag)

Creating_table_user_total = BashOperator(
    task_id='Creating_table_user_total',
    bash_command="""impala-shell -q "create table if not exists practical_exercise_1.user_total(time_ran timestamp, total_users bigint, users_added bigint);" """,
    dag=dag)

Starting_Sqoop_Metajob.set_downstream(Sqoop_Job)
Creating_Directories.set_downstream(External_table)
Create_Database.set_downstream(External_table)
Create_Database.set_downstream(Sqoop_Job)
Create_Database.set_downstream(Creating_table_user_total)

Example #13
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('docker_sample',
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

t2 = BashOperator(task_id='sleep', bash_command='sleep 5', retries=3, dag=dag)

t3 = DockerOperator(api_version='1.21',
                    command='/bin/sleep 30',
                    image='busybox:latest',
                    network_mode='bridge',
                    task_id='docker_op_tester',
                    dag=dag)

t4 = BashOperator(task_id='print_hello',
                  bash_command='echo "hello world!!!"',
                  dag=dag)

t1.set_downstream(t2)
t1.set_downstream(t3)
t3.set_downstream(t4)
        destination_project_dataset_table=PROJECT_ID+".data_analysis.flights_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0   
    )


    # delete_cluster=DataprocClusterDeleteOperator(

    #     task_id="delete_cluster",
    #     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    #     region='asia-east1',
    #     trigger_rule=TriggerRule.ALL_DONE
    # )


    create_cluster.dag = dag

    create_cluster.set_downstream(submit_sqoop)

    submit_sqoop.set_downstream(bq_load_flight_delays)

#    bq_load_flight_delays.set_downstream(delete_cluster)

    


Example #15
    'owner': 'milseiei',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 12),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Time_splitter',
          schedule_interval='0,30 * * * *',
          default_args=default_args)

t1 = BashOperator(task_id='task_1_date_data',
                  bash_command='date > /home/ubuntu/airflow/dags/data.txt',
                  dag=dag)

t2 = BashOperator(
    task_id='task_2_split_time',
    bash_command='python /home/ubuntu/airflow/dags/split_into_time.py',
    dag=dag)

t3 = BashOperator(
    task_id='task_3_split_into_mins',
    bash_command='python /home/ubuntu/airflow/dags/split_into_mins.py',
    dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
Example #16

valid_chars = '-_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def sanitize(text):
    # Keep only the characters Airflow accepts in a task_id.
    return ''.join(c for c in text if c in valid_chars)


# This is where pipeline-generated bash commands come in.
bash_commands = ('echo "hi russ"', 'echo "hello again"')
conclusion_command = 'echo "all done"'
conclusion = BashOperator(task_id='conclude', bash_command=conclusion_command, dag=dag)

for cmd in bash_commands:
    cmd = cmd.rstrip()  # rstrip() returns a new string; the original call discarded it
    run_this = BashOperator(
        task_id=sanitize(cmd), bash_command=cmd, dag=dag)
    run_this.set_downstream(conclusion)

# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
#     # pushes an XCom without a specific target, just by returning it
#     return value_2
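
The commented-out helpers above are the push half of the stock Airflow XCom example (value_1 and value_2 are defined elsewhere). For completeness, a matching puller, sketched under the assumption that the pushing tasks are registered with task_ids 'push' and 'push_by_returning':

    def puller(**kwargs):
        ti = kwargs['ti']
        # Value stored by push() under an explicit key
        v1 = ti.xcom_pull(key='value from pusher 1', task_ids='push')
        # Value returned by push_by_returning() (stored under the default key)
        v2 = ti.xcom_pull(task_ids='push_by_returning')
        print(v1, v2)
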
Example #17
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('init_airflow_practical_exercise',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

Init_MysqlToHive = BashOperator(
    task_id='Init_MysqlToHive',
    bash_command=
    """  sh /home/cloudera/Documents/PracticalExercise2/Init_MysqlToHive.sh """,
    dag=dag)

Init_csvToHive = BashOperator(
    task_id='Init_csvToHive',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/Init_csvToHive.sh  """,
    dag=dag)

Init_ReportingTables2 = BashOperator(
    task_id='Init_ReportingTables2',
    bash_command=
    """ sh /home/cloudera/Documents/PracticalExercise2/Init_ReportingTables2.sh  """,
    dag=dag)

Init_MysqlToHive.set_downstream(Init_csvToHive)
Init_csvToHive.set_downstream(Init_ReportingTables2)
Example #18
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
task.set_downstream(run_this_last)
Example #19
    dag=dag)

create_findisactive = BashOperator(
    task_id='create_findisactive',
    bash_command=
    """ impala-shell -q "drop table if exists practical_exercise_1.find_isactive; create table practical_exercise_1.find_isactive as select sums.user_id,sums.total_updates, sums.total_inserts, sums.total_deletes, sums.upload_count, d.last_active_type, d.is_active from practical_exercise_1.sums join (select c.user_id, t, activitylog.type last_active_type, c.is_active from (select activitylog.user_id, max(timestamps) t, if(unix_timestamp()-max(timestamps)<=172800,'TRUE','FALSE') is_active from practical_exercise_1.activitylog group by user_id)c right outer join practical_exercise_1.activitylog on c.t=activitylog.timestamps where c.user_id=activitylog.user_id)d on d.user_id=sums.user_id;" """,
    dag=dag)

create_userreport = BashOperator(
    task_id='create_userreport',
    bash_command=
    """ impala-shell -q "drop table if exists practical_exercise_1.user_report; create table practical_exercise_1.user_report as select user.id, find_isactive.total_updates, find_isactive.total_inserts, find_isactive.total_deletes,find_isactive.upload_count,find_isactive.last_active_type,find_isactive.is_active from practical_exercise_1.find_isactive right outer join practical_exercise_1.user on user.id=find_isactive.user_id;" """,
    dag=dag)

insert_usertotal = BashOperator(
    task_id='insert_usertotal',
    bash_command=
    """ impala-shell -q "insert into practical_exercise_1.user_total select current_timestamp(), sub1.t , case when sub2.t1 is NULL then sub1.t when sub2.t1 is not NULL then sub1.t-sub2.t1 end from (select count(distinct id) as t from practical_exercise_1.user)sub1, (select max(total_users) t1 from user_total) sub2;" """,
    dag=dag)

load_data.set_downstream(addition_data)
load_data.set_downstream(import_user)
create_csv.set_downstream(move_csv_to_hdfs)
move_csv_to_hdfs.set_downstream(create_userdump)
move_csv_to_hdfs.set_downstream(move_csv_to_archive)
addition_data.set_downstream(create_sums)
create_sums.set_downstream(create_findisactive)
create_findisactive.set_downstream(create_userreport)
import_user.set_downstream(insert_usertotal)
create_userdump.set_downstream(create_sums)
dag = DAG('initialization',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

create_db = BashOperator(
    task_id='create_db',
    bash_command=
    """ impala-shell -q "create database if not exists practical_exercise_1;" """,
    dag=dag)

meta_store = BashOperator(task_id='meta_store',
                          bash_command=""" nohup sqoop metastore & """,
                          dag=dag)

sqoop_job = BashOperator(
    task_id='sqoop_job',
    bash_command=
    """ sqoop job --meta-connect jdbc:hsqldb:hsql://localhost:16000/sqoop --create practical_exercise_1.activitylog -- import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/pwd.txt  --table activitylog -m 2 --hive-import --hive-database practical_exercise_1 --hive-table activitylog --incremental append --check-column id --last-value 0 """,
    dag=dag)

user_totaltable = BashOperator(
    task_id='user_totaltable',
    bash_command=
    """ impala-shell -q "create table if not exists practical_exercise_1.user_total(time_ran timestamp, total_users bigint, users_added bigint);" """,
    dag=dag)

create_db.set_downstream(sqoop_job)
meta_store.set_downstream(sqoop_job)
create_db.set_downstream(user_totaltable)
Example #21
                     bash_command = ACTIVATE_VENV + \
                         cmd_fmt.format(cmd='build_set_mongo-hadoop_1_3_3.sh ') )

pre_work = BashOperator(task_id='prework', env=options_env, retries=1, dag=dag,
                        bash_command = ACTIVATE_VENV + \
                            cmd_fmt.format(cmd='generate_sql_statements.sh ') )

etl_work = BashOperator(task_id='etl_work', env=options_env, retries=3, dag=dag,
                        bash_command = ACTIVATE_VENV + \
                            cmd_fmt.format(cmd='workflow_start_resume_mongo.sh ') )

csv_prepare_work = BashOperator(task_id='csv_prepare_work', env=options_env, retries=3, dag=dag,
                        bash_command = ACTIVATE_VENV + \
                                    cmd_fmt.format(cmd='workflow_start_resume_csv.sh '))

final_work = BashOperator(task_id='final_work', env=options_env, retries=3, dag=dag,
                          bash_command = ACTIVATE_VENV + \
                              cmd_fmt.format(cmd='workflow_start_resume_export.sh ') )

#jobs dependencies
pre_clean.set_downstream([repo1_scp, repo2_scp, repo3_scp])
pre_build.set_upstream([repo1_scp, repo2_scp, repo3_scp])
build.set_upstream(pre_build)
pre_work.set_upstream(build)
etl_work.set_upstream(pre_work)
csv_prepare_work.set_upstream(etl_work)
final_work.set_upstream(csv_prepare_work)

#for test purposes
username = mongodb_creds[1]
Example #22
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime

default_args = {
    'owner': 'root',
    'start_date': datetime.today(),
}

dag = DAG('zaim', default_args=default_args, schedule_interval='00 00 * * *')

task1 = BashOperator(
    task_id='retrieve_zaim_data',
    bash_command='python /app/zaim_downloader.py',
    dag=dag)

task2 = BashOperator(
    task_id='update_data',
    bash_command='/app/update_data.sh ',
    dag=dag)

task1.set_downstream(task2)
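
Example #22 is the only snippet here that shows its imports, and `from airflow.operators import BashOperator` only resolves on old Airflow releases. On later versions the equivalent import would be (shown for reference; the rest of the example is unchanged):

    # Airflow 1.10.x
    from airflow.operators.bash_operator import BashOperator

    # Airflow 2.x
    from airflow.operators.bash import BashOperator
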
        bash_cleanup_cmd = 'gsutil rm gs://myBucket/google/gam/example_report/*report_example_using_service_account_with_date_range_' + temp_date + '*'
        bash_cleanup = BashOperator(task_id='bash_cleanup_' + temp_date,
                                    retries=0,
                                    bash_command=bash_cleanup_cmd,
                                    trigger_rule="all_done")

        ##notice trigger_rule="all_done"
        bash_run_report_remotly_cmd = 'gcloud beta compute --project myProjectName ssh scheduler2 --internal-ip --zone us-central1-a --command "sudo -u omid python3 /home/omid/gam_data_transfer/report_example_using_service_account_with_date_range.py --start ' + temp_date + " --end " + temp_date + '"'
        run_report_remotly = BashOperator(
            task_id='run_report_remotly_' + temp_date,
            retries=0,
            bash_command=bash_run_report_remotly_cmd,
            trigger_rule="all_done")

        start.set_downstream(bash_cleanup)
        bash_cleanup.set_downstream(run_report_remotly)
        run_report_remotly.set_downstream(wait)

    ##notice trigger_rule="all_done"
    run_gsutil_mv = BashOperator(task_id='bash_gsutil_mv_cmd',
                                 retries=0,
                                 bash_command=bash_gsutil_mv_cmd,
                                 trigger_rule="all_done")

    load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bq_from_gcs',
        source_objects=['*'],
        skip_leading_rows=1,
        create_disposition='CREATE_NEVER',
        write_disposition='WRITE_TRUNCATE',  #overwrite?
        bucket='myBucket/google/gam/example_report',
        task_id="hdfs_to_gcs",
        bash_command=
        "gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'",
        dag=dag)

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="bigdata-etl-2_flights",
        source_objects=["sqoop_output/part.20190515_*.avro"],
        destination_project_dataset_table=PROJECT_ID +
        ".data_flights.flights_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    # delete_cluster = DataprocClusterDeleteOperator(
    #     task_id='delete_dataproc_cluster',
    #     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    #     region='asia-east1',
    #     trigger_rule=TriggerRule.ALL_DONE
    # )

    create_cluster.dag = dag

    create_cluster.set_downstream(sqoop_inc_import)
    sqoop_inc_import.set_downstream(hdfs_to_gcs)
    hdfs_to_gcs.set_downstream(bq_load_flight_delays)
#    bq_load_delays_by_distance.set_downstream(delete_cluster)
Create_user_report = BashOperator(
    task_id='Create_user_report',
    bash_command=""" impala-shell -q "create table practical_exercise_1.user_report(user_id bigint, total_updates bigint, total_inserts bigint, total_deletes bigint, last_activity_type string, is_active boolean, upload_count bigint);" """,
    dag=dag)

Insert_user_report = BashOperator(
    task_id='Insert_user_report',
    bash_command=""" NOW=$(date +%s);  
impala-shell -q "insert into practical_exercise_1.user_report select a.user_id,COALESCE(b.co,0) as total_updates,COALESCE(c.co,0) as total_inserts, COALESCE(d.co,0) as total_deletes, e.co as last_activity_type, COALESCE(f.co,FALSE) as is_active, COALESCE(g.co,0) as upload_count from (select id as user_id from practical_exercise_1.user group by id) as a left join (select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='UPDATE' group by user_id) as b on a.user_id=b.user_id left join (select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='INSERT' group by user_id) as c on a.user_id=c.user_id left join(select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='DELETE' group by user_id) as d on a.user_id=d.user_id left join (SELECT a.user_id, a.type as co FROM practical_exercise_1.activitylog a INNER JOIN (SELECT user_id, MAX(\`timestamp\`) as ti FROM practical_exercise_1.activitylog GROUP BY user_id ) AS b ON a.user_id = b.user_id AND a.\`timestamp\` = b.ti) as e on a.user_id=e.user_id left join (select user_id, if(count(*) = 0, FALSE, TRUE) as co from practical_exercise_1.activitylog where \`timestamp\` > $NOW-172800 group by user_id) as f on a.user_id=f.user_id left join (select user_id, count(user_id) as co from practical_exercise_1.user_upload_dump group by user_id) as g on a.user_id=g.user_id;" """,
    dag=dag)

Insert_user_total = BashOperator(
    task_id='Insert_user_total',
    bash_command=""" impala-shell -q "insert into practical_exercise_1.user_total select current_timestamp(), sub1.t , case when sub2.t1 is NULL then sub1.t when sub2.t1 is not NULL then sub1.t-sub2.t1 end from (select count(distinct id) as t from practical_exercise_1.user)sub1, (select max(total_users) t1 from practical_exercise_1.user_total) sub2;" """,
    dag=dag)


generating_the_MySql_data.set_downstream(Sqoop_import_user)
generating_the_MySql_data.set_downstream(Sqoop_import_activitylog)
generating_the_CSV_data.set_downstream(CSV_to_HDFS)

CSV_to_HDFS.set_downstream(Archiving)
CSV_to_HDFS.set_downstream(Drop_user_report_table)

Sqoop_import_user.set_downstream(Drop_user_report_table)
Sqoop_import_user.set_downstream(Insert_user_total)

Sqoop_import_activitylog.set_downstream(Drop_user_report_table)
Drop_user_report_table.set_downstream(Create_user_report)
Create_user_report.set_downstream(Insert_user_report)