# One partition sensor per upstream dependency of the query
wait_for = HivePartitionSensor(
    task_id='{}_{}_{}'.format('wait_on_partition', dependency["db"], dependency["table"]),
    dag=dag,
    table='{}.{}'.format(dependency["db"], dependency["table"]),
    partition=dependency["partition"],
)
dependency_list.append(wait_for)

# Create the full path for the HQL file
hql_file_path = os.path.join(os.path.dirname(__file__), source['hql'])
print(hql_file_path)

run_hive_query = HiveOperator(
    task_id='run_hive_query',
    dag=dag,
    hql="""
    {{ local_hive_settings }}
    """ + "\n " + open(hql_file_path, 'r').read())

# Dummy task that closes out the DAG and fires the success callback
all_tasks = DummyOperator(
    task_id='all_tasks',
    dag=dag,
    on_success_callback=send_task_success)

# Wire up dependencies: every sensor runs before the query, the dummy task runs last
for dependency in dependency_list:
    dependency.set_downstream(run_hive_query)
all_tasks.set_upstream(run_hive_query)

# So that multiple DAGs can be created
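# The trailing comment above suggests the enclosing file builds one DAG per
# configuration entry. A minimal sketch of that pattern, assuming a
# hypothetical dag_configs list and a create_dag helper wrapping the block
# above (both names are illustrative, not from the original source):
for config in dag_configs:
    dag_id = 'hive_query_{}'.format(config['name'])
    # Register each DAG in globals() so the Airflow scheduler can discover it
    globals()[dag_id] = create_dag(dag_id, config)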
def export_to_gcp_dag(_sub_dag_id, _schedule_interval, _queue, _default_args,
                      _export_hql_dict, _params, _dataproc_config):
    dag = DAG(dag_id=_sub_dag_id,
              schedule_interval=_schedule_interval,
              default_args=_default_args)

    export_data_hql = _export_hql_dict["export_data_hql"]
    add_dataproc_partition_hql = _export_hql_dict["add_dataproc_partition_hql"]
    drop_tmp_table_hql = _export_hql_dict["drop_tmp_table_hql"]

    def gen_date_str_nodash(execution_date, days_delta=0, hours_delta=0):
        # Convert the UTC execution date to US/Pacific and format as YYYYMMDD
        from pytz import timezone, utc
        from datetime import timedelta
        pacific_timezone = "US/Pacific"
        date = utc.localize(execution_date)
        date = date.astimezone(timezone(pacific_timezone))
        if days_delta:
            date += timedelta(days=days_delta)
        if hours_delta:
            date += timedelta(hours=hours_delta)
        return date.strftime("%Y%m%d")

    # Expose the helper to the HQL templates via params
    _params.update({"gen_date_str_nodash": gen_date_str_nodash})

    export_data = HiveOperator(task_id="export_data",
                               hql=export_data_hql,
                               params=_params,
                               queue=_queue,
                               dag=dag)

    add_dataproc_partition = DataProcHiveOperator(
        task_id="add_dataproc_partition",
        dataproc_cluster=_dataproc_config["dataproc_cluster"],
        region=_dataproc_config["region"],
        gcp_conn_id=_dataproc_config["gcp_conn_id"],
        query=add_dataproc_partition_hql,
        params=_params,
        queue=_queue,
        dag=dag)

    drop_tmp_table = HiveOperator(task_id="drop_tmp_table",
                                  hql=drop_tmp_table_hql,
                                  params=_params,
                                  queue=_queue,
                                  dag=dag)

    add_dataproc_partition.set_upstream(export_data)
    drop_tmp_table.set_upstream(add_dataproc_partition)

    # Optionally touch a success stamp file on GCS once the export is done
    if _params.get("stamp_file_path") is not None:
        gcp_conf_true = 'google.cloud.auth.service.account.enable=true'
        gcp_conf_keyfile = 'google.cloud.auth.service.account.json.keyfile={{ params.gcp_keyfile }}'
        add_success_stamp_file = BashOperator(
            task_id="add_success_stamp_file",
            bash_command="hadoop fs -D " + gcp_conf_true +
                         " -D " + gcp_conf_keyfile +
                         " -touchz " + _params["stamp_file_path"],
            params=_params,
            queue=_queue,
            dag=dag)
        add_success_stamp_file.set_upstream(drop_tmp_table)

    return dag
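# A minimal, hypothetical invocation of the export_to_gcp_dag factory above.
# The dictionary keys match what the function reads; every value (DAG id,
# schedule, queue, cluster, paths) is a placeholder, not from the original.
export_events_dag = export_to_gcp_dag(
    _sub_dag_id="export_events_to_gcp",
    _schedule_interval="@daily",
    _queue="default",
    _default_args={"owner": "airflow", "start_date": datetime(2018, 1, 1)},
    _export_hql_dict={
        "export_data_hql": "-- HQL that exports the data",
        "add_dataproc_partition_hql": "-- HQL that registers the partition on Dataproc",
        "drop_tmp_table_hql": "-- HQL that drops the temporary table",
    },
    _params={
        "gcp_keyfile": "/path/to/keyfile.json",          # rendered into the hadoop fs command
        "stamp_file_path": "gs://bucket/path/_SUCCESS",  # presence enables the stamp-file task
    },
    _dataproc_config={
        "dataproc_cluster": "example-cluster",
        "region": "us-central1",
        "gcp_conn_id": "google_cloud_default",
    },
)
# The returned DAG must be bound to a module-level name (as above) for the
# scheduler to pick it up.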
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'pardha',
    'start_date': datetime(2017, 7, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
exec ./scripts/sqoop_incremental.sh
"""

# Import the incremental data from the MySQL table into HDFS
task1 = BashOperator(
    task_id='sqoop_import',
    bash_command=sqoop_job,
    dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
    task_id='hive_insert',
    hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name, last_name, item_code, order_date FROM orders_stg;',
    depends_on_past=True,
    dag=dag
)

# Define the job dependency
task2.set_upstream(task1)
# Import the incremental changes from MySQL into HDFS. The opening of this
# task was truncated in the original snippet; the task_id below is illustrative.
task1 = BashOperator(
    task_id='sqoop_incremental_import',
    # bash_command=sqoop_incremental_job,
    bash_command='./sqoop_incremental.sh',
    dag=dag
)

# Merge the freshly imported data with the existing data set in HDFS
task2 = BashOperator(
    task_id='sqoop_merge_import',
    bash_command='./sqoop_merge.sh',
    dag=dag
)

# Load the merged data from HDFS into the Hive target table
task3 = HiveOperator(
    task_id='hive_insert',
    hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
    depends_on_past=True,
    dag=dag
)

# Populate the masked table, applying the custom Masking UDF to the ssn column
task4 = HiveOperator(
    task_id='hive_insert_masked',
    hql="add jar /home/cloudera/Masking.jar;"
        "create TEMPORARY function masking as 'Masking';"
        "INSERT OVERWRITE table employee_profile_masked "
        "SELECT profile_id, first_name, last_name, modified_date, masking(ssn) "
        "FROM employee_profile;",
    depends_on_past=True,
    dag=dag
)

# Define the job dependencies
task2.set_upstream(task1)
task3.set_upstream(task2)
task4.set_upstream(task3)
import airflow
from airflow import DAG
from airflow.operators import HiveOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

dag = DAG(dag_id='example_hive_executor',
          default_args=args,
          schedule_interval=None)

# Run a simple Hive query that selects ten rows from the source table
task2 = HiveOperator(task_id='hive_select',
                     hql='SELECT * from kraken_app_session limit 10;',
                     depends_on_past=False,
                     dag=dag)
file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
exec ./scripts/sqoop_incremental.sh
"""

# Import the incremental data from the MySQL table into HDFS
task1 = BashOperator(
    task_id='sqoop_import',
    bash_command=sqoop_job,
    dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
    task_id='hive_insert',
    hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name, last_name, item_code, order_date FROM orders_stg;',
    depends_on_past=True,
    dag=dag
)

# Define the job dependency
task2.set_upstream(task1)