Example #1
        wait_for = HivePartitionSensor(
            task_id='wait_on_partition_{}_{}'.format(dependency["db"],
                                                     dependency["table"]),
            dag=dag,
            table='{}.{}'.format(dependency["db"], dependency["table"]),
            partition=dependency["partition"],
        )

        dependency_list.append(wait_for)

    # Create the full path to the HQL file relative to this DAG file
    hql_file_path = os.path.join(os.path.dirname(__file__), source['hql'])
    print(hql_file_path)

    # Read the HQL and prepend the shared Hive settings template
    with open(hql_file_path, 'r') as hql_file:
        hql_body = hql_file.read()

    run_hive_query = HiveOperator(task_id='run_hive_query',
                                  dag=dag,
                                  hql="{{ local_hive_settings }}\n" + hql_body)

    # Terminal dummy task that fires the success callback once the query is done
    all_tasks = DummyOperator(task_id='all_tasks',
                              dag=dag,
                              on_success_callback=send_task_success)

    # mark dependencies
    for dependency in dependency_list:
        dependency.set_downstream(run_hive_query)

    all_tasks.set_upstream(run_hive_query)
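The fragment above starts inside a loop over a source's partition dependencies and refers to names defined elsewhere (`dag`, `source`, `dependency_list`, `send_task_success`). A minimal, self-contained sketch of the kind of context it seems to assume; the DAG id, schedule, `source` layout, callback stub, and the Airflow 1.10-style import paths below are all assumptions:

import os
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.sensors.hive_partition_sensor import HivePartitionSensor


def send_task_success(context):
    """Placeholder for the success callback attached to all_tasks."""
    pass


dag = DAG(dag_id='run_hql_with_dependencies',   # hypothetical dag_id
          start_date=datetime(2017, 7, 14),     # hypothetical start date
          schedule_interval='@daily')

# Hypothetical source config: partitions to wait for and the HQL file to run.
source = {
    'hql': 'hql/report.hql',
    'dependencies': [
        {'db': 'warehouse', 'table': 'orders', 'partition': "ds='{{ ds }}'"},
    ],
}

dependency_list = []
for dependency in source['dependencies']:
    pass  # the HivePartitionSensor block from the fragment above goes here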

# So that multiple DAGs can be created from one file
def export_to_gcp_dag(_sub_dag_id, _schedule_interval, _queue, _default_args,
                      _export_hql_dict, _params, _dataproc_config):
    dag = DAG(dag_id=_sub_dag_id,
              schedule_interval=_schedule_interval,
              default_args=_default_args)

    export_data_hql = _export_hql_dict["export_data_hql"]
    add_dataproc_partition_hql = _export_hql_dict["add_dataproc_partition_hql"]
    drop_tmp_table_hql = _export_hql_dict["drop_tmp_table_hql"]

    def gen_date_str_nodash(execution_date, days_delta=0, hours_delta=0):
        from pytz import timezone, utc
        from datetime import datetime, timedelta

        pacific_timezone = "US/Pacific"
        date = utc.localize(execution_date)
        date = date.astimezone(timezone(pacific_timezone))

        if days_delta:
            date += timedelta(days=days_delta)

        if hours_delta:
            date += timedelta(hours=hours_delta)

        return date.strftime("%Y%m%d")

    _params.update({"gen_date_str_nodash": gen_date_str_nodash})

    export_data = HiveOperator(task_id="export_data",
                               hql=export_data_hql,
                               params=_params,
                               queue=_queue,
                               dag=dag)

    add_dataproc_partition = DataProcHiveOperator(
        task_id="add_dataproc_partition",
        dataproc_cluster=_dataproc_config["dataproc_cluster"],
        region=_dataproc_config["region"],
        gcp_conn_id=_dataproc_config["gcp_conn_id"],
        query=add_dataproc_partition_hql,
        params=_params,
        queue=_queue,
        dag=dag)

    drop_tmp_table = HiveOperator(task_id="drop_tmp_table",
                                  hql=drop_tmp_table_hql,
                                  params=_params,
                                  queue=_queue,
                                  dag=dag)

    add_dataproc_partition.set_upstream(export_data)
    drop_tmp_table.set_upstream(add_dataproc_partition)

    stamp_file_path = _params.get("stamp_file_path")
    if stamp_file_path is not None:
        gcp_conf_true = 'google.cloud.auth.service.account.enable=true'
        gcp_conf_keyfile = 'google.cloud.auth.service.account.json.keyfile={{ params.gcp_keyfile }}'

        add_success_stamp_file = BashOperator(
            task_id="add_success_stamp_file",
            bash_command="hadoop fs -D " + gcp_conf_true + " -D " +
            gcp_conf_keyfile + " -touchz " + stamp_file_path,
            params=_params,
            queue=_queue,
            dag=dag)
        add_success_stamp_file.set_upstream(drop_tmp_table)

    return dag
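The comment above the function suggests it exists so that several DAGs can be generated from one file. A minimal usage sketch, assuming hypothetical job configs and registering each generated DAG in `globals()` so the Airflow scheduler discovers it:

from datetime import datetime

# Hypothetical configuration; all names and values below are placeholders.
EXPORT_JOBS = {
    "export_to_gcp_orders": {
        "export_hql_dict": {
            "export_data_hql": "hql/export_orders.hql",
            "add_dataproc_partition_hql": "hql/add_orders_partition.hql",
            "drop_tmp_table_hql": "hql/drop_orders_tmp.hql",
        },
        "params": {"gcp_keyfile": "/etc/gcp/keyfile.json"},
    },
}

DATAPROC_CONFIG = {
    "dataproc_cluster": "analytics-cluster",
    "region": "us-central1",
    "gcp_conn_id": "google_cloud_default",
}

for sub_dag_id, job in EXPORT_JOBS.items():
    # Register each generated DAG at module level so Airflow picks it up.
    globals()[sub_dag_id] = export_to_gcp_dag(
        _sub_dag_id=sub_dag_id,
        _schedule_interval="@daily",
        _queue="default",
        _default_args={"owner": "airflow", "start_date": datetime(2017, 7, 14)},
        _export_hql_dict=job["export_hql_dict"],
        _params=job["params"],
        _dataproc_config=DATAPROC_CONFIG,
    )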
# Airflow 1.x-style imports (assumed; adjust to your installed Airflow version)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'pardha',
    'start_date': datetime(2017, 7, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
 exec ./scripts/sqoop_incremental.sh
"""
# Import the data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_import',
        bash_command=sqoop_job,
        dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
        task_id='hive_insert',
        hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name,last_name, item_code, order_date FROM orders_stg;',
        depends_on_past=True,
        dag=dag
)

# defining the job dependency
task2.set_upstream(task1)
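Equivalently, in Airflow 1.8 and later the same dependency can be declared with the bitshift syntax, which reads in execution order:

# Same dependency as task2.set_upstream(task1): Sqoop import first, then the Hive insert.
task1 >> task2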
Example #4
        # bash_command=sqoop_incremental_job,
        bash_command='./sqoop_incremental.sh',
        dag=dag
)

# Merge the incrementally imported data from the MySQL table in HDFS
task2 = BashOperator(
        task_id='sqoop_merge_import',
        bash_command='./sqoop_merge.sh',
        dag=dag
)

# Load the merged data from HDFS into the Hive target table
task3 = HiveOperator(
        task_id='hive_insert',
        hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
        depends_on_past=True,
        dag=dag
)

# Insert the data into the masked target table, masking the SSN with a Hive UDF
task4 = HiveOperator(
        task_id='hive_insert_masked',
        hql='add jar /home/cloudera/Masking.jar;'
            'create TEMPORARY function masking as \'Masking\';'
            'INSERT OVERWRITE table employee_profile_masked '
            'SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
        depends_on_past=True,
        dag=dag
)
# defining the job dependency
task2.set_upstream(task1)
task3.set_upstream(task2)
task4.set_upstream(task3)
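The fragment above is cut off inside the first operator. A sketch of the opening it appears to assume; everything not visible in the fragment (imports, owner, dates, dag_id, and the task1 task_id) is a guess:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'cloudera',                    # hypothetical owner
    'start_date': datetime(2016, 1, 14),    # hypothetical start date
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('employee_profile_merge', default_args=default_args)  # hypothetical dag_id

# Import the incremental data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_incremental_import',  # hypothetical task_id
        # bash_command=sqoop_incremental_job,
        bash_command='./sqoop_incremental.sh',
        dag=dag
)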
Example #5
import airflow
from airflow import DAG
from airflow.operators import HiveOperator

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id='example_hive_executor',
          default_args=args,
          schedule_interval=None)

# Run a sample SELECT against the kraken_app_session table
task2 = HiveOperator(task_id='hive_select',
                     hql='SELECT * from kraken_app_session limit 10;',
                     depends_on_past=False,
                     dag=dag)
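By default HiveOperator runs against the `hive_cli_default` connection; a variation that targets a specific Hive CLI connection would pass `hive_cli_conn_id` explicitly (the connection id below is a placeholder):

task3 = HiveOperator(task_id='hive_select_prod',
                     hql='SELECT * from kraken_app_session limit 10;',
                     hive_cli_conn_id='hive_cli_prod',  # placeholder connection id
                     depends_on_past=False,
                     dag=dag)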
Example #6
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
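Both loops above reference names defined elsewhere in the DAG file (`from_channels`, `local_dir`, `hdfs_dir`, `dt`, `yesterday`, `analyze_tweets`, `hive_to_mysql`, and presumably a `to_channels` list driving the first loop). A sketch of definitions that would make the fragment run; every concrete value, import path, and operator choice below is a placeholder assumption:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG('tweet_pipeline',                 # hypothetical dag_id
          start_date=datetime(2017, 7, 14),
          schedule_interval='@daily')

to_channels = ['support', 'marketing']      # placeholder channel names
from_channels = ['support', 'marketing']
local_dir = '/tmp/'
hdfs_dir = '/tmp/tweets/'
yesterday = datetime.now() - timedelta(days=1)
dt = yesterday.strftime('%Y-%m-%d')


def analyze_tweets_fn(**kwargs):
    """Placeholder for the upstream tweet-analysis step."""
    pass


analyze_tweets = PythonOperator(task_id='analyze_tweets',
                                python_callable=analyze_tweets_fn,
                                dag=dag)

# Stand-in for the Hive-to-MySQL transfer task the loop wires downstream.
hive_to_mysql = DummyOperator(task_id='hive_to_mysql', dag=dag)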
Example #7
from datetime import datetime, timedelta

# Airflow 1.x-style imports (assumed; adjust to your installed Airflow version)
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
 exec ./scripts/sqoop_incremental.sh
"""
# Import the data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_import',
        bash_command=sqoop_job,
        dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
        task_id='hive_insert',
        hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name,last_name, item_code, order_date FROM orders_stg;',
        depends_on_past=True,
        dag=dag
)

# defining the job dependency
task2.set_upstream(task1)
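This DAG relies on Airflow's default schedule interval. For an incremental load it is common to pin the interval and disable backfilling explicitly; a sketch of the same DAG definition with those options (assuming Airflow 1.8+, where `catchup` is available):

dag = DAG('incremental_load',
          default_args=default_args,
          schedule_interval='@daily',  # run the incremental load once per day
          catchup=False)               # do not backfill intervals between start_date and now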