Example #1
        wait_for = HivePartitionSensor(
            task_id='wait_on_partition_{}_{}'.format(dependency["db"],
                                                     dependency["table"]),
            dag=dag,
            table='{}.{}'.format(dependency["db"], dependency["table"]),
            partition=dependency["partition"],
        )

        dependency_list.append(wait_for)

    # Create the full path to the HQL file relative to this DAG file
    hql_file_path = os.path.join(os.path.dirname(__file__), source['hql'])
    print(hql_file_path)

    # Read the HQL and prepend the shared Hive settings template
    with open(hql_file_path, 'r') as hql_file:
        hql_body = hql_file.read()

    run_hive_query = HiveOperator(task_id='run_hive_query',
                                  dag=dag,
                                  hql="{{ local_hive_settings }}\n" + hql_body)

    # Terminal dummy task that fires the success callback once the query is done
    all_tasks = DummyOperator(task_id='all_tasks',
                              dag=dag,
                              on_success_callback=send_task_success)

    # mark dependencies
    for dependency in dependency_list:
        dependency.set_downstream(run_hive_query)

    all_tasks.set_upstream(run_hive_query)
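The fragment above starts inside a loop over a source's partition dependencies and refers to names defined elsewhere (`dag`, `source`, `dependency_list`, `send_task_success`). A minimal, self-contained sketch of the kind of context it seems to assume; the DAG id, schedule, `source` layout, callback stub, and the Airflow 1.10-style import paths below are all assumptions:

import os
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.sensors.hive_partition_sensor import HivePartitionSensor


def send_task_success(context):
    """Placeholder for the success callback attached to all_tasks."""
    pass


dag = DAG(dag_id='run_hql_with_dependencies',   # hypothetical dag_id
          start_date=datetime(2017, 7, 14),     # hypothetical start date
          schedule_interval='@daily')

# Hypothetical source config: partitions to wait for and the HQL file to run.
source = {
    'hql': 'hql/report.hql',
    'dependencies': [
        {'db': 'warehouse', 'table': 'orders', 'partition': "ds='{{ ds }}'"},
    ],
}

dependency_list = []
for dependency in source['dependencies']:
    pass  # the HivePartitionSensor block from the fragment above goes here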

# So that multiple DAGs can be created from one file
def export_to_gcp_dag(_sub_dag_id, _schedule_interval, _queue, _default_args,
                      _export_hql_dict, _params, _dataproc_config):
    dag = DAG(dag_id=_sub_dag_id,
              schedule_interval=_schedule_interval,
              default_args=_default_args)

    export_data_hql = _export_hql_dict["export_data_hql"]
    add_dataproc_partition_hql = _export_hql_dict["add_dataproc_partition_hql"]
    drop_tmp_table_hql = _export_hql_dict["drop_tmp_table_hql"]

    def gen_date_str_nodash(execution_date, days_delta=0, hours_delta=0):
        from pytz import timezone, utc
        from datetime import datetime, timedelta

        pacific_timezone = "US/Pacific"
        date = utc.localize(execution_date)
        date = date.astimezone(timezone(pacific_timezone))

        if days_delta:
            date += timedelta(days=days_delta)

        if hours_delta:
            date += timedelta(hours=hours_delta)

        return date.strftime("%Y%m%d")

    _params.update({"gen_date_str_nodash": gen_date_str_nodash})

    export_data = HiveOperator(task_id="export_data",
                               hql=export_data_hql,
                               params=_params,
                               queue=_queue,
                               dag=dag)

    add_dataproc_partition = DataProcHiveOperator(
        task_id="add_dataproc_partition",
        dataproc_cluster=_dataproc_config["dataproc_cluster"],
        region=_dataproc_config["region"],
        gcp_conn_id=_dataproc_config["gcp_conn_id"],
        query=add_dataproc_partition_hql,
        params=_params,
        queue=_queue,
        dag=dag)

    drop_tmp_table = HiveOperator(task_id="drop_tmp_table",
                                  hql=drop_tmp_table_hql,
                                  params=_params,
                                  queue=_queue,
                                  dag=dag)

    add_dataproc_partition.set_upstream(export_data)
    drop_tmp_table.set_upstream(add_dataproc_partition)

    stamp_file_path = _params.get("stamp_file_path")
    if stamp_file_path is not None:
        gcp_conf_true = 'google.cloud.auth.service.account.enable=true'
        gcp_conf_keyfile = 'google.cloud.auth.service.account.json.keyfile={{ params.gcp_keyfile }}'

        add_success_stamp_file = BashOperator(
            task_id="add_success_stamp_file",
            bash_command="hadoop fs -D " + gcp_conf_true + " -D " +
            gcp_conf_keyfile + " -touchz " + stamp_file_path,
            params=_params,
            queue=_queue,
            dag=dag)
        add_success_stamp_file.set_upstream(drop_tmp_table)

    return dag
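The comment above the function suggests it exists so that several DAGs can be generated from one file. A minimal usage sketch, assuming hypothetical job configs and registering each generated DAG in `globals()` so the Airflow scheduler discovers it:

from datetime import datetime

# Hypothetical configuration; all names and values below are placeholders.
EXPORT_JOBS = {
    "export_to_gcp_orders": {
        "export_hql_dict": {
            "export_data_hql": "hql/export_orders.hql",
            "add_dataproc_partition_hql": "hql/add_orders_partition.hql",
            "drop_tmp_table_hql": "hql/drop_orders_tmp.hql",
        },
        "params": {"gcp_keyfile": "/etc/gcp/keyfile.json"},
    },
}

DATAPROC_CONFIG = {
    "dataproc_cluster": "analytics-cluster",
    "region": "us-central1",
    "gcp_conn_id": "google_cloud_default",
}

for sub_dag_id, job in EXPORT_JOBS.items():
    # Register each generated DAG at module level so Airflow picks it up.
    globals()[sub_dag_id] = export_to_gcp_dag(
        _sub_dag_id=sub_dag_id,
        _schedule_interval="@daily",
        _queue="default",
        _default_args={"owner": "airflow", "start_date": datetime(2017, 7, 14)},
        _export_hql_dict=job["export_hql_dict"],
        _params=job["params"],
        _dataproc_config=DATAPROC_CONFIG,
    )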
# Airflow 1.x-style imports (assumed; adjust to your installed Airflow version)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'pardha',
    'start_date': datetime(2017, 7, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
 exec ./scripts/sqoop_incremental.sh
"""
# Import the data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_import',
        bash_command=sqoop_job,
        dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
        task_id='hive_insert',
        hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name,last_name, item_code, order_date FROM orders_stg;',
        depends_on_past=True,
        dag=dag
)

# defining the job dependency
task2.set_upstream(task1)
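Equivalently, in Airflow 1.8 and later the same dependency can be declared with the bitshift syntax, which reads in execution order:

# Same dependency as task2.set_upstream(task1): Sqoop import first, then the Hive insert.
task1 >> task2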
Example #4
        # bash_command=sqoop_incremental_job,
        bash_command='./sqoop_incremental.sh',
        dag=dag
)

# Merge the incrementally imported data from the MySQL table in HDFS
task2 = BashOperator(
        task_id='sqoop_merge_import',
        bash_command='./sqoop_merge.sh',
        dag=dag
)

# Load the merged data from HDFS into the Hive target table
task3 = HiveOperator(
        task_id='hive_insert',
        hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
        depends_on_past=True,
        dag=dag
)

# Insert the data into the masked target table, masking the SSN with a Hive UDF
task4 = HiveOperator(
        task_id='hive_insert_masked',
        hql='add jar /home/cloudera/Masking.jar;'
            'create TEMPORARY function masking as \'Masking\';'
            'INSERT OVERWRITE table employee_profile_masked '
            'SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
        depends_on_past=True,
        dag=dag
)
# defining the job dependency
task2.set_upstream(task1)
task3.set_upstream(task2)
task4.set_upstream(task3)
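The fragment above is cut off inside the first operator. A sketch of the opening it appears to assume; everything not visible in the fragment (imports, owner, dates, dag_id, and the task1 task_id) is a guess:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'cloudera',                    # hypothetical owner
    'start_date': datetime(2016, 1, 14),    # hypothetical start date
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('employee_profile_merge', default_args=default_args)  # hypothetical dag_id

# Import the incremental data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_incremental_import',  # hypothetical task_id
        # bash_command=sqoop_incremental_job,
        bash_command='./sqoop_incremental.sh',
        dag=dag
)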
Example #5
import airflow
from airflow import DAG
from airflow.operators import HiveOperator

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id='example_hive_executor',
          default_args=args,
          schedule_interval=None)

# Run a sample SELECT against the kraken_app_session table
task2 = HiveOperator(task_id='hive_select',
                     hql='SELECT * from kraken_app_session limit 10;',
                     depends_on_past=False,
                     dag=dag)
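By default HiveOperator runs against the `hive_cli_default` connection; a variation that targets a specific Hive CLI connection would pass `hive_cli_conn_id` explicitly (the connection id below is a placeholder):

task3 = HiveOperator(task_id='hive_select_prod',
                     hql='SELECT * from kraken_app_session limit 10;',
                     hive_cli_conn_id='hive_cli_prod',  # placeholder connection id
                     depends_on_past=False,
                     dag=dag)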
Example #6
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
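Both loops above reference names defined elsewhere in the DAG file (`from_channels`, `local_dir`, `hdfs_dir`, `dt`, `yesterday`, `analyze_tweets`, `hive_to_mysql`, and presumably a `to_channels` list driving the first loop). A sketch of definitions that would make the fragment run; every concrete value, import path, and operator choice below is a placeholder assumption:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG('tweet_pipeline',                 # hypothetical dag_id
          start_date=datetime(2017, 7, 14),
          schedule_interval='@daily')

to_channels = ['support', 'marketing']      # placeholder channel names
from_channels = ['support', 'marketing']
local_dir = '/tmp/'
hdfs_dir = '/tmp/tweets/'
yesterday = datetime.now() - timedelta(days=1)
dt = yesterday.strftime('%Y-%m-%d')


def analyze_tweets_fn(**kwargs):
    """Placeholder for the upstream tweet-analysis step."""
    pass


analyze_tweets = PythonOperator(task_id='analyze_tweets',
                                python_callable=analyze_tweets_fn,
                                dag=dag)

# Stand-in for the Hive-to-MySQL transfer task the loop wires downstream.
hive_to_mysql = DummyOperator(task_id='hive_to_mysql', dag=dag)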
Example #7
from datetime import datetime, timedelta

# Airflow 1.x-style imports (assumed; adjust to your installed Airflow version)
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
 exec ./scripts/sqoop_incremental.sh
"""
# Import the data from the MySQL table into HDFS
task1 = BashOperator(
        task_id='sqoop_import',
        bash_command=sqoop_job,
        dag=dag
)

# Insert the data from the Hive external staging table into the target table
task2 = HiveOperator(
        task_id='hive_insert',
        hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name,last_name, item_code, order_date FROM orders_stg;',
        depends_on_past=True,
        dag=dag
)

# defining the job dependency
task2.set_upstream(task1)
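This DAG relies on Airflow's default schedule interval. For an incremental load it is common to pin the interval and disable backfilling explicitly; a sketch of the same DAG definition with those options (assuming Airflow 1.8+, where `catchup` is available):

dag = DAG('incremental_load',
          default_args=default_args,
          schedule_interval='@daily',  # run the incremental load once per day
          catchup=False)               # do not backfill intervals between start_date and now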