def create_operator_array(hql, hive_table):
    step1 = HiveOperator(
        hql='DROP TABLE IF EXISTS dv_temp.{0}_temp'.format(hive_table),
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='drop_{0}'.format(hive_table),
        params={'hive_table': hive_table},
        dag=dag)

    step2 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='enddate_{0}'.format(hive_table),
        params={'hive_table': hive_table},
        dag=dag)

    step3 = HiveOperator(
        hql='INSERT OVERWRITE TABLE dv_raw.{0} SELECT * FROM dv_temp.{0}_temp'.format(hive_table),
        hive_cli_conn_id='hive_datavault_temp',
        schema='dv_temp',
        task_id='copyback_{0}'.format(hive_table),
        dag=dag)

    sats_done >> step1
    step1 >> step2 >> step3
    step3 >> enddating_done
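# Hypothetical usage sketch for create_operator_array above (enddate_hqls is an
# assumed name, not from the original source): build one drop/enddate/copyback
# chain per satellite table.
for hive_table, enddate_hql in enddate_hqls.items():
    create_operator_array(enddate_hql, hive_table)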
def test_hive_dryrun(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    op = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    op.dry_run()

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=',
        '-hiveconf', 'airflow.ctx.task_id=',
        '-hiveconf', 'airflow.ctx.execution_date=',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-f', '/tmp/airflow_hiveop_tst/tmptst'
    ]

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def setUp(self):
    self._upload_dataframe()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.database = 'airflow'
    self.table = 'hive_server_hook'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        a int,
        b int)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
    OVERWRITE INTO TABLE {{ params.table }};
    """
    self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)]
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'csv_path': self.local_path
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.next_day = (DEFAULT_DATE + datetime.timedelta(days=1)).isoformat()[:10]
    self.database = 'airflow'
    self.partition_by = 'ds'
    self.table = 'static_babynames_partitioned'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        state string,
        year string,
        name string,
        gender string,
        num int)
    PARTITIONED BY ({{ params.partition_by }} string);
    ALTER TABLE {{ params.table }}
    ADD PARTITION({{ params.partition_by }}='{{ ds }}');
    """
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'partition_by': self.partition_by
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hiveconf_jinja_translate(self):
    hql = "SELECT ${num_col} FROM ${hiveconf:table};"
    t = HiveOperator(
        hiveconf_jinja_translate=True,
        task_id='dry_run_basic_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(t.hql, "SELECT {{ num_col }} FROM {{ table }};")
def test_beeline(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=test_dag_id',
        '-hiveconf', 'airflow.ctx.task_id=beeline_hql',
        '-hiveconf', 'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=airflow',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-hiveconf', 'mapred.job.name=test_job_name',
        '-f', '/tmp/airflow_hiveop_tst/tmptst'
    ]

    op = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag,
        mapred_job_name="test_job_name")
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def test_beeline(self):
    t = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hive_queues(self):
    t = HiveOperator(
        task_id='test_hive_queues',
        hql=self.hql,
        mapred_queue='default',
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_hive_queues',
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def create_object(hql, tablename, upstream, downstream):
    t = HiveOperator(
        task_id=tablename,
        hive_cli_conn_id='hive_datavault_raw',
        schema=DATAVAULT,
        hql=hql,
        dag=dag)
    upstream >> t
    t >> downstream
def create_table(hql, tablename, upstream, downstream):
    t = HiveOperator(
        task_id='table_{0}'.format(tablename),
        hive_cli_conn_id='hive_datavault_raw',
        schema=DATAVAULT,
        hql=hql,
        dag=dag)
    upstream >> t
    t >> downstream
def create_operator(hql, hive_table, prev_id, next_id):
    t = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='star_{0}'.format(hive_table),
        dag=dag)
    t >> next_id
    if prev_id is not None:
        prev_id >> t
    return t  # return the operator so a caller can pass it as prev_id for the next table
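# Hypothetical usage sketch for create_operator above (star_hqls and loading_done
# are assumed names, not from the original source): thread prev_id through the
# loop so the star-schema loads run one after another.
prev = None
for star_hql, star_table in star_hqls:
    prev = create_operator(star_hql, star_table, prev, loading_done)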
def test_hiveconf(self):
    hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
    t = HiveOperator(
        hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
        task_id='dry_run_basic_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(
        t.hql, "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
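# A hedged companion sketch (not from the original suite): the same query with
# jinja translation enabled. Based on test_hiveconf_jinja_translate above, the
# ${hiveconf:...} markers should be rewritten to Jinja, while test_hiveconf shows
# that without the flag they are left intact for the Hive CLI and the hiveconfs
# dict supplies the -hiveconf values at run time.
def test_hiveconf_with_jinja_translate_sketch(self):
    hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
    t = HiveOperator(
        hiveconf_jinja_translate=True,
        hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
        task_id='hiveconf_translated_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(t.hql, "SELECT * FROM {{ table }} PARTITION ({{ day }});")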
def test_hive_airflow_default_config_queue_override(self):
    specific_mapred_queue = 'default'
    t = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue=specific_mapred_queue,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)
    self.assertEqual(t.get_hook().mapred_queue, specific_mapred_queue)
def create_link_operator(hql, hive_table):
    t1 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id=hive_table,
        dag=dag)
    hubs_done >> t1
    t1 >> links_done
    return t1
def load_link(hql, hive_table):
    _, table = hive_table.split('.')
    t1 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='load_{0}'.format(hive_table),
        dag=dag)
    staging_done >> t1 >> loading_done
    return t1
def test_hive_airflow_default_config_queue(self):
    t = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)

    # just check that the correct default value in test_default.cfg is used
    test_config_hive_mapred_queue = conf.get('hive', 'default_hive_mapred_queue')
    self.assertEqual(t.get_hook().mapred_queue, test_config_hive_mapred_queue)
def test_mapred_job_name(self, mock_get_hook):
    mock_hook = mock.MagicMock()
    mock_get_hook.return_value = mock_hook
    t = HiveOperator(task_id='test_mapred_job_name', hql=self.hql, dag=self.dag)

    fake_execution_date = timezone.datetime(2018, 6, 19)
    fake_ti = TaskInstance(task=t, execution_date=fake_execution_date)
    fake_ti.hostname = 'fake_hostname'
    fake_context = {'ti': fake_ti}

    t.execute(fake_context)
    self.assertEqual(
        "Airflow HiveOperator task for {}.{}.{}.{}".format(
            fake_ti.hostname, self.dag.dag_id, t.task_id,
            fake_execution_date.isoformat()),
        mock_hook.mapred_job_name)
    hdfs_conn_id='hdfs',
    dag=dag,
)

hdfs_put_title_basics = HdfsPutFileOperator(
    task_id='upload_title_basics_to_hdfs',
    local_file='/home/airflow/imdb/title.basics_{{ ds }}.tsv',
    remote_file='/user/hadoop/imdb/title_basics/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}/title.basics_{{ ds }}.tsv',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_HiveTable_title_ratings = HiveOperator(
    task_id='create_title_ratings_table',
    hql=hiveSQL_create_table_title_ratings,
    hive_cli_conn_id='beeline',
    dag=dag)

create_HiveTable_title_basics = HiveOperator(
    task_id='create_title_basics_table',
    hql=hiveSQL_create_table_title_basics,
    hive_cli_conn_id='beeline',
    dag=dag)

addPartition_HiveTable_title_ratings = HiveOperator(
    task_id='add_partition_title_ratings_table',
    hql=hiveSQL_add_partition_title_ratings,
    hive_cli_conn_id='beeline',
    dag=dag)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'airflow',  # Set the user here
    'depends_on_past': False,
    'start_date': datetime(2018, 5, 6),
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(dag_id='hive_report',
          max_active_runs=1,
          default_args=default_args,
          schedule_interval='@once')

# Performs a query and saves the output as a new Hive table, default.airflowoutput
query = 'CREATE TABLE airflowoutput AS SELECT * FROM lanl.day1 LIMIT 10;'
run_hive_query = HiveOperator(task_id="fetch_data", hql=query, dag=dag)
# fetching tweets
fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                 python_callable=fetching_tweet.main)

# cleaning tweets
cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                 python_callable=cleaning_tweet.main)

# storing tweets into hdfs
storing_tweets = BashOperator(
    task_id="storing_tweets",
    bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")

# load tweets into a hive table -> hive is a data warehouse tool
loading_tweets = HiveOperator(
    task_id="loading_tweets",
    hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets")

# in the terminal, enter hive to start hive, then:
# show tables;
# we should see the empty table tweets

# notice how we add the dependencies between tasks
# t4 << t3 ( = t4.set_upstream(t3) )
# t1 >> t2 ( = t1.set_downstream(t2) )
waiting_for_tweets >> fetching_tweets >> cleaning_tweets >> storing_tweets >> loading_tweets

# to test it, run the following in a sandbox environment
# $ airflow test twitter_dag storing_tweets 2020-01-01
# to test the loading of tweets, enter the following command
# $ airflow test twitter_dag loading_tweets 2020-01-01
    fs_conn_id='fs_default',
    filepath='/home/airflow/airflow_files/data.csv',
    poke_interval=15,
    dag=dag)

# Initialise a PythonOperator to execute the fetching_tweet.py script
fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task',
                                     python_callable=fetching_tweet.main,
                                     dag=dag)

# Initialise another PythonOperator to execute the cleaning_tweet.py script
cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task',
                                     python_callable=cleaning_tweet.main,
                                     dag=dag)

# Initialise a BashOperator to upload the file into HDFS
filename = 'data_cleaned.csv'
load_into_hdfs_task = BashOperator(
    task_id='load_into_hdfs_task',
    bash_command='hadoop fs -put -f ' + LOCAL_DIR + filename + ' ' + HDFS_DIR,
    dag=dag)

# Initialise a HiveOperator to transfer data from HDFS to HIVE table
load_into_hive_task = HiveOperator(
    task_id='transfer_into_hive_task',
    hql="LOAD DATA INPATH '" + HDFS_DIR + filename +
        "' INTO TABLE tweets PARTITION(dt='" + dt + "')",
    dag=dag)

waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> load_into_hdfs_task >> load_into_hive_task
t1 = HiveOperator(
    task_id='create_external_table',
    hql="""CREATE EXTERNAL TABLE IF NOT EXISTS traffic_csv (
        `DT` STRING,
        `TM` STRING,
        `BOROUGH` STRING,
        `ZIP CODE` STRING,
        `LATITUDE` STRING,
        `LONGITUDE` STRING,
        `LOCATION` STRING,
        `ON STREET NAME` STRING,
        `CROSS STREET NAME` STRING,
        `OFF STREET NAME` STRING,
        `NUMBER OF PERSONS INJURED` STRING,
        `NUMBER OF PERSONS KILLED` STRING,
        `NUMBER OF PEDESTRIANS INJURED` STRING,
        `NUMBER OF PEDESTRIANS KILLED` STRING,
        `NUMBER OF CYCLIST INJURED` STRING,
        `NUMBER OF CYCLIST KILLED` STRING,
        `NUMBER OF MOTORIST INJURED` STRING,
        `NUMBER OF MOTORIST KILLED` STRING,
        `CONTRIBUTING FACTOR VEHICLE 1` STRING,
        `CONTRIBUTING FACTOR VEHICLE 2` STRING,
        `CONTRIBUTING FACTOR VEHICLE 3` STRING,
        `CONTRIBUTING FACTOR VEHICLE 4` STRING,
        `CONTRIBUTING FACTOR VEHICLE 5` STRING,
        `UNIQUE KEY` STRING,
        `VEHICLE TYPE CODE 1` STRING,
        `VEHICLE TYPE CODE 2` STRING,
        `VEHICLE TYPE CODE 3` STRING,
        `VEHICLE TYPE CODE 4` STRING,
        `VEHICLE TYPE CODE 5` STRING)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    LOCATION '/tmp/data';""",
    dag=dag)
import time, requests, json
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from airflow.operators.dummy_operator import DummyOperator

default_args = {
    'owner': 'Rick wu',
    'start_date': datetime(2100, 1, 1, 0, 0),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# schedule_interval is a DAG argument, not a task default, so it goes on the DAG
with DAG('hql1', default_args=default_args, schedule_interval='@daily') as dag:
    # "SELECT * FROM item_mart LIMIT 10",
    hql_job_1 = HiveOperator(hql="select * from default.item_mart limit 10",
                             hive_cli_conn_id="hive_local",
                             schema='default',
                             hiveconf_jinja_translate=True,
                             task_id='hql_job_1',
                             dag=dag)
    do_nothing = DummyOperator(task_id='no_do_nothing')

    hql_job_1 >> do_nothing
def test_hive_dryrun(self):
    t = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    t.dry_run()
def test_hive(self):
    t = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
# The steps are explicitly coded here, but you could build them dynamically with
# a simple for loop, because only the step id and the dimension name change
# (a sketch of that loop follows this example). Notice that customer and product
# are processed independently and in parallel in a single dag.
dag = airflow.DAG('process_hive_dwh',
                  schedule_interval="@daily",
                  dagrun_timeout=timedelta(minutes=60),
                  template_searchpath=tmpl_search_path,
                  default_args=args,
                  max_active_runs=10)

customer_step_1 = HiveOperator(hql='customer/step_1.hql',
                               hive_cli_conn_id='hive_staging',
                               schema='default',
                               hiveconf_jinja_translate=True,
                               task_id='customer_step_1',
                               dag=dag)

customer_step_2 = HiveOperator(hql='customer/step_2.hql',
                               hive_cli_conn_id='hive_staging',
                               schema='default',
                               hiveconf_jinja_translate=True,
                               task_id='customer_step_2',
                               dag=dag)

product_step_1 = HiveOperator(hql='product/step_1.hql',
                              hive_cli_conn_id='hive_staging',
                              schema='default',
                              hiveconf_jinja_translate=True,
                              task_id='product_step_1',
                              dag=dag)
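# A sketch of the dynamic construction suggested in the comment above
# (hypothetical, assuming two steps per dimension as in the explicit tasks):
for dim in ('customer', 'product'):
    prev = None
    for step in (1, 2):
        task = HiveOperator(hql='{0}/step_{1}.hql'.format(dim, step),
                            hive_cli_conn_id='hive_staging',
                            schema='default',
                            hiveconf_jinja_translate=True,
                            task_id='{0}_step_{1}'.format(dim, step),
                            dag=dag)
        if prev is not None:
            prev >> task
        prev = task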
}

def read_config():
    with open('config.yaml', 'r') as stream:
        return yaml.safe_load(stream)

my_config = read_config()

dag = DAG(my_config['dag_id'], schedule_interval='@once', default_args=args)

analyze_log = HiveOperator(hql='hive_task/EC_data_analysis.hql',
                           hive_cli_conn_id=my_config['hive_connection'],
                           schema=my_config['hive_schema'],
                           hiveconf_jinja_translate=True,
                           task_id='analyze_log',
                           params={
                               'input_path': my_config['input_path'],
                               'output_path': my_config['output_path']
                           },
                           dag=dag)

merge_files = BashOperator(
    task_id='merge_files',
    dag=dag,
    bash_command='{{ params.project_home }}/shell_task/table_to_csv_one.sh '
                 '{{ params.project_home }}/shell_task/table_to_csv_one.sh '
                 '{{ params.tmp_dir }} {{ params.output_path }} {{ params.hive_schema }}',
    params={
        'project_home': my_config['project_home'],
        'tmp_dir': my_config['tmp_dir'],
        'output_path': my_config['output_path'],
        'hive_schema': my_config['hive_schema']
# Step 2: Move the sku_data csv file to hdfs storage
move_to_hdfs = BashOperator(task_id="move_to_hdfs", bash_command="""
    hdfs dfs -mkdir -p /dim_sku && \
    hdfs dfs -put -f $AIRFLOW_HOME/dags/files/sku_data.csv /dim_sku
""")

# Step 3: Create a hive table on our sku_data
creating_sku_table = HiveOperator(task_id="creating_sku_table",
                                  hive_cli_conn_id="hive_conn",
                                  hql="""
    CREATE EXTERNAL TABLE IF NOT EXISTS dim_sku(
        asin STRING,
        title STRING,
        price DOUBLE,
        brand STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    STORED AS TEXTFILE
""")

processing_sku_data = SparkSubmitOperator(
    task_id="processing_sku_data",
    conn_id="spark_conn",
    application="/usr/local/airflow/dags/scripts/dim_sku_processing.py",
    verbose=False)

unzip_file_store_as_csv >> move_to_hdfs >> creating_sku_table >> processing_sku_data
# note: the leading space in hdfs_dir separates the source and destination
# arguments when the strings are concatenated into the bash command below
hdfs_dir = " /tmp/"

for channel in to_channels:
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
    load_to_hdfs.set_upstream(analyze_tweets)
indexed_files = [(int(fn.split('.')[0]), fn) for fn in only_files]
sorted_files = sorted(indexed_files, key=lambda tup: tup[0])
# print(sorted_files)

# dictionary storing each tag_id and its datatype
tag_and_datatype_dict = {}

# build the sequentially-executed chain of DAG tasks
tail_etl_task = None
for tp in sorted_files:
    file_name = tp[1]
    etl_name = tp[1].split('.')[1]
    etl_task = HiveOperator(task_id='user_tag_etl__' + etl_name,
                            hql=file_name,
                            hive_cli_conn_id=HIVE_CONN_ID,
                            dag=dag)
    if tail_etl_task is not None:
        etl_task.set_upstream(tail_etl_task)
    tail_etl_task = etl_task
    collect_tag_id_and_datatype_from_sql_file(file_name)

print('tag_and_datatype_dict: {}'.format(tag_and_datatype_dict))

user_tag_to_wide_sql = '''
drop table if exists user_tag_wide;
create table user_tag_wide