def create_operator_array(hql, hive_table):
    step1 = HiveOperator(
        hql='DROP TABLE IF EXISTS dv_temp.{0}_temp'.format(hive_table),
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='drop_{0}'.format(hive_table),
        params={'hive_table': hive_table},
        dag=dag)

    step2 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='enddate_{0}'.format(hive_table),
        params={'hive_table': hive_table},
        dag=dag)

    step3 = HiveOperator(
        hql='INSERT OVERWRITE TABLE dv_raw.{0} SELECT * FROM dv_temp.{0}_temp'.format(hive_table),
        hive_cli_conn_id='hive_datavault_temp',
        schema='dv_temp',
        task_id='copyback_{0}'.format(hive_table),
        dag=dag)

    sats_done >> step1
    step1 >> step2 >> step3
    step3 >> enddating_done
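# Hypothetical usage sketch for create_operator_array above (enddate_hqls is an
# assumed name, not from the original source): build one drop/enddate/copyback
# chain per satellite table.
for hive_table, enddate_hql in enddate_hqls.items():
    create_operator_array(enddate_hql, hive_table)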
def test_hive_dryrun(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    op = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    op.dry_run()

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=',
        '-hiveconf', 'airflow.ctx.task_id=',
        '-hiveconf', 'airflow.ctx.execution_date=',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-f', '/tmp/airflow_hiveop_tst/tmptst'
    ]

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def setUp(self):
    self._upload_dataframe()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.database = 'airflow'
    self.table = 'hive_server_hook'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        a int,
        b int)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
    OVERWRITE INTO TABLE {{ params.table }};
    """
    self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)]
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'csv_path': self.local_path
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.next_day = (DEFAULT_DATE + datetime.timedelta(days=1)).isoformat()[:10]
    self.database = 'airflow'
    self.partition_by = 'ds'
    self.table = 'static_babynames_partitioned'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        state string,
        year string,
        name string,
        gender string,
        num int)
    PARTITIONED BY ({{ params.partition_by }} string);
    ALTER TABLE {{ params.table }}
    ADD PARTITION({{ params.partition_by }}='{{ ds }}');
    """
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'partition_by': self.partition_by
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hiveconf_jinja_translate(self):
    hql = "SELECT ${num_col} FROM ${hiveconf:table};"
    t = HiveOperator(
        hiveconf_jinja_translate=True,
        task_id='dry_run_basic_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(t.hql, "SELECT {{ num_col }} FROM {{ table }};")
def test_beeline(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=test_dag_id',
        '-hiveconf', 'airflow.ctx.task_id=beeline_hql',
        '-hiveconf', 'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=airflow',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-hiveconf', 'mapred.job.name=test_job_name',
        '-f', '/tmp/airflow_hiveop_tst/tmptst'
    ]

    op = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag,
        mapred_job_name="test_job_name")
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def test_beeline(self):
    t = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hive_queues(self):
    t = HiveOperator(
        task_id='test_hive_queues',
        hql=self.hql,
        mapred_queue='default',
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_hive_queues',
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def create_object(hql, tablename, upstream, downstream):
    t = HiveOperator(
        task_id=tablename,
        hive_cli_conn_id='hive_datavault_raw',
        schema=DATAVAULT,
        hql=hql,
        dag=dag)
    upstream >> t
    t >> downstream
def create_table(hql, tablename, upstream, downstream):
    t = HiveOperator(
        task_id='table_{0}'.format(tablename),
        hive_cli_conn_id='hive_datavault_raw',
        schema=DATAVAULT,
        hql=hql,
        dag=dag)
    upstream >> t
    t >> downstream
def create_operator(hql, hive_table, prev_id, next_id):
    t = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='star_{0}'.format(hive_table),
        dag=dag)
    t >> next_id
    if prev_id is not None:
        prev_id >> t
    return t  # return the operator so a caller can pass it as prev_id for the next table
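# Hypothetical usage sketch for create_operator above (star_hqls and loading_done
# are assumed names, not from the original source): thread prev_id through the
# loop so the star-schema loads run one after another.
prev = None
for star_hql, star_table in star_hqls:
    prev = create_operator(star_hql, star_table, prev, loading_done)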
def test_hiveconf(self):
    hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
    t = HiveOperator(
        hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
        task_id='dry_run_basic_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(
        t.hql, "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
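# A hedged companion sketch (not from the original suite): the same query with
# jinja translation enabled. Based on test_hiveconf_jinja_translate above, the
# ${hiveconf:...} markers should be rewritten to Jinja, while test_hiveconf shows
# that without the flag they are left intact for the Hive CLI and the hiveconfs
# dict supplies the -hiveconf values at run time.
def test_hiveconf_with_jinja_translate_sketch(self):
    hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
    t = HiveOperator(
        hiveconf_jinja_translate=True,
        hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
        task_id='hiveconf_translated_hql',
        hql=hql,
        dag=self.dag)
    t.prepare_template()
    self.assertEqual(t.hql, "SELECT * FROM {{ table }} PARTITION ({{ day }});")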
def test_hive_airflow_default_config_queue_override(self):
    specific_mapred_queue = 'default'
    t = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue=specific_mapred_queue,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)
    self.assertEqual(t.get_hook().mapred_queue, specific_mapred_queue)
def create_link_operator(hql, hive_table):
    t1 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id=hive_table,
        dag=dag)
    hubs_done >> t1
    t1 >> links_done
    return t1
def load_link(hql, hive_table):
    _, table = hive_table.split('.')
    t1 = HiveOperator(
        hql=hql,
        hive_cli_conn_id='hive_datavault_raw',
        schema='dv_raw',
        task_id='load_{0}'.format(hive_table),
        dag=dag)
    staging_done >> t1 >> loading_done
    return t1
def test_hive_airflow_default_config_queue(self):
    t = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)

    # just check that the correct default value in test_default.cfg is used
    test_config_hive_mapred_queue = conf.get('hive', 'default_hive_mapred_queue')
    self.assertEqual(t.get_hook().mapred_queue, test_config_hive_mapred_queue)
def test_mapred_job_name(self, mock_get_hook):
    mock_hook = mock.MagicMock()
    mock_get_hook.return_value = mock_hook
    t = HiveOperator(task_id='test_mapred_job_name', hql=self.hql, dag=self.dag)

    fake_execution_date = timezone.datetime(2018, 6, 19)
    fake_ti = TaskInstance(task=t, execution_date=fake_execution_date)
    fake_ti.hostname = 'fake_hostname'
    fake_context = {'ti': fake_ti}

    t.execute(fake_context)
    self.assertEqual(
        "Airflow HiveOperator task for {}.{}.{}.{}".format(
            fake_ti.hostname, self.dag.dag_id, t.task_id,
            fake_execution_date.isoformat()),
        mock_hook.mapred_job_name)
    hdfs_conn_id='hdfs',
    dag=dag,
)

hdfs_put_title_basics = HdfsPutFileOperator(
    task_id='upload_title_basics_to_hdfs',
    local_file='/home/airflow/imdb/title.basics_{{ ds }}.tsv',
    remote_file='/user/hadoop/imdb/title_basics/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}/title.basics_{{ ds }}.tsv',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_HiveTable_title_ratings = HiveOperator(
    task_id='create_title_ratings_table',
    hql=hiveSQL_create_table_title_ratings,
    hive_cli_conn_id='beeline',
    dag=dag)

create_HiveTable_title_basics = HiveOperator(
    task_id='create_title_basics_table',
    hql=hiveSQL_create_table_title_basics,
    hive_cli_conn_id='beeline',
    dag=dag)

addPartition_HiveTable_title_ratings = HiveOperator(
    task_id='add_partition_title_ratings_table',
    hql=hiveSQL_add_partition_title_ratings,
    hive_cli_conn_id='beeline',
    dag=dag)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.hive_operator import HiveOperator

default_args = {
    'owner': 'airflow',  # Set the user here
    'depends_on_past': False,
    'start_date': datetime(2018, 5, 6),
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(dag_id='hive_report',
          max_active_runs=1,
          default_args=default_args,
          schedule_interval='@once')

# Performs a query and saves the output as a new Hive table, default.airflowoutput
query = 'CREATE TABLE airflowoutput AS SELECT * FROM lanl.day1 LIMIT 10;'
run_hive_query = HiveOperator(task_id="fetch_data", hql=query, dag=dag)
# fetching tweets
fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                 python_callable=fetching_tweet.main)

# cleaning tweets
cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                 python_callable=cleaning_tweet.main)

# storing tweets into hdfs
storing_tweets = BashOperator(
    task_id="storing_tweets",
    bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/")

# load tweets into a hive table -> hive is a data warehouse tool
loading_tweets = HiveOperator(
    task_id="loading_tweets",
    hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets")

# in the terminal, enter hive to start hive, then:
# show tables;
# we should see the empty table tweets

# notice how we add the dependencies between tasks
# t4 << t3 ( = t4.set_upstream(t3) )
# t1 >> t2 ( = t1.set_downstream(t2) )
waiting_for_tweets >> fetching_tweets >> cleaning_tweets >> storing_tweets >> loading_tweets

# to test it, run the following in a sandbox environment
# $ airflow test twitter_dag storing_tweets 2020-01-01
# to test the loading of tweets, enter the following command
# $ airflow test twitter_dag loading_tweets 2020-01-01
    fs_conn_id='fs_default',
    filepath='/home/airflow/airflow_files/data.csv',
    poke_interval=15,
    dag=dag)

# Initialise a PythonOperator to execute the fetching_tweet.py script
fetching_tweet_task = PythonOperator(task_id='fetching_tweet_task',
                                     python_callable=fetching_tweet.main,
                                     dag=dag)

# Initialise another PythonOperator to execute the cleaning_tweet.py script
cleaning_tweet_task = PythonOperator(task_id='cleaning_tweet_task',
                                     python_callable=cleaning_tweet.main,
                                     dag=dag)

# Initialise a BashOperator to upload the file into HDFS
filename = 'data_cleaned.csv'
load_into_hdfs_task = BashOperator(
    task_id='load_into_hdfs_task',
    bash_command='hadoop fs -put -f ' + LOCAL_DIR + filename + ' ' + HDFS_DIR,
    dag=dag)

# Initialise a HiveOperator to transfer data from HDFS to HIVE table
load_into_hive_task = HiveOperator(
    task_id='transfer_into_hive_task',
    hql="LOAD DATA INPATH '" + HDFS_DIR + filename +
        "' INTO TABLE tweets PARTITION(dt='" + dt + "')",
    dag=dag)

waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> load_into_hdfs_task >> load_into_hive_task
t1 = HiveOperator(
    task_id='create_external_table',
    hql="""CREATE EXTERNAL TABLE IF NOT EXISTS traffic_csv (
        `DT` STRING,
        `TM` STRING,
        `BOROUGH` STRING,
        `ZIP CODE` STRING,
        `LATITUDE` STRING,
        `LONGITUDE` STRING,
        `LOCATION` STRING,
        `ON STREET NAME` STRING,
        `CROSS STREET NAME` STRING,
        `OFF STREET NAME` STRING,
        `NUMBER OF PERSONS INJURED` STRING,
        `NUMBER OF PERSONS KILLED` STRING,
        `NUMBER OF PEDESTRIANS INJURED` STRING,
        `NUMBER OF PEDESTRIANS KILLED` STRING,
        `NUMBER OF CYCLIST INJURED` STRING,
        `NUMBER OF CYCLIST KILLED` STRING,
        `NUMBER OF MOTORIST INJURED` STRING,
        `NUMBER OF MOTORIST KILLED` STRING,
        `CONTRIBUTING FACTOR VEHICLE 1` STRING,
        `CONTRIBUTING FACTOR VEHICLE 2` STRING,
        `CONTRIBUTING FACTOR VEHICLE 3` STRING,
        `CONTRIBUTING FACTOR VEHICLE 4` STRING,
        `CONTRIBUTING FACTOR VEHICLE 5` STRING,
        `UNIQUE KEY` STRING,
        `VEHICLE TYPE CODE 1` STRING,
        `VEHICLE TYPE CODE 2` STRING,
        `VEHICLE TYPE CODE 3` STRING,
        `VEHICLE TYPE CODE 4` STRING,
        `VEHICLE TYPE CODE 5` STRING)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    LOCATION '/tmp/data';""",
    dag=dag)
import time, requests, json
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from airflow.operators.dummy_operator import DummyOperator

default_args = {
    'owner': 'Rick wu',
    'start_date': datetime(2100, 1, 1, 0, 0),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# schedule_interval is a DAG argument, not a task default, so it goes on the DAG
with DAG('hql1', default_args=default_args, schedule_interval='@daily') as dag:
    # "SELECT * FROM item_mart LIMIT 10",
    hql_job_1 = HiveOperator(hql="select * from default.item_mart limit 10",
                             hive_cli_conn_id="hive_local",
                             schema='default',
                             hiveconf_jinja_translate=True,
                             task_id='hql_job_1',
                             dag=dag)
    do_nothing = DummyOperator(task_id='no_do_nothing')

    hql_job_1 >> do_nothing
def test_hive_dryrun(self):
    t = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    t.dry_run()
def test_hive(self):
    t = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
# The steps are explicitly coded here, but you could build them dynamically with
# a simple for loop, because only the step id and the dimension name change
# (a sketch of that loop follows this example). Notice that customer and product
# are processed independently and in parallel in a single dag.
dag = airflow.DAG('process_hive_dwh',
                  schedule_interval="@daily",
                  dagrun_timeout=timedelta(minutes=60),
                  template_searchpath=tmpl_search_path,
                  default_args=args,
                  max_active_runs=10)

customer_step_1 = HiveOperator(hql='customer/step_1.hql',
                               hive_cli_conn_id='hive_staging',
                               schema='default',
                               hiveconf_jinja_translate=True,
                               task_id='customer_step_1',
                               dag=dag)

customer_step_2 = HiveOperator(hql='customer/step_2.hql',
                               hive_cli_conn_id='hive_staging',
                               schema='default',
                               hiveconf_jinja_translate=True,
                               task_id='customer_step_2',
                               dag=dag)

product_step_1 = HiveOperator(hql='product/step_1.hql',
                              hive_cli_conn_id='hive_staging',
                              schema='default',
                              hiveconf_jinja_translate=True,
                              task_id='product_step_1',
                              dag=dag)
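# A sketch of the dynamic construction suggested in the comment above
# (hypothetical, assuming two steps per dimension as in the explicit tasks):
for dim in ('customer', 'product'):
    prev = None
    for step in (1, 2):
        task = HiveOperator(hql='{0}/step_{1}.hql'.format(dim, step),
                            hive_cli_conn_id='hive_staging',
                            schema='default',
                            hiveconf_jinja_translate=True,
                            task_id='{0}_step_{1}'.format(dim, step),
                            dag=dag)
        if prev is not None:
            prev >> task
        prev = task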
}

def read_config():
    with open('config.yaml', 'r') as stream:
        return yaml.safe_load(stream)

my_config = read_config()

dag = DAG(my_config['dag_id'], schedule_interval='@once', default_args=args)

analyze_log = HiveOperator(hql='hive_task/EC_data_analysis.hql',
                           hive_cli_conn_id=my_config['hive_connection'],
                           schema=my_config['hive_schema'],
                           hiveconf_jinja_translate=True,
                           task_id='analyze_log',
                           params={
                               'input_path': my_config['input_path'],
                               'output_path': my_config['output_path']
                           },
                           dag=dag)

merge_files = BashOperator(
    task_id='merge_files',
    dag=dag,
    bash_command='{{ params.project_home }}/shell_task/table_to_csv_one.sh '
                 '{{ params.project_home }}/shell_task/table_to_csv_one.sh '
                 '{{ params.tmp_dir }} {{ params.output_path }} {{ params.hive_schema }}',
    params={
        'project_home': my_config['project_home'],
        'tmp_dir': my_config['tmp_dir'],
        'output_path': my_config['output_path'],
        'hive_schema': my_config['hive_schema']
# Step 2: Move the sku_data csv file to hdfs storage
move_to_hdfs = BashOperator(task_id="move_to_hdfs", bash_command="""
    hdfs dfs -mkdir -p /dim_sku && \
    hdfs dfs -put -f $AIRFLOW_HOME/dags/files/sku_data.csv /dim_sku
""")

# Step 3: Create a hive table on our sku_data
creating_sku_table = HiveOperator(task_id="creating_sku_table",
                                  hive_cli_conn_id="hive_conn",
                                  hql="""
    CREATE EXTERNAL TABLE IF NOT EXISTS dim_sku(
        asin STRING,
        title STRING,
        price DOUBLE,
        brand STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    STORED AS TEXTFILE
""")

processing_sku_data = SparkSubmitOperator(
    task_id="processing_sku_data",
    conn_id="spark_conn",
    application="/usr/local/airflow/dags/scripts/dim_sku_processing.py",
    verbose=False)

unzip_file_store_as_csv >> move_to_hdfs >> creating_sku_table >> processing_sku_data
# note: the leading space in hdfs_dir separates the source and destination
# arguments when the strings are concatenated into the bash command below
hdfs_dir = " /tmp/"

for channel in to_channels:
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
    load_to_hdfs.set_upstream(analyze_tweets)
indexed_files = [(int(fn.split('.')[0]), fn) for fn in only_files]
sorted_files = sorted(indexed_files, key=lambda tup: tup[0])
# print(sorted_files)

# dictionary storing each tag_id and its datatype
tag_and_datatype_dict = {}

# build the sequentially-executed chain of DAG tasks
tail_etl_task = None
for tp in sorted_files:
    file_name = tp[1]
    etl_name = tp[1].split('.')[1]
    etl_task = HiveOperator(task_id='user_tag_etl__' + etl_name,
                            hql=file_name,
                            hive_cli_conn_id=HIVE_CONN_ID,
                            dag=dag)
    if tail_etl_task is not None:
        etl_task.set_upstream(tail_etl_task)
    tail_etl_task = etl_task
    collect_tag_id_and_datatype_from_sql_file(file_name)

print('tag_and_datatype_dict: {}'.format(tag_and_datatype_dict))

user_tag_to_wide_sql = '''
drop table if exists user_tag_wide;
create table user_tag_wide