# patch decorators restored from the mock_* arguments (same targets as the
# with-patch blocks in test_hive_to_mysql below)
@patch('airflow.operators.hive_to_mysql.MySqlHook')
@patch('airflow.operators.hive_to_mysql.HiveServer2Hook')
def test_execute_mysql_preoperator(self, mock_hive_hook, mock_mysql_hook):
    self.kwargs.update(dict(mysql_preoperator='preoperator'))

    HiveToMySqlTransfer(**self.kwargs).execute(context={})

    mock_mysql_hook.return_value.run.assert_called_once_with(
        self.kwargs['mysql_preoperator'])
# patch decorators restored from the mock_* arguments
@patch('airflow.operators.hive_to_mysql.MySqlHook')
@patch('airflow.operators.hive_to_mysql.HiveServer2Hook')
def test_execute(self, mock_hive_hook, mock_mysql_hook):
    HiveToMySqlTransfer(**self.kwargs).execute(context={})

    mock_hive_hook.assert_called_once_with(
        hiveserver2_conn_id=self.kwargs['hiveserver2_conn_id'])
    mock_hive_hook.return_value.get_records.assert_called_once_with(
        'sql', hive_conf={})
    mock_mysql_hook.assert_called_once_with(
        mysql_conn_id=self.kwargs['mysql_conn_id'])
    mock_mysql_hook.return_value.insert_rows.assert_called_once_with(
        table=self.kwargs['mysql_table'],
        rows=mock_hive_hook.return_value.get_records.return_value
    )
# patch decorators restored from the mock_* arguments
@patch('airflow.operators.hive_to_mysql.MySqlHook')
@patch('airflow.operators.hive_to_mysql.HiveServer2Hook')
def test_execute_with_hive_conf(self, mock_hive_hook, mock_mysql_hook):
    context = {}
    self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

    HiveToMySqlTransfer(**self.kwargs).execute(context=context)

    hive_conf = context_to_airflow_vars(context)
    hive_conf.update(self.kwargs['hive_conf'])
    mock_hive_hook.return_value.get_records.assert_called_once_with(
        self.kwargs['sql'], hive_conf=hive_conf)
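# The three kwargs-driven tests above imply a setUp fixture along these lines;
# the literal 'sql' is pinned by the assertion in test_execute, while the other
# values here are assumptions for illustration.
def setUp(self):
    self.kwargs = dict(
        sql='sql',
        mysql_table='table',
        hiveserver2_conn_id='hiveserver2_default',
        mysql_conn_id='mysql_default',
        task_id='test_hive_to_mysql',
    )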
def test_hive_to_mysql(self):
    test_hive_results = 'test_hive_results'

    mock_hive_hook = MockHiveServer2Hook()
    mock_hive_hook.get_records = MagicMock(return_value=test_hive_results)

    mock_mysql_hook = MockMySqlHook()
    mock_mysql_hook.run = MagicMock()
    mock_mysql_hook.insert_rows = MagicMock()

    with patch('airflow.operators.hive_to_mysql.HiveServer2Hook',
               return_value=mock_hive_hook):
        with patch('airflow.operators.hive_to_mysql.MySqlHook',
                   return_value=mock_mysql_hook):
            op = HiveToMySqlTransfer(
                mysql_conn_id='airflow_db',
                task_id='hive_to_mysql_check',
                sql="""
                SELECT name
                FROM airflow.static_babynames
                LIMIT 100
                """,
                mysql_table='test_static_babynames',
                mysql_preoperator=[
                    'DROP TABLE IF EXISTS test_static_babynames;',
                    'CREATE TABLE test_static_babynames (name VARCHAR(500))',
                ],
                dag=self.dag)
            op.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

    raw_select_name_query = mock_hive_hook.get_records.call_args_list[0][0][0]
    actual_select_name_query = re.sub(r'\s{2,}', ' ', raw_select_name_query).strip()
    expected_select_name_query = 'SELECT name FROM airflow.static_babynames LIMIT 100'
    self.assertEqual(expected_select_name_query, actual_select_name_query)

    actual_hive_conf = mock_hive_hook.get_records.call_args_list[0][1]['hive_conf']
    expected_hive_conf = {
        'airflow.ctx.dag_owner': 'airflow',
        'airflow.ctx.dag_id': 'test_dag_id',
        'airflow.ctx.task_id': 'hive_to_mysql_check',
        'airflow.ctx.execution_date': '2015-01-01T00:00:00+00:00',
    }
    self.assertEqual(expected_hive_conf, actual_hive_conf)

    expected_mysql_preoperator = [
        'DROP TABLE IF EXISTS test_static_babynames;',
        'CREATE TABLE test_static_babynames (name VARCHAR(500))',
    ]
    mock_mysql_hook.run.assert_called_with(expected_mysql_preoperator)

    mock_mysql_hook.insert_rows.assert_called_with(
        table='test_static_babynames',
        rows=test_hive_results)
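# Minimal stand-ins for the MockHiveServer2Hook / MockMySqlHook helpers the
# test above instantiates (a hypothetical sketch: the test immediately
# replaces the relevant methods with MagicMocks, so bare stubs suffice).
class MockHiveServer2Hook:
    def get_records(self, sql, hive_conf=None):
        raise NotImplementedError

class MockMySqlHook:
    def run(self, sql):
        raise NotImplementedError

    def insert_rows(self, table, rows):
        raise NotImplementedError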
def match_datatransfer_operator(self):
    source_db_type = self.get_conn_db_type(self.source_conn_id)
    destination_db_type = self.get_conn_db_type(self.destination_conn_id)
    if source_db_type == 'mysql' and destination_db_type == 'hiveserver2':
        return MySqlToHiveTransfer(
            sql=self.sql,
            hive_table=self.destination_table,
            create=False,
            recreate=False,
            partition=None,
            delimiter=chr(1),
            mysql_conn_id=self.source_conn_id,
            hive_cli_conn_id=self.destination_conn_id,
            tblproperties=None)
    if source_db_type == 'mssql' and destination_db_type == 'hiveserver2':
        return MsSqlToHiveTransfer(
            sql=self.sql,
            hive_table=self.destination_table,
            create=False,
            recreate=False,
            partition=None,
            delimiter=chr(1),
            # MsSqlToHiveTransfer takes mssql_conn_id, not mysql_conn_id
            mssql_conn_id=self.source_conn_id,
            hive_cli_conn_id=self.destination_conn_id,
            tblproperties=None)
    if source_db_type == 'hiveserver2' and destination_db_type == 'mysql':
        return HiveToMySqlTransfer(
            sql=self.sql,
            mysql_table=self.destination_table,
            hiveserver2_conn_id=self.source_conn_id,
            mysql_conn_id=self.destination_conn_id,
            mysql_preoperator=None,
            mysql_postoperator=None,
            bulk_load=False)
    if source_db_type == 'oracle' and destination_db_type == 'oracle':
        return OracleToOracleTransfer(
            source_sql=self.sql,
            destination_table=self.destination_table,
            oracle_source_conn_id=self.source_conn_id,
            oracle_destination_conn_id=self.destination_conn_id,
            source_sql_params=None,
            rows_chunk=5000)
    return GenericTransfer(
        sql=self.sql,
        destination_table=self.destination_table,
        source_conn_id=self.source_conn_id,
        destination_conn_id=self.destination_conn_id,
        preoperator=None)
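# Hypothetical usage of the dispatcher above; the wrapper class name and its
# constructor arguments are assumptions, only match_datatransfer_operator()
# comes from the snippet.
transfer = DataTransfer(
    source_conn_id='hiveserver2_default',
    destination_conn_id='mysql_default',
    sql='SELECT * FROM source_table',
    destination_table='target_table',
).match_datatransfer_operator()
# For this conn-id pair the call returns a configured HiveToMySqlTransfer.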
# patch decorators restored from the mock_* arguments; the NamedTemporaryFile
# target for mock_tmp_file is inferred from the operator's bulk-load path
@patch('airflow.operators.hive_to_mysql.MySqlHook')
@patch('airflow.operators.hive_to_mysql.NamedTemporaryFile')
@patch('airflow.operators.hive_to_mysql.HiveServer2Hook')
def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file, mock_mysql_hook):
    type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
    context = {}
    self.kwargs.update(dict(bulk_load=True))

    HiveToMySqlTransfer(**self.kwargs).execute(context=context)

    mock_tmp_file.assert_called_once_with()
    mock_hive_hook.return_value.to_csv.assert_called_once_with(
        self.kwargs['sql'],
        mock_tmp_file.return_value.name,
        delimiter='\t',
        lineterminator='\n',
        output_header=False,
        hive_conf=context_to_airflow_vars(context))
    mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
        table=self.kwargs['mysql_table'],
        tmp_file=mock_tmp_file.return_value.name)
    mock_tmp_file.return_value.close.assert_called_once_with()
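# Sketch of the code path the bulk_load test above exercises, reconstructed
# from its assertions rather than copied from the operator source: with
# bulk_load=True the Hive result is streamed to a temp file and handed to
# MySqlHook.bulk_load instead of insert_rows.
from tempfile import NamedTemporaryFile

def _execute_bulk_load_sketch(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    tmp_file = NamedTemporaryFile()
    hive.to_csv(self.sql, tmp_file.name,
                delimiter='\t',
                lineterminator='\n',
                output_header=False,
                hive_conf=context_to_airflow_vars(context))
    mysql.bulk_load(table=self.mysql_table, tmp_file=tmp_file.name)
    tmp_file.close()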
def test_hive_to_mysql(self):
    op = HiveToMySqlTransfer(
        mysql_conn_id='airflow_db',
        task_id='hive_to_mysql_check',
        sql="""
        SELECT name
        FROM airflow.static_babynames
        LIMIT 100
        """,
        mysql_table='test_static_babynames',
        mysql_preoperator=[
            'DROP TABLE IF EXISTS test_static_babynames;',
            'CREATE TABLE test_static_babynames (name VARCHAR(500))',
        ],
        dag=self.dag)
    op.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    dag=dag)

insert_bdm_dim_metrics = HiveToMySqlTransfer(
    task_id='insert_bdm_dim_metrics',
    sql="""
        select
            null,
            dt,
            area_name,
            bdm_id,
            bdm_name,
            hbdm_name,
            take_time_avg,
            delivery_time_avg,
            score_peisong_avg,
            cancel_order_cnt,
            user_cancel_order_cnt,
            merchant_cancel_order_cnt,
            sys_cancel_order_cnt
        from ofood_bi.ofood_bdm_area_metrics_report
        where dt = '{{ ds }}'
    """,
    mysql_conn_id='mysql_bi',
    mysql_table='ofood_bdm_area_metrics_report',
    dag=dag)

insert_shop_list_metrics = HiveToMySqlTransfer(
""", mysql_conn_id='opay_spread_mysql', dag=dag) promoter_detail_to_msyql = HiveToMySqlTransfer( task_id='promoter_detail_to_msyql', sql=""" SELECT null as id, t.code as code, t.bind_number as phone, t.bind_device as device_id, 0 as register_time, t.bind_time as bind_time, {{ ds_nodash }}, {{ execution_date.hour}}, ip FROM oride_dw.dwd_oride_driver_cheating_detection_hi LATERAL VIEW json_tuple(event_value, 'bind_refferal_code', 'bind_number', 'bind_device_id', 'bind_time') t AS code, bind_number, bind_device, bind_time WHERE dt='{{ ds }}' AND hour='{{ execution_date.strftime("%H") }}' """, mysql_conn_id='opay_spread_mysql', mysql_table='promoter_users_device', dag=dag ) promoter_hour_to_msyql = HiveToMySqlTransfer( task_id='promoter_hour_to_msyql', sql="""
    ''',
    schema='ofood_bi',
    dag=dag)

insert_mkt_metrics = HiveToMySqlTransfer(
    task_id='insert_mkt_metrics',
    sql="""
        select
            null,
            dt,
            mkt_id,
            mkt_phone,
            bdm_id,
            bdm_name,
            new_user_cnt,
            pay_order_cnt
        from ofood_bi.ofood_mkt_daily_metrics_info
        where dt = '{{ ds }}'
    """,
    mysql_conn_id='mysql_bi',
    mysql_table='ofood_mkt_daily_metrics_info',
    dag=dag)

validate_partition_data >> jh_order_validate_task >> create_mkt_metrics
validate_partition_data >> bd_admin_users_validate_task >> create_mkt_metrics
validate_partition_data >> bd_invitation_info_validate_task >> create_mkt_metrics
create_mkt_metrics >> insert_mkt_metrics
from datetime import timedelta

from airflow import DAG
from airflow.operators.hive_to_mysql import HiveToMySqlTransfer
from airflow.operators.mysql_operator import MySqlOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id='hive_to_mysql',
          schedule_interval='@once',
          default_args=default_args)

t1 = MySqlOperator(
    task_id='drop_table',
    sql='drop table if exists temp_drivers',
    dag=dag)

t2 = HiveToMySqlTransfer(
    task_id='hive_to_mysql',
    mysql_table='temp_drivers',
    mysql_preoperator='create table temp_drivers(col_value varchar(256))',
    sql='select * from temp_drivers',
    dag=dag)

t1 >> t2
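# One-off execution sketch for the DAG above, mirroring the clear()/run()
# pattern used in the tests earlier in this section; assumes a working
# 'mysql_default' connection and default Hive connections.
run_date = days_ago(0)
for task in (t1, t2):
    task.clear(start_date=run_date, end_date=run_date)
    task.run(start_date=run_date, end_date=run_date, ignore_ti_state=True)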
insertoverwrite_final = HiveOperator(
    task_id='insertoverwrite_final',
    hql=hiveSQL_insertoverwrite_final,
    hive_cli_conn_id='beeline',
    dag=dag)

createTableToMysql = MySqlOperator(
    mysql_conn_id='mysql',
    task_id='createTableToMysql',
    sql=hiveSQL_create_mysql,
    database='spotify_data',
    dag=dag)

exportToMysql = HiveToMySqlTransfer(
    task_id='exportToMysql',
    sql="SELECT * FROM final",
    mysql_preoperator='DELETE FROM spotify_data.titledata',
    mysql_table='spotify_data.titledata',
    mysql_conn_id='mysql',
    hiveserver2_conn_id='beeline',
    dag=dag)

########################### C O P Y ###########################

i = 0
if os.path.exists('/home/airflow/requestData/playlist'):
    for filename in os.listdir('/home/airflow/requestData/playlist'):
        if filename.endswith(".json"):
            reqDataCopyPlaylist = HdfsPutFileOperator(
                task_id='requestdata_copy_playlist' + str(i),
                local_file='/home/airflow/requestData/playlist/' + filename,
                remote_file='/user/hadoop/spotify/rawPlaylist/' + filename,
                hdfs_conn_id='hdfs',
                dag=dag,
    bucket_key='okash/okash/{table}/dt={{{{ ds }}}}/hour={{{{ execution_date.strftime("%H") }}}}/'.format(
        table='client'),
    bucket_name='okash',
    timeout=3600,
    dag=dag)

add_client_partitions = HiveOperator(
    task_id='add_client_partitions',
    hql="""
        ALTER TABLE ods_log_client_hi ADD IF NOT EXISTS
        PARTITION (dt = '{{ ds }}', hour = '{{ execution_date.strftime("%H") }}');
    """,
    schema='okash_dw',
    dag=dag)

export_to_mysql = HiveToMySqlTransfer(
    task_id='export_to_mysql',
    sql="""
        --SET mapreduce.job.queuename=root.airflow;
        SELECT
            *,
            from_unixtime(unix_timestamp())
        FROM okash_dw.ods_log_client_hi
        WHERE dt = '{{ ds }}'
        AND hour = '{{ execution_date.strftime("%H") }}'
    """,
    mysql_conn_id='mysql_bi',
    mysql_table='okash_dw.ods_log_client_hi',
    dag=dag)

check_client_file >> add_client_partitions >> export_to_mysql
insert_crm_metrics = HiveToMySqlTransfer(
    task_id='insert_crm_metrics',
    sql="""
        select
            null,
            bd_id,
            dt,
            username,
            area_name,
            shop_id,
            title,
            total_number_of_merchants,
            total_number_of_new_merchants,
            trade_number_of_merchants,
            have_price_number_of_merchants,
            number_of_new_merchants,
            number_of_pay_orders,
            number_of_new_users,
            gmv,
            paid_in_amount,
            platform_subsidies,
            amount_and_first_price,
            net_turnover,
            total_number_of_order,
            number_of_valid_order,
            number_of_cancel_order_before_payment,
            number_of_cancel_order_after_payment,
            total_number_of_invitation_new_users,
            number_of_invitation_new_users,
            roof_plat_coupon,
            roof_coupon
        from ofood_bi.ofood_area_shop_metrics_info
        where dt = '{{ ds }}'
    """,
    mysql_conn_id='mysql_bi',
    mysql_table='ofood_area_shop_metrics_info',
    dag=dag)
order_location_info_to_mysql = HiveToMySqlTransfer(
    task_id='order_location_info_to_mysql',
    sql="""
        select
            null,
            dt,
            order_id,
            user_id,
            driver_id,
            create_time,
            status,
            start_loc,
            end_loc,
            looking_for_a_driver_show,
            successful_order_show,
            accept_order_click,
            rider_arrive_show,
            confirm_arrive_click_arrived,
            pick_up_passengers_sliding_arrived,
            start_ride_show,
            start_ride_sliding,
            complete_the_order_show,
            start_ride_sliding_arrived,
            loc_list
        from oride_dw.dwd_oride_order_location_di
        where country_code = 'nal'
        and dt = '{{ ds }}'
    """,
    mysql_conn_id='mysql_oride_location',
    mysql_table='oride_order_location_info',
    dag=dag)
hive_insert_overwrite_final_address_data >> create_table_remote

# --------------------------------------------------------------------------------
# clear table remote
# --------------------------------------------------------------------------------

delete_from_remote = MySqlOperator(
    task_id='delete_from_remote',
    sql="DELETE FROM final_address_data;",
    mysql_conn_id='mysql_default',
    database='db'
)

create_table_remote >> delete_from_remote

# --------------------------------------------------------------------------------
# copy table to remote
# --------------------------------------------------------------------------------

previous_task = delete_from_remote
for country in country_list:
    hive_to_mysql = HiveToMySqlTransfer(
        task_id='hive_to_mysql_' + country,
        sql="select * from default.final_address_data where country like '" + country + "'",
        hiveserver2_conn_id='hiveserver2',
        mysql_table='db.final_address_data',
        mysql_conn_id='mysql_default',
    )
    previous_task >> hive_to_mysql
    previous_task = hive_to_mysql
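# Variant sketch: routing the country filter through the operator's templated
# `sql` field via `params` instead of Python string concatenation, so the
# rendered query shows up in task rendering; note the value is interpolated
# by Jinja, not SQL-escaped.
hive_to_mysql = HiveToMySqlTransfer(
    task_id='hive_to_mysql_' + country,
    sql="select * from default.final_address_data "
        "where country like '{{ params.country }}'",
    params={'country': country},
    hiveserver2_conn_id='hiveserver2',
    mysql_table='db.final_address_data',
    mysql_conn_id='mysql_default',
)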