def test_get_results_data(self):
    hook = MockHiveServer2Hook()
    query = f"SELECT * FROM {self.table}"
    results = hook.get_results(query, schema=self.database)
    self.assertListEqual(results['data'], [(1, 1), (2, 2)])
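# These tests lean on the suite's MockHiveServer2Hook fixture, whose definition is
# not part of this excerpt. As a hedged sketch (names and details are assumptions,
# not the suite's actual code), it plausibly subclasses HiveServer2Hook, swaps
# get_conn() for a MagicMock, and exposes the cursor as `mock_cursor` so tests can
# assert on the SQL statements that were executed:
#
#     class MockHiveServer2Hook(HiveServer2Hook):
#         def __init__(self, connection_cursor=None):
#             self.mock_cursor = connection_cursor or MockConnectionCursor()
#             self.mock_cursor.iterable = [(1, 1), (2, 2)]  # default result rows
#             self.mock_cursor.execute = mock.MagicMock()
#             conn = mock.MagicMock()
#             conn.cursor.return_value = self.mock_cursor
#             self.get_conn = mock.MagicMock(return_value=conn)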
def test_to_csv(self):
    hook = MockHiveServer2Hook()
    hook._get_results = mock.MagicMock(
        return_value=iter(
            [
                [
                    ('hive_server_hook.a', 'INT_TYPE', None, None, None, None, True),
                    ('hive_server_hook.b', 'INT_TYPE', None, None, None, None, True),
                ],
                (1, 1),
                (2, 2),
            ]
        )
    )
    query = f"SELECT * FROM {self.table}"
    csv_filepath = 'query_results.csv'
    hook.to_csv(
        query,
        csv_filepath,
        schema=self.database,
        delimiter=',',
        lineterminator='\n',
        output_header=True,
        fetch_size=2,
    )
    df = pd.read_csv(csv_filepath, sep=',')
    self.assertListEqual(df.columns.tolist(), self.columns)
    self.assertListEqual(df[self.columns[0]].values.tolist(), [1, 2])
    self.assertEqual(len(df), 2)
def test_get_pandas_df(self):
    hook = MockHiveServer2Hook()
    query = f"SELECT * FROM {self.table}"

    with mock.patch.dict(
        'os.environ',
        {
            'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
            'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
            'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
            'AIRFLOW_CTX_DAG_RUN_ID': '55',
            'AIRFLOW_CTX_DAG_OWNER': 'airflow',
            'AIRFLOW_CTX_DAG_EMAIL': 'test@airflow.com',
        },
    ):
        df = hook.get_pandas_df(query, schema=self.database)

    self.assertEqual(len(df), 2)
    self.assertListEqual(df["hive_server_hook.a"].values.tolist(), [1, 2])

    hook.get_conn.assert_called_with(self.database)
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_id=test_dag_id')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.task_id=HiveHook_3835')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_run_id=55')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_owner=airflow')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_email=test@airflow.com')
def test_get_results_header(self):
    hook = MockHiveServer2Hook()
    query = f"SELECT * FROM {self.table}"
    results = hook.get_results(query, schema=self.database)
    self.assertListEqual([col[0] for col in results['header']], self.columns)
def test_get_records(self):
    hook = MockHiveServer2Hook()
    query = f"SELECT * FROM {self.table}"

    with mock.patch.dict(
        'os.environ',
        {
            'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
            'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
            'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
            'AIRFLOW_CTX_DAG_RUN_ID': '55',
            'AIRFLOW_CTX_DAG_OWNER': 'airflow',
            'AIRFLOW_CTX_DAG_EMAIL': 'test@airflow.com',
        },
    ):
        results = hook.get_records(query, schema=self.database)

    self.assertListEqual(results, [(1, 1), (2, 2)])

    hook.get_conn.assert_called_with(self.database)
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_id=test_dag_id')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.task_id=HiveHook_3835')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_run_id=55')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_owner=airflow')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_email=test@airflow.com')
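# The two context tests above work because HiveServer2Hook reads the AIRFLOW_CTX_*
# environment variables and issues one `set k=v` statement per entry before running
# the query (via get_context_from_env_var() from airflow.utils.operator_helpers,
# assuming the hook's usual implementation). The name mapping amounts to:
#
#     # 'AIRFLOW_CTX_DAG_ID' -> 'airflow.ctx.dag_id', and so on
#     env = {'AIRFLOW_CTX_DAG_ID': 'test_dag_id'}
#     for k, v in env.items():
#         cursor.execute(f"set {k.lower().replace('_ctx_', '.ctx.')}={v}")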
def test_hive_to_mysql(self):
    test_hive_results = 'test_hive_results'

    mock_hive_hook = MockHiveServer2Hook()
    mock_hive_hook.get_records = MagicMock(return_value=test_hive_results)

    mock_mysql_hook = MockMySqlHook()
    mock_mysql_hook.run = MagicMock()
    mock_mysql_hook.insert_rows = MagicMock()

    with patch('airflow.operators.hive_to_mysql.HiveServer2Hook', return_value=mock_hive_hook):
        with patch('airflow.operators.hive_to_mysql.MySqlHook', return_value=mock_mysql_hook):
            op = HiveToMySqlTransfer(
                mysql_conn_id='airflow_db',
                task_id='hive_to_mysql_check',
                sql="""
                    SELECT name
                    FROM airflow.static_babynames
                    LIMIT 100
                """,
                mysql_table='test_static_babynames',
                mysql_preoperator=[
                    'DROP TABLE IF EXISTS test_static_babynames;',
                    'CREATE TABLE test_static_babynames (name VARCHAR(500))',
                ],
                dag=self.dag,
            )
            op.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    raw_select_name_query = mock_hive_hook.get_records.call_args_list[0][0][0]
    actual_select_name_query = re.sub(r'\s{2,}', ' ', raw_select_name_query).strip()
    expected_select_name_query = 'SELECT name FROM airflow.static_babynames LIMIT 100'
    self.assertEqual(expected_select_name_query, actual_select_name_query)

    actual_hive_conf = mock_hive_hook.get_records.call_args_list[0][1]['hive_conf']
    expected_hive_conf = {
        'airflow.ctx.dag_owner': 'airflow',
        'airflow.ctx.dag_id': 'test_dag_id',
        'airflow.ctx.task_id': 'hive_to_mysql_check',
        'airflow.ctx.execution_date': '2015-01-01T00:00:00+00:00',
    }
    self.assertEqual(expected_hive_conf, actual_hive_conf)

    expected_mysql_preoperator = [
        'DROP TABLE IF EXISTS test_static_babynames;',
        'CREATE TABLE test_static_babynames (name VARCHAR(500))',
    ]
    mock_mysql_hook.run.assert_called_with(expected_mysql_preoperator)

    mock_mysql_hook.insert_rows.assert_called_with(table='test_static_babynames', rows=test_hive_results)
def test_get_results_with_hive_conf(self):
    hql = [
        "set key",
        "set airflow.ctx.dag_id",
        "set airflow.ctx.dag_run_id",
        "set airflow.ctx.task_id",
        "set airflow.ctx.execution_date",
    ]

    dag_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID']['env_var_format']
    task_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_TASK_ID']['env_var_format']
    execution_date_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_EXECUTION_DATE'][
        'env_var_format'
    ]
    dag_run_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_RUN_ID'][
        'env_var_format'
    ]

    with mock.patch.dict(
        'os.environ',
        {
            dag_id_ctx_var_name: 'test_dag_id',
            task_id_ctx_var_name: 'test_task_id',
            execution_date_ctx_var_name: 'test_execution_date',
            dag_run_id_ctx_var_name: 'test_dag_run_id',
        },
    ):
        hook = MockHiveServer2Hook()
        hook._get_results = mock.MagicMock(
            return_value=iter(
                [
                    "header",
                    ("value", "test"),
                    ("test_dag_id", "test"),
                    ("test_task_id", "test"),
                    ("test_execution_date", "test"),
                    ("test_dag_run_id", "test"),
                ]
            )
        )
        output = '\n'.join(
            res_tuple[0]
            for res_tuple in hook.get_results(hql=hql, hive_conf={'key': 'value'})['data']
        )

    self.assertIn('value', output)
    self.assertIn('test_dag_id', output)
    self.assertIn('test_task_id', output)
    self.assertIn('test_execution_date', output)
    self.assertIn('test_dag_run_id', output)
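# AIRFLOW_VAR_NAME_FORMAT_MAPPING comes from airflow.utils.operator_helpers and maps
# each context key to its two spellings. Illustrative entry (shape matches the lookups
# above; consult that module for the authoritative contents):
#
#     AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID'] == {
#         'default': 'airflow.ctx.dag_id',         # key used in `set k=v` statements
#         'env_var_format': 'AIRFLOW_CTX_DAG_ID',  # name of the environment variable
#     }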
def test_multi_statements(self):
    sqls = [
        "CREATE TABLE IF NOT EXISTS test_multi_statements (i INT)",
        f"SELECT * FROM {self.table}",
        "DROP TABLE test_multi_statements",
    ]

    hook = MockHiveServer2Hook()

    with mock.patch.dict(
        'os.environ',
        {
            'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
            'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
            'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
            'AIRFLOW_CTX_DAG_RUN_ID': '55',
            'AIRFLOW_CTX_DAG_OWNER': 'airflow',
            'AIRFLOW_CTX_DAG_EMAIL': 'test@airflow.com',
        },
    ):
        results = hook.get_records(sqls, schema=self.database)

    self.assertListEqual(results, [(1, 1), (2, 2)])

    hook.get_conn.assert_called_with(self.database)
    hook.mock_cursor.execute.assert_any_call('CREATE TABLE IF NOT EXISTS test_multi_statements (i INT)')
    hook.mock_cursor.execute.assert_any_call(f'SELECT * FROM {self.table}')
    hook.mock_cursor.execute.assert_any_call('DROP TABLE test_multi_statements')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_id=test_dag_id')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.task_id=HiveHook_3835')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_run_id=55')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_owner=airflow')
    hook.mock_cursor.execute.assert_any_call('set airflow.ctx.dag_email=test@airflow.com')
# assumed patch target for the injected mock_mysql_hook, inferred from the
# (self, mock_mysql_hook) signature and the HiveServer2Hook patch path below
@patch('airflow.providers.apache.hive.operators.hive_to_mysql.MySqlHook')
def test_execute_with_hive_conf(self, mock_mysql_hook):
    context = {}
    mock_hive_hook = MockHiveServer2Hook()
    mock_hive_hook.get_records = MagicMock(return_value='test_hive_results')

    self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

    with patch(
        'airflow.providers.apache.hive.operators.hive_to_mysql.HiveServer2Hook',
        return_value=mock_hive_hook,
    ):
        HiveToMySqlTransferOperator(**self.kwargs).execute(context=context)

    hive_conf = context_to_airflow_vars(context)
    hive_conf.update(self.kwargs['hive_conf'])

    mock_hive_hook.get_records.assert_called_once_with(self.kwargs['sql'], hive_conf=hive_conf)
def test_get_conn(self):
    hook = MockHiveServer2Hook()
    hook.get_conn()
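# The two MySQL-to-Hive tests below inject a MockConnectionCursor whose `iterable`
# attribute supplies the rows that get_records() should yield. A minimal sketch of
# such a cursor (an assumption; the suite's real helper may differ):
#
#     class MockConnectionCursor:
#         def __init__(self):
#             self.iterable = []
#         def execute(self, statement):
#             pass  # call recording is done by the MagicMock in MockHiveServer2Hook
#         def __iter__(self):
#             return iter(self.iterable)
#         def close(self):
#             pass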
# assumed patch targets for the injected mocks, inferred from the
# (mock_popen, mock_temp_dir) signature and the /tmp/... paths asserted below
@mock.patch('tempfile.tempdir', '/tmp/')
@mock.patch('tempfile._RandomNameSequence.__next__')
@mock.patch('subprocess.Popen')
def test_mysql_to_hive_verify_loaded_values(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "test_mysql_to_hive"

    mysql_table = 'test_mysql_to_hive'
    hive_table = 'test_mysql_to_hive'
    hook = MySqlHook()

    try:
        minmax = (
            255,
            65535,
            16777215,
            4294967295,
            18446744073709551615,
            -128,
            -32768,
            -8388608,
            -2147483648,
            -9223372036854775808,
        )

        with hook.get_conn() as conn:
            conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
            conn.execute(
                """
                CREATE TABLE {} (
                    c0 TINYINT UNSIGNED,
                    c1 SMALLINT UNSIGNED,
                    c2 MEDIUMINT UNSIGNED,
                    c3 INT UNSIGNED,
                    c4 BIGINT UNSIGNED,
                    c5 TINYINT,
                    c6 SMALLINT,
                    c7 MEDIUMINT,
                    c8 INT,
                    c9 BIGINT
                )
                """.format(mysql_table)
            )
            conn.execute(
                """
                INSERT INTO {} VALUES (
                    {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
                )
                """.format(mysql_table, *minmax)
            )

        with mock.patch.dict('os.environ', self.env_vars):
            op = MySqlToHiveOperator(
                task_id='test_m2h',
                hive_cli_conn_id='hive_cli_default',
                sql=f"SELECT * FROM {mysql_table}",
                hive_table=hive_table,
                recreate=True,
                delimiter=",",
                dag=self.dag,
            )
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

            mock_cursor = MockConnectionCursor()
            mock_cursor.iterable = [minmax]
            hive_hook = MockHiveServer2Hook(connection_cursor=mock_cursor)

            result = hive_hook.get_records(f"SELECT * FROM {hive_table}")
            assert result[0] == minmax

            hive_cmd = [
                'beeline',
                '-u',
                '"jdbc:hive2://localhost:10000/default"',
                '-hiveconf',
                'airflow.ctx.dag_id=unit_test_dag',
                '-hiveconf',
                'airflow.ctx.task_id=test_m2h',
                '-hiveconf',
                'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
                '-hiveconf',
                'airflow.ctx.dag_run_id=55',
                '-hiveconf',
                'airflow.ctx.dag_owner=airflow',
                '-hiveconf',
                'airflow.ctx.dag_email=test@airflow.com',
                '-hiveconf',
                'mapreduce.job.queuename=airflow',
                '-hiveconf',
                'mapred.job.queue.name=airflow',
                '-hiveconf',
                'tez.queue.name=airflow',
                '-f',
                '/tmp/airflow_hiveop_test_mysql_to_hive/tmptest_mysql_to_hive',
            ]

            mock_popen.assert_called_with(
                hive_cmd,
                stdout=mock_subprocess.PIPE,
                stderr=mock_subprocess.STDOUT,
                cwd="/tmp/airflow_hiveop_test_mysql_to_hive",
                close_fds=True,
            )
    finally:
        with hook.get_conn() as conn:
            conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
# assumed patch targets for the injected mocks, inferred as in the previous test
@mock.patch('tempfile.tempdir', '/tmp/')
@mock.patch('tempfile._RandomNameSequence.__next__')
@mock.patch('subprocess.Popen')
def test_mysql_to_hive_verify_csv_special_char(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "test_mysql_to_hive"

    mysql_table = 'test_mysql_to_hive'
    hive_table = 'test_mysql_to_hive'
    hook = MySqlHook()

    try:
        db_record = ('c0', '["true"]')
        with hook.get_conn() as conn:
            conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
            conn.execute(
                """
                CREATE TABLE {} (
                    c0 VARCHAR(25),
                    c1 VARCHAR(25)
                )
                """.format(mysql_table)
            )
            conn.execute(
                """
                INSERT INTO {} VALUES (
                    '{}', '{}'
                )
                """.format(mysql_table, *db_record)
            )

        with mock.patch.dict('os.environ', self.env_vars):
            import unicodecsv as csv

            op = MySqlToHiveOperator(
                task_id='test_m2h',
                hive_cli_conn_id='hive_cli_default',
                sql=f"SELECT * FROM {mysql_table}",
                hive_table=hive_table,
                recreate=True,
                delimiter=",",
                quoting=csv.QUOTE_NONE,
                quotechar='',
                escapechar='@',
                dag=self.dag,
            )
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

            mock_cursor = MockConnectionCursor()
            mock_cursor.iterable = [('c0', '["true"]'), (2, 2)]
            hive_hook = MockHiveServer2Hook(connection_cursor=mock_cursor)

            result = hive_hook.get_records(f"SELECT * FROM {hive_table}")
            assert result[0] == db_record

            hive_cmd = [
                'beeline',
                '-u',
                '"jdbc:hive2://localhost:10000/default"',
                '-hiveconf',
                'airflow.ctx.dag_id=unit_test_dag',
                '-hiveconf',
                'airflow.ctx.task_id=test_m2h',
                '-hiveconf',
                'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
                '-hiveconf',
                'airflow.ctx.dag_run_id=55',
                '-hiveconf',
                'airflow.ctx.dag_owner=airflow',
                '-hiveconf',
                'airflow.ctx.dag_email=test@airflow.com',
                '-hiveconf',
                'mapreduce.job.queuename=airflow',
                '-hiveconf',
                'mapred.job.queue.name=airflow',
                '-hiveconf',
                'tez.queue.name=airflow',
                '-f',
                '/tmp/airflow_hiveop_test_mysql_to_hive/tmptest_mysql_to_hive',
            ]

            mock_popen.assert_called_with(
                hive_cmd,
                stdout=mock_subprocess.PIPE,
                stderr=mock_subprocess.STDOUT,
                cwd="/tmp/airflow_hiveop_test_mysql_to_hive",
                close_fds=True,
            )
    finally:
        with hook.get_conn() as conn:
            conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
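# Both MySQL-to-Hive tests pin the exact beeline invocation the Hive CLI hook builds:
# the JDBC URL, one -hiveconf pair per airflow.ctx.* variable, the queue settings, and
# the generated HQL script under /tmp/airflow_hiveop_<name>. When the assertion fails,
# the command actually produced can be inspected through the standard mock API
# (a debugging sketch, not part of the suite):
#
#     args, kwargs = mock_popen.call_args
#     print(' '.join(args[0]))  # full beeline command line
#     print(kwargs['cwd'])      # temp working directory the hook ran in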