def test_get_results_data(self):
    """get_results() should expose the mocked rows under the 'data' key."""
    mock_hook = MockHiveServer2Hook()
    sql = f"SELECT * FROM {self.table}"
    result_set = mock_hook.get_results(sql, schema=self.database)
    self.assertListEqual(result_set['data'], [(1, 1), (2, 2)])
 def test_to_csv(self):
     """to_csv() should write the mocked result set (header + two rows) to CSV.

     Fix over the original: the CSV is written to a temporary file and removed
     afterwards, instead of leaving ``query_results.csv`` behind in the
     current working directory.
     """
     import os
     import tempfile

     hook = MockHiveServer2Hook()
     # First yielded item is the cursor description (header); the rest are rows.
     hook._get_results = mock.MagicMock(
         return_value=iter(
             [
                 [
                     ('hive_server_hook.a', 'INT_TYPE', None, None, None, None, True),
                     ('hive_server_hook.b', 'INT_TYPE', None, None, None, None, True),
                 ],
                 (1, 1),
                 (2, 2),
             ]
         )
     )
     query = f"SELECT * FROM {self.table}"
     fd, csv_filepath = tempfile.mkstemp(suffix='.csv')
     os.close(fd)
     try:
         hook.to_csv(
             query,
             csv_filepath,
             schema=self.database,
             delimiter=',',
             lineterminator='\n',
             output_header=True,
             fetch_size=2,
         )
         df = pd.read_csv(csv_filepath, sep=',')
         self.assertListEqual(df.columns.tolist(), self.columns)
         self.assertListEqual(df[self.columns[0]].values.tolist(), [1, 2])
         self.assertEqual(len(df), 2)
     finally:
         # Always clean up the temp file, even when an assertion fails.
         os.remove(csv_filepath)
    def test_get_pandas_df(self):
        """get_pandas_df() should build a DataFrame from the mocked rows and
        forward the Airflow context variables as `set` statements."""
        hook = MockHiveServer2Hook()
        sql = f"SELECT * FROM {self.table}"

        airflow_ctx = {
            'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
            'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
            'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
            'AIRFLOW_CTX_DAG_RUN_ID': '55',
            'AIRFLOW_CTX_DAG_OWNER': 'airflow',
            'AIRFLOW_CTX_DAG_EMAIL': '*****@*****.**',
        }
        with mock.patch.dict('os.environ', airflow_ctx):
            df = hook.get_pandas_df(sql, schema=self.database)

        self.assertEqual(len(df), 2)
        self.assertListEqual(df["hive_server_hook.a"].values.tolist(), [1, 2])

        hook.get_conn.assert_called_with(self.database)
        # Every context variable must have reached the cursor as a `set` statement.
        for expected_stmt in (
            'set airflow.ctx.dag_id=test_dag_id',
            'set airflow.ctx.task_id=HiveHook_3835',
            'set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
            'set airflow.ctx.dag_run_id=55',
            'set airflow.ctx.dag_owner=airflow',
            'set [email protected]',
        ):
            hook.mock_cursor.execute.assert_any_call(expected_stmt)
    def test_get_results_header(self):
        """The 'header' entry of get_results() should carry the column names."""
        hook = MockHiveServer2Hook()
        sql = f"SELECT * FROM {self.table}"
        result_set = hook.get_results(sql, schema=self.database)
        # Each header entry is a cursor-description tuple; element 0 is the name.
        header_names = [description[0] for description in result_set['header']]
        self.assertListEqual(header_names, self.columns)
    def test_get_records(self):
        """get_records() should return the mocked rows and push the Airflow
        context variables to the cursor as `set` statements."""
        hook = MockHiveServer2Hook()
        sql = f"SELECT * FROM {self.table}"

        airflow_ctx = {
            'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
            'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
            'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
            'AIRFLOW_CTX_DAG_RUN_ID': '55',
            'AIRFLOW_CTX_DAG_OWNER': 'airflow',
            'AIRFLOW_CTX_DAG_EMAIL': '*****@*****.**',
        }
        with mock.patch.dict('os.environ', airflow_ctx):
            results = hook.get_records(sql, schema=self.database)

        self.assertListEqual(results, [(1, 1), (2, 2)])

        hook.get_conn.assert_called_with(self.database)
        # Every context variable must have reached the cursor as a `set` statement.
        for expected_stmt in (
            'set airflow.ctx.dag_id=test_dag_id',
            'set airflow.ctx.task_id=HiveHook_3835',
            'set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
            'set airflow.ctx.dag_run_id=55',
            'set airflow.ctx.dag_owner=airflow',
            'set [email protected]',
        ):
            hook.mock_cursor.execute.assert_any_call(expected_stmt)
Example #6
0
    def test_hive_to_mysql(self):
        """End-to-end run of HiveToMySqlTransfer against mocked Hive/MySQL hooks:
        verifies the SQL handed to Hive, the hive_conf context, the MySQL
        pre-operator statements, and the final insert."""
        fake_hive_results = 'test_hive_results'

        hive_hook_mock = MockHiveServer2Hook()
        hive_hook_mock.get_records = MagicMock(return_value=fake_hive_results)

        mysql_hook_mock = MockMySqlHook()
        mysql_hook_mock.run = MagicMock()
        mysql_hook_mock.insert_rows = MagicMock()

        with patch('airflow.operators.hive_to_mysql.HiveServer2Hook',
                   return_value=hive_hook_mock):
            with patch('airflow.operators.hive_to_mysql.MySqlHook',
                       return_value=mysql_hook_mock):
                op = HiveToMySqlTransfer(
                    mysql_conn_id='airflow_db',
                    task_id='hive_to_mysql_check',
                    sql="""
                        SELECT name
                        FROM airflow.static_babynames
                        LIMIT 100
                        """,
                    mysql_table='test_static_babynames',
                    mysql_preoperator=[
                        'DROP TABLE IF EXISTS test_static_babynames;',
                        'CREATE TABLE test_static_babynames (name VARCHAR(500))',
                    ],
                    dag=self.dag,
                )
                op.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
                op.run(start_date=DEFAULT_DATE,
                       end_date=DEFAULT_DATE,
                       ignore_ti_state=True)

        # Whitespace-normalize the SQL that was actually handed to the Hive hook.
        call_args, call_kwargs = hive_hook_mock.get_records.call_args_list[0]
        normalized_sql = re.sub(r'\s{2,}', ' ', call_args[0]).strip()
        self.assertEqual(
            'SELECT name FROM airflow.static_babynames LIMIT 100',
            normalized_sql,
        )

        self.assertEqual(
            {
                'airflow.ctx.dag_owner': 'airflow',
                'airflow.ctx.dag_id': 'test_dag_id',
                'airflow.ctx.task_id': 'hive_to_mysql_check',
                'airflow.ctx.execution_date': '2015-01-01T00:00:00+00:00',
            },
            call_kwargs['hive_conf'],
        )

        mysql_hook_mock.run.assert_called_with(
            [
                'DROP TABLE IF EXISTS test_static_babynames;',
                'CREATE TABLE test_static_babynames (name VARCHAR(500))',
            ]
        )

        mysql_hook_mock.insert_rows.assert_called_with(
            table='test_static_babynames', rows=fake_hive_results)
    def test_get_results_with_hive_conf(self):
        """Session variables set via environment and via hive_conf must both
        surface in the data returned by get_results()."""
        hql = [
            "set key",
            "set airflow.ctx.dag_id",
            "set airflow.ctx.dag_run_id",
            "set airflow.ctx.task_id",
            "set airflow.ctx.execution_date",
        ]

        def env_var_for(context_key):
            # Resolve the environment-variable name Airflow uses for a context key.
            return AIRFLOW_VAR_NAME_FORMAT_MAPPING[context_key]['env_var_format']

        env = {
            env_var_for('AIRFLOW_CONTEXT_DAG_ID'): 'test_dag_id',
            env_var_for('AIRFLOW_CONTEXT_TASK_ID'): 'test_task_id',
            env_var_for('AIRFLOW_CONTEXT_EXECUTION_DATE'): 'test_execution_date',
            env_var_for('AIRFLOW_CONTEXT_DAG_RUN_ID'): 'test_dag_run_id',
        }

        with mock.patch.dict('os.environ', env):
            hook = MockHiveServer2Hook()
            # First yielded item is the header; the rest are (value, ...) rows.
            hook._get_results = mock.MagicMock(
                return_value=iter(
                    [
                        "header",
                        ("value", "test"),
                        ("test_dag_id", "test"),
                        ("test_task_id", "test"),
                        ("test_execution_date", "test"),
                        ("test_dag_run_id", "test"),
                    ]
                )
            )

            rows = hook.get_results(hql=hql, hive_conf={'key': 'value'})['data']
            output = '\n'.join(row[0] for row in rows)

        for expected in ('value', 'test_dag_id', 'test_task_id',
                         'test_execution_date', 'test_dag_run_id'):
            self.assertIn(expected, output)
Example #8
0
    def test_multi_statements(self):
        """get_records() with a list of statements must execute every user
        statement plus the Airflow context `set` statements, and return the
        mocked rows.

        Fix over the original: removed commented-out dead code (a leftover
        get_pandas_df variant) and normalized the misleading indentation of
        the ``mock.patch.dict`` arguments.
        """
        sqls = [
            "CREATE TABLE IF NOT EXISTS test_multi_statements (i INT)",
            f"SELECT * FROM {self.table}",
            "DROP TABLE test_multi_statements",
        ]

        hook = MockHiveServer2Hook()

        with mock.patch.dict(
            'os.environ',
            {
                'AIRFLOW_CTX_DAG_ID': 'test_dag_id',
                'AIRFLOW_CTX_TASK_ID': 'HiveHook_3835',
                'AIRFLOW_CTX_EXECUTION_DATE': '2015-01-01T00:00:00+00:00',
                'AIRFLOW_CTX_DAG_RUN_ID': '55',
                'AIRFLOW_CTX_DAG_OWNER': 'airflow',
                'AIRFLOW_CTX_DAG_EMAIL': '*****@*****.**',
            },
        ):
            results = hook.get_records(sqls, schema=self.database)
        self.assertListEqual(results, [(1, 1), (2, 2)])

        hook.get_conn.assert_called_with(self.database)
        # Every user statement and every context `set` statement must reach the cursor.
        for statement in (
            'CREATE TABLE IF NOT EXISTS test_multi_statements (i INT)',
            f'SELECT * FROM {self.table}',
            'DROP TABLE test_multi_statements',
            'set airflow.ctx.dag_id=test_dag_id',
            'set airflow.ctx.task_id=HiveHook_3835',
            'set airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
            'set airflow.ctx.dag_run_id=55',
            'set airflow.ctx.dag_owner=airflow',
            'set [email protected]',
        ):
            hook.mock_cursor.execute.assert_any_call(statement)
Example #9
0
    def test_execute_with_hive_conf(self, mock_mysql_hook):
        """execute() must merge the operator's hive_conf into the Airflow
        context variables before querying Hive.

        ``mock_mysql_hook`` is injected by a patch decorator outside this view.
        """
        context = {}
        hive_hook_mock = MockHiveServer2Hook()
        hive_hook_mock.get_records = MagicMock(return_value='test_hive_results')

        self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

        with patch('airflow.providers.apache.hive.operators.hive_to_mysql.HiveServer2Hook',
                   return_value=hive_hook_mock):
            HiveToMySqlTransferOperator(**self.kwargs).execute(context=context)

            # Expected conf = context variables overlaid with the explicit hive_conf.
            expected_hive_conf = context_to_airflow_vars(context)
            expected_hive_conf.update(self.kwargs['hive_conf'])

        hive_hook_mock.get_records.assert_called_once_with(
            self.kwargs['sql'],
            hive_conf=expected_hive_conf
        )
Example #10
0
 def test_get_conn(self):
     """Smoke test: obtaining a connection from the mocked hook must not raise."""
     MockHiveServer2Hook().get_conn()
Example #11
0
    def test_mysql_to_hive_verify_loaded_values(self, mock_popen,
                                                mock_temp_dir):
        """Round-trip the boundary values of each MySQL integer type through
        MySqlToHiveOperator and verify both the loaded row and the exact
        beeline command line that would be spawned.

        ``mock_popen`` and ``mock_temp_dir`` are mocks injected by patch
        decorators outside this view (presumably ``subprocess.Popen`` and the
        temp-dir helper — confirm against the decorators).
        """
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "test_mysql_to_hive"

        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        hook = MySqlHook()

        try:
            # Max values of the 5 unsigned types followed by min values of the
            # 5 signed types (TINYINT..BIGINT), matching columns c0..c9 below.
            minmax = (
                255,
                65535,
                16777215,
                4294967295,
                18446744073709551615,
                -128,
                -32768,
                -8388608,
                -2147483648,
                -9223372036854775808,
            )

            # Seed a real MySQL table with one row of the boundary values.
            with hook.get_conn() as conn:
                conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
                conn.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT   UNSIGNED,
                        c1 SMALLINT  UNSIGNED,
                        c2 MEDIUMINT UNSIGNED,
                        c3 INT       UNSIGNED,
                        c4 BIGINT    UNSIGNED,
                        c5 TINYINT,
                        c6 SMALLINT,
                        c7 MEDIUMINT,
                        c8 INT,
                        c9 BIGINT
                    )
                """.format(mysql_table))
                conn.execute("""
                    INSERT INTO {} VALUES (
                        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
                    )
                """.format(mysql_table, *minmax))

            with mock.patch.dict('os.environ', self.env_vars):
                op = MySqlToHiveOperator(
                    task_id='test_m2h',
                    hive_cli_conn_id='hive_cli_default',
                    sql=f"SELECT * FROM {mysql_table}",
                    hive_table=hive_table,
                    recreate=True,
                    delimiter=",",
                    dag=self.dag,
                )
                op.run(start_date=DEFAULT_DATE,
                       end_date=DEFAULT_DATE,
                       ignore_ti_state=True)

                # Read back through a mocked Hive cursor preloaded with the row.
                mock_cursor = MockConnectionCursor()
                mock_cursor.iterable = [minmax]
                hive_hook = MockHiveServer2Hook(connection_cursor=mock_cursor)

                result = hive_hook.get_records(f"SELECT * FROM {hive_table}")
                assert result[0] == minmax

                # Exact beeline invocation expected, including every -hiveconf
                # pair derived from self.env_vars and the queue settings.
                hive_cmd = [
                    'beeline',
                    '-u',
                    '"jdbc:hive2://localhost:10000/default"',
                    '-hiveconf',
                    'airflow.ctx.dag_id=unit_test_dag',
                    '-hiveconf',
                    'airflow.ctx.task_id=test_m2h',
                    '-hiveconf',
                    'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
                    '-hiveconf',
                    'airflow.ctx.dag_run_id=55',
                    '-hiveconf',
                    'airflow.ctx.dag_owner=airflow',
                    '-hiveconf',
                    '[email protected]',
                    '-hiveconf',
                    'mapreduce.job.queuename=airflow',
                    '-hiveconf',
                    'mapred.job.queue.name=airflow',
                    '-hiveconf',
                    'tez.queue.name=airflow',
                    '-f',
                    '/tmp/airflow_hiveop_test_mysql_to_hive/tmptest_mysql_to_hive',
                ]

                mock_popen.assert_called_with(
                    hive_cmd,
                    stdout=mock_subprocess.PIPE,
                    stderr=mock_subprocess.STDOUT,
                    cwd="/tmp/airflow_hiveop_test_mysql_to_hive",
                    close_fds=True,
                )

        finally:
            # Always drop the seeded MySQL table, even if assertions fail.
            with hook.get_conn() as conn:
                conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
Example #12
0
    def test_mysql_to_hive_verify_csv_special_char(self, mock_popen,
                                                   mock_temp_dir):
        """Round-trip a value containing CSV special characters ('["true"]')
        through MySqlToHiveOperator configured with QUOTE_NONE and an
        escapechar, then verify the loaded record and the exact beeline
        command line.

        ``mock_popen`` and ``mock_temp_dir`` are mocks injected by patch
        decorators outside this view (presumably ``subprocess.Popen`` and the
        temp-dir helper — confirm against the decorators).
        """
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "test_mysql_to_hive"

        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        hook = MySqlHook()

        try:
            # Single row whose second column holds CSV-hostile characters.
            db_record = ('c0', '["true"]')
            with hook.get_conn() as conn:
                conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")
                conn.execute("""
                    CREATE TABLE {} (
                        c0 VARCHAR(25),
                        c1 VARCHAR(25)
                    )
                """.format(mysql_table))
                conn.execute("""
                    INSERT INTO {} VALUES (
                        '{}', '{}'
                    )
                """.format(mysql_table, *db_record))

            with mock.patch.dict('os.environ', self.env_vars):
                # Local import: only needed for csv.QUOTE_NONE below.
                import unicodecsv as csv

                op = MySqlToHiveOperator(
                    task_id='test_m2h',
                    hive_cli_conn_id='hive_cli_default',
                    sql=f"SELECT * FROM {mysql_table}",
                    hive_table=hive_table,
                    recreate=True,
                    delimiter=",",
                    quoting=csv.QUOTE_NONE,
                    quotechar='',
                    escapechar='@',
                    dag=self.dag,
                )
                op.run(start_date=DEFAULT_DATE,
                       end_date=DEFAULT_DATE,
                       ignore_ti_state=True)

                # Read back through a mocked Hive cursor preloaded with rows.
                mock_cursor = MockConnectionCursor()
                mock_cursor.iterable = [('c0', '["true"]'), (2, 2)]
                hive_hook = MockHiveServer2Hook(connection_cursor=mock_cursor)

                result = hive_hook.get_records(f"SELECT * FROM {hive_table}")
            # NOTE(review): this assert and the mock_popen check below sit
            # outside the os.environ patch, unlike the sibling test above —
            # looks intentional but worth confirming against upstream.
            assert result[0] == db_record

            # Exact beeline invocation expected, including every -hiveconf
            # pair derived from self.env_vars and the queue settings.
            hive_cmd = [
                'beeline',
                '-u',
                '"jdbc:hive2://localhost:10000/default"',
                '-hiveconf',
                'airflow.ctx.dag_id=unit_test_dag',
                '-hiveconf',
                'airflow.ctx.task_id=test_m2h',
                '-hiveconf',
                'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
                '-hiveconf',
                'airflow.ctx.dag_run_id=55',
                '-hiveconf',
                'airflow.ctx.dag_owner=airflow',
                '-hiveconf',
                '[email protected]',
                '-hiveconf',
                'mapreduce.job.queuename=airflow',
                '-hiveconf',
                'mapred.job.queue.name=airflow',
                '-hiveconf',
                'tez.queue.name=airflow',
                '-f',
                '/tmp/airflow_hiveop_test_mysql_to_hive/tmptest_mysql_to_hive',
            ]

            mock_popen.assert_called_with(
                hive_cmd,
                stdout=mock_subprocess.PIPE,
                stderr=mock_subprocess.STDOUT,
                cwd="/tmp/airflow_hiveop_test_mysql_to_hive",
                close_fds=True,
            )
        finally:
            # Always drop the seeded MySQL table, even if assertions fail.
            with hook.get_conn() as conn:
                conn.execute(f"DROP TABLE IF EXISTS {mysql_table}")