Exemplo n.º 1
0
    def test__get_top_row_as_list(self):
        """Check ``_get_top_row_as_list`` with tab and comma delimiters.

        The same ``.txt`` fixture is read twice; only the ``delimiter``
        operator argument changes between the two reads, so the expected
        list of header fields differs.
        """
        mismatch_msg = "Top row from file doesnt matched expected value"

        # Tab delimiter: the comma stays inside the second field.
        self.kwargs['delimiter'] = '\t'
        header_file = self._get_fn('.txt', True)
        tab_operator = S3ToHiveTransfer(**self.kwargs)
        tab_fields = tab_operator._get_top_row_as_list(header_file)
        self.assertEqual(tab_fields, ['Sno', 'Some,Text'], msg=mismatch_msg)

        # Comma delimiter: the tab now stays inside the first field.
        self.kwargs['delimiter'] = ','
        comma_operator = S3ToHiveTransfer(**self.kwargs)
        comma_fields = comma_operator._get_top_row_as_list(header_file)
        self.assertEqual(comma_fields, ['Sno\tSome', 'Text'],
                         msg=mismatch_msg)
Exemplo n.º 2
0
 def test__match_headers(self):
     """Check ``_match_headers`` against three candidate header rows.

     With a two-column ``field_dict``, the identical header list must
     match, while a reordered list and a list with an extra column must
     not.
     """
     expected_fields = OrderedDict()
     expected_fields['Sno'] = 'BIGINT'
     expected_fields['Some,Text'] = 'STRING'
     self.kwargs['field_dict'] = expected_fields

     mismatch_msg = "Header row doesnt match expected value"

     # Same names in the same order
     same_order = S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Sno', 'Some,Text'])
     self.assertTrue(same_order, msg=mismatch_msg)

     # Same names in a different column order
     swapped_order = S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Some,Text', 'Sno'])
     self.assertFalse(swapped_order, msg=mismatch_msg)

     # One extra column in the header
     extra_column = S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Sno', 'Some,Text', 'ExtraColumn'])
     self.assertFalse(extra_column, msg=mismatch_msg)
Exemplo n.º 3
0
    def test_execute(self, mock_hiveclihook):
        """Run ``execute`` for every file format, with and without a header.

        Each fixture is uploaded to the mocked S3 bucket ``bucket``. The
        file passed to the mocked ``HiveCliHook.load_file`` is then compared
        against the header-less fixture of the same extension.

        :param mock_hiveclihook: mock standing in for ``HiveCliHook``
            (presumably injected by a patch decorator outside this block).
        """
        conn = boto3.client('s3')
        conn.create_bucket(Bucket='bucket')

        # Testing txt, gz, bz2 files with and without header row
        for (ext, has_header) in product(['.txt', '.gz', '.bz2', '.GZ'],
                                         [True, False]):
            self.kwargs['headers'] = has_header
            self.kwargs['check_headers'] = has_header
            logging.info("Testing %s format %s header", ext,
                         'with' if has_header else 'without')
            self.kwargs['input_compressed'] = ext.lower() != '.txt'
            self.kwargs['s3_key'] = 's3://bucket/' + self.s3_key + ext
            ip_fn = self._get_fn(ext, self.kwargs['headers'])
            op_fn = self._get_fn(ext, False)

            # Upload the file into the Mocked S3 bucket
            conn.upload_file(ip_fn, 'bucket', self.s3_key + ext)

            # file parameter to HiveCliHook.load_file is compared
            # against expected file output.
            # The expected file and extension are bound as keyword-only
            # defaults so the callback keeps this iteration's values even
            # if it is invoked after the loop variables have moved on
            # (avoids the late-binding closure pitfall).
            def check_loaded_file(*args, _expected_fn=op_fn, _ext=ext,
                                  **kwargs):
                self.assertTrue(
                    self._check_file_equality(args[0], _expected_fn, _ext),
                    msg='{0} output file not as expected'.format(_ext))

            mock_hiveclihook().load_file.side_effect = check_loaded_file
            # Execute S3ToHiveTransfer
            s32hive = S3ToHiveTransfer(**self.kwargs)
            s32hive.execute(None)
Exemplo n.º 4
0
 def test__delete_top_row_and_compress(self):
     """Check ``_delete_top_row_and_compress`` for gz and bz2 output.

     The ``.txt`` fixture obtained with ``_get_fn('.txt', True)`` is passed
     through the method once per extension, and each result is compared
     with the fixture obtained with ``_get_fn(ext, False)``.
     """
     operator = S3ToHiveTransfer(**self.kwargs)
     source_txt = self._get_fn('.txt', True)

     # (output extension, assertion message), checked in this order
     cases = (
         ('.gz', "gz Compressed file not as expected"),
         ('.bz2', "bz2 Compressed file not as expected"),
     )
     for ext, failure_msg in cases:
         produced_fn = operator._delete_top_row_and_compress(
             source_txt, ext, self.tmp_dir)
         expected_fn = self._get_fn(ext, False)
         files_match = self._check_file_equality(
             produced_fn, expected_fn, ext)
         self.assertTrue(files_match, msg=failure_msg)
Exemplo n.º 5
0
    def test_execute_with_select_expression(self, mock_hiveclihook):
        """Check the arguments ``execute`` passes to ``S3Hook.select_key``.

        For each combination of extension and header flag, ``select_key`` is
        patched and the test asserts it was called exactly once with the
        expected bucket, key, expression and input serialization.

        :param mock_hiveclihook: not used in the body (presumably injected
            by a patch decorator outside this block).
        """
        bucket = 'bucket'
        select_expression = "SELECT * FROM S3Object s"

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=bucket)

        # Only testing S3ToHiveTransfer calls S3Hook.select_key with
        # the right parameters and its execute method succeeds here,
        # since Moto doesn't support select_object_content as of 1.3.2.
        extensions = ['.txt', '.gz', '.GZ']
        for ext, has_header in product(extensions, [True, False]):
            is_compressed = ext.lower() != '.txt'
            key = self.s3_key + ext

            self.kwargs.update({
                'check_headers': False,
                'headers': has_header,
                'input_compressed': is_compressed,
                'select_expression': select_expression,
                's3_key': 's3://{0}/{1}'.format(bucket, key),
            })

            # Upload the file into the Mocked S3 bucket
            fixture_fn = self._get_fn(ext, has_header)
            s3_client.upload_file(fixture_fn, bucket, key)

            # Build the serialization argument select_key should receive
            csv_options = {'FieldDelimiter': self.delimiter}
            if has_header:
                csv_options['FileHeaderInfo'] = 'USE'
            expected_serialization = {'CSV': csv_options}
            if is_compressed:
                expected_serialization['CompressionType'] = 'GZIP'

            # Confirm that select_key was called with the right params
            select_key_target = (
                'airflow.providers.amazon.aws.hooks.s3.S3Hook.select_key')
            with mock.patch(select_key_target,
                            return_value="") as mock_select_key:
                # Execute S3ToHiveTransfer
                S3ToHiveTransfer(**self.kwargs).execute(None)

                mock_select_key.assert_called_once_with(
                    bucket_name=bucket,
                    key=key,
                    expression=select_expression,
                    input_serialization=expected_serialization)