示例#1
0
    def test__get_top_row_as_list(self):
        """_get_top_row_as_list must split the file's first row on the
        configured delimiter only."""
        # Tab delimiter: the comma inside 'Some,Text' stays in one field.
        self.kwargs['delimiter'] = '\t'
        fn_txt = self._get_fn('.txt', True)
        header_list = S3ToHiveTransfer(**self.kwargs).\
            _get_top_row_as_list(fn_txt)
        self.assertEqual(header_list, ['Sno', 'Some,Text'],
                         msg="Top row from file doesn't match expected value")

        # Comma delimiter: the embedded tab now stays inside the first field.
        self.kwargs['delimiter'] = ','
        header_list = S3ToHiveTransfer(**self.kwargs).\
            _get_top_row_as_list(fn_txt)
        self.assertEqual(header_list, ['Sno\tSome', 'Text'],
                         msg="Top row from file doesn't match expected value")
示例#2
0
 def test__match_headers(self):
     """_match_headers must accept only an exact, ordered match of the
     field_dict keys and reject reordered or extended header rows."""
     self.kwargs['field_dict'] = OrderedDict([('Sno', 'BIGINT'),
                                              ('Some,Text', 'STRING')])
     self.assertTrue(S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Sno', 'Some,Text']),
                     msg="Header row doesn't match expected value")
     # Testing with different column order
     self.assertFalse(S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Some,Text', 'Sno']),
                      msg="Reordered header row should not match")
     # Testing with extra column in header
     self.assertFalse(S3ToHiveTransfer(**self.kwargs)._match_headers(
         ['Sno', 'Some,Text', 'ExtraColumn']),
                      msg="Header row with an extra column should not match")
示例#3
0
    def test_execute(self, mock_hiveclihook):
        """End-to-end check: upload each fixture into a mocked S3 bucket,
        run S3ToHiveTransfer, and verify the file handed to
        HiveCliHook.load_file equals the expected (header-stripped) file."""
        conn = boto3.client('s3')
        conn.create_bucket(Bucket='bucket')

        # Testing txt, zip, bz2 files with and without header row
        for ext in ['.txt', '.gz', '.bz2']:
            for has_header in [True, False]:
                self.kwargs['headers'] = has_header
                self.kwargs['check_headers'] = has_header
                logging.info("Testing {0} format {1} header".format(
                    ext, ('with' if has_header else 'without')))
                self.kwargs['input_compressed'] = ext != '.txt'
                self.kwargs['s3_key'] = 's3://bucket/' + self.s3_key + ext
                ip_fn = self._get_fn(ext, self.kwargs['headers'])
                op_fn = self._get_fn(ext, False)

                # Upload the file into the Mocked S3 bucket
                conn.upload_file(ip_fn, 'bucket', self.s3_key + ext)

                # file parameter to HiveCliHook.load_file is compared
                # against expected file output
                mock_hiveclihook().load_file.side_effect = \
                    lambda *args, **kwargs: self.assertTrue(
                        self._check_file_equality(args[0], op_fn, ext),
                        msg='{0} output file not as expected'.format(ext))
                # Execute S3ToHiveTransfer
                S3ToHiveTransfer(**self.kwargs).execute(None)
 def test_execute(self, mock_s3hook, mock_hiveclihook):
     """Run S3ToHiveTransfer with mocked S3 and Hive hooks and verify the
     file handed to HiveCliHook.load_file matches the expected
     header-stripped output for each format/header combination."""
     # Testing txt, zip, bz2 files with and without header row
     # (unpack directly instead of indexing the product tuple)
     for ext, has_header in product(['.txt', '.gz', '.bz2'], [True, False]):
         self.kwargs['headers'] = has_header
         self.kwargs['check_headers'] = has_header
         logging.info("Testing {0} format {1} header".format(
             ext, ('with' if has_header else 'without')))
         # only plain .txt fixtures are uncompressed
         self.kwargs['input_compressed'] = ext != '.txt'
         self.kwargs['s3_key'] = self.s3_key + ext
         ip_fn = self._get_fn(ext, self.kwargs['headers'])
         op_fn = self._get_fn(ext, False)
         # Mock s3 object returned by S3Hook
         mock_s3_object = mock.Mock(key=self.kwargs['s3_key'])
         mock_s3_object.get_contents_to_file.side_effect = \
             lambda dest_file: \
             self._cp_file_contents(ip_fn, dest_file.name)
         mock_s3hook().get_key.return_value = mock_s3_object
         # file parameter to HiveCliHook.load_file is compared
         # against expected file output
         mock_hiveclihook().load_file.side_effect = \
             lambda *args, **kwargs: \
             self.assertTrue(
                 self._check_file_equality(args[0], op_fn, ext),
                 msg='{0} output file not as expected'.format(ext))
         # Execute S3ToHiveTransfer
         s32hive = S3ToHiveTransfer(**self.kwargs)
         s32hive.execute(None)
示例#5
0
 def test__delete_top_row_and_compress(self):
     """Verify _delete_top_row_and_compress strips the header row and
     produces a compressed file equal to the expected fixture."""
     s32hive = S3ToHiveTransfer(**self.kwargs)
     fn_txt = self._get_fn('.txt', True)
     # Exercise both supported compression formats against their fixtures.
     cases = (('.gz', "gz Compressed file not as expected"),
              ('.bz2', "bz2 Compressed file not as expected"))
     for suffix, failure_msg in cases:
         compressed_nh = s32hive._delete_top_row_and_compress(
             fn_txt, suffix, self.tmp_dir)
         expected_fn = self._get_fn(suffix, False)
         self.assertTrue(
             self._check_file_equality(compressed_nh, expected_fn, suffix),
             msg=failure_msg)
示例#6
0
    def test_execute_with_select_expression(self, mock_hiveclihook):
        """Verify S3ToHiveTransfer forwards the expected parameters to
        S3Hook.select_key when a select_expression is configured."""
        conn = boto3.client('s3')
        conn.create_bucket(Bucket='bucket')

        select_expression = "SELECT * FROM S3Object s"
        bucket = 'bucket'

        # Only testing S3ToHiveTransfer calls S3Hook.select_key with
        # the right parameters and its execute method succeeds here,
        # since Moto doesn't support select_object_content as of 1.3.2.
        for ext in ['.txt', '.gz', '.GZ']:
            for has_header in [True, False]:
                input_compressed = ext.lower() != '.txt'
                key = self.s3_key + ext

                self.kwargs.update({
                    'check_headers': False,
                    'headers': has_header,
                    'input_compressed': input_compressed,
                    'select_expression': select_expression,
                    's3_key': 's3://{0}/{1}'.format(bucket, key),
                })

                ip_fn = self._get_fn(ext, has_header)

                # Upload the file into the Mocked S3 bucket
                conn.upload_file(ip_fn, bucket, key)

                # Build the serialization dict the operator should pass on.
                csv_options = {'FieldDelimiter': self.delimiter}
                if has_header:
                    csv_options['FileHeaderInfo'] = 'USE'
                input_serialization = {'CSV': csv_options}
                if input_compressed:
                    input_serialization['CompressionType'] = 'GZIP'

                # Confirm that select_key was called with the right params
                with mock.patch(
                        'airflow.providers.amazon.aws.hooks.s3.S3Hook.select_key',
                        return_value="") as mock_select_key:
                    # Execute S3ToHiveTransfer
                    s32hive = S3ToHiveTransfer(**self.kwargs)
                    s32hive.execute(None)

                    mock_select_key.assert_called_once_with(
                        bucket_name=bucket,
                        key=key,
                        expression=select_expression,
                        input_serialization=input_serialization)