def test__match_headers(self):
    self.kwargs['field_dict'] = OrderedDict([('Sno', 'BIGINT'), ('Some,Text', 'STRING')])
    assert S3ToHiveOperator(**self.kwargs)._match_headers(
        ['Sno', 'Some,Text']
    ), "Header row doesn't match expected value"
    # Testing with different column order
    assert not S3ToHiveOperator(**self.kwargs)._match_headers(
        ['Some,Text', 'Sno']
    ), "Header row with reordered columns should not match"
    # Testing with extra column in header
    assert not S3ToHiveOperator(**self.kwargs)._match_headers(
        ['Sno', 'Some,Text', 'ExtraColumn']
    ), "Header row with an extra column should not match"
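# Illustrative sketch only, not the operator's actual code: the three
# assertions above pin down an order- and length-sensitive comparison,
# roughly equivalent to:
#
#     header_list == list(self.field_dict.keys())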
def test__get_top_row_as_list(self):
    self.kwargs['delimiter'] = '\t'
    fn_txt = self._get_fn('.txt', True)
    header_list = S3ToHiveOperator(**self.kwargs)._get_top_row_as_list(fn_txt)
    self.assertEqual(
        header_list, ['Sno', 'Some,Text'],
        msg="Top row from file doesn't match expected value",
    )

    self.kwargs['delimiter'] = ','
    header_list = S3ToHiveOperator(**self.kwargs)._get_top_row_as_list(fn_txt)
    self.assertEqual(
        header_list, ['Sno\tSome', 'Text'],
        msg="Top row from file doesn't match expected value",
    )
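# Fixture shape inferred from the assertions above (the sample file is
# not shown here): the file behind _get_fn('.txt', True) evidently starts
# with the literal line "Sno\tSome,Text", so splitting on '\t' yields
# ['Sno', 'Some,Text'] while splitting on ',' yields ['Sno\tSome', 'Text'].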
# Decorators assumed from the signature and the mocked-bucket setup:
# HiveCliHook is patched where s3_to_hive imports it, and moto's mock_s3
# serves the boto3 calls.
@mock.patch('airflow.providers.apache.hive.transfers.s3_to_hive.HiveCliHook')
@mock_s3
def test_execute(self, mock_hiveclihook):
    conn = boto3.client('s3')
    conn.create_bucket(Bucket='bucket')

    # Testing txt, gz and bz2 files, with and without a header row
    for ext, has_header in product(['.txt', '.gz', '.bz2', '.GZ'], [True, False]):
        self.kwargs['headers'] = has_header
        self.kwargs['check_headers'] = has_header
        logging.info("Testing %s format %s header", ext, 'with' if has_header else 'without')
        self.kwargs['input_compressed'] = ext.lower() != '.txt'
        self.kwargs['s3_key'] = 's3://bucket/' + self.s3_key + ext
        ip_fn = self._get_fn(ext, self.kwargs['headers'])
        op_fn = self._get_fn(ext, False)

        # Upload the file into the mocked S3 bucket
        conn.upload_file(ip_fn, 'bucket', self.s3_key + ext)

        # The file parameter passed to HiveCliHook.load_file is compared
        # against the expected output file
        mock_hiveclihook().load_file.side_effect = lambda *args, **kwargs: self.assertTrue(
            self._check_file_equality(args[0], op_fn, ext),
            f'{ext} output file not as expected',
        )

        # Execute S3ToHiveTransfer
        s32hive = S3ToHiveOperator(**self.kwargs)
        s32hive.execute(None)
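# End-to-end flow exercised above: the operator downloads the S3 object,
# strips the header row when configured to, recompresses as needed, and
# hands the result to HiveCliHook.load_file; the mocked load_file then
# diffs that file against the expected fixture.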
def test__delete_top_row_and_compress(self):
    s32hive = S3ToHiveOperator(**self.kwargs)

    # Testing gz file type
    fn_txt = self._get_fn('.txt', True)
    gz_txt_nh = s32hive._delete_top_row_and_compress(fn_txt, '.gz', self.tmp_dir)
    fn_gz = self._get_fn('.gz', False)
    assert self._check_file_equality(gz_txt_nh, fn_gz, '.gz'), "gz compressed file not as expected"

    # Testing bz2 file type
    bz2_txt_nh = s32hive._delete_top_row_and_compress(fn_txt, '.bz2', self.tmp_dir)
    fn_bz2 = self._get_fn('.bz2', False)
    assert self._check_file_equality(bz2_txt_nh, fn_bz2, '.bz2'), "bz2 compressed file not as expected"
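# Behavior pinned down above (a sketch of the contract, not the
# operator's code): _delete_top_row_and_compress drops the file's first
# line, rewrites the rest through gzip or bz2 per the requested
# extension, and returns the new file's path.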
# Decorators assumed, as in test_execute above: HiveCliHook is patched
# and moto's mock_s3 serves the boto3 calls.
@mock.patch('airflow.providers.apache.hive.transfers.s3_to_hive.HiveCliHook')
@mock_s3
def test_execute_with_select_expression(self, mock_hiveclihook):
    conn = boto3.client('s3')
    conn.create_bucket(Bucket='bucket')

    select_expression = "SELECT * FROM S3Object s"
    bucket = 'bucket'

    # Only test that S3ToHiveTransfer calls S3Hook.select_key with the
    # right parameters and that its execute method succeeds, since Moto
    # doesn't support select_object_content as of 1.3.2.
    for ext, has_header in product(['.txt', '.gz', '.GZ'], [True, False]):
        input_compressed = ext.lower() != '.txt'
        key = self.s3_key + ext

        self.kwargs['check_headers'] = False
        self.kwargs['headers'] = has_header
        self.kwargs['input_compressed'] = input_compressed
        self.kwargs['select_expression'] = select_expression
        self.kwargs['s3_key'] = f's3://{bucket}/{key}'

        ip_fn = self._get_fn(ext, has_header)

        # Upload the file into the mocked S3 bucket
        conn.upload_file(ip_fn, bucket, key)

        input_serialization = {'CSV': {'FieldDelimiter': self.delimiter}}
        if input_compressed:
            input_serialization['CompressionType'] = 'GZIP'
        if has_header:
            input_serialization['CSV']['FileHeaderInfo'] = 'USE'

        # Confirm that select_key was called with the right params
        with mock.patch(
            'airflow.providers.amazon.aws.hooks.s3.S3Hook.select_key', return_value=""
        ) as mock_select_key:
            # Execute S3ToHiveTransfer
            s32hive = S3ToHiveOperator(**self.kwargs)
            s32hive.execute(None)

            mock_select_key.assert_called_once_with(
                bucket_name=bucket,
                key=key,
                expression=select_expression,
                input_serialization=input_serialization,
            )
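# For reference, the fully populated payload asserted above (gzipped
# input with a header row) is:
#     {'CSV': {'FieldDelimiter': self.delimiter, 'FileHeaderInfo': 'USE'},
#      'CompressionType': 'GZIP'}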
def test_bad_parameters(self):
    self.kwargs['check_headers'] = True
    self.kwargs['headers'] = False
    with pytest.raises(AirflowException, match="To check_headers.*"):
        S3ToHiveOperator(**self.kwargs)