def execute(self, context: 'Context') -> List[str]:
    # list all files in a Google Cloud Storage bucket
    hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    self.log.info(
        'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
        self.bucket,
        self.delimiter,
        self.prefix,
    )

    files = hook.list(bucket_name=self.bucket, prefix=self.prefix, delimiter=self.delimiter)

    s3_hook = S3Hook(
        aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify, extra_args=self.dest_s3_extra_args
    )

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
        # look for the bucket and the prefix to avoid looking into
        # parent directories/keys
        existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
        # in case no files exist, use an empty list to avoid errors
        existing_files = existing_files if existing_files is not None else []
        # remove the prefix from the existing files to allow the match
        existing_files = [file.replace(prefix, '', 1) for file in existing_files]
        files = list(set(files) - set(existing_files))

    if files:
        for file in files:
            with hook.provide_file(object_name=file, bucket_name=self.bucket) as local_tmp_file:
                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)
                s3_hook.load_file(
                    filename=local_tmp_file.name,
                    key=dest_key,
                    replace=self.replace,
                    acl_policy=self.s3_acl_policy,
                )

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
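# A minimal usage sketch for the execute() above, assuming it belongs to the
# GCSToS3Operator in airflow.providers.amazon.aws.transfers.gcs_to_s3; the DAG id,
# connection ids, and bucket/key names below are illustrative placeholders, not
# values taken from this code base.
from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.transfers.gcs_to_s3 import GCSToS3Operator

with DAG(dag_id="example_gcs_to_s3", start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    gcs_to_s3 = GCSToS3Operator(
        task_id="gcs_to_s3",
        bucket="my-gcs-bucket",                  # source GCS bucket to list
        prefix="data/",                          # only objects under this prefix are copied
        dest_aws_conn_id="aws_default",
        dest_s3_key="s3://my-s3-bucket/data/",   # destination prefix, split by S3Hook.parse_s3_url
        replace=False,                           # skip files already present under the S3 prefix
    )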
def _resolve_bucket_and_key(self, key):
    """If the key is a full S3 URI, parse the bucket from it; otherwise use the configured bucket_name."""
    if self.bucket_name is None:
        return S3Hook.parse_s3_url(key)
    else:
        parsed_url = urlparse(key)
        if parsed_url.scheme != '' or parsed_url.netloc != '':
            raise AirflowException('If bucket_name provided, bucket_key must be relative path, not URI.')
        return self.bucket_name, key
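# A sketch of the two key shapes the method above accepts (assumption: it lives on an
# S3 key sensor with an optional bucket_name attribute; the bucket and paths below are
# made-up examples).
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

# bucket_name not set: the key must be a full URI and S3Hook.parse_s3_url splits it.
assert S3Hook.parse_s3_url("s3://my-bucket/path/file.txt") == ("my-bucket", "path/file.txt")
# bucket_name set: the key is used as a relative path and returned with the configured
# bucket; passing a full s3:// URI in that case raises AirflowException in the method above.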
def execute(self, context):
    # use the parent operator's execute() to list all files in the Google Cloud Storage bucket
    files = super().execute(context)

    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
        # look for the bucket and the prefix to avoid looking into
        # parent directories/keys
        existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
        # in case no files exist, use an empty list to avoid errors
        existing_files = existing_files if existing_files is not None else []
        # remove the prefix from the existing files to allow the match
        existing_files = [file.replace(prefix, '', 1) for file in existing_files]
        files = list(set(files) - set(existing_files))

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)
            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)
            s3_hook.load_bytes(file_bytes, key=dest_key, replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def check_s3_url(self, s3url: str) -> bool:
    """
    Check if an S3 URL exists.

    :param s3url: S3 url
    :rtype: bool
    """
    bucket, key = S3Hook.parse_s3_url(s3url)
    if not self.s3_hook.check_for_bucket(bucket_name=bucket):
        raise AirflowException(f"The input S3 Bucket {bucket} does not exist")
    if (
        key
        and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)
        and not self.s3_hook.check_for_prefix(prefix=key, bucket_name=bucket, delimiter='/')
    ):
        # check whether the S3 key exists (the user provided a single file)
        # or the S3 prefix exists (the user provided multiple files under a prefix)
        raise AirflowException(
            f"The input S3 Key or Prefix {s3url} does not exist in the Bucket {bucket}"
        )
    return True
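# A standalone sketch of the same validation using S3Hook directly (assumption: the
# connection id and the bucket/key below are placeholders; the method above performs
# these checks through its own self.s3_hook).
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

s3_hook = S3Hook(aws_conn_id="aws_default")
bucket, key = S3Hook.parse_s3_url("s3://my-bucket/training/data.csv")
bucket_exists = s3_hook.check_for_bucket(bucket_name=bucket)
key_or_prefix_exists = s3_hook.check_for_key(key=key, bucket_name=bucket) or s3_hook.check_for_prefix(
    prefix=key, bucket_name=bucket, delimiter='/'
)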
def test_parse_s3_url(self):
    parsed = S3Hook.parse_s3_url(self.s3_test_url)
    self.assertEqual(parsed, ("test", "this/is/not/a-real-key.txt"), "Incorrect parsing of the s3 url")
def test_parse_s3_url(self):
    parsed = S3Hook.parse_s3_url("s3://test/this/is/not/a-real-key.txt")
    assert parsed == ("test", "this/is/not/a-real-key.txt"), "Incorrect parsing of the s3 url"
import pytest

from airflow.providers.amazon.aws.example_dags.example_google_api_to_s3_transfer_advanced import (
    S3_DESTINATION_KEY as ADVANCED_S3_DESTINATION_KEY,
)
from airflow.providers.amazon.aws.example_dags.example_google_api_to_s3_transfer_basic import (
    S3_DESTINATION_KEY as BASIC_S3_DESTINATION_KEY,
)
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from tests.providers.google.cloud.utils.gcp_authenticator import GMP_KEY
from tests.test_utils.amazon_system_helpers import (
    AWS_DAG_FOLDER,
    AmazonSystemTest,
    provide_aws_context,
    provide_aws_s3_bucket,
)
from tests.test_utils.gcp_system_helpers import GoogleSystemTest, provide_gcp_context

BASIC_BUCKET, _ = S3Hook.parse_s3_url(BASIC_S3_DESTINATION_KEY)
ADVANCED_BUCKET, _ = S3Hook.parse_s3_url(ADVANCED_S3_DESTINATION_KEY)


@pytest.fixture
def provide_s3_bucket_basic():
    with provide_aws_s3_bucket(BASIC_BUCKET):
        yield


@pytest.fixture
def provide_s3_bucket_advanced():
    with provide_aws_s3_bucket(ADVANCED_BUCKET):
        yield