def test_normalize_s3_path(self):
    assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path("s3://buc/s/o/me/path.txt")
    assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path("s3a://buc/s/o/me/path.txt")
    assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path("s3n://buc/s/o/me/path.txt")
def __init__(self, application_buckets, data_buckets, aws_credentials_api,
             aws_credentials_s3_put, aws_credentials_s3_del):
    self.app_buckets = application_buckets
    self.data_buckets = data_buckets
    self.s3_util_api = S3Util(aws_credentials_api)
    self.s3_util_put = S3Util(aws_credentials_s3_put)
    self.s3_util_del = S3Util(aws_credentials_s3_del)
def _extract_set_of_s3_buckets(*args, **kwargs):
    """
    Process all arguments and extract the set of S3 bucket names from those
    arguments that represent S3 paths.
    """
    # Extract arguments which are S3 paths from args and kwargs.
    s3_paths = []
    s3_paths.extend(filter(lambda el: S3Util.is_s3_path(el), args))
    s3_paths.extend(filter(lambda el: S3Util.is_s3_path(el), kwargs.values()))

    # Extract S3 bucket names from s3_paths.
    s3_buckets = map(lambda s3_path: S3Util.get_bucket_and_key(s3_path)[0], s3_paths)

    return set(s3_buckets)
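# Illustrative sketch (not from the original module): how the helper above could
# behave for a mix of S3 and non-S3 arguments. The argument values and the
# assertion below are hypothetical examples, assuming S3Util.is_s3_path and
# S3Util.get_bucket_and_key behave as shown elsewhere in this section.
def _example_extract_buckets():
    buckets = _extract_set_of_s3_buckets(
        "s3://bucket-a/some/key.txt",
        "/local/path.txt",
        destination="s3://bucket-b/other/key.txt"
    )
    # Only the two S3 paths contribute bucket names; the local path is ignored.
    assert buckets == {"bucket-a", "bucket-b"}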
def test_is_s3_path(self):
    assert S3Util.is_s3_path("s3a://buc/s/o/me/path.txt")
    assert S3Util.is_s3_path("s3n://buc/s/o/me/path.txt")
    assert S3Util.is_s3_path("s3://buc/s/o/me/path.txt")
    assert S3Util.is_s3_path("s3://bucket/")
    assert not S3Util.is_s3_path("hdfs://sandbox.com:8020/user/it1/.staging")
    assert not S3Util.is_s3_path("/local/path.txt")
    assert not S3Util.is_s3_path("relative/path.txt")
    assert not S3Util.is_s3_path(["s3://bucket/"])
    assert not S3Util.is_s3_path(3)
    assert not S3Util.is_s3_path({})
def dump_data_to_s3(self, object_key, data, s3_resource=None):
    # Use self.s3_resource if s3_resource is not specified.
    if s3_resource is None:
        s3_resource = self.s3_resource

    bucket_name, object_path = S3Util.get_bucket_and_key(object_key)
    s3_resource.Bucket(bucket_name).put_object(Key=object_path, Body=data)
def test_move_object(self):
    test_src_bucket_name = "test_src_bucket"
    test_destination_bucket_name = "test_destination_bucket"
    test_src_key = "test_src_key"
    test_destination_key = "test_destination_key"
    test_content = "aaa1"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_src_bucket_name)
    s3_resource.create_bucket(Bucket=test_destination_bucket_name)
    s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key, Body=test_content)

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.move_object(
        "s3://" + test_src_bucket_name + "/" + test_src_key,
        "s3://" + test_destination_bucket_name + "/" + test_destination_key
    )

    destination_objects = list(s3_resource.Bucket(test_destination_bucket_name).objects.all())
    assert len(destination_objects) == 1
    assert destination_objects[0].key == test_destination_key

    src_objects = list(s3_resource.Bucket(test_src_bucket_name).objects.all())
    assert len(src_objects) == 0
def is_s3_path(*args, **kwargs):
    """
    With this one function we test the invocation of static methods.
    """
    FakeS3Util.calls.append(
        Invocation(AWSCredentials("", ""), "is_s3_path", args, kwargs)
    )
    return S3Util.is_s3_path(*args, **kwargs)
def __init__(
        self,
        emr_cluster_id,
        aws_region,
        aws_credentials,
        timeout_seconds,
        polling_interval_seconds,
        long_timeout_seconds,
):
    """
    Initialize EMR Cluster Client

    :param emr_cluster_id: id of the EMR cluster
    :param aws_region: AWS region where the cluster is running
    :param aws_credentials: AWSCredentials object holding access_key_id and secret_access_key
    :param timeout_seconds: request timeout in seconds
    :param polling_interval_seconds: polling interval time in seconds
    :param long_timeout_seconds: timeout for Spark steps in seconds
    """
    self.emr_cluster_id = emr_cluster_id
    self.aws_region = aws_region
    self.timeout_seconds = self._validate_float("timeout_seconds", timeout_seconds)
    self.polling_interval_seconds = self._validate_float("polling_interval_seconds", polling_interval_seconds)
    self.long_timeout_seconds = self._validate_float("long_timeout_seconds", long_timeout_seconds)

    self.client = boto3.client(
        'emr',
        aws_access_key_id=aws_credentials.access_key_id,
        aws_secret_access_key=aws_credentials.secret_access_key,
        region_name=aws_region
    )

    self.s3_util = S3Util(aws_credentials)
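# Illustrative construction sketch (not from the original module): all values are
# placeholders, and AWSCredentials is assumed to take (access_key_id,
# secret_access_key) positionally, as it does in the tests in this section.
def _example_create_emr_cluster_client():
    credentials = AWSCredentials("fake_access_key_id", "fake_secret_access_key")
    return EMRClusterClient(
        emr_cluster_id="j-EXAMPLE",
        aws_region="us-east-1",
        aws_credentials=credentials,
        timeout_seconds=120.0,
        polling_interval_seconds=5.0,
        long_timeout_seconds=3600.0
    )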
def test_wait_for_file_availability(self):
    bucket = "cur_bucket"
    key = "stdout.txt"
    data = "no output"

    s3_full_path = "s3://{}/{}".format(bucket, key)

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=bucket)

    def create_file():
        s3_resource.Bucket(bucket).put_object(Key=key, Body=data)

    s3_util = S3Util(AWSCredentials("", ""))

    polling_interval = 0.02
    timeout = 0.5

    with ConcurrentExecutor(create_file, 0.2):
        s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout)

    s3_util.delete_object(s3_full_path)

    err_msg = "File {} failed to be available after {} seconds.".format(s3_full_path, timeout)

    with pytest.raises(M3DAWSAPIException, match=err_msg):
        s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout)
def get_object_content_from_s3(self, file_path_s3, s3_resource=None):
    # Use self.s3_resource if s3_resource is not specified.
    if s3_resource is None:
        s3_resource = self.s3_resource

    bucket, key = S3Util.get_bucket_and_key(file_path_s3)
    return s3_resource.Object(bucket, key).get()["Body"].read().decode("utf-8")
def get_child_objects(self, dir_path_s3, s3_resource=None):
    # Use self.s3_resource if s3_resource is not specified.
    if s3_resource is None:
        s3_resource = self.s3_resource

    bucket_name, key = S3Util.get_bucket_and_key(dir_path_s3)
    child_objects = s3_resource.Bucket(bucket_name).objects.filter(Prefix=key)
    child_files = ["s3://" + bucket_name + "/" + obj.key for obj in child_objects]
    return child_files
def test_delete_object(self):
    test_bucket_name = "test_bucket"
    test_key = "test_dir/test_key"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)
    s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key)

    remaining_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(remaining_objects) == 0
def test_upload_object(self):
    test_bucket_name = "test_bucket"
    test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"
    file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.upload_object(file_name, "s3://" + test_bucket_name + "/" + test_key)

    s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(s3_objects) == 1
    assert s3_objects[0].key == test_key
def create_output_file(self, step, dummy_text):
    logging.info("step={{id: {}, state: {}}}".format(step.id, step.state))

    step_id = step.id
    s3_log_file_location = "s3://{}/log/{}/steps/{}/stdout.gz" \
        .format(self.default_log_bucket, self.emr_cluster_id, step_id)
    local_log_file_location = self.local_run_dir.join("stdout.gz")

    logging.info("local_log_file_location={}".format(local_log_file_location))
    logging.info("s3_log_file_location={}".format(str(s3_log_file_location)))

    with gzip.open(str(local_log_file_location), 'wb') as f:
        f.write(dummy_text.encode("utf-8"))

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.upload_object(str(local_log_file_location), str(s3_log_file_location))
def test_delete_objects(self):
    test_bucket_name = "test_bucket"
    test_prefix = "test_dir"
    test_keys = [
        "test_key1",
        "{}/test_key2".format(test_prefix),
        "{}/test_key3".format(test_prefix),
        "{}/test_key4".format(test_prefix)
    ]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    for key in test_keys:
        s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix)

    remaining_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(remaining_objects) == 1
    assert remaining_objects[0].key == test_keys[0]
def test_list_objects_in_bucket(self):
    test_bucket_name = "test_bucket"
    test_prefix = "test_dir"
    test_keys = [
        "test_key1",
        "{}/test_key2".format(test_prefix),
        "{}/test_key3".format(test_prefix),
        "{}/test_key4".format(test_prefix)
    ]
    test_resources = ["s3://{}/".format(test_bucket_name) + key for key in test_keys]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    for key in test_keys:
        s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    keys = s3_util.list_objects("s3://" + test_bucket_name + "/" + test_prefix)

    assert keys == test_resources[1:4]
def test_move_objects(self):
    test_src_bucket_name = "test_src_bucket"
    test_destination_bucket_name = "test_destination_bucket"
    test_src_prefix = "test_src_dir"
    test_destination_prefix = "test_destination_dir"
    test_src_keys = [
        "test_key1",
        "{}/test_key2".format(test_src_prefix),
        "{}/test_key3".format(test_src_prefix),
        "{}/test_key4".format(test_src_prefix)
    ]
    test_destination_keys = [
        "{}/test_key2".format(test_destination_prefix),
        "{}/test_key3".format(test_destination_prefix),
        "{}/test_key4".format(test_destination_prefix)
    ]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_src_bucket_name)
    s3_resource.create_bucket(Bucket=test_destination_bucket_name)

    for key in test_src_keys:
        s3_resource.Bucket(test_src_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.move_objects(
        "s3://" + test_src_bucket_name + "/" + test_src_prefix,
        "s3://" + test_destination_bucket_name + "/" + test_destination_prefix
    )

    src_objects = list(s3_resource.Bucket(test_src_bucket_name).objects.all())
    assert len(src_objects) == 1
    assert src_objects[0].key == test_src_keys[0]

    destination_objects = s3_resource.Bucket(test_destination_bucket_name).objects.all()
    assert sorted(map(lambda x: x.key, destination_objects)) == test_destination_keys
def get_step_err_path(self, emr_step_id):
    """
    Get the S3 location of the error log file for a particular failed step

    :param emr_step_id: EMR step id
    :return: S3 location of the file
    """
    _, emr_step_failure_details = self.get_step_status(emr_step_id)

    if emr_step_failure_details is None or emr_step_failure_details.log_file is None:
        return None

    error_file_path = emr_step_failure_details.log_file
    if not error_file_path.endswith(EMRClusterClient.AWSConstants.EMR_STEP_ERROR_FILE_NAME):
        error_file_path = os.path.join(
            error_file_path,
            EMRClusterClient.AWSConstants.EMR_STEP_ERROR_FILE_NAME
        )

    # error_file_path can have the 's3n://' protocol instead of 's3://', so we normalize it.
    error_file = S3Util.normalize_s3_path(error_file_path)

    return error_file
def get_step_output_path(self, emr_step_id):
    """
    Get the S3 location of the stdout file for a particular step

    :param emr_step_id: EMR step id
    :return: S3 location of the file
    """
    response = self.client.describe_cluster(ClusterId=self.emr_cluster_id)

    cluster_log_location = \
        response[EMRClusterClient.EMRResponseFields.CLUSTER][EMRClusterClient.EMRResponseFields.LOG_URI]

    out_file_path = cluster_log_location + \
        EMRClusterClient.AWSConstants.EMR_STEP_OUTPUT_FILE_NAME.format(
            self.emr_cluster_id,
            emr_step_id
        )

    # out_file_path can have the 's3n://' protocol instead of 's3://', so we normalize it.
    out_file = S3Util.normalize_s3_path(out_file_path)

    return out_file