Example #1
    def test_normalize_s3_path(self):
        assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path(
            "s3://buc/s/o/me/path.txt")
        assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path(
            "s3a://buc/s/o/me/path.txt")
        assert "s3://buc/s/o/me/path.txt" == S3Util.normalize_s3_path(
            "s3n://buc/s/o/me/path.txt")
Example #2
    def __init__(self, application_buckets, data_buckets, aws_credentials_api,
                 aws_credentials_s3_put, aws_credentials_s3_del):
        self.app_buckets = application_buckets
        self.data_buckets = data_buckets

        self.s3_util_api = S3Util(aws_credentials_api)
        self.s3_util_put = S3Util(aws_credentials_s3_put)
        self.s3_util_del = S3Util(aws_credentials_s3_del)
Example #3
    def _extract_set_of_s3_buckets(*args, **kwargs):
        """
        Scan all positional and keyword arguments and extract the set of S3 buckets from those that represent S3 paths.
        """
        # extract arguments which are S3 paths from args and kwargs
        s3_paths = []
        s3_paths.extend(filter(lambda el: S3Util.is_s3_path(el), args))
        s3_paths.extend(filter(lambda el: S3Util.is_s3_path(el), kwargs.values()))

        # extract S3 bucket names from s3_paths
        s3_buckets = map(lambda s3_path: S3Util.get_bucket_and_key(s3_path)[0], s3_paths)

        return set(s3_buckets)
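A hypothetical invocation of this helper, mixing S3 paths with arguments that should be filtered out (bucket names are illustrative, and the helper is shown as a direct call for brevity):

s3_buckets = _extract_set_of_s3_buckets(
    "s3://bucket-a/data/file.txt",      # positional S3 path
    "/local/ignored.txt",               # not an S3 path, dropped by the filter
    destination="s3://bucket-b/out/")   # keyword S3 path
assert s3_buckets == {"bucket-a", "bucket-b"}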
Example #4
    def test_is_s3_path(self):
        assert S3Util.is_s3_path("s3a://buc/s/o/me/path.txt")
        assert S3Util.is_s3_path("s3n://buc/s/o/me/path.txt")
        assert S3Util.is_s3_path("s3://buc/s/o/me/path.txt")
        assert S3Util.is_s3_path("s3://bucket/")

        assert not S3Util.is_s3_path(
            "hdfs://sandbox.com:8020/user/it1/.staging")

        assert not S3Util.is_s3_path("/local/path.txt")
        assert not S3Util.is_s3_path("relative/path.txt")

        assert not S3Util.is_s3_path(["s3://bucket/"])
        assert not S3Util.is_s3_path(3)
        assert not S3Util.is_s3_path({})
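A minimal standalone sketch of an is_s3_path consistent with these assertions (assumed, not the project's actual code):

def is_s3_path(candidate):
    # Non-strings (lists, ints, dicts) can never be S3 paths.
    if not isinstance(candidate, str):
        return False
    # Accept the plain scheme as well as the Hadoop s3a/s3n variants.
    return candidate.startswith(("s3://", "s3a://", "s3n://"))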
Example #5
    def dump_data_to_s3(self, object_key, data, s3_resource=None):
        # Use self.s3_resource if s3_resource is not specified.
        if s3_resource is None:
            s3_resource = self.s3_resource

        bucket_name, object_path = S3Util.get_bucket_and_key(object_key)
        s3_resource.Bucket(bucket_name).put_object(Key=object_path, Body=data)
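dump_data_to_s3 relies on S3Util.get_bucket_and_key, which none of these examples define. A plausible standalone sketch, assuming the usual "s3://bucket/key" layout (a guess, not the project's code):

import re

def get_bucket_and_key(s3_path):
    # "s3://bucket/some/key" -> ("bucket", "some/key")
    without_scheme = re.sub(r"^s3[an]?://", "", s3_path)
    bucket, _, key = without_scheme.partition("/")
    return bucket, key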
Example #6
    def test_move_object(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_key = "test_src_key"
        test_destination_key = "test_destination_key"
        test_content = "aaa1"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)
        s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key,
                                                            Body=test_content)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_object(
            ("s3://" + test_src_bucket_name + "/" + test_src_key),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_key))

        destination_objects = list(
            s3_resource.Bucket(test_destination_bucket_name).objects.all())
        assert len(destination_objects) == 1
        assert destination_objects[0].key == test_destination_key

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 0
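A sketch of a move_object consistent with this test, using boto3's copy-then-delete idiom (an assumption; S3 has no native move operation):

def move_object(self, src_s3_path, destination_s3_path, s3_resource=None):
    if s3_resource is None:
        s3_resource = self.s3_resource
    src_bucket, src_key = S3Util.get_bucket_and_key(src_s3_path)
    dst_bucket, dst_key = S3Util.get_bucket_and_key(destination_s3_path)
    # Copy to the destination, then delete the source.
    s3_resource.Object(dst_bucket, dst_key).copy_from(
        CopySource={"Bucket": src_bucket, "Key": src_key})
    s3_resource.Object(src_bucket, src_key).delete()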
Example #7
    def is_s3_path(*args, **kwargs):
        """
        This single function is used to test the invocation of static methods
        """
        FakeS3Util.calls.append(
            Invocation(AWSCredentials("", ""), "is_s3_path", args, kwargs))
        return S3Util.is_s3_path(*args, **kwargs)
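Invocation and FakeS3Util.calls are not defined anywhere in these examples; a hypothetical recording scaffold that would make the fake work could look like this:

import collections

# Assumed shape of the call record appended by the fake above.
Invocation = collections.namedtuple(
    "Invocation", ["credentials", "method", "args", "kwargs"])

class FakeS3Util(object):
    calls = []  # class-level log of invocations, inspected by tests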
Example #8
    def __init__(
            self,
            emr_cluster_id,
            aws_region,
            aws_credentials,
            timeout_seconds,
            polling_interval_seconds,
            long_timeout_seconds,
    ):
        """
        Initialize EMR Cluster Client

        :param emr_cluster_id: id of the EMR cluster
        :param aws_region: AWS region where the cluster is running
        :param aws_credentials: AWSCredentials object holding access_key_id and secret_access_key
        :param timeout_seconds: request timeout in seconds
        :param polling_interval_seconds: polling interval time in seconds
        :param long_timeout_seconds: timeout for Spark steps in seconds
        """
        self.emr_cluster_id = emr_cluster_id
        self.aws_region = aws_region

        self.timeout_seconds = self._validate_float("timeout_seconds", timeout_seconds)
        self.polling_interval_seconds = self._validate_float("polling_interval_seconds", polling_interval_seconds)
        self.long_timeout_seconds = self._validate_float("long_timeout_seconds", long_timeout_seconds)

        self.client = boto3.client(
            'emr',
            aws_access_key_id=aws_credentials.access_key_id,
            aws_secret_access_key=aws_credentials.secret_access_key,
            region_name=aws_region
        )

        self.s3_util = S3Util(aws_credentials)
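_validate_float is referenced but not shown; a minimal sketch of what it presumably does (an assumption; the real validator may enforce more):

def _validate_float(name, value):
    # Coerce the timeout and interval settings to float, failing fast on bad input.
    try:
        return float(value)
    except (TypeError, ValueError):
        raise ValueError("{} must be a number, got {!r}".format(name, value))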
Example #9
    def test_wait_for_file_availability(self):
        bucket = "cur_bucket"
        key = "stdout.txt"
        data = "no output"

        s3_full_path = "s3://{}/{}".format(bucket, key)

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=bucket)

        def create_file():
            s3_resource.Bucket(bucket).put_object(Key=key, Body=data)

        s3_util = S3Util(AWSCredentials("", ""))

        polling_interval = 0.02
        timeout = 0.5

        with ConcurrentExecutor(create_file, 0.2):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)

        s3_util.delete_object(s3_full_path)

        err_msg = "File {} failed to be available after {} seconds.".format(
            s3_full_path, timeout)

        with pytest.raises(M3DAWSAPIException, match=err_msg):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)
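wait_for_file_availability itself is not shown. A sketch of the polling loop this test implies, with a stand-in for the project's M3DAWSAPIException and assuming the list_objects helper exercised in example #16:

import time

class M3DAWSAPIException(Exception):
    """Stand-in for the project's exception type."""

def wait_for_file_availability(s3_util, s3_full_path, polling_interval_seconds, timeout_seconds):
    # Re-check until the object appears, or give up after the timeout.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if s3_util.list_objects(s3_full_path):
            return
        time.sleep(polling_interval_seconds)
    raise M3DAWSAPIException(
        "File {} failed to be available after {} seconds.".format(
            s3_full_path, timeout_seconds))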
Example #10
    def get_object_content_from_s3(self, file_path_s3, s3_resource=None):
        # Use self.s3_resource if s3_resource is not specified.
        if s3_resource is None:
            s3_resource = self.s3_resource

        bucket, key = S3Util.get_bucket_and_key(file_path_s3)
        return s3_resource.Object(bucket,
                                  key).get()["Body"].read().decode("utf-8")
Example #11
    def get_child_objects(self, dir_path_s3, s3_resource=None):
        # Use self.s3_resource if s3_resource is not specified.
        if s3_resource is None:
            s3_resource = self.s3_resource

        bucket_name, key = S3Util.get_bucket_and_key(dir_path_s3)
        child_objects = s3_resource.Bucket(bucket_name).objects.filter(
            Prefix=key)
        child_files = [
            "s3://" + bucket_name + "/" + obj.key for obj in child_objects
        ]
        return child_files
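A hypothetical call (bucket and prefix are illustrative):

s3_util = S3Util(AWSCredentials("", ""))
children = s3_util.get_child_objects("s3://my-bucket/landing/2020/")
# e.g. ["s3://my-bucket/landing/2020/part-0000", ...]

Note that Prefix filtering is a plain string match, so "landing/2020" would also match keys such as "landing/2020_old"; including the trailing slash avoids that.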
Example #12
    def test_delete_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_dir/test_key"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 0
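The delete_object under test is plausibly a thin wrapper over boto3 (assumed, not the actual code):

def delete_object(self, s3_path, s3_resource=None):
    if s3_resource is None:
        s3_resource = self.s3_resource
    bucket_name, key = S3Util.get_bucket_and_key(s3_path)
    s3_resource.Object(bucket_name, key).delete()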
Example #13
    def test_upload_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"
        file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.upload_object(file_name,
                              "s3://" + test_bucket_name + "/" + test_key)

        s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(s3_objects) == 1
        assert s3_objects[0].key == test_key
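A sketch of upload_object consistent with this test, mirroring the s3_resource handling seen in dump_data_to_s3 (assumed):

def upload_object(self, local_file_path, s3_path, s3_resource=None):
    # Use self.s3_resource if s3_resource is not specified.
    if s3_resource is None:
        s3_resource = self.s3_resource
    bucket_name, key = S3Util.get_bucket_and_key(s3_path)
    s3_resource.Bucket(bucket_name).upload_file(local_file_path, key)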
Example #14
    def create_output_file(self, step, dummy_text):
        logging.info("step={{id: {}, state: {}}}".format(step.id, step.state))
        step_id = step.id
        s3_log_file_location = "s3://{}/log/{}/steps/{}/stdout.gz" \
            .format(self.default_log_bucket, self.emr_cluster_id, step_id)

        local_log_file_location = self.local_run_dir.join("stdout.gz")

        logging.info(
            "local_log_file_location={}".format(local_log_file_location))
        logging.info("s3_log_file_location={}".format(
            str(s3_log_file_location)))

        with gzip.open(str(local_log_file_location), 'wb') as f:
            f.write(dummy_text.encode("utf-8"))

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.upload_object(str(local_log_file_location),
                              str(s3_log_file_location))
Example #15
    def test_delete_objects(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 1
        assert remaining_objects[0].key == test_keys[0]
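A plausible delete_objects matching this test, using boto3's batch delete on a filtered collection (assumed):

def delete_objects(self, s3_prefix_path, s3_resource=None):
    if s3_resource is None:
        s3_resource = self.s3_resource
    bucket_name, prefix = S3Util.get_bucket_and_key(s3_prefix_path)
    # Removes every object whose key starts with the prefix, in batches.
    s3_resource.Bucket(bucket_name).objects.filter(Prefix=prefix).delete()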
Example #16
    def test_list_objects_in_bucket(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]
        test_resources = [
            "s3://{}/".format(test_bucket_name) + key for key in test_keys
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        keys = s3_util.list_objects("s3://" + test_bucket_name + "/" +
                                    test_prefix)

        assert keys == test_resources[1:4]
Example #17
    def test_move_objects(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_prefix = "test_src_dir"
        test_destination_prefix = "test_destination_dir"
        test_src_keys = [
            "test_key1", "{}/test_key2".format(test_src_prefix),
            "{}/test_key3".format(test_src_prefix),
            "{}/test_key4".format(test_src_prefix)
        ]
        test_destination_keys = [
            "{}/test_key2".format(test_destination_prefix),
            "{}/test_key3".format(test_destination_prefix),
            "{}/test_key4".format(test_destination_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)

        for key in test_src_keys:
            s3_resource.Bucket(test_src_bucket_name).put_object(Key=key,
                                                                Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_objects(
            ("s3://" + test_src_bucket_name + "/" + test_src_prefix),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_prefix))

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 1
        assert src_objects[0].key == test_src_keys[0]

        destination_objects = s3_resource.Bucket(
            test_destination_bucket_name).objects.all()
        assert sorted(map(lambda x: x.key,
                          destination_objects)) == test_destination_keys
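A sketch of move_objects built on the move_object and list_objects helpers exercised elsewhere in these examples (assumed; list_objects is taken to return full s3:// paths, as example #16 shows):

def move_objects(self, src_prefix_path, destination_prefix_path):
    # Move each object under the source prefix by rewriting its path prefix.
    for src_path in self.list_objects(src_prefix_path):
        destination_path = src_path.replace(
            src_prefix_path, destination_prefix_path, 1)
        self.move_object(src_path, destination_path)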
Example #18
    def get_step_err_path(self, emr_step_id):
        """
        Get the S3 location of the error log file for a particular failed step
        :param emr_step_id: EMR step id
        :return: S3 location of the file
        """
        _, emr_step_failure_details = self.get_step_status(emr_step_id)

        if emr_step_failure_details is None or emr_step_failure_details.log_file is None:
            return None

        error_file_path = emr_step_failure_details.log_file

        if not error_file_path.endswith(EMRClusterClient.AWSConstants.EMR_STEP_ERROR_FILE_NAME):
            error_file_path = os.path.join(
                error_file_path,
                EMRClusterClient.AWSConstants.EMR_STEP_ERROR_FILE_NAME
            )

        # error_file_path may use the 's3n://' scheme instead of 's3://', so normalize it.
        error_file = S3Util.normalize_s3_path(error_file_path)

        return error_file
Example #19
    def get_step_output_path(self, emr_step_id):
        """
        Get the S3 location of the stdout file for a particular step
        :param emr_step_id: EMR step id
        :return: S3 location of the file
        """
        response = self.client.describe_cluster(
            ClusterId=self.emr_cluster_id
        )

        cluster_log_location = \
            response[EMRClusterClient.EMRResponseFields.CLUSTER][EMRClusterClient.EMRResponseFields.LOG_URI]

        out_file_path = cluster_log_location + \
            EMRClusterClient.AWSConstants.EMR_STEP_OUTPUT_FILE_NAME.format(
                self.emr_cluster_id,
                emr_step_id
            )

        # out_file_path may use the 's3n://' scheme instead of 's3://', so normalize it.
        out_file = S3Util.normalize_s3_path(out_file_path)

        return out_file