Example #1
    def test_move_object(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_key = "test_src_key"
        test_destination_key = "test_destination_key"
        test_content = "aaa1"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)
        s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key,
                                                            Body=test_content)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_object(
            ("s3://" + test_src_bucket_name + "/" + test_src_key),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_key))

        destination_objects = list(
            s3_resource.Bucket(test_destination_bucket_name).objects.all())
        assert len(destination_objects) == 1
        assert destination_objects[0].key == test_destination_key

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 0
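
For context, S3 has no native move operation, so a method like S3Util.move_object is typically a server-side copy followed by a delete of the source. A minimal sketch in plain boto3, assuming that behavior (the _parse_s3_path helper is hypothetical, not part of the original API):

import boto3


def _parse_s3_path(s3_path):
    # Hypothetical helper: split "s3://bucket/key" into (bucket, key).
    bucket, _, key = s3_path[len("s3://"):].partition("/")
    return bucket, key


def move_object(src_path, destination_path):
    # Sketch only: copy the object server-side, then delete the source.
    s3 = boto3.resource("s3")
    src_bucket, src_key = _parse_s3_path(src_path)
    destination_bucket, destination_key = _parse_s3_path(destination_path)
    s3.Object(destination_bucket, destination_key).copy_from(
        CopySource={"Bucket": src_bucket, "Key": src_key})
    s3.Object(src_bucket, src_key).delete()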
Example #2
    def test_wait_for_file_availability(self):
        bucket = "cur_bucket"
        key = "stdout.txt"
        data = "no output"

        s3_full_path = "s3://{}/{}".format(bucket, key)

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=bucket)

        def create_file():
            s3_resource.Bucket(bucket).put_object(Key=key, Body=data)

        s3_util = S3Util(AWSCredentials("", ""))

        polling_interval = 0.02
        timeout = 0.5

        with ConcurrentExecutor(create_file, 0.2):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)

        s3_util.delete_object(s3_full_path)

        err_msg = "File {} failed to be available after {} seconds.".format(
            s3_full_path, timeout)

        with pytest.raises(M3DAWSAPIException, match=err_msg):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)
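
The first wait above succeeds only because create_file runs concurrently; the second times out because the object has been deleted. A plausible polling loop for such a method, assuming head_object raises ClientError while the key is absent (the real class raises M3DAWSAPIException; TimeoutError stands in here):

import time

import boto3
import botocore.exceptions


def wait_for_file_availability(s3_path, polling_interval, timeout):
    # Sketch: poll head_object until the key exists or the timeout elapses.
    bucket, _, key = s3_path[len("s3://"):].partition("/")
    s3_client = boto3.client("s3")
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            s3_client.head_object(Bucket=bucket, Key=key)
            return
        except botocore.exceptions.ClientError:
            time.sleep(polling_interval)
    raise TimeoutError("File {} failed to be available after {} seconds."
                       .format(s3_path, timeout))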
Example #3
    def env_setup(emr_cluster_name, aws_region, aws_credentials, timeout_ms,
                  retry_ms, long_timeout_ms):
        run_job_flow_args = dict(
            Instances={
                'InstanceCount': 3,
                'KeepJobFlowAliveWhenNoSteps': True,
                'MasterInstanceType': 'c3.medium',
                'Placement': {
                    'AvailabilityZone': 'test_zone'
                },
                'SlaveInstanceType': 'c3.xlarge',
            },
            JobFlowRole='EMR_EC2_DefaultRole',
            LogUri='s3://mybucket/log/',
            Name=emr_cluster_name,
            ServiceRole='EMR_DefaultRole',
            VisibleToAllUsers=True)

        emr_client = Boto3Util.create_emr_client(aws_region)
        emr_cluster_id = emr_client.run_job_flow(
            **run_job_flow_args)['JobFlowId']

        emr_cluster_client = EMRClusterClient(emr_cluster_id, aws_region,
                                              aws_credentials, timeout_ms,
                                              retry_ms, long_timeout_ms)

        return emr_cluster_client, emr_cluster_id
Example #4
    def test_drop_tables_one_hive_table_fails_to_drop_2(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: self.LAKE_SPEC.table in x)
        s3_table.drop_tables()

        assert len(s3_table.emr_system.statements) == 1

        # nothing should be deleted by drop_tables() call
        assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
        assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
Example #5
    def test_drop_tables_successful_execution(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: True)
        s3_table.drop_tables()

        assert len(s3_table.emr_system.statements) == 2

        # nothing should be deleted by drop_tables() call
        assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
        assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
Example #6
    def setup_method(self, method):
        super(EMRSystemUnitTestBase, self).setup_method(method)

        # Setup EMR mock
        self.mock_emr = moto.mock_emr()
        self.mock_emr.start()

        self.emr_cluster_name = "test cluster for unit and integration tests"

        run_job_flow_args = dict(
            Instances={
                'InstanceCount': 3,
                'KeepJobFlowAliveWhenNoSteps': True,
                'MasterInstanceType': 'c3.medium',
                'Placement': {
                    'AvailabilityZone': 'test_zone'
                },
                'SlaveInstanceType': 'c3.xlarge',
            },
            JobFlowRole='EMR_EC2_DefaultRole',
            LogUri='s3://{}/log/'.format(self.default_log_bucket),
            Name=self.emr_cluster_name,
            ServiceRole='EMR_DefaultRole',
            VisibleToAllUsers=True)

        emr_client = Boto3Util.create_emr_client(self.default_aws_region)
        self.emr_cluster_id = emr_client.run_job_flow(
            **run_job_flow_args)['JobFlowId']
        logging.debug("Test case specific EMR cluster id is {}".format(
            self.emr_cluster_id))

        # Setup S3 mock
        self.mock_s3 = moto.mock_s3()
        self.mock_s3.start()

        self.s3_resource = Boto3Util.create_s3_resource()
        self.s3_resource.create_bucket(Bucket=self.default_dev_landing_bucket)
        self.s3_resource.create_bucket(Bucket=self.default_dev_lake_bucket)
        self.s3_resource.create_bucket(Bucket=self.default_dev_mart_cal_bucket)
        self.s3_resource.create_bucket(
            Bucket=self.default_dev_application_bucket)
        self.s3_resource.create_bucket(Bucket=self.default_log_bucket)
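
setup_method starts two moto mocks, so the base class presumably stops them again after each test. A sketch of the matching teardown_method, which is an assumption here and not taken from the source:

    def teardown_method(self, method):
        # Assumption: stop the mocks started in setup_method, then defer
        # to the parent class for the rest of the cleanup.
        self.mock_s3.stop()
        self.mock_emr.stop()
        super(EMRSystemUnitTestBase, self).teardown_method(method)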
Example #7
    def test_drop_tables_both_tables_fail_to_drop(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: False)
        with pytest.raises(Exception, match="^Unable to drop any of the following tables.+"):
            s3_table.drop_tables()

        assert len(s3_table.emr_system.statements) == 0

        # nothing should be deleted by drop_tables() call
        assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
        assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
Example #8
    def test_truncate_tables_both_repairs_fail_unexpectedly(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: False)

        with pytest.raises(M3DException, match="^Failed to truncate any of the following tables: .+"):
            s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 0

        assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
        assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
Example #9
    def test_add_step_to_cluster_fail_without_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            def fail_step():
                fake_step.state = "FAILED"

            # Make sure that we do not wait 300 seconds for the gz file to become available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            err_msg = "File {} failed to be available after {} seconds.".\
                format(stderr_gz_path, self.timeout_seconds)

            with pytest.raises(M3DAWSAPIException, match=err_msg):
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step, 0.4):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)
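
ConcurrentExecutor is used in Examples #2, #9, and #20 but its definition is not part of this listing. A minimal sketch consistent with that usage, assuming it runs a callable on a background thread after a delay and joins on exit:

import threading
import time


class ConcurrentExecutor(object):
    # Sketch: invoke `func` on a background thread after `delay` seconds,
    # so code inside the `with` block can observe the side effect while it
    # is still polling; the thread is joined when the block exits.
    def __init__(self, func, delay):
        self._func = func
        self._delay = delay
        self._thread = threading.Thread(target=self._run)

    def _run(self):
        time.sleep(self._delay)
        self._func()

    def __enter__(self):
        self._thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._thread.join()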
Example #10
    def test_truncate_tables_both_repairs_fail_expectedly(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(
            s3_resource, lambda x: False,
            M3DEMRStepException("", "", "Table not found"))

        s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 0

        assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
        assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
Example #11
    def test_delete_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_dir/test_key"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 0
Example #12
    def test_upload_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"
        file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.upload_object(file_name,
                              "s3://" + test_bucket_name + "/" + test_key)

        s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(s3_objects) == 1
        assert s3_objects[0].key == test_key
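
Assuming upload_object is a thin wrapper over boto3's managed upload, a sketch (reusing the hypothetical path parsing from the Example #1 sketch):

import boto3


def upload_object(local_file_name, s3_path):
    # Sketch: upload a local file to the bucket/key encoded in s3_path.
    bucket, _, key = s3_path[len("s3://"):].partition("/")
    boto3.resource("s3").Bucket(bucket).upload_file(local_file_name, key)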
Example #13
    def test_truncate_tables_one_repair_fails_unexpectedly_1(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: self.LANDING_SPEC.table in x)
        s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 1

        landing_dir = "s3://{}/{}".format(self.LANDING_SPEC.bucket, self.LANDING_SPEC.data_dir)

        assert s3_table.emr_system.statements == [
            'ALTER TABLE {} SET LOCATION "{}";'.format(self.LANDING_SPEC.table, landing_dir)
        ]

        assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
        assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
Example #14
    def test_truncate_tables_one_repair_fails_unexpectedly_2(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: self.LAKE_SPEC.table in x)
        s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 1

        lake_dir = "s3://{}/{}".format(self.LAKE_SPEC.bucket, self.LAKE_SPEC.data_dir)

        assert s3_table.emr_system.statements == [
            'DROP TABLE {};\n'.format(self.LAKE_SPEC.table) +
            TestS3Table._get_table_ddl_lake(self.LAKE_SPEC.table, s3_table.columns_lake, lake_dir) + "\n" +
            'MSCK REPAIR TABLE {};'.format(self.LAKE_SPEC.table)
        ]

        assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
        assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
Example #15
    def test_truncate_tables_wrong_files_not_deleted(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: True)

        landing_extra_keys = sorted(["test_key1", "test_key2"])
        lake_extra_keys = sorted(["test_key1", "test_dir/test_key2"])

        for k in landing_extra_keys:
            s3_resource.Bucket(self.LANDING_SPEC.bucket).put_object(Key=k, Body="")
        for k in lake_extra_keys:
            s3_resource.Bucket(self.LAKE_SPEC.bucket).put_object(Key=k, Body="")

        s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 2

        assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == lake_extra_keys
        assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == landing_extra_keys
Example #16
    def test_truncate_tables_everything_deleted_successfully(self):
        s3_resource = Boto3Util.create_s3_resource()
        s3_table = self._create_s3_table(s3_resource, lambda x: True)
        s3_table.truncate_tables()

        assert len(s3_table.emr_system.statements) == 2

        landing_dir = "s3://{}/{}".format(self.LANDING_SPEC.bucket, self.LANDING_SPEC.data_dir)
        lake_dir = "s3://{}/{}".format(self.LAKE_SPEC.bucket, self.LAKE_SPEC.data_dir)

        expected_statements = [
            'ALTER TABLE {} SET LOCATION "{}";'.format(self.LANDING_SPEC.table, landing_dir),
            'DROP TABLE {};\n'.format(self.LAKE_SPEC.table) +
            TestS3Table._get_table_ddl_lake(self.LAKE_SPEC.table, s3_table.columns_lake, lake_dir) + "\n" +
            'MSCK REPAIR TABLE {};'.format(self.LAKE_SPEC.table)
        ]

        assert s3_table.emr_system.statements == expected_statements
        assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
        assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
Example #17
    def test_delete_objects(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 1
        assert remaining_objects[0].key == test_keys[0]
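
delete_objects removes everything under the given prefix while leaving keys outside it (test_key1) intact. Assuming it is backed by a filtered collection's batch delete, a sketch:

import boto3


def delete_objects(s3_prefix_path):
    # Sketch: batch-delete every key under the given prefix.
    bucket, _, prefix = s3_prefix_path[len("s3://"):].partition("/")
    boto3.resource("s3").Bucket(bucket).objects.filter(Prefix=prefix).delete()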
Example #18
    def test_list_objects_in_bucket(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]
        test_resources = [
            "s3://{}/".format(test_bucket_name) + key for key in test_keys
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        keys = s3_util.list_objects("s3://" + test_bucket_name + "/" +
                                    test_prefix)

        assert keys == test_resources[1:4]
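
Note that the assertion compares against full s3:// paths rather than bare keys, so list_objects evidently returns fully qualified resource paths. A sketch under that assumption:

import boto3


def list_objects(s3_prefix_path):
    # Sketch: list keys under a prefix as fully qualified s3:// paths.
    bucket, _, prefix = s3_prefix_path[len("s3://"):].partition("/")
    s3_bucket = boto3.resource("s3").Bucket(bucket)
    return [
        "s3://{}/{}".format(bucket, obj.key)
        for obj in s3_bucket.objects.filter(Prefix=prefix)
    ]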
Example #19
    def test_move_objects(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_prefix = "test_src_dir"
        test_destination_prefix = "test_destination_dir"
        test_src_keys = [
            "test_key1", "{}/test_key2".format(test_src_prefix),
            "{}/test_key3".format(test_src_prefix),
            "{}/test_key4".format(test_src_prefix)
        ]
        test_destination_keys = [
            "{}/test_key2".format(test_destination_prefix),
            "{}/test_key3".format(test_destination_prefix),
            "{}/test_key4".format(test_destination_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)

        for key in test_src_keys:
            s3_resource.Bucket(test_src_bucket_name).put_object(Key=key,
                                                                Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_objects(
            ("s3://" + test_src_bucket_name + "/" + test_src_prefix),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_prefix))

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 1
        assert src_objects[0].key == test_src_keys[0]

        destination_objects = s3_resource.Bucket(
            test_destination_bucket_name).objects.all()
        assert sorted(map(lambda x: x.key,
                          destination_objects)) == test_destination_keys
Example #20
    def test_add_step_to_cluster_fail_with_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            # Make sure that we do not wait 300 seconds for the gz file to become available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            expected_content = "Lots of content here!!!"

            def fail_step_and_write_output():
                fake_step.state = "FAILED"

                time.sleep(0.3)

                compressed_content = TestEMRClusterClient._compress_string(
                    expected_content)

                bucket, key = emr_cluster_client.s3_util.get_bucket_and_key(
                    stderr_gz_path)
                s3_resource.Bucket(bucket).put_object(Key=key,
                                                      Body=compressed_content)

            with pytest.raises(M3DAWSAPIException) as exc:
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step_and_write_output, 0.3):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)

            err_msg = "EMR Step with cluster_id='{}' and step_id='{}' failed to complete".\
                format(emr_cluster_id, emr_step_id)

            assert err_msg in str(exc.value)
            assert stderr_gz_path in str(exc.value)

            resulting_content = emr_cluster_client.s3_util.read_gzip_file_content(
                stderr_gz_path)
            assert expected_content == resulting_content
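
The final assertion round-trips gzipped content through S3. Assuming read_gzip_file_content downloads the object and decompresses it in memory, a sketch:

import gzip
import io

import boto3


def read_gzip_file_content(s3_path):
    # Sketch: fetch the object body and decompress it as gzip text.
    bucket, _, key = s3_path[len("s3://"):].partition("/")
    body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
    with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
        return gz.read().decode("utf-8")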
Example #21
def list_objects_in_bucket(s3_bucket):
    s3_resource = Boto3Util.create_s3_resource()
    objects = [
        obj.key for obj in s3_resource.Bucket(s3_bucket).objects.all()
    ]
    return sorted(objects)
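
A usage sketch for this helper under moto's S3 mock, assuming Boto3Util.create_s3_resource() wraps boto3.resource("s3") (the bucket name is hypothetical):

import moto

with moto.mock_s3():
    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket="demo_bucket")
    s3_resource.Bucket("demo_bucket").put_object(Key="b", Body="")
    s3_resource.Bucket("demo_bucket").put_object(Key="a", Body="")
    assert list_objects_in_bucket("demo_bucket") == ["a", "b"]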