Example #1
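These excerpts omit their import section. A minimal sketch of what they appear to rely on follows; the standard-library and third-party imports are unambiguous, while the m3d project classes and test helpers are only listed as comments because their exact module paths are not visible in the excerpts.

import json
import logging
import os

import py
import pytest
from unittest import mock
from unittest.mock import patch  # the project may use the standalone "mock" package instead

# Project classes referenced by the tests (import paths not shown in the excerpts):
#   M3D, EMRSystem, S3Table, Util, DataType, AconHelper, M3DIllegalArgumentException
# Test helpers defined elsewhere in the test suite (import paths not shown):
#   S3TableTestBase, EMRSystemUnitTestBase, FakeStep, AppendLoadConfig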
class TestLoadTableAppendS3(S3TableTestBase):
    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch, _0,
                               _1):
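        # patch() decorators are applied bottom-up, so the mock arguments arrive in
        # reverse decorator order: remove_json_patch for _remove_parameter_json,
        # add_tags_patch for _do_add_emr_cluster_tags, and _0/_1 for describe_step
        # and send_email, which are not inspected here.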
        # responses.add_passthru(self.default_server_url)

        partition_columns = ["year", "month", "day"]
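        # One regex per partition column: together they slice an 8-digit YYYYMMDD
        # token out of the landing file names (4-digit year, then month and day via
        # look-behind/look-ahead). The invalid-parameter tests below assert that the
        # two lists must have matching lengths.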
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''
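        # Spark settings are passed to load_table() as a JSON string; the external
        # parameters test below checks that they end up as --conf key=value pairs on
        # the spark-submit step.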
        null_value = "test_null_value"
        quote_character = "test_quote"
        compute_table_statistics = True

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            partition_columns,
            regex_filename,
            null_value=null_value,
            quote_character=quote_character,
            compute_table_statistics=compute_table_statistics)
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == config.load_type
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(
            config.s3_table.dir_apps_append_load)
        app_json_files = [
            app_file for app_file in app_files
            if os.path.basename(app_file).endswith(".json")
        ]
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake,
                                                  config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.s3_table.dir_landing_final,
            "header_dir": config.s3_table.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "partition_columns": partition_columns,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "compute_table_statistics": True
        }
        assert actual_parameters == expected_parameters

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "AppendLoad"},
            {"Key": "TargetTable", "Value": expected_table_full_name},
        ]
        actual_tags = add_tags_patch_call_args_list[0][0][0]
        assert sorted(actual_tags, key=lambda x: x["Key"]) == \
            sorted(expected_tags, key=lambda x: x["Key"])

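        # SparkExecutor._remove_parameter_json is patched, so the config JSON is
        # still on S3 for the checks above; here we verify that its cleanup would
        # have been requested exactly once, for the uploaded file.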
        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        # responses.add_passthru(self.default_server_url)

        partition_columns = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, regex_filename, "parquet")
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == config.load_type
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(
            config.s3_table.dir_apps_append_load)
        app_json_files = [
            app_file for app_file in app_files
            if os.path.basename(app_file).endswith(".json")
        ]
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake,
                                                  config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.s3_table.dir_landing_final,
            "header_dir": config.s3_table.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "partition_columns": partition_columns,
            "regex_filename": regex_filename,
            "file_format": "parquet"
        }
        assert actual_parameters == expected_parameters

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):
        # responses.add_passthru(self.default_server_url)

        partition_columns = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, regex_filename)
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id,
                          json.dumps(spark_external_parameters))

        # Check EMR step.
        assert len(fake_cluster.steps) == 1

        spark_step = fake_cluster.steps[0]

        # Check args of EMR step
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        expected_spark_conf_options = {
            "{}={}".format(key, value)
            for key, value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {spark_step.args[i] for i in (6, 8, 10)}
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == "AppendLoad"
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):
        # responses.add_passthru(self.default_server_url)

        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        config = AppendLoadConfig(
            self.local_run_dir, self.env_setup, ["year", "month"], [
                "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
                "(?<=[0-9]{6})([0-9]{2})"
            ])
        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        assert str(ex.value).startswith(
            "Lengths of partition_columns and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):
        # responses.add_passthru(self.default_server_url)

        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  ["year", "month", "day"], [])
        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        assert str(ex.value).startswith(
            "Lengths of partition_columns and regex_filename do not match")
Example #2
class TestTruncateTableS3Integration(S3TableTestBase):
    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_check_s3_cleanup(self, add_tags_patch, _):
        cluster_mode = False
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        source_system = "bi"
        table = "test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir, destination_system, destination_database,
            destination_environment, destination_table)

        table_config_args = [
            m3d_config_file, cluster_mode, destination_system,
            destination_database, destination_environment, destination_table
        ]

        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        db_landing = scon_emr_dict["environments"][destination_environment][
            "schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment][
            "schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment][
            "s3_buckets"]["lake"]

        test_content = "sample content"

        landing_dir = "{environment}/{source_system}/{table}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table)

        landing_data_dir = os.path.join(landing_dir, "data")
        landing_archive_dir = os.path.join(landing_dir, "archive")
        landing_work_dir = os.path.join(landing_dir, "work")

        landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")
        landing_archive_key = os.path.join(landing_archive_dir, "old_dump.gz")
        landing_work_key = os.path.join(landing_work_dir, "temporary_data")

        lake_dir = "{environment}/{source_system}/{table}".format(
            environment=destination_environment,
            db_cd=db_lake,
            source_system=source_system,
            table=table)

        lake_data_dir = os.path.join(lake_dir, "data")
        lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_data_key, Body=test_content)
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_archive_key, Body=test_content)
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_work_key, Body=test_content)

        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key,
                                                        Body=test_content)
        logging.info("Calling  M3D.truncate_table()")
        M3D.truncate_table(*table_config_args, **table_config_kwargs)

        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert len(fake_cluster.steps) == 2

        # Get actual HQL statements
        actual_hqls = []

        for step in fake_cluster.steps:
            assert ["hive", "--silent", "-f"] == step.args[0:3]

            hql_file = step.args[3]
            hql_content = self.get_object_content_from_s3(hql_file)
            actual_hqls.append(hql_content)

        db_table_landing = "{}.{}{}".format(
            db_landing, destination_table,
            m3d_config_dict["tags"]["table_suffix_stage"])
        landing_table_location = os.path.join("s3://", bucket_landing,
                                              landing_data_dir, "")

        db_table_lake = "{}.{}".format(db_lake, destination_table)
        lake_table_location = os.path.join("s3://", bucket_lake, lake_data_dir,
                                           "")

        landing_hql = "ALTER TABLE {} SET LOCATION \"{}\";".format(
            db_table_landing, landing_table_location)
        lake_hql = "\n".join([
            "DROP TABLE {};".format(db_table_lake),
            TestTruncateTableS3Integration._get_table_ddl_lake(
                db_table_lake, lake_table_location),
            "MSCK REPAIR TABLE {};".format(db_table_lake)
        ])
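        # Expected truncate behaviour: the landing table is re-pointed at its data
        # location via ALTER TABLE ... SET LOCATION, while the lake table is dropped,
        # recreated with the same DDL and then repaired with MSCK.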

        expected_hqls = [landing_hql, lake_hql]

        assert actual_hqls == expected_hqls

        # checking landing directory
        landing_files = [
            k.key
            for k in self.s3_resource.Bucket(bucket_landing).objects.all()
        ]
        assert len(landing_files) == 0

        # checking lake directory
        lake_files = [
            k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()
        ]
        assert len(lake_files) == 0

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "truncate_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    @staticmethod
    def _get_table_ddl_lake(db_table, location):
        columns = ", ".join(
            ["name1 varchar(21)", "name2 varchar(6)", "name3 varchar(4)"])

        return "\n".join([
            "CREATE EXTERNAL TABLE {}({})".format(db_table, columns),
            "PARTITIONED BY (year smallint, month smallint)",
            "STORED AS PARQUET", "LOCATION \'{}\'".format(location),
            "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"
        ])
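The assertions above read the generated HQL back with get_object_content_from_s3(), a helper from the shared test base. A minimal boto3-based equivalent (a sketch, assuming the helper takes a full s3://bucket/key URI as its usage suggests) could look like this:

from urllib.parse import urlparse

import boto3


def get_object_content_from_s3(s3_uri):
    """Read the body of the S3 object addressed by an s3://bucket/key URI."""
    parsed = urlparse(s3_uri)
    obj = boto3.resource("s3").Object(parsed.netloc, parsed.path.lstrip("/"))
    return obj.get()["Body"].read().decode("utf-8")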
Example #3
class TestDropDatasetS3Integration(EMRSystemUnitTestBase):

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_dataset = "nest_nest_test"

        source_system = "nest"
        short_dataset_name = "nest_test"

        m3d_config_file, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment
        )

        dataset_config_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_dataset
        ]

        dataset_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

        test_content = "sample content"

        landing_dir = "{environment}/{source_system}/{dataset}".format(
            environment=destination_environment,
            source_system=source_system,
            dataset=short_dataset_name
        )

        landing_data_dir = os.path.join(landing_dir, "data")

        landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")

        lake_dir = "{environment}/{source_system}/{dataset}".format(
            environment=destination_environment,
            db_cd=db_lake,
            source_system=source_system,
            dataset=short_dataset_name
        )

        lake_data_dir = os.path.join(lake_dir, "data")
        lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_data_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

        # checking if landing and lake directories contain the uploaded files
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 1
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 1

        logging.info("Calling  M3D.drop_dataset()")
        M3D.drop_dataset(*dataset_config_args, **dataset_config_kwargs)

        # checking if the files were removed
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 0
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 0

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_dataset"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetDataset",
            "Value": "{}.{}".format(db_lake, destination_dataset)
        }]
Example #4
class TestDropOutViewS3Integration(S3TableTestBase):

    default_tconx = "test/resources/test_drop_out_view_s3/tconx-bdp-emr_test-dev-bi_test101.csv"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_check_s3_cleanup(self, add_tags_patch, _):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config_args = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table
        ]

        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        db_lake_out = scon_emr_dict["environments"][destination_environment][
            "schemas"]["lake_out"]

        lake_out = "bi_test101"
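        # The out view lives in the lake_out schema and, in this test, carries the
        # same name as the destination table.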

        logging.info("Calling  M3D.drop_out_view()")
        M3D.drop_out_view(*table_config_args, **table_config_kwargs)

        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        actual_hql_content_in_bucket = self.get_object_content_from_s3(
            hive_step.args[3])
        expected_hql = "DROP VIEW IF EXISTS {}.{};".format(
            db_lake_out, lake_out)
        assert expected_hql == actual_hql_content_in_bucket

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_out_view"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetView",
            "Value": "dev_lake_out.bi_test101"
        }]
Example #5
class TestDropTableS3Integration(S3TableTestBase):

    default_tconx = "test/resources/test_drop_table_s3/tconx-bdp-emr_prod-dev-bi_test101.json"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        logging.info("Starting s3 Checkup cleanup")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

        test_content = "sample content"
        test_lake_key_filename = "test_lake_key"
        test_land_key_filename = "test_land_key"

        source_system = "bi"
        table = "test101"

        test_land_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name=test_land_key_filename
        )

        test_lake_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name=test_lake_key_filename
        )

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=test_land_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=test_lake_key, Body=test_content)

        logging.info("Calling  M3D.create_table()")
        M3D.create_table(*table_config_args, **table_config_kwargs)

        logging.info("Calling  M3D.drop_table()")
        M3D.drop_table(*table_config_args, **table_config_kwargs)

        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert 3 == len(fake_cluster.steps)

        # Get actual HQL statements
        actual_hqls = []

        for step in fake_cluster.steps:
            assert ["hive", "--silent", "-f"] == step.args[0:3]

            hql_file = step.args[3]
            hql_content = self.get_object_content_from_s3(hql_file)
            actual_hqls.append(hql_content)

        expected_hqls = [
            'DROP TABLE {}.{}{};'.format(db_landing, destination_table, m3d_config_dict["tags"]["table_suffix_stage"]),
            'DROP TABLE {}.{};'.format(db_lake, destination_table)
        ]
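        # Step 0 is the CREATE TABLE HQL issued by create_table() above, so only the
        # two DROP statements are compared. drop_table does not touch the S3 objects
        # themselves, which is why both uploaded dumps are still expected below
        # (truncate_table, in contrast, empties the buckets).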

        assert expected_hqls == actual_hqls[1:3]

        # checking landing directory
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 1
        assert landing_files[0] == test_land_key

        # checking lake directory
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 1
        assert lake_files[0] == test_lake_key

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 4
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]
        assert add_tags_patch_call_args_list[2][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_table"
        }]
        assert add_tags_patch_call_args_list[3][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]
Example #6
class TestCreateTableS3(S3TableTestBase):

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql(self, add_tags_patch, _):
        logging.info("Starting TestCreateTableS3.test_check_hql()")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, _, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        logging.info("Calling  M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                      "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                      "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                      "name3 varchar(4))\n" \
                      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                      "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                      "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
                   "name3 varchar(4))\n" \
                   "PARTITIONED BY (year smallint, month smallint)\n" \
                   "STORED AS PARQUET\n" \
                   "LOCATION 's3://m3d-dev-lake/dev/bi/test101/data/'\n" \
                   "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        # Get content of hql in s3 bucket
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
        expected_hql = \
            ddl_landing + "\n" + \
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
            ddl_lake + "\n" + \
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

        logging.info("Expected: {0}\n".format(expected_hql))
        logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql_with_custom_location(self, add_tags_patch, _):
        logging.info("Starting TestCreateTableS3.test_check_hql_with_custom_location()")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"
        destination_table_location_prefix = "data_20200101100015123"
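        # With a location prefix the lake DDL below points the table at
        # .../test101/data_20200101100015123/ instead of the default .../test101/data/.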

        m3d_config_file, _, _, _, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table,
            destination_table_location_prefix
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        logging.info("Calling  M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                      "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                      "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                      "name3 varchar(4))\n" \
                      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                      "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                      "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
                   "name3 varchar(4))\n" \
                   "PARTITIONED BY (year smallint, month smallint)\n" \
                   "STORED AS PARQUET\n" \
                   "LOCATION 's3://m3d-dev-lake/dev/bi/test101/{}/'\n" \
                   "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");".format(destination_table_location_prefix)

        # Get content of hql in s3 bucket
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
        expected_hql = \
            ddl_landing + "\n" + \
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
            ddl_lake + "\n" + \
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

        logging.info("Expected: {0}\n".format(expected_hql))
        logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]
Example #7
class TestLoadTableFullS3(S3TableTestBase):
    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_full_load_emr(self, _0, _1):

        tconx_src_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

        table_config = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table, load_type,
            self.emr_cluster_id, spark_external_parameters
        ]
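        # Positional arguments for M3D.load_table(): config file, destination system,
        # database, environment, table, load type, EMR cluster id and the Spark
        # parameter JSON (passed as a keyword argument in the next test instead).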

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]
        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_dataset),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config)

        # Since data move operations are offloaded to EMR steps (which never run here),
        # dir_landing_final will still contain the old files and dir_landing_archive
        # will not contain any new ones.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_dataset)

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    def test_full_load_emr_external_spark_parameters(self, _0):

        tconx_src_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        acon_src_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )
        AconHelper.setup_acon_from_file(m3d_config_dict["tags"]["config"],
                                        destination_database,
                                        destination_environment,
                                        destination_table, acon_src_path)

        py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

        table_config = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table, load_type,
            self.emr_cluster_id
        ]

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_dataset),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config,
                       spark_params=json.dumps(spark_external_parameters))

        # The psv file will still be in landing since the move operation would be
        # performed by an EMR step, which is mocked here. Accordingly, the archive
        # will still be empty.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_dataset)

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        expected_spark_conf_options = {
            "{}={}".format(key, value)
            for key, value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {spark_step.args[i] for i in (6, 8, 10)}
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"
Example #8
class TestLoadTableDeltaS3(S3TableTestBase):
    def env_setup(self,
                  tmpdir,
                  destination_system,
                  destination_database,
                  destination_environment,
                  destination_table,
                  tconx_content=None,
                  tconx_cl_content=None):
        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(
                TestLoadTableDeltaS3,
                self
            ).env_setup(
                tmpdir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        tconx_filename_template = "tconx-{source_system}-{db_cd}-{environment}-{table}.json"

        tconx_cl_filename = tconx_filename_template.format(
            source_system=destination_system,
            db_cd=destination_database,
            environment=destination_environment,
            table=destination_table + "_cl")

        tconx_cl_file = os.path.join(os.path.dirname(tconx_file),
                                     tconx_cl_filename)

        if tconx_content:
            py.path.local(tconx_file).write(tconx_content)

        if tconx_cl_content:
            py.path.local(tconx_cl_file).write(tconx_cl_content)

        return m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, \
            m3d_config_dict, scon_emr_dict

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        # responses.add_passthru(self.default_server_url)

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_active_table,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        # Put lake data for the changelog table; this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final,
                         "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(m3d_config_file,
                       destination_system,
                       destination_database,
                       destination_environment,
                       destination_active_table,
                       load_type,
                       self.emr_cluster_id,
                       spark_params=spark_external_parameters)

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1

        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(
            delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters[
            "active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters[
            "active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters[
            "delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == [
            "m3d_timestamp", "datapakid", "partno", "record"
        ]
        assert load_table_parameters[
            "business_key"] == s3_table_active.business_key

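        # partitioned_by either names one of the predefined partitioning schemes,
        # which Util.get_target_partitions_list expands into a column list, or is
        # already an explicit list of partition columns.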
        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(
                s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters[
            "partition_column"] == s3_table_active.partition_column
        assert load_table_parameters[
            "partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "DeltaLoad"},
            {"Key": "TargetTable", "Value": "bi_test101"},
        ]
        actual_tags = add_tags_patch_call_args_list[0][0][0]
        assert sorted(actual_tags, key=lambda x: x["Key"]) == \
            sorted(expected_tags, key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta_external_spark_parameters(
            self, remove_json_patch, _0, _1):
        # responses.add_passthru(self.default_server_url)

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_active_table,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        # Put lake data for the changelog table; this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final,
                         "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(m3d_config_file,
                       destination_system,
                       destination_database,
                       destination_environment,
                       destination_active_table,
                       load_type,
                       self.emr_cluster_id,
                       spark_params=json.dumps(spark_external_parameters))

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1

        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(
            delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters[
            "active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters[
            "active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters[
            "delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == [
            "m3d_timestamp", "datapakid", "partno", "record"
        ]
        assert load_table_parameters[
            "business_key"] == s3_table_active.business_key

        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(
                s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters[
            "partition_column"] == s3_table_active.partition_column
        assert load_table_parameters[
            "partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert len(fake_cluster.steps) == 1

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[5] == "--conf"
        assert delta_load_step.args[7] == "--conf"
        assert delta_load_step.args[9] == "--conf"
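        # The values at positions 6, 8 and 10 are compared as a set below,
        # since spark-submit may receive the three --conf key=value pairs in
        # any order.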

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]),
                spark_external_parameters.items()))
        actual_spark_conf_options = set(
            map(lambda x: delta_load_step.args[x], [6, 8, 10]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert delta_load_step.args[
            -5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]


class TestLoadTableAppendS3(S3TableTestBase):
    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch, _0,
                               _1):

        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
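        # Each pattern above is expected to extract one partition value (year,
        # month, day) from an 8-digit YYYYMMDD date embedded in the landing
        # file names.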
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''
        null_value = "test_null_value"
        quote_character = "test_quote"
        compute_table_statistics = True
        verify_schema = False
        data_type = DataType.STRUCTURED
        reader_mode = "DROPMALFORMED"
        metadata_update_strategy = "SparkRecoverPartitionsCustom"

        source_system = AppendLoadConfig.destination_table.split("_", 1)[0]
        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=source_system,
            table=table)
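        # The expected lake target directory is derived from the destination
        # table name: the prefix before the first "_" is treated as the source
        # system, the remainder as the table name.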

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            null_value=null_value,
            quote_character=quote_character,
            metadata_update_strategy=metadata_update_strategy,
            compute_table_statistics=compute_table_statistics,
            verify_schema=verify_schema,
            data_type=data_type,
            reader_mode=reader_mode)
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == config.load_type
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        app_json_files = list(
            filter(
                lambda app_file: os.path.basename(app_file).endswith(".json"),
                app_files))
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake,
                                                  config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "compute_table_statistics": True,
            "data_type": DataType.STRUCTURED,
            "verify_schema": False,
            "metadata_update_strategy": "SparkRecoverPartitionsCustom",
            "target_dir": test_target_dir,
            "reader_mode": "DROPMALFORMED"
        }
        assert actual_parameters == expected_parameters

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
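        # Sort both tag lists by key so the comparison does not depend on the
        # order in which the tags were added.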
        assert sorted(add_tags_patch_call_args_list[0][0][0],
                      key=lambda x: x["Key"]) == sorted(
                          [{
                              "Key": "ApiMethod",
                              "Value": "load_table"
                          }, {
                              "Key": "LoadType",
                              "Value": "AppendLoad"
                          }, {
                              "Key": "TargetTable",
                              "Value": config.destination_table
                          }],
                          key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''
        source_system = AppendLoadConfig.destination_table.split("_", 1)[0]
        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=source_system,
            table=table)

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            file_format="parquet",
            metadata_update_strategy="SparkRecoverPartitionsNative")
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == config.load_type
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        app_json_files = list(
            filter(
                lambda app_file: os.path.basename(app_file).endswith(".json"),
                app_files))
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake,
                                                  config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "metadata_update_strategy": "SparkRecoverPartitionsNative",
            "file_format": "parquet",
            "target_dir": test_target_dir
        }
        assert actual_parameters == expected_parameters

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):

        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  target_partitions, regex_filename)
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id,
                          json.dumps(spark_external_parameters))

        # Check EMR step.
        assert len(fake_cluster.steps) == 1

        spark_step = fake_cluster.steps[0]

        # Check args of EMR step
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]),
                spark_external_parameters.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == "AppendLoad"
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):

        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        config = AppendLoadConfig(
            self.local_run_dir, self.env_setup, ["year", "month"], [
                "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
                "(?<=[0-9]{6})([0-9]{2})"
            ])
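        # Two target partitions but three filename regexes: the mismatch should
        # make load_table raise an M3DIllegalArgumentException below.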
        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        assert str(ex.value).startswith(
            "Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):

        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  ["year", "month", "day"], [])
        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        assert str(ex.value).startswith(
            "Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_valid_parameters_semistructured_data(
            self, _0, _1, _2, _3):

        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=AppendLoadConfig.source_system,
            table=table)
        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        null_value = "test_null_value"
        quote_character = "test_quote"
        data_type = DataType.SEMISTRUCTURED
        verify_schema = True
        schema = {
            "type":
            "struct",
            "fields": [{
                "name": "first_name",
                "type": "string",
                "nullable": True,
                "metadata": {}
            }, {
                "name": "surname",
                "type": "string",
                "nullable": True,
                "metadata": {}
            }, {
                "name": "age",
                "type": "integer",
                "nullable": True,
                "metadata": {}
            }]
        }
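        # A minimal Spark StructType-style JSON schema with three nullable
        # fields, used to exercise schema verification for semistructured data.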

        config = AppendLoadConfig(self.local_run_dir,
                                  self.env_setup,
                                  target_partitions,
                                  regex_filename,
                                  null_value=null_value,
                                  quote_character=quote_character,
                                  schema=schema,
                                  verify_schema=verify_schema,
                                  data_type=data_type)

        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id,
                          json.dumps(spark_external_parameters))

        # Check EMR step.
        assert len(fake_cluster.steps) == 1

        spark_step = fake_cluster.steps[0]

        # Check args of EMR step
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]),
                spark_external_parameters.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == "AppendLoad"
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        app_json_files = list(
            filter(
                lambda app_file: os.path.basename(app_file).endswith(".json"),
                app_files))
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        expected_parameters = {
            "target_table": "test101",
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "data_type": DataType.SEMISTRUCTURED,
            "verify_schema": True,
            "target_dir": test_target_dir,
            "schema": schema
        }
        assert actual_parameters == expected_parameters