Python FakeStep примеры использования

Язык программирования: Python

Пространство имен/Пакет: moto.emr.models

Класс/Тип: FakeStep

Примеров на hotexamples.com: 9

Python FakeStep - 9 примеров найдено. Это лучшие примеры Python кода для moto.emr.models.FakeStep, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

FakeStep (9)

Основные методы

FakeStep (9)

Пример #1

Показать файл

Файл: test_load_table_append_s3.py Проект: sbakiu/m3d-api

class TestLoadTableAppendS3(S3TableTestBase):
    """Tests for ``AppendLoadConfig.load_table`` on the EMR/S3 test setup.

    Every test patches ``Util.send_email`` and replaces
    ``ElasticMapReduceBackend.describe_step`` with a mock that returns
    ``FakeStep("COMPLETED")``. Mock arguments are injected bottom-up with
    respect to the ``@patch`` stack.
    """

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch, _0,
                               _1):
        """Append load with explicit null value, quote character and statistics.

        Verifies the single EMR step, the JSON parameter file stored on S3,
        the cluster tags, and the single call to the patched
        ``_remove_parameter_json``.
        """
        def tag_key(tag):
            return tag["Key"]

        partition_columns = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            partition_columns,
            regex_filename,
            null_value="test_null_value",
            quote_character="test_quote",
            compute_table_statistics=True)
        emr_backend = self.mock_emr.backends[self.default_aws_region]
        cluster = emr_backend.clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Exactly one EMR step must have been submitted.
        assert len(cluster.steps) == 1
        step = cluster.steps[0]

        # The step is a spark-submit whose last five args name the algorithm.
        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"
        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == config.load_type
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

        # Only one JSON file (the Spark config) may sit in the apps directory.
        stored_objects = self.get_child_objects(
            config.s3_table.dir_apps_append_load)
        stored_json = [
            path for path in stored_objects
            if os.path.basename(path).endswith(".json")
        ]
        assert len(stored_json) == 1
        assert stored_json[0] == config.config_filepath

        # Compare the stored Spark config with the expected parameter set.
        stored_config = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        target_table = "{}.{}".format(config.db_name_lake,
                                      config.destination_table)
        assert stored_config == {
            "target_table": target_table,
            "source_dir": config.s3_table.dir_landing_final,
            "header_dir": config.s3_table.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "partition_columns": partition_columns,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "compute_table_statistics": True
        }

        # Tags are compared order-independently by sorting on "Key".
        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 1
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "AppendLoad"},
            {"Key": "TargetTable", "Value": target_table},
        ]
        assert sorted(tag_calls[0][0][0], key=tag_key) == sorted(
            expected_tags, key=tag_key)

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == stored_objects[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        """Append load with ``"parquet"`` passed as the fifth config argument.

        Verifies the single EMR step, the JSON parameter file stored on S3,
        and the single call to the patched ``_remove_parameter_json``.
        """
        partition_columns = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, regex_filename, "parquet")
        emr_backend = self.mock_emr.backends[self.default_aws_region]
        cluster = emr_backend.clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Exactly one EMR step must have been submitted.
        assert len(cluster.steps) == 1
        step = cluster.steps[0]

        # The step is a spark-submit whose last five args name the algorithm.
        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"
        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == config.load_type
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

        # Only one JSON file (the Spark config) may sit in the apps directory.
        stored_objects = self.get_child_objects(
            config.s3_table.dir_apps_append_load)
        stored_json = [
            path for path in stored_objects
            if os.path.basename(path).endswith(".json")
        ]
        assert len(stored_json) == 1
        assert stored_json[0] == config.config_filepath

        # Compare the stored Spark config with the expected parameter set.
        stored_config = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        target_table = "{}.{}".format(config.db_name_lake,
                                      config.destination_table)
        assert stored_config == {
            "target_table": target_table,
            "source_dir": config.s3_table.dir_landing_final,
            "header_dir": config.s3_table.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "partition_columns": partition_columns,
            "regex_filename": regex_filename,
            "file_format": "parquet"
        }

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == stored_objects[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):
        """External Spark parameters must show up as ``--conf key=value`` args."""
        partition_columns = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, regex_filename)
        emr_backend = self.mock_emr.backends[self.default_aws_region]
        cluster = emr_backend.clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id,
                          json.dumps(spark_external_parameters))

        # Exactly one EMR step must have been submitted.
        assert len(cluster.steps) == 1
        step = cluster.steps[0]

        # "--conf" flags sit at positions 5, 7 and 9 ...
        assert step.args[0] == "spark-submit"
        for flag_position in (5, 7, 9):
            assert step.args[flag_position] == "--conf"

        # ... and their values at 6, 8 and 10, in any order.
        expected_options = {
            "{}={}".format(name, value)
            for name, value in spark_external_parameters.items()
        }
        actual_options = {step.args[position] for position in (6, 8, 10)}
        assert expected_options == actual_options

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == "AppendLoad"
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):
        """Two partition columns with three filename regexes must be rejected."""
        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        partition_columns = ["year", "month"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, regex_filename)

        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        error_text = str(ex.value)
        assert error_text.startswith(
            "Lengths of partition_columns and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):
        """Three partition columns with no filename regexes must be rejected."""
        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        partition_columns = ["year", "month", "day"]
        config = AppendLoadConfig(self.local_run_dir, self.env_setup,
                                  partition_columns, [])

        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        error_text = str(ex.value)
        assert error_text.startswith(
            "Lengths of partition_columns and regex_filename do not match")

Пример #2

Показать файл

Файл: test_truncate_table_s3.py Проект: sbakiu/m3d-api

class TestTruncateTableS3Integration(S3TableTestBase):
    """Test of ``M3D.truncate_table`` using the ``self.mock_emr`` backend and S3."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Truncate a table and check HQL steps, emptied buckets and tags.

        Objects are first placed under the landing ``data``/``archive``/``work``
        prefixes and the lake ``data`` prefix. After ``M3D.truncate_table()``
        the test expects two hive steps with the expected HQL, both buckets
        empty, and two tagging calls.

        :param add_tags_patch: mock replacing ``_do_add_emr_cluster_tags``
        :param _: mock replacing ``describe_step`` (unused; the name is
            rebound by the tuple unpacking below)
        """
        cluster_mode = False
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        source_system = "bi"
        table = "test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir, destination_system, destination_database,
            destination_environment, destination_table)

        table_config_args = [
            m3d_config_file, cluster_mode, destination_system,
            destination_database, destination_environment, destination_table
        ]

        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        db_landing = scon_emr_dict["environments"][destination_environment][
            "schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment][
            "schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment][
            "s3_buckets"]["lake"]

        test_content = "sample content"

        landing_dir = "{environment}/{source_system}/{table}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table)

        landing_data_dir = os.path.join(landing_dir, "data")
        landing_archive_dir = os.path.join(landing_dir, "archive")
        landing_work_dir = os.path.join(landing_dir, "work")

        landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")
        landing_archive_key = os.path.join(landing_archive_dir, "old_dump.gz")
        landing_work_key = os.path.join(landing_work_dir, "temporary_data")

        # The template has no database placeholder, so only the three fields
        # it actually references are passed.
        lake_dir = "{environment}/{source_system}/{table}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table)

        lake_data_dir = os.path.join(lake_dir, "data")
        lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_data_key, Body=test_content)
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_archive_key, Body=test_content)
        self.s3_resource.Bucket(bucket_landing).put_object(
            Key=landing_work_key, Body=test_content)

        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key,
                                                        Body=test_content)
        logging.info("Calling  M3D.truncate_table()")
        M3D.truncate_table(*table_config_args, **table_config_kwargs)

        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert len(fake_cluster.steps) == 2

        # Get actual HQL statements: each step is "hive --silent -f <file>",
        # and the referenced file is read back from S3.
        actual_hqls = []

        for step in fake_cluster.steps:
            assert ["hive", "--silent", "-f"] == step.args[0:3]

            hql_file = step.args[3]
            hql_content = self.get_object_content_from_s3(hql_file)
            actual_hqls.append(hql_content)

        db_table_landing = "{}.{}{}".format(
            db_landing, destination_table,
            m3d_config_dict["tags"]["table_suffix_stage"])
        # The trailing "" makes os.path.join end the location with a separator.
        landing_table_location = os.path.join("s3://", bucket_landing,
                                              landing_data_dir, "")

        db_table_lake = "{}.{}".format(db_lake, destination_table)
        lake_table_location = os.path.join("s3://", bucket_lake, lake_data_dir,
                                           "")

        landing_hql = "ALTER TABLE {} SET LOCATION \"{}\";".format(
            db_table_landing, landing_table_location)
        lake_hql = "\n".join([
            "DROP TABLE {};".format(db_table_lake),
            TestTruncateTableS3Integration._get_table_ddl_lake(
                db_table_lake, lake_table_location),
            "MSCK REPAIR TABLE {};".format(db_table_lake)
        ])

        expected_hqls = [landing_hql, lake_hql]

        assert actual_hqls == expected_hqls

        # checking landing directory
        landing_files = [
            k.key
            for k in self.s3_resource.Bucket(bucket_landing).objects.all()
        ]
        assert len(landing_files) == 0

        # checking lake directory
        lake_files = [
            k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()
        ]
        assert len(lake_files) == 0

        # The first tagging call carries the API method, the second the table.
        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key":
            "ApiMethod",
            "Value":
            "truncate_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key":
            "TargetTable",
            "Value":
            "dev_lake.bi_test101"
        }]

    @staticmethod
    def _get_table_ddl_lake(db_table, location):
        """Build the expected ``CREATE EXTERNAL TABLE`` DDL for the lake table.

        :param db_table: table name used after ``CREATE EXTERNAL TABLE``
        :param location: value placed inside the single-quoted ``LOCATION``
        :return: the DDL statement lines joined with newlines
        """
        columns = ", ".join(
            ["name1 varchar(21)", "name2 varchar(6)", "name3 varchar(4)"])

        return "\n".join([
            "CREATE EXTERNAL TABLE {}({})".format(db_table, columns),
            "PARTITIONED BY (year smallint, month smallint)",
            "STORED AS PARQUET",
            "LOCATION '{}'".format(location),
            "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"
        ])

Пример #3

Показать файл

Файл: test_drop_dataset_s3.py Проект: karok2m/m3d-api

class TestDropDatasetS3Integration(EMRSystemUnitTestBase):
    """Test of ``M3D.drop_dataset`` using the EMR/S3 test setup."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Drop a dataset and check that both buckets are emptied and tagged.

        One object is uploaded to the landing bucket and one to the lake
        bucket. After ``M3D.drop_dataset()`` both buckets must be empty and
        two tagging calls must have been made.

        :param add_tags_patch: mock replacing ``_do_add_emr_cluster_tags``
        :param _: mock replacing ``describe_step`` (unused; the name is
            rebound by the tuple unpacking below)
        """
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_dataset = "nest_nest_test"

        source_system = "nest"
        short_dataset_name = "nest_test"

        # Only the config file path and the scon dict are used by this test.
        m3d_config_file, _, _, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment
        )

        dataset_config_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_dataset
        ]

        dataset_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

        test_content = "sample content"

        landing_dir = "{environment}/{source_system}/{dataset}".format(
            environment=destination_environment,
            source_system=source_system,
            dataset=short_dataset_name
        )

        landing_data_dir = os.path.join(landing_dir, "data")

        landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")

        # The template has no database placeholder, so only the three fields
        # it actually references are passed.
        lake_dir = "{environment}/{source_system}/{dataset}".format(
            environment=destination_environment,
            source_system=source_system,
            dataset=short_dataset_name
        )

        lake_data_dir = os.path.join(lake_dir, "data")
        lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_data_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

        # checking if landing and lake directories contain the uploaded files
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 1
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 1

        logging.info("Calling  M3D.drop_dataset()")
        M3D.drop_dataset(*dataset_config_args, **dataset_config_kwargs)

        # checking if the files were removed
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 0
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 0

        # The first tagging call carries the API method, the second the dataset.
        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_dataset"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetDataset",
            "Value": "{}.{}".format(db_lake, destination_dataset)
        }]

Пример #4

Показать файл

Файл: test_drop_out_view_s3.py Проект: visionarylab/m3d-api

class TestDropOutViewS3Integration(S3TableTestBase):
    """Test of ``M3D.drop_out_view`` using the EMR/S3 test setup."""

    default_tconx = "test/resources/test_drop_out_view_s3/tconx-bdp-emr_test-dev-bi_test101.csv"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Drop the out-view and check the single hive step, its HQL and tags."""
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        # Only the config file path and the scon dict are needed here.
        m3d_config_file, _, _, _, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

        environment_config = scon_emr_dict["environments"][
            destination_environment]
        db_lake_out = environment_config["schemas"]["lake_out"]
        lake_out = "bi_test101"

        logging.info("Calling  M3D.drop_out_view()")
        M3D.drop_out_view(
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table,
            emr_cluster_id=self.emr_cluster_id
        )

        cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(cluster.steps)
        step = cluster.steps[0]

        # The step must start with "hive --silent -f"; args[3] is the HQL file.
        for position, token in enumerate(("hive", "--silent", "-f")):
            assert step.args[position] == token

        stored_hql = self.get_object_content_from_s3(step.args[3])
        expected_hql = "DROP VIEW IF EXISTS {}.{};".format(
            db_lake_out, lake_out)
        assert expected_hql == stored_hql

        # The first tagging call carries the API method, the second the view.
        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 2

        api_method_tag = {"Key": "ApiMethod", "Value": "drop_out_view"}
        assert tag_calls[0][0][0] == [api_method_tag]

        target_view_tag = {
            "Key": "TargetView",
            "Value": "dev_lake_out.bi_test101"
        }
        assert tag_calls[1][0][0] == [target_view_tag]

Пример #5

Показать файл

Файл: test_drop_table_s3.py Проект: visionarylab/m3d-api

class TestDropTableS3Integration(S3TableTestBase):
    """Test of ``M3D.create_table`` followed by ``M3D.drop_table``."""

    default_tconx = "test/resources/test_drop_table_s3/tconx-bdp-emr_prod-dev-bi_test101.json"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Create then drop a table; the uploaded S3 objects must remain.

        Expects three hive steps, of which the last two are the ``DROP TABLE``
        statements, one untouched object in each bucket, and four tagging
        calls.
        """
        logging.info("Starting s3 Checkup cleanup")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

        call_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]
        call_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        environment_config = scon_emr_dict["environments"][destination_environment]
        db_landing = environment_config["schemas"]["landing"]
        db_lake = environment_config["schemas"]["lake"]
        bucket_landing = environment_config["s3_buckets"]["landing"]
        bucket_lake = environment_config["s3_buckets"]["lake"]

        test_content = "sample content"
        source_system = "bi"
        table = "test101"

        # Both keys share one layout and differ only in the object name.
        key_template = "{environment}/{source_system}/{table}/data/{obj_name}"
        test_land_key = key_template.format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name="test_land_key"
        )
        test_lake_key = key_template.format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name="test_lake_key"
        )

        # Upload one object to the landing bucket and one to the lake bucket.
        self.s3_resource.Bucket(bucket_landing).put_object(Key=test_land_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=test_lake_key, Body=test_content)

        logging.info("Calling  M3D.create_table()")
        M3D.create_table(*call_args, **call_kwargs)

        logging.info("Calling  M3D.drop_table()")
        M3D.drop_table(*call_args, **call_kwargs)

        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 3 == len(cluster.steps)

        # Each step is "hive --silent -f <file>"; read every file back from S3.
        executed_hqls = []
        for emr_step in cluster.steps:
            assert ["hive", "--silent", "-f"] == emr_step.args[0:3]
            executed_hqls.append(self.get_object_content_from_s3(emr_step.args[3]))

        stage_suffix = m3d_config_dict["tags"]["table_suffix_stage"]
        expected_hqls = [
            'DROP TABLE {}.{}{};'.format(db_landing, destination_table, stage_suffix),
            'DROP TABLE {}.{};'.format(db_lake, destination_table)
        ]

        # Steps 1 and 2 are the drops; step 0 is not compared here.
        assert expected_hqls == executed_hqls[1:3]

        # The landing bucket must still hold exactly the uploaded object.
        landing_keys = [obj.key for obj in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_keys) == 1
        assert landing_keys[0] == test_land_key

        # The lake bucket must still hold exactly the uploaded object.
        lake_keys = [obj.key for obj in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_keys) == 1
        assert lake_keys[0] == test_lake_key

        # Tag calls alternate: API method, target table, API method, target table.
        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 4

        target_table_tag = {"Key": "TargetTable", "Value": "dev_lake.bi_test101"}
        assert tag_calls[0][0][0] == [{"Key": "ApiMethod", "Value": "create_table"}]
        assert tag_calls[1][0][0] == [target_table_tag]
        assert tag_calls[2][0][0] == [{"Key": "ApiMethod", "Value": "drop_table"}]
        assert tag_calls[3][0][0] == [target_table_tag]

Пример #6

Показать файл

Файл: test_create_table_s3.py Проект: visionarylab/m3d-api

class TestCreateTableS3(S3TableTestBase):
    """Check the Hive script that ``M3D.create_table()`` registers as an EMR step.

    The two tests differ only in whether a custom table-location prefix is
    passed to ``M3D.create_table()``; the shared scenario lives in
    ``_check_create_table_hql`` so the expected DDL is spelled out once.
    """

    def _check_create_table_hql(self, add_tags_patch, lake_data_dir, extra_table_args=()):
        """Call ``M3D.create_table()`` and verify the submitted step, HQL and tags.

        :param add_tags_patch: mock standing in for
            ``EMRClusterClient._do_add_emr_cluster_tags``; its recorded calls
            are compared with the expected tag lists.
        :param lake_data_dir: directory name expected as the last component of
            the lake table's ``LOCATION`` path.
        :param extra_table_args: positional arguments appended to the
            ``M3D.create_table()`` call after the destination table.
        """
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, _, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]
        table_config.extend(extra_table_args)

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        logging.info("Calling  M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        # create_table is expected to add exactly one step: `hive --silent -f <hql file>`.
        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                      "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                      "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                      "name3 varchar(4))\n" \
                      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                      "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                      "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        # Adjacent literals are joined before .format() is applied, so the
        # single "{}" placeholder in the LOCATION line receives lake_data_dir.
        ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
                   "name3 varchar(4))\n" \
                   "PARTITIONED BY (year smallint, month smallint)\n" \
                   "STORED AS PARQUET\n" \
                   "LOCATION 's3://m3d-dev-lake/dev/bi/test101/{}/'\n" \
                   "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");".format(lake_data_dir)

        # Get content of hql in s3 bucket (the fourth step argument is the script location)
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
        expected_hql = "\n".join([
            ddl_landing,
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table),
            ddl_lake,
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)
        ])

        logging.info("Expected: {0}\n".format(expected_hql))
        logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql

        # Two tagging calls are expected: the API method first, then the target table.
        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql(self, add_tags_patch, _):
        """Default call: the lake table is expected under the ``data`` directory."""
        logging.info("Starting TestCreateTableS3.test_check_hql()")

        self._check_create_table_hql(add_tags_patch, lake_data_dir="data")

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql_with_custom_location(self, add_tags_patch, _):
        """Custom prefix: the prefix replaces ``data`` in the lake ``LOCATION`` only."""
        logging.info("Starting TestCreateTableS3.test_check_hql_with_custom_location()")

        destination_table_location_prefix = "data_20200101100015123"

        self._check_create_table_hql(
            add_tags_patch,
            lake_data_dir=destination_table_location_prefix,
            extra_table_args=(destination_table_location_prefix,)
        )

Пример #7

Показать файл

Файл: test_load_table_full_s3.py Проект: visionarylab/m3d-api

class TestLoadTableFullS3(S3TableTestBase):
    """Full-load scenarios for ``M3D.load_table()`` run against the mocked EMR cluster."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    def test_full_load_emr(self, _0, _1):
        """Load a table with ``FullLoad`` and inspect the single spark-submit step."""
        system, database, environment, table = "bdp", "emr_test", "dev", "bi_test101"

        full_load_type = "FullLoad"
        landing_file_name = "landing-dataset.psv"
        tconx_template_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

        spark_params_json = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        env = super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir, system, database, environment, table)
        m3d_config_file, _scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = env

        # Overwrite the generated tconx file with the prepared one.
        py.path.local(tconx_file).write(py.path.local(tconx_template_path).read())

        load_table_args = [
            m3d_config_file, system, database, environment, table,
            full_load_type, self.emr_cluster_id, spark_params_json
        ]

        # Extract bucket names
        environment_conf = scon_emr_dict["environments"][environment]
        bucket_application = environment_conf["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        test_s3_table = S3Table(emr_system, table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_file_name),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*load_table_args)

        # Data movement is delegated to EMR steps, which are mocked here, so
        # the landing file stays in place and the archive stays empty.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_file_name)

        archived_files = self.get_child_objects(test_s3_table.dir_landing_archive)
        assert len(archived_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        jar_relative_path = os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])
        expected_algorithms_jar_path = "s3://" + bucket_application + jar_relative_path

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
                return_value=FakeStep("COMPLETED"))
    def test_full_load_emr_external_spark_parameters(self, _0):
        """Pass Spark settings via ``spark_params`` and expect them as ``--conf`` options."""
        system, database, environment, table = "bdp", "emr_test", "dev", "bi_test101"

        tconx_template_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        acon_template_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        full_load_type = "FullLoad"
        landing_file_name = "landing-dataset.psv"

        env = super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir, system, database, environment, table)
        m3d_config_file, _scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = env

        AconHelper.setup_acon_from_file(
            m3d_config_dict["tags"]["config"],
            database,
            environment,
            table,
            acon_template_path)

        # Overwrite the generated tconx file with the prepared one.
        py.path.local(tconx_file).write(py.path.local(tconx_template_path).read())

        load_table_args = [
            m3d_config_file, system, database, environment, table,
            full_load_type, self.emr_cluster_id
        ]

        # Extract bucket names
        environment_conf = scon_emr_dict["environments"][environment]
        bucket_application = environment_conf["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        test_s3_table = S3Table(emr_system, table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_file_name),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*load_table_args,
                       spark_params=json.dumps(spark_external_parameters))

        # The psv file is still in landing because the move is performed by
        # an EMR step, which is mocked here; the archive is still empty too.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_file_name)

        archived_files = self.get_child_objects(test_s3_table.dir_landing_archive)
        assert len(archived_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        jar_relative_path = os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])
        expected_algorithms_jar_path = "s3://" + bucket_application + jar_relative_path

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        # The three external settings occupy positions 5..10 as
        # alternating "--conf" flags and "key=value" payloads.
        for flag_position in (5, 7, 9):
            assert spark_step.args[flag_position] == "--conf"

        expected_spark_conf_options = {
            "{}={}".format(name, value)
            for name, value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {
            spark_step.args[value_position] for value_position in (6, 8, 10)
        }
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"

Пример #8

Показать файл

Файл: test_load_table_delta_s3.py Проект: karok2m/m3d-api

class TestLoadTableDeltaS3(S3TableTestBase):
    """Delta-load scenarios for ``M3D.load_table()`` run against the mocked EMR cluster.

    Both tests share the same setup, parameter-file checks and spark-submit
    step checks; these live in private helpers so each test only spells out
    what is specific to it (how Spark parameters are passed and which
    additional assertions apply).
    """

    # Fixed coordinates of the table under test, shared by the helpers below.
    _DESTINATION_SYSTEM = "bdp"
    _DESTINATION_DATABASE = "emr_test"
    _DESTINATION_ENVIRONMENT = "dev"
    _ACTIVE_TABLE = "bi_test101"
    _CHANGELOG_TABLE = "bi_test101_cl"
    _LOAD_TYPE = "DeltaLoad"

    def env_setup(self,
                  tmpdir,
                  destination_system,
                  destination_database,
                  destination_environment,
                  destination_table,
                  tconx_content=None,
                  tconx_cl_content=None):
        """Extend the base ``env_setup`` with a tconx file for the changelog table.

        The changelog tconx path is derived from the directory of the active
        table's tconx file and the table name suffixed with ``_cl``. Either
        file is written only when the corresponding content is truthy.

        :return: ``(m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file,
            m3d_config_dict, scon_emr_dict)``
        """
        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(
                TestLoadTableDeltaS3,
                self
            ).env_setup(
                tmpdir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        tconx_filename_template = "tconx-{source_system}-{db_cd}-{environment}-{table}.json"

        tconx_cl_filename = tconx_filename_template.format(
            source_system=destination_system,
            db_cd=destination_database,
            environment=destination_environment,
            table=destination_table + "_cl")

        tconx_cl_file = os.path.join(os.path.dirname(tconx_file),
                                     tconx_cl_filename)

        if tconx_content:
            py.path.local(tconx_file).write(tconx_content)

        if tconx_cl_content:
            py.path.local(tconx_cl_file).write(tconx_cl_content)

        return m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, \
            m3d_config_dict, scon_emr_dict

    def _run_delta_load(self, spark_params):
        """Prepare configs and changelog data, then call ``M3D.load_table()``.

        :param spark_params: value forwarded as the ``spark_params`` keyword
            of ``M3D.load_table()``.
        :return: ``(s3_table_active, m3d_config_dict, scon_emr_dict)``
        """
        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, _, _, _, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                self._DESTINATION_SYSTEM,
                self._DESTINATION_DATABASE,
                self._DESTINATION_ENVIRONMENT,
                self._ACTIVE_TABLE,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(m3d_config_file, self._DESTINATION_SYSTEM,
                               self._DESTINATION_DATABASE, self._DESTINATION_ENVIRONMENT)
        s3_table_active = S3Table(emr_system, self._ACTIVE_TABLE)
        s3_table_changelog = S3Table(emr_system, self._CHANGELOG_TABLE)

        # Put lake data for changelog table, this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final,
                         "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(m3d_config_file,
                       self._DESTINATION_SYSTEM,
                       self._DESTINATION_DATABASE,
                       self._DESTINATION_ENVIRONMENT,
                       self._ACTIVE_TABLE,
                       self._LOAD_TYPE,
                       self.emr_cluster_id,
                       spark_params=spark_params)

        return s3_table_active, m3d_config_dict, scon_emr_dict

    def _check_delta_load_config(self, s3_table_active):
        """Assert on the delta-load parameter file written for m3d-engine.

        :return: S3 location of the single parameter file that was found.
        """
        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=self._DESTINATION_ENVIRONMENT,
            table=self._ACTIVE_TABLE)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1
        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        load_table_parameters = json.loads(
            self.get_object_content_from_s3(delta_load_config_s3))

        assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == [
            "m3d_timestamp", "datapakid", "partno", "record"
        ]
        assert load_table_parameters["business_key"] == s3_table_active.business_key

        # Predefined partition names are expanded via Util; anything else is
        # expected to appear in the parameter file unchanged.
        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(
                s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters["partition_column"] == s3_table_active.partition_column
        assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

        return delta_load_config_s3

    def _check_delta_load_step(self, delta_load_config_s3, m3d_config_dict, scon_emr_dict):
        """Assert on the single spark-submit step added to the fake cluster.

        :return: the step object, so callers can make further assertions on it.
        """
        environment_conf = scon_emr_dict["environments"][self._DESTINATION_ENVIRONMENT]
        bucket_application = environment_conf["s3_buckets"]["application"]

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            environment_conf["s3_deployment_dir_base"],
            self._DESTINATION_ENVIRONMENT,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        return delta_load_step

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        """Delta load with Spark parameters given as a raw JSON string; also checks cluster tags."""
        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        s3_table_active, m3d_config_dict, scon_emr_dict = \
            self._run_delta_load(spark_external_parameters)

        delta_load_config_s3 = self._check_delta_load_config(s3_table_active)
        self._check_delta_load_step(delta_load_config_s3, m3d_config_dict, scon_emr_dict)

        # A single tagging call is expected; tag order inside it is not
        # significant, so both sides are sorted by key before comparing.
        expected_tags = [{
            "Key": "ApiMethod",
            "Value": "load_table"
        }, {
            "Key": "LoadType",
            "Value": "DeltaLoad"
        }, {
            "Key": "TargetTable",
            "Value": "bi_test101"
        }]

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == \
            sorted(expected_tags, key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == delta_load_config_s3

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta_external_spark_parameters(
            self, remove_json_patch, _0, _1):
        """Delta load with Spark parameters serialised from a dict; checks the ``--conf`` options."""
        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        s3_table_active, m3d_config_dict, scon_emr_dict = \
            self._run_delta_load(json.dumps(spark_external_parameters))

        delta_load_config_s3 = self._check_delta_load_config(s3_table_active)
        delta_load_step = self._check_delta_load_step(
            delta_load_config_s3, m3d_config_dict, scon_emr_dict)

        # The three external settings occupy positions 5..10 as
        # alternating "--conf" flags and "key=value" payloads.
        assert delta_load_step.args[5] == "--conf"
        assert delta_load_step.args[7] == "--conf"
        assert delta_load_step.args[9] == "--conf"

        expected_spark_conf_options = {
            "{}={}".format(name, value)
            for name, value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {
            delta_load_step.args[position] for position in (6, 8, 10)
        }
        assert expected_spark_conf_options == actual_spark_conf_options

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == delta_load_config_s3

Пример #9

Показать файл

Файл: test_load_table_append_s3.py Проект: visionarylab/m3d-api

class TestLoadTableAppendS3(S3TableTestBase):
    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch,
                               _0, _1):
        """Run an append load with all optional settings supplied.

        Verifies the single spark-submit step recorded on the cluster taken
        from ``self.mock_emr``, the parameter JSON stored under the append-load
        application directory, the tags handed to ``add_tags_patch`` and the
        single call made to ``remove_json_patch``.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})",
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # Split the destination table name on its first underscore into the
        # system part and the table part used by the expected target directory.
        name_parts = AppendLoadConfig.destination_table.split("_", 1)
        expected_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=name_parts[0],
            table=name_parts[-1])

        load_config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            null_value="test_null_value",
            quote_character="test_quote",
            metadata_update_strategy="SparkRecoverPartitionsCustom",
            compute_table_statistics=True,
            verify_schema=False,
            data_type=DataType.STRUCTURED,
            reader_mode="DROPMALFORMED")

        # Look the cluster up first, then trigger the load that adds the step.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        emr_cluster = region_backend.clusters[self.emr_cluster_id]
        load_config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Exactly one EMR step must have been submitted.
        assert len(emr_cluster.steps) == 1

        # Inspect the spark-submit invocation of that step.
        submit_step = emr_cluster.steps[0]

        assert submit_step.jar == "command-runner.jar"
        assert submit_step.args[0] == "spark-submit"
        assert submit_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert submit_step.args[-4] == load_config.expected_algorithms_jar_path
        assert submit_step.args[-3] == load_config.load_type
        assert submit_step.args[-2] == load_config.config_filepath
        assert submit_step.args[-1] == "s3"

        # The application directory must hold exactly one JSON file: the
        # parameter file referenced by the step.
        stored_app_files = self.get_child_objects(
            load_config.dataset.dir_apps_append_load)
        stored_json_files = [
            stored_path for stored_path in stored_app_files
            if os.path.basename(stored_path).endswith(".json")
        ]
        assert len(stored_json_files) == 1
        assert stored_json_files[0] == load_config.config_filepath

        # Compare the stored parameter file with the expected content.
        stored_parameters = json.loads(
            self.get_object_content_from_s3(load_config.config_filepath))
        expected_table_full_name = "{}.{}".format(
            load_config.db_name_lake, load_config.destination_table)
        wanted_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": load_config.dataset.dir_landing_final,
            "header_dir": load_config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "compute_table_statistics": True,
            "data_type": DataType.STRUCTURED,
            "verify_schema": False,
            "metadata_update_strategy": "SparkRecoverPartitionsCustom",
            "target_dir": expected_target_dir,
            "reader_mode": "DROPMALFORMED"
        }
        assert stored_parameters == wanted_parameters

        def tag_key(tag):
            # Sort tags by key so the comparison ignores their order.
            return tag["Key"]

        tagging_calls = add_tags_patch.call_args_list
        assert len(tagging_calls) == 1
        wanted_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "AppendLoad"},
            {"Key": "TargetTable", "Value": load_config.destination_table},
        ]
        assert sorted(tagging_calls[0][0][0], key=tag_key) == sorted(
            wanted_tags, key=tag_key)

        # The clean-up hook must be called once, with the first stored file.
        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == stored_app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        """Run an append load configured with ``file_format="parquet"``.

        Verifies the single spark-submit step on the cluster taken from
        ``self.mock_emr``, the parameter JSON stored under the append-load
        application directory and the single call to ``remove_json_patch``.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})",
        ]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # Split the destination table name on its first underscore into the
        # system part and the table part used by the expected target directory.
        name_parts = AppendLoadConfig.destination_table.split("_", 1)
        expected_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=name_parts[0],
            table=name_parts[-1])

        load_config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            file_format="parquet",
            metadata_update_strategy="SparkRecoverPartitionsNative")

        # Look the cluster up first, then trigger the load that adds the step.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        emr_cluster = region_backend.clusters[self.emr_cluster_id]
        load_config.load_table(self.emr_cluster_id, spark_external_parameters)

        # Exactly one EMR step must have been submitted.
        assert len(emr_cluster.steps) == 1

        # Inspect the spark-submit invocation of that step.
        submit_step = emr_cluster.steps[0]

        assert submit_step.jar == "command-runner.jar"
        assert submit_step.args[0] == "spark-submit"
        assert submit_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert submit_step.args[-4] == load_config.expected_algorithms_jar_path
        assert submit_step.args[-3] == load_config.load_type
        assert submit_step.args[-2] == load_config.config_filepath
        assert submit_step.args[-1] == "s3"

        # The application directory must hold exactly one JSON file: the
        # parameter file referenced by the step.
        stored_app_files = self.get_child_objects(
            load_config.dataset.dir_apps_append_load)
        stored_json_files = [
            stored_path for stored_path in stored_app_files
            if os.path.basename(stored_path).endswith(".json")
        ]
        assert len(stored_json_files) == 1
        assert stored_json_files[0] == load_config.config_filepath

        # Compare the stored parameter file with the expected content.
        stored_parameters = json.loads(
            self.get_object_content_from_s3(load_config.config_filepath))
        expected_table_full_name = "{}.{}".format(
            load_config.db_name_lake, load_config.destination_table)
        wanted_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": load_config.dataset.dir_landing_final,
            "header_dir": load_config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "metadata_update_strategy": "SparkRecoverPartitionsNative",
            "file_format": "parquet",
            "target_dir": expected_target_dir
        }
        assert stored_parameters == wanted_parameters

        # The clean-up hook must be called once, with the first stored file.
        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == stored_app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):
        """Check that external Spark parameters show up as ``--conf`` options.

        The parameters are handed to ``load_table`` as a JSON string and are
        then expected, as ``name=value`` pairs, at fixed positions of the
        spark-submit step arguments.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})",
        ]
        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename)

        # Look the cluster up first, then trigger the load that adds the step.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        emr_cluster = region_backend.clusters[self.emr_cluster_id]
        load_config.load_table(
            self.emr_cluster_id, json.dumps(spark_external_parameters))

        # Exactly one EMR step must have been submitted.
        assert len(emr_cluster.steps) == 1

        submit_step = emr_cluster.steps[0]

        # The three "--conf" flags sit at odd positions 5, 7 and 9 ...
        assert submit_step.args[0] == "spark-submit"
        for flag_position in (5, 7, 9):
            assert submit_step.args[flag_position] == "--conf"

        # ... and their values follow at 6, 8 and 10, in any order.
        wanted_conf_options = {
            "{}={}".format(option_name, option_value)
            for option_name, option_value in spark_external_parameters.items()
        }
        found_conf_options = {
            submit_step.args[value_position] for value_position in (6, 8, 10)
        }
        assert wanted_conf_options == found_conf_options

        # Trailing arguments: main class, jar, load type, config file, storage.
        assert submit_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert submit_step.args[-4] == load_config.expected_algorithms_jar_path
        assert submit_step.args[-3] == "AppendLoad"
        assert submit_step.args[-2] == load_config.config_filepath
        assert submit_step.args[-1] == "s3"

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):
        """Expect a rejection when two partitions meet three filename regexes."""
        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        # Deliberately mismatched: two partition columns, three regexes.
        two_partitions = ["year", "month"]
        three_regexes = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})",
        ]
        load_config = AppendLoadConfig(
            self.local_run_dir, self.env_setup, two_partitions, three_regexes)

        with pytest.raises(M3DIllegalArgumentException) as raised:
            load_config.load_table(
                self.emr_cluster_id, spark_external_parameters)

        error_text = str(raised.value)
        assert error_text.startswith(
            "Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):
        """Expect a rejection when three partitions meet no filename regexes."""
        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        # Deliberately mismatched: three partition columns, empty regex list.
        three_partitions = ["year", "month", "day"]
        no_regexes = []
        load_config = AppendLoadConfig(
            self.local_run_dir, self.env_setup, three_partitions, no_regexes)

        with pytest.raises(M3DIllegalArgumentException) as raised:
            load_config.load_table(
                self.emr_cluster_id, spark_external_parameters)

        error_text = str(raised.value)
        assert error_text.startswith(
            "Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step",
           return_value=FakeStep("COMPLETED"))
    @patch(
        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags"
    )
    @patch(
        "m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_valid_parameters_semistructured_data(
            self, _0, _1, _2, _3):
        """Run an append load for semi-structured data with an explicit schema.

        Verifies the spark-submit step arguments (including the external
        Spark parameters passed as ``--conf`` options), that exactly one JSON
        parameter file is stored under the append-load application directory,
        and that the stored parameters match the expected dictionary.

        The four injected mocks (``_0`` .. ``_3``) are not inspected here.
        """

        # Table part of the destination table name: everything after the
        # first underscore.
        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        # Target directory expected in the stored parameter file.
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=AppendLoadConfig.source_system,
            table=table)
        # Passed to load_table as a JSON string via json.dumps below.
        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        null_value = "test_null_value"
        quote_character = "test_quote"
        data_type = DataType.SEMISTRUCTURED
        verify_schema = True
        # Struct schema with three nullable fields, handed to the config and
        # expected back unchanged in the stored parameters.
        schema = {
            "type":
            "struct",
            "fields": [{
                "name": "first_name",
                "type": "string",
                "nullable": True,
                "metadata": {}
            }, {
                "name": "surname",
                "type": "string",
                "nullable": True,
                "metadata": {}
            }, {
                "name": "age",
                "type": "integer",
                "nullable": True,
                "metadata": {}
            }]
        }

        config = AppendLoadConfig(self.local_run_dir,
                                  self.env_setup,
                                  target_partitions,
                                  regex_filename,
                                  null_value=null_value,
                                  quote_character=quote_character,
                                  schema=schema,
                                  verify_schema=verify_schema,
                                  data_type=data_type)

        # Look the cluster up before the load so its steps can be inspected
        # afterwards.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        config.load_table(self.emr_cluster_id,
                          json.dumps(spark_external_parameters))

        # Check EMR step: exactly one step must have been submitted.
        assert len(fake_cluster.steps) == 1

        spark_step = fake_cluster.steps[0]

        # Check args of EMR step: "--conf" flags at positions 5, 7 and 9.
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        # The "name=value" options at positions 6, 8 and 10 are compared as
        # sets, so their relative order does not matter.
        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]),
                spark_external_parameters.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10]))
        assert expected_spark_conf_options == actual_spark_conf_options

        # Last five arguments of the spark-submit call.
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == "AppendLoad"
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        app_json_files = list(
            filter(
                lambda app_file: os.path.basename(app_file).endswith(".json"),
                app_files))
        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(
            self.get_object_content_from_s3(config.config_filepath))
        # NOTE(review): "target_table" is a hard-coded literal here, while
        # other tests in this class derive it from config.db_name_lake and
        # config.destination_table — confirm this is intended.
        expected_parameters = {
            "target_table": "test101",
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "data_type": DataType.SEMISTRUCTURED,
            "verify_schema": True,
            "target_dir": test_target_dir,
            "schema": schema
        }
        assert actual_parameters == expected_parameters