    def test_check_hql(self, add_tags_patch, _):
        logging.info("Starting TestCreateTableS3.test_check_hql()")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, _, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        logging.info("Calling  M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

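        # Inspect the mocked EMR backend to verify the step that create_table() submitted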
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

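        # Expected DDL for the landing (staging) table: delimited text files on S3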
        ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                      "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                      "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                      "name3 varchar(4))\n" \
                      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                      "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                      "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

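        # Expected DDL for the lake table: Parquet files partitioned by year and month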
        ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
                   "name3 varchar(4))\n" \
                   "PARTITIONED BY (year smallint, month smallint)\n" \
                   "STORED AS PARQUET\n" \
                   "LOCATION 's3://m3d-dev-lake/dev/bi/test101/data/'\n" \
                   "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        # Fetch the content of the generated HQL file from the S3 bucket
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
        expected_hql = \
            ddl_landing + "\n" + \
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
            ddl_lake + "\n" + \
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

        logging.info("Expected: {0}\n".format(expected_hql))
        logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql

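        # create_table() should tag the EMR cluster twice: once with the API method, once with the target table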
        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    def test_check_s3_cleanup(self, add_tags_patch, _):
        logging.info("Starting TestCreateTableS3.test_check_s3_cleanup()")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        table_config_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
        bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

        test_content = "sample content"
        test_lake_key_filename = "test_lake_key"
        test_land_key_filename = "test_land_key"

        source_system = "bi"
        table = "test101"

        test_land_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name=test_land_key_filename
        )

        test_lake_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table,
            obj_name=test_lake_key_filename
        )

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=test_land_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=test_lake_key, Body=test_content)

        logging.info("Calling  M3D.create_table()")
        M3D.create_table(*table_config_args, **table_config_kwargs)

        logging.info("Calling  M3D.drop_table()")
        M3D.drop_table(*table_config_args, **table_config_kwargs)

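        # Three steps in total: one from create_table() and two from drop_table()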
        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert 3 == len(fake_cluster.steps)

        # Get actual HQL statements
        actual_hqls = []

        for step in fake_cluster.steps:
            assert ["hive", "--silent", "-f"] == step.args[0:3]

            hql_file = step.args[3]
            hql_content = self.get_object_content_from_s3(hql_file)
            actual_hqls.append(hql_content)

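        # Only the drop_table() steps are verified here; step 0 is the create_table() HQL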
        expected_hqls = [
            'DROP TABLE {}.{}{};'.format(db_landing, destination_table, m3d_config_dict["tags"]["table_suffix_stage"]),
            'DROP TABLE {}.{};'.format(db_lake, destination_table)
        ]

        assert expected_hqls == actual_hqls[1:3]

        # The landing bucket should contain only the object that was uploaded manually
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 1
        assert landing_files[0] == test_land_key

        # The lake bucket should likewise contain only the manually uploaded object
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 1
        assert lake_files[0] == test_lake_key

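        # Four tagging calls: ApiMethod/TargetTable for create_table(), then again for drop_table()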
        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 4
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]
        assert add_tags_patch_call_args_list[2][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_table"
        }]
        assert add_tags_patch_call_args_list[3][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    def test_check_hql_single_partitioning(self, add_tags_patch, _):
        logging.info(
            "Starting TestCreateTableS3.test_check_hql_single_partitioning()")

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test103"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

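        # Swap in a tconx file that defines a single partition column for the table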
        TconxHelper.setup_tconx_from_file(
            m3d_config_dict["tags"]["config"], destination_system,
            destination_database, destination_environment, destination_table,
            S3TableTestBase.single_partition_tconx)

        table_config = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table
        ]

        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        logging.info("Calling  M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

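        # Retrieve the mocked EMR cluster so the submitted steps can be inspected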
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        executed_steps = fake_cluster.steps

        assert len(executed_steps) == 1

        hive_step = executed_steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment][
            "schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment][
            "schemas"]["lake"]

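        # Expected landing DDL: the staging table keeps all three columns unpartitioned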
        ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                      "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                      "CREATE EXTERNAL TABLE dev_landing.bi_test103_stg1(name1 varchar(21), name2 varchar(6), " \
                      "name3 varchar(4))\n" \
                      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                      "LOCATION 's3://m3d-dev-landing/dev/bi/test103/data/'\n" \
                      "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

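        # Expected lake DDL: name1 is removed from the column list and becomes the single partition column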
        ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test103(name2 varchar(6), name3 varchar(4))\n" \
                   "PARTITIONED BY (name1 varchar(21))\n" \
                   "STORED AS PARQUET\n" \
                   "LOCATION 's3://m3d-dev-lake/dev/bi/test103/data/'\n" \
                   "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

        # Fetch the content of the generated HQL file from the S3 bucket
        actual_hql_content_in_bucket = self.get_object_content_from_s3(
            hive_step.args[3])
        expected_hql = \
            ddl_landing + "\n" + \
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
            ddl_lake + "\n" + \
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

        print("Expected: {0}\n".format(expected_hql))
        print("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql