Example #1
    def test_get_partition_columns_string(self):
        """
        This method tests the correct functionality of Util.get_partition_columns_string
        :return:
        """
        assert Util.get_partition_columns_string("year") == "year"
        assert Util.get_partition_columns_string("month") == "year,month"
        assert Util.get_partition_columns_string("day") == "year,month,day"
        assert Util.get_partition_columns_string("") == ""

        with pytest.raises(Exception) as exc_info:
            Util.get_partition_columns_list("country")
        assert "Partition type country not supported" in str(exc_info.value)
Example #2
    def get_projection_columns(self, src_column_names, destination_column_names):
        """
        Return (source, destination) column pairs for the projection, keeping only
        pairs whose destination name is non-empty and appending the partition
        columns when the table uses one of the defined partition types.
        """
        columns = list(filter(lambda x: x[1], zip(src_column_names, destination_column_names)))
        if self.partitioned_by in Util.defined_partitions:
            # Partition columns are projected unchanged (same source and destination name)
            partition_columns = list(map(lambda x: (x, x), Util.get_partition_columns_list(self.partitioned_by)))
            return columns + partition_columns
        else:
            return columns
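A brief, hypothetical illustration of the projection logic above (the column
names below are invented for illustration only):

# Hypothetical inputs
src_column_names = ["id", "name", "legacy_col"]
destination_column_names = ["id", "full_name", ""]

# zip + filter keeps only pairs whose destination name is non-empty:
#   [("id", "id"), ("name", "full_name")]
# If self.partitioned_by were, say, "month", the defined partition columns would be
# appended as identity mappings: [("year", "year"), ("month", "month")]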
Example #3
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        # responses.add_passthru(self.default_server_url)

        cluster_mode = False
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # Pass the desired content of the tconx files for the active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_active_table,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(
            m3d_config_file,
            cluster_mode,
            destination_system,
            destination_database,
            destination_environment
        )
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract the application bucket name
        bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

        # Put lake data for the changelog table; this data should be archived by the delta load
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(
            m3d_config_file,
            cluster_mode,
            destination_system,
            destination_database,
            destination_environment,
            destination_active_table,
            load_type,
            self.emr_cluster_id,
            spark_params=spark_external_parameters
        )

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table
        )

        # Check the configuration file generated for the m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1

        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert load_table_parameters["business_key"] == s3_table_active.business_key

        if s3_table_active.partitioned_by in Util.defined_partitions:
            partition_columns = Util.get_partition_columns_list(s3_table_active.partitioned_by)
        else:
            partition_columns = s3_table_active.partitioned_by

        assert load_table_parameters["partition_columns"] == partition_columns
        assert load_table_parameters["partition_column"] == s3_table_active.partition_column
        assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
            destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"]
        )

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "DeltaLoad"},
            {"Key": "TargetTable", "Value": "dev_lake.bi_test101"}
        ], key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]
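For reference, a rough, illustrative sketch of the delta-load configuration file
that the assertions above inspect; only keys asserted in the test are shown,
and every value except technical_key is a placeholder or an assumption.

# Illustrative shape of delta_load-dev-bi_test101.json (values are placeholders)
expected_config_shape = {
    "active_records_table_lake": "dev_lake.bi_test101",          # assumed to equal db_table_lake
    "active_records_dir_lake": "s3://.../lake/bi_test101/",      # placeholder path
    "delta_records_file_path": "s3://.../landing/bi_test101/",   # placeholder path
    "technical_key": ["m3d_timestamp", "datapakid", "partno", "record"],
    "business_key": ["..."],                        # table-specific, from s3_table_active.business_key
    "partition_columns": ["year", "month", "day"],  # e.g. when partitioned_by == "day"
    "partition_column": "...",                      # table-specific
    "partition_column_format": "..."                # table-specific
}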