示例#1
0
    def _create_s3_table(self,
                         s3_resource,
                         hql_validation_function,
                         hql_validation_error=None):
        """
        Create the lake and landing buckets and hand-assemble an S3Table.

        For LAKE_SPEC and then LANDING_SPEC the bucket is created and an empty
        object is stored under every key the spec lists. The table object is
        allocated with S3Table.__new__, so S3Table.__init__ is not executed
        and every attribute is assigned explicitly below.

        :param s3_resource: resource used to create the buckets and objects
        :param hql_validation_function: passed through to FakeStorageSystem
        :param hql_validation_error: passed through to FakeStorageSystem
        :return: the populated S3Table instance
        """
        # Lake first, then landing: create each bucket and its empty objects.
        for spec in (self.LAKE_SPEC, self.LANDING_SPEC):
            bucket_name = spec.bucket
            s3_resource.create_bucket(Bucket=bucket_name)
            for object_key in spec.keys:
                logging.info("Creating object s3://{}/{}".format(
                    bucket_name, object_key))
                s3_resource.Bucket(bucket_name).put_object(Key=object_key,
                                                           Body="")

        landing_spec = self.LANDING_SPEC
        lake_spec = self.LAKE_SPEC
        landing_data_uri = "s3://{}/{}".format(landing_spec.bucket,
                                               landing_spec.data_dir)
        lake_data_uri = "s3://{}/{}".format(lake_spec.bucket,
                                            lake_spec.data_dir)

        # Bypass S3Table.__init__; attributes are filled in one by one.
        s3_table = S3Table.__new__(S3Table)

        # landing
        s3_table.db_table_landing = landing_spec.table
        s3_table.dir_landing_data = landing_data_uri
        # NOTE(review): str.replace swaps every "data" substring of the URI,
        # bucket name included - confirm the spec values make that safe.
        s3_table.dir_landing_work = landing_data_uri.replace("data", "work")
        s3_table.dir_landing_archive = landing_data_uri.replace(
            "data", "archive")
        s3_table.dir_landing_final = s3_table.dir_landing_data

        # lake
        s3_table.db_table_lake = lake_spec.table
        s3_table.dir_lake_final = lake_data_uri

        s3_table.emr_system = FakeStorageSystem(hql_validation_function,
                                                hql_validation_error)
        s3_table.s3_resource = s3_resource

        # Table directories are built from the same bucket / data_dir pairs.
        s3_table.dir_landing_table = (
            "s3://" + landing_spec.bucket + "/" + landing_spec.data_dir)
        s3_table.dir_lake_table = (
            "s3://" + lake_spec.bucket + "/" + lake_spec.data_dir)

        s3_table.config_service = ConfigService(
            TestS3Table.DEFAULT_CONFIG_PATH)

        # Fixed table layout used by the tests.
        s3_table.partitioned_by = "month"
        s3_table.header_lines = 0
        s3_table.delimiter = "|"
        s3_table.columns_lake = [
            ("name1", "varchar(21)"),
            ("name2", "varchar(6)"),
            ("name3", "varchar(4)"),
        ]

        return s3_table
    def __init__(self, execution_system, algorithm_instance, algorithm_params):
        """
        Set up the decompression algorithm for its destination table.

        :param execution_system: an instance of EMRSystem object
        :param algorithm_instance: name of the algorithm instance
        :param algorithm_params: algorithm configuration; the
            "destination_table" entry names the table to work on
        """
        base = super(AlgorithmGzipDecompressionEMR, self)
        base.__init__(execution_system, algorithm_instance, algorithm_params)

        self._table = S3Table(execution_system,
                              algorithm_params["destination_table"])

        # NOTE(review): self._parameters is not assigned in this method -
        # presumably the parent initializer provides it; confirm.
        self._thread_pool_size = self._parameters["thread_pool_size"]
示例#3
0
    def create_dataset(execution_system, load_type, data_type, dataset_name):
        """
        Build the dataset object that matches the given data type.

        :param execution_system: execution system handed to the dataset
        :param load_type: load type; only checked for semistructured data
        :param data_type: DataType value selecting the dataset class
        :param dataset_name: name of the table / dataset to create
        :return: S3Table for structured data, SemistructuredDataSet for
            semistructured data loaded with the APPEND load type
        :raises M3DUnsupportedLoadTypeException: semistructured data with a
            load type other than APPEND
        :raises M3DUnsupportedDataTypeException: any other data type
        """
        if data_type == DataType.STRUCTURED:
            return S3Table(emr_system=execution_system,
                           destination_table=dataset_name)

        if data_type == DataType.SEMISTRUCTURED:
            if load_type == HiveTable.TableLoadType.APPEND:
                return SemistructuredDataSet(emr_system=execution_system,
                                             dataset_name=dataset_name)

            # Semistructured data supports the APPEND load type only.
            raise M3DUnsupportedLoadTypeException(
                load_type=load_type,
                message="Loading algorithm {} not support for data type {}."
                .format(load_type, data_type))

        raise M3DUnsupportedDataTypeException(
            message="Data Type {} not available.".format(data_type))
    def __init__(self,
                 test_run_dir,
                 setup_function,
                 partition_columns,
                 regex_filename,
                 file_format=None,
                 null_value=None,
                 quote_character=None,
                 compute_table_statistics=None):
        """
        Prepare configuration files and expected paths for an append-load test.

        :param test_run_dir: first argument handed to setup_function
        :param setup_function: callable invoked with test_run_dir followed by
            self.destination_params; must return five values
        :param partition_columns: forwarded to self._write_acon
        :param regex_filename: forwarded to self._write_acon
        :param file_format: forwarded to self._write_acon
        :param null_value: forwarded to self._write_acon
        :param quote_character: forwarded to self._write_acon
        :param compute_table_statistics: forwarded to self._write_acon
        """
        setup_args = [test_run_dir] + self.destination_params
        (self.config_file, _, self.tconx_file, self.config_dict,
         self.scon_emr_dict) = setup_function(*setup_args)

        self._write_acon(partition_columns, regex_filename, file_format,
                         null_value, quote_character, compute_table_statistics)
        self._write_tconx()

        leading_config = [self.config_file, self.cluster_mode]
        self.table_config = leading_config + self.destination_params

        self.s3_table = S3Table(
            EMRSystem(self.config_file, self.cluster_mode,
                      self.destination_system, self.destination_database,
                      self.destination_environment),
            self.destination_table)

        environment = self.destination_environment

        # Location of the append-load config file inside the apps directory.
        config_filename = "append_load-{}-{}.json".format(
            environment, self.destination_table)
        self.config_filepath = os.path.join(self.s3_table.dir_apps_append_load,
                                            config_filename)

        env_settings = self.scon_emr_dict["environments"][environment]
        self.db_name_lake = env_settings["schemas"]["lake"]

        # Bucket and deployment base are stripped of slashes before joining,
        # so the scheme prefix is followed directly by the bucket name.
        jar_path_parts = (
            env_settings["s3_buckets"]["application"].strip("/"),
            env_settings["s3_deployment_dir_base"].strip("/"),
            environment,
            self.scon_emr_dict["subdir"]["m3d"],
            self.config_dict["subdir_projects"]["m3d_api"],
            self.scon_emr_dict["spark"]["jar_name"],
        )
        self.expected_algorithms_jar_path = "s3://" + os.path.join(
            *jar_path_parts)
示例#5
0
    def __init__(self, execution_system, load_type, destination_table,
                 spark_params_dict):
        """
        Set up the load executor and tag the cluster with the load details.

        :param execution_system: execution system
        :param load_type: load type; must be a key of the mapping returned by
            self._get_available_emr_load_types()
        :param destination_table: table to load
        :param spark_params_dict: spark parameters
        :raises M3DUnsupportedLoadTypeException: load_type is not available
        """
        super(LoadExecutorHadoop, self).__init__(execution_system)

        self._destination_table = destination_table
        self._spark_params_dict = spark_params_dict

        # Reject unknown load types before any table object is built.
        load_wrappers = self._get_available_emr_load_types()
        if load_type not in load_wrappers:
            error_text = "Loading algorithm {} not available.".format(
                load_type)
            raise M3DUnsupportedLoadTypeException(load_type=load_type,
                                                  message=error_text)

        target_table = S3Table(emr_system=execution_system,
                               destination_table=destination_table)

        # NOTE(review): self._execution_system is not assigned here -
        # presumably the parent initializer stores it; confirm.
        wrapper_factory = load_wrappers[load_type]
        self._load_wrapper = wrapper_factory(
            execution_system=self._execution_system, table=target_table)

        cluster_tags = {
            EMRSystem.EMRClusterTag.API_METHOD: M3D.load_table.__name__,
            EMRSystem.EMRClusterTag.LOAD_TYPE: load_type,
            EMRSystem.EMRClusterTag.TARGET_TABLE: target_table.db_table_lake,
        }
        self._execution_system.add_cluster_tags(cluster_tags)
示例#6
0
    def test_lakeout_view_hql(self, add_tags_patch):
        """
        Verify the HQL uploaded by M3D.create_out_view() and the tags it adds.

        The test checks that exactly one hive step was added to the mocked
        cluster, that the HQL file referenced by the step drops and re-creates
        the lake-out view with the expected column aliases, and that the
        patched tagging call was made twice with the expected tags.

        :param add_tags_patch: mock object whose call_args_list is inspected
        """
        tconx_src_path = "test/resources/test_create_out_view_hive/test_lakeout_view_structure/config/tconx.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        env_setup_result = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )
        m3d_config_file, _, tconx_file, _, _ = env_setup_result

        # Use test case specific tconx
        tconx_content = py.path.local(tconx_src_path).read()
        py.path.local(tconx_file).write(tconx_content)

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]
        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

        with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
            logging.info("Calling M3D.create_out_view().")
            M3D.create_out_view(*table_config, **table_config_kwargs)

        # table_config holds exactly five entries, so the slice passes all of
        # them, destination_table included.
        # NOTE(review): confirm EMRSystem expects the table name as its fifth
        # positional argument.
        emr_system = EMRSystem(*table_config[:5])
        s3_table = S3Table(emr_system, destination_table)

        mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert len(mock_cluster.steps) == 1

        hive_step = mock_cluster.steps[0]
        for position, expected_arg in enumerate(("hive", "--silent", "-f")):
            assert hive_step.args[position] == expected_arg

        # The fourth step argument is the S3 location of the HQL file.
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

        column_name_pairs = [
            ("record_date", "v_record_date"),
            ("p_string", "v_string"),
            ("p_int", "v_int"),
            ("p_bigint", "v_bigint"),
            ("p_float", "v_float"),
            ("p_varchar_1", "v_varchar_10"),
            ("p_varchar_2", "v_varchar_100"),
            ("p_char_1", "v_char"),
            ("p_boolean", "v_boolean"),
            ("year", "year"),
            ("month", "month")
        ]
        columns_str = ", ".join(
            "{} AS {}".format(source_column, view_column)
            for source_column, view_column in column_name_pairs
        )

        # S3Table is partitioned by year and month
        view_name = s3_table.db_view_lake_out
        expected_hql = "\n".join([
            "DROP VIEW IF EXISTS {};".format(view_name),
            "CREATE VIEW {}".format(view_name),
            "AS",
            "SELECT {}".format(columns_str),
            "FROM {};".format(s3_table.db_table_lake)
        ])

        assert actual_hql_content_in_bucket == expected_hql

        # First positional argument of each tagging call is the tag list.
        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 2

        api_method_tags = tag_calls[0][0][0]
        assert api_method_tags == [{
            "Key": "ApiMethod",
            "Value": "create_out_view"
        }]

        target_view_tags = tag_calls[1][0][0]
        assert target_view_tags == [{
            "Key": "TargetView",
            "Value": "dev_lake_out.bi_test101"
        }]
示例#7
0
 def truncate_table(self, destination_table):
     """
     Tag the cluster with the lake table name, then truncate via S3Table.

     :param destination_table: table name without the database prefix
     """
     # Function-scope import kept as in the original.
     # NOTE(review): presumably avoids a circular import - confirm.
     from m3d.hadoop.emr.s3_table import S3Table

     qualified_name = "{}.{}".format(self.db_lake, destination_table)
     self.add_cluster_tag(self.EMRClusterTag.TARGET_TABLE, qualified_name)

     s3_table = S3Table(self, destination_table)
     s3_table.truncate_tables()
示例#8
0
 def drop_out_view(self, destination_table):
     """
     Tag the cluster with the lake-out view name, then drop it via S3Table.

     :param destination_table: table name without the database prefix
     """
     # Function-scope import kept as in the original.
     # NOTE(review): presumably avoids a circular import - confirm.
     from m3d.hadoop.emr.s3_table import S3Table

     qualified_view_name = "{}.{}".format(self.db_lake_out, destination_table)
     self.add_cluster_tag(self.EMRClusterTag.TARGET_VIEW, qualified_view_name)

     s3_table = S3Table(self, destination_table)
     s3_table.drop_out_view()
    def test_full_load_emr(self, _0, _1):
        """
        Run a FullLoad through M3D.load_table() and inspect the resulting step.

        Checks that the landing file is still in place, that the landing
        archive is empty, and that exactly one spark-submit step with the
        expected trailing arguments was added to the mocked cluster.

        :param _0: unused inside the test body
        :param _1: unused inside the test body
        """
        tconx_src_path = (
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        )

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        env_setup_result = super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )
        m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = env_setup_result

        # Replace the generated tconx with the test case specific one.
        tconx_content = py.path.local(tconx_src_path).read()
        py.path.local(tconx_file).write(tconx_content)

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table,
            load_type,
            self.emr_cluster_id,
            spark_external_parameters,
        ]

        # Extract bucket names
        env_settings = scon_emr_dict["environments"][destination_environment]
        bucket_application = env_settings["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        landing_object = os.path.join(test_s3_table.dir_landing_final,
                                      landing_dataset)
        self.dump_data_to_s3(landing_object, "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config)

        # Since we have offloaded data move operations to EMR Steps dir_landing_final will still have
        # old files in it and dir_landing_archive will not have new files
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == landing_object

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = region_backend.clusters[self.emr_cluster_id]
        assert len(fake_cluster.steps) == 1

        # The bucket name is concatenated directly with the joined path.
        jar_relative_path = os.path.join(
            env_settings["s3_deployment_dir_base"],
            destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])
        expected_algorithms_jar_path = "s3://" + bucket_application + jar_relative_path

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        # Last five arguments, checked from position -5 up to -1.
        expected_trailing_args = (
            "com.adidas.analytics.AlgorithmFactory",
            expected_algorithms_jar_path,
            "FullLoad",
            "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/"
            "full_load/full_load-dev-bi_test101.json",
            "s3",
        )
        for offset, expected_arg in zip(range(-5, 0), expected_trailing_args):
            assert spark_step.args[offset] == expected_arg
    def test_full_load_emr_external_spark_parameters(self, _0):
        """
        Run a FullLoad with external spark parameters and inspect the step.

        Checks that the landing file is still in place, that the landing
        archive is empty, and that the single spark-submit step carries the
        three external parameters as --conf options plus the expected
        trailing arguments.

        :param _0: unused inside the test body
        """
        tconx_src_path = (
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        )
        acon_src_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        env_setup_result = super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )
        m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = env_setup_result

        AconHelper.setup_acon_from_file(m3d_config_dict["tags"]["config"],
                                        destination_database,
                                        destination_environment,
                                        destination_table, acon_src_path)

        # Replace the generated tconx with the test case specific one.
        tconx_content = py.path.local(tconx_src_path).read()
        py.path.local(tconx_file).write(tconx_content)

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table,
            load_type,
            self.emr_cluster_id,
        ]

        # Extract bucket names
        env_settings = scon_emr_dict["environments"][destination_environment]
        bucket_application = env_settings["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        landing_object = os.path.join(test_s3_table.dir_landing_final,
                                      landing_dataset)
        self.dump_data_to_s3(landing_object, "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config,
                       spark_params=json.dumps(spark_external_parameters))

        # psv file will still be in landing since move operation should be
        # performed by EMR Step which we mock here. Accordingly archive will
        # still be empty.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == landing_object

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = region_backend.clusters[self.emr_cluster_id]
        assert len(fake_cluster.steps) == 1

        # The bucket name is concatenated directly with the joined path.
        jar_relative_path = os.path.join(
            env_settings["s3_deployment_dir_base"],
            destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])
        expected_algorithms_jar_path = "s3://" + bucket_application + jar_relative_path

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        # Positions 5, 7 and 9 hold the flag; 6, 8 and 10 hold its value.
        for flag_position in (5, 7, 9):
            assert spark_step.args[flag_position] == "--conf"

        # Compared as sets, so the order of the options does not matter.
        expected_spark_conf_options = {
            "{}={}".format(option_name, option_value)
            for option_name, option_value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {
            spark_step.args[value_position] for value_position in (6, 8, 10)
        }
        assert expected_spark_conf_options == actual_spark_conf_options

        # Last five arguments, checked from position -5 up to -1.
        expected_trailing_args = (
            "com.adidas.analytics.AlgorithmFactory",
            expected_algorithms_jar_path,
            "FullLoad",
            "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/"
            "full_load/full_load-dev-bi_test101.json",
            "s3",
        )
        for offset, expected_arg in zip(range(-5, 0), expected_trailing_args):
            assert spark_step.args[offset] == expected_arg
示例#11
0
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        """
        Run a DeltaLoad through M3D.load_table() and verify its artefacts.

        Checks the delta-load configuration file written to the apps
        directory, the single spark-submit step added to the mocked cluster,
        the tags passed to the patched tagging call, and that the patched
        removal call received the configuration file location.

        :param remove_json_patch: mock expected to be called once with the
            configuration file location
        :param add_tags_patch: mock whose call_args_list is inspected
        :param _0: unused inside the test body
        :param _1: unused inside the test body
        """
        # responses.add_passthru(self.default_server_url)

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        env_setup_result = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_active_table,
            src_tconx_content,
            src_tconx_cl_content
        )
        m3d_config_file, _, _, _, m3d_config_dict, scon_emr_dict = env_setup_result

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract bucket names
        env_settings = scon_emr_dict["environments"][destination_environment]
        bucket_application = env_settings["s3_buckets"]["application"]

        # Put lake data for changelog table, this should be archived
        changelog_object = os.path.join(s3_table_changelog.dir_lake_final,
                                        "changelog.parquet")
        self.dump_data_to_s3(changelog_object, "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(m3d_config_file,
                       destination_system,
                       destination_database,
                       destination_environment,
                       destination_active_table,
                       load_type,
                       self.emr_cluster_id,
                       spark_params=spark_external_parameters)

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)
        assert len(app_files) == 1

        delta_load_config_s3 = app_files[0]
        assert delta_load_config_s3 == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_content = self.get_object_content_from_s3(
            delta_load_config_s3)
        load_table_parameters = json.loads(delta_load_config_content)

        # Config entries that must mirror the active table, in check order.
        expected_table_parameters = [
            ("active_records_table_lake", s3_table_active.db_table_lake),
            ("active_records_dir_lake", s3_table_active.dir_lake_final),
            ("delta_records_file_path", s3_table_active.dir_landing_data),
            ("technical_key",
             ["m3d_timestamp", "datapakid", "partno", "record"]),
            ("business_key", s3_table_active.business_key),
        ]
        for parameter_name, expected_value in expected_table_parameters:
            assert load_table_parameters[parameter_name] == expected_value

        # Predefined partition names are expanded through Util; any other
        # value is compared as is.
        partitioned_by = s3_table_active.partitioned_by
        target_partitions = (
            Util.get_target_partitions_list(partitioned_by)
            if partitioned_by in Util.defined_partitions
            else partitioned_by
        )

        expected_partition_parameters = [
            ("target_partitions", target_partitions),
            ("partition_column", s3_table_active.partition_column),
            ("partition_column_format",
             s3_table_active.partition_column_format),
        ]
        for parameter_name, expected_value in expected_partition_parameters:
            assert load_table_parameters[parameter_name] == expected_value

        # Check EMR steps.
        region_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = region_backend.clusters[self.emr_cluster_id]
        assert len(fake_cluster.steps) == 1

        # The bucket name is concatenated directly with the joined path.
        jar_relative_path = os.path.join(
            env_settings["s3_deployment_dir_base"],
            destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])
        expected_algorithms_jar_path = "s3://" + bucket_application + jar_relative_path

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        # Last five arguments, checked from position -5 up to -1.
        expected_trailing_args = (
            "com.adidas.analytics.AlgorithmFactory",
            expected_algorithms_jar_path,
            "DeltaLoad",
            delta_load_config_s3,
            "s3",
        )
        for offset, expected_arg in zip(range(-5, 0), expected_trailing_args):
            assert delta_load_step.args[offset] == expected_arg

        def tag_key(tag):
            return tag["Key"]

        # Tags are sorted by key so their order in the call does not matter.
        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 1

        expected_tags = [{
            "Key": "ApiMethod",
            "Value": "load_table"
        }, {
            "Key": "LoadType",
            "Value": "DeltaLoad"
        }, {
            "Key": "TargetTable",
            "Value": "bi_test101"
        }]
        actual_tags = tag_calls[0][0][0]
        assert sorted(actual_tags, key=tag_key) == sorted(expected_tags,
                                                          key=tag_key)

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]