    def test_run_algorithm(self):
        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                self.destination_system,
                self.destination_database,
                self.destination_environment,
                self.destination_table
            )

        _, acon_dict = AconHelper.setup_acon_from_file(
            m3d_config_dict["tags"]["config"],
            self.destination_database,
            self.destination_environment,
            self.algorithm_instance,
            self.test_acon
        )

        algorithm_args = [
            m3d_config_file,
            self.destination_system,
            self.destination_database,
            self.destination_environment,
            self.algorithm_instance,
        ]

        algorithm_kwargs = {
            "emr_cluster_id": self.emr_cluster_id,
            "ext_params": json.dumps({
                "environment": {
                    "spark": {
                        "spark.driver.memory": "5G",
                        "spark.executor.memory": "20G",
                        "spark.executor.instances": 10,
                        "spark.executor.cores": 1,
                        "spark.scheduler.mode": "FAIR"
                    }
                },
                "algorithm": {
                    "destination_table": self.destination_table,
                }
            })
        }
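        # ext_params is serialized to JSON; the spark block presumably tunes the
        # spark-submit call on the EMR cluster, while the algorithm block names the
        # table the algorithm under test should work on.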

        bucket_landing = scon_emr_dict["environments"][self.destination_environment]["s3_buckets"]["landing"]

        expected_param_dict = {
            "directory": "s3://{bucket}/dev/bi/{table}/data/".format(
                bucket=bucket_landing,
                table=self.table
            ),
            "format": "csv",
            "thread_pool_size": 8
        }
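        # run_command_in_cluster is patched below so that, instead of talking to a real
        # cluster, the test inspects the generated spark-submit command: the Scala class
        # must be ScalaClasses.GZIP_DECOMPRESSOR and the parameter file uploaded to S3
        # must match expected_param_dict.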

        def run_command_in_cluster_patch(cmd, name):
            # Check command name
            assert "Running Spark Application" in str(name)
            logging.info("Command is: {0}".format(cmd))
            command_components = cmd.split()

            # Check algorithm name from the spark command
            algorithm_class_name = command_components[-3]
            assert algorithm_class_name == ScalaClasses.GZIP_DECOMPRESSOR

            # Check configuration file content
            algorithm_config_file_name = command_components[-2]
            actual_config_file_content = self.get_object_content_from_s3(algorithm_config_file_name)
            logging.info("Actual config content: {0}".format(actual_config_file_content))

            algorithm_config_file_dict = json.loads(actual_config_file_content)

            assert algorithm_config_file_dict == expected_param_dict

        with patch("m3d.hadoop.emr.emr_system.EMRSystem.run_command_in_cluster",
                   side_effect=run_command_in_cluster_patch):
            with patch("m3d.util.util.Util.send_email") as email_patch:
                M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

        # Check the successful execution of algorithm
        call_args, _ = email_patch.call_args
        assert str(call_args[1]).startswith("Success")

    # add_tags_patch, delete_object_patch and send_email_patch are mock objects,
    # presumably injected by @patch decorators on the test (not shown in this excerpt).
    def test_run_algorithm(self, add_tags_patch, delete_object_patch,
                           send_email_patch):
        parameters_dict = {
            "scala_class": "CustomScalaClass",
            "key_el": "val",
            "key_list": ["x", 15],
            "key_dict": {
                "first": 1,
                "second": "2nd"
            }
        }

        acon_dict = {
            "algorithm": {
                "python_class": "AlgorithmScalaRunner",
                "parameters": parameters_dict
            }
        }
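        # The acon (algorithm configuration) binds the generic AlgorithmScalaRunner to a
        # custom Scala class; parameters_dict is expected to be uploaded verbatim as the
        # job's config.json, which is asserted further down.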

        m3d_config_file, scon_emr_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.destination_system,
                self.destination_database,
                self.destination_environment,
                self.algorithm_instance,
                acon_dict
            )

        algorithm_args = [
            m3d_config_file, self.cluster_mode, self.destination_system,
            self.destination_database, self.destination_environment,
            self.algorithm_instance
        ]

        spark_options = {
            "spark.driver.memory": "5G",
            "spark.executor.memory": "35G",
            "spark.executor.instances": 12,
            "spark.executor.cores": 2,
            "spark.scheduler.mode": "FAIR"
        }

        ext_params_dict = {
            "environment": {
                "emr_cluster_id": self.emr_cluster_id,
                "spark": spark_options
            }
        }

        algorithm_kwargs = {"ext_params": json.dumps(ext_params_dict)}

        emr_steps_completer = self.create_emr_steps_completer(
            expected_steps_count=1, timeout_seconds=3)
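        # The steps completer runs concurrently with run_algorithm and presumably marks
        # the submitted step as completed in the mocked (moto) EMR cluster, so the call
        # below does not block until the timeout.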

        with ConcurrentExecutor(emr_steps_completer):
            M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

        # Check EMR step
        mock_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]
        assert len(mock_cluster.steps) == 1

        spark_step = mock_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"
        assert spark_step.args[11] == "--conf"
        assert spark_step.args[13] == "--conf"
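        # spark-submit passes each Spark setting as "--conf key=value": the flags sit at
        # the odd offsets 5..13 and their key=value payloads at 6..14. The payloads are
        # compared as a set below because their order is not guaranteed.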

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-3] == "CustomScalaClass"
        config_json_s3 = spark_step.args[-2]
        assert spark_step.args[-1] == "s3"

        # Check config.json file content
        config_json_content = self.get_object_content_from_s3(config_json_s3)
        config_json_dict = json.loads(config_json_content)
        assert config_json_dict == parameters_dict

        # Check that config.json was removed in the end
        delete_object_patch.assert_called_once()
        delete_object_patch_call_args, _ = delete_object_patch.call_args
        assert delete_object_patch_call_args == (config_json_s3, )

        # Check the successful execution of algorithm
        send_email_patch.assert_called_once()
        send_email_patch_call_args, _ = send_email_patch.call_args
        assert str(send_email_patch_call_args[1]).startswith("Success")

        add_tags_patch.assert_called_once()
        add_tags_patch_call_args, _ = add_tags_patch.call_args
        assert sorted(add_tags_patch_call_args[0], key=lambda x: x["Key"]) == sorted(
            [
                {"Key": "ApiMethod", "Value": "run_algorithm"},
                {"Key": "AlgorithmClass", "Value": "AlgorithmScalaRunner"},
                {"Key": "AlgorithmInstance", "Value": "scala_runner_custom"}
            ],
            key=lambda x: x["Key"]
        )

    def test_run_algorithm(self, email_patch, delete_object_patch,
                           add_tags_patch):
        m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
            self.local_run_dir, self.destination_system,
            self.destination_database, self.destination_environment)

        schema_lake = scon_emr_dict["environments"][
            self.destination_environment]["schemas"]["lake"]
        bucket_lake = scon_emr_dict["environments"][
            self.destination_environment]["s3_buckets"]["lake"]

        spark_options = {
            "spark.driver.memory": "5G",
            "spark.executor.memory": "20G",
            "spark.executor.instances": 10,
            "spark.executor.cores": 1,
            "spark.scheduler.mode": "FAIR"
        }

        ext_params_dict = {"environment": {"spark": spark_options}}

        algorithm_args = [
            m3d_config_file, self.destination_system,
            self.destination_database, self.destination_environment,
            self.algorithm_instance, self.emr_cluster_id,
            json.dumps(ext_params_dict)
        ]

        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        expected_step_count = 1
        timeout_seconds = 6

        emr_steps_completer = self.create_emr_steps_completer(
            expected_steps_count=expected_step_count,
            timeout_seconds=timeout_seconds)

        with ConcurrentExecutor(emr_steps_completer):
            M3D.run_algorithm(*algorithm_args)

        logging.info("Number of steps after execution: {}".format(
            len(fake_cluster.steps)))

        # Check the successful execution of algorithm
        email_patch.assert_called_once()
        call_args, _ = email_patch.call_args
        assert str(call_args[1]).startswith("Success")

        assert len(fake_cluster.steps) == expected_step_count

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"
        assert spark_step.args[11] == "--conf"
        assert spark_step.args[13] == "--conf"

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"

        assert spark_step.args[-3] == "NestedFlattener"
        spark_json_s3 = spark_step.args[-2]

        assert spark_step.args[-1] == "s3"
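        # Tail of the spark-submit call: the AlgorithmFactory entry point, presumably the
        # application jar (args[-4], not asserted here), the Scala algorithm class, the S3
        # key of the generated config file, and the storage scheme.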

        logging.info("Checking {}".format(spark_json_s3))

        # check that we tried to delete it
        delete_object_patch.assert_called_once()
        delete_object_call_args, _ = delete_object_patch.call_args
        assert str(delete_object_call_args[0]) == spark_json_s3

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted(
            [
                {"Key": "SourceTable", "Value": "s3://m3d-dev-lake/nest/nest_test/data"},
                {"Key": "TargetTable", "Value": "dev_lake.nest_flattened"}
            ],
            key=lambda x: x["Key"]
        )
        assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted(
            [
                {"Key": "ApiMethod", "Value": "run_algorithm"},
                {"Key": "AlgorithmClass", "Value": "AlgorithmNestedFlattener"},
                {"Key": "AlgorithmInstance", "Value": "nested_flattener"}
            ],
            key=lambda x: x["Key"]
        )

        # check content of config.json file
        spark_json_content = self.get_object_content_from_s3(spark_json_s3)

        spark_json_dict = json.loads(spark_json_content)

        assert spark_json_dict["source_location"] == os.path.join(
            ConfigService.Protocols.S3, bucket_lake, "nest/nest_test/data")
        assert spark_json_dict["target_table"] == schema_lake + ".nest_flattened"
        assert spark_json_dict["fields_to_flatten"] == [
            "user_attributes", "device_info", "events", "events__data",
            "events__data__device_current_state"
        ]
        assert spark_json_dict["column_mapping"] == {
            "batch_id":
            "batch_id",
            "environment":
            "environment",
            "timestamp_unixtime_ms":
            "event_timestamp",
            "message_type":
            "message_type",
            "device_info__brand":
            "device_brand",
            "device_info__network_country":
            "network_country",
            "events__event_type":
            "event_type",
            "events__data__screen_name":
            "screen_name",
            "events__data__device_current_state__total_system_memory_usage_bytes":
            "memory_usage_bytes"
        }
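        # Flattened column names join the nested field path with "__"; column_mapping
        # then renames those flattened names to the target schema's column names.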
        assert spark_json_dict["chars_to_replace"] == "[.:#]+"
        assert spark_json_dict["replacement_char"] == "_"

    def test_run_algorithm(self, email_patch, delete_object_patch,
                           add_tags_patch):
        m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
            self.local_run_dir, self.destination_system,
            self.destination_database, self.destination_environment)

        schema_lake = scon_emr_dict["environments"][
            self.destination_environment]["schemas"]["lake"]

        spark_options = {
            "spark.driver.memory": "5G",
            "spark.executor.memory": "20G",
            "spark.executor.instances": 10,
            "spark.executor.cores": 1,
            "spark.scheduler.mode": "FAIR"
        }

        ext_params_dict = {"environment": {"spark": spark_options}}

        algorithm_args = [
            m3d_config_file, self.destination_system,
            self.destination_database, self.destination_environment,
            self.algorithm_instance, self.emr_cluster_id,
            json.dumps(ext_params_dict)
        ]

        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        expected_step_count = 1
        timeout_seconds = 6

        emr_steps_completer = self.create_emr_steps_completer(
            expected_steps_count=expected_step_count,
            timeout_seconds=timeout_seconds)

        with ConcurrentExecutor(emr_steps_completer):
            M3D.run_algorithm(*algorithm_args)

        logging.info("Number of steps after execution: {}".format(
            len(fake_cluster.steps)))

        # Check the successful execution of algorithm
        email_patch.assert_called_once()
        call_args, _ = email_patch.call_args
        assert str(call_args[1]).startswith("Success")

        assert len(fake_cluster.steps) == expected_step_count

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"
        assert spark_step.args[11] == "--conf"
        assert spark_step.args[13] == "--conf"

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"

        assert spark_step.args[-3] == "Transpose"
        spark_json_s3 = spark_step.args[-2]

        assert spark_step.args[-1] == "s3"

        logging.info("Checking {}".format(spark_json_s3))

        # check that we tried to delete it
        delete_object_patch.assert_called_once()
        delete_object_call_args, _ = delete_object_patch.call_args
        assert str(delete_object_call_args[0]) == spark_json_s3

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted(
            [
                {"Key": "SourceTable", "Value": schema_lake + ".pretranspose"},
                {"Key": "TargetTable", "Value": schema_lake + ".transpose"}
            ],
            key=lambda x: x["Key"]
        )
        assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted(
            [
                {"Key": "ApiMethod", "Value": "run_algorithm"},
                {"Key": "AlgorithmClass", "Value": "AlgorithmTranspose"},
                {"Key": "AlgorithmInstance", "Value": "transpose"}
            ],
            key=lambda x: x["Key"]
        )

        # check content of config.json file
        spark_json_content = self.get_object_content_from_s3(spark_json_s3)

        spark_json_dict = json.loads(spark_json_content)

        assert spark_json_dict["source_table"] == schema_lake + ".pretranspose"
        assert spark_json_dict["target_table"] == schema_lake + ".transpose"
        assert spark_json_dict["group_by_column"] == [
            "product", "articleNo", "FactoryID"
        ]
        assert spark_json_dict["pivot_column"] == "name"
        assert spark_json_dict["aggregation_column"] == "value"