def test_run_algorithm(self):
    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            self.destination_system,
            self.destination_database,
            self.destination_environment,
            self.destination_table
        )

    _, acon_dict = AconHelper.setup_acon_from_file(
        m3d_config_dict["tags"]["config"],
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.test_acon
    )

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
    ]

    algorithm_kwargs = {
        "emr_cluster_id": self.emr_cluster_id,
        "ext_params": json.dumps({
            "environment": {
                "spark": {
                    "spark.driver.memory": "5G",
                    "spark.executor.memory": "20G",
                    "spark.executor.instances": 10,
                    "spark.executor.cores": 1,
                    "spark.scheduler.mode": "FAIR"
                }
            },
            "algorithm": {
                "destination_table": self.destination_table,
            }
        })
    }

    bucket_landing = scon_emr_dict["environments"][
        self.destination_environment]["s3_buckets"]["landing"]

    expected_param_dict = {
        "directory": "s3://{bucket}/dev/bi/{table}/data/".format(
            bucket=bucket_landing,
            table=self.table
        ),
        "format": "csv",
        "thread_pool_size": 8
    }

    def run_command_in_cluster_patch(cmd, name):
        # Check command name
        assert "Running Spark Application" in str(name)
        logging.info("Command is: {0}".format(cmd))
        command_components = cmd.split()

        # Check algorithm name from the spark command
        algorithm_class_name = command_components[-3]
        assert algorithm_class_name == ScalaClasses.GZIP_DECOMPRESSOR

        # Check configuration file content
        algorithm_config_file_name = command_components[-2]
        actual_config_file_content = self.get_object_content_from_s3(algorithm_config_file_name)
        logging.info("Actual config content: {0}".format(actual_config_file_content))

        algorithm_config_file_dict = json.loads(actual_config_file_content)
        assert algorithm_config_file_dict == expected_param_dict

    with patch("m3d.hadoop.emr.emr_system.EMRSystem.run_command_in_cluster",
               side_effect=run_command_in_cluster_patch):
        with patch("m3d.util.util.Util.send_email") as email_patch:
            M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

    # Check the successful execution of algorithm
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")
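    # NOTE: the negative indexing in run_command_in_cluster_patch assumes the
    # submit command ends with "<algorithm class> <s3 path to config file> <protocol>";
    # only the class name (command_components[-3]) and the config path
    # (command_components[-2]) are actually asserted on here, and the trailing
    # protocol token is an assumption carried over from the step-based tests below.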
def test_run_algorithm(self, add_tags_patch, delete_object_patch, send_email_patch):
    parameters_dict = {
        "scala_class": "CustomScalaClass",
        "key_el": "val",
        "key_list": ["x", 15],
        "key_dict": {
            "first": 1,
            "second": "2nd"
        }
    }

    acon_dict = {
        "algorithm": {
            "python_class": "AlgorithmScalaRunner",
            "parameters": parameters_dict
        }
    }

    m3d_config_file, scon_emr_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.destination_system,
            self.destination_database,
            self.destination_environment,
            self.algorithm_instance,
            acon_dict
        )

    algorithm_args = [
        m3d_config_file,
        self.cluster_mode,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance
    ]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "35G",
        "spark.executor.instances": 12,
        "spark.executor.cores": 2,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {
        "environment": {
            "emr_cluster_id": self.emr_cluster_id,
            "spark": spark_options
        }
    }

    algorithm_kwargs = {"ext_params": json.dumps(ext_params_dict)}

    emr_steps_completer = self.create_emr_steps_completer(
        expected_steps_count=1,
        timeout_seconds=3)

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

    # Check EMR step
    mock_cluster = self.mock_emr.backends[
        self.default_aws_region].clusters[self.emr_cluster_id]
    assert len(mock_cluster.steps) == 1

    spark_step = mock_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"

    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(
        map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(
        map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "CustomScalaClass"
    config_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    # Check config.json file content
    config_json_content = self.get_object_content_from_s3(config_json_s3)
    config_json_dict = json.loads(config_json_content)
    assert config_json_dict == parameters_dict

    # Check that config.json was removed in the end
    delete_object_patch.assert_called_once()
    delete_object_patch_call_args, _ = delete_object_patch.call_args
    assert delete_object_patch_call_args == (config_json_s3,)

    # Check the successful execution of algorithm
    send_email_patch.assert_called_once()
    send_email_patch_call_args, _ = send_email_patch.call_args
    assert str(send_email_patch_call_args[1]).startswith("Success")

    add_tags_patch.assert_called_once()
    add_tags_patch_call_args, _ = add_tags_patch.call_args
    assert sorted(add_tags_patch_call_args[0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmScalaRunner"},
        {"Key": "AlgorithmInstance", "Value": "scala_runner_custom"}
    ], key=lambda x: x["Key"])
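    # NOTE (layout inferred from the index assertions above; the @patch
    # decorators that inject the mock arguments are not shown in this excerpt):
    # the step args are assumed to have the shape
    #   spark-submit <4 launcher args> --conf k=v (x5)
    #       ... com.adidas.analytics.AlgorithmFactory <application jar>
    #       CustomScalaClass <s3 path to config.json> s3
    # args[1:5] and args[-4] (presumably the application jar) are left
    # unasserted, so only the marked positions are pinned down by this test.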
def test_run_algorithm(self, email_patch, delete_object_patch, add_tags_patch):
    m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        self.destination_system,
        self.destination_database,
        self.destination_environment)

    schema_lake = scon_emr_dict["environments"][
        self.destination_environment]["schemas"]["lake"]
    bucket_lake = scon_emr_dict["environments"][
        self.destination_environment]["s3_buckets"]["lake"]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "20G",
        "spark.executor.instances": 10,
        "spark.executor.cores": 1,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {"environment": {"spark": spark_options}}

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.emr_cluster_id,
        json.dumps(ext_params_dict)
    ]

    fake_cluster = self.mock_emr.backends[
        self.default_aws_region].clusters[self.emr_cluster_id]

    expected_step_count = 1
    timeout_seconds = 6

    emr_steps_completer = self.create_emr_steps_completer(
        expected_steps_count=expected_step_count,
        timeout_seconds=timeout_seconds)

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args)

    logging.info("Number of steps after execution: {}".format(len(fake_cluster.steps)))

    # Check the successful execution of algorithm
    email_patch.assert_called_once()
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")

    assert len(fake_cluster.steps) == expected_step_count

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"

    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(
        map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(
        map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "NestedFlattener"
    spark_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    logging.info("Checking {}".format(spark_json_s3))

    # check that we tried to delete it
    delete_object_patch.assert_called_once()
    delete_object_call_args, _ = delete_object_patch.call_args
    assert str(delete_object_call_args[0]) == spark_json_s3

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "SourceTable", "Value": "s3://m3d-dev-lake/nest/nest_test/data"},
        {"Key": "TargetTable", "Value": "dev_lake.nest_flattened"}
    ], key=lambda x: x["Key"])
    assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmNestedFlattener"},
        {"Key": "AlgorithmInstance", "Value": "nested_flattener"}
    ], key=lambda x: x["Key"])

    # check content of config.json file
    spark_json_content = self.get_object_content_from_s3(spark_json_s3)
    spark_json_dict = json.loads(spark_json_content)

    assert spark_json_dict["source_location"] == os.path.join(
        ConfigService.Protocols.S3, bucket_lake, "nest/nest_test/data")
    assert spark_json_dict["target_table"] == schema_lake + "." + "nest_flattened"
    assert spark_json_dict["fields_to_flatten"] == [
        "user_attributes",
        "device_info",
        "events",
        "events__data",
        "events__data__device_current_state"
    ]
    assert spark_json_dict["column_mapping"] == {
        "batch_id": "batch_id",
        "environment": "environment",
        "timestamp_unixtime_ms": "event_timestamp",
        "message_type": "message_type",
        "device_info__brand": "device_brand",
        "device_info__network_country": "network_country",
        "events__event_type": "event_type",
        "events__data__screen_name": "screen_name",
        "events__data__device_current_state__total_system_memory_usage_bytes": "memory_usage_bytes"
    }
    assert spark_json_dict["chars_to_replace"] == "[.:#]+"
    assert spark_json_dict["replacement_char"] == "_"
def test_run_algorithm(self, email_patch, delete_object_patch, add_tags_patch):
    m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        self.destination_system,
        self.destination_database,
        self.destination_environment)

    schema_lake = scon_emr_dict["environments"][
        self.destination_environment]["schemas"]["lake"]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "20G",
        "spark.executor.instances": 10,
        "spark.executor.cores": 1,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {"environment": {"spark": spark_options}}

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.emr_cluster_id,
        json.dumps(ext_params_dict)
    ]

    fake_cluster = self.mock_emr.backends[
        self.default_aws_region].clusters[self.emr_cluster_id]

    expected_step_count = 1
    timeout_seconds = 6

    emr_steps_completer = self.create_emr_steps_completer(
        expected_steps_count=expected_step_count,
        timeout_seconds=timeout_seconds)

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args)

    logging.info("Number of steps after execution: {}".format(len(fake_cluster.steps)))

    # Check the successful execution of algorithm
    email_patch.assert_called_once()
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")

    assert len(fake_cluster.steps) == expected_step_count

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"

    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(
        map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(
        map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "Transpose"
    spark_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    logging.info("Checking {}".format(spark_json_s3))

    # check that we tried to delete it
    delete_object_patch.assert_called_once()
    delete_object_call_args, _ = delete_object_patch.call_args
    assert str(delete_object_call_args[0]) == spark_json_s3

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "SourceTable", "Value": schema_lake + "." + "pretranspose"},
        {"Key": "TargetTable", "Value": schema_lake + "." + "transpose"}
    ], key=lambda x: x["Key"])
    assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmTranspose"},
        {"Key": "AlgorithmInstance", "Value": "transpose"}
    ], key=lambda x: x["Key"])

    # check content of config.json file
    spark_json_content = self.get_object_content_from_s3(spark_json_s3)
    spark_json_dict = json.loads(spark_json_content)

    assert spark_json_dict["source_table"] == schema_lake + "." + "pretranspose"
    assert spark_json_dict["target_table"] == schema_lake + "." + "transpose"
    assert spark_json_dict["group_by_column"] == ["product", "articleNo", "FactoryID"]
    assert spark_json_dict["pivot_column"] == "name"
    assert spark_json_dict["aggregation_column"] == "value"
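    # NOTE: the Transpose parameters asserted above read like a SQL PIVOT:
    # group_by_column fixes the row grain, distinct values of pivot_column
    # become new columns, and aggregation_column supplies the cell values.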