class TestLoadTableAppendS3(S3TableTestBase):
    """Tests for ``AppendLoadConfig.load_table`` run against the mocked EMR cluster and S3 buckets.

    Checks that several tests have in common (spark-submit arguments, uploaded
    config file, expected algorithm parameters) live in the private helpers
    below so each test only spells out what is specific to it.
    """

    # Kept as tuples so no test can mutate the shared fixtures; tests copy them into lists.
    _PARTITION_COLUMNS = ("year", "month", "day")
    _REGEX_FILENAME = (
        "[0-9]{4}",
        "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
        "(?<=[0-9]{6})([0-9]{2})",
    )

    # External Spark parameters handed to load_table() as JSON text.
    _SPARK_PARAMETERS_JSON = (
        ' { "spark.driver.memory": "99G", "spark.executor.instances": "99",'
        ' "spark.executor.memory": "90G" } '
    )

    def _fake_cluster(self):
        """Return the fake EMR cluster registered under ``self.emr_cluster_id``."""
        return self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    @staticmethod
    def _assert_algorithm_invocation(spark_step, config, load_type):
        """Assert that ``spark_step`` is a spark-submit call ending in the expected algorithm arguments.

        :param spark_step: EMR step taken from the fake cluster
        :param config: the ``AppendLoadConfig`` that triggered the step
        :param load_type: value expected at position -3 of the step arguments
        """
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == config.expected_algorithms_jar_path
        assert spark_step.args[-3] == load_type
        assert spark_step.args[-2] == config.config_filepath
        assert spark_step.args[-1] == "s3"

    def _assert_config_file_uploaded(self, config):
        """Assert the append-load application dir holds exactly one JSON file, the config file.

        :param config: the ``AppendLoadConfig`` under test
        :return: all child objects of the application directory
        """
        app_files = self.get_child_objects(config.s3_table.dir_apps_append_load)
        app_json_files = [app_file for app_file in app_files if os.path.basename(app_file).endswith(".json")]

        assert len(app_json_files) == 1
        assert app_json_files[0] == config.config_filepath
        return app_files

    @staticmethod
    def _table_full_name(config):
        """Return ``<lake db>.<destination table>`` for ``config``."""
        return "{}.{}".format(config.db_name_lake, config.destination_table)

    def _expected_parameters(self, config, partition_columns, regex_filename, file_format):
        """Build the algorithm parameters every successful append load is expected to write.

        :param config: the ``AppendLoadConfig`` under test
        :param partition_columns: partition columns passed to the config
        :param regex_filename: file-name regexes passed to the config
        :param file_format: expected value of the ``file_format`` entry
        :return: dict to compare with the JSON config file found on S3
        """
        return {
            "target_table": self._table_full_name(config),
            "source_dir": config.s3_table.dir_landing_final,
            "header_dir": config.s3_table.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "partition_columns": partition_columns,
            "regex_filename": regex_filename,
            "file_format": file_format,
        }

    def _assert_length_mismatch_rejected(self, partition_columns, regex_filename):
        """Assert that load_table() rejects partition columns and regexes of different lengths.

        :param partition_columns: partition columns passed to the config
        :param regex_filename: file-name regexes passed to the config
        """
        config = AppendLoadConfig(self.local_run_dir, self.env_setup, partition_columns, regex_filename)

        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, self._SPARK_PARAMETERS_JSON)

        # Checked after the block: anything following the raising call inside it would never run.
        assert str(ex.value).startswith("Lengths of partition_columns and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch, _0, _1):
        """Append load with null value, quote character and table statistics set explicitly."""
        partition_columns = list(self._PARTITION_COLUMNS)
        regex_filename = list(self._REGEX_FILENAME)

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            partition_columns,
            regex_filename,
            null_value="test_null_value",
            quote_character="test_quote",
            compute_table_statistics=True
        )
        fake_cluster = self._fake_cluster()

        config.load_table(self.emr_cluster_id, self._SPARK_PARAMETERS_JSON)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]
        assert spark_step.jar == "command-runner.jar"
        self._assert_algorithm_invocation(spark_step, config, config.load_type)

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self._assert_config_file_uploaded(config)

        # Check config file for Spark
        actual_parameters = json.loads(self.get_object_content_from_s3(config.config_filepath))
        expected_parameters = self._expected_parameters(config, partition_columns, regex_filename, "dsv")
        expected_parameters.update({
            "null_value": "test_null_value",
            "quote_character": "test_quote",
            "compute_table_statistics": True
        })
        assert actual_parameters == expected_parameters

        # Tags are compared sorted by key so the order in which they were passed does not matter.
        def tag_key(tag):
            return tag["Key"]

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "AppendLoad"},
            {"Key": "TargetTable", "Value": self._table_full_name(config)},
        ]
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=tag_key) == sorted(expected_tags, key=tag_key)

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        """Append load with ``"parquet"`` passed as the fifth positional config argument."""
        partition_columns = list(self._PARTITION_COLUMNS)
        regex_filename = list(self._REGEX_FILENAME)

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            partition_columns,
            regex_filename,
            "parquet"
        )
        fake_cluster = self._fake_cluster()

        config.load_table(self.emr_cluster_id, self._SPARK_PARAMETERS_JSON)

        # Check EMR steps
        assert len(fake_cluster.steps) == 1

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]
        assert spark_step.jar == "command-runner.jar"
        self._assert_algorithm_invocation(spark_step, config, config.load_type)

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self._assert_config_file_uploaded(config)

        # Check config file for Spark
        actual_parameters = json.loads(self.get_object_content_from_s3(config.config_filepath))
        expected_parameters = self._expected_parameters(config, partition_columns, regex_filename, "parquet")
        assert actual_parameters == expected_parameters

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):
        """External Spark parameters must show up as ``--conf key=value`` pairs in the EMR step."""
        partition_columns = list(self._PARTITION_COLUMNS)
        regex_filename = list(self._REGEX_FILENAME)
        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        config = AppendLoadConfig(self.local_run_dir, self.env_setup, partition_columns, regex_filename)
        fake_cluster = self._fake_cluster()

        config.load_table(self.emr_cluster_id, json.dumps(spark_external_parameters))

        # Check EMR step.
        assert len(fake_cluster.steps) == 1
        spark_step = fake_cluster.steps[0]

        # Check args of EMR step
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        # The option values are compared as sets: only their presence is checked, not their order.
        expected_spark_conf_options = {
            "{}={}".format(name, value) for name, value in spark_external_parameters.items()
        }
        actual_spark_conf_options = {spark_step.args[index] for index in (6, 8, 10)}
        assert expected_spark_conf_options == actual_spark_conf_options

        self._assert_algorithm_invocation(spark_step, config, "AppendLoad")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):
        """Two partition columns with three regexes must be rejected."""
        self._assert_length_mismatch_rejected(["year", "month"], list(self._REGEX_FILENAME))

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):
        """Three partition columns with no regexes must be rejected."""
        self._assert_length_mismatch_rejected(list(self._PARTITION_COLUMNS), [])
class TestTruncateTableS3Integration(S3TableTestBase):
    """Checks ``M3D.truncate_table`` on the mocked EMR cluster: generated HQL, S3 cleanup and cluster tags."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Truncating a table must empty the landing and lake buckets and submit two Hive steps."""
        cluster_mode = False
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        source_system = "bi"
        table = "test101"

        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

        table_config_args = [
            m3d_config_file,
            cluster_mode,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]
        table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        environment_config = scon_emr_dict["environments"][destination_environment]
        db_landing = environment_config["schemas"]["landing"]
        db_lake = environment_config["schemas"]["lake"]
        bucket_landing = environment_config["s3_buckets"]["landing"]
        bucket_lake = environment_config["s3_buckets"]["lake"]

        test_content = "sample content"

        # Landing and lake objects use the same key prefix; only the bucket differs.
        table_dir = "{environment}/{source_system}/{table}".format(
            environment=destination_environment,
            source_system=source_system,
            table=table
        )

        landing_data_dir = os.path.join(table_dir, "data")
        landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")
        landing_archive_key = os.path.join(table_dir, "archive", "old_dump.gz")
        landing_work_key = os.path.join(table_dir, "work", "temporary_data")

        lake_data_dir = os.path.join(table_dir, "data")
        lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        for landing_key in (landing_data_key, landing_archive_key, landing_work_key):
            self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

        logging.info("Calling M3D.truncate_table()")
        M3D.truncate_table(*table_config_args, **table_config_kwargs)

        emr_backend = self.mock_emr.backends[self.default_aws_region]
        fake_cluster = emr_backend.clusters[self.emr_cluster_id]

        assert len(fake_cluster.steps) == 2

        # Get actual HQL statements
        actual_hqls = []
        for step in fake_cluster.steps:
            assert ["hive", "--silent", "-f"] == step.args[0:3]
            # The fourth argument is the S3 location of the HQL file executed by the step.
            actual_hqls.append(self.get_object_content_from_s3(step.args[3]))

        db_table_landing = "{}.{}{}".format(
            db_landing,
            destination_table,
            m3d_config_dict["tags"]["table_suffix_stage"]
        )
        # The trailing empty component makes os.path.join end the location with a slash.
        landing_table_location = os.path.join("s3://", bucket_landing, landing_data_dir, "")

        db_table_lake = "{}.{}".format(db_lake, destination_table)
        lake_table_location = os.path.join("s3://", bucket_lake, lake_data_dir, "")

        landing_hql = "ALTER TABLE {} SET LOCATION \"{}\";".format(db_table_landing, landing_table_location)
        lake_hql = "\n".join([
            "DROP TABLE {};".format(db_table_lake),
            TestTruncateTableS3Integration._get_table_ddl_lake(db_table_lake, lake_table_location),
            "MSCK REPAIR TABLE {};".format(db_table_lake)
        ])

        expected_hqls = [landing_hql, lake_hql]
        assert actual_hqls == expected_hqls

        # checking landing directory; comparing with [] shows any leftover keys on failure
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert landing_files == []

        # checking lake directory
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert lake_files == []

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "truncate_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]

    @staticmethod
    def _get_table_ddl_lake(db_table, location):
        """Build the CREATE EXTERNAL TABLE statement expected for the lake table.

        :param db_table: fully qualified table name (``<db>.<table>``)
        :param location: value placed in the LOCATION clause
        :return: the DDL statement, one clause per line
        """
        columns = ", ".join([
            "name1 varchar(21)",
            "name2 varchar(6)",
            "name3 varchar(4)"
        ])

        return "\n".join([
            "CREATE EXTERNAL TABLE {}({})".format(db_table, columns),
            "PARTITIONED BY (year smallint, month smallint)",
            "STORED AS PARQUET",
            "LOCATION \'{}\'".format(location),
            "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"
        ])
class TestDropDatasetS3Integration(EMRSystemUnitTestBase):
    """Checks that ``M3D.drop_dataset`` removes the dataset's objects from S3 and tags the cluster."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Objects uploaded to the landing and lake buckets must be gone after drop_dataset()."""
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_dataset = "nest_nest_test"

        source_system = "nest"
        short_dataset_name = "nest_test"

        # Only the config file path and the scon dict are needed by this test.
        m3d_config_file, _, _, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment
        )

        dataset_config_args = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_dataset
        ]
        dataset_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

        environment_config = scon_emr_dict["environments"][destination_environment]
        db_lake = environment_config["schemas"]["lake"]
        bucket_landing = environment_config["s3_buckets"]["landing"]
        bucket_lake = environment_config["s3_buckets"]["lake"]

        test_content = "sample content"

        # Landing and lake objects use the same key prefix; only the bucket differs.
        dataset_data_dir = os.path.join(
            "{environment}/{source_system}/{dataset}".format(
                environment=destination_environment,
                source_system=source_system,
                dataset=short_dataset_name
            ),
            "data"
        )
        landing_data_key = os.path.join(dataset_data_dir, "new_landing_dump")
        lake_data_key = os.path.join(dataset_data_dir, "new_lake_dump")

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_data_key, Body=test_content)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

        # checking if landing and lake directories contain the uploaded files
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(landing_files) == 1
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(lake_files) == 1

        logging.info("Calling M3D.drop_dataset()")
        M3D.drop_dataset(*dataset_config_args, **dataset_config_kwargs)

        # checking if the files were removed; comparing with [] shows any leftover keys on failure
        landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert landing_files == []
        lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert lake_files == []

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "drop_dataset"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetDataset",
            "Value": "{}.{}".format(db_lake, destination_dataset)
        }]
class TestDropOutViewS3Integration(S3TableTestBase):
    """Checks the Hive step and the cluster tags produced by ``M3D.drop_out_view``."""

    default_tconx = "test/resources/test_drop_out_view_s3/tconx-bdp-emr_test-dev-bi_test101.csv"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """drop_out_view() must submit one Hive step running a DROP VIEW IF EXISTS statement."""
        system, database, environment, table_name = "bdp", "emr_test", "dev", "bi_test101"

        setup_result = self.env_setup(self.local_run_dir, system, database, environment, table_name)
        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = setup_result

        positional_args = [m3d_config_file, system, database, environment, table_name]
        keyword_args = {"emr_cluster_id": self.emr_cluster_id}

        view_name = "bi_test101"
        view_database = scon_emr_dict["environments"][environment]["schemas"]["lake_out"]

        logging.info("Calling M3D.drop_out_view()")
        M3D.drop_out_view(*positional_args, **keyword_args)

        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert 1 == len(cluster.steps)

        # The step must start with the Hive invocation, followed by the HQL file location.
        step = cluster.steps[0]
        for position, expected_argument in enumerate(("hive", "--silent", "-f")):
            assert step.args[position] == expected_argument

        hql_in_bucket = self.get_object_content_from_s3(step.args[3])
        hql_wanted = "DROP VIEW IF EXISTS {}.{};".format(view_database, view_name)
        assert hql_wanted == hql_in_bucket

        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 2

        first_tags = tag_calls[0][0][0]
        assert first_tags == [{"Key": "ApiMethod", "Value": "drop_out_view"}]

        second_tags = tag_calls[1][0][0]
        assert second_tags == [{"Key": "TargetView", "Value": "dev_lake_out.bi_test101"}]
class TestDropTableS3Integration(S3TableTestBase):
    """Checks that ``M3D.drop_table`` drops the Hive tables while leaving the S3 data in place."""

    default_tconx = "test/resources/test_drop_table_s3/tconx-bdp-emr_prod-dev-bi_test101.json"

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_s3_cleanup(self, add_tags_patch, _):
        """Create and drop a table, then verify the HQL steps, the untouched S3 objects and the tags."""
        logging.info("Starting s3 Checkup cleanup")

        system, database, environment, table_name = "bdp", "emr_test", "dev", "bi_test101"

        setup_result = self.env_setup(self.local_run_dir, system, database, environment, table_name)
        m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = setup_result

        positional_args = [m3d_config_file, system, database, environment, table_name]
        keyword_args = {"emr_cluster_id": self.emr_cluster_id}

        schemas = scon_emr_dict["environments"][environment]["schemas"]
        buckets = scon_emr_dict["environments"][environment]["s3_buckets"]
        db_landing, db_lake = schemas["landing"], schemas["lake"]
        bucket_landing, bucket_lake = buckets["landing"], buckets["lake"]

        payload = "sample content"

        # Both objects live under the same table data directory and differ only in the object name.
        key_template = "{environment}/{source_system}/{table}/data/{obj_name}"
        land_key = key_template.format(
            environment=environment, source_system="bi", table="test101", obj_name="test_land_key"
        )
        lake_key = key_template.format(
            environment=environment, source_system="bi", table="test101", obj_name="test_lake_key"
        )

        # adding data to landing and lake directories
        self.s3_resource.Bucket(bucket_landing).put_object(Key=land_key, Body=payload)
        self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_key, Body=payload)

        logging.info("Calling M3D.create_table()")
        M3D.create_table(*positional_args, **keyword_args)

        logging.info("Calling M3D.drop_table()")
        M3D.drop_table(*positional_args, **keyword_args)

        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert 3 == len(cluster.steps)

        # Collect the HQL submitted by every step
        submitted_hqls = []
        for emr_step in cluster.steps:
            assert ["hive", "--silent", "-f"] == emr_step.args[0:3]
            submitted_hqls.append(self.get_object_content_from_s3(emr_step.args[3]))

        stage_suffix = m3d_config_dict["tags"]["table_suffix_stage"]
        wanted_hqls = [
            'DROP TABLE {}.{}{};'.format(db_landing, table_name, stage_suffix),
            'DROP TABLE {}.{};'.format(db_lake, table_name)
        ]

        # Only the second and third steps are compared with the DROP statements.
        assert wanted_hqls == submitted_hqls[1:3]

        # checking landing directory
        keys_in_landing = [obj.key for obj in self.s3_resource.Bucket(bucket_landing).objects.all()]
        assert len(keys_in_landing) == 1
        assert keys_in_landing[0] == land_key

        # checking lake directory
        keys_in_lake = [obj.key for obj in self.s3_resource.Bucket(bucket_lake).objects.all()]
        assert len(keys_in_lake) == 1
        assert keys_in_lake[0] == lake_key

        tag_calls = add_tags_patch.call_args_list
        assert len(tag_calls) == 4
        assert tag_calls[0][0][0] == [{"Key": "ApiMethod", "Value": "create_table"}]
        assert tag_calls[1][0][0] == [{"Key": "TargetTable", "Value": "dev_lake.bi_test101"}]
        assert tag_calls[2][0][0] == [{"Key": "ApiMethod", "Value": "drop_table"}]
        assert tag_calls[3][0][0] == [{"Key": "TargetTable", "Value": "dev_lake.bi_test101"}]
class TestCreateTableS3(S3TableTestBase):
    """Checks the HQL file and the cluster tags produced by ``M3D.create_table``.

    Both tests run the same scenario; they differ only in the optional extra
    positional argument passed to create_table() and in the lake LOCATION it
    results in, so the scenario itself lives in ``_create_table_and_check``.
    """

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql(self, add_tags_patch, _):
        """Without a custom location the lake table must point at the ``data`` directory."""
        logging.info("Starting TestCreateTableS3.test_check_hql()")

        self._create_table_and_check(add_tags_patch, "data")

    @pytest.mark.emr
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_check_hql_with_custom_location(self, add_tags_patch, _):
        """A location prefix passed to create_table() must replace ``data`` in the lake LOCATION."""
        logging.info("Starting TestCreateTableS3.test_check_hql_with_custom_location()")

        destination_table_location_prefix = "data_20200101100015123"

        self._create_table_and_check(
            add_tags_patch,
            destination_table_location_prefix,
            destination_table_location_prefix
        )

    def _create_table_and_check(self, add_tags_patch, lake_data_subdir, *extra_table_args):
        """Call ``M3D.create_table`` and verify the submitted Hive step, its HQL and the cluster tags.

        :param add_tags_patch: mock replacing ``EMRClusterClient._do_add_emr_cluster_tags``
        :param lake_data_subdir: last directory expected in the LOCATION of the lake table
        :param extra_table_args: positional arguments appended after the destination table
        """
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, _, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]
        table_config.extend(extra_table_args)

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        logging.info("Calling M3D.create_table().")
        M3D.create_table(*table_config, **table_config_kwargs)

        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        hive_step = fake_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
        db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

        ddl_landing = (
            "CREATE DATABASE IF NOT EXISTS dev_landing;\n"
            "CREATE DATABASE IF NOT EXISTS dev_lake;\n"
            "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), "
            "name3 varchar(4))\n"
            "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n"
            "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n"
            "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"
        )

        # Only the LOCATION clause varies between the tests, so only that line is formatted.
        ddl_lake = (
            "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), "
            "name3 varchar(4))\n"
            "PARTITIONED BY (year smallint, month smallint)\n"
            "STORED AS PARQUET\n"
            + "LOCATION 's3://m3d-dev-lake/dev/bi/test101/{}/'\n".format(lake_data_subdir)
            + "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"
        )

        # Get content of hql in s3 bucket
        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
        expected_hql = "\n".join([
            ddl_landing,
            "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table),
            ddl_lake,
            "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)
        ])

        logging.info("Expected: {0}\n".format(expected_hql))
        logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

        assert actual_hql_content_in_bucket == expected_hql

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_table"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetTable",
            "Value": "dev_lake.bi_test101"
        }]
class TestLoadTableFullS3(S3TableTestBase):
    """Tests of ``M3D.load_table`` with the "FullLoad" load type against the mocked EMR cluster."""

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    def test_full_load_emr(self, _0, _1):
        """
        Run a full load with the Spark parameters passed positionally as a JSON string.

        Checks that the landing file is still in place, that the landing archive is empty and
        that exactly one spark-submit step with the expected trailing arguments was added.
        """
        system = "bdp"
        database = "emr_test"
        environment = "dev"
        table = "bi_test101"

        load_type = "FullLoad"
        dataset_name = "landing-dataset.psv"
        tconx_fixture_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

        spark_params_json = '''{ "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(self.local_run_dir, system, database, environment, table)

        # Write the fixture's content to the tconx path returned by env_setup().
        py.path.local(tconx_file).write(py.path.local(tconx_fixture_path).read())

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][environment]["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        s3_table = S3Table(emr_system, table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(s3_table.dir_landing_final, dataset_name),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|"
        )

        M3D.load_table(
            m3d_config_file,
            system,
            database,
            environment,
            table,
            load_type,
            self.emr_cluster_id,
            spark_params_json
        )

        # Data move operations are offloaded to EMR steps, so the landing directory still
        # holds the old file and the landing archive has not received anything.
        files_in_landing = self.get_child_objects(s3_table.dir_landing_final)
        assert len(files_in_landing) == 1
        assert files_in_landing[0] == os.path.join(s3_table.dir_landing_final, dataset_name)

        files_in_archive = self.get_child_objects(s3_table.dir_landing_archive)
        assert len(files_in_archive) == 0

        # Check EMR steps.
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert len(cluster.steps) == 1

        expected_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"]
        )

        # Check args of spark-submit EMR step
        step = cluster.steps[0]
        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == expected_jar_path
        assert step.args[-3] == "FullLoad"
        assert step.args[-2] == (
            "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/"
            "full_load/full_load-dev-bi_test101.json"
        )
        assert step.args[-1] == "s3"

    @pytest.mark.emr
    @mock.patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_full_load_emr_external_spark_parameters(self, _0):
        """
        Run a full load with the Spark parameters passed through the ``spark_params`` keyword.

        In addition to the landing/archive checks, verifies that arguments 5-10 of the step are
        three ``--conf`` flags, each followed by one of the supplied ``key=value`` options.
        """
        system = "bdp"
        database = "emr_test"
        environment = "dev"
        table = "bi_test101"

        tconx_fixture_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        acon_fixture_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

        spark_params = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "FullLoad"
        dataset_name = "landing-dataset.psv"

        m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(self.local_run_dir, system, database, environment, table)

        AconHelper.setup_acon_from_file(
            m3d_config_dict["tags"]["config"],
            database,
            environment,
            table,
            acon_fixture_path
        )

        # Write the fixture's content to the tconx path returned by env_setup().
        py.path.local(tconx_file).write(py.path.local(tconx_fixture_path).read())

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][environment]["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        s3_table = S3Table(emr_system, table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(s3_table.dir_landing_final, dataset_name),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|"
        )

        M3D.load_table(
            m3d_config_file,
            system,
            database,
            environment,
            table,
            load_type,
            self.emr_cluster_id,
            spark_params=json.dumps(spark_params)
        )

        # The psv file is still in landing because the move is performed by an EMR step,
        # which is mocked here. Accordingly the archive is still empty.
        files_in_landing = self.get_child_objects(s3_table.dir_landing_final)
        assert len(files_in_landing) == 1
        assert files_in_landing[0] == os.path.join(s3_table.dir_landing_final, dataset_name)

        files_in_archive = self.get_child_objects(s3_table.dir_landing_archive)
        assert len(files_in_archive) == 0

        # Check EMR steps.
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert len(cluster.steps) == 1

        expected_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"]
        )

        step = cluster.steps[0]
        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"
        assert step.args[5] == "--conf"
        assert step.args[7] == "--conf"
        assert step.args[9] == "--conf"

        # The order of the --conf options is not asserted, hence the comparison as sets.
        expected_conf_options = {"{}={}".format(name, value) for name, value in spark_params.items()}
        actual_conf_options = {step.args[position] for position in (6, 8, 10)}
        assert expected_conf_options == actual_conf_options

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == expected_jar_path
        assert step.args[-3] == "FullLoad"
        assert step.args[-2] == (
            "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/"
            "full_load/full_load-dev-bi_test101.json"
        )
        assert step.args[-1] == "s3"
class TestLoadTableDeltaS3(S3TableTestBase):
    """Tests of ``M3D.load_table`` with the "DeltaLoad" load type against the mocked EMR cluster."""

    def env_setup(
            self,
            tmpdir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table,
            tconx_content=None,
            tconx_cl_content=None
    ):
        """
        Extend the base environment setup with a tconx file for the changelog ("_cl") table.

        :param tmpdir: forwarded unchanged to the parent ``env_setup``
        :param destination_system: system code, also used in the changelog tconx file name
        :param destination_database: database code, also used in the changelog tconx file name
        :param destination_environment: environment, also used in the changelog tconx file name
        :param destination_table: active table name; the changelog table is this name plus "_cl"
        :param tconx_content: if truthy, written to the active table's tconx file
        :param tconx_cl_content: if truthy, written to the changelog table's tconx file
        :return: tuple of (m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file,
            m3d_config_dict, scon_emr_dict)
        """
        base_setup = super(TestLoadTableDeltaS3, self).env_setup(
            tmpdir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )
        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = base_setup

        # The changelog tconx file is placed in the same directory as the active table's tconx file.
        cl_filename = "tconx-{source_system}-{db_cd}-{environment}-{table}.json".format(
            source_system=destination_system,
            db_cd=destination_database,
            environment=destination_environment,
            table=destination_table + "_cl"
        )
        tconx_cl_file = os.path.join(os.path.dirname(tconx_file), cl_filename)

        if tconx_content:
            py.path.local(tconx_file).write(tconx_content)

        if tconx_cl_content:
            py.path.local(tconx_cl_file).write(tconx_cl_content)

        return (
            m3d_config_file,
            scon_emr_file,
            tconx_file,
            tconx_cl_file,
            m3d_config_dict,
            scon_emr_dict
        )

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        """
        Run a delta load with the Spark parameters given as a JSON string.

        Checks the delta load configuration file written to S3, the arguments of the single
        spark-submit step, the EMR cluster tags and the removal of the parameter JSON file.
        """
        system = "bdp"
        database = "emr_test"
        environment = "dev"
        active_table = "bi_test101"
        changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        active_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        changelog_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_params_json = '''{ "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        active_tconx_content = py.path.local(active_tconx_path).read()
        changelog_tconx_content = py.path.local(changelog_tconx_path).read()

        m3d_config_file, _, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            system,
            database,
            environment,
            active_table,
            active_tconx_content,
            changelog_tconx_content
        )

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        s3_table_active = S3Table(emr_system, active_table)
        s3_table_changelog = S3Table(emr_system, changelog_table)

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][environment]["s3_buckets"]["application"]

        # Put lake data for changelog table, this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(
            m3d_config_file,
            system,
            database,
            environment,
            active_table,
            load_type,
            self.emr_cluster_id,
            spark_params=spark_params_json
        )

        config_filename = "delta_load-{environment}-{table}.json".format(
            environment=environment,
            table=active_table
        )

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1
        assert app_files[0] == s3_table_active.dir_apps_delta_load + config_filename

        config_s3_path = app_files[0]
        written_params = json.loads(self.get_object_content_from_s3(config_s3_path))

        assert written_params["active_records_table_lake"] == s3_table_active.db_table_lake
        assert written_params["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert written_params["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert written_params["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert written_params["business_key"] == s3_table_active.business_key

        # Predefined partition names are expanded by Util; anything else is compared as is.
        target_partitions = (
            Util.get_target_partitions_list(s3_table_active.partitioned_by)
            if s3_table_active.partitioned_by in Util.defined_partitions
            else s3_table_active.partitioned_by
        )

        assert written_params["target_partitions"] == target_partitions
        assert written_params["partition_column"] == s3_table_active.partition_column
        assert written_params["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert len(cluster.steps) == 1

        expected_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"]
        )

        step = cluster.steps[0]

        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == expected_jar_path
        assert step.args[-3] == "DeltaLoad"
        assert step.args[-2] == config_s3_path
        assert step.args[-1] == "s3"

        def tag_key(tag):
            return tag["Key"]

        tagging_calls = add_tags_patch.call_args_list
        assert len(tagging_calls) == 1

        # Tags are compared after sorting by key, so their order in the call does not matter.
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "DeltaLoad"},
            {"Key": "TargetTable", "Value": "bi_test101"}
        ]
        assert sorted(tagging_calls[0][0][0], key=tag_key) == sorted(expected_tags, key=tag_key)

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_delta_external_spark_parameters(self, remove_json_patch, _0, _1):
        """
        Run a delta load with the Spark parameters built from a dict via ``json.dumps``.

        Besides the configuration file checks, verifies that arguments 5-10 of the step are
        three ``--conf`` flags, each followed by one of the supplied ``key=value`` options.
        """
        system = "bdp"
        database = "emr_test"
        environment = "dev"
        active_table = "bi_test101"
        changelog_table = "bi_test101_cl"

        spark_params = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "DeltaLoad"

        active_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        changelog_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        active_tconx_content = py.path.local(active_tconx_path).read()
        changelog_tconx_content = py.path.local(changelog_tconx_path).read()

        m3d_config_file, _, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
            self.local_run_dir,
            system,
            database,
            environment,
            active_table,
            active_tconx_content,
            changelog_tconx_content
        )

        emr_system = EMRSystem(m3d_config_file, system, database, environment)
        s3_table_active = S3Table(emr_system, active_table)
        s3_table_changelog = S3Table(emr_system, changelog_table)

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][environment]["s3_buckets"]["application"]

        # Put lake data for changelog table, this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(
            m3d_config_file,
            system,
            database,
            environment,
            active_table,
            load_type,
            self.emr_cluster_id,
            spark_params=json.dumps(spark_params)
        )

        config_filename = "delta_load-{environment}-{table}.json".format(
            environment=environment,
            table=active_table
        )

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1
        assert app_files[0] == s3_table_active.dir_apps_delta_load + config_filename

        config_s3_path = app_files[0]
        written_params = json.loads(self.get_object_content_from_s3(config_s3_path))

        assert written_params["active_records_table_lake"] == s3_table_active.db_table_lake
        assert written_params["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert written_params["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert written_params["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert written_params["business_key"] == s3_table_active.business_key

        # Predefined partition names are expanded by Util; anything else is compared as is.
        target_partitions = (
            Util.get_target_partitions_list(s3_table_active.partitioned_by)
            if s3_table_active.partitioned_by in Util.defined_partitions
            else s3_table_active.partitioned_by
        )

        assert written_params["target_partitions"] == target_partitions
        assert written_params["partition_column"] == s3_table_active.partition_column
        assert written_params["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert len(cluster.steps) == 1

        expected_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][environment]["s3_deployment_dir_base"],
            environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"]
        )

        step = cluster.steps[0]

        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"
        assert step.args[5] == "--conf"
        assert step.args[7] == "--conf"
        assert step.args[9] == "--conf"

        # The order of the --conf options is not asserted, hence the comparison as sets.
        expected_conf_options = {"{}={}".format(name, value) for name, value in spark_params.items()}
        actual_conf_options = {step.args[position] for position in (6, 8, 10)}
        assert expected_conf_options == actual_conf_options

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == expected_jar_path
        assert step.args[-3] == "DeltaLoad"
        assert step.args[-2] == config_s3_path
        assert step.args[-1] == "s3"

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]
class TestLoadTableAppendS3(S3TableTestBase):
    """Tests of append loads started through ``AppendLoadConfig.load_table`` on the mocked EMR cluster."""

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append(self, remove_json_patch, add_tags_patch, _0, _1):
        """
        Run an append load of structured data with all optional settings supplied.

        Checks the spark-submit step, the parameter file written to the application bucket,
        the EMR cluster tags and the removal of the parameter JSON file.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_params_json = ''' { "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        null_value = "test_null_value"
        quote_character = "test_quote"
        compute_table_statistics = True
        verify_schema = False
        data_type = DataType.STRUCTURED
        reader_mode = "DROPMALFORMED"
        metadata_update_strategy = "SparkRecoverPartitionsCustom"

        # The destination table name is split at its first underscore: the head is used as the
        # system and the tail as the table in the expected target directory.
        name_parts = AppendLoadConfig.destination_table.split("_", 1)
        source_system, table = name_parts[0], name_parts[-1]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=source_system,
            table=table
        )

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            null_value=null_value,
            quote_character=quote_character,
            metadata_update_strategy=metadata_update_strategy,
            compute_table_statistics=compute_table_statistics,
            verify_schema=verify_schema,
            data_type=data_type,
            reader_mode=reader_mode
        )
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        config.load_table(self.emr_cluster_id, spark_params_json)

        # Check EMR steps
        assert len(cluster.steps) == 1

        # Check args of spark-submit EMR step
        step = cluster.steps[0]

        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == config.load_type
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        json_files = [path for path in app_files if os.path.basename(path).endswith(".json")]
        assert len(json_files) == 1
        assert json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake, config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": null_value,
            "quote_character": quote_character,
            "compute_table_statistics": compute_table_statistics,
            "data_type": data_type,
            "verify_schema": verify_schema,
            "metadata_update_strategy": metadata_update_strategy,
            "target_dir": test_target_dir,
            "reader_mode": reader_mode
        }
        assert actual_parameters == expected_parameters

        def tag_key(tag):
            return tag["Key"]

        tagging_calls = add_tags_patch.call_args_list
        assert len(tagging_calls) == 1

        # Tags are compared after sorting by key, so their order in the call does not matter.
        expected_tags = [
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "AppendLoad"},
            {"Key": "TargetTable", "Value": config.destination_table}
        ]
        assert sorted(tagging_calls[0][0][0], key=tag_key) == sorted(expected_tags, key=tag_key)

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_parquet(self, remove_json_patch, _0, _1, _2):
        """
        Run an append load with the "parquet" file format and the native partition recovery strategy.

        Checks the spark-submit step, the parameter file written to the application bucket
        and the removal of the parameter JSON file.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_params_json = ''' { "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        # The destination table name is split at its first underscore: the head is used as the
        # system and the tail as the table in the expected target directory.
        name_parts = AppendLoadConfig.destination_table.split("_", 1)
        source_system, table = name_parts[0], name_parts[-1]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=source_system,
            table=table
        )

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            file_format="parquet",
            metadata_update_strategy="SparkRecoverPartitionsNative"
        )
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        config.load_table(self.emr_cluster_id, spark_params_json)

        # Check EMR steps
        assert len(cluster.steps) == 1

        # Check args of spark-submit EMR step
        step = cluster.steps[0]

        assert step.jar == "command-runner.jar"
        assert step.args[0] == "spark-submit"

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == config.load_type
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        json_files = [path for path in app_files if os.path.basename(path).endswith(".json")]
        assert len(json_files) == 1
        assert json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(self.get_object_content_from_s3(config.config_filepath))
        expected_table_full_name = "{}.{}".format(config.db_name_lake, config.destination_table)
        expected_parameters = {
            "target_table": expected_table_full_name,
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "delimiter": "|",
            "has_header": False,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "metadata_update_strategy": "SparkRecoverPartitionsNative",
            "file_format": "parquet",
            "target_dir": test_target_dir
        }
        assert actual_parameters == expected_parameters

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_external_spark_parameters(self, _0, _1):
        """
        Run an append load with the Spark parameters built from a dict via ``json.dumps``.

        Verifies that arguments 5-10 of the step are three ``--conf`` flags, each followed by
        one of the supplied ``key=value`` options, and checks the trailing step arguments.
        """
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]

        spark_params = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        config = AppendLoadConfig(self.local_run_dir, self.env_setup, target_partitions, regex_filename)
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        config.load_table(self.emr_cluster_id, json.dumps(spark_params))

        # Check EMR step.
        assert len(cluster.steps) == 1

        step = cluster.steps[0]

        # Check args of EMR step
        assert step.args[0] == "spark-submit"
        assert step.args[5] == "--conf"
        assert step.args[7] == "--conf"
        assert step.args[9] == "--conf"

        # The order of the --conf options is not asserted, hence the comparison as sets.
        expected_conf_options = {"{}={}".format(name, value) for name, value in spark_params.items()}
        actual_conf_options = {step.args[position] for position in (6, 8, 10)}
        assert expected_conf_options == actual_conf_options

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == "AppendLoad"
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters1(self, _0, _1):
        """Expect M3DIllegalArgumentException for two target partitions paired with three filename regexes."""
        spark_params_json = ''' { "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        two_partitions = ["year", "month"]
        three_regexes = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        config = AppendLoadConfig(self.local_run_dir, self.env_setup, two_partitions, three_regexes)

        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_params_json)

        assert str(ex.value).startswith("Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    def test_load_table_append_invalid_parameters2(self, _0, _1):
        """Expect M3DIllegalArgumentException for three target partitions paired with no filename regexes."""
        spark_params_json = ''' { "spark.driver.memory": "99G", "spark.executor.instances": "99", "spark.executor.memory": "90G" } '''

        three_partitions = ["year", "month", "day"]
        config = AppendLoadConfig(self.local_run_dir, self.env_setup, three_partitions, [])

        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_params_json)

        assert str(ex.value).startswith("Lengths of target_partitions and regex_filename do not match")

    @pytest.mark.emr
    @patch("m3d.util.util.Util.send_email")
    @patch("moto.emr.models.ElasticMapReduceBackend.describe_step", return_value=FakeStep("COMPLETED"))
    @patch("m3d.hadoop.emr.emr_cluster_client.EMRClusterClient._do_add_emr_cluster_tags")
    @patch("m3d.hadoop.core.spark_executor.SparkExecutor._remove_parameter_json")
    def test_load_table_append_valid_parameters_semistructured_data(self, _0, _1, _2, _3):
        """
        Run an append load of semi-structured data with an explicit schema and schema verification.

        Checks the ``--conf`` options and trailing arguments of the step as well as the
        parameter file written to the application bucket.
        """
        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        target_partitions = ["year", "month", "day"]
        regex_filename = [
            "[0-9]{4}",
            "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",
            "(?<=[0-9]{6})([0-9]{2})"
        ]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=AppendLoadConfig.source_system,
            table=table
        )

        spark_params = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        null_value = "test_null_value"
        quote_character = "test_quote"
        data_type = DataType.SEMISTRUCTURED
        verify_schema = True
        schema = {
            "type": "struct",
            "fields": [
                {"name": "first_name", "type": "string", "nullable": True, "metadata": {}},
                {"name": "surname", "type": "string", "nullable": True, "metadata": {}},
                {"name": "age", "type": "integer", "nullable": True, "metadata": {}}
            ]
        }

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
            regex_filename,
            null_value=null_value,
            quote_character=quote_character,
            schema=schema,
            verify_schema=verify_schema,
            data_type=data_type
        )
        cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        config.load_table(self.emr_cluster_id, json.dumps(spark_params))

        # Check EMR step.
        assert len(cluster.steps) == 1

        step = cluster.steps[0]

        # Check args of EMR step
        assert step.args[0] == "spark-submit"
        assert step.args[5] == "--conf"
        assert step.args[7] == "--conf"
        assert step.args[9] == "--conf"

        # The order of the --conf options is not asserted, hence the comparison as sets.
        expected_conf_options = {"{}={}".format(name, value) for name, value in spark_params.items()}
        actual_conf_options = {step.args[position] for position in (6, 8, 10)}
        assert expected_conf_options == actual_conf_options

        assert step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert step.args[-4] == config.expected_algorithms_jar_path
        assert step.args[-3] == "AppendLoad"
        assert step.args[-2] == config.config_filepath
        assert step.args[-1] == "s3"

        # Check that config_file_s3 file is on application S3 bucket
        app_files = self.get_child_objects(config.dataset.dir_apps_append_load)
        json_files = [path for path in app_files if os.path.basename(path).endswith(".json")]
        assert len(json_files) == 1
        assert json_files[0] == config.config_filepath

        # Check config file for Spark
        actual_parameters = json.loads(self.get_object_content_from_s3(config.config_filepath))
        expected_parameters = {
            "target_table": "test101",
            "source_dir": config.dataset.dir_landing_final,
            "header_dir": config.dataset.dir_landing_header,
            "target_partitions": target_partitions,
            "regex_filename": regex_filename,
            "file_format": "dsv",
            "null_value": null_value,
            "quote_character": quote_character,
            "data_type": data_type,
            "verify_schema": verify_schema,
            "target_dir": test_target_dir,
            "schema": schema
        }
        assert actual_parameters == expected_parameters