def delete_emr_cluster(config, destination_system, destination_database, destination_environment, emr_cluster_id):
    from m3d.hadoop.emr.emr_system import EMRSystem
    emr = EMRSystem(config, destination_system, destination_database, destination_environment)
    emr.delete_emr_cluster(emr_cluster_id)

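# Hedged usage sketch (not part of the repository): how delete_emr_cluster() above might be
# invoked. The config path and cluster id are hypothetical placeholders; a real call needs a
# valid M3D configuration file and an existing EMR cluster (or a mocked one, e.g. via moto).
def _example_delete_emr_cluster():
    delete_emr_cluster(
        config="config/m3d-config.json",   # hypothetical path to the M3D config file
        destination_system="bdp",
        destination_database="emr_test",
        destination_environment="dev",
        emr_cluster_id="j-XXXXXXXXXXXXX"   # placeholder EMR cluster id
    )
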
def drop_table(config, destination_system, destination_database, destination_environment, destination_table,
               emr_cluster_id=None):
    # create abstract table object to retrieve source technology
    abstract_table = Table(config, destination_system, destination_database, destination_environment,
                           destination_table)
    destination_system_technology = abstract_table.get_destination_technology()

    # hadoop
    if destination_system_technology == DataSystem.SystemTechnology.HIVE:
        if abstract_table.storage_type == DataSystem.StorageType.S3:
            from m3d.hadoop.emr.emr_system import EMRSystem
            emr_system = EMRSystem(config, destination_system, destination_database, destination_environment,
                                   emr_cluster_id)
            emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD, M3D.drop_table.__name__)
            emr_system.drop_table(destination_table)
        else:
            raise m3d_exceptions.M3DUnsupportedStorageException(abstract_table.storage_type)
    else:
        raise m3d_exceptions.M3DUnsupportedDestinationSystemException(destination_system_technology)

def create(
        config_path,
        cluster_mode,
        destination_system,
        destination_database,
        destination_environment,
        algorithm_instance,
        ext_params_str
):
    data_system = DataSystem(
        config_path,
        cluster_mode,
        destination_system,
        destination_database,
        destination_environment
    )

    if data_system.database_type == DataSystem.DatabaseType.EMR:
        config = AlgorithmConfigurationHadoop.create_with_ext_params(
            config_path,
            cluster_mode,
            destination_database,
            destination_environment,
            algorithm_instance,
            ext_params_str
        )
        execution_system = EMRSystem.from_data_system(data_system, config.get_emr_cluster_id())
        return AlgorithmExecutorHadoop(execution_system, config)
    else:
        raise M3DUnsupportedDatabaseTypeException(data_system.database_type)

def add_emr_cluster_tags(config, destination_system, destination_database, destination_environment,
                         emr_cluster_id, cluster_tags):
    from m3d.hadoop.emr.emr_system import EMRSystem
    emr_system = EMRSystem(config, destination_system, destination_database, destination_environment,
                           emr_cluster_id)
    emr_system.add_cluster_tags(cluster_tags)

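# Hedged usage sketch (illustrative only): the cluster_tags argument to add_emr_cluster_tags()
# is a plain dict of tag names to values, mirroring what the tests below pass to
# EMRSystem.add_cluster_tags(). All literal values here are hypothetical placeholders.
def _example_add_emr_cluster_tags():
    cluster_tags = {
        "ApiMethod": "load_table",   # example tag keys/values, not mandated by the API
        "TargetTable": "bi_test101",
    }
    add_emr_cluster_tags(
        config="config/m3d-config.json",   # hypothetical config path
        destination_system="bdp",
        destination_database="emr_test",
        destination_environment="dev",
        emr_cluster_id="j-XXXXXXXXXXXXX",  # placeholder EMR cluster id
        cluster_tags=cluster_tags,
    )
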
def test_add_emr_cluster_tags(self):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    m3d_config_file, _, _, _, _ = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    )

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        self.emr_cluster_id
    )
    emr_system.add_cluster_tag("DataFormat", "csv")

    assert fake_cluster.tags == {"DataFormat": "csv"}

def _create_emr_system(self):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "prod"

    m3d_config_file, _, _, _ = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment
    )
    return EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        self.emr_cluster_id
    )

def test_add_emr_cluster_tags_multiple_calls(self):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, _, _, _ = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    )

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        self.emr_cluster_id
    )

    tags1 = {"DataFormat": "csv"}
    emr_system.add_cluster_tag("DataFormat", tags1["DataFormat"])

    tags2 = {"Database": "test_lake", "Table": destination_table}
    emr_system.add_cluster_tags(tags2)

    all_tags = tags1.copy()
    all_tags.update(tags2)

    assert fake_cluster.tags == all_tags

def drop_dataset(config, destination_system, destination_database, destination_environment, destination_dataset,
                 emr_cluster_id=None):
    from m3d.hadoop.emr.emr_system import EMRSystem
    emr_system = EMRSystem(config, destination_system, destination_database, destination_environment,
                           emr_cluster_id)
    emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD, M3D.drop_dataset.__name__)
    emr_system.drop_dataset(destination_dataset)

def create(config_path, cluster_mode, destination_system, destination_database, destination_environment,
           destination_table, load_type, emr_cluster_id, spark_params_str):
    data_system = DataSystem(
        config_path,
        cluster_mode,
        destination_system,
        destination_database,
        destination_environment
    )

    if data_system.database_type == DataSystem.DatabaseType.EMR:
        execution_system = EMRSystem.from_data_system(data_system, emr_cluster_id)
        spark_params_dict = json.loads(spark_params_str)
        return LoadExecutorHadoop(
            execution_system,
            load_type,
            destination_table,
            spark_params_dict
        )
    else:
        raise M3DUnsupportedDatabaseTypeException(data_system.database_type)

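# Hedged usage sketch (illustrative only): spark_params_str is expected to be a JSON string,
# since create() above parses it with json.loads() before handing the resulting dict to
# LoadExecutorHadoop. The config path, cluster_mode value and cluster id below are hypothetical;
# the Spark settings mirror the values used in the tests further down.
def _example_create_load_executor():
    import json

    spark_params_str = json.dumps({
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G",
    })
    return create(
        config_path="config/m3d-config.json",  # hypothetical config path
        cluster_mode=False,                    # assumed value for illustration
        destination_system="bdp",
        destination_database="emr_test",
        destination_environment="dev",
        destination_table="bi_test101",
        load_type="FullLoad",
        emr_cluster_id="j-XXXXXXXXXXXXX",      # placeholder EMR cluster id
        spark_params_str=spark_params_str,
    )
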
def __init__(self, test_run_dir, setup_function, partition_columns, regex_filename, file_format=None,
             null_value=None, quote_character=None, compute_table_statistics=None):
    self.config_file, _, self.tconx_file, self.config_dict, self.scon_emr_dict = \
        setup_function(*([test_run_dir] + self.destination_params))

    self._write_acon(partition_columns, regex_filename, file_format, null_value, quote_character,
                     compute_table_statistics)
    self._write_tconx()

    self.table_config = [self.config_file, self.cluster_mode] + self.destination_params

    emr_system = EMRSystem(
        self.config_file,
        self.cluster_mode,
        self.destination_system,
        self.destination_database,
        self.destination_environment
    )
    self.s3_table = S3Table(emr_system, self.destination_table)

    config_filename = "append_load-{}-{}.json".format(self.destination_environment, self.destination_table)
    self.config_filepath = os.path.join(self.s3_table.dir_apps_append_load, config_filename)

    self.db_name_lake = self.scon_emr_dict["environments"][self.destination_environment]["schemas"]["lake"]

    self.expected_algorithms_jar_path = "s3://" + os.path.join(
        (self.scon_emr_dict["environments"][self.destination_environment]
         ["s3_buckets"]["application"]).strip("/"),
        (self.scon_emr_dict["environments"][self.destination_environment]
         ["s3_deployment_dir_base"]).strip("/"),
        self.destination_environment,
        self.scon_emr_dict["subdir"]["m3d"],
        self.config_dict["subdir_projects"]["m3d_api"],
        self.scon_emr_dict["spark"]["jar_name"]
    )

def test_parses_basic_attributes_from_system_config_file(self, _):
    """
    Test case checks that all relevant key-values are extracted from the scon file and assigned
    to the correct member variables of the EMRSystem object.
    """
    aws_api_credentials = AWSCredentials("fake_aws_api_access_key", "fake_aws_api_secret_key")
    aws_api_credentials_file = self.local_run_dir.join("aws-credentials-emr-api.json")
    self.dump_aws_credentials(aws_api_credentials, str(aws_api_credentials_file))

    aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key", "fake_aws_s3_put_secret_key")
    aws_s3_put_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_put.json")
    self.dump_aws_credentials(aws_s3_put_credentials, str(aws_s3_put_credentials_file))

    aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key", "fake_aws_s3_del_secret_key")
    aws_s3_del_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_del.json")
    self.dump_aws_credentials(aws_s3_del_credentials, str(aws_s3_del_credentials_file))

    test_scon_json = TestEMRSystem.test_scon_json_template.format(
        aws_api_credentials=str(aws_api_credentials_file),
        aws_s3_put_credentials=str(aws_s3_put_credentials_file),
        aws_s3_del_credentials=str(aws_s3_del_credentials_file)
    )

    s3_scon_file = self.local_run_dir.join("scon-emr-emr-test.json")
    s3_scon_file.write(test_scon_json)

    MockConfigService.scon_path = str(s3_scon_file)

    emr_system = EMRSystem(*self.test_emr_system_arguments)

    expected_system_params = {
        "bucket_landing": "m3d-da-bdp-test-landing",
        "bucket_lake": "m3d-da-bdp-test-lake",
        "bucket_mart_cal": "m3d-da-bdp-test-mart-cal",
        "bucket_log": "io.3stripes.factory.test.ireland.infrastructure-logs",
        "default_ebs_size": "128",
        "default_emr_version": "emr-5.17.0",
        "aws_api_credentials": aws_api_credentials,
        "aws_s3_put_credentials": aws_s3_put_credentials,
        "aws_s3_del_credentials": aws_s3_del_credentials,
        "api_action_timeout_seconds": 120,
        "api_action_polling_interval_seconds": 3,
        "api_long_timeout_seconds": 300,
        "aws_region": "eu-west-1",
        "packages_to_deploy": ["hadoop"],
        "configs_to_deploy": ["test_config_1", "test_config_2"],
        "subdir_archive": "test_archive/",
        "subdir_header": "test_header/",
        "subdir_config": "test_config/",
        "subdir_data": "test_data/",
        "subdir_delta_table": "delta_table/",
        "subdir_data_backup": "data_backup/",
        "subdir_error": "test_error/",
        "subdir_work": "test_work/",
        "subdir_log": "test_log/",
        "subdir_apps": "test_apps/",
        "subdir_m3d_engine": "test_m3d_engine/",
        "subdir_loading": "test_loading/",
        "subdir_full_load": "test_full_load/",
        "subdir_delta_load": "test_delta_load/",
        "subdir_delta_lake_load": "test_delta_lake_load/",
        "subdir_append_load": "test_append_load/",
        "subdir_black_whole": "test_black_whole/",
        "subdir_credentials": "test_credentials/",
        "subdir_keytab": "test_keytab/",
        "subdir_tmp": "test_tmp/",
        "subdir_code": "m3d",
        "subdir_metadata": "metadata",
        "spark_jar_name": "test_jar.jar",
        "dir_apps": "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/",
        "dir_apps_algorithm": "s3://m3d-da-landing-application/m3d-test/"
                              "test_environment/test_apps/test_m3d_engine/",
        "dir_apps_loading": "s3://m3d-da-landing-application/m3d-test/test_environment/"
                            "test_apps/test_loading/",
        "dir_tmp_s3": "s3://m3d-da-landing-application/m3d-test/test_environment/test_tmp/",
        "dir_tmp_local": "/test_tmp/",
        "spark_jar_path": "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/" +
                          "test_subdir_projects_m3d_api/test_jar.jar",
        "dir_m3d_api_deployment":
            "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/test_subdir_projects_m3d_api",
        "dir_metadata_deployment":
            "s3://m3d-da-landing-application/m3d-test/test_environment/metadata/test_subdir_projects_m3d_api"
    }

    for param in expected_system_params.keys():
        assert getattr(emr_system, param) == expected_system_params[param]

def test_lakeout_view_hql(self, add_tags_patch):
    tconx_src_path = "test/resources/test_create_out_view_hive/test_lakeout_view_structure/config/tconx.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    # Use test case specific tconx
    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]
    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

    with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
        logging.info("Calling M3D.create_out_view().")
        M3D.create_out_view(*table_config, **table_config_kwargs)

    emr_system = EMRSystem(*table_config[:5])
    s3_table = S3Table(emr_system, destination_table)

    mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(mock_cluster.steps)

    hive_step = mock_cluster.steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    column_name_pairs = [
        ("record_date", "v_record_date"),
        ("p_string", "v_string"),
        ("p_int", "v_int"),
        ("p_bigint", "v_bigint"),
        ("p_float", "v_float"),
        ("p_varchar_1", "v_varchar_10"),
        ("p_varchar_2", "v_varchar_100"),
        ("p_char_1", "v_char"),
        ("p_boolean", "v_boolean"),
        ("year", "year"),
        ("month", "month")
    ]
    columns_str = ", ".join(map(lambda x: "{} AS {}".format(x[0], x[1]), column_name_pairs))

    drop_view = "DROP VIEW IF EXISTS {};".format(s3_table.db_view_lake_out)

    # S3Table is partitioned by year and month
    create_view = "\n".join([
        "CREATE VIEW {}".format(s3_table.db_view_lake_out),
        "AS",
        "SELECT {}".format(columns_str),
        "FROM {};".format(s3_table.db_table_lake)
    ])

    expected_hql = "\n".join([drop_view, create_view])

    assert actual_hql_content_in_bucket == expected_hql

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "create_out_view"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetView",
        "Value": "dev_lake_out.bi_test101"
    }]

def test_full_load_emr(self, _0, _1):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    load_type = "FullLoad"
    landing_dataset = "landing-dataset.psv"

    spark_external_parameters = '''{
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        load_type,
        self.emr_cluster_id,
        spark_external_parameters
    ]

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )
    test_s3_table = S3Table(emr_system, destination_table)

    # Put landing data
    self.dump_data_to_s3(
        os.path.join(test_s3_table.dir_landing_final, landing_dataset),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|"
    )

    M3D.load_table(*table_config)

    # Since we have offloaded data move operations to EMR Steps, dir_landing_final will still have
    # old files in it and dir_landing_archive will not have new files.
    landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
    assert len(landing_files) == 1
    assert landing_files[0] == os.path.join(test_s3_table.dir_landing_final, landing_dataset)

    landing_archive_files = self.get_child_objects(test_s3_table.dir_landing_archive)
    assert len(landing_archive_files) == 0

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    # Check args of spark-submit EMR step
    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"
    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-4] == expected_algorithms_jar_path
    assert spark_step.args[-3] == "FullLoad"
    assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                  "full_load/full_load-dev-bi_test101.json"
    assert spark_step.args[-1] == "s3"

def test_full_load_emr_external_spark_parameters(self, _0):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
    acon_src_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    spark_external_parameters = {
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }

    load_type = "FullLoad"
    landing_dataset = "landing-dataset.psv"

    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )
    AconHelper.setup_acon_from_file(
        m3d_config_dict["tags"]["config"],
        destination_database,
        destination_environment,
        destination_table,
        acon_src_path
    )

    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        load_type,
        self.emr_cluster_id
    ]

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )
    test_s3_table = S3Table(emr_system, destination_table)

    # Put landing data
    self.dump_data_to_s3(
        os.path.join(test_s3_table.dir_landing_final, landing_dataset),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|"
    )

    M3D.load_table(*table_config, spark_params=json.dumps(spark_external_parameters))

    # The psv file will still be in landing since the move operation should be performed by an
    # EMR Step, which we mock here. Accordingly, the archive will still be empty.
    landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
    assert len(landing_files) == 1
    assert landing_files[0] == os.path.join(test_s3_table.dir_landing_final, landing_dataset)

    landing_archive_files = self.get_child_objects(test_s3_table.dir_landing_archive)
    assert len(landing_archive_files) == 0

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"

    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"

    expected_spark_conf_options = set(
        map(lambda p: "{}={}".format(p[0], p[1]), spark_external_parameters.items())
    )
    actual_spark_conf_options = set(map(lambda x: spark_step.args[x], [6, 8, 10]))

    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-4] == expected_algorithms_jar_path
    assert spark_step.args[-3] == "FullLoad"
    assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                  "full_load/full_load-dev-bi_test101.json"
    assert spark_step.args[-1] == "s3"

def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
    # responses.add_passthru(self.default_server_url)

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_active_table = "bi_test101"
    destination_changelog_table = "bi_test101_cl"

    load_type = "DeltaLoad"

    src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
    src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

    spark_external_parameters = '''{
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    # Pass desired content of tconx files for active and changelog tables to self.env_setup()
    src_tconx_content = py.path.local(src_tconx_path).read()
    src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

    m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_active_table,
            src_tconx_content,
            src_tconx_cl_content
        )

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )
    s3_table_active = S3Table(emr_system, destination_active_table)
    s3_table_changelog = S3Table(emr_system, destination_changelog_table)

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    # Put lake data for the changelog table; this should be archived
    self.dump_data_to_s3(
        os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|"
    )

    M3D.load_table(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_active_table,
        load_type,
        self.emr_cluster_id,
        spark_params=spark_external_parameters
    )

    filename_json = "delta_load-{environment}-{table}.json".format(
        environment=destination_environment,
        table=destination_active_table
    )

    # Check the configuration file for m3d-engine
    app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

    assert len(app_files) == 1
    assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

    delta_load_config_s3 = app_files[0]
    delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)

    load_table_parameters = json.loads(delta_load_config_content)

    assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
    assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
    assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
    assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
    assert load_table_parameters["business_key"] == s3_table_active.business_key

    if s3_table_active.partitioned_by in Util.defined_partitions:
        target_partitions = Util.get_target_partitions_list(s3_table_active.partitioned_by)
    else:
        target_partitions = s3_table_active.partitioned_by

    assert load_table_parameters["target_partitions"] == target_partitions
    assert load_table_parameters["partition_column"] == s3_table_active.partition_column
    assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    delta_load_step = fake_cluster.steps[0]

    assert delta_load_step.jar == "command-runner.jar"
    assert delta_load_step.args[0] == "spark-submit"

    assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert delta_load_step.args[-4] == expected_algorithms_jar_path
    assert delta_load_step.args[-3] == "DeltaLoad"
    assert delta_load_step.args[-2] == delta_load_config_s3
    assert delta_load_step.args[-1] == "s3"

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 1
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "load_table"},
        {"Key": "LoadType", "Value": "DeltaLoad"},
        {"Key": "TargetTable", "Value": "bi_test101"}
    ], key=lambda x: x["Key"])

    remove_json_patch.assert_called_once()
    assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

def __init__(self, test_run_dir, setup_function, target_partitions, regex_filename, file_format=None,
             null_value=None, quote_character=None, compute_table_statistics=None, schema=None,
             verify_schema=None, data_type=None, reader_mode=None, metadata_update_strategy=None):
    destination_params = [
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.destination_table
    ]

    self.config_file, _, self.tconx_file, self.config_dict, self.scon_emr_dict = \
        setup_function(*([test_run_dir] + destination_params))

    self._write_acon(
        target_partitions,
        regex_filename,
        file_format=file_format,
        null_value=null_value,
        quote_character=quote_character,
        compute_table_statistics=compute_table_statistics,
        schema=schema,
        verify_schema=verify_schema,
        data_type=data_type,
        reader_mode=reader_mode,
        metadata_update_strategy=metadata_update_strategy
    )
    self._write_tconx()

    self.table_config = [self.config_file] + destination_params

    emr_system = EMRSystem(
        self.config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment
    )
    # self.s3_table = S3Table(emr_system, self.destination_table)

    if data_type is None:
        data_type = DataType.STRUCTURED

    self.dataset = DataSetFactory.create_dataset(
        emr_system,
        HiveTable.TableLoadType.APPEND,
        data_type,
        self.destination_table
    )

    config_filename = "append_load-{}-{}.json".format(self.destination_environment, self.dataset.table_lake)
    self.config_filepath = os.path.join(self.dataset.dir_apps_append_load, config_filename)

    self.db_name_lake = self.scon_emr_dict["environments"][self.destination_environment]["schemas"]["lake"]

    self.expected_algorithms_jar_path = "s3://" + os.path.join(
        (self.scon_emr_dict["environments"][self.destination_environment]
         ["s3_buckets"]["application"]).strip("/"),
        (self.scon_emr_dict["environments"][self.destination_environment]
         ["s3_deployment_dir_base"]).strip("/"),
        self.destination_environment,
        self.scon_emr_dict["subdir"]["m3d"],
        self.config_dict["subdir_projects"]["m3d_api"],
        self.scon_emr_dict["spark"]["jar_name"]
    )