def test_read_all_file_body(s3, land_path):
    """End-to-end check of utils.read_all_file_body.

    Uploads the land folder fixture to (mock) S3 via set_up_s3, reads
    table1.csv back through read_all_file_body, and asserts the body
    matches the local file, modulo newline convention.

    Args:
        s3: mocked S3 fixture consumed by set_up_s3.
        land_path: base path (parametrised by the fixture) used as the
            config's "land-base-path"; may be local or s3://.
    """
    from data_linter.utils import read_all_file_body
    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    table_1_path = os.path.join(land_folder, "table1.csv")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)
    with open(table_1_path) as f_in:
        table_1_body_actual = f_in.read()
    config["land-base-path"] = land_path
    set_up_s3(s3, land_folder, config)
    land_base_path = config["land-base-path"]
    table_1_body = read_all_file_body(f"{land_base_path}table1.csv")
    # Normalise line endings before comparing: Windows uses \r\n,
    # Unix/macOS use \n, and the round-trip may change the convention.
    table_1_body = table_1_body.replace("\r\n", "\n")
    table_1_body_actual = table_1_body_actual.replace("\r\n", "\n")
    assert table_1_body == table_1_body_actual
def para_collect_all_logs(config: Union[str, dict] = "config.yaml"):
    """Merge the temporary per-stage logs of a parallel run into one log.

    Reads every file under the temporary log base path's ``init``, ``val``
    and ``status`` sub-folders (in that order), concatenates their bodies,
    uploads the combined log to the main log path, and finally deletes the
    ``data_linter_temporary_fs`` area (S3 prefix or local directory).

    Args:
        config: path to a YAML config file, or an already-loaded config
            dict; passed through load_and_validate_config.
    """
    config = load_and_validate_config(config)
    log_base_path = config["log-base-path"]
    final_log_path = get_main_log_path_from_config(config)
    on_s3 = log_base_path.startswith("s3://")
    tmp_base = get_temp_log_basepath(config)

    # Same listing helper for every stage; which one depends on the backend.
    list_folder = (
        get_filepaths_from_s3_folder if on_s3 else get_filepaths_from_local_folder
    )

    # Stage order matters: init logs first, then validation, then status.
    combined = io.StringIO()
    for stage in ("init", "val", "status"):
        for file_path in list_folder(os.path.join(tmp_base, stage)):
            combined.write(read_all_file_body(file_path))

    upload_log(log, combined, final_log_path)

    # Clean up the temporary filesystem area now that logs are persisted.
    temp_fs_path = os.path.join(log_base_path, "data_linter_temporary_fs")
    if on_s3:
        delete_s3_folder_contents(temp_fs_path)
    else:
        shutil.rmtree(temp_fs_path, ignore_errors=True)
def load_and_validate_config(config: Union[str, dict] = "config.yaml") -> dict:
    """Load a config (from a YAML path or an existing dict) and validate it.

    Args:
        config: either a path to a YAML file, or an already-parsed dict.

    Returns:
        The validated and cleaned config dict.

    Raises:
        TypeError: if ``config`` is neither a str nor a dict.
    """
    if isinstance(config, dict):
        parsed = config
    elif isinstance(config, str):
        # Treat the string as a file path; read the body then parse it.
        parsed = yaml.safe_load(read_all_file_body(config))
    else:
        raise TypeError("Input 'config' must be a str or dict.")
    return _validate_and_clean_config(parsed)
def test_bin_pack_configs(s3, max_bin_count):
    """Check validation.bin_pack_configs against pre-computed fixtures.

    Bin-packs the matched-files config into at most ``max_bin_count`` bins,
    then compares each generated config under the temporary-storage prefix
    with the corresponding fixture at
    ``bin_pack/config_{max_bin_count}_{i}.yml``.

    Args:
        s3: mocked S3 fixture consumed by set_up_s3.
        max_bin_count: maximum number of bins to pack the configs into
            (parametrised by the fixture).
    """
    from data_linter import validation
    from data_linter.utils import read_all_file_body
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from botocore.exceptions import ClientError

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config_matched_files.yml")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    set_up_s3(s3, land_folder, config)
    validation.bin_pack_configs(config, max_bin_count)

    land_base_path = config["land-base-path"]
    all_bin_packed_configs = get_filepaths_from_s3_folder(
        f"{land_base_path}/data_linter_temporary_storage/configs"
    )
    for i, file_path in enumerate(all_bin_packed_configs):
        bin_pack_path = os.path.join(
            test_folder, f"bin_pack/config_{max_bin_count}_{i}.yml"
        )
        with open(bin_pack_path) as yml:
            pre_bin_packed = yaml.safe_load(yml)
        try:
            actual_bin_pack = yaml.safe_load(read_all_file_body(file_path))
        except ClientError as e:
            if e.response["Error"]["Code"] == "NoSuchKey":
                # Missing key is only acceptable when the fixture is empty.
                assert pre_bin_packed is None
            else:
                # Bug fix: previously any other ClientError (throttling,
                # access denied, ...) was silently swallowed, letting the
                # test pass vacuously. Surface it instead.
                raise
        else:
            # Success path: the generated config must match the fixture.
            assert actual_bin_pack == pre_bin_packed