def set_up_s3(mocked_s3, test_folder, config, ext_filter=None):
    """
    Prepare a mocked S3 resource for a run that expects its input data in S3.

    Creates one bucket for each of the land / fail / pass / log base paths
    that is an ``s3://`` path, then uploads the files found in
    ``test_folder`` to the land bucket.

    Args:
        mocked_s3: object exposing ``meta.client`` with ``create_bucket`` and
            ``upload_file`` (presumably a mocked boto3 S3 resource).
        test_folder: local directory whose files are uploaded to land.
        config: dict read for the "land-base-path", "fail-base-path",
            "pass-base-path" and "log-base-path" keys; each defaults to
            "s3://land/", "s3://fail/", "s3://pass/" and "s3://log/".
        ext_filter: tuple of filename suffixes to upload. ``None`` selects
            (".csv", ".jsonl", ".parquet"); any other falsy value disables
            the filtering altogether.
    """
    if ext_filter is None:
        ext_filter = (".csv", ".jsonl", ".parquet")

    from dataengineeringutils3.s3 import s3_path_to_bucket_key

    # Order matters: buckets are created as land, fail, pass, log.
    base_paths = [
        config.get("land-base-path", "s3://land/"),
        config.get("fail-base-path", "s3://fail/"),
        config.get("pass-base-path", "s3://pass/"),
        config.get("log-base-path", "s3://log/"),
    ]
    s3_base_paths = [path for path in base_paths if path.startswith("s3://")]
    bucket_names = [s3_path_to_bucket_key(path)[0] for path in s3_base_paths]

    for bucket_name in bucket_names:
        mocked_s3.meta.client.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
        )

    filenames = list(os.listdir(test_folder))
    if ext_filter:
        filenames = [name for name in filenames if name.endswith(ext_filter)]

    land_base_path = base_paths[0]
    if not land_base_path.startswith("s3://"):
        # Nothing to upload when land is a local path.
        return

    land_bucket, _ = s3_path_to_bucket_key(land_base_path)
    for name in filenames:
        mocked_s3.meta.client.upload_file(
            os.path.join(test_folder, name), land_bucket, name
        )
def validate_from_chunked_configs(config: dict, config_num: int) -> bool:
    """
    Validate the data described by every config stored for one worker.

    The worker configs are read from
    ``<temp log basepath>/configs/<config_num>`` on S3.

    Args:
        config: the overall config; its "land-base-path" must be an S3 path.
        config_num: number of the worker whose configs should be processed.

    Returns:
        False when no config files exist for ``config_num``; True once every
        loaded config has been passed to ``validate_data``.

    Raises:
        ValueError: if "land-base-path" is not an ``s3://`` path.
    """
    if not config["land-base-path"].startswith("s3://"):
        raise ValueError("Local land path not supported for parrallel running")

    worker_config_dir = os.path.join(
        get_temp_log_basepath(config), "configs", str(config_num)
    )
    config_file_paths = get_filepaths_from_s3_folder(worker_config_dir)
    if not config_file_paths:
        return False

    s3_client = boto3.client("s3")

    # Load every config first so a broken file is detected before any
    # validation work starts.
    worker_configs = []
    for path in config_file_paths:
        bucket, key = s3_path_to_bucket_key(path)
        response = s3_client.get_object(Bucket=bucket, Key=key)
        worker_configs.append(yaml.safe_load(response["Body"].read()))

    for worker_config in worker_configs:
        validate_data(worker_config)

    return True
def compress_data(download_path: str, upload_path: str):
    """
    Gzip a single file and write the compressed copy to ``upload_path``.

    The source is first staged in a temporary directory (downloaded from S3
    or copied from disk), compressed there, and the ``.gz`` result is then
    uploaded to S3 or copied to the local destination.

    Args:
        download_path: location of the uncompressed file; either an
            ``s3://`` path or a local path.
        upload_path: full destination path (including filename) of the
            compressed file; either an ``s3://`` path or a local path.
    """
    download_path_is_s3 = download_path.startswith("s3://")
    upload_path_is_s3 = upload_path.startswith("s3://")

    if not upload_path_is_s3:
        upload_path_dir = os.path.dirname(upload_path)
        # dirname is "" for a bare filename and os.makedirs("") raises, so
        # only create the directory when there is one to create.
        if upload_path_dir:
            os.makedirs(upload_path_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        if download_path_is_s3:
            s3_client = boto3.client("s3")
            bucket, key = s3_path_to_bucket_key(download_path)
            temp_file = os.path.join(temp_dir, key.split("/")[-1])
            with open(temp_file, "wb") as opened_temp_file:
                s3_client.download_fileobj(bucket, key, opened_temp_file)
        else:
            temp_file = os.path.join(temp_dir, os.path.basename(download_path))
            # shutil.copy creates the destination itself; no need to hold an
            # open write handle on the target while copying over it.
            shutil.copy(download_path, temp_file)

        compressed_file = temp_file + ".gz"
        with open(temp_file, "rb") as f_in, gzip.open(compressed_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        if upload_path_is_s3:
            write_local_file_to_s3(compressed_file, upload_path, overwrite=True)
        else:
            shutil.copy(compressed_file, upload_path)
def download_data(s3_path: str, local_path: str):
    """
    Download one S3 object to a local file, creating parent folders as needed.

    Args:
        s3_path: path of the object to download; split into bucket and key
            with ``s3_path_to_bucket_key``.
        local_path: destination file path on the local filesystem.
    """
    s3_client = boto3.client("s3")

    # Make sure the destination folder exists before opening the file.
    Path(os.path.dirname(local_path)).mkdir(parents=True, exist_ok=True)

    with open(local_path, "wb") as target_file:
        bucket, key = s3_path_to_bucket_key(s3_path)
        s3_client.download_fileobj(bucket, key, target_file)
def read_all_file_body(file_path: str) -> str:
    """
    Returns the text content of a file (will decode bytes if the content
    read is bytes-like).

    Args:
        file_path: A string specifying the location of the file to load text
            from. Can be S3 (``s3://...``) or local.

    Returns:
        The file content as a string. Bytes are decoded as UTF-8.

    Raises:
        FileNotFoundError: if ``file_path`` is an S3 path and
            ``check_for_s3_file`` reports that it does not exist.
    """
    if file_path.startswith("s3://"):
        s3_client = boto3.client("s3")
        if not check_for_s3_file(file_path):
            # f-string so the message names the path that was not found.
            raise FileNotFoundError(f"Path to config: {file_path}. Not found.")
        bucket, key = s3_path_to_bucket_key(file_path)
        file_obj = s3_client.get_object(Bucket=bucket, Key=key)
        file_obj_body = file_obj["Body"].read()
    else:
        with open(file_path) as f_in:
            file_obj_body = f_in.read()

    if isinstance(file_obj_body, bytes):
        return file_obj_body.decode("utf-8")
    return file_obj_body
def open_input_stream(s3_file_path_in: str) -> io.BytesIO:
    """
    Yield the full content of an S3 object as an in-memory byte stream.

    The object is read completely into memory, wrapped in an ``io.BytesIO``
    and yielded; the buffer is closed once the generator resumes or is
    closed.

    NOTE(review): this is a generator function, so it is presumably wrapped
    with ``contextlib.contextmanager`` where it is defined - confirm.

    Args:
        s3_file_path_in: path of the S3 object to read.
    """
    s3_resource = boto3.resource("s3")
    bucket, key = s3_path_to_bucket_key(s3_file_path_in)
    payload = s3_resource.Object(bucket, key).get()["Body"].read()

    # BytesIO is its own context manager: leaving the block closes it, also
    # when the consumer raises.
    with io.BytesIO(payload) as buffered_stream:
        yield buffered_stream
def write_to_s3(self):
    """
    Upload the content of the in-memory file to S3, then reset the buffer.

    The destination comes from ``self.get_s3_filepath()``. When
    ``self.compress_on_upload`` is truthy the content is passed through
    ``self._compress_data`` before being uploaded.
    """
    s3_resource = boto3.resource("s3")
    bucket, key = s3_path_to_bucket_key(self.get_s3_filepath())

    raw_content = self.mem_file.getvalue()
    body = (
        self._compress_data(raw_content)
        if self.compress_on_upload
        else raw_content
    )

    s3_resource.Object(bucket, key).put(Body=body)
    self.reset_file_buffer()
def local_file_to_s3(local_path: str, s3_path: str):
    """
    Upload a local file to S3, gzipping it first when the target expects it.

    If ``s3_path`` ends in ".gz" but ``local_path`` does not, a compressed
    copy is written next to the source file (``local_path + ".gz"``) and that
    copy is uploaded instead. The compressed copy is not removed afterwards.

    Args:
        local_path: path of the local file to upload.
        s3_path: destination path on S3.
    """
    s3_client = boto3.client("s3")

    needs_gzip = s3_path.endswith(".gz") and not local_path.endswith(".gz")
    if needs_gzip:
        gzipped_path = local_path + ".gz"
        with open(local_path, "rb") as raw_file:
            with gzip.open(gzipped_path, "wb") as gzipped_file:
                for line in raw_file:
                    gzipped_file.write(line)
        local_path = gzipped_path

    bucket, key = s3_path_to_bucket_key(s3_path)
    with open(local_path, "rb") as upload_source:
        s3_client.upload_fileobj(upload_source, bucket, key)
def upload_log(log: logging.Logger, log_stringio: io.StringIO, log_path: str):
    """
    Persist the collected log text to ``log_path`` (S3 or local).

    Args:
        log: logger used to report that no log path is available.
        log_stringio: buffer holding the log text to save.
        log_path: destination of the log file; an ``s3://`` path or a local
            path. When it is empty or ``None`` nothing is saved and an error
            is logged instead.
    """
    # Check for a missing path before touching it: the config may have failed
    # to load, in which case log_path can be None.
    if not log_path:
        log.error(
            "An error occurred but no log path registered, "
            "likely due to issue with config, so logs not saved."
        )
        return

    if log_path.startswith("s3://"):
        s3_client = boto3.client("s3")
        b, k = s3_path_to_bucket_key(log_path)
        s3_client.put_object(Body=log_stringio.getvalue(), Bucket=b, Key=k)
    else:
        dir_out = os.path.dirname(log_path)
        # dirname is "" for a bare filename; os.makedirs("") would raise.
        if dir_out:
            os.makedirs(dir_out, exist_ok=True)
        with open(log_path, "w") as log_out:
            log_out.write(log_stringio.getvalue())
def test_download_fileobj(s3_client):
    """
    Round-trip a file through the S3 client: upload, download, compare bytes.
    """
    # s3_client is a pytest fixture; per the original note it yields a boto3
    # s3 client.
    import tempfile

    from dataengineeringutils3.s3 import s3_path_to_bucket_key

    s3_client.create_bucket(
        Bucket="somebucket",
        CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
    )

    s3_download_path = "somebucket/"
    bucket, _ = s3_path_to_bucket_key(s3_download_path)

    table1 = "table1.csv"
    test_path = "tests/data/end_to_end1/land/"
    full_path = os.path.join(test_path, table1)

    s3_client.upload_file(full_path, bucket, table1)

    # Download into a throwaway directory so the test leaves nothing behind
    # in the current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        downloaded_path = os.path.join(tmp_dir, table1)
        with open(downloaded_path, "wb") as downloaded_file:
            s3_client.download_fileobj(bucket, table1, downloaded_file)
        with open(downloaded_path, "rb") as downloaded_file:
            downloaded_bytes = downloaded_file.read()

    with open(full_path, "rb") as original_file:
        original_bytes = original_file.read()

    assert downloaded_bytes == original_bytes
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example of running data linter with multiple workers without the
    init step.

    Skipping init is useful when you want to decide yourself which worker
    processes which dataset. Here one worker is run per table:
    validation [worker]x2 -> [closedown]
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath
    from dataengineeringutils3.s3 import (
        get_filepaths_from_s3_folder,
        s3_path_to_bucket_key,
    )

    s3_client = boto3.client("s3")
    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Each worker gets a copy of the config with the other worker's table
    # dropped: worker 0 only processes table1, worker 1 only table2. The
    # resulting config is written to that worker's folder on s3.
    worker_plans = (
        (f"{worker_base_key}/0/config.yml", "table2"),
        (f"{worker_base_key}/1/config.yml", "table1"),
    )
    for worker_key, table_to_drop in worker_plans:
        worker_conf = deepcopy(config)
        del worker_conf["tables"][table_to_drop]
        s3_client.put_object(
            Body=yaml.dump(worker_conf).encode("utf-8"),
            Bucket=log_bucket,
            Key=worker_key,
        )

    for worker_num in (0, 1):
        validation.para_run_validation(worker_num, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])

    assert not land_files
    assert not fail_files
    assert pass_files
def _archive_file(matched_file: str, final_outpath: str, compress) -> None:
    """Copy (or gzip-compress, when ``compress`` is truthy) one file to its final path."""
    if compress:
        log.info(f"Compressing file from {matched_file} to {final_outpath}")
        compress_data(matched_file, final_outpath)
    else:
        log.info(f"Copying file from {matched_file} to {final_outpath}")
        copy_data(matched_file, final_outpath)


def collect_all_status(config: dict):
    """
    Collects the saved status files, determines whether the linting was a
    success or not, and copies/removes/compresses the files to and from the
    correct places.

    Every table response is archived to the pass or the fail base path, gets
    an "archived-path" entry and is written out as a table specific log.
    The temporary log folder is deleted at the end.

    Args:
        config: the config as given at the beginning, with the paths of where
            to collect and save data from, as well as compression,
            remove-on-pass etc. Reads "land-base-path", "pass-base-path",
            "log-base-path" (required) and "fail-base-path",
            "all-must-pass", "remove-tables-on-pass", "compress-data",
            "timestamp-partition-name" (optional).

    Raises:
        ValueError: if "all-must-pass" is set and at least one table failed.
    """
    from datetime import timezone

    # datetime.utcnow() is naive, so .timestamp() would read it as local
    # time and shift the value by the host's UTC offset. An aware datetime
    # gives the real epoch timestamp on every host.
    utc_ts = int(datetime.now(timezone.utc).timestamp())

    land_base_path = config["land-base-path"]
    all_must_pass = config.get("all-must-pass", False)
    pass_base_path = config["pass-base-path"]
    log_base_path = config["log-base-path"]
    fail_base_path = config.get("fail-base-path")
    remove_on_pass = config.get("remove-tables-on-pass")
    compress = config.get("compress-data")
    timestamp_partition_name = config.get("timestamp-partition-name")

    land_base_path_is_s3 = land_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")

    temp_status_basepath = os.path.join(get_temp_log_basepath(config), "status")

    # Load every per-table status file written by the validation step.
    all_table_response = []
    if log_base_path_is_s3:
        s3_client = boto3.client("s3")
        for status_file_path in get_filepaths_from_s3_folder(temp_status_basepath):
            bucket, key = s3_path_to_bucket_key(status_file_path)
            status_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_table_response.append(json.loads(status_file_obj["Body"].read()))
    else:
        for status_file_path in get_filepaths_from_local_folder(temp_status_basepath):
            with open(status_file_path) as json_in:
                all_table_response.append(json.load(json_in))

    all_tables_passed = all(
        response["valid"] for response in all_table_response
    )
    # With all-must-pass a single failure sends every table to fail;
    # otherwise each table goes to pass or fail on its own result.
    all_tables_to_fail = bool(all_must_pass) and not all_tables_passed

    there_was_a_fail = False
    for i, table_response in enumerate(all_table_response):
        table_name = table_response.get("table-name")
        matched_file = table_response.get("original-path")
        file_basename = os.path.basename(matched_file)

        goes_to_pass = not all_tables_to_fail and table_response["valid"]

        final_outpath = get_out_path(
            pass_base_path if goes_to_pass else fail_base_path,
            table_name,
            utc_ts,
            file_basename,
            compress=compress,
            filenum=i,
            timestamp_partition_name=timestamp_partition_name,
        )
        _archive_file(matched_file, final_outpath, compress)

        if goes_to_pass:
            # Only data that really ended up in pass is removed from land.
            if remove_on_pass:
                log.info(f"Removing data in land: {matched_file}")
                if land_base_path_is_s3:
                    delete_s3_object(matched_file)
                else:
                    os.remove(matched_file)
        else:
            there_was_a_fail = True

        table_response["archived-path"] = final_outpath

        # write (table specific) log
        log_outpath = get_table_log_path(log_base_path, table_name, utc_ts, filenum=i)
        if log_base_path_is_s3:
            write_json_to_s3(table_response, log_outpath)
        else:
            os.makedirs(os.path.dirname(log_outpath), exist_ok=True)
            with open(log_outpath, "w") as json_out:
                json.dump(table_response, json_out)
        log.info(f"log for {matched_file} uploaded to {log_outpath}")

    if there_was_a_fail and all_must_pass:
        log.info("The following tables have failed: ")
        for failed_table in all_table_response:
            if failed_table["valid"]:
                continue
            log.info(f"{failed_table['table-name']} failed")
            log.info(f"...original path: {failed_table['original-path']}")
            log.info(f"...out path: {failed_table['archived-path']}")
        _del_path(get_temp_log_basepath(config))
        raise ValueError("Tables did not pass linter")

    if not all_must_pass and there_was_a_fail:
        msg6 = "Some tables failed but all_must_pass set to false."
        msg6 += " Check logs for details"
        log.info(msg6)

    _del_path(get_temp_log_basepath(config))
def _pack_files_into_bins(file_list: list, max_bin_count: int) -> list:
    """Greedily group file dicts (each with a "file-size") into at most max_bin_count non-empty bins."""
    target_bin_size = sum(f["file-size"] for f in file_list) / max_bin_count

    # Largest files first; sorted() is stable, so ties keep their order.
    ordered = sorted(file_list, key=lambda f: -f["file-size"])

    bins = []
    offset = 0
    for _ in range(max_bin_count):
        curr_bin = []
        curr_bin_size = 0
        # A bin always takes its first file, then keeps taking files until
        # its size has gone past the target.
        while offset < len(ordered) and (
            not curr_bin or curr_bin_size <= target_bin_size
        ):
            curr_bin.append(ordered[offset])
            curr_bin_size += ordered[offset]["file-size"]
            offset += 1
        if curr_bin:
            bins.append(curr_bin)
    return bins


def bin_pack_configs(config: dict, max_bin_count: int):
    """
    Creates up to max_bin_count config files by splitting the files from the
    config by size and grouping them into bins around the average bin size
    (total size / max_bin_count).

    One config per bin is uploaded to
    ``<temp log basepath>/configs/<bin number>/`` on S3.

    Args:
        config: a config specifying the files to be linted. Every table in
            config["tables"] must carry a "matched_files" list of S3 paths.
        max_bin_count: the maximum number of bins to split the files up into
            - the optimal number is equal to the amount of workers available.

    Raises:
        ValueError: if "log-base-path" is not an ``s3://`` path, or if
            max_bin_count is smaller than 1.
    """
    log_base_path = config.get("log-base-path")
    if not log_base_path.startswith("s3://"):
        raise ValueError("Local land path not supported for parrallel running")
    if max_bin_count < 1:
        # Fail loudly instead of dividing by zero or writing no configs.
        raise ValueError("max_bin_count must be at least 1")

    s3_temp_path = os.path.join(get_temp_log_basepath(config), "configs")

    # create a list of dictionaries, one per file, with all table attributes
    file_list = []
    for table_name, table in config["tables"].items():
        table_sans_files = deepcopy(table)
        mfiles = table_sans_files.pop("matched_files")
        for file_name in mfiles:
            file_entry = deepcopy(table_sans_files)
            file_entry["file-name"] = file_name
            file_entry["table-name"] = table_name
            file_list.append(file_entry)

    # get the size of them all; head_object returns the metadata only, so no
    # object body stream is opened and left unread. One client is reused.
    s3_client = boto3.client("s3")
    for file_entry in file_list:
        bucket, key = s3_path_to_bucket_key(file_entry["file-name"])
        head = s3_client.head_object(Bucket=bucket, Key=key)
        file_entry["file-size"] = head["ContentLength"]

    bins = _pack_files_into_bins(file_list, max_bin_count)

    # keys that only exist for the packing and must not end up in a config
    packing_only_keys = ("table-name", "file-name", "file-size")

    # create the configs for the given bins
    for i, packed_bin in enumerate(bins):
        config_n = {
            key: deepcopy(value) for key, value in config.items() if key != "tables"
        }
        tables_n = {}
        for file_entry in packed_bin:
            curr_table_name = file_entry["table-name"]
            if curr_table_name not in tables_n:
                # first file of this table in the bin: copy all attributes
                tables_n[curr_table_name] = deepcopy(
                    {
                        key: value
                        for key, value in file_entry.items()
                        if key not in packing_only_keys
                    }
                )
                tables_n[curr_table_name]["matched_files"] = []
            tables_n[curr_table_name]["matched_files"].append(
                file_entry["file-name"]
            )
        config_n["tables"] = tables_n

        # upload the config to temp storage, into configs/<bin number>/
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".yml", prefix="config_"
        ) as tmp_file:
            yaml.dump(config_n, tmp_file, default_flow_style=False)
            # make sure everything is on disk before it is read for upload
            tmp_file.flush()
            s3_out_path = os.path.join(
                s3_temp_path, str(i), os.path.basename(tmp_file.name)
            )
            local_file_to_s3(tmp_file.name, s3_out_path)
def s3_to_local(s3_path: str, local_path: str):
    """
    Download one S3 object into a local file.

    Args:
        s3_path: path of the object to download; split into bucket and key
            with ``s3_path_to_bucket_key``.
        local_path: destination file; its folder must already exist.
    """
    source_bucket, source_key = s3_path_to_bucket_key(s3_path)
    client = boto3.client("s3")
    with open(local_path, "wb") as destination:
        client.download_fileobj(source_bucket, source_key, destination)
def test_s3_path_to_bucket_key(s3_path, exp_bucket, exp_key):
    """Check that an S3 path is split into the expected bucket and key."""
    actual_bucket, actual_key = s3_path_to_bucket_key(s3_path)
    assert (actual_bucket, actual_key) == (exp_bucket, exp_key)