def get_files_to_sample(config, s3_files, max_files):
    """
    Returns the list of files for sampling. For each S3 file it checks whether
    the object is a zip or gz archive; if so, the archive is extracted and the
    contained files are appended to the list of files.
    Args:
        config dict(): Configuration
        s3_files list(): List of S3 Bucket files
        max_files int(): Maximum number of files to sample
    Returns:
        list(dict()): List of files for sampling
            |_ s3_path str(): S3 Bucket file path
            |_ file_handle StreamingBody(): file object
            |_ type str(): type marker used for extracted files
            |_ extension str(): extension of the file (for normal files only)
    """
    global skipped_files_count
    sampled_files = []

    OTHER_FILES = ["csv", "gz", "jsonl", "txt"]

    for s3_file in s3_files:
        file_key = s3_file.get('key')

        if len(sampled_files) >= max_files:
            break

        if file_key:
            file_name = file_key.split("/").pop()
            extension = file_name.split(".").pop().lower()
            file_handle = get_file_handle(config, file_key)

            # Check whether the file has an extension at all
            if not extension or file_name.lower() == extension:
                LOGGER.warning('"%s" without extension will not be sampled.', file_key)
                skipped_files_count = skipped_files_count + 1
            elif file_key.endswith(".tar.gz"):
                LOGGER.warning('Skipping "%s" file as .tar.gz extension is not supported', file_key)
                skipped_files_count = skipped_files_count + 1
            elif extension == "zip":
                files = compression.infer(io.BytesIO(file_handle.read()), file_name)

                # Add only those extracted files which are supported by the tap.
                # Each dictionary contains the zip file path, the type ("unzipped") and the extracted file object.
                sampled_files.extend([{"type": "unzipped", "s3_path": file_key, "file_handle": de_file}
                                      for de_file in files
                                      if de_file.name.split(".")[-1].lower() in OTHER_FILES
                                      and not de_file.name.endswith(".tar.gz")])
            elif extension in OTHER_FILES:
                # Each dictionary contains the s3 file path, the extension and the file object.
                sampled_files.append({"s3_path": file_key, "file_handle": file_handle, "extension": extension})
            else:
                LOGGER.warning('"%s" having the ".%s" extension will not be sampled.', file_key, extension)
                skipped_files_count = skipped_files_count + 1

    return sampled_files
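
# Illustrative only (not from the original source): entries returned by
# get_files_to_sample take one of two shapes, depending on whether the S3
# object was a zip archive or a plain supported file. The paths below are
# hypothetical examples.
#
#   {"type": "unzipped", "s3_path": "path/to/archive.zip", "file_handle": <extracted file object>}
#   {"s3_path": "path/to/data.csv", "file_handle": <StreamingBody>, "extension": "csv"}
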
def sync_compressed_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing Compressed file "%s".', s3_path)

    records_streamed = 0
    s3_file_handle = s3.get_file_handle(config, s3_path)

    decompressed_files = compression.infer(io.BytesIO(s3_file_handle.read()), s3_path)

    for decompressed_file in decompressed_files:
        extension = decompressed_file.name.split(".")[-1].lower()

        if extension in ["csv", "jsonl", "gz", "txt"]:
            # Append the extracted file name to the zip file path.
            s3_file_path = s3_path + "/" + decompressed_file.name
            records_streamed += handle_file(
                config, s3_file_path, table_spec, stream, extension, file_handler=decompressed_file)

    return records_streamed
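
# A minimal usage sketch (not from the original source), assuming a
# Singer-style config with bucket/credential settings and a catalog stream
# obtained during discovery. The shapes of `config` and `table_spec` below
# are illustrative assumptions only.
#
# config = {
#     "bucket": "example-bucket",              # hypothetical bucket name
#     "aws_access_key_id": "...",
#     "aws_secret_access_key": "...",
# }
# table_spec = {"table_name": "orders", "search_prefix": "exports/"}   # hypothetical
# records = sync_compressed_file(config, "exports/orders.zip", table_spec, stream)
# LOGGER.info("Streamed %s records from compressed file.", records)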