def deleted_archived_with_prefix(s3_bucket_name, prefix):
    """
    Delete data from archive with given prefix.

    Args:
        s3_bucket_name (str): The s3 bucket name
        prefix (str): The prefix for deletion

    """
    s3_resource = get_s3_resource()
    s3_bucket = s3_resource.Bucket(s3_bucket_name)
    object_keys = [{"Key": s3_object.key} for s3_object in s3_bucket.objects.filter(Prefix=prefix)]
    batch_size = 1000  # AWS S3 delete API limits to 1000 objects per request.
    for batch_number in range(math.ceil(len(object_keys) / batch_size)):
        batch_start = batch_size * batch_number
        batch_end = batch_start + batch_size
        object_keys_batch = object_keys[batch_start:batch_end]
        s3_bucket.delete_objects(Delete={"Objects": object_keys_batch})

    remaining_objects = list(s3_bucket.objects.filter(Prefix=prefix))
    if remaining_objects:
        LOG.warning(
            "Found %s objects after attempting to delete all objects with prefix %s",
            len(remaining_objects),
            prefix,
        )
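# --- Hedged usage sketch (not part of the original module) ---
# A minimal illustration of invoking deleted_archived_with_prefix() once a
# provider's archive is no longer needed. The bucket name and prefix below
# are hypothetical placeholders; real callers derive them from settings and
# the provider record.
def _example_deleted_archived_with_prefix():
    bucket_name = "example-cost-archive-bucket"          # hypothetical bucket
    prefix = "data/csv/acct10001/1234-abcd-5678-efgh/"   # hypothetical prefix
    deleted_archived_with_prefix(bucket_name, prefix)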
def get_file_keys_from_s3_with_manifest_id(self, request_id, s3_path, manifest_id, context={}):
    """Get all files in a given prefix that match the given manifest_id."""
    if not settings.ENABLE_PARQUET_PROCESSING:
        return []

    keys = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest == manifest_id_str:
                    keys.append(key)
        except (EndpointConnectionError, ClientError) as err:
            msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return keys
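# --- Hedged sketch (assumption, not the project's actual upload helper) ---
# get_file_keys_from_s3_with_manifest_id() above matches objects by their
# "manifestid" metadata entry. That entry is typically attached at upload
# time via boto3's put_object Metadata argument, roughly as follows; the
# helper name and arguments here are illustrative.
def _example_upload_with_manifest_metadata(bucket_name, key, body, manifest_id):
    import boto3

    s3_resource = boto3.resource("s3")
    s3_resource.Bucket(bucket_name).put_object(
        Key=key,
        Body=body,
        # Read back later through ObjectSummary.Object().metadata["manifestid"]
        Metadata={"manifestid": str(manifest_id)},
    )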
def convert_csv_to_parquet(  # noqa: C901
    self,
    request_id,
    s3_csv_path,
    s3_parquet_path,
    local_path,
    manifest_id,
    csv_filename,
    converters={},
    post_processor=None,
    context={},
    report_type=None,
):
    """Convert CSV files to parquet on S3."""
    if s3_csv_path is None or s3_parquet_path is None or local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
        )
        LOG.error(log_json(request_id, msg, context))
        return False

    msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
    LOG.info(log_json(request_id, msg, context))

    kwargs = {}
    parquet_file = None
    csv_file = f"{s3_csv_path}/{csv_filename}"
    if csv_filename.lower().endswith(CSV_EXT):
        ext = -len(CSV_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
    elif csv_filename.lower().endswith(CSV_GZIP_EXT):
        ext = -len(CSV_GZIP_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
        kwargs = {"compression": "gzip"}
    else:
        msg = f"File {csv_filename} is not a valid CSV file. Conversion to parquet skipped."
        LOG.warning(log_json(request_id, msg, context))
        return False

    Path(local_path).mkdir(parents=True, exist_ok=True)
    tmpfile = f"{local_path}/{csv_filename}"
    try:
        s3_resource = get_s3_resource()
        csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME, key=csv_file)
        csv_obj.download_file(tmpfile)
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be obtained for parquet conversion. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    output_file = f"{local_path}/{parquet_file}"
    try:
        # Read only the header row to discover the column names, then force
        # every column without an explicit converter to be read as a string.
        col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
        converters.update({col: str for col in col_names if col not in converters})
        data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
        if post_processor:
            data_frame = post_processor(data_frame)
        data_frame.to_parquet(output_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    try:
        with open(output_file, "rb") as fin:
            data = BytesIO(fin.read())
            copy_data_to_s3_bucket(
                request_id, s3_parquet_path, parquet_file, data, manifest_id=manifest_id, context=context
            )
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        s3_key = f"{s3_parquet_path}/{parquet_file}"
        msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    s3_hive_table_path = get_hive_table_path(context.get("account"), self._provider_type, report_type=report_type)
    if not self.presto_table_exists.get(report_type):
        self.create_parquet_table(
            context.get("account"),
            context.get("provider_uuid"),
            manifest_id,
            s3_hive_table_path,
            output_file,
            report_type,
        )

    shutil.rmtree(local_path, ignore_errors=True)
    return True
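# --- Hedged sketch (assumption): the pandas core of the conversion above ---
# A standalone version of the CSV -> parquet step, stripped of the S3 and
# logging plumbing, showing how columns without an explicit converter are
# forced to str before writing parquet. Assumes pyarrow is installed, which
# handles the allow_truncated_timestamps / coerce_timestamps keywords.
def _example_csv_to_parquet(csv_path, parquet_path, converters=None, compression=None):
    import pandas as pd

    converters = dict(converters or {})
    read_kwargs = {"compression": compression} if compression else {}
    # Read only the header row to discover the column names.
    col_names = pd.read_csv(csv_path, nrows=0, **read_kwargs).columns
    converters.update({col: str for col in col_names if col not in converters})
    data_frame = pd.read_csv(csv_path, converters=converters, **read_kwargs)
    data_frame.to_parquet(parquet_path, allow_truncated_timestamps=True, coerce_timestamps="ms")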
def delete_archived_data(schema_name, provider_type, provider_uuid):
    """
    Delete archived data from our S3 bucket for a given provider.

    This function chiefly follows the deletion of a provider.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to delete this archived data.

    Args:
        schema_name (str): Koku user account (schema) name.
        provider_type (str): Koku backend provider type identifier.
        provider_uuid (UUID): Koku backend provider UUID.

    """
    if not schema_name or not provider_type or not provider_uuid:
        # Sanity-check all of these inputs in case somehow any receives an
        # empty value such as None or '' because we need to minimize the risk
        # of deleting unrelated files from our S3 bucket.
        messages = []
        if not schema_name:
            message = "missing required argument: schema_name"
            LOG.error(message)
            messages.append(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
            messages.append(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
            messages.append(message)
        raise TypeError("delete_archived_data() " + ", ".join(messages))

    if not settings.ENABLE_S3_ARCHIVING:
        LOG.info("Skipping delete_archived_data. Upload feature is disabled.")
        return

    # We need to normalize capitalization and "-local" dev providers.
    account = schema_name[4:]
    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.CSV_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{provider_uuid}/"
    LOG.info("attempting to delete our archived data in S3 under %s", prefix)

    s3_resource = get_s3_resource()
    s3_bucket = s3_resource.Bucket(settings.S3_BUCKET_NAME)
    object_keys = [{"Key": s3_object.key} for s3_object in s3_bucket.objects.filter(Prefix=prefix)]
    batch_size = 1000  # AWS S3 delete API limits to 1000 objects per request.
    for batch_number in range(math.ceil(len(object_keys) / batch_size)):
        batch_start = batch_size * batch_number
        batch_end = batch_start + batch_size
        object_keys_batch = object_keys[batch_start:batch_end]
        s3_bucket.delete_objects(Delete={"Objects": object_keys_batch})

    remaining_objects = list(s3_bucket.objects.filter(Prefix=prefix))
    if remaining_objects:
        LOG.warning(
            "Found %s objects after attempting to delete all objects with prefix %s",
            len(remaining_objects),
            prefix,
        )
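# --- Hedged sketch (assumption, not the project's actual task declaration) ---
# The docstring above describes up to 10 retries with exponential backoff
# starting at 10 seconds. With Celery that behaviour is commonly expressed
# through autoretry_for / max_retries / retry_backoff, roughly as below; the
# app instance, helper name, and exception tuple are illustrative.
def _example_register_delete_archived_data_task(celery_app):
    from botocore.exceptions import ClientError, EndpointConnectionError

    @celery_app.task(
        autoretry_for=(ClientError, EndpointConnectionError),  # retry on S3 connectivity errors
        max_retries=10,    # attempt up to 10 retries
        retry_backoff=10,  # exponential backoff starting with a 10-second delay
    )
    def _delete_archived_data_task(schema_name, provider_type, provider_uuid):
        return delete_archived_data(schema_name, provider_type, provider_uuid)

    return _delete_archived_data_task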