def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context=None):
    """
    Create daily CSVs from incoming report and archive to S3.

    The incoming report is split into one CSV per day, each daily CSV is
    uploaded to S3, and the local daily file is removed after upload.
    Nothing is archived unless ``settings.ENABLE_S3_ARCHIVING`` is on.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary

    Returns:
        list[str]: file names of the daily CSVs that were archived
            (empty when S3 archiving is disabled)
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if context is None:
        context = {}
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING:
        daily_files = divide_csv_daily(filepath, filename)
        # The S3 prefix does not vary per daily file, so compute it once.
        s3_csv_path = get_path_prefix(account, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        for daily_file in daily_files:
            # Push to S3
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filename"))
            # Local daily split is only an upload staging file; clean it up.
            os.remove(daily_file.get("filepath"))
    return daily_file_names
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download an S3 object to file.

    Args:
        key (str): The S3 object key identified.
        stored_etag (str): Previously recorded ETag; a matching ETag plus an
            existing local file skips the re-download.
        manifest_id (int): The manifest identifier used for the S3 archive copy.
        start_date (Datetime): Start date used to build the S3 CSV path prefix.

    Returns:
        (String): The path and file name of the saved file

    Raises:
        AWSReportDownloaderNoFileError: when the key does not exist in the bucket.
        AWSReportDownloaderError: on any other client error or insufficient disk space.
    """
    # Last path segment of the key; only used in the not-found error message.
    s3_filename = key.split("/")[-1]
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"
    local_s3_filename = utils.get_local_file_name(key)
    msg = f"Local S3 filename: {local_s3_filename}"
    LOG.info(log_json(self.request_id, msg, self.context))
    full_file_path = f"{directory_path}/{local_s3_filename}"

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)
    s3_etag = None
    try:
        # HEAD-like fetch: we only need the ETag here; the body is downloaded below.
        s3_file = self.s3_client.get_object(Bucket=self.report.get("S3Bucket"), Key=key)
        s3_etag = s3_file.get("ETag")
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "NoSuchKey":
            msg = "Unable to find {} in S3 Bucket: {}".format(s3_filename, self.report.get("S3Bucket"))
            LOG.info(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg)
        msg = f"Error downloading file: Error: {str(ex)}"
        LOG.error(log_json(self.request_id, msg, self.context))
        raise AWSReportDownloaderError(str(ex))
    # Guard against filling the disk before pulling (and inflating) the report.
    if not self._check_size(key, check_inflate=True):
        raise AWSReportDownloaderError(f"Insufficient disk space to download file: {s3_file}")
    # Only download when content changed (new ETag) or the local copy is missing.
    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        LOG.debug("Downloading key: %s to file path: %s", key, full_file_path)
        self.s3_client.download_file(self.report.get("S3Bucket"), key, full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        utils.copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_s3_filename, manifest_id, start_date, self.context
        )
        # Drop stale CSVs for this manifest from the archive bucket.
        utils.remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)
    return full_file_path, s3_etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file
    """
    export_name = utils.get_local_file_name(key)
    destination = f"{self._get_exports_data_directory()}/{export_name}"

    try:
        cost_export = self._azure_client.get_cost_export_for_key(key, self.container_name)
        etag = cost_export.etag
    except AzureCostReportNotFound as err:
        message = f"Error when downloading Azure report for key: {key}. Error {err}"
        LOG.error(log_json(self.request_id, message, self.context))
        raise AzureReportDownloaderError(message)

    # Re-download and archive only when the export changed since last time.
    if etag != stored_etag:
        message = f"Downloading {key} to {destination}"
        LOG.info(log_json(self.request_id, message, self.context))
        cost_export = self._azure_client.download_cost_export(key, self.container_name, destination=destination)
        # Push to S3
        archive_prefix = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(
            self.request_id, archive_prefix, destination, export_name, manifest_id, start_date, self.context
        )

    message = f"Returning full_file_path: {destination}, etag: {etag}"
    LOG.info(log_json(self.request_id, message, self.context))
    return destination, etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file
    """
    report_name = utils.get_local_file_name(key)
    target_path = f"{self._get_exports_data_directory()}/{report_name}"

    # Synthesize an etag from the filename so unchanged reports are skipped.
    hasher = hashlib.new("ripemd160")
    hasher.update(report_name.encode("utf-8"))
    etag = hasher.hexdigest()

    if etag != stored_etag:
        note = f"Downloading {key} to {target_path}"
        LOG.info(log_json(self.request_id, note, self.context))
        # "key" is a local path for this local-storage downloader.
        shutil.copy2(key, target_path)
        # Push to S3
        archive_prefix = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(
            self.request_id, archive_prefix, target_path, report_name, manifest_id, start_date, self.context
        )

    note = f"Returning full_file_path: {target_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, note, self.context))
    return target_path, etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download an S3 object to file.

    This is the local-storage variant: ``key`` is a path on local disk, and
    the "etag" is a ripemd160 digest of the local filename rather than a
    real S3 ETag.

    Args:
        key (str): The S3 object key identified.
        stored_etag (str): Previously recorded etag; matching etag plus an
            existing local file skips the copy/archive work.
        manifest_id (int): The manifest identifier for the S3 archive copy.
        start_date (Datetime): Start date used to build the S3 CSV path prefix.

    Returns:
        (String): The path and file name of the saved file

    Raises:
        AWSReportDownloaderNoFileError: when ``key`` does not exist on disk.
    """
    local_s3_filename = utils.get_local_file_name(key)
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
    full_file_path = f"{directory_path}/{local_s3_filename}"
    if not os.path.isfile(key):
        log_msg = f"Unable to locate {key} in {self.bucket_path}"
        raise AWSReportDownloaderNoFileError(log_msg)

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)
    # Filename-based fingerprint stands in for a real S3 ETag here.
    s3_etag_hasher = hashlib.new("ripemd160")
    s3_etag_hasher.update(bytes(local_s3_filename, "utf-8"))
    s3_etag = s3_etag_hasher.hexdigest()
    file_creation_date = None
    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))
        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account, Provider.PROVIDER_AWS, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        utils.copy_local_report_file_to_s3_bucket(
            self.tracing_id, s3_csv_path, full_file_path, local_s3_filename, manifest_id, start_date, self.context
        )
        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)
        # Clear stale CSVs from the archive bucket only once per manifest.
        if not manifest_accessor.get_s3_csv_cleared(manifest):
            utils.remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)
    # Trailing empty list keeps the return shape consistent with sibling
    # downloaders that also return a list of daily file names.
    return full_file_path, s3_etag, file_creation_date, []
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context=None
):
    """
    Archive an IBM report file to S3.

    The IBM report is monthly, so a ``day=DD`` segment is appended to the
    CSV path prefix to allow diffing between two days of the same month.
    Nothing is uploaded unless S3 archiving or Trino processing is enabled.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The report file name
        file_path (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary

    Returns:
        list[str]: single-element list containing ``file_path``
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if context is None:
        context = {}
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Unlike the etag-gated variants, this always downloads the export and
    always archives it to S3 (no ``stored_etag`` comparison is made).

    Args:
        key (str): The object key identified.
        stored_etag (str): Accepted for signature compatibility; not used to
            skip the download in this variant.
        manifest_id (int): The manifest identifier for the S3 archive copy.
        start_date (Datetime): Start date used to build the S3 CSV path prefix.

    Returns:
        (String): The path and file name of the saved file

    Raises:
        AzureReportDownloaderError: when the cost export cannot be found.
    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    file_creation_date = None
    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
        etag = blob.etag
        file_creation_date = blob.last_modified
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AzureReportDownloaderError(msg)

    msg = f"Downloading {key} to {full_file_path}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
    )
    copy_local_report_file_to_s3_bucket(
        self.tracing_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
    )

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(manifest_id)
    # Clear stale CSVs from the archive bucket only once per manifest.
    if not manifest_accessor.get_s3_csv_cleared(manifest):
        remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
        manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    # Trailing empty list keeps the return shape consistent with sibling
    # downloaders that also return a list of daily file names.
    return full_file_path, etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file
    """
    report_name = utils.get_local_file_name(key)
    target_path = f"{self._get_exports_data_directory()}/{report_name}"

    # Synthesize an etag from the filename so unchanged reports are skipped.
    hasher = hashlib.new("ripemd160")
    hasher.update(report_name.encode("utf-8"))
    etag = hasher.hexdigest()

    file_creation_date = None
    if etag != stored_etag:
        note = f"Downloading {key} to {target_path}"
        LOG.info(log_json(self.request_id, note, self.context))
        # "key" is a local path for this local-storage downloader.
        shutil.copy2(key, target_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(target_path))
        # Push to S3
        archive_prefix = get_path_prefix(
            self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        copy_local_report_file_to_s3_bucket(
            self.request_id, archive_prefix, target_path, report_name, manifest_id, start_date, self.context
        )
        accessor = ReportManifestDBAccessor()
        manifest = accessor.get_manifest_by_id(manifest_id)
        # Clear stale CSVs from the archive bucket only once per manifest.
        if not accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.request_id, archive_prefix, manifest_id)
            accessor.mark_s3_csv_cleared(manifest)

    note = f"Returning full_file_path: {target_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, note, self.context))
    return target_path, etag, file_creation_date, []
def create_daily_archives(tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context=None):
    """
    Create daily CSVs from incoming report and archive to S3.

    When the logging context carries a ``version`` key the report is assumed
    to already be daily and is uploaded as-is; otherwise it is split into one
    CSV per day first. Nothing is archived unless S3 archiving or Trino
    processing is enabled.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary

    Returns:
        list[str]: local file paths of the archived CSVs (empty when
            archiving is disabled)
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if context is None:
        context = {}
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_OCP, account):
        if context.get("version"):
            # Versioned operator payloads are already daily; upload unchanged.
            daily_files = [{"filepath": filepath, "filename": filename}]
        else:
            daily_files = divide_csv_daily(filepath, filename)
        # The S3 prefix does not vary per daily file, so compute it once.
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_OCP, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        for daily_file in daily_files:
            # Push to S3
            copy_local_report_file_to_s3_bucket(
                tracing_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            # NOTE(review): this collects local filepaths (not filenames,
            # unlike some sibling variants) — callers appear to expect paths.
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
def write_csv_to_s3(self, date, data, cols, tracing_id=None):
    """
    Generates an HCS CSV from the specified schema and provider.

    :param date
    :param data
    :param cols
    :param tracing_id
    :return none
    """
    frame = pd.DataFrame(data)
    csv_file = f"hcs_{date}.csv"
    year = date.strftime("%Y")
    month = date.strftime("%m")
    s3_csv_path = (
        f"hcs/csv/{self._schema_name}/{self._provider}/source={self._provider_uuid}/year={year}/month={month}"
    )

    LOG.info(log_json(tracing_id, "preparing to write file to object storage"))
    # Write locally first, then copy into object storage and clean up.
    frame.to_csv(csv_file, header=cols, index=False)
    copy_local_report_file_to_s3_bucket(tracing_id, s3_csv_path, csv_file, csv_file, "", date)
    os.remove(csv_file)
def create_daily_archives(tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, last_export_time, context=None):
    """
    Create daily CSVs from incoming report and archive to S3.

    The GCP report is split per invoice month and per ``partition_date`` day;
    each daily slice is written locally and copied to S3 under the invoice
    month's path prefix. When ``last_export_time`` is given, an md5 digest of
    it is embedded in the daily file names so re-exports produce distinct files.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time: Export timestamp used to fingerprint daily file names
        context (Dict): Logging context dictionary

    Returns:
        list[str]: local file paths of the daily CSVs that were archived

    Raises:
        Exception: re-raised when the report file cannot be parsed as CSV.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if context is None:
        context = {}
    download_hash = None
    daily_file_names = []
    if last_export_time:
        # md5 is used only as a filename fingerprint, not for security.
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            # partition_date strings start with YYYY-MM-DD; dedupe to days.
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            # The prefix is keyed by invoice month, not by start_date.
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_GCP, provider_uuid, start_of_invoice, Config.CSV_DATA_TYPE
            )
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(
                    tracing_id, s3_csv_path, day_filepath, day_file, manifest_id, start_date, context
                )
                daily_file_names.append(day_filepath)
    return daily_file_names