def test_remove_files_not_in_set_from_s3_bucket(self):
    """Test remove_files_not_in_set_from_s3_bucket."""
    # A null path is a no-op: nothing is removed.
    self.assertEqual(
        utils.remove_files_not_in_set_from_s3_bucket("request_id", None, "manifest_id"), []
    )

    start_date = DateAccessor().today_with_timezone("utc").replace(day=1)
    s3_csv_path = get_path_prefix(
        "account", Provider.PROVIDER_AWS, "provider_uuid", start_date, Config.CSV_DATA_TYPE
    )

    expected_key = "removed_key"
    fake_object = Mock(metadata={}, key=expected_key)
    fake_summary = Mock()
    fake_summary.Object.return_value = fake_object

    # Happy path: a stale object present in the bucket is reported removed.
    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True), patch(
        "masu.util.aws.common.get_s3_resource"
    ) as mock_s3:
        mock_s3.return_value.Bucket.return_value.objects.filter.return_value = [fake_summary]
        removed = utils.remove_files_not_in_set_from_s3_bucket(
            "request_id", s3_csv_path, "manifest_id"
        )
        self.assertEqual(removed, [expected_key])

    # S3 client errors are swallowed: nothing is reported removed.
    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True), patch(
        "masu.util.aws.common.get_s3_resource"
    ) as mock_s3:
        mock_s3.side_effect = ClientError({}, "Error")
        removed = utils.remove_files_not_in_set_from_s3_bucket(
            "request_id", s3_csv_path, "manifest_id"
        )
        self.assertEqual(removed, [])
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """Download an S3 object to a local file.

    Args:
        key (str): The S3 object key identifying the report file.
        stored_etag (str): ETag recorded for a previous download; the object is
            only re-downloaded when the current S3 ETag differs or the local
            copy is missing.
        manifest_id: Report manifest identifier, forwarded to the S3 archive helpers.
        start_date: Report start date used to build the S3 archive path prefix.

    Returns:
        (str, str): The full path of the saved file and its S3 ETag.

    Raises:
        AWSReportDownloaderNoFileError: when the key does not exist in the bucket.
        AWSReportDownloaderError: on other S3 client errors or insufficient disk space.
    """
    s3_filename = key.split("/")[-1]
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"
    local_s3_filename = utils.get_local_file_name(key)
    msg = f"Local S3 filename: {local_s3_filename}"
    LOG.info(log_json(self.request_id, msg, self.context))
    full_file_path = f"{directory_path}/{local_s3_filename}"
    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)
    s3_etag = None
    try:
        # Fetch the object metadata first so we can compare ETags before downloading.
        s3_file = self.s3_client.get_object(
            Bucket=self.report.get("S3Bucket"), Key=key)
        s3_etag = s3_file.get("ETag")
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "NoSuchKey":
            msg = "Unable to find {} in S3 Bucket: {}".format(
                s3_filename, self.report.get("S3Bucket"))
            LOG.info(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg)
        # Any other client error is surfaced as a downloader error.
        msg = f"Error downloading file: Error: {str(ex)}"
        LOG.error(log_json(self.request_id, msg, self.context))
        raise AWSReportDownloaderError(str(ex))
    if not self._check_size(key, check_inflate=True):
        raise AWSReportDownloaderError(
            f"Insufficient disk space to download file: {s3_file}")
    # Only download when the stored ETag is stale or the local file is gone.
    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        LOG.debug("Downloading key: %s to file path: %s", key, full_file_path)
        self.s3_client.download_file(self.report.get("S3Bucket"), key, full_file_path)
        # Push to S3
        # NOTE(review): sibling downloaders pass a provider type between account and
        # provider_uuid in get_path_prefix — confirm this call matches this file's
        # get_path_prefix signature.
        s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        utils.copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_s3_filename,
            manifest_id, start_date, self.context)
        utils.remove_files_not_in_set_from_s3_bucket(
            self.request_id, s3_csv_path, manifest_id)
    return full_file_path, s3_etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """Download a local AWS report file into the data directory.

    Args:
        key (str): The S3 object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    # The source file must exist on disk for the "local" provider.
    if not os.path.isfile(key):
        log_msg = f"Unable to locate {key} in {self.bucket_path}"
        raise AWSReportDownloaderNoFileError(log_msg)

    local_s3_filename = utils.get_local_file_name(key)
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
    full_file_path = f"{directory_path}/{local_s3_filename}"
    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)

    # Synthesize a deterministic ETag from the local file name.
    s3_etag = hashlib.new("ripemd160", bytes(local_s3_filename, "utf-8")).hexdigest()

    file_creation_date = None
    needs_copy = s3_etag != stored_etag or not os.path.isfile(full_file_path)
    if needs_copy:
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))

        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account, Provider.PROVIDER_AWS, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        utils.copy_local_report_file_to_s3_bucket(
            self.tracing_id, s3_csv_path, full_file_path, local_s3_filename,
            manifest_id, start_date, self.context
        )

        # Clear stale CSVs from S3 only once per manifest.
        accessor = ReportManifestDBAccessor()
        manifest = accessor.get_manifest_by_id(manifest_id)
        if not accessor.get_s3_csv_cleared(manifest):
            utils.remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
            accessor.mark_s3_csv_cleared(manifest)

    return full_file_path, s3_etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """Download a cost export blob from an Azure storage container.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"
    file_creation_date = None

    # Look up the blob first to capture its ETag and last-modified timestamp.
    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AzureReportDownloaderError(msg)
    etag = blob.etag
    file_creation_date = blob.last_modified

    msg = f"Downloading {key} to {full_file_path}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)

    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
    )
    copy_local_report_file_to_s3_bucket(
        self.tracing_id, s3_csv_path, full_file_path, local_filename,
        manifest_id, start_date, self.context
    )

    # Clear stale CSVs from S3 only once per manifest.
    accessor = ReportManifestDBAccessor()
    manifest = accessor.get_manifest_by_id(manifest_id)
    if not accessor.get_s3_csv_cleared(manifest):
        remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
        accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """Copy a local Azure export file into the exports data directory.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    # Synthesize a deterministic ETag from the local file name.
    etag = hashlib.new("ripemd160", bytes(local_filename, "utf-8")).hexdigest()

    file_creation_date = None
    if etag != stored_etag:
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))

        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_filename,
            manifest_id, start_date, self.context
        )

        # Clear stale CSVs from S3 only once per manifest.
        accessor = ReportManifestDBAccessor()
        manifest = accessor.get_manifest_by_id(manifest_id)
        if not accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)
            accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def convert_to_parquet(
    request_id, account, provider_uuid, provider_type, start_date, manifest_id, files=None, context=None
):
    """Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a providers data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay.  This is intended to allow graceful
    handling of temporary AWS S3 connectivity issues because it is relatively
    important for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type (e.g. AWS, Azure, OCP)
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): Optional CSV file names to convert; when falsy, the list
            is discovered in S3 by manifest id.
        context (dict): A context object for logging

    Returns:
        None.  Failures are logged rather than raised.
    """
    # NOTE: defaults were `files=[], context={}` (shared mutable defaults);
    # `None` with the existing falsy checks is a drop-in replacement.
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    # BUG FIX: provider_type was checked inside this branch but missing from the
    # guard condition, so a missing provider_type was silently used as None below.
    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warning(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date, Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date, Config.PARQUET_DATA_TYPE)

    if not files:
        # No explicit file list given: discover this manifest's CSVs in S3.
        file_keys = get_file_keys_from_s3_with_manifest_id(request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path, manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            # OCP parquet output is partitioned by report type.
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warning(log_json(request_id, msg, context))
    return
def convert_to_parquet(self):  # noqa: C901
    """Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a providers data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay.  This is intended to allow graceful
    handling of temporary AWS S3 connectivity issues because it is relatively
    important for us to convert the archived data.

    Returns:
        (str, list): the last parquet base filename and the accumulated daily
        data frames, or ("", pd.DataFrame()) when required paths are missing.
    """
    parquet_base_filename = ""

    if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet."
            f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
        )
        LOG.error(log_json(self.tracing_id, msg, self.error_context))
        return "", pd.DataFrame()

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
        Provider.PROVIDER_OCP,
        Provider.PROVIDER_GCP,
        Provider.PROVIDER_GCP_LOCAL,
    ):
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_daily_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_ocp_on_cloud_path_s3, self.manifest_id, self.error_context
        )
        manifest_accessor.mark_s3_parquet_cleared(manifest)

    failed_conversion = []
    daily_data_frames = []
    for csv_filename in self.file_list:
        if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
            msg = f"Could not establish report type for {csv_filename}."
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
            failed_conversion.append(csv_filename)
            continue
        parquet_base_filename, daily_frame, success = self.convert_csv_to_parquet(csv_filename)
        daily_data_frames.extend(daily_frame)
        # BUG FIX: `(Provider.PROVIDER_AZURE)` is not a tuple, so `not in`
        # performed a substring test against the provider string.  A one-element
        # tuple restores the intended membership test.
        # NOTE(review): should AZURE_LOCAL also skip daily parquet? Confirm.
        if self.provider_type not in (Provider.PROVIDER_AZURE,):
            self.create_daily_parquet(parquet_base_filename, daily_frame)
        if not success:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warning(log_json(self.tracing_id, msg, self.error_context))
    return parquet_base_filename, daily_data_frames