def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Copy a local report file into the data directory and push it to S3.

    The "etag" is a ripemd160 digest of the local file name (not of the
    file contents); it is compared with stored_etag to decide whether
    the file has to be copied again.

    Args:
        key (str): Path of the local file standing in for the S3 object.
        stored_etag (str): Etag recorded for a previous copy, if any.
        manifest_id: Manifest id handed to the S3 helpers and used to
            look up the manifest.
        start_date: Forwarded to the S3 path and copy helpers.

    Returns:
        (tuple): full_file_path, s3_etag, file_creation_date and an
            empty list. file_creation_date is None when nothing was
            copied.

    Raises:
        AWSReportDownloaderNoFileError: key is not an existing file.

    """
    local_s3_filename = utils.get_local_file_name(key)
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
    full_file_path = f"{directory_path}/{local_s3_filename}"

    source_exists = os.path.isfile(key)
    if not source_exists:
        raise AWSReportDownloaderNoFileError(f"Unable to locate {key} in {self.bucket_path}")

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)

    name_digest = hashlib.new("ripemd160", bytes(local_s3_filename, "utf-8"))
    s3_etag = name_digest.hexdigest()

    # Nothing to do when the etag is unchanged and the copy is still on disk.
    if s3_etag == stored_etag and os.path.isfile(full_file_path):
        return full_file_path, s3_etag, None, []

    LOG.info(log_json(self.tracing_id, f"Downloading {key} to {full_file_path}", self.context))
    shutil.copy2(key, full_file_path)
    modified_ts = os.path.getmtime(full_file_path)
    file_creation_date = datetime.datetime.fromtimestamp(modified_ts)

    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account,
        Provider.PROVIDER_AWS,
        self._provider_uuid,
        start_date,
        Config.CSV_DATA_TYPE,
    )
    utils.copy_local_report_file_to_s3_bucket(
        self.tracing_id,
        s3_csv_path,
        full_file_path,
        local_s3_filename,
        manifest_id,
        start_date,
        self.context,
    )

    # Clear stale CSVs from S3 only once per manifest.
    accessor = ReportManifestDBAccessor()
    manifest = accessor.get_manifest_by_id(manifest_id)
    already_cleared = accessor.get_s3_csv_cleared(manifest)
    if not already_cleared:
        utils.remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
        accessor.mark_s3_csv_cleared(manifest)

    return full_file_path, s3_etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a cost export from the Azure container and push it to S3.

    Args:
        key (str): The object key identified.
        stored_etag (str): Accepted for interface compatibility; this
            method does not consult it and always downloads the export.
        manifest_id: Manifest id handed to the S3 helpers and used to
            look up the manifest.
        start_date: Forwarded to the S3 path and copy helpers.

    Returns:
        (tuple): full_file_path, the blob's etag, the blob's
            last_modified value and an empty list.

    Raises:
        AzureReportDownloaderError: the export for key was not found.

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    # Only the lookup can raise AzureCostReportNotFound, so keep the try narrow.
    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        # Chain explicitly so the original cause is kept on the traceback.
        raise AzureReportDownloaderError(msg) from ex
    etag = blob.etag
    file_creation_date = blob.last_modified

    msg = f"Downloading {key} to {full_file_path}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    # The return value is not needed; the export is written to full_file_path.
    self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)

    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account,
        Provider.PROVIDER_AZURE,
        self._provider_uuid,
        start_date,
        Config.CSV_DATA_TYPE,
    )
    copy_local_report_file_to_s3_bucket(
        self.tracing_id,
        s3_csv_path,
        full_file_path,
        local_filename,
        manifest_id,
        start_date,
        self.context,
    )

    # Clear stale CSVs from S3 only once per manifest.
    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(manifest_id)
    if not manifest_accessor.get_s3_csv_cleared(manifest):
        remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
        manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Copy a local Azure report file into the exports directory and push it to S3.

    The "etag" is a ripemd160 digest of the local file name (not of the
    file contents); it is compared with stored_etag to decide whether
    the file has to be copied again.

    Args:
        key (str): Path of the local file standing in for the Azure blob.
        stored_etag (str): Etag recorded for a previous copy, if any.
        manifest_id: Manifest id handed to the S3 helpers and used to
            look up the manifest.
        start_date: Forwarded to the S3 path and copy helpers.

    Returns:
        (tuple): full_file_path, etag, file_creation_date and an empty
            list. file_creation_date is None when nothing was copied.

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    etag_hasher = hashlib.new("ripemd160")
    etag_hasher.update(bytes(local_filename, "utf-8"))
    etag = etag_hasher.hexdigest()

    file_creation_date = None
    # Copy again when the etag matches but the local copy is gone; otherwise
    # the caller would be handed a path to a file that does not exist.
    if etag != stored_etag or not os.path.isfile(full_file_path):
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))

        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account,
            Provider.PROVIDER_AZURE,
            self._provider_uuid,
            start_date,
            Config.CSV_DATA_TYPE,
        )
        copy_local_report_file_to_s3_bucket(
            self.request_id,
            s3_csv_path,
            full_file_path,
            local_filename,
            manifest_id,
            start_date,
            self.context,
        )

        # Clear stale CSVs from S3 only once per manifest.
        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)
        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
class ReportManifestDBAccessorTest(IamTestCase):
    """Exercise the ReportManifestDBAccessor against the test schema."""

    def setUp(self):
        """Create the accessor and a baseline manifest definition."""
        super().setUp()
        self.schema = self.schema_name
        today_utc = DateAccessor().today_with_timezone("UTC")
        self.billing_start = today_utc.replace(day=1)
        self.manifest_dict = {
            "assembly_id": "1234",
            "billing_period_start_datetime": self.billing_start,
            "num_total_files": 2,
            "provider_uuid": self.provider_uuid,
        }
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Delete every manifest left behind by the test."""
        super().tearDown()
        with schema_context(self.schema):
            for leftover in self.manifest_accessor._get_db_obj_query().all():
                self.manifest_accessor.delete(leftover)

    def test_initializer(self):
        """A new accessor exposes its table."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """The manifest is found by assembly id and provider uuid."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            assembly_id = self.manifest_dict.get("assembly_id")
            provider_uuid = self.manifest_dict.get("provider_uuid")
            fetched = self.manifest_accessor.get_manifest(assembly_id, provider_uuid)

            self.assertIsNotNone(fetched)
            self.assertEqual(created, fetched)
            self.assertEqual(fetched.assembly_id, assembly_id)
            self.assertEqual(fetched.provider_id, provider_uuid)
            expected_files = self.manifest_dict.get("num_total_files")
            self.assertEqual(fetched.num_total_files, expected_files)

    def test_get_manifest_by_id(self):
        """The manifest is found by its id."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            fetched = self.manifest_accessor.get_manifest_by_id(created.id)

            self.assertIsNotNone(fetched)
            self.assertEqual(created, fetched)

    def test_mark_manifest_as_updated(self):
        """Marking a manifest updated moves its updated datetime forward."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            before_update = DateAccessor().today_with_timezone("UTC")
            self.manifest_accessor.mark_manifest_as_updated(manifest)
            self.assertGreater(manifest.manifest_updated_datetime, before_update)

    def test_mark_manifest_as_updated_none_manifest(self):
        """Marking a None manifest as updated does not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_updated(None)
        except Exception as error:
            self.fail(f"Test failed with error: {error}")

    def test_mark_manifest_as_completed_none_manifest(self):
        """Marking a None manifest as completed does not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_completed(None)
        except Exception as error:
            self.fail(f"Test failed with error: {error}")

    def test_get_manifest_list_for_provider_and_bill_date(self):
        """Every manifest added for a provider and bill date is returned."""
        bill_date = self.manifest_dict["billing_period_start_datetime"].date()
        manifest_kwargs = copy.deepcopy(self.manifest_dict)

        # The first id is the one already in the dict, so the first pass
        # adds the baseline manifest unchanged.
        assembly_ids = [manifest_kwargs["assembly_id"], "2345", "3456"]
        for expected_total, assembly_id in enumerate(assembly_ids, start=1):
            manifest_kwargs["assembly_id"] = assembly_id
            self.manifest_accessor.add(**manifest_kwargs)
            found = self.manifest_accessor.get_manifest_list_for_provider_and_bill_date(
                self.provider_uuid, bill_date
            )
            self.assertEqual(len(found), expected_total)

    def test_get_last_seen_manifest_ids(self):
        """get_last_seen_manifest_ids returns the appropriate assembly_ids."""
        # test that the most recently seen manifests that haven't been processed are returned
        other_provider_dict = {
            "assembly_id": "5678",
            "billing_period_start_datetime": self.billing_start,
            "num_total_files": 1,
            "provider_uuid": "00000000-0000-0000-0000-000000000002",
        }
        first = self.manifest_accessor.add(**self.manifest_dict)
        second = self.manifest_accessor.add(**other_provider_dict)
        seen = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen, [first.assembly_id, second.assembly_id])

        # test that when the manifest's files have been processed - it is no longer returned
        helper = ManifestCreationHelper(
            second.id,
            other_provider_dict.get("num_total_files"),
            other_provider_dict.get("assembly_id"),
        )
        helper.generate_test_report_files()
        helper.process_all_files()
        seen = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen, [first.assembly_id])

        # test that of two manifests with the same provider_ids - that only the most recently
        # seen is returned
        same_provider_dict = {
            "assembly_id": "91011",
            "billing_period_start_datetime": self.billing_start,
            "num_total_files": 1,
            "provider_uuid": self.provider_uuid,
        }
        third = self.manifest_accessor.add(**same_provider_dict)
        seen = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen, [third.assembly_id])

        # test that manifests for a different billing month are not returned
        two_months_back = self.billing_start + relativedelta(months=-2)
        third.billing_period_start_datetime = two_months_back
        third.save()
        seen = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen, [first.assembly_id])

    def test_is_last_completed_datetime_null(self):
        """The null check follows the status rows of the manifest."""
        manifest_id = 123456789

        # No manifest and no status rows yet.
        is_null = ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id)
        self.assertTrue(is_null)

        # A status row whose last_completed_datetime is None.
        baker.make(CostUsageReportManifest, id=manifest_id)
        baker.make(CostUsageReportStatus, manifest_id=manifest_id, last_completed_datetime=None)
        is_null = ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id)
        self.assertTrue(is_null)

        # Once the status row has a completion date.
        status_rows = CostUsageReportStatus.objects.filter(manifest_id=manifest_id)
        status_rows.update(last_completed_datetime=FAKE.date())
        is_null = ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id)
        self.assertFalse(is_null)

    def test_get_s3_csv_cleared(self):
        """The S3 CSV cleared status flips after it is marked."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            self.assertFalse(self.manifest_accessor.get_s3_csv_cleared(manifest))

            self.manifest_accessor.mark_s3_csv_cleared(manifest)
            self.assertTrue(self.manifest_accessor.get_s3_csv_cleared(manifest))

    def test_get_s3_parquet_cleared(self):
        """The S3 parquet cleared status flips after it is marked."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            self.assertFalse(self.manifest_accessor.get_s3_parquet_cleared(manifest))

            self.manifest_accessor.mark_s3_parquet_cleared(manifest)
            self.assertTrue(self.manifest_accessor.get_s3_parquet_cleared(manifest))
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download an S3 object to file and push it to the CSV location in S3.

    The object is only downloaded when its ETag differs from stored_etag
    or when no local copy exists.

    Args:
        key (str): The S3 object key identified.
        stored_etag (str): ETag recorded for a previous download, if any.
        manifest_id: Manifest id handed to the S3 helpers and used to
            look up the manifest.
        start_date: Forwarded to the S3 path and copy helpers.

    Returns:
        (tuple): full_file_path, s3_etag, file_creation_date (the
            object's LastModified value) and an empty list.

    Raises:
        AWSReportDownloaderNoFileError: S3 answered NoSuchKey or
            AccessDenied for the object.
        AWSReportDownloaderError: any other S3 client error, or the
            size check for the object failed.

    """
    s3_filename = key.split("/")[-1]
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

    local_s3_filename = utils.get_local_file_name(key)
    msg = f"Local S3 filename: {local_s3_filename}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    full_file_path = f"{directory_path}/{local_s3_filename}"

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)

    bucket = self.report.get("S3Bucket")
    # Only the S3 call can raise ClientError, so keep the try narrow.
    try:
        s3_file = self.s3_client.get_object(Bucket=bucket, Key=key)
    except ClientError as ex:
        error_code = ex.response["Error"]["Code"]
        if error_code == "NoSuchKey":
            msg = f"Unable to find {s3_filename} in S3 Bucket: {bucket}"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg) from ex
        if error_code == "AccessDenied":
            msg = f"Unable to access S3 Bucket {bucket}: (AccessDenied)"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg) from ex
        msg = f"Error downloading file: Error: {str(ex)}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AWSReportDownloaderError(str(ex)) from ex
    s3_etag = s3_file.get("ETag")
    file_creation_date = s3_file.get("LastModified")

    if not self._check_size(key, check_inflate=True):
        # Report the object key; interpolating the whole get_object response
        # would dump the response metadata and body object into the log.
        msg = f"Insufficient disk space to download file: {key}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AWSReportDownloaderError(msg)

    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        msg = f"Downloading key: {key} to file path: {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        self.s3_client.download_file(bucket, key, full_file_path)

        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account,
            Provider.PROVIDER_AWS,
            self._provider_uuid,
            start_date,
            Config.CSV_DATA_TYPE,
        )
        utils.copy_local_report_file_to_s3_bucket(
            self.tracing_id,
            s3_csv_path,
            full_file_path,
            local_s3_filename,
            manifest_id,
            start_date,
            self.context,
        )

        # Clear stale CSVs from S3 only once per manifest.
        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)
        if not manifest_accessor.get_s3_csv_cleared(manifest):
            utils.remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Download complete for {key}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    return full_file_path, s3_etag, file_creation_date, []