def convert_to_parquet(
    request_id, account, provider_uuid, provider_type, start_date, manifest_id, files=None, context=None
):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff,
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues, because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): An optional list of CSV file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid ISO 8601 date string."
        LOG.warning(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date, Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date, Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated.
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path, manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet: {','.join(failed_conversion)}."
        LOG.warning(log_json(request_id, msg, context))
        return
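
# The docstring above describes retry behavior (up to 10 retries, exponential backoff
# starting at a 10-second delay) that is not visible in this excerpt. As an illustration
# only, not the module's actual decorator, a Celery task with that policy could be
# declared as below; the wrapper name and the retried exception type are assumptions.
from botocore.exceptions import ClientError
from celery import shared_task


@shared_task(
    autoretry_for=(ClientError,),  # assumption: retry on transient S3 client errors
    max_retries=10,                # up to 10 retry attempts
    retry_backoff=10,              # exponential backoff, first delay of 10 seconds
)
def convert_to_parquet_example(*args, **kwargs):
    """Illustrative placeholder; the real work is done by convert_to_parquet above."""
    return convert_to_parquet(*args, **kwargs)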
def test_convert_csv_to_parquet(self):
    """Test convert_csv_to_parquet."""
    # Missing s3_csv_path.
    result = self.report_processor.convert_csv_to_parquet(
        "request_id", None, "s3_parquet_path", "local_path", "manifest_id", "csv_filename"
    )
    self.assertFalse(result)

    # File name without a .csv/.csv.gz extension.
    result = self.report_processor.convert_csv_to_parquet(
        "request_id", "s3_csv_path", "s3_parquet_path", "local_path", "manifest_id", "csv_filename"
    )
    self.assertFalse(result)

    # The S3 resource raises a ClientError.
    with patch("masu.processor.parquet.parquet_report_processor.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.processor.parquet.parquet_report_processor.get_s3_resource") as mock_s3:
            with patch("masu.processor.parquet.parquet_report_processor.shutil.rmtree"):
                with patch("masu.processor.parquet.parquet_report_processor.Path"):
                    mock_s3.side_effect = ClientError({}, "Error")
                    result = self.report_processor.convert_csv_to_parquet(
                        "request_id",
                        "s3_csv_path",
                        "s3_parquet_path",
                        "local_path",
                        "manifest_id",
                        "csv_filename.csv",
                    )
                    self.assertFalse(result)

    # pandas is not mocked, so reading the nonexistent local file fails.
    with patch("masu.processor.parquet.parquet_report_processor.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.processor.parquet.parquet_report_processor.get_s3_resource"):
            with patch("masu.processor.parquet.parquet_report_processor.shutil.rmtree"):
                with patch("masu.processor.parquet.parquet_report_processor.Path"):
                    result = self.report_processor.convert_csv_to_parquet(
                        "request_id",
                        "s3_csv_path",
                        "s3_parquet_path",
                        "local_path",
                        "manifest_id",
                        "csv_filename.csv.gz",
                    )
                    self.assertFalse(result)

    # open() raises while pandas is mocked, so the conversion fails.
    with patch("masu.processor.parquet.parquet_report_processor.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.processor.parquet.parquet_report_processor.get_s3_resource"):
            with patch("masu.processor.parquet.parquet_report_processor.shutil.rmtree"):
                with patch("masu.processor.parquet.parquet_report_processor.Path"):
                    with patch("masu.processor.parquet.parquet_report_processor.pd") as mock_pd:
                        with patch("masu.processor.parquet.parquet_report_processor.open") as mock_open:
                            mock_pd.read_csv.return_value.__enter__.return_value = [1, 2, 3]
                            mock_open.side_effect = ValueError()
                            result = self.report_processor.convert_csv_to_parquet(
                                "request_id",
                                "s3_csv_path",
                                "s3_parquet_path",
                                "local_path",
                                "manifest_id",
                                "csv_filename.csv.gz",
                            )
                            self.assertFalse(result)

    # Fully mocked happy path.
    with patch("masu.processor.parquet.parquet_report_processor.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.processor.parquet.parquet_report_processor.get_s3_resource"):
            with patch("masu.processor.parquet.parquet_report_processor.Path"):
                with patch("masu.processor.parquet.parquet_report_processor.shutil.rmtree"):
                    with patch("masu.processor.parquet.parquet_report_processor.pd"):
                        with patch("masu.processor.parquet.parquet_report_processor.open"):
                            with patch("masu.processor.parquet.parquet_report_processor.BytesIO"):
                                with patch(
                                    "masu.processor.parquet.parquet_report_processor.copy_data_to_s3_bucket"
                                ):
                                    with patch(
                                        "masu.processor.parquet.parquet_report_processor.ParquetReportProcessor."
                                        "create_parquet_table"
                                    ):
                                        result = self.report_processor.convert_csv_to_parquet(
                                            "request_id",
                                            "s3_csv_path",
                                            "s3_parquet_path",
                                            "local_path",
                                            "manifest_id",
                                            "csv_filename.csv.gz",
                                        )
                                        self.assertTrue(result)

    # Convert a real sample AWS CUR file (S3 interactions mocked).
    with patch("masu.processor.parquet.parquet_report_processor.get_s3_resource"):
        with patch("masu.processor.parquet.parquet_report_processor.Path"):
            with patch("masu.processor.parquet.parquet_report_processor.shutil.rmtree"):
                with patch("masu.processor.parquet.parquet_report_processor.copy_data_to_s3_bucket"):
                    with patch(
                        "masu.processor.parquet.parquet_report_processor.ParquetReportProcessor."
                        "create_parquet_table"
                    ):
                        test_report_test_path = "./koku/masu/test/data/test_cur.csv.gz"
                        temp_dir = tempfile.mkdtemp()
                        test_report = f"{temp_dir}/test_cur.csv.gz"
                        shutil.copy2(test_report_test_path, test_report)
                        local_path = "/tmp/parquet"
                        Path(local_path).mkdir(parents=True, exist_ok=True)
                        converters = get_column_converters(Provider.PROVIDER_AWS)
                        result = self.report_processor.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            local_path,
                            "manifest_id",
                            test_report,
                            converters=converters,
                            post_processor=aws_post_processor,
                            report_type=Provider.PROVIDER_AWS,
                        )
                        self.assertTrue(result)
                        shutil.rmtree(local_path, ignore_errors=True)
                        shutil.rmtree(temp_dir)