Example #1
def convert_to_parquet(request_id,
                       account,
                       provider_uuid,
                       provider_type,
                       start_date,
                       manifest_id,
                       files=None,
                       context=None):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): Report file names to convert; looked up in S3 when empty
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date,
                                  Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date,
                                      Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(
            request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data arrives as daily chunked report files.
    # AWS and Azure reports are monthly, so previous Parquet files are removed to avoid duplicating data.
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                               manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
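
The docstring above describes retry behavior (up to 10 retries with exponential backoff starting at a 10-second delay), but the listing omits the task decorator that would configure it. Below is a minimal sketch of how that behavior is typically expressed with Celery, assuming a celery_app application object; the decorator arguments are illustrative of the documented behavior and are not taken from the source.

from botocore.exceptions import ClientError


@celery_app.task(  # celery_app is an assumed application instance, not defined in this snippet
    autoretry_for=(ClientError,),  # retry on transient S3 connectivity errors
    max_retries=10,                # "up to 10 retries"
    retry_backoff=10,              # exponential backoff, first delay roughly 10 seconds
)
def convert_to_parquet(request_id, account, provider_uuid, provider_type,
                       start_date, manifest_id, files=None, context=None):
    ...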
Example #2
    def test_convert_csv_to_parquet(self):
        """Test convert_csv_to_parquet."""
        result = self.report_processor.convert_csv_to_parquet(
            "request_id", None, "s3_parquet_path", "local_path", "manifest_id",
            "csv_filename")
        self.assertFalse(result)

        result = self.report_processor.convert_csv_to_parquet(
            "request_id", "s3_csv_path", "s3_parquet_path", "local_path",
            "manifest_id", "csv_filename")
        self.assertFalse(result)

        with patch("masu.processor.parquet.parquet_report_processor.settings",
                   ENABLE_S3_ARCHIVING=True):
            with patch(
                    "masu.processor.parquet.parquet_report_processor.get_s3_resource"
            ) as mock_s3:
                with patch(
                        "masu.processor.parquet.parquet_report_processor.shutil.rmtree"
                ):
                    with patch(
                            "masu.processor.parquet.parquet_report_processor.Path"
                    ):
                        mock_s3.side_effect = ClientError({}, "Error")
                        result = self.report_processor.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv",
                        )
                        self.assertFalse(result)

        with patch("masu.processor.parquet.parquet_report_processor.settings",
                   ENABLE_S3_ARCHIVING=True):
            with patch(
                    "masu.processor.parquet.parquet_report_processor.get_s3_resource"
            ):
                with patch(
                        "masu.processor.parquet.parquet_report_processor.shutil.rmtree"
                ):
                    with patch(
                            "masu.processor.parquet.parquet_report_processor.Path"
                    ):
                        result = self.report_processor.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv.gz",
                        )
                        self.assertFalse(result)

        with patch("masu.processor.parquet.parquet_report_processor.settings",
                   ENABLE_S3_ARCHIVING=True):
            with patch(
                    "masu.processor.parquet.parquet_report_processor.get_s3_resource"
            ):
                with patch(
                        "masu.processor.parquet.parquet_report_processor.shutil.rmtree"
                ):
                    with patch(
                            "masu.processor.parquet.parquet_report_processor.Path"
                    ):
                        with patch(
                                "masu.processor.parquet.parquet_report_processor.pd"
                        ) as mock_pd:
                            with patch(
                                    "masu.processor.parquet.parquet_report_processor.open"
                            ) as mock_open:
                                mock_pd.read_csv.return_value.__enter__.return_value = [
                                    1, 2, 3
                                ]
                                mock_open.side_effect = ValueError()
                                result = self.report_processor.convert_csv_to_parquet(
                                    "request_id",
                                    "s3_csv_path",
                                    "s3_parquet_path",
                                    "local_path",
                                    "manifest_id",
                                    "csv_filename.csv.gz",
                                )
                                self.assertFalse(result)

        with patch("masu.processor.parquet.parquet_report_processor.settings",
                   ENABLE_S3_ARCHIVING=True):
            with patch(
                    "masu.processor.parquet.parquet_report_processor.get_s3_resource"
            ):
                with patch(
                        "masu.processor.parquet.parquet_report_processor.Path"
                ):
                    with patch(
                            "masu.processor.parquet.parquet_report_processor.shutil.rmtree"
                    ):
                        with patch(
                                "masu.processor.parquet.parquet_report_processor.pd"
                        ):
                            with patch(
                                    "masu.processor.parquet.parquet_report_processor.open"
                            ):
                                with patch(
                                        "masu.processor.parquet.parquet_report_processor.BytesIO"
                                ):
                                    with patch(
                                            "masu.processor.parquet.parquet_report_processor.copy_data_to_s3_bucket"
                                    ):
                                        with patch(
                                                "masu.processor.parquet.parquet_report_processor.ParquetReportProcessor."
                                                "create_parquet_table"):
                                            result = self.report_processor.convert_csv_to_parquet(
                                                "request_id",
                                                "s3_csv_path",
                                                "s3_parquet_path",
                                                "local_path",
                                                "manifest_id",
                                                "csv_filename.csv.gz",
                                            )
                                            self.assertTrue(result)

        with patch(
                "masu.processor.parquet.parquet_report_processor.get_s3_resource"
        ):
            with patch("masu.processor.parquet.parquet_report_processor.Path"):
                with patch(
                        "masu.processor.parquet.parquet_report_processor.shutil.rmtree"
                ):
                    with patch(
                            "masu.processor.parquet.parquet_report_processor.copy_data_to_s3_bucket"
                    ):
                        with patch(
                                "masu.processor.parquet.parquet_report_processor.ParquetReportProcessor."
                                "create_parquet_table"):
                            test_report_test_path = "./koku/masu/test/data/test_cur.csv.gz"
                            temp_dir = tempfile.mkdtemp()
                            test_report = f"{temp_dir}/test_cur.csv.gz"
                            shutil.copy2(test_report_test_path, test_report)
                            local_path = "/tmp/parquet"
                            Path(local_path).mkdir(parents=True, exist_ok=True)
                            converters = get_column_converters(
                                Provider.PROVIDER_AWS)

                            result = self.report_processor.convert_csv_to_parquet(
                                "request_id",
                                "s3_csv_path",
                                "s3_parquet_path",
                                local_path,
                                "manifest_id",
                                test_report,
                                converters=converters,
                                post_processor=aws_post_processor,
                                report_type=Provider.PROVIDER_AWS,
                            )
                            self.assertTrue(result)
                            shutil.rmtree(local_path, ignore_errors=True)
                            shutil.rmtree(temp_dir)
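
The deeply nested with patch(...) blocks in the test above can be flattened with contextlib.ExitStack. Below is a minimal sketch of the success-path patch setup, written as it might appear inside a test method on the same class; it is an alternative structuring for readability, not the project's actual test code.

from contextlib import ExitStack
from unittest.mock import patch

MODULE = "masu.processor.parquet.parquet_report_processor"


    def test_convert_csv_to_parquet_flattened(self):
        """Same success-path patches as above, flattened with ExitStack."""
        with ExitStack() as stack:
            # Patched settings object reports the S3 archiving feature as enabled.
            stack.enter_context(patch(f"{MODULE}.settings", ENABLE_S3_ARCHIVING=True))
            # Stub out filesystem, S3, pandas, and table-creation collaborators.
            for target in (
                "get_s3_resource",
                "shutil.rmtree",
                "Path",
                "pd",
                "open",
                "BytesIO",
                "copy_data_to_s3_bucket",
                "ParquetReportProcessor.create_parquet_table",
            ):
                stack.enter_context(patch(f"{MODULE}.{target}"))
            result = self.report_processor.convert_csv_to_parquet(
                "request_id", "s3_csv_path", "s3_parquet_path", "local_path",
                "manifest_id", "csv_filename.csv.gz",
            )
            self.assertTrue(result)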