Пример #1
0
class DateHelperTest(TestCase):
    """Unit tests for DateHelper."""

    def setUp(self):
        """Create a DateHelper whose clock is pinned to 1970-01-10 12:59:59."""
        helper = DateHelper()
        helper._now = datetime.datetime(1970, 1, 10, 12, 59, 59)
        self.date_helper = helper

    @staticmethod
    def _start_of_current_month():
        """Return midnight on the first day of the current (wall-clock) month."""
        return datetime.datetime.now().replace(day=1, hour=0, minute=0, second=0, microsecond=0)

    def test_this_hour(self):
        """this_hour truncates the pinned time to the start of the hour."""
        self.assertEqual(self.date_helper.this_hour, datetime.datetime(1970, 1, 10, 12))

    def test_next_hour(self):
        """next_hour is the start of the hour after the pinned time."""
        self.assertEqual(self.date_helper.next_hour, datetime.datetime(1970, 1, 10, 13))

    def test_prev_hour(self):
        """previous_hour is the start of the hour before the pinned time."""
        self.assertEqual(self.date_helper.previous_hour, datetime.datetime(1970, 1, 10, 11))

    def test_today(self):
        """today is midnight of the pinned day."""
        self.assertEqual(self.date_helper.today, datetime.datetime(1970, 1, 10))

    def test_yesterday(self):
        """yesterday crosses the year boundary when the clock is pinned to January 1st."""
        new_years_helper = DateHelper()
        new_years_helper._now = datetime.datetime(1970, 1, 1, 12, 59, 59)
        self.assertEqual(new_years_helper.yesterday, datetime.datetime(1969, 12, 31))

    def test_tomorrow(self):
        """tomorrow is midnight of the day after the pinned day."""
        self.assertEqual(self.date_helper.tomorrow, datetime.datetime(1970, 1, 11))

    def test_this_month_start(self):
        """this_month_start is midnight on the first day of the pinned month."""
        self.assertEqual(self.date_helper.this_month_start, datetime.datetime(1970, 1, 1))

    def test_this_month_end(self):
        """this_month_end is midnight on the last day of the pinned month."""
        self.assertEqual(self.date_helper.this_month_end, datetime.datetime(1970, 1, 31))

    def test_next_month_start(self):
        """next_month_start is midnight on the first day of the following month."""
        self.assertEqual(self.date_helper.next_month_start, datetime.datetime(1970, 2, 1))

    def test_next_month_end(self):
        """next_month_end is midnight on the last day of the following month."""
        self.assertEqual(self.date_helper.next_month_end, datetime.datetime(1970, 2, 28))

    def test_last_month_start(self):
        """last_month_start is midnight on the first day of the preceding month."""
        self.assertEqual(self.date_helper.last_month_start, datetime.datetime(1969, 12, 1))

    def test_last_month_end(self):
        """last_month_end is midnight on the last day of the preceding month."""
        self.assertEqual(self.date_helper.last_month_end, datetime.datetime(1969, 12, 31))

    def test_next_month(self):
        """next_month applied to last month's start yields this month's start."""
        this_month = self._start_of_current_month()
        month_before = this_month - relativedelta(months=1)
        self.assertEqual(this_month, DateHelper().next_month(month_before))

    def test_previous_month(self):
        """previous_month applied to this month's start yields last month's start."""
        this_month = self._start_of_current_month()
        month_before = this_month - relativedelta(months=1)
        self.assertEqual(month_before, DateHelper().previous_month(this_month))

    def test_list_days(self):
        """list_days returns every day from start to end, both inclusive."""
        month_start = self._start_of_current_month()
        first_three_days = [month_start.replace(day=day_number) for day_number in (1, 2, 3)]
        listed = self.date_helper.list_days(first_three_days[0], first_three_days[-1])
        self.assertEqual(listed, first_three_days)

    def test_list_months(self):
        """list_months returns every month start from start to end, both inclusive."""
        month_starts = [datetime.datetime(1970, month_number, 1) for month_number in (1, 2, 3)]
        listed = self.date_helper.list_months(month_starts[0], month_starts[-1])
        self.assertEqual(listed, month_starts)

    def test_n_days_ago(self):
        """n_days_ago subtracts the requested number of whole days."""
        midnight_today = timezone.now().replace(hour=0, minute=0, second=0, microsecond=0)
        two_days_earlier = midnight_today - datetime.timedelta(days=2)
        self.assertEqual(self.date_helper.n_days_ago(midnight_today, 2), two_days_earlier)

    def test_month_start(self):
        """month_start returns the first day of the month of the given datetime."""
        pinned_today = self.date_helper.today
        self.assertEqual(self.date_helper.month_start(pinned_today), datetime.datetime(1970, 1, 1))

    def test_month_end(self):
        """month_end handles both datetime and date inputs."""
        pinned_today = self.date_helper.today

        # datetime in, datetime out
        self.assertEqual(self.date_helper.month_end(pinned_today), datetime.datetime(1970, 1, 31))

        # date in, date out
        self.assertEqual(self.date_helper.month_end(pinned_today.date()), datetime.date(1970, 1, 31))

    def test_midnight(self):
        """midnight is the time 00:00:00."""
        self.assertEqual(self.date_helper.midnight, datetime.time(0, 0))
Пример #2
0
class Forecast:
    """Base forecasting class.

    Queries daily cost data for a rolling window, fits a linear model to it and
    formats the predicted costs for the remaining days of the current month.

    NOTE(review): the methods read ``self.provider`` and ``self.provider_map_class``,
    neither of which is defined in this class -- presumably subclasses supply them.
    """

    # the minimum number of data points needed to generate a forecast.
    # _predict() logs a warning and returns an empty result when it is given fewer than this many data points.
    #
    # this number is chosen in part because statsmodels.stats.stattools.omni_normtest() needs at least eight data
    # points to test for normal distribution.
    MINIMUM = 8

    # the number of decimal places used by format_result() when rendering the R-squared value as a string.
    PRECISION = 8

    # the report type handed to the provider map constructor (see the provider_map property).
    REPORT_TYPE = "costs"

    def __init__(self, query_params):  # noqa: C901
        """Select the summary table, the query window and the query filters.

        Args:
            query_params: request parameters. Read through ``get("access", {})``
                and ``get_access(...)`` and stored as ``self.params``.

        Instance Attributes:
            - dh (DateHelper)
            - params (the given query_params)
            - cost_summary_table (Model)
            - forecast_days_required (int)
            - query_range (tuple)
            - filters (QueryFilterCollection)
        """
        self.dh = DateHelper()
        self.params = query_params

        # start with the default view; access restrictions may replace it below
        access = query_params.get("access", {})
        access_key = "default"
        self.cost_summary_table = self.provider_map.views.get("costs").get(access_key)
        if access:
            access_key = tuple(access.keys())
            filter_fields = self.provider_map.provider_map.get("filters")
            materialized_view = self.provider_map.views.get("costs").get(access_key)
            # prefer a materialized view matching the access keys; without one,
            # fall back to the provider map's query table
            self.cost_summary_table = materialized_view if materialized_view else self.provider_map.query_table

        # number of days from yesterday up to the end of the current month
        self.forecast_days_required = (self.dh.this_month_end - self.dh.yesterday).days

        # forecasts use a rolling window: the 30 days leading up to yesterday
        window_start = self.dh.n_days_ago(self.dh.yesterday, 30)
        window_end = self.dh.yesterday
        self.query_range = (window_start, window_end)

        self.filters = QueryFilterCollection()
        self.filters.add(field="usage_start", operation="gte", parameter=window_start)
        self.filters.add(field="usage_end", operation="lte", parameter=window_end)

        # access_key only differs from "default" when access restrictions were given
        if access_key == "default":
            return

        # restrict the query to what the user is allowed to see
        for q_param, filt in filter_fields.items():
            param_access = query_params.get_access(q_param, [])
            if param_access:
                self.set_access_filters(param_access, filt, self.filters)

    @property
    def provider_map(self):
        """Build a provider map for this forecast's provider and report type.

        A new instance is constructed on every access.
        """
        map_class = self.provider_map_class
        return map_class(self.provider, self.REPORT_TYPE)

    @property
    def total_cost_term(self):
        """Return the provider map value for total cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("cost_total")

    @property
    def supplementary_cost_term(self):
        """Return the provider map value for total supplemenatry cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("sup_total")

    @property
    def infrastructure_cost_term(self):
        """Return the provider map value for total inftrastructure cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("infra_total")

    def predict(self):
        """Query the cost data, forecast each cost type and return the formatted result.

        Runs inside the tenant context of ``self.params.tenant``. The filtered
        summary table is annotated with the total, supplementary and
        infrastructure cost terms; each of the three series is de-duplicated by
        day, forecast separately, re-keyed by date and passed to format_result().

        Returns:
            (list) the value returned by format_result()
        """
        with tenant_context(self.params.tenant):
            daily_costs = (
                self.cost_summary_table.objects.filter(self.filters.compose())
                .order_by("usage_start")
                .values("usage_start")
                .annotate(
                    total_cost=self.total_cost_term,
                    supplementary_cost=self.supplementary_cost_term,
                    infrastructure_cost=self.infrastructure_cost_term,
                )
            )

            # one forecast per cost type, each built from its own (date, cost) series
            cost_predictions = {
                fieldname: self._predict(
                    self._uniquify_qset(daily_costs.values("usage_start", fieldname), field=fieldname)
                )
                for fieldname in ("total_cost", "infrastructure_cost", "supplementary_cost")
            }

            return self.format_result(self._key_results_by_date(cost_predictions))

    def _predict(self, data):
        """Handle pre and post prediction work.

        This function handles arranging incoming data to conform with statsmodels requirements.
        Then after receiving the forecast output, this function handles formatting to conform to
        API reponse requirements.

        Args:
            data (list) a list of (datetime, float) tuples

        Returns:
            (LinearForecastResult) linear forecast results object
        """
        LOG.debug("Forecast input data: %s", data)

        if len(data) < self.MINIMUM:
            LOG.warning(
                "Number of data elements (%s) is fewer than the minimum (%s). Unable to generate forecast.",
                len(data),
                self.MINIMUM,
            )
            return []

        dates, costs = zip(*data)

        X = self._enumerate_dates(dates)
        Y = [float(c) for c in costs]

        # calculate x-values for the prediction range
        pred_x = [i for i in range(X[-1] + 1, X[-1] + 1 + self.forecast_days_required)]

        # run the forecast
        results = self._run_forecast(X, Y, to_predict=pred_x)

        result_dict = {}
        for i, value in enumerate(results.prediction):
            # extrapolate confidence intervals to align with prediction.
            # this reduces the confidence interval below 95th percentile, but is a better UX.
            if i < len(results.confidence_lower):
                lower = results.confidence_lower[i]
            else:
                lower = results.confidence_lower[-1] + results.slope * (i - len(results.confidence_lower))

            if i < len(results.confidence_upper):
                upper = results.confidence_upper[i]
            else:
                upper = results.confidence_upper[-1] + results.slope * (i - len(results.confidence_upper))

            # ensure that there are no negative numbers.
            result_dict[self.dh.today.date() + timedelta(days=i)] = {
                "total_cost": max((value, 0)),
                "confidence_min": max((lower, 0)),
                "confidence_max": max((upper, 0)),
            }

        return (result_dict, results.rsquared, results.pvalues)

    def _enumerate_dates(self, date_list):
        """Given a list of dates, return a list of integers.

        This method works in conjunction with _remove_outliers(). This method works to preserve any gaps
        in the data created by _remove_outliers() so that the integers used for the X-axis are aligned
        appropriately.

        Example:
            If _remove_outliers() returns {"2000-01-01": 1.0, "2000-01-03": 1.5}
            then _enumerate_dates() returns [0, 2]
        """
        days = self.dh.list_days(
            datetime.combine(date_list[0], self.dh.midnight), datetime.combine(date_list[-1], self.dh.midnight)
        )
        out = [i for i, day in enumerate(days) if day.date() in date_list]
        return out

    def _remove_outliers(self, data):
        """Remove outliers from our dateset before predicting.

        We use a box plot method without plotting the box.
        """
        values = list(data.values())
        if values:
            third_quartile, first_quartile = np.percentile(values, [Decimal(75), Decimal(25)])
            interquartile_range = third_quartile - first_quartile

            upper_boundary = third_quartile + (Decimal(1.5) * interquartile_range)
            lower_boundary = first_quartile - (Decimal(1.5) * interquartile_range)

            return {key: value for key, value in data.items() if (value >= lower_boundary and value <= upper_boundary)}
        return data

    def _key_results_by_date(self, results, check_term="total_cost"):
        """Take results formatted by cost type, and return results keyed by date."""
        results_by_date = defaultdict(dict)
        date_based_dict = results[check_term][0] if results[check_term] else []
        for date in date_based_dict:
            for cost_term in results:
                if results[cost_term][0].get(date):
                    results_by_date[date][cost_term] = (
                        results[cost_term][0][date],
                        {"rsquared": results[cost_term][1]},
                        {"pvalues": results[cost_term][2]},
                    )
        return results_by_date

    def format_result(self, results):
        """Format results for API consumption."""
        f_format = f"%.{self.PRECISION}f"  # avoid converting floats to e-notation
        units = "USD"

        response = []
        for key in results:
            if key > self.dh.this_month_end.date():
                continue
            dikt = {
                "date": key,
                "values": [
                    {
                        "date": key,
                        "infrastructure": {
                            "total": {
                                "value": round(results[key]["infrastructure_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["infrastructure_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["infrastructure_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["infrastructure_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {"value": results[key]["infrastructure_cost"][2]["pvalues"], "units": None},
                        },
                        "supplementary": {
                            "total": {
                                "value": round(results[key]["supplementary_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["supplementary_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["supplementary_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["supplementary_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {"value": results[key]["supplementary_cost"][2]["pvalues"], "units": None},
                        },
                        "cost": {
                            "total": {"value": round(results[key]["total_cost"][0]["total_cost"], 3), "units": units},
                            "confidence_max": {
                                "value": round(results[key]["total_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["total_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {"value": f_format % results[key]["total_cost"][1]["rsquared"], "units": None},
                            "pvalues": {"value": results[key]["total_cost"][2]["pvalues"], "units": None},
                        },
                    }
                ],
            }
            response.append(dikt)
        return response

    def _run_forecast(self, x, y, to_predict=None):
        """Apply the forecast model.

        Fits an ordinary least squares model of y on x (plus an intercept) and wraps the
        fitted results together with the prediction inputs.

        Args:
            x (list) a list of exogenous variables
            y (list) a list of endogenous variables
            to_predict (list) a list of exogenous variables used in the forecast results

        Note:
            both x and y MUST be the same number of elements

        Returns:
            (LinearForecastResult) built from the fitted OLS results, with the
            intercept-augmented to_predict passed as ``exog``
        """
        # has_constant="add" always adds the intercept column. With the default ("skip"),
        # add_constant() returns its input unchanged when a column is already a non-zero constant.
        # That is the case for a single-element to_predict (one forecast day remaining), which
        # would leave the prediction inputs one column short of the fitted model.
        # NOTE(review): assumes `sm` is statsmodels.api -- confirm against the file's imports.
        design = sm.add_constant(x, has_constant="add")
        forecast_design = sm.add_constant(to_predict, has_constant="add")

        fitted = sm.OLS(y, design).fit()
        return LinearForecastResult(fitted, exog=forecast_design)

    def _uniquify_qset(self, qset, field="total_cost"):
        """Take a QuerySet list, sum costs within the same day, and arrange it into a list of tuples.

        Args:
            qset (QuerySet)
            field (str) - field name in the QuerySet to be summed

        Returns:
            [(date, cost), ...]
        """
        # FIXME: this QuerySet->dict->list conversion probably isn't ideal.
        # FIXME: there's probably a way to aggregate multiple sources by date using just the ORM.
        result = defaultdict(Decimal)
        for item in qset:
            result[item.get("usage_start")] += Decimal(item.get(field, 0.0))
        result = self._remove_outliers(result)
        out = [(k, v) for k, v in result.items()]
        return out

    def set_access_filters(self, access, filt, filters):
        """Set access filters to ensure RBAC restrictions adhere to user's access and filters.

        Each filter definition is turned into a QueryFilter whose operation is forced to
        "in" and whose parameter is the user's access list. The definitions passed in are
        left unmodified.

        Args:
            access (list) the list containing the users relevant access
            filt (list or dict) contains the filters to be updated
            filters (QueryFilterCollection) the filter collection to add the new filters to
        returns:
            None
        """
        filter_specs = filt if isinstance(filt, list) else [filt]
        for spec in filter_specs:
            # build the keyword arguments on a copy so the caller's filter definition is not mutated
            filter_kwargs = {**spec, "operation": "in"}
            filters.add(QueryFilter(parameter=access, **filter_kwargs))