Example #1
    def meets_sufficiency_or_error(self, df):
        # Require that a minimum fraction of the daily usage values be finite.
        if np.sum(np.isfinite(
                df['usage'])) < self.min_fraction_coverage * len(df):
            raise model_exceptions.DataSufficiencyException(
                "Insufficient coverage")
        # Require roughly min_contiguous_months worth of daily rows.
        if len(df) < self.min_contiguous_months * 30:
            raise model_exceptions.DataSufficiencyException(
                "Insufficient data")
        return
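
A minimal standalone sketch (not from the library) of how these two thresholds behave on a toy daily series; min_fraction_coverage and min_contiguous_months are assumed stand-in values for the model attributes used above:

import numpy as np
import pandas as pd

# Assumed threshold values; in the method above they are attributes of the model.
min_fraction_coverage = 0.9
min_contiguous_months = 12

index = pd.date_range('2017-01-01', periods=400, freq='D')
usage = pd.Series(1.0, index=index)
usage.iloc[50:120] = np.nan  # simulate a 70-day gap in the trace
df = pd.DataFrame({'usage': usage})

covered = np.sum(np.isfinite(df['usage']))
print(covered < min_fraction_coverage * len(df))  # True  -> "Insufficient coverage"
print(len(df) < min_contiguous_months * 30)       # False -> enough rows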
Example #2
    def fit(self, input_data):
        if isinstance(input_data, tuple):
            raise model_exceptions.DataSufficiencyException(
                "Billing data is not appropriate for this model")
        self.input_data = input_data
        input_data_daily = input_data.resample('D').apply({
            'energy': pd.Series.sum,
            'tempF': pd.Series.mean,
        })
        self.caltrack_model.fit(input_data_daily)

        self.params = {
            "coefficients": self.caltrack_model.model_res.params.to_dict(),
            "formula": self.caltrack_model.formula,
            "cdd_bp": self.caltrack_model.fit_bp_cdd,
            "hdd_bp": self.caltrack_model.fit_bp_hdd,
            "X_design_info": self.caltrack_model.X.design_info,
        }

        output = {
            "r2": self.caltrack_model.r2,
            "model_params": self.caltrack_model.params,
            "rmse": self.caltrack_model.rmse,
            "cvrmse": self.caltrack_model.cvrmse,
            "nmbe": self.caltrack_model.nmbe,
            "n": self.caltrack_model.n,
        }
        return output
Example #3
    def ami_to_daily(self, df):
        ''' Convert AMI usage and temperature data into the daily model
        input frame of usage and CDD/HDD per balance point. '''

        # Throw out any duplicate indices
        df = df[~df.index.duplicated(keep='last')].sort_index()

        # Create arrays to hold computed CDD and HDD for each
        # balance point temperature.
        cdd = {i: [0] for i in self.bp_cdd}
        hdd = {i: [0] for i in self.bp_hdd}

        # If there isn't any data, throw an exception
        if len(df.index) == 0:
            raise model_exceptions.DataSufficiencyException(
                "No energy trace data")

        # Check whether we are creating a demand fixture.
        is_demand_fixture = 'energy' not in df.columns

        for bp in self.bp_cdd:
            cdd[bp] = pd.Series(np.maximum(df.tempF - bp, 0), index=df.index)
        for bp in self.bp_hdd:
            hdd[bp] = pd.Series(np.maximum(bp - df.tempF, 0), index=df.index)

        # Flag the days that have finite energy (or any day, when building a
        # demand fixture) and finite temperature data.
        ndays = pd.Series((is_demand_fixture or np.isfinite(df.energy))
                          & np.isfinite(hdd[self.bp_hdd[0]]),
                          dtype=int)

        # Create output data frame
        if not is_demand_fixture:
            df_dict = {'upd': df.energy, 'usage': df.energy, 'ndays': ndays}
        else:
            df_dict = {'upd': ndays * 0, 'usage': ndays * 0, 'ndays': ndays}
        df_dict.update({'CDD_' + str(bp): cdd[bp] for bp in cdd.keys()})
        df_dict.update({'HDD_' + str(bp): hdd[bp] for bp in hdd.keys()})
        output = pd.DataFrame(df_dict, index=df.index)
        return output
Example #4
    def fit(self, input_data):

        self.input_data = input_data
        if isinstance(input_data, tuple):
            raise model_exceptions.DataSufficiencyException(
                "Billing data is not appropriate for this model")
        else:
            df = self.ami_to_daily(self.input_data)
        self.df = df

        self.meets_sufficiency_or_error(df)

        # Fit the intercept-only model
        (int_formula, int_mod, int_res, int_rsquared,
         int_qualified) = _fit_intercept(df)

        # CDD-only
        if self.fit_cdd:
            (cdd_formula, cdd_mod, cdd_res, cdd_rsquared, cdd_qualified,
             cdd_bp) = _fit_cdd_only(df)
        else:
            cdd_formula = None
            cdd_mod = None
            cdd_res = None
            cdd_rsquared = 0
            cdd_qualified = False
            cdd_bp = None

        # HDD-only
        (hdd_formula, hdd_mod, hdd_res, hdd_rsquared, hdd_qualified,
         hdd_bp) = _fit_hdd_only(df)

        # CDD+HDD
        if self.fit_cdd:
            (full_formula, full_mod, full_res, full_rsquared, full_qualified,
             full_hdd_bp, full_cdd_bp) = _fit_full(df)
        else:
            full_formula = None
            full_mod = None
            full_res = None
            full_rsquared = 0
            full_qualified = False
            full_hdd_bp = None
            full_cdd_bp = None

        # Now we take the best qualified model.
        if not (full_qualified or hdd_qualified or cdd_qualified
                or int_qualified):
            raise model_exceptions.ModelFitException(
                "No candidate model fit to data successfully")

        use_full = (full_qualified and (full_rsquared > max([
            int(hdd_qualified) * hdd_rsquared,
            int(cdd_qualified) * cdd_rsquared,
            int(int_qualified) * int_rsquared,
        ])))

        use_hdd_only = (hdd_qualified and (hdd_rsquared > max([
            int(full_qualified) * full_rsquared,
            int(cdd_qualified) * cdd_rsquared,
            int(int_qualified) * int_rsquared,
        ])))

        use_cdd_only = (cdd_qualified and (cdd_rsquared > max([
            int(full_qualified) * full_rsquared,
            int(hdd_qualified) * hdd_rsquared,
            int(int_qualified) * int_rsquared,
        ])))

        fit_bp_hdd, fit_bp_cdd = None, None

        if use_full:
            # Use the full model
            y, X = patsy.dmatrices(full_formula, df, return_type='dataframe')
            estimated = full_res.fittedvalues
            r2, rmse = full_rsquared, np.sqrt(full_res.ssr / full_res.nobs)
            model_obj, model_res, formula = full_mod, full_res, full_formula
            fit_bp_hdd, fit_bp_cdd = full_hdd_bp, full_cdd_bp

        elif use_hdd_only:
            y, X = patsy.dmatrices(hdd_formula, df, return_type='dataframe')
            estimated = hdd_res.fittedvalues
            r2, rmse = hdd_rsquared, np.sqrt(hdd_res.ssr / hdd_res.nobs)
            model_obj, model_res, formula = hdd_mod, hdd_res, hdd_formula
            fit_bp_hdd = hdd_bp

        elif use_cdd_only:
            y, X = patsy.dmatrices(cdd_formula, df, return_type='dataframe')
            estimated = cdd_res.fittedvalues
            r2, rmse = cdd_rsquared, np.sqrt(cdd_res.ssr / cdd_res.nobs)
            model_obj, model_res, formula = cdd_mod, cdd_res, cdd_formula
            fit_bp_cdd = cdd_bp

        else:
            # Use Intercept-only
            y, X = patsy.dmatrices(int_formula, df, return_type='dataframe')
            estimated = int_res.fittedvalues
            r2, rmse = int_rsquared, np.sqrt(int_res.ssr / int_res.nobs)
            model_obj, model_res, formula = int_mod, int_res, int_formula

        # Guard against division by a zero mean when normalizing.
        y_mean = float(y.values.ravel().mean())
        if y_mean != 0:
            cvrmse = rmse / y_mean
            nmbe = np.nanmean(model_res.resid) / y_mean
        else:
            cvrmse = np.nan
            nmbe = np.nan

        n = estimated.shape[0]

        self.y, self.X = y, X
        self.estimated = estimated
        self.r2, self.rmse = r2, rmse
        self.model_obj, self.model_res, self.formula = model_obj, model_res, formula
        self.cvrmse = cvrmse
        self.nmbe = nmbe
        self.fit_bp_hdd, self.fit_bp_cdd = fit_bp_hdd, fit_bp_cdd
        self.n = n
        self.params = {
            "coefficients": self.model_res.params.to_dict(),
            "formula": self.formula,
            "cdd_bp": self.fit_bp_cdd,
            "hdd_bp": self.fit_bp_hdd,
            "X_design_info": self.X.design_info,
        }

        output = {
            "r2": self.r2,
            "model_params": self.params,
            "rmse": self.rmse,
            "cvrmse": self.cvrmse,
            "nmbe": self.nmbe,
            "n": self.n,
        }
        return output
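
A hedged usage sketch: the daily input frame is built the way ami_to_daily expects it ('energy' and 'tempF' columns on a DatetimeIndex), and the exceptions raised above are handled by the caller. Here `model` is only a placeholder for an already-constructed instance of the class defining this fit() method, and `model_exceptions` is the same module the examples import; neither is set up in this fragment.

import numpy as np
import pandas as pd

# Placeholder: `model` is assumed to be an instance of the class whose fit()
# is shown above; its construction is not part of these examples.
index = pd.date_range('2016-01-01', '2017-12-31', freq='D')
input_data = pd.DataFrame({
    'energy': np.random.uniform(5.0, 30.0, len(index)),
    'tempF': 55 + 25 * np.sin(np.arange(len(index)) * 2 * np.pi / 365.25),
}, index=index)

try:
    output = model.fit(input_data)
except model_exceptions.DataSufficiencyException as exc:
    print('Trace rejected:', exc)
except model_exceptions.ModelFitException as exc:
    print('No candidate model qualified:', exc)
else:
    print(output['r2'], output['cvrmse'], output['n'])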
Example #5
    def billing_to_monthly_avg(self, trace_and_temp):
        ''' Helper function to handle monthly billing or other irregular data.
        '''
        (energy_data, temp_data) = trace_and_temp

        # Handle empty series
        if energy_data.empty:
            raise model_exceptions.DataSufficiencyException(
                "No energy trace data")
        if temp_data.empty:
            raise model_exceptions.DataSufficiencyException(
                "No temperature data")

        # Convert billing multiindex to straight index
        temp_data.index = temp_data.index.droplevel()

        # Resample temperature data to daily
        temp_data_daily = temp_data.resample('D').apply(np.mean)[0]

        # Drop any duplicate indices
        energy_data = energy_data[~energy_data.index.duplicated(
            keep='last')].sort_index()

        # Check for empty series post-resampling and deduplication
        if energy_data.empty:
            raise model_exceptions.DataSufficiencyException(
                "No energy trace data after deduplication")
        if temp_data_daily.empty:
            raise model_exceptions.DataSufficiencyException(
                "No temperature data after resampling")

        # Get daily mean values; append a missing last data point, which is
        # null by convention anyhow.
        upd_data_daily_mean_values = [
            value / (e - s).days
            for value, s, e in zip(energy_data, energy_data.index,
                                   energy_data.index[1:])
        ] + [np.nan]
        usage_data_daily_mean_values = [
            value
            for value, s, e in zip(energy_data, energy_data.index,
                                   energy_data.index[1:])
        ] + [np.nan]

        # Create arrays to hold computed CDD and HDD for each
        # balance point temperature.
        cdd = {i: [0] for i in self.bp_cdd}
        hdd = {i: [0] for i in self.bp_hdd}
        for bp in self.bp_cdd:
            cdd[bp] = pd.Series(np.maximum(temp_data_daily - bp, 0),
                                index=temp_data_daily.index)
        for bp in self.bp_hdd:
            hdd[bp] = pd.Series(np.maximum(bp - temp_data_daily, 0),
                                index=temp_data_daily.index)

        ndays_data_daily_mean_values = []
        hdd_data_daily_mean_values = {}
        cdd_data_daily_mean_values = {}

        # For each billing period, count the valid temperature days; only
        # periods with at least 15 valid days get mean CDD/HDD values.
        for s, e in zip(energy_data.index, energy_data.index[1:]):
            thisn = np.sum(np.isfinite(temp_data_daily[s:e]))
            ndays_data_daily_mean_values.append(thisn)
            sufficient = (thisn >= 15)
            for bp in self.bp_cdd:
                thismean = np.nanmean(cdd[bp][s:e]) if sufficient else np.nan
                cdd_data_daily_mean_values.setdefault(bp, []).append(thismean)
            for bp in self.bp_hdd:
                thismean = np.nanmean(hdd[bp][s:e]) if sufficient else np.nan
                hdd_data_daily_mean_values.setdefault(bp, []).append(thismean)

        # spread out over the month
        upd_data = pd.Series(upd_data_daily_mean_values,
                             index=energy_data.index)
        usage_data = pd.Series(usage_data_daily_mean_values,
                               index=energy_data.index)
        ndays_data = pd.Series(ndays_data_daily_mean_values + [np.nan],
                               index=energy_data.index)
        cdd_data = {}
        hdd_data = {}
        for bp in self.bp_cdd:
            cdd_data[bp] = pd.Series(cdd_data_daily_mean_values[bp] + [np.nan],
                                     index=energy_data.index)
        for bp in self.bp_hdd:
            hdd_data[bp] = pd.Series(hdd_data_daily_mean_values[bp] + [np.nan],
                                     index=energy_data.index)

        model_data = {
            'upd': upd_data,
            'usage': usage_data,
            'ndays': ndays_data,
        }
        model_data.update(
            {'CDD_' + str(bp): cdd_data[bp]
             for bp in cdd_data.keys()})
        model_data.update(
            {'HDD_' + str(bp): hdd_data[bp]
             for bp in hdd_data.keys()})

        return pd.DataFrame(model_data)
Example #6
    def meets_sufficiency_or_error(self, df):
        # Caltrack sufficiency requirement of number of contiguous months

        # choose first hdd as a proxy for temperature data
        upd = df['upd'].values
        hdd_col = [col for col in df.columns if col.startswith('HDD')][0]
        temp = df[hdd_col].values

        def n_non_nan(values):
            return np.sum(~np.isnan(values))

        reason = None
        mp_type = self.modeling_period_interpretation
        if mp_type == 'baseline':

            _n = self.min_contiguous_baseline_months
            # In the baseline period, require the last N months be non-nan.
            last_month_nan = np.isnan(upd[-1])
            direction = "last"

            if last_month_nan:
                upd_contig = upd[-(_n + 1):-1]
                temp_contig = temp[-(_n + 1):-1]
            else:
                upd_contig = upd[-_n:]
                temp_contig = temp[-_n:]

        elif mp_type == 'reporting':

            _n = self.min_contiguous_reporting_months
            # In the reporting period, require the first N months be non-nan.
            first_month_nan = np.isnan(upd[0])
            direction = "first"

            if first_month_nan:
                upd_contig = upd[1:_n + 1]
                temp_contig = temp[1:_n + 1]
            else:
                upd_contig = upd[:_n]
                temp_contig = temp[:_n]
        else:
            raise ValueError(
                'Unexpected modeling period interpretation {}'.format(mp_type))

        n_months = len(upd_contig)
        if n_months < _n:
            reason = (
                'The {direction} {req} months of a {mp} period must have'
                ' non-NaN energy and temperature values. In this case, there'
                ' were only {n} months in the series.'.format(
                    direction=direction, req=_n, mp=mp_type, n=n_months))
        else:
            upd_n_non_nan = n_non_nan(upd_contig)
            temp_n_non_nan = n_non_nan(temp_contig)
            upd_ok = (upd_n_non_nan == _n)
            temp_ok = (temp_n_non_nan == _n)
            if upd_ok and not temp_ok:
                reason = (
                    'The {direction} {req} months of a {mp} period must have'
                    ' at least 15 valid days of energy and temperature data.'
                    ' In this case, only {n} of the {direction} {req} months'
                    ' of temperature data met that requirement.'.format(
                        direction=direction,
                        req=_n,
                        mp=mp_type,
                        n=temp_n_non_nan,
                    ))
            elif not upd_ok and temp_ok:
                reason = (
                    'The {direction} {req} months of a {mp} period must have'
                    ' at least 15 valid days of energy and temperature data.'
                    ' In this case, only {n} of the {direction} {req} months'
                    ' of energy data met that requirement.'.format(
                        direction=direction,
                        req=_n,
                        mp=mp_type,
                        n=upd_n_non_nan,
                    ))
            elif not upd_ok and not temp_ok:
                reason = (
                    'The {direction} {req} months of a {mp} period must have'
                    ' at least 15 valid days of energy and temperature data.'
                    ' In this case, only {upd_n} and {temp_n} of the'
                    ' {direction} {req} months of energy and temperature data'
                    ' met that requirement, respectively.'.format(
                        direction=direction,
                        req=_n,
                        mp=mp_type,
                        upd_n=upd_n_non_nan,
                        temp_n=temp_n_non_nan))

        if reason is not None:
            raise model_exceptions.DataSufficiencyException(
                'Data does not meet minimum contiguous months requirement.'
                ' {}'.format(reason))

        if np.nansum(upd) <= 0.01:
            raise model_exceptions.DataSufficiencyException(
                "Energy trace data is all or nearly all zero")

        return
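
The baseline branch above looks backwards from the end of the series and skips a trailing NaN (partial) month. A small self-contained illustration of that slicing on toy values (not from the library):

import numpy as np

upd = np.array([1.0, 1.2, np.nan, 1.1, 1.3, 1.0, np.nan])  # monthly usage per day
_n = 3  # stand-in for min_contiguous_baseline_months

if np.isnan(upd[-1]):
    # Last month is partial/NaN, so look at the _n months just before it.
    upd_contig = upd[-(_n + 1):-1]
else:
    upd_contig = upd[-_n:]

print(upd_contig)                      # [1.1 1.3 1. ]
print(np.sum(~np.isnan(upd_contig)))   # 3 == _n -> meets the requirement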
Example #7
    def daily_to_monthly_avg(self, df):
        ''' Convert from daily usage and temperature to monthly
        usage per day and average HDD/CDD. '''

        # Throw out any duplicate indices
        df = df[~df.index.duplicated(keep='last')].sort_index()

        # Create arrays to hold computed CDD and HDD for each
        # balance point temperature.
        cdd = {i: [0] for i in self.bp_cdd}
        hdd = {i: [0] for i in self.bp_hdd}

        # If there isn't any data, throw an exception
        if len(df.index) == 0:
            raise model_exceptions.DataSufficiencyException(
                "No energy trace data")

        # Create the arrays to hold our monthly output
        ndays, usage, upd, output_index = [0], [0], [0], [df.index[0]]
        this_yr, this_mo = output_index[0].year, output_index[0].month

        # Check whether we are creating a demand fixture.
        is_demand_fixture = 'energy' not in df.columns

        # TODO use groupby here? e.g. df.groupby(pd.Grouper(freq='MS'))
        # (pd.TimeGrouper is deprecated in favor of pd.Grouper)
        # Loop through the daily input data frame populating monthly arrays
        for idx, row in df.iterrows():
            # Check whether we are in a new month.
            new_month = (this_yr != idx.year or this_mo != idx.month)
            if new_month:
                ndays.append(0)
                usage.append(0)
                upd.append(0)
                for i in cdd.keys():
                    cdd[i].append(0)
                for i in hdd.keys():
                    hdd[i].append(0)
                this_yr, this_mo = idx.year, idx.month
                output_index.append(idx)

            # If this day is valid, add it to the usage and CDD/HDD arrays.
            day_is_valid = (
                (is_demand_fixture or
                 (np.isfinite(row['energy']) and row['energy'] >= 0))
                and np.isfinite(row['tempF']))
            if day_is_valid:
                ndays[-1] = ndays[-1] + 1
                usage[-1] = usage[-1] + (row['energy']
                                         if not is_demand_fixture else 0)
                for bp in cdd.keys():
                    cdd[bp][-1] += np.maximum(row['tempF'] - bp, 0)
                for bp in hdd.keys():
                    hdd[bp][-1] += np.maximum(bp - row['tempF'], 0)

        # Caltrack sufficiency requirement of >=15 days per month
        for i in range(len(usage)):
            misses_req = (ndays[i] < 15)
            if misses_req:
                upd[i] = np.nan
                for bp in cdd.keys():
                    cdd[bp][i] = np.nan
                for bp in hdd.keys():
                    hdd[bp][i] = np.nan
            else:
                upd[i] = (usage[i] / ndays[i])
                for bp in cdd.keys():
                    cdd[bp][i] = cdd[bp][i] / ndays[i]
                for bp in hdd.keys():
                    hdd[bp][i] = hdd[bp][i] / ndays[i]

        # Create output data frame
        df_dict = {'upd': upd, 'usage': usage, 'ndays': ndays}
        df_dict.update({'CDD_' + str(bp): cdd[bp] for bp in cdd.keys()})
        df_dict.update({'HDD_' + str(bp): hdd[bp] for bp in hdd.keys()})
        output = pd.DataFrame(df_dict, index=output_index)
        return output
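
Regarding the TODO above, here is a rough vectorized sketch of the same monthly aggregation using pd.Grouper (the current replacement for the deprecated pd.TimeGrouper). It is an approximation rather than the library's implementation: the balance-point defaults are assumed, the output index is month starts rather than the first observed day of each month, and the demand-fixture branch is omitted.

import numpy as np
import pandas as pd

def daily_to_monthly_avg_sketch(df, bp_cdd=(65,), bp_hdd=(60,)):
    # df: daily frame with 'energy' and 'tempF' columns on a DatetimeIndex.
    df = df[~df.index.duplicated(keep='last')].sort_index()
    valid = (np.isfinite(df['energy']) & (df['energy'] >= 0)
             & np.isfinite(df['tempF']))
    monthly = pd.DataFrame({'ndays': valid,
                            'usage': df['energy'].where(valid, 0.0)})
    for bp in bp_cdd:
        monthly['CDD_' + str(bp)] = np.maximum(df['tempF'] - bp, 0).where(valid, 0.0)
    for bp in bp_hdd:
        monthly['HDD_' + str(bp)] = np.maximum(bp - df['tempF'], 0).where(valid, 0.0)

    out = monthly.groupby(pd.Grouper(freq='MS')).sum()
    insufficient = out['ndays'] < 15  # CalTRACK: require >= 15 valid days/month
    out['upd'] = (out['usage'] / out['ndays']).where(~insufficient)
    for col in out.columns:
        if col.startswith(('CDD_', 'HDD_')):
            out[col] = (out[col] / out['ndays']).where(~insufficient)
    return out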
Example #8
    def fit(self, weather_source):
        ''' Fit all models associated with this trace.

        Parameters
        ----------
        weather_source : eemeter.weather.ISDWeatherSource
            Weather source to use in creating covariate data.
        '''

        for modeling_period_label, modeling_period in \
                self.modeling_period_set.iter_modeling_periods():

            filtered_data = self._filter_by_modeling_period(
                self.trace, modeling_period)
            filtered_trace = EnergyTrace(self.trace.interpretation,
                                         data=filtered_data,
                                         unit=self.trace.unit)

            model = self.model_mapping[modeling_period_label]

            outputs = {
                "status": None,
                "traceback": None,
                "input_data": None,
                "start_date": None,
                "end_date": None,
                "n_rows": None,
                "model_fit": {},
            }

            # fail with DataSufficiencyException if bad weather source
            if weather_source is None:
                message = (
                    'No weather source found for trace {} in {} period'.format(
                        self.trace.trace_id, modeling_period_label))
                logger.warning(message)
                try:
                    raise model_exceptions.DataSufficiencyException(message)
                except model_exceptions.DataSufficiencyException:
                    outputs.update({
                        "status": "FAILURE",
                        "traceback": traceback.format_exc(),
                    })
                    self.fit_outputs[modeling_period_label] = outputs
                continue

            # attempt to create input data
            try:
                input_data = self.formatter.create_input(
                    filtered_trace, weather_source)
            except:
                logger.warning(
                    'Input data formatting failed for trace {} in {} period.'
                    .format(self.trace.trace_id, modeling_period_label))
                outputs.update({
                    "status": "FAILURE",
                    "traceback": traceback.format_exc(),
                })
            else:
                input_description = self.formatter.describe_input(input_data)
                input_serialization = self.formatter.serialize_input(
                    input_data)
                input_mask = self.formatter.get_input_data_mask(input_data)
                outputs.update({
                    "input_data_serialization": input_serialization,
                    "input_mask": input_mask,  # missing days
                    "start_date": input_description.get('start_date'),
                    "end_date": input_description.get('end_date'),
                    "n_rows": input_description.get('n_rows'),
                    "trace": filtered_trace,
                })

                try:
                    model_fit = model.fit(input_data)
                except:
                    tb = traceback.format_exc()
                    logger.warning(
                        '{} fit failed for trace {} in {} period.'.format(
                            model, self.trace.trace_id, modeling_period_label))

                    outputs.update({
                        "status": "FAILURE",
                        "traceback": tb,
                    })
                else:
                    logger.debug(
                        '{} fit successful for trace {} in {} period.'.format(
                            model, self.trace.trace_id, modeling_period_label))
                    outputs["model_fit"].update(model_fit)
                    outputs.update({
                        "status": "SUCCESS",
                    })

            self.fit_outputs[modeling_period_label] = outputs

        return self.fit_outputs