def meets_sufficiency_or_error(self, df):
    """Validate that `df` carries enough usable data for fitting.

    Raises
    ------
    model_exceptions.DataSufficiencyException
        If fewer than ``min_fraction_coverage`` of the rows have finite
        'usage' values, or if the frame is shorter than roughly
        ``min_contiguous_months`` months (30 rows per month).
    """
    n_rows = len(df)
    n_finite = np.sum(np.isfinite(df['usage']))

    # Coverage check: enough of the rows must carry finite usage values.
    if n_finite < self.min_fraction_coverage * n_rows:
        raise model_exceptions.DataSufficiencyException(
            "Insufficient coverage")

    # Length check: approximate months as 30 rows (days) each.
    if n_rows < self.min_contiguous_months * 30:
        raise model_exceptions.DataSufficiencyException(
            "Insufficient data")

    return
def fit(self, input_data):
    """Fit the wrapped daily CalTRACK model to AMI input data.

    Billing-style input (a tuple) is rejected. The input is resampled to
    one row per day (total energy, mean temperature) before delegating to
    ``self.caltrack_model``; this wrapper's ``params`` and the returned
    summary are then copied from the fitted inner model.

    Raises
    ------
    model_exceptions.DataSufficiencyException
        If `input_data` is a tuple (billing data).
    """
    if isinstance(input_data, tuple):
        raise model_exceptions.DataSufficiencyException(
            "Billing data is not appropriate for this model")

    self.input_data = input_data

    # Collapse to daily resolution: sum energy, average temperature.
    daily = input_data.resample('D').apply({
        'energy': pd.Series.sum,
        'tempF': pd.Series.mean
    })

    inner = self.caltrack_model
    inner.fit(daily)

    # Mirror the fitted inner model's parameters on this wrapper.
    self.params = {
        "coefficients": inner.model_res.params.to_dict(),
        "formula": inner.formula,
        "cdd_bp": inner.fit_bp_cdd,
        "hdd_bp": inner.fit_bp_hdd,
        "X_design_info": inner.X.design_info,
    }

    return {
        "r2": inner.r2,
        "model_params": inner.params,
        "rmse": inner.rmse,
        "cvrmse": inner.cvrmse,
        "nmbe": inner.nmbe,
        "n": inner.n,
    }
def ami_to_daily(self, df):
    """Prepare a daily AMI frame for model fitting.

    Deduplicates and sorts the index, then returns a DataFrame on the same
    daily index with 'upd'/'usage' (the energy column, or zeros for a
    demand fixture), 'ndays' (1 where the day's data is finite, else 0),
    and one 'CDD_<bp>' / 'HDD_<bp>' column per configured balance point.

    Raises
    ------
    model_exceptions.DataSufficiencyException
        If the frame is empty after deduplication.
    """
    # Keep only the most recent record for any duplicated timestamp.
    df = df[~df.index.duplicated(keep='last')].sort_index()

    if len(df.index) == 0:
        raise model_exceptions.DataSufficiencyException(
            "No energy trace data")

    # A demand fixture carries no 'energy' column.
    is_demand_fixture = 'energy' not in df.columns

    # Degree days per day, one series per balance point.
    cdd = {
        bp: pd.Series(np.maximum(df.tempF - bp, 0), index=df.index)
        for bp in self.bp_cdd
    }
    hdd = {
        bp: pd.Series(np.maximum(bp - df.tempF, 0), index=df.index)
        for bp in self.bp_hdd
    }

    # A day counts (ndays == 1) when temperature is finite and -- unless
    # this is a demand fixture -- energy is finite as well. The first HDD
    # series stands in for temperature finiteness.
    temp_is_finite = np.isfinite(hdd[self.bp_hdd[0]])
    if is_demand_fixture:
        day_counts = temp_is_finite
    else:
        day_counts = np.isfinite(df.energy) & temp_is_finite
    ndays = pd.Series(day_counts, dtype=int)

    if is_demand_fixture:
        df_dict = {'upd': ndays * 0, 'usage': ndays * 0, 'ndays': ndays}
    else:
        df_dict = {'upd': df.energy, 'usage': df.energy, 'ndays': ndays}
    df_dict.update({'CDD_' + str(bp): series for bp, series in cdd.items()})
    df_dict.update({'HDD_' + str(bp): series for bp, series in hdd.items()})

    return pd.DataFrame(df_dict, index=df.index)
def fit(self, input_data):
    """Fit the CalTRACK candidate models and keep the best qualified one.

    Four candidates are fit to the daily frame: intercept-only, CDD-only
    (if ``self.fit_cdd``), HDD-only, and CDD+HDD (if ``self.fit_cdd``).
    The qualified candidate with the highest R^2 wins; intercept-only is
    the fallback. Fit artifacts (y, X, estimated, r2, rmse, cvrmse, nmbe,
    balance points, params) are stored on ``self``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Daily energy/temperature data; a tuple (billing data) is rejected.

    Returns
    -------
    dict
        Fit summary with keys r2, model_params, rmse, cvrmse, nmbe, n.

    Raises
    ------
    model_exceptions.DataSufficiencyException
        If billing data is passed or sufficiency checks fail.
    model_exceptions.ModelFitException
        If no candidate model qualifies.
    """
    self.input_data = input_data
    if isinstance(input_data, tuple):
        raise model_exceptions.DataSufficiencyException(
            "Billing data is not appropriate for this model")
    else:
        df = self.ami_to_daily(self.input_data)
    self.df = df
    self.meets_sufficiency_or_error(df)

    # Fit the intercept-only model
    (int_formula, int_mod, int_res,
     int_rsquared, int_qualified) = _fit_intercept(df)

    # CDD-only
    if self.fit_cdd:
        (cdd_formula, cdd_mod, cdd_res,
         cdd_rsquared, cdd_qualified, cdd_bp) = _fit_cdd_only(df)
    else:
        cdd_formula = None
        cdd_mod = None
        cdd_res = None
        cdd_rsquared = 0
        cdd_qualified = False
        cdd_bp = None

    # HDD-only
    (hdd_formula, hdd_mod, hdd_res,
     hdd_rsquared, hdd_qualified, hdd_bp) = _fit_hdd_only(df)

    # CDD+HDD
    if self.fit_cdd:
        (full_formula, full_mod, full_res, full_rsquared,
         full_qualified, full_hdd_bp, full_cdd_bp) = _fit_full(df)
    else:
        full_formula = None
        full_mod = None
        full_res = None
        full_rsquared = 0
        full_qualified = False
        full_hdd_bp = None
        full_cdd_bp = None

    # Now we take the best qualified model.
    # BUGFIX: the original tested `(a or b or c or d) is False`, which
    # relies on identity with the False singleton even though `or` returns
    # an operand, not necessarily a bool. Use plain truthiness instead.
    if not (full_qualified or hdd_qualified or
            cdd_qualified or int_qualified):
        raise model_exceptions.ModelFitException(
            "No candidate model fit to data successfully")

    # A candidate wins only if it is qualified AND strictly beats every
    # other qualified candidate's R^2 (unqualified ones contribute 0).
    use_full = (full_qualified and (full_rsquared > max([
        int(hdd_qualified) * hdd_rsquared,
        int(cdd_qualified) * cdd_rsquared,
        int(int_qualified) * int_rsquared,
    ])))
    use_hdd_only = (hdd_qualified and (hdd_rsquared > max([
        int(full_qualified) * full_rsquared,
        int(cdd_qualified) * cdd_rsquared,
        int(int_qualified) * int_rsquared,
    ])))
    use_cdd_only = (cdd_qualified and (cdd_rsquared > max([
        int(full_qualified) * full_rsquared,
        int(hdd_qualified) * hdd_rsquared,
        int(int_qualified) * int_rsquared,
    ])))

    fit_bp_hdd, fit_bp_cdd = None, None

    if use_full:
        # Use the full model
        y, X = patsy.dmatrices(full_formula, df, return_type='dataframe')
        estimated = full_res.fittedvalues
        r2, rmse = full_rsquared, np.sqrt(full_res.ssr / full_res.nobs)
        model_obj, model_res, formula = full_mod, full_res, full_formula
        fit_bp_hdd, fit_bp_cdd = full_hdd_bp, full_cdd_bp
    elif use_hdd_only:
        y, X = patsy.dmatrices(hdd_formula, df, return_type='dataframe')
        estimated = hdd_res.fittedvalues
        r2, rmse = hdd_rsquared, np.sqrt(hdd_res.ssr / hdd_res.nobs)
        model_obj, model_res, formula = hdd_mod, hdd_res, hdd_formula
        fit_bp_hdd = hdd_bp
    elif use_cdd_only:
        y, X = patsy.dmatrices(cdd_formula, df, return_type='dataframe')
        estimated = cdd_res.fittedvalues
        r2, rmse = cdd_rsquared, np.sqrt(cdd_res.ssr / cdd_res.nobs)
        model_obj, model_res, formula = cdd_mod, cdd_res, cdd_formula
        fit_bp_cdd = cdd_bp
    else:
        # Use Intercept-only
        y, X = patsy.dmatrices(int_formula, df, return_type='dataframe')
        estimated = int_res.fittedvalues
        r2, rmse = int_rsquared, np.sqrt(int_res.ssr / int_res.nobs)
        model_obj, model_res, formula = int_mod, int_res, int_formula

    # BUGFIX: the original wrote `if y.mean != 0:`, comparing the bound
    # method object (always truthy) to 0, so the NaN branch was
    # unreachable and a zero-mean series divided by zero. Compare the
    # actual mean instead.
    y_mean = float(y.values.ravel().mean())
    if y_mean != 0:
        cvrmse = rmse / y_mean
        nmbe = np.nanmean(model_res.resid) / y_mean
    else:
        cvrmse = np.nan
        nmbe = np.nan

    n = estimated.shape[0]

    self.y, self.X = y, X
    self.estimated = estimated
    self.r2, self.rmse = r2, rmse
    self.model_obj, self.model_res, self.formula = \
        model_obj, model_res, formula
    self.cvrmse = cvrmse
    self.nmbe = nmbe
    self.fit_bp_hdd, self.fit_bp_cdd = fit_bp_hdd, fit_bp_cdd
    self.n = n

    self.params = {
        "coefficients": self.model_res.params.to_dict(),
        "formula": self.formula,
        "cdd_bp": self.fit_bp_cdd,
        "hdd_bp": self.fit_bp_hdd,
        "X_design_info": self.X.design_info,
    }

    output = {
        "r2": self.r2,
        "model_params": self.params,
        "rmse": self.rmse,
        "cvrmse": self.cvrmse,
        "nmbe": self.nmbe,
        "n": self.n,
    }
    return output
def billing_to_monthly_avg(self, trace_and_temp):
    ''' Helper function to handle monthly billing or other irregular data.

    Converts an (energy series, temperature series) pair into a
    billing-period DataFrame (indexed by the energy series' timestamps)
    with columns 'upd' (usage per day), 'usage', 'ndays', and one
    'CDD_<bp>' / 'HDD_<bp>' column per configured balance point.
    Each billing period spans consecutive energy-series timestamps; the
    final period has no end date and is NaN by convention. Periods with
    fewer than 15 finite daily temperatures get NaN degree days.

    Raises model_exceptions.DataSufficiencyException if either input
    series is empty, before or after resampling/deduplication.
    '''
    (energy_data, temp_data) = trace_and_temp

    # Handle empty series
    if energy_data.empty:
        raise model_exceptions.DataSufficiencyException(
            "No energy trace data")
    if temp_data.empty:
        raise model_exceptions.DataSufficiencyException(
            "No temperature data")

    # Convert billing multiindex to straight index
    temp_data.index = temp_data.index.droplevel()

    # Resample temperature data to daily
    # NOTE(review): the trailing [0] selects the first column of the
    # resampled result -- presumably temp_data resamples to a
    # single-column frame; confirm against the caller's data shape.
    temp_data_daily = temp_data.resample('D').apply(np.mean)[0]

    # Drop any duplicate indices
    energy_data = energy_data[~energy_data.index.duplicated(
        keep='last')].sort_index()

    # Check for empty series post-resampling and deduplication
    if energy_data.empty:
        raise model_exceptions.DataSufficiencyException(
            "No energy trace data after deduplication")
    if temp_data_daily.empty:
        raise model_exceptions.DataSufficiencyException(
            "No temperature data after resampling")

    # get daily mean values: each billing value divided by the number of
    # days until the next billing timestamp.
    upd_data_daily_mean_values = [
        value / (e - s).days for value, s, e in zip(
            energy_data, energy_data.index, energy_data.index[1:])
    ] + [
        np.nan
    ]  # add missing last data point, which is null by convention anyhow
    usage_data_daily_mean_values = [
        value for value, s, e in zip(energy_data, energy_data.index,
                                     energy_data.index[1:])
    ] + [
        np.nan
    ]  # add missing last data point, which is null by convention anyhow

    # Create arrays to hold computed CDD and HDD for each
    # balance point temperature.
    cdd = {i: [0] for i in self.bp_cdd}
    hdd = {i: [0] for i in self.bp_hdd}
    for bp in self.bp_cdd:
        cdd[bp] = pd.Series(
            np.maximum(temp_data_daily - bp, 0),
            index=temp_data_daily.index)
    for bp in self.bp_hdd:
        hdd[bp] = pd.Series(
            np.maximum(bp - temp_data_daily, 0),
            index=temp_data_daily.index)

    ndays_data_daily_mean_values = []
    hdd_data_daily_mean_values = {}
    cdd_data_daily_mean_values = {}

    # Average the daily degree days within each billing period. A period
    # needs >= 15 finite daily temperatures; otherwise record NaN.
    for s, e in zip(energy_data.index, energy_data.index[1:]):
        thisn = np.sum(np.isfinite(temp_data_daily[s:e]))
        ndays_data_daily_mean_values.append(thisn)
        if thisn >= 15:
            for bp in self.bp_cdd:
                thismean = np.nanmean(cdd[bp][s:e])
                if bp not in cdd_data_daily_mean_values.keys():
                    cdd_data_daily_mean_values[bp] = []
                cdd_data_daily_mean_values[bp].append(thismean)
            for bp in self.bp_hdd:
                thismean = np.nanmean(hdd[bp][s:e])
                if bp not in hdd_data_daily_mean_values.keys():
                    hdd_data_daily_mean_values[bp] = []
                hdd_data_daily_mean_values[bp].append(thismean)
        else:
            # Insufficient temperature coverage for this period.
            for bp in self.bp_cdd:
                if bp not in cdd_data_daily_mean_values.keys():
                    cdd_data_daily_mean_values[bp] = []
                cdd_data_daily_mean_values[bp].append(np.nan)
            for bp in self.bp_hdd:
                if bp not in hdd_data_daily_mean_values.keys():
                    hdd_data_daily_mean_values[bp] = []
                hdd_data_daily_mean_values[bp].append(np.nan)

    # spread out over the month
    upd_data = pd.Series(upd_data_daily_mean_values,
                         index=energy_data.index)
    usage_data = pd.Series(usage_data_daily_mean_values,
                           index=energy_data.index)
    # Trailing NaN keeps these the same length as the energy index.
    ndays_data = pd.Series(ndays_data_daily_mean_values + [np.nan],
                           index=energy_data.index)

    cdd_data = {}
    hdd_data = {}
    for bp in self.bp_cdd:
        cdd_data[bp] = pd.Series(
            cdd_data_daily_mean_values[bp] + [np.nan],
            index=energy_data.index)
    for bp in self.bp_hdd:
        hdd_data[bp] = pd.Series(
            hdd_data_daily_mean_values[bp] + [np.nan],
            index=energy_data.index)

    model_data = {
        'upd': upd_data,
        'usage': usage_data,
        'ndays': ndays_data,
    }
    model_data.update(
        {'CDD_' + str(bp): cdd_data[bp] for bp in cdd_data.keys()})
    model_data.update(
        {'HDD_' + str(bp): hdd_data[bp] for bp in hdd_data.keys()})

    return pd.DataFrame(model_data)
def meets_sufficiency_or_error(self, df):
    """Check the CalTRACK contiguous-months requirement, raising on failure.

    `df` is a monthly frame (as produced by the *_to_monthly_avg helpers):
    'upd' carries usage-per-day (NaN for insufficient months), and the
    first 'HDD_*' column stands in for temperature sufficiency since both
    were NaN'd together upstream. Baseline periods must end with N
    contiguous non-NaN months; reporting periods must start with them
    (N comes from the corresponding min_contiguous_*_months attribute).
    Also rejects traces whose total usage is (nearly) all zero.
    Returns None on success.
    """
    # Caltrack sufficiency requirement of number of contiguous months

    # choose first hdd as a proxy for temperature data
    upd = df['upd'].values
    hdd_col = [col for col in df.columns if col.startswith('HDD')][0]
    temp = df[hdd_col].values

    def n_non_nan(values):
        # Count of non-NaN entries in the window.
        return np.sum(~np.isnan(values))

    reason = None
    mp_type = self.modeling_period_interpretation

    if mp_type == 'baseline':
        _n = self.min_contiguous_baseline_months
        # In the baseline period, require the last N months be non-nan.
        last_month_nan = np.isnan(upd[-1])
        direction = "last"
        if last_month_nan:
            # Skip the trailing NaN month and look at the N before it.
            upd_contig = upd[-(_n + 1):-1]
            temp_contig = temp[-(_n + 1):-1]
        else:
            upd_contig = upd[-_n:]
            temp_contig = temp[-_n:]
    elif mp_type == 'reporting':
        _n = self.min_contiguous_reporting_months
        # In the reporting period, require the first N months be non-nan.
        first_month_nan = np.isnan(df['upd'].values[0])
        direction = "first"
        if first_month_nan:
            # Skip the leading NaN month and look at the N after it.
            upd_contig = upd[1:_n + 1]
            temp_contig = temp[1:_n + 1]
        else:
            upd_contig = upd[:_n]
            temp_contig = temp[:_n]
    else:
        raise ValueError(
            'Unexpected modeling period interpretation {}'.format(mp_type))

    n_months = len(upd_contig)
    if n_months < _n:
        # The series itself is shorter than the requirement.
        reason = (
            'The {direction} {req} months of a {mp} period must have'
            ' non-NaN energy and temperature values. In this case, there'
            ' were only {n} months in the series.'.format(
                direction=direction, req=_n, mp=mp_type, n=n_months))
    else:
        # The window is long enough; now every month in it must be
        # non-NaN for both energy and temperature.
        upd_n_non_nan = n_non_nan(upd_contig)
        temp_n_non_nan = n_non_nan(temp_contig)
        upd_ok = (upd_n_non_nan == _n)
        temp_ok = (temp_n_non_nan == _n)
        if upd_ok and not temp_ok:
            reason = (
                'The {direction} {req} months of a {mp} period must have'
                ' at least 15 valid days of energy and temperature data.'
                ' In this case, only {n} of the {direction} {req} months'
                ' of temperature data met that requirement.'.format(
                    direction=direction, req=_n, mp=mp_type,
                    n=temp_n_non_nan,
                ))
        elif not upd_ok and temp_ok:
            reason = (
                'The {direction} {req} months of a {mp} period must have'
                ' at least 15 valid days of energy and temperature data.'
                ' In this case, only {n} of the {direction} {req} months'
                ' of energy data met that requirement.'.format(
                    direction=direction, req=_n, mp=mp_type,
                    n=upd_n_non_nan,
                ))
        elif not upd_ok and not temp_ok:
            reason = (
                'The {direction} {req} months of a {mp} period must have'
                ' at least 15 valid days of energy and temperature data.'
                ' In this case, only {upd_n} and {temp_n} of the'
                ' {direction} {req} months of energy and temperature data'
                ' met that requirement, respectively.'.format(
                    direction=direction, req=_n, mp=mp_type,
                    upd_n=upd_n_non_nan,
                    temp_n=temp_n_non_nan))

    if reason is not None:
        raise model_exceptions.DataSufficiencyException(
            'Data does not meet minimum contiguous months requirement. {}'.
            format(reason))

    # Reject all-zero (or nearly all-zero) traces; also trips on an
    # all-NaN trace, since nansum of an empty/NaN set is 0.
    if not np.nansum(upd) > 0.01:
        raise model_exceptions.DataSufficiencyException(
            "Energy trace data is all or nearly all zero")

    return
def daily_to_monthly_avg(self, df):
    ''' Convert from daily usage and temperature to monthly
    usage per day and average HDD/CDD.

    Returns a DataFrame indexed by the first seen day of each month with
    columns 'upd' (usage per day), 'usage', 'ndays', and one 'CDD_<bp>' /
    'HDD_<bp>' column per configured balance point. Months with fewer
    than 15 valid days get NaN upd/CDD/HDD (CalTRACK sufficiency rule).

    Raises model_exceptions.DataSufficiencyException if the frame is
    empty after deduplication.
    '''
    # Keep the most recent record for any duplicated timestamp.
    df = df[~df.index.duplicated(keep='last')].sort_index()

    if len(df.index) == 0:
        raise model_exceptions.DataSufficiencyException(
            "No energy trace data")

    # Per-balance-point accumulators: one list entry per output month.
    cdd = {bp: [0] for bp in self.bp_cdd}
    hdd = {bp: [0] for bp in self.bp_hdd}

    # Monthly accumulators, seeded for the month of the first row.
    ndays = [0]
    usage = [0]
    upd = [0]
    output_index = [df.index[0]]
    current_month = (output_index[0].year, output_index[0].month)

    # A demand fixture carries no 'energy' column.
    is_demand_fixture = 'energy' not in df.columns

    # TODO use groupby here? e.g. df.groupby(pd.TimeGrouper('MS'))
    for timestamp, row in df.iterrows():
        if (timestamp.year, timestamp.month) != current_month:
            # Month rolled over: start a fresh set of accumulators.
            current_month = (timestamp.year, timestamp.month)
            output_index.append(timestamp)
            ndays.append(0)
            usage.append(0)
            upd.append(0)
            for bp in cdd:
                cdd[bp].append(0)
            for bp in hdd:
                hdd[bp].append(0)

        # A valid day has finite non-negative energy (unless this is a
        # demand fixture) and a finite temperature.
        energy_ok = is_demand_fixture or (
            np.isfinite(row['energy']) and row['energy'] >= 0)
        if energy_ok and np.isfinite(row['tempF']):
            ndays[-1] += 1
            usage[-1] += 0 if is_demand_fixture else row['energy']
            for bp in cdd:
                cdd[bp][-1] += np.maximum(row['tempF'] - bp, 0)
            for bp in hdd:
                hdd[bp][-1] += np.maximum(bp - row['tempF'], 0)

    # Caltrack sufficiency requirement of >=15 days per month
    for month_i, n in enumerate(ndays):
        if n < 15:
            upd[month_i] = np.nan
            for bp in cdd:
                cdd[bp][month_i] = np.nan
            for bp in hdd:
                hdd[bp][month_i] = np.nan
        else:
            upd[month_i] = usage[month_i] / n
            for bp in cdd:
                cdd[bp][month_i] = cdd[bp][month_i] / n
            for bp in hdd:
                hdd[bp][month_i] = hdd[bp][month_i] / n

    # Assemble the monthly output frame.
    columns = {'upd': upd, 'usage': usage, 'ndays': ndays}
    columns.update({'CDD_' + str(bp): cdd[bp] for bp in cdd})
    columns.update({'HDD_' + str(bp): hdd[bp] for bp in hdd})
    return pd.DataFrame(columns, index=output_index)
def fit(self, weather_source):
    ''' Fit all models associated with this trace.

    For each modeling period, filters the trace to that period, builds
    formatted input data from the weather source, and fits the mapped
    model. Failures at any stage are captured (status "FAILURE" plus a
    traceback string) rather than propagated, so one bad period does not
    abort the others.

    Parameters
    ----------
    weather_source : eemeter.weather.ISDWeatherSource
        Weather source to use in creating covariate data.

    Returns
    -------
    dict
        ``self.fit_outputs``, mapping each modeling period label to its
        outputs dict (status, traceback, input description, model_fit).
    '''
    for modeling_period_label, modeling_period in \
            self.modeling_period_set.iter_modeling_periods():

        filtered_data = self._filter_by_modeling_period(
            self.trace, modeling_period)
        filtered_trace = EnergyTrace(
            self.trace.interpretation, data=filtered_data,
            unit=self.trace.unit)

        model = self.model_mapping[modeling_period_label]

        outputs = {
            "status": None,
            "traceback": None,
            "input_data": None,
            "start_date": None,
            "end_date": None,
            "n_rows": None,
            "model_fit": {},
        }

        # fail with DataSufficiencyException if bad weather source
        if weather_source is None:
            message = (
                'No weather source found for trace {} in {} period'.format(
                    self.trace.trace_id, modeling_period_label))
            # BUGFIX: logger.warn is a deprecated alias of warning.
            logger.warning(message)
            # Raise-and-catch so the recorded traceback matches the other
            # failure paths; catch only what we just raised.
            try:
                raise model_exceptions.DataSufficiencyException(message)
            except model_exceptions.DataSufficiencyException:
                outputs.update({
                    "status": "FAILURE",
                    "traceback": traceback.format_exc(),
                })
            self.fit_outputs[modeling_period_label] = outputs
            continue

        # attempt to create input data
        # BUGFIX: the bare `except:` clauses below also caught
        # KeyboardInterrupt/SystemExit; narrowed to Exception while
        # keeping the deliberate capture-and-continue behavior.
        try:
            input_data = self.formatter.create_input(
                filtered_trace, weather_source)
        except Exception:
            logger.warning(
                'Input data formatting failed for trace {} in {} period.'.
                format(self.trace.trace_id, modeling_period_label))
            outputs.update({
                "status": "FAILURE",
                "traceback": traceback.format_exc(),
            })
        else:
            input_description = self.formatter.describe_input(input_data)
            input_serialization = self.formatter.serialize_input(
                input_data)
            input_mask = self.formatter.get_input_data_mask(input_data)
            outputs.update({
                "input_data_serialization": input_serialization,
                "input_mask": input_mask,  # missing days
                "start_date": input_description.get('start_date'),
                "end_date": input_description.get('end_date'),
                "n_rows": input_description.get('n_rows'),
                "trace": filtered_trace,
            })

            try:
                model_fit = model.fit(input_data)
            except Exception:
                tb = traceback.format_exc()
                logger.warning(
                    '{} fit failed for trace {} in {} period.'.format(
                        model, self.trace.trace_id, modeling_period_label))
                outputs.update({
                    "status": "FAILURE",
                    "traceback": tb,
                })
            else:
                logger.debug(
                    '{} fit successful for trace {} in {} period.'.format(
                        model, self.trace.trace_id, modeling_period_label))
                outputs["model_fit"].update(model_fit)
                outputs.update({
                    "status": "SUCCESS",
                })

        self.fit_outputs[modeling_period_label] = outputs

    return self.fit_outputs