def fit_model(self, clf, scale=True, clean=False, sample_weight=None, *args, **kwargs):
    """Fit an sklearn-style estimator on the masked training data.

    Parameters
    ----------
    clf: estimator object
        Must have a ``fit(X, y)`` method.
    scale: bool
        If True, call ``self.scale_model()`` and recompute the window
        metrics before fitting.
    clean: bool
        Currently ignored; kept so existing callers keep working.
    sample_weight: optional
        Currently ignored; kept so existing callers keep working.
    *args
        Currently ignored; kept so existing callers keep working.
    **kwargs
        Forwarded to ``self.filter_labels`` (``ratio_mean_val``,
        ``diff_mean_val``, ``mask_label``).  An ``overwrite`` entry is
        accepted but always replaced by ``True``, because the mask is
        regenerated on every fit.

    Returns
    -------
    clf: estimator object
        The estimator that was passed in, after ``clf.fit`` has run.
    """
    if scale:
        # NOTE(review): the metrics call is assumed to belong to this branch
        # (rescale, then recompute); confirm against the intended nesting.
        self.scale_model()
        utils.calc_all_window_metrics(self.df, self.window, self.meas_col, self.model_col, overwrite=True)

    # Merge instead of writing ``**kwargs, overwrite=True``: the latter raises
    # TypeError (duplicate keyword) when the caller supplies ``overwrite``.
    filter_kwargs = dict(kwargs, overwrite=True)
    self.filter_labels(**filter_kwargs)

    train_df = self.get_masked_df()
    features = train_df[self.features_].values
    # Labels are cast to int before being handed to the estimator.
    labels = train_df[self.target_col].astype(int).values
    clf.fit(features, labels)
    return clf
def filter_labels(self, ratio_mean_val=None, diff_mean_val=None, mask_label='quality_mask', overwrite=False):
    """Generate mask to remove incorrectly labeled points from training set.

    The mask column is True for rows to keep.  Rows whose ``'sky_status'``
    is falsy (cloudy) but which 'look' clear are set to False.  Rows
    labeled as clear are never removed.

    By default, no filtering is done (every row stays True).  Filtering
    only happens when BOTH thresholds are supplied.

    Parameters
    ----------
    ratio_mean_val: float or None
        A cloudy row is masked out when
        ``abs(1 - df['GHI/GHIcs mean']) <= ratio_mean_val``.
    diff_mean_val: float or None
        A cloudy row is masked out when
        ``abs(df['GHI-GHIcs mean']) <= diff_mean_val``.
    mask_label: str
        Name of the boolean column written to ``self.df`` and registered
        in ``self.masks_``.
    overwrite: bool
        Allow replacing a mask that is already registered under
        ``mask_label``.

    Raises
    ------
    RuntimeError
        If ``mask_label`` is already in ``self.masks_`` and ``overwrite``
        is False.
    """
    if mask_label in self.masks_ and not overwrite:
        raise RuntimeError('Label already exists as mask. Change name or allow overwrite.')

    # Feature columns missing from the frame: rescale and compute them first.
    # NOTE(review): both calls are assumed to sit under this condition;
    # confirm against the intended nesting.
    if not all(col in self.df.columns for col in self.features_):
        self.scale_model()
        utils.calc_all_window_metrics(self.df, self.window, self.meas_col, self.model_col, overwrite=True)

    self.df[mask_label] = True

    if ratio_mean_val is not None and diff_mean_val is not None:
        # ``~`` is a bitwise NOT; coerce to bool first so a 0/1 integer
        # column produces a proper boolean row mask (no-op for bool dtype).
        is_cloudy = ~self.df['sky_status'].astype(bool)
        ratio_near_one = np.abs(1 - self.df['GHI/GHIcs mean']) <= ratio_mean_val
        diff_near_zero = np.abs(self.df['GHI-GHIcs mean']) <= diff_mean_val
        self.df.loc[is_cloudy & (ratio_near_one | diff_near_zero), mask_label] = False

    if mask_label not in self.masks_:
        self.masks_.append(mask_label)
from sklearn import neighbors

# In[52]:

# k-nearest-neighbours classifier voting over the 8 closest samples.
nn = neighbors.KNeighborsClassifier(n_neighbors=8)

# In[53]:

train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

# In[54]:

# Recompute the window metrics (window argument 3) on the training frame.
utils.calc_all_window_metrics(
    train.df,
    3,
    meas_col='GHI',
    model_col='Clearsky GHI pvlib',
    overwrite=True,
)

# In[55]:

from sklearn import preprocessing

# In[56]:

# Standardize the feature columns before handing them to the classifier.
ss = preprocessing.StandardScaler()
X_std = ss.fit_transform(train.df[feature_cols])

# In[57]:

# Flatten the target column(s) to 1-D, as expected for the label vector.
nn.fit(
    X_std,
    train.df[target_cols].values.ravel(),
)
#
# 1. Scale model clearsky (PVLib)
# 2. Calculate training metrics
# 3. Train model

# In[13]:

# Scale every NSRDB dataset first, in the same order as before.
for _site in (nsrdb_srrl, nsrdb_abq, nsrdb_ornl):
    _site.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

# In[14]:

# Then recompute the window metrics (window argument 3) for each dataset.
for _site in (nsrdb_srrl, nsrdb_abq, nsrdb_ornl):
    utils.calc_all_window_metrics(
        _site.df,
        3,
        meas_col='GHI',
        model_col='Clearsky GHI pvlib',
        overwrite=True,
    )

# In[20]:
# Separate detector built from ground2's frame, used only for the plot below.
# NOTE(review): presumably trim_dates restricts it to 07-01..07-15-2011 - confirm.
ground3 = cs_detection.ClearskyDetection(ground2.df)
ground3.trim_dates('07-01-2011', '07-15-2011')

# In[25]:

vis = visualize.Visualizer()
vis.add_line_ser(ground3.df['GHI'], 'Grnd GHI')
vis.add_line_ser(ground3.df['Clearsky GHI pvlib'], 'Grnd GHIcs')
# Rows whose 'sky_status' is True are drawn as circles labeled 'Clear'.
_clear_rows = ground3.df[ground3.df['sky_status']]
vis.add_circle_ser(_clear_rows['GHI'], 'Clear')
vis.show()

# In[26]:

utils.calc_all_window_metrics(
    ground2.df,
    11,
    'GHI',
    'Clearsky GHI pvlib',
    overwrite=True,
)

# In[27]:

# Keep only rows whose timestamp minute is a multiple of 30 (:00 and :30).
_on_half_hour = ground2.df.index.minute % 30 == 0
ground2_train = ground2.df[_on_half_hour]

# In[28]:

# Training rows are those strictly before 07-01-2011.
_before_cutoff = ground2_train.index < '07-01-2011'
ground2_train = ground2_train[_before_cutoff]

# In[29]:

from sklearn import tree
from sklearn import ensemble
def calc_all_metrics(self):
    """Compute all window metrics for ``self.df``, for investigating features.

    Thin wrapper: calls ``utils.calc_all_window_metrics`` with this object's
    frame, window and measured/modeled column names (overwriting existing
    results), then calls ``self.time_from_solar_noon()``.
    """
    frame, window = self.df, self.window
    measured, modeled = self.meas_col, self.model_col
    utils.calc_all_window_metrics(frame, window, measured, modeled, overwrite=True)
    self.time_from_solar_noon()
# In[13]: train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status') # In[14]: clf = ensemble.RandomForestClassifier(random_state=42) # In[15]: utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True) %load_ext line_profiler%lprun -f utils.calc_all_window_metrics utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True) # In[16]: train.df.keys() # In[17]: feature_cols = [ 'tfn', # 'ghi_status', # 'abs_ideal_ratio_diff',