def eval_model_ensemble(models, x, y_ref=None, is_class=False, verbose=False):
    """Evaluate an ensemble of models.

    Parameters
    ----------
    models : single or iterable set of scikit-learn model instances
        model(s) to be evaluated
    x : numpy.array
        model inputs with m-row observations and n-column features
    y_ref : numpy.array, optional (Default value = None)
        reference target output for the observations in x
    is_class : bool, optional (Default value = False)
        if True, treat as a classification problem (only used when y_ref is given)
    verbose : bool, optional (Default value = False)
        if True, print the mean model error to screen

    Returns
    -------
    y_pred : numpy.array
        model output(s); one column per model if several models are passed.
        If y_ref is given: list of length 2 [model outputs, mean error to y_ref]
    """
    # model evaluation
    if len(np.array(x).shape) == 1:  # single observation input
        x_obs = np.array(x).reshape(1, -1)  # scikit-learn expects 2-D input
        if data_func.is_iterable(models) == False:  # single model
            y_pred = models.predict(x_obs)
        else:  # multiple models
            y_pred = np.zeros(len(models))
            for m, mo in enumerate(models):
                y_pred[m] = mo.predict(x_obs)[0]
    elif len(np.array(x).shape) == 2:
        if data_func.is_iterable(models) == False:  # single model
            y_pred = models.predict(x)
        else:  # multiple models
            y_pred = np.zeros((len(x), len(models)))
            for m, mo in enumerate(models):
                y_pred[:, m] = mo.predict(x)
    else:
        raise ValueError('Feature input dimension greater than 2.')

    # error evaluation
    if y_ref is None:
        return y_pred
    else:
        if is_class == False:  # regression problem
            y_err = np.mean(np.abs(y_pred - y_ref))
        else:  # classification problem
            y_err = np.mean(y_pred != y_ref)
        if verbose == True:
            print('\nMean model error: {0}.'.format(np.round(y_err, 2)))
        return [y_pred, y_err]
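# Illustrative usage (sketch, not part of the original code): evaluate a list of
# fitted scikit-learn regressors on new observations. `boot_models`, `x_new` and
# `y_new` are hypothetical placeholders.
#
#     y_pred, y_err = eval_model_ensemble(boot_models, x_new, y_ref=y_new,
#                                         is_class=False, verbose=True)
#     # y_pred: one column per model for 2-D x_new; y_err: mean absolute error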
def get_feat_importance(W, max_norm=100.):
    """Max-normed feature importance and its variation across models.

    Parameters
    ----------
    W : numpy.array
        feature importance scores, one row per model (bootstrap) and one
        column per feature
    max_norm : float, optional (Default value = 100)
        value the largest mean importance is scaled to

    Returns
    -------
    impo : number or array
        max-normed mean values (axis=0)
    error : number or array
        max-normed sample standard deviation (axis=0)
    """
    # mean variable importance and its standard deviation as measure of variation
    mean_imp = np.nanmean(W, 0)
    imp_norm = np.nanmax(mean_imp)
    imp_sd = np.nanstd(W, 0, ddof=1)
    impo = max_norm * mean_imp / imp_norm
    error = max_norm * imp_sd / imp_norm
    return np.array([impo, error])
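# Illustrative usage (sketch, not part of the original code): `boot_weights` is a
# hypothetical bootstraps-by-features array, e.g. the 'feat_weights' entry
# returned by ML_train_tester below.
#
#     impo, error = get_feat_importance(boot_weights)
#     # impo: mean importances scaled so the largest equals 100
#     # error: sample standard deviations on the same scale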
elif type(config.data_trafos) == dict:
    trafos = [config.data_trafos[name]
              for name in [config.target] + config.features]
elif data_func.is_iterable(config.data_trafos) == True:
    trafos = [config.data_trafos[i]
              for i in range(len([config.target] + config.features))]
else:
    raise ValueError('Invalid data transformation type.')

# 1-hot-encoding of categorical variables
for cat in config.categorical:
    # data trafos (trafos[0] belongs to the target, hence the +1 offset)
    i_cat_trafo = int(np.where(np.array(config.features) == cat)[0]) + 1
    cat_trafo = trafos[i_cat_trafo]
    del trafos[i_cat_trafo]
    # get indicator frames (drop the intercept column)
    cat_rawData = pat.dmatrix(cat, raw_data, return_type='dataframe').iloc[:, 1:]
    # append indicator columns and their transformation
    for col in cat_rawData.columns:
        raw_data[col] = cat_rawData[col]
        config.features.append(col)
        trafos.append(cat_trafo)
    config.features.remove(cat)

#%% get transformed data
data_shifted = data_func.data_framer(data=raw_data.copy(),target=config.target,features=config.features,\
                                     index=config.time_var,start_i=config.start_time,end_i=config.end_time,\
def ML_train_tester(df, target, features, method, m_test=1, n_boot=500, is_class=False, is_zero_one=False,
                    to_norm=None, CV_name=None, CV_value=None, counter_fact=False,
                    horizon=None, save_out=False, save_models=True, file_name='', verbose=False):
    """Machine learning wrapper for bootstrapped training and testing.

    Parameters
    ----------
    df : pandas.DataFrame
        input data
    target : str
        LHS variable
    features : list of str
        RHS variable(s)
    method : str
        model
    m_test : int or index mask, optional (Default value = 1, "jackknife")
        size of test data or index mask of test set. If a mask, n_boot is set to 1
    n_boot : int, optional (Default value = 500)
        number of bootstraps
    is_class : bool, optional (Default value = False)
        if True, maps to integer output
    is_zero_one : bool, optional (Default value = False)
        if True, maps to Boolean output
    to_norm : list, optional (Default value = None)
        variables to norm (z-scores)
    CV_name : str, optional (Default value = None)
        name of cross-validation parameter
    CV_value : float, optional (Default value = None)
        value for cross-validation parameter
    counter_fact : bool, optional (Default value = False)
        if True, variable importance by leaving one feature out at a time
    horizon : int, optional (Default value = None)
        lead-lag size for projection model (only used for VAR)
    save_out : bool, optional (Default value = False)
        if True, save output to file
    save_models : bool, optional (Default value = True)
        if True, include models in output file (could use lots of space)
    file_name : str, optional (Default value = '')
        name of output file
    verbose : bool, optional (Default value = False)
        if True, print basic fit results to screen

    Returns
    -------
    dict, keyed by
        ID : str
            identifier
        mean_train_err : float
            mean in-sample error over all bootstraps
        mean_test_err : float
            mean out-of-sample error over all bootstraps
        train_pred_Y, test_pred_Y : numpy.array
            in-sample and out-of-sample predictions over all bootstraps
        train_ref_Y, test_ref_Y : numpy.array
            training and test target values over all bootstraps
        feat_weights : numpy.array
            feature importances per bootstrap
        test_ind : numpy.array
            index mask of test samples for each bootstrap
        models : numpy.array
            fitted models (always in the returned dict; only written to file
            if save_models is True)
    """
    # definitions and initialisations
    m, n_col = len(df), len(features) + 1
    if data_func.is_iterable(m_test) == True:
        n_boot = 1
    elif m_test == 1:
        n_boot = m  # one fit for each observation
    if method == 'VAR':
        n_boot = m_test = 1

    # empty fields for bootstrapped model output
    test_ref_Y, test_pred_Y = np.array([]), np.array([])    # test target values and out-of-sample predictions
    train_ref_Y, train_pred_Y = np.array([]), np.array([])  # training target values and in-sample predictions
    train_error, test_error = np.array([]), np.array([])    # in- and out-of-sample errors
    boot_errors, models = np.array([]), np.array([])        # mean bootstrap errors and bootstrap models
    feat_weights, test_indices = np.zeros((n_boot, n_col - 1)), np.zeros((n_boot, m))  # feature importances and test index masks over bootstraps

    # input data
    inputs = df.copy()
    if to_norm is not None:  # normalise data (z-scores)
        for var in to_norm:
            if var in inputs.columns:
                vals = inputs[var].values
                inputs[var] = (vals - vals.mean(0)) / vals.std(0, ddof=1)
            else:
                raise ValueError("Norm error: Variable '{0}' not in dataframe.".format(var))

    # loop over bootstrapped samples
    for t in range(n_boot):
        # get training and testing data
        if data_func.is_iterable(m_test) == True:
            df_train, df_test = inputs[~m_test], inputs[m_test]
            test_indices[t, :] = m_test
        else:
            df_train, df_test, is_train = train_test_split(inputs, m_test=m_test, t=t)  # random split
            test_indices[t, :] = ~is_train

        # get values
        x_train, y_train = df_train[features].values, df_train[target].values
        x_test, y_test = df_test[features].values, df_test[target].values

        # set learning methods
        if not method == 'VAR':  # VAR part of statsmodels library (treated differently)
            ML = model_selection(method, n_HN=n_col - 1, CV_name=CV_name, CV_value=CV_value)
            # n_HN only used for neural network (nNeurons=nFeatures in each layer)
        else:  # can only be used with m_test==1
            input_data = inputs[[target] + features].values
            ML = model_selection(method, input_data)
            y_train = y_test = input_data[:, 0]
            if CV_name is None:
                model = ML.fit(maxlags=1)  # model fit, defaults to a VAR with one lag
            else:
                model = ML.fit(**{CV_name: CV_value})  # keyword set dynamically (avoids exec)

        # fit model and train/test predictions
        if method == 'VAR':  # fitted at method selection step (CV_name needed)
            in_pred = np.zeros(m) * np.nan
            for r in range(m):
                start_values = input_data[r, :]
                fcast = model.forecast(start_values.reshape((1, len(features) + 1)), horizon)[-1, 0]
                if r + horizon < m:
                    in_pred[r + horizon] = fcast
            out_pred = in_pred
        else:
            model_clone = skl_base.clone(ML)
            model = ML.fit(x_train, y_train)  # model fit
            out_pred = model.predict(x_test)
            in_pred = model.predict(x_train)

        # get discrete class output & bootstrap error
        if is_class == True:  # target should be an integer
            if is_zero_one == True:  # map to Boolean
                in_pred = data_func.to_zero_one(in_pred).astype(bool)
                out_pred = data_func.to_zero_one(out_pred).astype(bool)
            else:  # map to integer
                in_pred = np.round(in_pred).astype(int)
                out_pred = np.round(out_pred).astype(int)
            boot_errors = np.hstack((boot_errors, np.mean(out_pred != y_test)))
        else:
            if method == 'VAR':
                boot_errors = np.nanmean(np.abs(out_pred - y_test))
            else:
                boot_errors = np.hstack((boot_errors, np.mean(np.abs(out_pred - y_test))))
        models = np.hstack((models, model))  # store model

        # feature importance
        if counter_fact == False:
            if method in ['Tree-rgr', 'Tree-clf', 'Forest-rgr', 'Forest-clf']:
                feat_weights[t] = model.feature_importances_
        # feature importance through "counter-factual" analysis (leave one variable out and compare)
        elif counter_fact == True:  # may slow things down
            for f, feat in enumerate(features):
                model_clone_II = skl_base.clone(model_clone)
                temp_features = list(features)
                temp_features.remove(feat)
                # get training and testing data without the left-out feature
                x_train, x_test = df_train[temp_features].values, df_test[temp_features].values
                temp_model = model_clone_II.fit(x_train, y_train)
                temp_pred = temp_model.predict(x_test)
                if is_class == True:
                    feat_weights[t, f] = np.mean(temp_pred != y_test)
                else:
                    feat_weights[t, f] = np.mean(np.abs(temp_pred - y_test))

        # train Ys
        train_pred_Y = np.hstack((train_pred_Y, in_pred))
        train_ref_Y = np.hstack((train_ref_Y, y_train))
        # test Ys
        test_pred_Y = np.hstack((test_pred_Y, out_pred))
        test_ref_Y = np.hstack((test_ref_Y, y_test))

    # get errors
    if is_class == True:
        train_error = np.mean(train_pred_Y != train_ref_Y)
        test_error = np.mean(test_pred_Y != test_ref_Y)
    else:
        train_error = np.mean(np.abs(train_pred_Y - train_ref_Y))
        test_error = np.mean(np.abs(test_pred_Y - test_ref_Y))

    # verbose
    ID = target + '-' + method + '-' + str(m_test) + '-' + str(n_boot)
    if verbose == True:
        print('\nTraining Summary')
        print('ID: {0}'.format(ID))
        print('\tin-sample error: {0}'.format(round(train_error, 3)))
        print('\tout-of-sample error: {0}'.format(round(test_error, 3)))
        print('\terror variance: {0}'.format(round(np.std(boot_errors, ddof=1), 3)))
        print('\terror signal-to-noise: {0}'.format(round(test_error / np.std(boot_errors, ddof=1), 3)))

    # package output
    out_dict = {'ID': ID,
                'mean_train_err': train_error, 'mean_test_err': test_error,
                'train_pred_Y': train_pred_Y, 'test_pred_Y': test_pred_Y,
                'train_ref_Y': train_ref_Y, 'test_ref_Y': test_ref_Y,
                'feat_weights': feat_weights,
                'test_ind': test_indices}
    if save_models == True:
        out_dict['models'] = np.array(models)
    if save_out == True:
        with open(file_name, 'wb') as f_out:
            pk.dump(out_dict, f_out)
    if save_models == False:  # if not saved to file, still keep models in the (full) returned output
        out_dict['models'] = np.array(models)

    # return output dictionary
    return out_dict
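# Illustrative usage (sketch, not part of the original code): bootstrapped
# training and testing of a random-forest regressor. `panel_df` and the variable
# names are hypothetical placeholders.
#
#     out = ML_train_tester(panel_df, target='CPI', features=['GDP', 'UNEMP'],
#                           method='Forest-rgr', m_test=10, n_boot=200,
#                           to_norm=['GDP', 'UNEMP'], verbose=True)
#     impo, error = get_feat_importance(out['feat_weights'])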
def plot_feat_importance(weights, variance=None, corrs=None, features=None, last=False,
                         y_label=None, x_mark=None, x_mark_label='', title='', color_dict=None,
                         y_mark=None, y_lim=None, color_map='rainbow',
                         save=False, save_name='feature_importance.png'):
    """Plot feature importance: time series or last values.

    Parameters
    ----------
    weights : pandas.DataFrame
        feature importance scores
    variance : array, optional (Default value = None)
        error bands of feature importance scores
    corrs : array, optional (Default value = None)
        correlation between features and target
    features : list of str, optional (Default value = None)
        names of features
    last : bool, optional (Default value = False)
        if True, horizontal bar chart of the last feature importances, else time series
    y_label : str, optional (Default value = None)
        y-axis label
    x_mark : value, optional (Default value = None)
        index value for x-axis reference line
    x_mark_label : str, optional (Default value = '')
        label of x-axis reference line
    title : str, optional (Default value = '')
        plot title
    color_dict : dict, optional (Default value = None)
        dictionary keyed by features with colors as values (if last==False)
    y_mark : value, optional (Default value = None)
        index value for y-axis reference line
    y_lim : [min_value, max_value], optional (Default value = None)
        y-boundaries of plot
    color_map : str, optional (Default value = 'rainbow')
        colormap, see also https://matplotlib.org/examples/color/colormaps_reference.html
    save : bool, optional (Default value = False)
        if True, save plot
    save_name : str, optional (Default value = 'feature_importance.png')
        file name under which to save plot (incl directory)

    Note: the plot can be further adjusted by modifying the code below.
    """
    fsize = 15  # reference fontsize
    if features is None:
        features = weights.columns

    if last == False:  # plot time series
        if color_dict is None:
            fig = weights[features].plot(figsize=(8.5, 6), lw=2, rot=30)
        else:
            color_seq = [color_dict[f] for f in features]
            fig = weights[features].plot(figsize=(8.5, 6), color=color_seq, lw=2, rot=30)
        if x_mark is not None:
            x_mark = list(weights.index).index(x_mark)
            plt.axvline(x_mark, ls='--', lw=2, c='k', label=x_mark_label)
        if y_mark is not None:
            plt.axvline(y_mark, ls='-', lw=1, c='k')
        lgd = fig.legend(bbox_to_anchor=(1.4, 1.02), prop={'size': fsize - 1})
        fig.tick_params(axis='x', labelsize=fsize - 2)
        fig.tick_params(axis='y', labelsize=fsize - 2)
        if y_lim is not None:
            axes = plt.gca()
            axes.set_ylim(y_lim)
        if y_label is None:
            plt.ylabel('max-normed feature importance', fontsize=fsize)
        else:
            plt.ylabel(y_label, fontsize=fsize)
        plt.xlabel('date', fontsize=fsize)
        plt.title(title)
        if save == True:
            plt.savefig(save_name, dpi=200, bbox_extra_artists=(lgd, ), bbox_inches='tight')

    else:  # horizontal bar chart of the last importances, ordered by rank
        if type(weights) == pd.core.frame.DataFrame:
            impo = weights.values[-1, :]
        else:
            impo = weights
        order = impo.argsort()
        ranks = order.argsort()
        if type(variance) == pd.core.frame.DataFrame:
            error = variance.values[-1, :]
        else:
            error = variance  # element-wise aligned with impo (not reordered)
        fig, ax = plt.subplots(figsize=(8.5, 6))

        # get correlation colors
        if not np.array(corrs).shape == ():
            CMAP = plt.get_cmap(color_map)
            cNorm = colors.Normalize(vmin=-1, vmax=1)
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=CMAP)
            COL = scalarMap.to_rgba(corrs)
            ax.barh(ranks, impo, xerr=error, color=COL, align='center')
            scalarMap.set_array([-1, 1])
            cb = fig.colorbar(scalarMap, ax=ax, ticks=np.arange(-1, 1.1, .5))
            cb.set_label('target-feature correlation', rotation=270, fontsize=fsize - 2)
        else:
            ax.barh(ranks, impo, xerr=error, color='r', align='center', alpha=0.4)
        xl = ax.get_xlim()
        if xl[1] > 97:
            ax.set_xlim([0, 110])

        # axes & ticks
        plt.yticks(ranks, features, fontsize=fsize - 2)
        plt.axvline(100, ls='--', lw=0.5, color='k')
        plt.xlabel('max-normed feature importance', fontsize=fsize)
        axes = plt.gca()
        axes.set_xlim(left=0)
        axes.set_ylim([-1, len(features)])
        plt.title(title, fontsize=fsize)
        if save == True:
            plt.savefig(save_name, dpi=200, bbox_inches='tight')

    plt.draw()
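# Illustrative usage (sketch, not part of the original code): bar chart of the
# latest importances. `weights_df` (indexed by date) and `var_df` are
# hypothetical DataFrames of identical shape, e.g. built from repeated calls to
# get_feat_importance.
#
#     plot_feat_importance(weights_df, variance=var_df, last=True,
#                          title='Feature importance', save=False)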
def ML_heatmap(f1, f2, df, features, target, models=None, model_outputs=None, condition='median',
               N=30, ranges=None, to_norm=None, color_norms=None, title='',
               color_map='rainbow', save=False, save_name='ml_heatmap.png'):
    """Heatmap of conditional 2-D model prediction.

    Parameters
    ----------
    f1 : str
        name of first variable feature
    f2 : str
        name of second variable feature
    df : pandas.DataFrame
        input data
    features : list of str
        names of model features (RHS)
    target : str
        name of target variable (LHS)
    models : list-like, optional (Default value = None)
        models to be evaluated. If None, pre-computed model_outputs are needed
    model_outputs : 2-d numpy.array (NxN), optional (Default value = None)
        pre-computed model outputs for the f1-f2 feature ranges and condition
    condition : str or values, optional (Default value = 'median')
        condition for non-variable features; options: 'median', 'mean', 'last',
        an integer row index, or custom values ([target value] + feature values)
    N : int, optional (Default value = 30)
        raster density within ranges
    ranges : [f1_min, f1_max, f2_min, f2_max], optional (Default value = None)
        ranges of variable features
    to_norm : list of str, optional (Default value = None)
        variable names to be normalised (z-scores)
    color_norms : [vmin, vmax], optional (Default value = None)
        range to norm color scale
    title : str, optional (Default value = '')
        plot title
    color_map : str, optional (Default value = 'rainbow')
        colormap, see also https://matplotlib.org/examples/color/colormaps_reference.html
    save : bool, optional (Default value = False)
        if True, save plot
    save_name : str, optional (Default value = 'ml_heatmap.png')
        file name under which to save plot (incl directory)

    Note: the plot can be further adjusted by modifying the code below.

    Returns
    -------
    output : 2-d numpy.array (NxN)
        heatmap values
    """
    data = df.copy()

    # normalise input data
    if to_norm is not None:
        for var in to_norm:
            vals = data[var].values
            data[var] = (vals - vals.mean(0)) / vals.std(0, ddof=1)
    df1f2 = [min(data[f1]), max(data[f1]), min(data[f2]), max(data[f2])]

    # conditioning values for the non-variable features
    if condition == 'median':
        inputs = data[features].median().values.reshape(1, -1)
        z = data[target].median()
    elif condition == 'mean':
        inputs = data[features].mean().values.reshape(1, -1)
        z = data[target].mean()
    elif condition == 'last':
        inputs = data[features].values[-1, :].reshape(1, -1)
        z = data[target].values[-1]
    elif type(condition) == int:
        inputs = data[features].values[condition, :].reshape(1, -1)
        z = data[target].values[condition]
    elif len(condition) == len(features) + 1:  # [target value] + feature values
        inputs = np.array(condition[1:]).reshape(1, -1)
        z = condition[0]
    else:
        raise ValueError('No valid modelling condition given.')

    if ranges is None:
        ranges = df1f2
    elif not len(ranges) == 4:
        raise ValueError('Invalid feature ranges.')

    # model prediction for models and feature ranges
    i1, i2 = features.index(f1), features.index(f2)
    y0, x0 = inputs[0][i1], inputs[0][i2]
    range1 = np.linspace(ranges[0], ranges[1], N)
    range2 = np.linspace(ranges[2], ranges[3], N)
    if model_outputs is None:
        output = np.zeros((len(models), N, N))
        for m, model in enumerate(models):
            for i, val1 in enumerate(range1):
                inputs[0, i1] = val1
                for j, val2 in enumerate(range2):
                    inputs[0, i2] = val2
                    output[m, i, j] = model.predict(inputs)
        output = np.mean(output[:, :, :], 0)  # model mean
    else:
        output = model_outputs

    # figure parameters
    if color_norms is None:
        vals = output.flatten()
        vmin, vmax = min(vals), max(vals)
    elif len(color_norms) == 2:
        vmin, vmax = color_norms
    else:
        raise ValueError('Invalid color norm.')

    # plot
    fig, ax = plt.subplots(figsize=(8, 6))

    # color map
    CMAP = plt.get_cmap(color_map)
    cNorm = colors.Normalize(vmin=vmin, vmax=vmax)
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=CMAP)
    im = ax.imshow(output, origin='lower', cmap=CMAP, vmin=vmin, vmax=vmax, interpolation='hermite')
    ax.autoscale(False)

    # conditioning reference point
    x1 = (x0 - ranges[2]) * N / (ranges[3] - ranges[2]) - .5
    y1 = (y0 - ranges[0]) * N / (ranges[1] - ranges[0]) - .5
    ax.plot(x1, y1, 'wo', ms=20)  # white halo around condition point
    COL = scalarMap.to_rgba(z)
    ax.plot(x1, y1, 'o', c=COL, ms=20, markeredgecolor='w', mew=3)  # condition point

    # labels & ticks
    fsize = 15  # figure base fontsize
    plt.title(title, fontsize=fsize)
    plt.xlabel(f2, fontsize=fsize)
    plt.ylabel(f1, fontsize=fsize)
    tix = [0, int((N - 1) / 4), int((N - 1) / 2), int(3 * (N - 1) / 4), N - 1]
    plt.xticks(tix, np.round(range2[tix], 1), fontsize=fsize - 2)
    plt.yticks(tix, np.round(range1[tix], 1), fontsize=fsize - 2)
    cbar = plt.colorbar(im)
    cbar.set_label(target, fontsize=fsize)
    if save == True:
        plt.savefig(save_name, dpi=200, bbox_inches='tight')
    plt.draw()

    return output
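# Illustrative usage (sketch, not part of the original code): conditional
# prediction surface over two features, reusing the bootstrap models returned by
# ML_train_tester. `out` and `panel_df` are hypothetical placeholders.
#
#     grid = ML_heatmap('GDP', 'UNEMP', panel_df, features=['GDP', 'UNEMP', 'RATE'],
#                       target='CPI', models=out['models'], condition='median', N=30)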
def cond_fan_chart(df_X, df_Y, models, ref_time, cond=True, idx=None, h_ref_line=None, data_return=False,
                   two_class=False, legend_loc='best', y_lim=None, y_label=None, x_label=None, title='',
                   save=False, save_name='cond_fan_chart.png'):
    """Percentile-based fan chart, optionally conditioned on the Y-reference at a reference time.

    Parameters
    ----------
    df_X : pandas.DataFrame
        input data for models
    df_Y : pandas.DataFrame
        target data (single column)
    models : list-like
        fitted models
    ref_time : value
        index value of reference time
    cond : bool, optional (Default value = True)
        if True, force the model mean and percentiles onto the reference point
    idx : str, optional (Default value = None)
        name of index column, if the index is not yet set
    h_ref_line : [value, str], optional (Default value = None)
        y-value and label for a horizontal reference line
    data_return : bool, optional (Default value = False)
        if True, return the plot input data
    two_class : bool, optional (Default value = False)
        if True, two-class classification is assumed
    legend_loc : str or int, optional (Default value = 'best')
        matplotlib legend location
    y_lim : [min_value, max_value], optional (Default value = None)
        y-boundaries of plot
    y_label : str, optional (Default value = None)
        y-axis label
    x_label : str, optional (Default value = None)
        x-axis label
    title : str, optional (Default value = '')
        plot title
    save : bool, optional (Default value = False)
        if True, save plot
    save_name : str, optional (Default value = 'cond_fan_chart.png')
        file name under which to save plot (incl directory)

    Note: the plot can be further adjusted by modifying the code below.

    Returns
    -------
    df : pandas.DataFrame
        internally generated data used for the plot (only if data_return is True)
    """
    # set index (df_X & df_Y need to have the same index)
    if idx is not None:
        df_X.set_index(idx, inplace=True)
        df_Y.set_index(idx, inplace=True)

    # model output values based on X and models
    X = np.zeros((len(models), len(df_X)))
    for i, model in enumerate(models):
        X[i, :] = model.predict(df_X)

    # mean and percentiles, optionally conditioned on the reference point
    df, refY, ref_name = df_X.copy(), df_Y.loc[ref_time][df_Y.columns[0]], df_Y.columns[0]
    df['mean model'], df['median model'] = np.mean(X, axis=0), np.percentile(X, 50, axis=0)
    mean_off = df.loc[ref_time]['mean model'] - refY
    median_off = df.loc[ref_time]['median model'] - refY
    if cond == False:
        df['p25'], df['p75'] = np.percentile(X, 25, axis=0), np.percentile(X, 75, axis=0)
        df['p5'], df['p95'] = np.percentile(X, 5, axis=0), np.percentile(X, 95, axis=0)
        df['p0.5'], df['p99.5'] = np.percentile(X, 0.5, axis=0), np.percentile(X, 99.5, axis=0)
    else:  # shift all series so that mean/median hit the reference point
        df['mean model'] = df['mean model'] - mean_off
        df['median model'] = df['median model'] - median_off
        df['p25'], df['p75'] = np.percentile(X, 25, axis=0) - median_off, np.percentile(X, 75, axis=0) - median_off
        df['p5'], df['p95'] = np.percentile(X, 5, axis=0) - median_off, np.percentile(X, 95, axis=0) - median_off
        df['p0.5'], df['p99.5'] = np.percentile(X, 0.5, axis=0) - median_off, np.percentile(X, 99.5, axis=0) - median_off

    # merge df and df_Y
    df = pd.concat([df_Y, df], axis=1)

    # plotting
    p = df[[ref_name, 'mean model', 'median model']].plot(figsize=(9, 6), linewidth=3,
                                                          style=['bo-', 'gs-', 'rd-'], ms=5, rot=0, alpha=.7)
    # reference
    ref_T = list(df.index.values).index(ref_time)
    p.axvline(ref_T, ls='--', c='k', lw=2)
    p.plot([ref_T], [refY], 'o', markersize=15, color='k', alpha=.5, label='ref.: ' + str(ref_time))

    # percentile bands
    p.fill_between(range(len(df)), df['p25'].values, df['p75'].values, color='r', alpha=.2)
    r50 = patch.Patch(color='r', alpha=.6)
    p.fill_between(range(len(df)), df['p5'].values, df['p95'].values, color='r', alpha=.2)
    r90 = patch.Patch(color='r', alpha=.4)
    p.fill_between(range(len(df)), df['p0.5'].values, df['p99.5'].values, color='r', alpha=.2)
    r99 = patch.Patch(color='r', alpha=.2)

    # add boundaries for two-class classification
    if two_class == True:
        p.axhline(0, ls='-', c='k', lw=.4)
        p.axhline(1, ls='-', c='k', lw=.4)
        if y_lim is not None:
            p.set_ylim(y_lim)
        else:
            p.set_ylim([-.25, 1.5])
        p.set_yticks([0, 1])

    # add reference line and adjust legend ordering
    if h_ref_line is not None:
        p.axhline(h_ref_line[0], ls='-', c='k', lw=3, alpha=.3, label=h_ref_line[1])
        new_index = [0, 5, 3, 1, 6, 4, 2, 7]  # for legend ordering
    else:
        new_index = [0, 4, 3, 1, 5, 2, 6]

    # legend
    fsize = 15
    handles, labels = p.get_legend_handles_labels()
    handles += [r50, r90, r99]
    labels += ['p-50', 'p-90', 'p-99']
    handles = np.array(handles)[new_index]
    labels = np.array(labels)[new_index]
    p.legend(handles, labels, loc=legend_loc, ncol=3, prop={'size': fsize - 2}, numpoints=1)

    # axes & labels
    if y_lim is not None:
        p.set_ylim(y_lim)
    if y_label is not None:
        p.set_ylabel(y_label, fontsize=fsize)
    if x_label is not None:
        p.set_xlabel(x_label, fontsize=fsize)
    p.set_title(title, fontsize=fsize)
    p.tick_params(axis='x', labelsize=fsize - 2)
    p.tick_params(axis='y', labelsize=fsize - 2)

    # save figure
    if save == True:
        plt.savefig(save_name, dpi=200, bbox_inches='tight')
    plt.draw()

    # return underlying data
    if data_return == True:
        return df
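# Illustrative usage (sketch, not part of the original code): fan chart of the
# model ensemble, conditioned on the observed target at a reference date.
# `X_df`, `Y_df`, `out` and the date are hypothetical placeholders.
#
#     fan_df = cond_fan_chart(X_df, Y_df, out['models'], ref_time='2008-09-30',
#                             cond=True, y_label='target', data_return=True)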