def implement_lr_sklearn(x_train,x_test,y_train,y_test): """ Implements logistic regression from scikit learn package. Returns an output dictionary with keys : - label_test : classification predicted by LR for the test set - label_train : classification predicted by LR for the training set - thetas : coefficients of the decision boundary """ from LR_functions import normalize x_train, x_test = normalize(x_train,x_test) from sklearn.grid_search import GridSearchCV from sklearn.linear_model import LogisticRegression print "doing grid search" C_range = 10.0 ** np.arange(-2, 5) param_grid = dict(C=C_range) grid = GridSearchCV(LogisticRegression(), param_grid=param_grid, n_jobs=-1) grid.fit(x_train.values, y_train.NumType.values.ravel()) print "The best classifier is: ", grid.best_estimator_ y_train_LR = grid.best_estimator_.predict(x_train) y_test_LR = grid.best_estimator_.predict(x_test) output = {} output['label_test'] = y_test_LR output['label_train'] = y_train_LR output['thetas'] = grid.best_estimator_.raw_coef_ return output
def plot_2f_variability(dic,x_train,y_train,x_test,y_test):
    """
    Plots decision boundaries for a discrimination problem with 2 features
    in function of the training set draws.
    Superimposed with scatter plots of both training and test sets.

    Parameters
    ----------
    dic : dict
        One entry per training-set draw; each value holds an 'out' dict with
        keys 'thetas', 'threshold', 'rate_test', 'rate_train', 'method',
        'types' (as produced by the classifier driver).
    x_train, x_test : pandas DataFrames with exactly 2 feature columns.
    y_train, y_test : pandas objects carrying a NumType column
        (numeric class codes).
    """
    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes: central scatter + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features renormalized jointly first)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],x_test[feat_2],c=list(y_test.NumType.values),cmap=plt.cm.gray,alpha=.2)
    axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.winter,alpha=.5)

    # Plot one decision boundary per draw (features normalized to [-1,1])
    x_vec = np.arange(-1,1,.01)
    rates = []
    for draw in sorted(dic):
        theta = dic[draw]['out']['thetas']
        t = dic[draw]['out']['threshold']
        rates.append(dic[draw]['out']['rate_test']['global'])
        # Locus where the LR probability equals the threshold t[1],
        # solved for the second feature.
        db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec)
        axScatter.plot(x_vec,db,lw=1.,c=(0,0.1*draw,1))

    # Highlight the boundary of the draw with the best global test rate.
    # NOTE(review): np.argmax returns a *position* in rates; using it as a
    # key of dic is only correct if the keys are consecutive integers
    # starting at 0 — TODO confirm.
    imax = np.argmax(rates)
    theta = dic[imax]['out']['thetas']
    t = dic[imax]['out']['threshold']
    method = dic[imax]['out']['method'].upper()
    types = dic[imax]['out']['types']
    lab = r'%s %.1f$\pm$%.1f%%'%(method,np.mean(rates),np.std(rates))
    db_max = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec)
    axScatter.plot(x_vec,db_max,lw=3.,c='midnightblue',label=lab)
    axScatter.legend(loc=4)
    x_vec, y_vec, proba, map = class_2c_2f(theta,t)
    axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates: training-set summary in the figure margin
    pos_x = .76
    pos_y_ini = .95
    pas = .025
    plt.figtext(pos_x,pos_y_ini,'Training set %s'%method)
    rate_tr,rate_tr_1, rate_tr_2 = [],[],[]
    for draw in sorted(dic):
        p = dic[draw]['out']['rate_train']
        rate_tr.append(p['global'])
        # per-class entries are keyed by (class_name, class_index) tuples
        for key in sorted(p):
            if key != 'global':
                cl, icl = key[0], key[1]
                if icl == 1:
                    rate_tr_1.append(p[(cl,icl)])
                else:
                    rate_tr_2.append(p[(cl,icl)])
    plt.figtext(pos_x,pos_y_ini-1*pas,'Global : %.1f$\pm$%.1f%%'%(np.mean(rate_tr),np.std(rate_tr)))
    plt.figtext(pos_x,pos_y_ini-2*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[0],0,np.mean(rate_tr_1),np.std(rate_tr_1)))
    plt.figtext(pos_x,pos_y_ini-3*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[1],1,np.mean(rate_tr_2),np.std(rate_tr_2)))

    # Same summary for the test set, placed below the training block
    pos_y_ini = pos_y_ini-4.5*pas
    pas = .025
    plt.figtext(pos_x,pos_y_ini,'Test set %s'%method)
    rate_test,rate_test_1, rate_test_2 = [],[],[]
    for draw in sorted(dic):
        p = dic[draw]['out']['rate_test']
        rate_test.append(p['global'])
        for key in sorted(p):
            if key != 'global':
                cl, icl = key[0], key[1]
                if icl == 1:
                    rate_test_1.append(p[(cl,icl)])
                else:
                    rate_test_2.append(p[(cl,icl)])
    plt.figtext(pos_x,pos_y_ini-1*pas,'Global : %.1f$\pm$%.1f%%'%(np.mean(rate_test),np.std(rate_test)))
    plt.figtext(pos_x,pos_y_ini-2*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[0],0,np.mean(rate_test_1),np.std(rate_test_1)))
    plt.figtext(pos_x,pos_y_ini-3*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[1],1,np.mean(rate_test_2),np.std(rate_test_2)))
def plot_2f_all(out,x_train,y_train,x_test,y_test,x_bad,out_comp=None,map_nl=None):
    """
    Plots decision boundaries for a discrimination problem with 2 features.
    Superimposed with scatter plots of both training and test sets.
    If out_comp : comparison of the decision boundary from another method
    If map_nl : map of the non-linear decision boundary computed by SVM

    Parameters
    ----------
    out : dict with keys 'thetas', 'threshold', 'rate_test', 'method',
        'types', 'rate_train'.
    x_train, x_test : pandas DataFrames with 2 feature columns.
    y_train, y_test : pandas objects with a NumType column.
    x_bad : misclassified test points (same columns as x_test).
    out_comp : optional dict, same structure as out, for a second method.
    map_nl : optional dict with keys 'map', 'method', 'rate_test'.
    """
    theta = out['thetas']
    t = out['threshold']
    rate = out['rate_test']
    method = out['method']
    str_t = out['types']
    p_train = out['rate_train']
    if out_comp:
        th_comp = out_comp['thetas']
        t_comp = out_comp['threshold']
        p_comp = out_comp['rate_test']
        met_comp = out_comp['method']
    # NOTE(review): len(theta) == 2 matches neither branch, which would
    # leave x_vec/y_vec/proba/map undefined below — confirm that this case
    # cannot occur (2-class problems appear to store a single theta entry).
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, map = class_multi_2f(theta)
    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, map = class_2c_2f(theta,t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes: central scatter + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features renormalized jointly first);
    # misclassified points (x_bad) overplotted in red
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],x_test[feat_2],c=list(y_test.NumType.values),cmap=plt.cm.gray,alpha=.2)
    axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.winter,alpha=.5)
    axScatter.scatter(x_bad[feat_1],x_bad[feat_2],c='r',alpha=.2)

    # Plot decision boundaries, one per theta entry (orange = main method,
    # purple = comparison method if provided)
    for i in sorted(theta):
        db = -1./theta[i][2]*(theta[i][0]+np.log((1-t[i])/t[i])+theta[i][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=3.,c='orange')
        if out_comp:
            db = -1./th_comp[i][2]*(th_comp[i][0]+np.log((1-t_comp[i])/t_comp[i])+th_comp[i][1]*x_vec[0])
            axScatter.plot(x_vec[0],db,lw=3.,c='purple')
    if map_nl:
        axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)

    # Legend: one entry per plotted method with its global test rate
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    if out_comp:
        label.append('%s (%.2f%%)'%(met_comp.upper(),p_comp['global']))
    if map_nl:
        label.append('%s (%.2f%%)'%(map_nl['method'].upper(),map_nl['rate_test']['global']))
    axScatter.legend(label,loc=2,prop={'size':10})

    # Per-class training rates and global test rate as in-axes text
    if p_train:
        x_pos = .7
        y_pos = .95
        pas = .05
        axScatter.text(x_pos,y_pos,"%s %% %s"%(p_train[(str_t[0],0)],str_t[0]),color='b',transform=axScatter.transAxes)
        axScatter.text(x_pos,y_pos-pas,"%s %% %s"%(p_train[(str_t[1],1)],str_t[1]),color='g',transform=axScatter.transAxes)
        axScatter.text(x_pos,y_pos-2*pas,"%.2f %% test set"%rate['global'],transform=axScatter.transAxes)
        axScatter.text(x_pos,y_pos-3*pas,"%.2f %% test set"%(100-rate['global']),color='r',transform=axScatter.transAxes)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out,out_comp=out_comp,map_nl=map_nl)
def plot_2f_synth_var(out,x_train,x_test,y_test):
    """
    Plots decision boundaries for a discrimination problem with 2 classes
    and 2 features.

    Parameters
    ----------
    out : dict keyed by draw index; each value holds 'thetas', 'threshold',
        'rate_test', 'method'.
        If out has more than one entry, the variability across training
        draws is shown; otherwise the variability with the decision
        threshold is shown.
    x_train, x_test : pandas DataFrames with 2 feature columns.
    y_test : pandas object with a NumType column.
    """
    theta_first = out[0]['thetas']
    rate_first = out[0]['rate_test']
    t_first = out[0]['threshold']
    method = out[0]['method']
    # NOTE(review): len(theta_first) == 2 matches neither branch and would
    # leave x_vec/y_vec/proba/map undefined — confirm this cannot occur.
    if len(theta_first) > 2:
        NB_class = len(theta_first)
        x_vec, y_vec, proba, map = class_multi_2f(theta_first)
    elif len(theta_first) == 1:
        NB_class = 2
        x_vec, y_vec, proba, map = class_2c_2f(theta_first,t_first)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes: central scatter + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features renormalized jointly first)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)

    # Plot decision boundaries
    # VARIABILITY OF THE LR DECISION BOUNDARY
    if len(out) > 1:
        rates = []
        for i in sorted(out):
            theta = out[i]['thetas']
            t = out[i]['threshold']
            # boundary where LR probability equals threshold t[1]
            db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
            axScatter.plot(x_vec[0],db,lw=1.,c=(0,0.1*i,1))
            rates.append(out[i]['rate_test']['global'])
        # NOTE(review): np.argmax gives a position in rates; using it as a
        # key of out is only correct for consecutive integer keys from 0.
        imax = np.argmax(rates)
        theta = out[imax]['thetas']
        t = out[imax]['threshold']
        db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=3.,c='midnightblue')
        x_vec, y_vec, proba, map = class_2c_2f(theta,t)
        axScatter.contourf(x_vec,y_vec,map,cmap=plt.cm.gray,alpha=.2)
        # NOTE(review): lim_plot is not defined in this function — presumably
        # a module-level constant; verify it exists at import time.
        axScatter.text(0.6*lim_plot,-0.9*lim_plot,r'%.1f$\pm$%.1f%%'%(np.mean(rates),np.std(rates)))
    # VARIABILITY WITH THE THRESHOLD
    else:
        #for thres in np.arange(0,1,.1):
        #  db = -1./theta[0][1][2]*(theta[0][1][0]+np.log((1-thres)/thres)+theta[0][1][1]*x_vec[0])
        #  axScatter.plot(x_vec[0],db,lw=1.,c=(0,thres,1))
        # NOTE(review): g is imported but not used in this branch.
        from LR_functions import g
        # iso-probability contours, darker blue = lower probability
        blue_scale = []
        for i in range(10):
            blue_scale.append((0,i*0.1,1))
        CS = axScatter.contour(x_vec,y_vec,proba,10,colors=blue_scale)
        axScatter.clabel(CS, inline=1, fontsize=10)
        theta = out[0]['thetas']
        t = out[0]['threshold']
        rate = out[0]['rate_test']
        if NB_class == 2:
            db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
            axScatter.plot(x_vec[0],db,lw=3.,c='midnightblue')
            axScatter.contourf(x_vec,y_vec,map,cmap=plt.cm.gray,alpha=.2)
            axScatter.text(0.6*lim_plot,-0.9*lim_plot,'LR (%.1f%%)'%rate['global'])
            axScatter.text(0.6*lim_plot,-0.8*lim_plot,'t = %.1f'%t[1])

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out)
def plot_2f_nonlinear(out,x_train,x_test,y_test,y_train=None,synth=False):
    """
    Plots a non-linear decision boundary (classification map) for a
    2-feature problem, with scatter plots of the test set (and optionally
    the training set) plus marginal histograms/PDFs.

    Parameters
    ----------
    out : dict with keys 'map' (2-D classification map over [-1,1]^2),
        'rate_test' and 'method'.
    x_train, x_test : pandas DataFrames with 2 feature columns.
    y_test : pandas object with Type and NumType columns.
    y_train : optional; when given, the training set is overplotted.
    synth : True for synthetic data (Gaussian PDFs and synthetic limits).
    """
    NB_class = len(np.unique(y_test.Type.values))
    # local name chosen to avoid shadowing the builtin map()
    class_map = out['map']
    rate = out['rate_test']
    method = out['method']

    # Grid matching the classification map resolution over [-1,1]^2
    pas = .01
    x_vec = np.arange(-1,1,pas)
    y_vec = np.arange(-1,1,pas)
    x_vec, y_vec = np.meshgrid(x_vec,y_vec)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes: central scatter + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features renormalized jointly first)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)
    # BUGFIX: "if y_train:" raised ValueError on a pandas object (ambiguous
    # truth value); the intent is simply "was a training set provided?".
    if y_train is not None:
        axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.YlOrRd)

    # Plot decision boundaries as a shaded classification map
    axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.3)
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    axScatter.legend(label,loc=4,prop={'size':14})

    # Determine nice limits by hand
    if synth:
        bins_1, bins_2 = plot_limits_synth(x_test)
    else:
        bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs (Gaussian model for synthetics, KDE otherwise)
    if synth:
        plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    else:
        plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out)
def plot_2f_synthetics(out,x_train,x_test,y_test,y_train=None,out_comp=None,map_nl=None):
    """
    Plots decision boundaries and scatter plots for synthetic tests with
    2 features.

    Parameters
    ----------
    out : dict with keys 'thetas', 'threshold', 'rate_test', 'method'.
    x_train, x_test : pandas DataFrames with 2 feature columns.
    y_test : pandas object with a NumType column.
    y_train : optional; when given, the training set is overplotted.
    out_comp : optional dict, same structure as out, for a second method.
    map_nl : optional dict with keys 'map', 'method', 'rate_test' (non-linear
        SVM classification map).
    """
    theta = out['thetas']
    rate = out['rate_test']
    t = out['threshold']
    method = out['method']
    # NOTE(review): len(theta) == 2 matches neither branch and would leave
    # x_vec/y_vec/proba/class_map undefined — confirm this cannot occur.
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, class_map = class_multi_2f(theta)
    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, class_map = class_2c_2f(theta,t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes: central scatter + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features renormalized jointly first)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)
    # BUGFIX: "if y_train:" raised ValueError on a pandas object (ambiguous
    # truth value); the intent is simply "was a training set provided?".
    if y_train is not None:
        axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.YlOrRd)

    # Plot decision boundaries, one per theta entry
    if out_comp:
        colors = ['b','c']
    else:
        colors = ['pink']
    for i in sorted(theta):
        db = -1./theta[i][2]*(theta[i][0]+np.log((1-t[i])/t[i])+theta[i][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=2.,c=colors[0])
        if out_comp:
            th_comp = out_comp['thetas']
            t_comp = out_comp['threshold']
            db = -1./th_comp[i][2]*(th_comp[i][0]+np.log((1-t_comp[i])/t_comp[i])+th_comp[i][1]*x_vec[0])
            axScatter.plot(x_vec[0],db,lw=3.,c=colors[1])
    if map_nl:
        axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.3)

    # Legend: one entry per plotted method with its global test rate
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    if out_comp:
        label.append('%s (%.2f%%)'%(out_comp['method'].upper(),out_comp['rate_test']['global']))
    if map_nl:
        label.append('%s (%.2f%%)'%(map_nl['method'].upper(),map_nl['rate_test']['global']))
    # shrink the font when several methods share the legend
    if len(label) == 1:
        s = 14
    else:
        s = 11
    axScatter.legend(label,loc=4,prop={'size':s})

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out,out_comp=out_comp,map_nl=map_nl)
def classifier(opt):
    """
    Classification of the different types of events.
    opt is an object of the class Options()

    Driver: for each station/component (opt.xs key), draws training/CV/test
    sets opt.opdict['boot'] times, runs the method selected by
    opt.opdict['method'] ('1b1', 'ova', 'svm', 'svm_nl', 'lrsk', 'lr'),
    computes confusion matrices and success rates, optionally plots the
    decision boundaries, and writes all results to
    opt.opdict['result_path'].

    Mutates opt in place (opt.x, opt.y, opt.train_x, opt.train_y, opt.out,
    opt.map, ...).
    """
    list_attr = opt.__dict__.keys()
    if not 'x' in list_attr:
        opt.do_tri()
    X = opt.x
    Y = opt.y

    list_attr = opt.__dict__.keys()
    if 'train_x' in list_attr:
        X_TRAIN = opt.train_x
        Y_TRAIN = opt.train_y

    dic_results = {}
    for isc in sorted(opt.xs):
        print "==========",opt.trad[isc],"=========="
        subdic = {}

        # marker_sta == 1 when this entry belongs to the same station as the
        # previous one (so the same event draw is reused across components).
        # NOTE(review): if the first key of opt.xs is > 0, sta_prev is read
        # before assignment (NameError) — confirm keys always start at 0.
        if isc > 0:
            if opt.trad[isc][0] == sta_prev:
                marker_sta = 1
            else:
                marker_sta = 0
                sta_prev = opt.trad[isc][0]
        else:
            marker_sta = 0
            sta_prev = opt.trad[isc][0]

        if len(opt.xs[isc]) == 0:
            continue

        # About the training set: reuse saved draws from train_file when
        # bootstrapping a single station, or take the pre-defined training
        # set when one was attached to opt.
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
            if os.path.exists(opt.opdict['train_file']):
                print opt.opdict['train_file']
                TRAIN_Y = read_binary_file(opt.opdict['train_file'])
            else:
                TRAIN_Y = {}
                for tir in range(opt.opdict['boot']):
                    TRAIN_Y[tir] = {}
        elif 'train_x' in list_attr:
            opt.x = opt.xs_train[isc]
            opt.y = opt.ys_train[isc]
            if opt.opdict['plot_pdf']:
                opt.compute_pdfs()
                g_train = opt.gaussians
                del opt.gaussians
            opt.classname2number()
            x_ref_train = opt.x
            y_ref_train = opt.y

        # About the test set
        opt.x = opt.xs[isc]
        opt.y = opt.ys[isc]
        if opt.opdict['plot_pdf']:
            opt.compute_pdfs()

        # NOTE(review): 'set' shadows the builtin set() for the rest of the
        # function body.
        set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
        set['Otime'] = opt.xs[isc].index

        opt.classname2number()
        x_test = opt.x
        y_ref = opt.y
        x_ref = opt.x

        if opt.opdict['plot_dataset']:
            opt.composition_dataset()

        #K = len(opt.types)

        ### ITERATE OVER TRAINING SET DRAWS ###
        for b in range(opt.opdict['boot']):
            print "\n-------------------- # iter: %d --------------------\n"%(b+1)
            subsubdic = {}
            print "WHOLE SET", x_ref.shape, y_ref.shape

            ### if there is no pre-defined training set ###
            if 'train_x' not in list_attr:
                x_train = x_test.copy()
                if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
                    # reuse the stored draw when available, otherwise draw
                    # fresh sets and remember their event indices
                    if len(TRAIN_Y[b]) > 0:
                        y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
                        y_train = y_train.dropna(how='any')
                        y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
                        y_cv = y_cv.dropna(how='any')
                        y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
                        y_test = y_test.dropna(how='any')
                    else:
                        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
                        TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
                        TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
                        TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))
                ### multi-stations case ###
                else:
                    # first component of a station draws the sets; later
                    # components of the same station reuse the event lists
                    if marker_sta == 0:
                        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
                        list_ev_train = y_train.index
                        list_ev_cv = y_cv.index
                        list_ev_test = y_test.index
                    else:
                        y_train = y_ref.reindex(index=list_ev_train)
                        y_train = y_train.dropna(how='any')
                        y_cv = y_ref.reindex(index=list_ev_cv)
                        y_cv = y_cv.dropna(how='any')
                        y_test = y_ref.reindex(index=list_ev_test)
                        y_test = y_test.dropna(how='any')
                x_train = x_ref.reindex(index=y_train.index)
            ### if a training set was pre-defined ###
            else:
                x_train = x_ref_train.copy()
                y_train = y_ref_train.copy()
                y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

            x_cv = x_ref.reindex(index=y_cv.index)
            x_test = x_ref.reindex(index=y_test.index)

            # Keep the original event indices, then renumber each set 0..n-1
            # so x and y line up positionally; abort on any size mismatch.
            i_train = y_train.index
            x_train.index = range(x_train.shape[0])
            y_train.index = range(y_train.shape[0])
            print "TRAINING SET", x_train.shape, y_train.shape
            if x_train.shape[0] != y_train.shape[0]:
                print "Training set: Incoherence in x and y dimensions"
                sys.exit()

            i_cv = y_cv.index
            x_cv.index = range(x_cv.shape[0])
            y_cv.index = range(y_cv.shape[0])
            print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
            if x_cv.shape[0] != y_cv.shape[0]:
                print "Cross-validation set: Incoherence in x and y dimensions"
                sys.exit()

            subsubdic['list_ev'] = np.array(y_test.index)
            i_test = y_test.index
            x_test.index = range(x_test.shape[0])
            y_test.index = range(y_test.shape[0])
            print "TEST SET", x_test.shape, y_test.shape
            if x_test.shape[0] != y_test.shape[0]:
                print "Test set: Incoherence in x and y dimensions"
                sys.exit()

            opt.train_x = x_train
            opt.x = x_test
            opt.train_y = y_train
            opt.y = y_test

            if opt.opdict['plot_pdf']:
                opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
                if 'train_x' in list_attr:
                    opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
                else:
                    opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

            if opt.opdict['method'] == '1b1':
                # EXTRACTEURS (one-by-one extraction)
                print "********** EXTRACTION 1-BY-1 **********"
                opt.opdict['boot'] = 1
                one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
                continue
            elif opt.opdict['method'] == 'ova':
                print "********** EXTRACTION 1-VS-ALL **********"
                opt.opdict['boot'] = 1
                one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
                continue
            elif opt.opdict['method'] in ['svm','svm_nl']:
                # SVM
                print "********** SVM **********"
                if opt.opdict['method'] == 'svm':
                    kern = 'Lin'
                else:
                    kern = 'NonLin'
                out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])
                if 'map' in sorted(out):
                    opt.map = out['map']
                if 'thetas' in sorted(out):
                    # sklearn stores [coef..., intercept]; reorder to the
                    # project convention [intercept, coef...] keyed from 1
                    theta_vec = out['thetas']
                    theta,threshold = {},{}
                    for it in range(len(theta_vec)):
                        theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
                        threshold[it+1] = 0.5
                    out['thetas'] = theta
                    out['threshold'] = threshold
            elif opt.opdict['method'] == 'lrsk':
                # LOGISTIC REGRESSION (scikit learn)
                print "********* Logistic regression (sklearn) **********"
                out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
                threshold, theta = {},{}
                for it in range(len(out['thetas'])):
                    threshold[it+1] = 0.5
                    theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
                out['threshold'] = threshold
                out['thetas'] = theta
            elif opt.opdict['method'] == 'lr':
                # LOGISTIC REGRESSION (in-house implementation)
                print "********* Logistic regression **********"
                from LR_functions import do_all_logistic_regression
                out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
                theta = out['thetas']
                threshold = out['threshold']
                if 'learn_file' in sorted(opt.opdict):
                    learn_filename = opt.opdict['learn_file']
                    if not os.path.exists(learn_filename):
                        wtr = write_binary_file(learn_filename,i_train)

            CLASS_test = out['label_test']
            CLASS_train = out['label_train']

            # TRAINING SET confusion matrix and per-class success rates
            print "\t *TRAINING SET"
            y_train_np = y_train.NumType.values.ravel()
            from sklearn.metrics import confusion_matrix
            cmat_train = confusion_matrix(y_train_np,CLASS_train)
            p_tr = dic_percent(cmat_train,opt.types,verbose=True)
            out['rate_train'] = p_tr
            print " Global : %.2f%%"%p_tr['global']
            if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
                plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
                if opt.opdict['save_confusion']:
                    savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
                    print "Confusion matrix saved in %s"%savefig
                    plt.savefig(savefig)

            # TEST SET confusion matrix and per-class success rates
            print "\t *TEST SET"
            y_test_np = y_test.NumType.values.ravel()
            cmat_test = confusion_matrix(y_test_np,CLASS_test)
            p_test = dic_percent(cmat_test,opt.types,verbose=True)
            out['rate_test'] = p_test
            print " Global : %.2f%%"%p_test['global']
            if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
                plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
                if opt.opdict['save_confusion']:
                    savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
                    print "Confusion matrix saved in %s"%savefig
                    plt.savefig(savefig)
                if opt.opdict['plot_confusion']:
                    plt.show()
                else:
                    plt.close()

            # PLOT PRECISION AND RECALL
            if opt.opdict['plot_prec_rec']:
                from LR_functions import normalize,plot_precision_recall
                x_train, x_test = normalize(x_train,x_test)
                plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

            pourcentages = (p_tr['global'],p_test['global'])
            out['method'] = opt.opdict['method']
            out['types'] = opt.types
            opt.out = out

            # PLOT DECISION BOUNDARIES
            n_feat = x_train.shape[1] # number of features
            if n_feat < 4:
                if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
                    print "\nPLOTTING"
                    print "Theta values:",theta
                    print "Threshold:", threshold

                    # COMPARE AND PLOT LR AND SVM RESULTS
                    out_svm, out_nl = {},{}
                    # NOTE(review): 'dir' shadows the builtin dir() here.
                    dir = '%s_SEP'%opt.opdict['method'].upper()
                    if opt.opdict['method']=='lr' and opt.opdict['compare']:
                        dir = 'LR_SVM_SEP'
                        out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
                        cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
                        cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
                        svm_ptr = dic_percent(cmat_svm_tr,opt.types)
                        svm_pt = dic_percent(cmat_svm_test,opt.types)
                        theta_svm,t_svm = {},{}
                        for it in range(len(out_svm['thetas'])):
                            theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
                            t_svm[it+1] = 0.5
                        out_svm['thetas'] = theta_svm
                        out_svm['threshold'] = t_svm
                        out_svm['rate_test'] = svm_pt
                        out_svm['rate_train'] = svm_ptr
                        out_svm['method'] = 'SVM'

                    if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
                        dir = '%s_NL_SEP'%opt.opdict['method'].upper()
                        out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
                        cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
                        cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
                        svm_ptr = dic_percent(cmat_svm_tr,opt.types)
                        svm_pt = dic_percent(cmat_svm_test,opt.types)
                        out_nl['rate_test'] = svm_pt
                        out_nl['rate_train'] = svm_ptr
                        out_nl['method'] = 'SVM_NL'

                    save_dir = os.path.join(opt.opdict['fig_path'],dir)
                    opt.verify_and_create(save_dir)

                    from LR_functions import normalize
                    x_train, x_test = normalize(x_train,x_test)

                    # split both sets into correctly / wrongly classified events
                    x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
                    x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
                    good_train = y_train.reindex(index=x_train_good.index)
                    x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
                    x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

                    # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
                    if n_feat == 1 and len(opt.opdict['types']) == 2:
                        name = opt.opdict['feat_list'][0]
                        from plot_functions import plot_hyp_func_1f, histo_pdfs
                        if opt.opdict['method']=='lr' and opt.opdict['compare']:
                            plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
                        else:
                            #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
                            plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)
                    # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
                    elif n_feat == 2:
                        name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
                        # NOTE(review): the branch below for 'lr'+compare is
                        # unreachable: 'lr' is already caught by the first
                        # condition, so out_comp/map_nl are never passed.
                        if opt.opdict['method'] in ['lr','svm']:
                            from plot_2features import plot_2f_all
                            plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
                        elif opt.opdict['method']=='lr' and opt.opdict['compare']:
                            from plot_2features import plot_2f_all
                            plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
                        elif opt.opdict['method'] == 'svm_nl':
                            from plot_2features import plot_2f_nonlinear
                            # NOTE(review): plot_2f_nonlinear's signature is
                            # (out,x_train,x_test,y_test,y_train=None,...);
                            # this call passes y_train positionally AND as a
                            # keyword -> TypeError. Should probably read
                            # plot_2f_nonlinear(out,x_train,x_test,y_test,y_train=y_train).
                            plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train)
                    # PLOT FOR 3 ATTRIBUTES
                    elif n_feat == 3:
                        from plot_functions import plot_db_3d
                        plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
                        plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
                        name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

                    if opt.opdict['save_sep']:
                        savename = '%s/CL_sep_%s.png'%(save_dir,name)
                        print "Figure saved in %s"%savename
                        plt.savefig(savename)
                    if opt.opdict['plot_sep']:
                        plt.show()
                    else:
                        plt.close()

            # WRITE RESULTS INTO A DICTIONARY
            subsubdic['%'] = pourcentages
            # translate numeric predictions back to class names
            trad_CLASS_test = []
            for i in CLASS_test:
                i = int(i)
                trad_CLASS_test.append(opt.types[i])
            subsubdic['classification'] = trad_CLASS_test
            if opt.opdict['probas']:
                subsubdic['proba'] = out['probas']
            if opt.opdict['plot_var']:
                subsubdic['out'] = out
            subdic[b] = subsubdic

        if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
            from plot_2features import plot_2f_variability
            plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
            plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
            plt.show()

        dic_results[opt.trad[isc]] = subdic

    dic_results['header'] = {}
    dic_results['header']['features'] = opt.opdict['feat_list']
    dic_results['header']['types'] = opt.opdict['types']
    dic_results['header']['catalog'] = opt.opdict['label_test']

    if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
        print "Save results in file %s"%opt.opdict['result_path']
        write_binary_file(opt.opdict['result_path'],dic_results)

    # Persist the training-set draws so later runs reuse identical splits
    if 'train_file' in sorted(opt.opdict):
        if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
            write_binary_file(opt.opdict['train_file'],TRAIN_Y)
def implement_svm(x_train,x_test,y_train,y_test,types,opdict,kern='NonLin',proba=False): """ Implements SVM from scikit learn package. Options : - kernel : could be 'Lin' (for linear) or 'NonLin' (for non-linear). In the latter case, the kernel is a gaussian kernel. - proba : tells if the probability estimates must be computed Returns an output dictionary with keys : - label_test : classification predicted by SVM for the test set - label_train : classification predicted by SVM for the training set If proba is True, add the key 'probas' containing the probability estimates for each element of the dataset If kernel is linear, add the key 'thetas' containing the coefficients of the linear decision boundary If kernel is non linear, add the key "map" containing the classification map. """ from LR_functions import normalize x_train, x_test = normalize(x_train,x_test) # do grid search from sklearn.grid_search import GridSearchCV from sklearn import svm print "doing grid search" C_range = 10.0 ** np.arange(-2, 5) if kern == 'NonLin': gamma_range = 10.0 ** np.arange(-3,3) param_grid = dict(gamma=gamma_range, C=C_range) grid = GridSearchCV(svm.SVC(probability=proba), param_grid=param_grid, n_jobs=-1) elif kern == 'Lin': param_grid = dict(C=C_range) grid = GridSearchCV(svm.LinearSVC(), param_grid=param_grid, n_jobs=-1) grid.fit(x_train.values, y_train.NumType.values.ravel()) print "The best classifier is: ", grid.best_estimator_ if kern == 'NonLin': print "Number of support vectors for each class: ", grid.best_estimator_.n_support_ y_train_SVM = grid.best_estimator_.predict(x_train) y_test_SVM = grid.best_estimator_.predict(x_test) output = {} output['label_test'] = y_test_SVM output['label_train'] = y_train_SVM if proba: probabilities = grid.best_estimator_.predict_proba(x_test) output['probas'] = {} NB_class = len(types) for k in range(NB_class): output['probas'][types[k]] = probabilities[:,k] if kern == 'Lin': output['thetas'] = grid.best_estimator_.raw_coef_ elif 
len(x_train.columns) == 2: pas = .01 x_vec, y_vec = np.arange(-1,1,pas), np.arange(-1,1,pas) x_vec, y_vec = np.meshgrid(x_vec,y_vec) vec = np.c_[x_vec.ravel(),y_vec.ravel()] print vec.shape map = grid.best_estimator_.predict(np.c_[x_vec.ravel(),y_vec.ravel()]) output['map'] = map.reshape(x_vec.shape) return output
def plot_2f_variability(dic, x_train, y_train, x_test, y_test):
    """
    Plots decision boundaries for a discrimination problem with 2 features
    in function of the training set draws.
    Superimposed with scatter plots of both training and test sets.

    dic maps a draw identifier to a dict with an 'out' entry holding
    'thetas', 'threshold', 'rate_test', 'rate_train', 'method' and 'types'.
    """

    def _collect(rate_key):
        # Gather the global and per-class success rates over all draws for
        # the given key ('rate_train' or 'rate_test').
        # NOTE(review): entries with class index 1 go to the first list and
        # are displayed against types[0] below, mirroring the original
        # pairing — confirm class indices are 1-based.
        r_glob, r_c1, r_c2 = [], [], []
        for draw in sorted(dic):
            p = dic[draw]['out'][rate_key]
            r_glob.append(p['global'])
            for key in sorted(p):
                if key != 'global':
                    cl, icl = key[0], key[1]
                    if icl == 1:
                        r_c1.append(p[(cl, icl)])
                    else:
                        r_c2.append(p[(cl, icl)])
        return r_glob, r_c1, r_c2

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot (features normalized to [-1,1)):
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1], x_test[feat_2], c=list(y_test.NumType.values), cmap=plt.cm.gray, alpha=.2)
    axScatter.scatter(x_train[feat_1], x_train[feat_2], c=list(y_train.NumType.values), cmap=plt.cm.winter, alpha=.5)

    # Plot one decision boundary per draw
    # (assumes draw keys are small integers: the line color (0, 0.1*draw, 1)
    # must stay within [0,1] — TODO confirm)
    x_vec = np.arange(-1, 1, .01)
    rates = []
    draws = sorted(dic)
    for draw in draws:
        theta = dic[draw]['out']['thetas']
        t = dic[draw]['out']['threshold']
        rates.append(dic[draw]['out']['rate_test']['global'])
        db = -1. / theta[1][2] * (theta[1][0] + np.log((1 - t[1]) / t[1]) + theta[1][1] * x_vec)
        axScatter.plot(x_vec, db, lw=1., c=(0, 0.1 * draw, 1))

    # Highlight the draw with the best global test rate.
    # Fix: np.argmax returns a position in `rates`, which is aligned with
    # `draws`, not necessarily a key of dic — map it back through `draws`.
    imax = np.argmax(rates)
    best = draws[imax]
    theta = dic[best]['out']['thetas']
    t = dic[best]['out']['threshold']
    method = dic[best]['out']['method'].upper()
    types = dic[best]['out']['types']
    lab = r'%s %.1f$\pm$%.1f%%' % (method, np.mean(rates), np.std(rates))
    db_max = -1. / theta[1][2] * (theta[1][0] + np.log((1 - t[1]) / t[1]) + theta[1][1] * x_vec)
    axScatter.plot(x_vec, db_max, lw=3., c='midnightblue', label=lab)
    axScatter.legend(loc=4)

    # Shade the classification regions of the best draw
    x_vec, y_vec, proba, map = class_2c_2f(theta, t)
    axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates (mean +/- std over draws)
    pos_x = .76
    pos_y_ini = .95
    pas = .025

    plt.figtext(pos_x, pos_y_ini, 'Training set %s' % method)
    rate_tr, rate_tr_1, rate_tr_2 = _collect('rate_train')
    plt.figtext(pos_x, pos_y_ini - 1 * pas, 'Global : %.1f$\pm$%.1f%%' % (np.mean(rate_tr), np.std(rate_tr)))
    plt.figtext(pos_x, pos_y_ini - 2 * pas, '%s (%d) : %.1f$\pm$%.1f%%' % (types[0], 0, np.mean(rate_tr_1), np.std(rate_tr_1)))
    plt.figtext(pos_x, pos_y_ini - 3 * pas, '%s (%d) : %.1f$\pm$%.1f%%' % (types[1], 1, np.mean(rate_tr_2), np.std(rate_tr_2)))

    pos_y_ini = pos_y_ini - 4.5 * pas
    plt.figtext(pos_x, pos_y_ini, 'Test set %s' % method)
    rate_test, rate_test_1, rate_test_2 = _collect('rate_test')
    plt.figtext(pos_x, pos_y_ini - 1 * pas, 'Global : %.1f$\pm$%.1f%%' % (np.mean(rate_test), np.std(rate_test)))
    plt.figtext(pos_x, pos_y_ini - 2 * pas, '%s (%d) : %.1f$\pm$%.1f%%' % (types[0], 0, np.mean(rate_test_1), np.std(rate_test_1)))
    plt.figtext(pos_x, pos_y_ini - 3 * pas, '%s (%d) : %.1f$\pm$%.1f%%' % (types[1], 1, np.mean(rate_test_2), np.std(rate_test_2)))
def plot_2f_all(out, x_train, y_train, x_test, y_test, x_bad, out_comp=None, map_nl=None):
    """
    Plots decision boundaries for a discrimination problem with 2 features.
    Superimposed with scatter plots of both training and test sets, plus the
    misclassified points x_bad in red.
    If out_comp : comparison of the decision boundary from another method
    If map_nl : map of the non-linear decision boundary computed by SVM

    out is a result dictionary with keys 'thetas', 'threshold', 'rate_test',
    'rate_train', 'method' and 'types' (as built by the implement_* functions).
    """
    theta = out['thetas']
    t = out['threshold']
    rate = out['rate_test']
    method = out['method']
    str_t = out['types']
    p_train = out['rate_train']
    if out_comp:
        th_comp = out_comp['thetas']
        t_comp = out_comp['threshold']
        p_comp = out_comp['rate_test']
        met_comp = out_comp['method']

    # Build the classification map: multi-class if more than 2 boundaries,
    # 2-class if a single boundary.
    # NOTE(review): when len(theta) == 2 neither branch runs and x_vec/y_vec
    # stay undefined, so the plotting below would raise NameError — confirm
    # that case cannot occur.
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, map = class_multi_2f(theta)
    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, map = class_2c_2f(theta, t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot: (features normalized to the same scale as the boundaries)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1], x_test[feat_2], c=list(y_test.NumType.values), cmap=plt.cm.gray, alpha=.2)
    axScatter.scatter(x_train[feat_1], x_train[feat_2], c=list(y_train.NumType.values), cmap=plt.cm.winter, alpha=.5)
    axScatter.scatter(x_bad[feat_1], x_bad[feat_2], c='r', alpha=.2)

    # Plot decision boundaries: one orange line per boundary, and the
    # comparison method's boundary in purple when provided.
    for i in sorted(theta):
        db = -1. / theta[i][2] * (theta[i][0] + np.log((1 - t[i]) / t[i]) + theta[i][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=3., c='orange')
        if out_comp:
            db = -1. / th_comp[i][2] * (th_comp[i][0] + np.log((1 - t_comp[i]) / t_comp[i]) + th_comp[i][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c='purple')

    # Shade the classification regions (non-linear SVM map if given).
    if map_nl:
        axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)

    # Legend: one entry per plotted method with its global test success rate.
    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    if out_comp:
        label.append('%s (%.2f%%)' % (met_comp.upper(), p_comp['global']))
    if map_nl:
        label.append('%s (%.2f%%)' % (map_nl['method'].upper(), map_nl['rate_test']['global']))
    axScatter.legend(label, loc=2, prop={'size': 10})

    # Per-class training rates and global test rate, written inside the axes.
    if p_train:
        x_pos = .7
        y_pos = .95
        pas = .05
        axScatter.text(x_pos, y_pos, "%s %% %s" % (p_train[(str_t[0], 0)], str_t[0]), color='b', transform=axScatter.transAxes)
        axScatter.text(x_pos, y_pos - pas, "%s %% %s" % (p_train[(str_t[1], 1)], str_t[1]), color='g', transform=axScatter.transAxes)
        axScatter.text(x_pos, y_pos - 2 * pas, "%.2f %% test set" % rate['global'], transform=axScatter.transAxes)
        axScatter.text(x_pos, y_pos - 3 * pas, "%.2f %% test set" % (100 - rate['global']), color='r', transform=axScatter.transAxes)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out, out_comp=out_comp, map_nl=map_nl)
def plot_2f_synth_var(out, x_train, x_test, y_test):
    """
    Plots decision boundaries for a discrimination problem
    with 2 classes and 2 features (synthetic data).

    out maps a draw index to a result dictionary ('thetas', 'threshold',
    'rate_test', 'method'). With several draws the variability of the LR
    decision boundary is shown; with a single draw the variability with
    the decision threshold is shown instead.
    """
    theta_first = out[0]['thetas']
    rate_first = out[0]['rate_test']
    t_first = out[0]['threshold']
    method = out[0]['method']

    # Classification map of the first draw (multi-class vs 2-class).
    if len(theta_first) > 2:
        NB_class = len(theta_first)
        x_vec, y_vec, proba, map = class_multi_2f(theta_first)
    elif len(theta_first) == 1:
        NB_class = 2
        x_vec, y_vec, proba, map = class_2c_2f(theta_first, t_first)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot:
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)

    # Plot decision boundaries
    # NOTE(review): lim_plot, used in the text placements below, is not
    # defined in this function — presumably a module-level global; verify.
    # VARIABILITY OF THE LR DECISION BOUNDARY
    if len(out) > 1:
        rates = []
        for i in sorted(out):
            theta = out[i]['thetas']
            t = out[i]['threshold']
            db = -1. / theta[1][2] * (theta[1][0] + np.log((1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=1., c=(0, 0.1 * i, 1))
            rates.append(out[i]['rate_test']['global'])
        # NOTE(review): imax indexes the rates list; it is used directly as a
        # key of out, which is only correct if out's keys are 0..N-1 — verify.
        imax = np.argmax(rates)
        theta = out[imax]['thetas']
        t = out[imax]['threshold']
        # Highlight the best draw and shade its classification regions.
        db = -1. / theta[1][2] * (theta[1][0] + np.log((1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=3., c='midnightblue')
        x_vec, y_vec, proba, map = class_2c_2f(theta, t)
        axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=.2)
        axScatter.text(0.6 * lim_plot, -0.9 * lim_plot, r'%.1f$\pm$%.1f%%' % (np.mean(rates), np.std(rates)))

    # VARIABILITY WITH THE THRESHOLD
    else:
        #for thres in np.arange(0,1,.1):
        #  db = -1./theta[0][1][2]*(theta[0][1][0]+np.log((1-thres)/thres)+theta[0][1][1]*x_vec[0])
        #  axScatter.plot(x_vec[0],db,lw=1.,c=(0,thres,1))
        from LR_functions import g  # NOTE(review): imported but not visibly used here
        # Iso-probability contours in 10 shades of blue.
        blue_scale = []
        for i in range(10):
            blue_scale.append((0, i * 0.1, 1))
        CS = axScatter.contour(x_vec, y_vec, proba, 10, colors=blue_scale)
        axScatter.clabel(CS, inline=1, fontsize=10)
        theta = out[0]['thetas']
        t = out[0]['threshold']
        rate = out[0]['rate_test']
        if NB_class == 2:
            db = -1. / theta[1][2] * (theta[1][0] + np.log((1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c='midnightblue')
            axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=.2)
            axScatter.text(0.6 * lim_plot, -0.9 * lim_plot, 'LR (%.1f%%)' % rate['global'])
            axScatter.text(0.6 * lim_plot, -0.8 * lim_plot, 't = %.1f' % t[1])

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out)
def plot_2f_nonlinear(out, x_train, x_test, y_test, y_train=None, synth=False):
    """
    Plots a non-linear decision boundary (classification map) for a
    2-feature problem, superimposed with a scatter plot of the test set
    (and of the training set when y_train is given).

    synth = True for synthetics (synthetic axis limits and gaussian PDFs
    instead of KDE estimates on the marginal histograms).
    """
    map = out['map']
    rate = out['rate_test']
    method = out['method']

    # Grid on which the classification map was computed
    # (features normalized to [-1,1)).
    pas = .01
    x_vec = np.arange(-1, 1, pas)
    y_vec = np.arange(-1, 1, pas)
    x_vec, y_vec = np.meshgrid(x_vec, y_vec)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot:
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)
    # Fix: 'if y_train:' raises "truth value of a DataFrame is ambiguous";
    # the intent is simply "was a training set provided?".
    if y_train is not None:
        axScatter.scatter(x_train[feat_1], x_train[feat_2], c=list(y_train.NumType.values), cmap=plt.cm.YlOrRd)

    # Plot decision boundaries
    axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.3)
    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    axScatter.legend(label, loc=4, prop={'size': 14})

    # Determine nice limits by hand
    if synth:
        bins_1, bins_2 = plot_limits_synth(x_test)
    else:
        bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    if synth:
        plot_histos_and_pdfs_gauss(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    else:
        plot_histos_and_pdfs_kde(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display succes rates
    display_rates(plt, out)
def plot_2f_synthetics(out, x_train, x_test, y_test, y_train=None, out_comp=None, map_nl=None):
    """
    Plots linear decision boundaries for synthetic 2-feature tests,
    superimposed with a scatter plot of the test set (and of the training
    set when y_train is given).

    If out_comp : comparison with the decision boundary of another method.
    If map_nl : map of the non-linear decision boundary computed by SVM,
    shaded instead of the linear classification map.
    """
    theta = out['thetas']
    rate = out['rate_test']
    t = out['threshold']
    method = out['method']

    # Build the classification map: multi-class if more than 2 boundaries,
    # 2-class if a single boundary.
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, map = class_multi_2f(theta)
    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, map = class_2c_2f(theta, t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot:
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)
    # Fix: 'if y_train:' raises "truth value of a DataFrame is ambiguous";
    # the intent is simply "was a training set provided?".
    if y_train is not None:
        axScatter.scatter(x_train[feat_1], x_train[feat_2], c=list(y_train.NumType.values), cmap=plt.cm.YlOrRd)

    # Plot decision boundaries
    if out_comp:
        colors = ['b', 'c']
        # loop-invariant: hoisted out of the boundary loop below
        th_comp = out_comp['thetas']
        t_comp = out_comp['threshold']
    else:
        colors = ['pink']
    for i in sorted(theta):
        db = -1. / theta[i][2] * (theta[i][0] + np.log((1 - t[i]) / t[i]) + theta[i][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=2., c=colors[0])
        if out_comp:
            db = -1. / th_comp[i][2] * (th_comp[i][0] + np.log((1 - t_comp[i]) / t_comp[i]) + th_comp[i][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c=colors[1])

    # Shade the classification regions (non-linear SVM map if given).
    if map_nl:
        axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.3)

    # Legend: one entry per plotted method with its global test success rate.
    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    if out_comp:
        label.append('%s (%.2f%%)' % (out_comp['method'].upper(), out_comp['rate_test']['global']))
    if map_nl:
        label.append('%s (%.2f%%)' % (map_nl['method'].upper(), map_nl['rate_test']['global']))
    if len(label) == 1:
        s = 14
    else:
        s = 11
    axScatter.legend(label, loc=4, prop={'size': s})

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx, axHisty, bins_1, bins_2, x_test, y_test, x_train=x_train, y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out, out_comp=out_comp, map_nl=map_nl)