def plot_sep(opt): opt.set_params() opt.opdict['fig_path'] = os.path.join(opt.opdict['outdir'],'figures') from do_classification import classifier ### LINEAR SVM ### opt.opdict['method'] = 'svm' classifier(opt) #opt.plot_PDFs() out_svm = opt.out print "SVM", out_svm['thetas'] x_train = opt.train_x x_test = opt.x y_train = opt.train_y y_test = opt.y # *** Plot *** plot_2f_synthetics(out_svm,x_train,x_test,y_test,y_train=y_train) #plt.savefig('%s/Test_%dc_%s_SVM.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() ### LOGISTIC REGRESSION ### opt.opdict['method'] = 'lr' out_lr = {} for b in range(1): opt.opdict['learn_file'] = os.path.join(opt.opdict['libdir'],'LR_%d'%b) #os.remove(opt.opdict['learn_file']) classifier(opt) out_lr[b] = opt.out print "LR", out_lr[0]['thetas'] # *** Plot *** if b == 0: print sorted(out_lr[0]) plot_2f_synthetics(out_lr[0],x_train,x_test,y_test,y_train=y_train) #plt.savefig('%s/Test_%dc_%s_LR.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() else: plot_2f_synth_var(out_lr,x_train,x_test,y_test,opt.NB_test) #plt.savefig('%s/Test_%dc_LR_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() if opt.opdict['plot_prec_rec']: plot_2f_synth_var(out_lr,x_train,x_test,y_test,opt.NB_test) #plt.savefig('%s/Test_%dc_bad_threshold.png'%(opt.opdict['fig_path'],len(opt.types))) plt.show() ### NON LINEAR SVM ### opt.opdict['method'] = 'svm_nl' classifier(opt) out_svm_nl = opt.out plot_2f_nonlinear(out_svm_nl,x_train,x_test,y_test,y_train=y_train,synth=True) #plt.savefig('%s/Test_%dc_%s_SVM_NL.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() ### COMPARE ALL 3 METHODS ON THE SAME PLOT ### plot_2f_synthetics(out_lr[0],x_train,x_test,y_test,out_comp=out_svm,y_train=y_train,map_nl=out_svm_nl) #plt.savefig('%s/Test_%dc_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show()
def plot_sep(opt): opt.set_params() opt.opdict['fig_path'] = os.path.join(opt.opdict['outdir'], 'figures') from do_classification import classifier ### LINEAR SVM ### opt.opdict['method'] = 'svm' classifier(opt) #opt.plot_PDFs() out_svm = opt.out print "SVM", out_svm['thetas'] x_train = opt.train_x x_test = opt.x y_train = opt.train_y y_test = opt.y # *** Plot *** plot_2f_synthetics(out_svm, x_train, x_test, y_test, y_train=y_train) #plt.savefig('%s/Test_%dc_%s_SVM.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() ### LOGISTIC REGRESSION ### opt.opdict['method'] = 'lr' out_lr = {} for b in range(1): opt.opdict['learn_file'] = os.path.join(opt.opdict['libdir'], 'LR_%d' % b) #os.remove(opt.opdict['learn_file']) classifier(opt) out_lr[b] = opt.out print "LR", out_lr[0]['thetas'] # *** Plot *** if b == 0: print sorted(out_lr[0]) plot_2f_synthetics(out_lr[0], x_train, x_test, y_test, y_train=y_train) #plt.savefig('%s/Test_%dc_%s_LR.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() else: plot_2f_synth_var(out_lr, x_train, x_test, y_test, opt.NB_test) #plt.savefig('%s/Test_%dc_LR_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() if opt.opdict['plot_prec_rec']: plot_2f_synth_var(out_lr, x_train, x_test, y_test, opt.NB_test) #plt.savefig('%s/Test_%dc_bad_threshold.png'%(opt.opdict['fig_path'],len(opt.types))) plt.show() ### NON LINEAR SVM ### opt.opdict['method'] = 'svm_nl' classifier(opt) out_svm_nl = opt.out plot_2f_nonlinear(out_svm_nl, x_train, x_test, y_test, y_train=y_train, synth=True) #plt.savefig('%s/Test_%dc_%s_SVM_NL.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show() ### COMPARE ALL 3 METHODS ON THE SAME PLOT ### plot_2f_synthetics(out_lr[0], x_train, x_test, y_test, out_comp=out_svm, y_train=y_train, map_nl=out_svm_nl) #plt.savefig('%s/Test_%dc_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep)) plt.show()
def classifier(opt): """ Classification of the different types of events. opt is an object of the class Options() """ list_attr = opt.__dict__.keys() if not 'x' in list_attr: opt.do_tri() X = opt.x Y = opt.y list_attr = opt.__dict__.keys() if 'train_x' in list_attr: X_TRAIN = opt.train_x Y_TRAIN = opt.train_y dic_results = {} for isc in sorted(opt.xs): print "==========",opt.trad[isc],"==========" subdic = {} if isc > 0: if opt.trad[isc][0] == sta_prev: marker_sta = 1 else: marker_sta = 0 sta_prev = opt.trad[isc][0] else: marker_sta = 0 sta_prev = opt.trad[isc][0] if len(opt.xs[isc]) == 0: continue # About the training set if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr: if os.path.exists(opt.opdict['train_file']): print opt.opdict['train_file'] TRAIN_Y = read_binary_file(opt.opdict['train_file']) else: TRAIN_Y = {} for tir in range(opt.opdict['boot']): TRAIN_Y[tir] = {} elif 'train_x' in list_attr: opt.x = opt.xs_train[isc] opt.y = opt.ys_train[isc] if opt.opdict['plot_pdf']: opt.compute_pdfs() g_train = opt.gaussians del opt.gaussians opt.classname2number() x_ref_train = opt.x y_ref_train = opt.y # About the test set opt.x = opt.xs[isc] opt.y = opt.ys[isc] if opt.opdict['plot_pdf']: opt.compute_pdfs() set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime']) set['Otime'] = opt.xs[isc].index opt.classname2number() x_test = opt.x y_ref = opt.y x_ref = opt.x if opt.opdict['plot_dataset']: opt.composition_dataset() #K = len(opt.types) ### ITERATE OVER TRAINING SET DRAWS ### for b in range(opt.opdict['boot']): print "\n-------------------- # iter: %d --------------------\n"%(b+1) subsubdic = {} print "WHOLE SET", x_ref.shape, y_ref.shape ### if there is no pre-defined training set ### if 'train_x' not in list_attr: x_train = x_test.copy() if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1: if len(TRAIN_Y[b]) > 0: y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set']) y_train = y_train.dropna(how='any') y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set']) y_cv = y_cv.dropna(how='any') y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set']) y_test = y_test.dropna(how='any') else: y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref) TRAIN_Y[b]['training_set'] = map(int,list(y_train.index)) TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index)) TRAIN_Y[b]['test_set'] = map(int,list(y_test.index)) ### multi-stations case ### else: if marker_sta == 0: y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref) list_ev_train = y_train.index list_ev_cv = y_cv.index list_ev_test = y_test.index else: y_train = y_ref.reindex(index=list_ev_train) y_train = y_train.dropna(how='any') y_cv = y_ref.reindex(index=list_ev_cv) y_cv = y_cv.dropna(how='any') y_test = y_ref.reindex(index=list_ev_test) y_test = y_test.dropna(how='any') x_train = x_ref.reindex(index=y_train.index) ### if a training set was pre-defined ### else: x_train = x_ref_train.copy() y_train = y_ref_train.copy() y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train) x_cv = x_ref.reindex(index=y_cv.index) x_test = x_ref.reindex(index=y_test.index) i_train = y_train.index x_train.index = range(x_train.shape[0]) y_train.index = range(y_train.shape[0]) print "TRAINING SET", x_train.shape, y_train.shape if x_train.shape[0] != y_train.shape[0]: print "Training set: Incoherence in x and y dimensions" sys.exit() i_cv = y_cv.index x_cv.index = range(x_cv.shape[0]) y_cv.index = range(y_cv.shape[0]) print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape if x_cv.shape[0] != y_cv.shape[0]: print "Cross-validation set: Incoherence in x and y dimensions" sys.exit() subsubdic['list_ev'] = np.array(y_test.index) i_test = y_test.index x_test.index = range(x_test.shape[0]) y_test.index = range(y_test.shape[0]) print "TEST SET", x_test.shape, y_test.shape if x_test.shape[0] != y_test.shape[0]: print "Test set: Incoherence in x and y dimensions" sys.exit() opt.train_x = x_train opt.x = x_test opt.train_y = y_train opt.y = y_test if opt.opdict['plot_pdf']: opt.plot_all_pdfs(save=opt.opdict['save_pdf']) if 'train_x' in list_attr: opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf']) else: opt.plot_all_pdfs(save=opt.opdict['save_pdf']) if opt.opdict['method'] == '1b1': # EXTRACTEURS print "********** EXTRACTION 1-BY-1 **********" opt.opdict['boot'] = 1 one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm') continue elif opt.opdict['method'] == 'ova': print "********** EXTRACTION 1-VS-ALL **********" opt.opdict['boot'] = 1 one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm') continue elif opt.opdict['method'] in ['svm','svm_nl']: # SVM print "********** SVM **********" if opt.opdict['method'] == 'svm': kern = 'Lin' else: kern = 'NonLin' out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas']) if 'map' in sorted(out): opt.map = out['map'] if 'thetas' in sorted(out): theta_vec = out['thetas'] theta,threshold = {},{} for it in range(len(theta_vec)): theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1]) threshold[it+1] = 0.5 out['thetas'] = theta out['threshold'] = threshold elif opt.opdict['method'] == 'lrsk': # LOGISTIC REGRESSION (scikit learn) print "********* Logistic regression (sklearn) **********" out = implement_lr_sklearn(x_train,x_test,y_train,y_test) threshold, theta = {},{} for it in range(len(out['thetas'])): threshold[it+1] = 0.5 theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1]) out['threshold'] = threshold out['thetas'] = theta elif opt.opdict['method'] == 'lr': # LOGISTIC REGRESSION print "********* Logistic regression **********" from LR_functions import do_all_logistic_regression out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv) theta = out['thetas'] threshold = out['threshold'] if 'learn_file' in sorted(opt.opdict): learn_filename = opt.opdict['learn_file'] if not os.path.exists(learn_filename): wtr = write_binary_file(learn_filename,i_train) CLASS_test = out['label_test'] CLASS_train = out['label_train'] # TRAINING SET print "\t *TRAINING SET" y_train_np = y_train.NumType.values.ravel() from sklearn.metrics import confusion_matrix cmat_train = confusion_matrix(y_train_np,CLASS_train) p_tr = dic_percent(cmat_train,opt.types,verbose=True) out['rate_train'] = p_tr print " Global : %.2f%%"%p_tr['global'] if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']: plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper()) if opt.opdict['save_confusion']: savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file']) print "Confusion matrix saved in %s"%savefig plt.savefig(savefig) # TEST SET print "\t *TEST SET" y_test_np = y_test.NumType.values.ravel() cmat_test = confusion_matrix(y_test_np,CLASS_test) p_test = dic_percent(cmat_test,opt.types,verbose=True) out['rate_test'] = p_test print " Global : %.2f%%"%p_test['global'] if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']: plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper()) if opt.opdict['save_confusion']: savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file']) print "Confusion matrix saved in %s"%savefig plt.savefig(savefig) if opt.opdict['plot_confusion']: plt.show() else: plt.close() # PLOT PRECISION AND RECALL if opt.opdict['plot_prec_rec']: from LR_functions import normalize,plot_precision_recall x_train, x_test = normalize(x_train,x_test) plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta) pourcentages = (p_tr['global'],p_test['global']) out['method'] = opt.opdict['method'] out['types'] = opt.types opt.out = out # PLOT DECISION BOUNDARIES n_feat = x_train.shape[1] # number of features if n_feat < 4: if opt.opdict['plot_sep'] or opt.opdict['save_sep']: print "\nPLOTTING" print "Theta values:",theta print "Threshold:", threshold # COMPARE AND PLOT LR AND SVM RESULTS out_svm, out_nl = {},{} dir = '%s_SEP'%opt.opdict['method'].upper() if opt.opdict['method']=='lr' and opt.opdict['compare']: dir = 'LR_SVM_SEP' out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin') cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train']) cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test']) svm_ptr = dic_percent(cmat_svm_tr,opt.types) svm_pt = dic_percent(cmat_svm_test,opt.types) theta_svm,t_svm = {},{} for it in range(len(out_svm['thetas'])): theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1]) t_svm[it+1] = 0.5 out_svm['thetas'] = theta_svm out_svm['threshold'] = t_svm out_svm['rate_test'] = svm_pt out_svm['rate_train'] = svm_ptr out_svm['method'] = 'SVM' if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']: dir = '%s_NL_SEP'%opt.opdict['method'].upper() out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin') cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train']) cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test']) svm_ptr = dic_percent(cmat_svm_tr,opt.types) svm_pt = dic_percent(cmat_svm_test,opt.types) out_nl['rate_test'] = svm_pt out_nl['rate_train'] = svm_ptr out_nl['method'] = 'SVM_NL' save_dir = os.path.join(opt.opdict['fig_path'],dir) opt.verify_and_create(save_dir) from LR_functions import normalize x_train, x_test = normalize(x_train,x_test) x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index) x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index) good_train = y_train.reindex(index=x_train_good.index) x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index) x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index) # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES if n_feat == 1 and len(opt.opdict['types']) == 2: name = opt.opdict['feat_list'][0] from plot_functions import plot_hyp_func_1f, histo_pdfs if opt.opdict['method']=='lr' and opt.opdict['compare']: plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train) else: #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train) plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train) # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES elif n_feat == 2: name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1]) if opt.opdict['method'] in ['lr','svm']: from plot_2features import plot_2f_all plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad) elif opt.opdict['method']=='lr' and opt.opdict['compare']: from plot_2features import plot_2f_all plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl) elif opt.opdict['method'] == 'svm_nl': from plot_2features import plot_2f_nonlinear plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train) # PLOT FOR 3 ATTRIBUTES elif n_feat == 3: from plot_functions import plot_db_3d plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set') plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set') name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2]) if opt.opdict['save_sep']: savename = '%s/CL_sep_%s.png'%(save_dir,name) print "Figure saved in %s"%savename plt.savefig(savename) if opt.opdict['plot_sep']: plt.show() else: plt.close() # WRITE RESULTS INTO A DICTIONARY subsubdic['%'] = pourcentages trad_CLASS_test = [] for i in CLASS_test: i = int(i) trad_CLASS_test.append(opt.types[i]) subsubdic['classification'] = trad_CLASS_test if opt.opdict['probas']: subsubdic['proba'] = out['probas'] if opt.opdict['plot_var']: subsubdic['out'] = out subdic[b] = subsubdic if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2: from plot_2features import plot_2f_variability plot_2f_variability(subdic,x_train,y_train,x_test,y_test) plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper())) plt.show() dic_results[opt.trad[isc]] = subdic dic_results['header'] = {} dic_results['header']['features'] = opt.opdict['feat_list'] dic_results['header']['types'] = opt.opdict['types'] dic_results['header']['catalog'] = opt.opdict['label_test'] if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']: print "Save results in file %s"%opt.opdict['result_path'] write_binary_file(opt.opdict['result_path'],dic_results) if 'train_file' in sorted(opt.opdict): if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1: write_binary_file(opt.opdict['train_file'],TRAIN_Y)