def crossValidation():
    """Cross-validate a random forest on the module-level ``dataset``.

    The nine feature columns are min-max scaled to the range (0, 7) and the
    ``class`` column is used as the label. A single ``RandomForestClassifier``
    is re-fitted on each of 25 stratified folds; per-fold weighted precision,
    recall and F-score are collected, a learning curve is plotted and saved
    to ``images/RF-LearningCurve.png``.

    Returns:
        A 6-tuple: (true labels of all test folds as a Series, predicted
        labels of all test folds as a Series, mean precision, mean recall,
        mean F-score, mean of ``a_score``).
    """
    feature_columns = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 7))
    X = scaler.fit_transform(np.array(dataset[feature_columns]))
    Y = np.array(dataset["class"])

    n_folds = 25
    forest = RandomForestClassifier()
    splitter = model_selection.StratifiedKFold(n_splits=n_folds)

    fold_precision, fold_recall, fold_fscore = [], [], []
    all_true, all_pred = [], []

    for train_idx, test_idx in splitter.split(X, Y):
        fold_true = Y[test_idx]
        all_true.extend(fold_true.tolist())

        fitted = forest.fit(X[train_idx], Y[train_idx])
        fold_pred = fitted.predict(X[test_idx])
        all_pred.extend(fold_pred.tolist())

        p, r, f, _ = precision_recall_fscore_support(
            fold_true, fold_pred, average='weighted')
        # NOTE(review): a_score is not defined inside this function; it is
        # presumably a module-level list, in which case accuracies accumulate
        # across calls. Confirm that is intended.
        a_score.append(accuracy_score(fold_true, fold_pred))
        fold_precision.append(p)
        fold_recall.append(r)
        fold_fscore.append(f)

    plot_learning_curve(forest, "Learning Curves", X, Y, ylim=None,
                        cv=splitter, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5))
    plt.savefig('images/RF-LearningCurve.png')

    return (pd.Series(all_true), pd.Series(all_pred),
            np.mean(fold_precision), np.mean(fold_recall),
            np.mean(fold_fscore), np.mean(a_score))
def crossValidate(document_term_matrix, labels, nfold=2):
    """Cross-validate a LinearSVC on a document-term matrix.

    A ``LinearSVC`` (squared hinge loss, tol=1e-4, C=1.0, max_iter=1000) is
    re-fitted on each of ``nfold`` stratified folds. The accuracy of every
    fold is printed, weighted precision/recall/F-score are collected, and a
    learning curve is plotted and saved to ``lc.png``.

    Args:
        document_term_matrix: feature matrix, indexed by the fold index arrays.
        labels: label array, indexed by the fold index arrays.
        nfold: number of stratified folds.

    Returns:
        A 6-tuple: (true labels of all test folds as a Series, predicted
        labels of all test folds as a Series, mean precision, mean recall,
        mean F-score, mean of ``a_score``).
    """
    svc = LinearSVC(loss='squared_hinge', tol=1e-4, C=1.0, max_iter=1000)
    splitter = StratifiedKFold(n_splits=nfold)

    fold_precision, fold_recall, fold_fscore = [], [], []
    all_true, all_pred = [], []

    for train_idx, test_idx in splitter.split(document_term_matrix, labels):
        fold_true = labels[test_idx]
        all_true.extend(fold_true.tolist())

        fitted = svc.fit(document_term_matrix[train_idx], labels[train_idx])
        fold_pred = fitted.predict(document_term_matrix[test_idx])
        all_pred.extend(fold_pred.tolist())

        p, r, f, _ = precision_recall_fscore_support(
            fold_true, fold_pred, average='weighted')
        print(accuracy_score(fold_true, fold_pred))
        # NOTE(review): a_score is not defined inside this function; it is
        # presumably a module-level list, in which case accuracies accumulate
        # across calls. Confirm that is intended.
        a_score.append(accuracy_score(fold_true, fold_pred))
        fold_precision.append(p)
        fold_recall.append(r)
        fold_fscore.append(f)

    plot_learning_curve(svc, "Learning Curves", document_term_matrix, labels,
                        ylim=None, cv=splitter, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5))
    plt.savefig('lc.png')

    return (pd.Series(all_true), pd.Series(all_pred),
            np.mean(fold_precision), np.mean(fold_recall),
            np.mean(fold_fscore), np.mean(a_score))
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train, tune and evaluate a distance-weighted k-NN classifier.

    Steps:
      1. Fit a baseline ``KNeighborsClassifier(weights='distance')`` on the
         training matrix, plot its learning curve and score it on the test set.
      2. Try every ``n_neighbors`` in ``range(2, baseNeighbors * 10)`` and
         keep the value with the best score on the hold-out set.
      3. Refit with the best value, report test/training scores through
         ``outputScores`` and list up to ten misclassified test samples.
      4. Plot the learning curve of the tuned classifier.

    Progress is printed and appended to ``knnLog25.txt``.

    Args:
        dataTrain, dataTest, holdout: objects exposing ``.target`` (and, for
            ``dataTest``, ``.data``) for the three splits.
        train_M, test_M, hold_M: feature matrices for the three splits.
    """
    # The context manager guarantees the log is flushed and closed even when
    # a training step raises.
    with open('knnLog25.txt', 'a') as outFile:
        print('running mashable knn simulation')
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

        with SimpleTimer('time to train', outFile):
            clf = KNeighborsClassifier(weights='distance').fit(train_M, dataTrain.target)

        baseNeighbors = clf.get_params(True)['n_neighbors']
        # The title used to be passed with a raw, unformatted '%d'.
        plot_learning_curve(clf, 'knn with %d neighbors' % baseNeighbors,
                            train_M, dataTrain.target, cv=5, n_jobs=4)

        baseScore = clf.score(test_M, dataTest.target)
        print('baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors))
        outFile.write('baseline score %.3f base height %d \n' % (baseScore, baseNeighbors))

        res = []
        with SimpleTimer('time to fine tune number of neighbors', outFile):
            for neighbors in range(2, baseNeighbors * 10):
                candidate = KNeighborsClassifier(
                    n_neighbors=neighbors, weights='distance'
                ).fit(train_M, dataTrain.target)
                score = candidate.score(hold_M, holdout.target)
                res.append((score, neighbors))
                outFile.write('%d %.3f \n' % (neighbors, score))

        # Stable sort: ties keep the smaller neighbor count first.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestNeighbors = res[0][1]
        print('best number of neighbors is %d' % bestNeighbors)
        outFile.write('best number of neighbors is %d and score is %.3f\n'
                      % (bestNeighbors, res[0][0]))

        bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
        bestClf.fit(train_M, dataTrain.target)
        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)

        print('training score')
        # Previously logged as 'testing score', which mislabelled this section.
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))

        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors,
                        train_M, dataTrain.target, cv=5, n_jobs=4)
def algomain(df):
    """Engineer features on ``df``, then fit and inspect an L1 LinearSVC.

    ``df`` is modified in place: two indicator columns (``WhAndQ1``,
    ``WhAndQ0``) and seven standardized ``*_scaled`` columns are added.
    A learning curve is plotted with 10-fold CV, the classifier is fitted on
    all rows, and a table of per-column coefficients is printed.

    Args:
        df: DataFrame containing the raw feature columns and a ``class``
            label column.
    """
    scaler = preprocessing.StandardScaler()

    # Starts with Wh/Who and ends with Q (and the case where neither holds).
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization. Each column is scaled independently. The scaler needs
    # 2-D input, so select with df[[name]] and flatten the (n, 1) result; the
    # former separate fit() call was redundant because fit_transform refits.
    scaled_columns = (
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('avgTI', 'avgTI_scaled'),
        ('totalTI', 'totalTI_scaled'),
        ('titleLength', 'title_scaled'),
        ('bodyLength', 'body_scaled'),
    )
    for source, target in scaled_columns:
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    # NOTE(review): the raw 'totalTI' and 'avgTI' columns are used below even
    # though scaled versions were just computed -- confirm this is intended.
    train_df = df[[
        'class',
        'codeFragNum_scaled', 'liNum_scaled', 'totalTI', 'avgTI',
        'popTagsNum_scaled', 'startWithWh', 'endWithQ',
        'WhAndQ1', 'WhAndQ0', 'isweekend', 'cntQ', 'cntA',
        'body_scaled', 'title_scaled',
    ]]
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label; the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    estm = LinearSVC(C=0.1, penalty='l1', dual=False)
    plot_learning_curve(estm, 'LinearSVC(C=0.1, penalty=l1)', tX, ty,
                        ylim=(0.5, 1.0), cv=10,
                        train_sizes=np.linspace(.1, 1, 10))
    estm.fit(tX, ty)
    print(pd.DataFrame({
        'columns': list(train_df.columns[1:]),
        'coef': list(estm.coef_.T),
    }))
def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Train a decision tree, pick a depth limit on hold-out data, evaluate.

    Steps:
      1. Fit an unconstrained ``DecisionTreeClassifier`` and record its test
         score and its depth (``initHeight``).
      2. Refit with ``max_depth`` stepping down from ``initHeight`` in steps
         of 25 while above 40, scoring each candidate on the hold-out set.
      3. Refit with the best depth, report test/training scores through
         ``outputScores`` and list up to ten misclassified test samples.
      4. Plot the learning curve of the depth-limited tree.

    Progress is printed and appended to ``decisionTreeLog.txt``.

    Args:
        dataTrain, dataTest, dataHold: objects exposing ``.target`` (and, for
            ``dataTest``, ``.data``) for the three splits.
        train_tfidf, test_tfidf, hold_tfidf: feature matrices for the splits.
    """
    print('running decision tree')
    # The context manager guarantees the log is flushed and closed even when
    # a training step raises.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        with SimpleTimer('time to train', outFile):
            clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)

        baseScore = clf.score(test_tfidf, dataTest.target)
        initHeight = clf.tree_.max_depth
        print('baseline score %.3f base height %d' % (baseScore, initHeight))
        outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))

        # For trees of depth <= 40 the range is empty; fall back to the
        # unpruned depth so the selection below always has a candidate
        # instead of failing with IndexError on res[0].
        candidate_heights = list(range(initHeight, 40, -25)) or [initHeight]

        res = []
        with SimpleTimer('time to prune', outFile):
            for height in candidate_heights:
                candidate = DecisionTreeClassifier(max_depth=height).fit(
                    train_tfidf, dataTrain.target)
                score = candidate.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))

        # Stable sort: ties keep the deeper (earlier-tried) tree first.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestDepth = res[0][1]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d and score is %.3f \n' % (bestDepth, res[0][0]))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)
        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)

        print('training score')
        # Previously logged as 'testing score', which mislabelled this section.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(
        bestClf,
        'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth),
        train_tfidf, dataTrain.target, cv=5, n_jobs=4)
def algomain(df):
    """Engineer features on ``df``, then fit and inspect an L1 LinearSVC.

    ``df`` is modified in place: two indicator columns (``WhAndQ1``,
    ``WhAndQ0``) and seven standardized ``*_scaled`` columns are added.
    A learning curve is plotted with 10-fold CV, the classifier is fitted on
    all rows, and a table of per-column coefficients is printed.

    Args:
        df: DataFrame containing the raw feature columns and a ``class``
            label column.
    """
    scaler = preprocessing.StandardScaler()

    # Starts with Wh/Who and ends with Q (and the case where neither holds).
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization. Each column is scaled independently. The scaler needs
    # 2-D input, so select with df[[name]] and flatten the (n, 1) result; the
    # former separate fit() call was redundant because fit_transform refits.
    columns_to_scale = [
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('avgTI', 'avgTI_scaled'),
        ('totalTI', 'totalTI_scaled'),
        ('titleLength', 'title_scaled'),
        ('bodyLength', 'body_scaled'),
    ]
    for raw_name, scaled_name in columns_to_scale:
        df[scaled_name] = scaler.fit_transform(df[[raw_name]]).ravel()

    # NOTE(review): the raw 'totalTI' and 'avgTI' columns are used below even
    # though scaled versions were just computed -- confirm this is intended.
    train_df = df[['class', 'codeFragNum_scaled', 'liNum_scaled', 'totalTI',
                   'avgTI', 'popTagsNum_scaled', 'startWithWh', 'endWithQ',
                   'WhAndQ1', 'WhAndQ0', 'isweekend', 'cntQ', 'cntA',
                   'body_scaled', 'title_scaled']]
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label; the remaining columns are the features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    estm = LinearSVC(C=0.1, penalty='l1', dual=False)
    plot_learning_curve(estm, 'LinearSVC(C=0.1, penalty=l1)', tX, ty,
                        ylim=(0.5, 1.0), cv=10,
                        train_sizes=np.linspace(.1, 1, 10))
    estm.fit(tX, ty)
    print(pd.DataFrame({'columns': list(train_df.columns[1:]),
                        'coef': list(estm.coef_.T)}))
def make_plots(self, roc=True, lrn_crv=True, prec_rec=True, cnf_mtr=True):
    """Draw the diagnostic plots selected by the boolean flags.

    Plots are produced in this order: ROC, precision/recall, confusion
    matrix, learning curve. Each one is skipped when its flag is false.

    Args:
        roc: draw the ROC plot for ``self.data`` / ``self.clf``.
        lrn_crv: draw the learning curve on the training split.
        prec_rec: draw the precision/recall plot.
        cnf_mtr: draw the confusion matrix.
    """
    if roc:
        plot_roc(self.data, self.clf)

    if prec_rec:
        plot_precision_recall(self.data, self.clf)

    if cnf_mtr:
        local_plot_confusion_matrix(self.data, self.clf)

    if not lrn_crv:
        return

    # Labels are flattened to 1-D before being handed to the plot helper.
    plot_learning_curve(self.clf, self.title, self.data.x_train,
                        np.ravel(self.data.y_train))
def ADA_Learning_Curves(X, Y, datasource, n_estimators_value):
    """Plot and show the learning curve of an AdaBoost classifier.

    Uses 10 shuffle-split rounds with a 20% test share; both the splitter
    and the classifier are seeded with 626.

    Args:
        X: feature matrix.
        Y: labels.
        datasource: text appended to the plot title.
        n_estimators_value: ``n_estimators`` passed to ``AdaBoostClassifier``.
    """
    heading = "ADA Learning Curves " + datasource
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    booster = AdaBoostClassifier(n_estimators=n_estimators_value,
                                 random_state=626)
    # The helper's return value is what gets shown.
    chart = plot_learning_curve(booster, heading, X, Y,
                                ylim=(0.0, 1.05), cv=splitter)
    chart.show()
def MLP_Learning_Curves(X, Y, datasource, hidden_layer_sizes_value):
    """Plot and show the learning curve of an MLP classifier.

    Uses 10 shuffle-split rounds with a 20% test share; both the splitter
    and the classifier are seeded with 626.

    Args:
        X: feature matrix.
        Y: labels.
        datasource: text appended to the plot title.
        hidden_layer_sizes_value: ``hidden_layer_sizes`` passed to
            ``MLPClassifier``.
    """
    heading = "MLP Learning Curves on " + datasource
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    network = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_value,
                            random_state=626)
    # The helper's return value is what gets shown.
    chart = plot_learning_curve(network, heading, X, Y,
                                ylim=(0.0, 1.05), cv=splitter)
    chart.show()
def algomain(df):
    """Standardize features on ``df``, then fit and inspect an L1 SGD model.

    ``df`` is modified in place: five standardized ``*_scaled`` columns are
    added. A learning curve is plotted with 10-fold CV, an
    ``SGDClassifier(loss='log', penalty='l1')`` is fitted on all rows, and a
    table of per-column coefficients is printed.

    Args:
        df: DataFrame containing the raw feature columns and a ``class``
            label column.
    """
    scaler = preprocessing.StandardScaler()

    # Standardization. Each column is scaled independently. The scaler needs
    # 2-D input, so select with df[[name]] and flatten the (n, 1) result; the
    # former separate fit() call was redundant because fit_transform refits.
    scaled_columns = (
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('bodyLength', 'bodyLen_scaled'),
        ('titleLength', 'titleLen_scaled'),
    )
    for source, target in scaled_columns:
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    train_df = df[[
        'class',
        'codeFragNum_scaled', 'liNum_scaled', 'popTagsNum_scaled',
        'startWithWh', 'endWithQ', 'bodyLen_scaled', 'titleLen_scaled',
    ]]
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label; the remaining columns are the features.
    tX = train_np[:, 1:]
    ty = train_np[:, 0]

    estm = SGDClassifier(loss='log', penalty='l1', alpha=0.015)
    plot_learning_curve(estm, "LogisticRegression(L1), cv=10-fold", tX, ty,
                        ylim=(0.5, 1.0), cv=10,
                        train_sizes=np.linspace(.1, 1, 10))
    estm.fit(tX, ty)
    print(pd.DataFrame({
        'columns': list(train_df.columns[1:]),
        'coef': list(estm.coef_.T),
    }))
def experiment(svc, X_train, y_train, X_test, y_test, tag):
    """Run one train/evaluate cycle for ``svc`` and save its plots.

    Saves ``<tag>learning_curve.png`` and ``<tag>confussion_matrix.png`` and
    prints a classification report for the test split.

    Args:
        svc: estimator to evaluate; it is fitted in place on the training data.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        tag: prefix prepended to both output file names.
    """
    # Learning curve is drawn before the estimator itself is fitted.
    plot_learning_curve(svc, "Learning curve", X_train, y_train)
    plt.savefig(tag + 'learning_curve.png')

    svc.fit(X_train, y_train)
    predictions = svc.predict(X_test)

    # Confusion matrix on the held-out split.
    plot_confusion_matrix(svc, X_test, y_test)
    plt.savefig(tag + 'confussion_matrix' + '.png')

    # Precision, recall and related per-class metrics.
    report = metrics.classification_report(y_true=y_test, y_pred=predictions)
    print("report:", report, "\n")
def algomain(df):
    """Standardize features on ``df``, then fit and inspect an L1 SGD model.

    ``df`` is modified in place: five standardized ``*_scaled`` columns are
    added. A learning curve is plotted with 10-fold CV, an
    ``SGDClassifier(loss="log", penalty="l1")`` is fitted on all rows, and a
    table of per-column coefficients is printed.

    Args:
        df: DataFrame containing the raw feature columns and a ``class``
            label column.
    """
    scaler = preprocessing.StandardScaler()

    # Standardization. Each column is scaled independently. The scaler needs
    # 2-D input, so select with df[[name]] and flatten the (n, 1) result; the
    # former separate fit() call was redundant because fit_transform refits.
    columns_to_scale = [
        ("popTagsNum", "popTagsNum_scaled"),
        ("liNum", "liNum_scaled"),
        ("codeFragNum", "codeFragNum_scaled"),
        ("bodyLength", "bodyLen_scaled"),
        ("titleLength", "titleLen_scaled"),
    ]
    for raw_name, scaled_name in columns_to_scale:
        df[scaled_name] = scaler.fit_transform(df[[raw_name]]).ravel()

    train_df = df[
        [
            "class",
            "codeFragNum_scaled",
            "liNum_scaled",
            "popTagsNum_scaled",
            "startWithWh",
            "endWithQ",
            "bodyLen_scaled",
            "titleLen_scaled",
        ]
    ]
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the label; the remaining columns are the features.
    tX = train_np[:, 1:]
    ty = train_np[:, 0]

    estm = SGDClassifier(loss="log", penalty="l1", alpha=0.015)
    plot_learning_curve(
        estm, "LogisticRegression(L1), cv=10-fold", tX, ty,
        ylim=(0.5, 1.0), cv=10, train_sizes=np.linspace(0.1, 1, 10)
    )
    estm.fit(tX, ty)
    print(pd.DataFrame({"columns": list(train_df.columns[1:]),
                        "coef": list(estm.coef_.T)}))
def main(args):
    """Plot learning curves for the top-5 parameter sets of each training report.

    For every report found under ``args.raw_results_dir`` the stored
    ``cv_results_`` table is loaded, sorted by ``rank_test_score``, and the
    five best parameter sets are each turned into an estimator whose
    learning curve (3-fold CV) is saved as a JPEG under ``args.dest_dir``.

    Args:
        args: namespace providing ``raw_results_dir``, ``haralick_txt_params``,
            ``train_data_dir``, ``test_data_dir``, ``text_dir`` and ``dest_dir``.
    """
    # Collect the training reports and their per-report details.
    report_paths = gen_file_lst(args.raw_results_dir)
    report_details = extract_model_type(report_paths)

    with open(args.haralick_txt_params, 'r') as params_file:
        haralick_params = json.load(params_file)

    trn_image_dict = read_data(args.train_data_dir)
    tst_image_dict = read_data(args.test_data_dir)

    for data_combos in report_details:
        # NOTE(review): the model type is forced to 'svm_sgd' for every
        # report, so the reformat branch below can never run -- confirm
        # whether this is a leftover debugging override.
        data_combos['model_type'] = 'svm_sgd'
        model_type = data_combos['model_type']

        # Build train/test arrays and scale both with the training range.
        X_train, y_train = create_dataset(
            trn_image_dict, haralick_params, args.text_dir, model_type)
        X_test, y_test = create_dataset(
            tst_image_dict, haralick_params, args.text_dir, model_type)
        scaling = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
        X_train = scaling.transform(X_train)
        X_test = scaling.transform(X_test)

        # Load the stored search results into a DataFrame.
        stored = np.load(data_combos['path'], allow_pickle=True)
        cv_results = pd.DataFrame.from_dict(stored.item().get('cv_results_'))
        # NOTE(review): the result of this expansion is discarded.
        cv_results['params'].apply(pd.Series)

        # Best-ranked parameter sets first; keep the top five.
        cv_results.sort_values('rank_test_score', ascending=True, inplace=True)
        top_params = cv_results['params'][:5].tolist()

        if model_type != 'svm_sgd':
            candidates = reformat_model_params(top_params)
        else:
            candidates = top_params

        # The report part of the title is the same for all candidates.
        report_part = '_'.join(
            [v for k, v in data_combos.items() if k != 'path'])

        for vals in candidates:
            # Detailed title: report details followed by parameter name/value pairs.
            param_part = '_'.join(
                ['_'.join((k, str(v))) for k, v in vals.items()])
            title = report_part + '_' + param_part

            candidate_estimator = gen_estimator(model_type, vals)
            figure = plot_learning_curve(candidate_estimator, title,
                                         X_train, y_train, cv=3, n_jobs=-1)

            figure.savefig(os.path.join(args.dest_dir, title + '.jpeg'))
def test_learning_curve():
    """Plot and show a learning curve for the module-level ``estimator``.

    Features are columns 0-4 of the module-level ``data`` and the label is
    its ``outcome-class`` column. The curve uses 3-fold CV over 20 training
    sizes spaced evenly from 10% to 99%.
    """
    features = data[[0, 1, 2, 3, 4]].values
    labels = data['outcome-class'].values
    sizes = np.linspace(.1, 0.99, 20)
    figure = plot_learning_curve(estimator, "50 k-NN learning curve",
                                 features, labels, cv=3, verbose=2,
                                 train_sizes=sizes)
    figure.show()
def plot_learning_curve(self, name, X, y, cv=5):
    """Plot the learning curve of ``self.model`` from cross-validation results.

    Delegates to the ``plot_learning_curve`` name in the enclosing scope with
    ``ylim=None``.

    :param name: plot title
    :param X: input features
    :param y: labels
    :param cv: cross-validation setting forwarded to the helper
    :return: whatever the helper returns (the original calls it ``plt``)
    """
    return plot_learning_curve(self.model, name, X, y, ylim=None, cv=cv)
def main_learning_curve(x, y):
    """Plot and show learning curves for two random forests.

    The forests use 100 and 1000 trees respectively and are passed to the
    plot helper as a tuple. Cross-validation uses 5 shuffle-split rounds,
    each holding out 20% of the data, with a fixed seed of 0. Training sizes
    are four log-spaced fractions from 0.001 to 1.

    Args:
        x: feature matrix.
        y: labels.
    """
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    forests = (
        RandomForestClassifier(n_estimators=100, max_depth=None),
        RandomForestClassifier(n_estimators=1000, max_depth=None),
    )
    plot_learning_curve(forests, "RF Learning Curves", x, y, cv=splitter,
                        train_sizes=np.logspace(-3, 0, 4), log_x=True,
                        n_jobs=-1)
    plt.show()
def runBoosting(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train and evaluate an AdaBoost ensemble of depth-limited decision trees.

    The hyper-parameters are fixed (tree depth 7, 10000 estimators). The
    ensemble is fitted once, scored on the test set, reported through
    ``outputScores`` for both test and training data, and up to ten
    misclassified test samples are listed. Finally its learning curve is
    plotted with 3-fold CV.

    Progress is printed and appended to ``boostingLog.txt``.

    Args:
        dataTrain, dataTest: objects exposing ``.target`` (and, for
            ``dataTest``, ``.data``).
        holdout, hold_M: accepted for signature parity with the other
            simulations; not used here.
        train_M, test_M: feature matrices for the train and test splits.
    """
    # The context manager guarantees the log is flushed and closed even when
    # a training step raises.
    with open('boostingLog.txt', 'a') as outFile:
        print('running boosting algo')
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

        # Hard-coded because the hyper-parameter search takes a very long
        # time to run.
        bestDepth = 7
        bestNum = 10000

        with SimpleTimer('time to train', outFile):
            estimator = DecisionTreeClassifier(max_depth=bestDepth)
            bestClf = AdaBoostClassifier(base_estimator=estimator, n_estimators=bestNum)
            bestClf.fit(train_M, dataTrain.target)

        bestScore = bestClf.score(test_M, dataTest.target)
        print('the best score %.3f' % bestScore)
        outFile.write('depth %d, num %d score %.3f \n' % (bestDepth, bestNum, bestScore))

        # The model is already fitted above; fitting the same data a second
        # time only repeated the expensive training, so the predictions now
        # come from the same model whose score was just reported.
        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)

        print('training score')
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'boosting with %d trees' % bestNum,
                        train_M, dataTrain.target, cv=3, n_jobs=4)
def plot_lc(self, algoName, inputData=XConstant.test_data, title='learning curve'):
    """Draw and save the learning curve of the estimator named ``algoName``.

    The input file is read as whitespace-separated text; column 0 is the
    target and the remaining columns are features, which are passed through
    ``xman.oneHotEncoder``. The figure is saved under ``XConstant.lc_dir``
    as ``<algoName>_<timestamp>.png``. If no estimator is found, only a
    failure message is printed.

    Args:
        algoName: key passed to ``self.findEstimator``; also used in the
            output file name.
        inputData: path of the data file to load.
        title: plot title.
    """
    clf = self.findEstimator(algoName)
    if clf is None:
        print('learning curve drawing failed.')
        return

    print('start loading data...')
    # Raw string avoids the invalid-escape warning; everything is read as
    # text first. The builtin ``str`` replaces the removed ``np.str`` alias.
    rawData = pd.read_csv(inputData, dtype=str, sep=r'\s+')
    dim = rawData.shape
    print('size of data: (%d, %d)' % (dim[0], dim[1]))

    # Positional selection with .iloc replaces the removed .ix indexer.
    target = rawData.iloc[:, 0].astype('float')
    data = rawData.iloc[:, 1:dim[1]].astype('float')
    data = xman.oneHotEncoder(data)
    print('data loaded.')

    # NOTE(review): this is the legacy sklearn.cross_validation signature
    # (sample count + n_iter); left untouched because the import is not
    # visible from here.
    cv = cross_validation.ShuffleSplit(dim[0], n_iter=10, test_size=0.2, random_state=0)
    figure = plot_learning_curve(clf, title, data, target,
                                 ylim=(0.0, 1.01), cv=cv, n_jobs=4)

    if not os.path.exists(XConstant.lc_dir):
        os.mkdir(XConstant.lc_dir)
    figure.savefig(XConstant.lc_dir + algoName + '_' + str(time.time()) + '.png')
    print('learning curve drawing finished.')
def run(n_folds=5, use_pickle=True, use_coref=True):
    """Cross-validate a class-weighted SVM over PubMed documents.

    A learning curve over all documents is plotted first. Then, for each of
    ``n_folds`` shuffled folds over the PMIDs, an ``svm.SVC`` is trained and
    evaluated, both confusion matrices are printed and plotted, and the
    metrics and error spans are written to ``results_true_*`` /
    ``results_false_*`` text files. The per-metric mean and variance across
    folds are printed at the end.

    Args:
        n_folds: number of cross-validation folds over PMIDs.
        use_pickle: forwarded to ``get_PMIDs_to_X_y``.
        use_coref: forwarded to ``get_PMIDs_to_X_y``; also selects the
            output file name tag.
    """
    # Maps PubMed identifiers to token features and corresponding labels.
    pmids_dict, X_tokens = get_PMIDs_to_X_y(use_pickle, use_coref)

    # Materialize the keys: folds index into this sequence by position, and
    # a dict key view is not subscriptable on Python 3.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    # Same class weights for the learning curve and for every fold model.
    class_weights = {1: 4.0462962962963, -1: 0.570496083550914}

    # Learning curve over the full data set.
    title = "Learning Curves (SVM)"
    estimator = svm.SVC(class_weight=dict(class_weights), cache_size=1000)
    train_X, _, train_y = get_features_for_pmids(pmids_dict, all_pmids)
    plc.plot_learning_curve(estimator, title, train_X, train_y, cv=5)
    plt.show()

    coref_tag = '_with_coref_tfidf_' if use_coref else '_no_coref_tfidf_'

    fold_metrics = []
    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        # Sanity check: no document may appear on both sides of the split.
        assert (len(set(train_pmids).intersection(set(test_pmids)))) == 0

        train_X, _, train_y = get_features_for_pmids(pmids_dict, train_pmids)
        test_X, test_index_X, test_y = get_features_for_pmids(
            pmids_dict, test_pmids)

        model = svm.SVC(class_weight=dict(class_weights), cache_size=1000)
        model.fit(train_X, train_y)
        predict_y = list(model.predict(test_X))

        r, p, accuracy, auc, tp_overlapping_tokens, fp_tokens = _evaluate_detection(
            test_y, predict_y, test_index_X)

        # NOTE(review): a None f1 ends up in fold_metrics and may break the
        # mean/variance summary below -- confirm the desired handling.
        if p + r == 0:
            f1 = None
        else:
            f1 = (2 * p * r) / (p + r)

        tp_spans, tn_spans, fp_spans, fn_spans = _error_report(
            predict_y, test_y, test_index_X)

        cm = confusion_matrix(test_y, predict_y)
        np.set_printoptions(precision=2)
        print('Confusion matrix, without normalization')
        print(cm)
        plt.figure()
        plot_confusion_matrix(cm)

        # Normalize the confusion matrix by row (i.e. by the number of
        # samples in each class).
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
        print(cm_normalized)
        plt.figure()
        plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

        print(
            "fold %s. precision: %s; recall: %s; f1: %s, accuracy: %s, auc: %s"
            % (fold_idx, p, r, f1, accuracy, auc))
        fold_metrics.append([p, r, f1, accuracy, auc])

        file_name_suffix = str(fold_idx) + coref_tag + str(time.time()) + '.txt'

        # Text mode: the payload is str, which a binary-mode file rejects on
        # Python 3.
        with open('results_true_' + file_name_suffix, 'w') as results_true:
            results_true.write(str((p, r, f1, accuracy, auc)) + "\n")
            results_true.write(str(tp_spans) + "\n")
            results_true.write(str(tn_spans))

        with open('results_false_' + file_name_suffix, 'w') as results_false:
            results_false.write(str(fp_spans) + "\n")
            results_false.write(str(fn_spans))

    # Convert to a numpy array for column-wise statistics.
    fold_metrics = np.array(fold_metrics)
    print("mean: %s, variance: %s" %
          (np.mean(fold_metrics, axis=0), np.var(fold_metrics, axis=0)))
def KNN_Learning_Curves(X, Y, datasource, n_neighbors_number):
    """Plot and show the learning curve of a k-NN classifier.

    Uses 10 shuffle-split rounds with a 20% test share, seeded with 626.

    Args:
        X: feature matrix.
        Y: labels.
        datasource: text appended to the plot title.
        n_neighbors_number: ``n_neighbors`` passed to ``KNeighborsClassifier``.
    """
    # A trailing space was missing, so the title ran into the datasource
    # name ("...Curves onfoo"), unlike the sibling MLP/ADA titles.
    title = "KNN Learning Curves on " + datasource
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors_number)
    # Named ``chart`` so the module-level ``plt`` is not shadowed.
    chart = plot_learning_curve(estimator, title, X, Y, ylim=(0.0, 1.05), cv=cv)
    chart.show()
def run_boosting(training_features, training_labels, test_features, test_labels, passed_parameters=None):
    """Tune, evaluate and plot an AdaBoost classifier over decision trees.

    Validation curves are saved for each numeric hyper-parameter, a grid
    search picks the best combination, a learning curve and an
    estimator-count curve are saved for that combination, and timing stats,
    a classification report and a confusion matrix are printed. Plots are
    written under the module-level ``results_location``.

    Parameters
    ----------
    training_features : feature matrix used for tuning and fitting
    training_labels : labels for ``training_features``
    test_features : feature matrix used for the final evaluation
    test_labels : labels for ``test_features``
    passed_parameters : optional parameter grid for the grid search; a
        built-in grid is used when None

    Returns
    -------
    prediction : predicted labels of the test data
    accuracy : score of the tuned classifier on the test data
    """
    started = time.time()

    # AdaBoost on top of a decision tree.
    tree_base = tree.DecisionTreeClassifier()
    booster = ensemble.AdaBoostClassifier(base_estimator=tree_base)

    # Ranges explored by the validation curves.
    curve_grid = {
        'base_estimator__max_depth': range(1, 5),
        'n_estimators': range(10, 500, 50),
        'learning_rate': [.25, .5, .75, 1.0],
    }

    # Cross-validation iterator shared by every step below.
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    # One validation curve per numeric hyper-parameter.
    for param, values in curve_grid.items():
        if is_number(values[0]):
            title = 'Validation Curves \n(AdaBoost)'
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(booster, training_features, training_labels,
                                  title, param, values, cv)
            pylab.savefig(os.path.join(results_location, save_name))

    # Grid used for the actual search.
    if passed_parameters is None:
        search_grid = {
            'base_estimator__max_depth': range(1, 3),
            'n_estimators': range(5, 51, 5),
            'learning_rate': [1.0],
        }
    else:
        search_grid = passed_parameters

    searcher = GridSearchCV(estimator=booster, cv=cv, param_grid=search_grid)
    searcher.fit(training_features, training_labels)

    # Prediction and accuracy on the test set.
    test_prediction = searcher.predict(test_features)
    test_accuracy = searcher.score(test_features, test_labels)
    tuned = time.time()

    # Rebuild an unfitted estimator with the winning settings.
    winner = searcher.best_estimator_
    best_depth = winner.base_estimator_.max_depth
    best_count = winner.n_estimators
    best_rate = winner.learning_rate
    tree_base = tree.DecisionTreeClassifier(max_depth=best_depth)
    booster = ensemble.AdaBoostClassifier(base_estimator=tree_base,
                                          n_estimators=best_count,
                                          learning_rate=best_rate)

    # Learning curve of the best combination.
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%i estimators=%i learning_rate=%f$' % (
        best_depth, best_count, best_rate)
    plot_learning_curve(booster, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - AdaBoost - Decision Tree.png'))
    plotted = time.time()

    # Fit the best estimator and plot the curve by number of estimators.
    booster.fit(training_features, training_labels)
    plot_adaclassifier(booster, best_count, training_features, test_features,
                       training_labels, test_labels)
    pylab.savefig(os.path.join(results_location, 'Estimator Curves - AdaBoost - Decision Tree.png'))

    # started -> tuned covers the validation curves and the grid search;
    # tuned -> plotted covers the learning-curve step.
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (tuned - started))
    print("Single Run Time -> %f" % (plotted - tuned))

    # Classification report and confusion matrix.
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
# Demo script: plots learning curves for Gaussian Naive Bayes and an
# RBF-kernel SVM on the scikit-learn digits dataset, then shows the figures.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit
from plot_learning_curve import plot_learning_curve

digits = load_digits()
X, y = digits.data, digits.target  # load the sample data

# Figure 1: Naive Bayes, 100 shuffle-split rounds with a 20% test share.
title = r"Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()  # build the model
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1)

# Figure 2: SVM, 10 shuffle-split rounds with a 20% test share.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)  # build the model
# NOTE(review): the y-limits are passed positionally here but by keyword
# above; this relies on the helper's fifth positional parameter being ylim.
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=1)

plt.show()
def _plot_knn_learning_curve(estimator, title, features, labels, out_path):
    """Fit *estimator*, plot its learning curve and save the figure to *out_path*."""
    estimator.fit(features, labels)
    # The original opened a 3x1 figure before every plot; the call is kept so the
    # matplotlib state seen by plot_learning_curve is unchanged.
    # NOTE(review): the returned axes are never passed on - confirm whether
    # plot_learning_curve actually relies on this figure.
    plt.subplots(3, 1, figsize=(10, 15))
    # 100 random splits, each holding out 20% of the data for validation.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    lc = plot_learning_curve(estimator, title, features, labels, cv=cv, n_jobs=-1)
    lc.savefig(out_path)


def test_KNN(X_whole, y_whole, X, y,
             fig_dir='/Users/ajinkya.bagde/Desktop/AS1_Figs/KNN'):
    """Run the KNN experiment: learning curves, parameter sweeps, tuning, final test.

    Parameters
    ----------
    X_whole, y_whole : unused; kept so existing callers keep working.
    X, y : features and labels that are split into train / test data.
    fig_dir : directory the learning-curve PNG files are written to. Defaults
        to the location that was previously hard-coded.

    Returns
    -------
    None. Figures are saved under *fig_dir*; the best parameters and the
    elapsed time are printed.
    """
    import os

    # Hold out 20% of the data for the final accuracy check.
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    knnlearner = knn.KNNLearner(n_folds=3, verbose=True)

    # Validation set: a second split of the *training* data. The original split
    # (X, y) again with the same seed, which reproduced the first split exactly
    # and made the validation set identical to the final test set.
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(
        xtrain, ytrain, test_size=0.2, random_state=42)

    # Initial learning curves for several neighbourhood sizes.
    for n_neighbors in (2, 4, 6, 8, 10):
        _plot_knn_learning_curve(
            KNeighborsClassifier(n_neighbors=n_neighbors),
            "Initial Learning Curves (KNN - %d neighbors)" % n_neighbors,
            xtrain_val, ytrain_val,
            os.path.join(fig_dir, 'knn_learningcurve_initial_%dneigh.png' % n_neighbors))

    # One sweep per hyper-parameter family. `flag` selects the family inside
    # KNNLearner; the swept family receives the values returned by train() and
    # every other slot receives the "NA" placeholder.
    # Slot order: neighbors, weights, algorithms, metrics, p values.
    for flag in range(5):
        clfs, values = knnlearner.train(xtrain_val, ytrain_val, flag)
        sweep_args = ["NA"] * 5
        sweep_args[flag] = values
        if flag == 4:
            # The p-value sweep is run with the metric pinned to minkowski.
            sweep_args[3] = ['minkowski']
        knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                        sweep_args[0], sweep_args[1], sweep_args[2],
                        sweep_args[3], sweep_args[4], flag)

    # Hyper-parameter tuning on a fresh classifier.
    clf = KNeighborsClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = knnlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Refit with the best parameters and plot the final learning curve.
    final_classifier = KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        weights=best_params['weights'],
        algorithm=best_params['algorithm'],
        metric=best_params['metric'],
        p=best_params['p'])
    _plot_knn_learning_curve(
        final_classifier, "Learning Curves (KNN)", xtrain_val, ytrain_val,
        os.path.join(fig_dir, 'knn_learningcurve.png'))

    # Final accuracy on the untouched test set.
    knnlearner.final_test(final_classifier, xtest, ytest)
    print(datetime.now() - start)
def learning_curve():
    """Plot and show the learning curve of a 50-nearest-neighbour classifier.

    Reads the module-level ``X`` and ``y`` and evaluates 25 training-set sizes
    between 10% and 100% of the data with shuffled 10-fold cross-validation.
    """
    estimator = KNeighborsClassifier(50)
    # Splitter built with the model_selection signature used elsewhere in this
    # file (n_splits); the former KFold(n=..., n_folds=...) form belongs to the
    # removed sklearn.cross_validation module.
    folds = KFold(n_splits=10, shuffle=True)
    fig = plot_learning_curve(estimator, "50-NN learning curve", X, y,
                              cv=folds, verbose=2,
                              train_sizes=np.linspace(.1, 1.0, 25))
    fig.show()
# Hyper-parameter grid for the (currently disabled) grid search below.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1.
param = {'n_estimators': list(np.arange(10, 150, 10)),
         'min_samples_split': list(np.arange(2, 10, 2)),
         'min_samples_leaf': list(np.arange(1, 10, 2))}

rfc = RandomForestClassifier(n_estimators=120, min_samples_split=5, min_samples_leaf=5)

# print("GridSearchCV on RFC...")
# rfc = GridSearchCV(estimator=rfc, cv=cv, param_grid=param)

rfc.fit(X_train, y_train)

# # summarize the results of the grid search
# print(rfc.best_score_)
# print("Best n_estimators found by GridSearch:  %s" % rfc.best_estimator_.n_estimators)
# print("Best min_samples_split found by GridSearch:  %s" % rfc.best_estimator_.min_samples_split)
# print("Best min_samples_leaf found by GridSearch:  %s" % rfc.best_estimator_.min_samples_leaf)

title = "Learning curves (Random Forest Classifier)"
plc.plot_learning_curve(rfc, title, X_train, y_train, cv=cv)
plt.show()

# Single-argument print(...) calls produce the same output on Python 2 and 3.
print("Prediction score on test set:  %s" % rfc.score(X_test, y_test))

print("Creating testing errors file...")
y_pred = rfc.predict(X_test)
gte.get_testing_errors(X_test, y_test, y_pred)

print("Creating kaggle submission file...")
predictions = rfc.predict(kaggle_ds[predictors])
submission = pd.DataFrame({"PassengerId": kaggle_ds["PassengerId"],
                           "Survived": predictions})
submission.to_csv("submission/kaggle.csv", index=False)
def _plot_boosting_learning_curve(estimator, title, features, labels, out_path):
    """Fit *estimator*, plot its learning curve and save the figure to *out_path*."""
    estimator.fit(features, labels)
    # The original opened a 3x1 figure before every plot; the call is kept so the
    # matplotlib state seen by plot_learning_curve is unchanged.
    # NOTE(review): the returned axes are never passed on - confirm whether
    # plot_learning_curve actually relies on this figure.
    plt.subplots(3, 1, figsize=(10, 15))
    # 100 random splits, each holding out 20% of the data for validation.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    lc = plot_learning_curve(estimator, title, features, labels, cv=cv, n_jobs=-1)
    lc.savefig(out_path)


def test_Boosting(X_whole, y_whole, X, y,
                  fig_dir='/Users/ajinkya.bagde/Desktop/AS1_Figs/Boosting'):
    """Run the AdaBoost experiment: learning curves, sweeps, tuning, final test.

    Parameters
    ----------
    X_whole, y_whole : unused; kept so existing callers keep working.
    X, y : features and labels that are split into train / test data.
    fig_dir : directory the learning-curve PNG files are written to. Defaults
        to the location that was previously hard-coded.

    Returns
    -------
    None. Figures are saved under *fig_dir*; the best parameters and the
    elapsed time are printed.
    """
    import os

    # Hold out 20% of the data for the final accuracy check.
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    boostlearner = boost.BoostingLearner(n_folds=3, verbose=True)

    # Validation set: a second split of the *training* data. The original split
    # (X, y) again with the same seed, which reproduced the first split exactly
    # and made the validation set identical to the final test set.
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(
        xtrain, ytrain, test_size=0.2, random_state=42)

    # Initial learning curves for several pruning strengths:
    # (ccp_alpha, text used in the title, suffix used in the file name).
    pruning_levels = (
        (0.0, "0.0", "0"),
        (0.0002, "0.0002", "0002"),
        (0.0004, "0.0004", "0004"),
        (0.0008, "0.0008", "0008"),
        (0.0010, "0.0010", "0010"),
    )
    for ccp_alpha, alpha_text, file_suffix in pruning_levels:
        # The base learner is passed positionally: the keyword was renamed from
        # base_estimator to estimator in newer scikit-learn releases, while the
        # first positional slot is the base learner in both.
        _plot_boosting_learning_curve(
            AdaBoostClassifier(DecisionTreeClassifier(ccp_alpha=ccp_alpha)),
            "Initial Learning Curves (Adaboost - ccp_alpha=%s)" % alpha_text,
            xtrain_val, ytrain_val,
            os.path.join(fig_dir, 'boosting_learningcurve_initial_ccpa_%s.png' % file_suffix))

    # One sweep per hyper-parameter family. `flag` selects the family inside
    # BoostingLearner; the swept family receives the values returned by train()
    # and the other slots receive the "NA" placeholder.
    # Slot order: pruning alphas, number of estimators, learning rates.
    for flag in range(3):
        clfs, values = boostlearner.train(xtrain_val, ytrain_val, flag)
        sweep_args = ["NA"] * 3
        sweep_args[flag] = values
        boostlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                          sweep_args[0], sweep_args[1], sweep_args[2], flag)

    # Hyper-parameter tuning on a fresh classifier.
    clf = AdaBoostClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = boostlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Refit with the best parameters and plot the final learning curve.
    final_classifier = AdaBoostClassifier(
        best_params['base_estimator'],
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'])
    _plot_boosting_learning_curve(
        final_classifier, "Learning Curves (Boosting)", xtrain_val, ytrain_val,
        os.path.join(fig_dir, 'boosting_learningcurve.png'))

    # Final accuracy on the untouched test set.
    boostlearner.final_test(final_classifier, xtest, ytest)
    print(datetime.now() - start)
def run_k_nearest_neighbors(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's k nearest neighbors classifier,
    tuning its parameters with a cross-validated grid search.

    Validation curves (one per numeric parameter) and a learning curve for the
    best parameter set are saved under the module-level ``results_location``.
    Timing statistics, a classification report and a confusion matrix are
    printed.

    Parameters
    ----------
        training_features: feature matrix used to fit the classifier
        training_labels: labels matching training_features
        test_features: feature matrix used to evaluate the tuned classifier
        test_labels: labels matching test_features
        passed_parameters: (optional) parameter grid for GridSearchCV;
                           a default grid over n_neighbors, weights and p
                           is used when None

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: value of GridSearchCV.score on the test data
    """
    time_1 = time.time()

    estimator = neighbors.KNeighborsClassifier()

    #set up parameters for the classifier
    if passed_parameters is None:
        parameters = {'n_neighbors': list(range(1, 11)),
                      'weights': ['uniform', 'distance'],
                      'p': [1, 2]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    #(model_selection signature, as used by the other ShuffleSplit calls in
    # this file; the sample count is no longer a constructor argument)
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    #plot the validation curves, only for parameters with numeric values
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves \n(kNN)'
            save_name = "Validation Curves - kNN - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels,
                                  title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #rebuild an unfitted estimator carrying the best parameters
    best = classifier.best_estimator_
    estimator = neighbors.KNeighborsClassifier(n_neighbors = best.n_neighbors,
                                               weights = best.weights,
                                               algorithm = best.algorithm,
                                               leaf_size = best.leaf_size,
                                               p = best.p,
                                               metric = best.metric)

    #plot the learning curve
    title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (
        best.n_neighbors, best.weights, best.algorithm, best.leaf_size, best.p)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))
    #plt.show()

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
def runSVMSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M): kernel = "linear" outFile = open('svmSarinLog%s.txt' % kernel,'a') print 'running svm code' outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1])) outFile.write('test==> %d, %d \n'%(test_M.shape[0],test_M.shape[1])) penalty = 0.025 with SimpleTimer('time to train', outFile): # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42) # clf = LinearSVC(C=1.0) clf = SVC(kernel=kernel, C=penalty, degree=1) clf.fit(train_M, dataTrain.target) baseScore = clf.score(test_M, dataTest.target) baseIter = 5 print 'baseline score %.3f base iter %d' % (baseScore, baseIter) outFile.write('baseline score %.3f base iter %d \n' % (baseScore, baseIter)) res = [] with SimpleTimer('number of iter', outFile): for pen in [1,5,10,15,20,30]: print 'training for neighbors %.3f' % pen clf = SVC(kernel=kernel, C=pen, degree=1) # clf = LinearSVC(loss='squared_hinge', C=1.0) clf.fit(train_M, dataTrain.target) score = clf.score(hold_M, holdout.target) res.append((score, pen)) trainPredict = clf.score(train_M, dataTrain.target) outFile.write('test %.3f %.3f \n' % (pen, score)) outFile.write('train %.3f %.3f \n' % (pen, trainPredict)) res = sorted(res, key=lambda x:x[0], reverse=True) print res[:5] bestPen = res[0][1] print ('best number of iter is %.3f' % bestPen) bestClf = SVC(kernel=kernel, C=penalty, degree=bestPen) bestClf.fit(train_M, dataTrain.target) predicted = bestClf.predict(test_M) trainPredict = bestClf.predict(train_M) print 'testing score' outFile.write('testing score') outputScores(dataTest.target, predicted, outFile) print 'training score' outFile.write('training score') outputScores(dataTrain.target, trainPredict, outFile) results = predicted == dataTest.target res = [] for i in range(len(results)): if not results[i]: res.append(i) print 'classifier got these wrong:' for i in res[:10]: print dataTest.data[i], dataTest.target[i] outFile.write('%s %d \n' % 
(dataTest.data[i], dataTest.target[i])) plot_learning_curve(bestClf, 'svm with %s kernel & penalty %.3f' % (kernel, bestPen), train_M, dataTrain.target, cv=5, n_jobs=4) '''
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load the SGP trigger data and attach the CAPE / LCL values read from the
# two text files as extra columns.
dataset = load_data.load_sgp_data("trigger_sgp_hy.nc")
cape = np.loadtxt("../../data/sgp/sgp_undilute_cape.txt")
lcl = np.loadtxt("../../data/sgp/sgp_undilute_lcl.txt")
dataset['cape'] = cape
dataset['lcl'] = lcl

# The first 86 columns are the model inputs, column 86 is the target.
trig_x = dataset.iloc[:, 0:86]
trig_y = dataset.iloc[:, 86]

# Five random splits, each holding out 20% of the rows.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

xgb = XGBClassifier(
    n_estimators=600,
    silent=True,
    nthread=8,
    max_depth=7,
    scale_pos_weight=3.5,
)

title = "Learning Curves (XGBoost)"
plot_learning_curve.plot_learning_curve(
    xgb, title, trig_x, trig_y, ylim=(0.7, 1.01), cv=cv, n_jobs=8)
plt.show()
# Persist the grid-search scores. `grid_scores_` only exists on older
# scikit-learn releases; newer ones expose `cv_results_` instead, so fall
# back to it rather than failing with AttributeError.
if hasattr(clf, "grid_scores_"):
    grid_scores = DataFrame(clf.grid_scores_)
else:
    grid_scores = DataFrame(clf.cv_results_)
grid_scores.to_csv("grid_scores_nusvc.csv")

print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

# Make a model with the best parameters
estimator = NuSVC(kernel='rbf', gamma=clf.best_estimator_.gamma,
                  nu=clf.best_estimator_.nu)  # C=clf.best_estimator_.C)

# Plot the learning curve to find a good split
title = 'NuSVC'
plot_learning_curve(estimator, title, X_train, y_train, cv=cv, n_jobs=4)
p.savefig("supervised_learning_nusvc.pdf")

# Find a good number of test samples before moving on
# raw_input("Continue??")

# With a good number of test samples found, predict the whole set to the model
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_all)
DataFrame(y_pred).to_csv("supervised_prediction_labels_nusvc.csv")

print(classification_report(y_all, y_pred))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Best params are:" + str(clf.best_params_))

# Hold here. raw_input only exists on Python 2; use input on Python 3.
try:
    _wait_for_user = raw_input
except NameError:
    _wait_for_user = input
_wait_for_user("Continue??")

# Now take the model found, and find the outliers
def runDecisionTreeSimulation(dataTrain, dataTest, dataHold, train_tfidf, test_tfidf, hold_tfidf):
    """Fit a decision tree, pick a pruned depth on the hold-out set and report scores.

    An unrestricted tree gives the baseline test score and the starting depth.
    Depths from that value downwards (step 25, stopping above 40) are scored on
    the hold-out matrix; the best depth is refit and its test/training scores,
    first few misclassified test samples and learning curve are reported.

    Parameters
    ----------
    dataTrain, dataTest, dataHold : objects exposing ``.target`` (labels);
        ``dataTest`` must also expose ``.data`` (the raw samples).
    train_tfidf, test_tfidf, hold_tfidf : feature matrices matching the three
        data sets.

    Output is printed and appended to ``decisionTreeLog.txt``.
    """
    print('running decision tree')

    # The log file is opened in a with-block so it is flushed and closed even
    # when training raises.
    with open('decisionTreeLog.txt', 'a') as outFile:
        outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1]))

        with SimpleTimer('time to train', outFile):
            clf = DecisionTreeClassifier().fit(train_tfidf, dataTrain.target)

        baseScore = clf.score(test_tfidf, dataTest.target)
        initHeight = clf.tree_.max_depth
        print('baseline score %.3f base height %d' % (baseScore, initHeight))
        outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))

        res = []
        with SimpleTimer('time to prune', outFile):
            for height in range(initHeight, 40, -25):
                # print('training for height %d' % height)
                candidate = DecisionTreeClassifier(max_depth=height).fit(
                    train_tfidf, dataTrain.target)
                score = candidate.score(hold_tfidf, dataHold.target)
                res.append((score, height))
                outFile.write('%d %.3f \n' % (height, score))

        if not res:
            # The baseline tree is no deeper than 40, so the sweep produced no
            # candidates; fall back to the unpruned depth instead of failing
            # with IndexError on res[0] below.
            res.append((clf.score(hold_tfidf, dataHold.target), initHeight))

        # Highest hold-out score first.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestDepth = res[0][1]
        print('best height is %d' % bestDepth)
        outFile.write('best depth is %d and score is %.3f \n' % (bestDepth, res[0][0]))

        bestClf = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf.fit(train_tfidf, dataTrain.target)
        predicted = bestClf.predict(test_tfidf)
        train_predict = bestClf.predict(train_tfidf)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # Heading corrected: these are the training scores.
        outFile.write('training score')
        outputScores(dataTrain.target, train_predict, outFile)

        # Report up to ten misclassified test samples.
        results = predicted == dataTest.target
        wrong = [i for i, is_correct in enumerate(results) if not is_correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf,
                        'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth),
                        train_tfidf, dataTrain.target, cv=5, n_jobs=4)
# Best parameters found by the decision-tree grid search.
print(dt_clf.best_params_)


# In[7]:

# Refit the selected tree on the training data and score it on the test data.
dt_optimized.fit(X_train, y_train)
dt_optimized.score(X_test, y_test)


# In[8]:

from sklearn.model_selection import learning_curve
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

plot_learning_curve(
    dt_optimized,
    title='Decision Tree learning curve',
    X=X_train,
    y=y_train,
    cv=10,
)
plt.show()


# ## knn

# In[22]:

from sklearn.neighbors import KNeighborsClassifier

# Grid over the weighting scheme and the neighbourhood size, 10-fold CV.
tuned_parameters = [
    {
        'weights': ['uniform', 'distance'],
        'n_neighbors': [1, 2, 5, 10, 25],
    },
]
knn_clf = GridSearchCV(KNeighborsClassifier(n_neighbors=1), tuned_parameters, cv=10)
knn_clf.fit(X_train, y_train)

knn_optimized = knn_clf.best_estimator_
print(knn_clf.best_params_)
# Mean of y, used as the constant-probability benchmark below
# (the overall click rate if y holds 0/1 labels - TODO confirm).
base_rate = 1.0 * y.sum() / len(y)

# Single-argument print(...) calls behave the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("Log loss regression (test/train) : {:.5f}/{:.5f}".format(
    log_loss(y_test, logReg.predict_proba(X_test)),
    log_loss(y_train, logReg.predict_proba(X_train))))
print("Log loss p(click) = 0.5 : {:.5f}".format(
    log_loss(y_test, 0.5 * np.ones(len(y_test)))))
print("Log loss p(click) = {:.5f} : {:.5f}".format(
    base_rate,
    log_loss(y_test, base_rate * np.ones(len(y_test)))))

# scorer for log loss
logl_sc = make_scorer(log_loss, needs_proba=True, greater_is_better=False)

# cross validation splitter (5x 70-30)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# print("Cross-val score = {:.5f}".format(
#     cross_val_score(logReg, X_train, y_train, scoring=logl_sc, cv=cv).mean()))

# plot learning curve
lc = plot_learning_curve(logReg, "LogReg", X_train, y_train,
                         score=logl_sc, cv=cv, n_jobs=4)

# add CTR benchmark to learning curve plot
addBenchToPlot(lc)

# plot regularization validation curve
plot_validation_curve(logReg, X_train, y_train, title="Regularization",
                      ylim=None, cv=cv, score=logl_sc, n_jobs=4,
                      param_range=np.logspace(-2, 0, 5))
print('Training Test')
for entry in results:
    print("name: {}; score: {}".format(entry[0], entry[1]))
print('')

# Model validation and evaluation
results = []
# NOTE(review): the loop extent was reconstructed from a whitespace-collapsed
# source; the learning-curve call uses `model`, so it is kept inside the loop.
for name, model in models:
    # 10-fold cross-validator (9 folds for training, 1 for testing)
    kfold = KFold(n_splits=10)
    # Cross-validated scores for this model
    cv_result = cross_val_score(model, X, Y, cv=kfold)
    results.append((name, cv_result))

    cv_ShuffleSplit = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    # Draw the learning curve
    plt_learn = plot_learning_curve(model, "Learn Curve for KNN Diabetes", X, Y,
                                    ylim=(0., 1.2), cv=cv_ShuffleSplit)

print('Cross Validation')
for entry in results:
    print("name: {}; cross val score: {}".format(entry[0], entry[1].mean()))
print('')

# The fitted models can later predict unseen data like this:
# print("predict", models[0][1].predict(X), models[0][1].predict(X).shape)

# Pick the two best features
from sklearn.feature_selection import SelectKBest
def run_support_vector_machines(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's support vector machine classifier,
    tuning its parameters with a cross-validated grid search.

    Validation curves (one per numeric parameter) and a learning curve for the
    best parameter set are saved under the module-level ``results_location``.
    Timing statistics, a classification report and a confusion matrix are
    printed; the coefficients are printed as well when the best kernel is
    linear.

    Parameters
    ----------
        training_features: feature matrix used to fit the classifier
        training_labels: labels matching training_features
        test_features: feature matrix used to evaluate the tuned classifier
        test_labels: labels matching test_features
        passed_parameters: (optional) parameter grid for GridSearchCV;
                           a default grid over C is used when None

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: value of GridSearchCV.score on the test data
    """
    time_1 = time.time()

    estimator = svm.SVC()

    #set up parameters that will be used by all kernels
    if passed_parameters is None:
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    #(model_selection signature, as used by the other ShuffleSplit calls in
    # this file; the sample count is no longer a constructor argument)
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    #plot the validation curves, only for parameters with numeric values
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels,
                                  title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #rebuild an unfitted estimator carrying the best parameters
    best = classifier.best_estimator_
    estimator = svm.SVC(kernel = best.kernel,
                        C = best.C,
                        gamma = best.gamma,
                        degree = best.degree)

    #plot the learning curve
    #gamma may be a string such as 'auto' or 'scale', which '%f' cannot format;
    #numeric values keep the original fixed-point rendering. C is rendered with
    #%g so fractional penalties are not truncated to an integer.
    if isinstance(best.gamma, str):
        gamma_text = best.gamma
    else:
        gamma_text = '%f' % best.gamma
    title = 'Learning Curves (SVM, kernel=%s degree=%i gamma=%s C=%g )' % (
        best.kernel, best.degree, gamma_text, best.C)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if best.kernel == 'linear':
        #coef_ lives on the fitted best estimator; classifier.estimator is the
        #unfitted template handed to GridSearchCV and has no coef_
        coefficients = best.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

X, y = load_data()

# Learning curves for LogisticRegression tuning: three regularisation
# strengths drawn on one figure.
lr_a = LogisticRegression()  # C=1.0
lr_b = LogisticRegression(C=0.1)
lr_c = LogisticRegression(C=0.03)

plt.figure()
for model, label in ((lr_a, 'C=1.0'), (lr_b, 'C=0.1'), (lr_c, 'C=0.03')):
    plot_learning_curve(model, X, y, label)
plt.legend(loc=(0, 1.00), ncol=2, fontsize=11)
plt.savefig('LogisticRegression_Tuning.png', format='png')

# Learning curves for all the tuned classifiers, again on one figure.
xgb = xgboost.XGBClassifier(objective="multi:softprob", nthread=-1)
gbrt = GradientBoostingClassifier(random_state=0)
forest = RandomForestClassifier(n_jobs=-1, random_state=0)

plt.figure()
for model, label in ((xgb, 'xgb'), (gbrt, 'gbrt'), (forest, 'forest'), (lr_c, 'LR')):
    plot_learning_curve(model, X, y, label)
# Persist the grid-search scores. `grid_scores_` only exists on older
# scikit-learn releases; newer ones expose `cv_results_` instead, so fall
# back to it rather than failing with AttributeError.
if hasattr(clf, "grid_scores_"):
    grid_scores = DataFrame(clf.grid_scores_)
else:
    grid_scores = DataFrame(clf.cv_results_)
grid_scores.to_csv("grid_scores_nusvc.csv")

print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

# Make a model with the best parameters
estimator = NuSVC(kernel='rbf', gamma=clf.best_estimator_.gamma,
                  nu=clf.best_estimator_.nu)  # C=clf.best_estimator_.C)

# Plot the learning curve to find a good split
title = 'NuSVC'
plot_learning_curve(estimator, title, X_train, y_train, cv=cv, n_jobs=4)
p.savefig("supervised_learning_nusvc.pdf")

# Find a good number of test samples before moving on
# raw_input("Continue??")

# With a good number of test samples found, predict the whole set to the model
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_all)
DataFrame(y_pred).to_csv("supervised_prediction_labels_nusvc.csv")

print(classification_report(y_all, y_pred))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Best params are:" + str(clf.best_params_))

# Hold here. raw_input only exists on Python 2; use input on Python 3.
try:
    _wait_for_user = raw_input
except NameError:
    _wait_for_user = input
_wait_for_user("Continue??")
def runSVMSimulation(dataTrain, dataTest, holdOut, train_tfidf, test_tfidf, hold_tfidf): kernel = 'poly' penalty = 1.0 outFile = open('svmLog%s.txt' % kernel, 'a') degree = 3 outFile.write('train==> %d, %d \n' % (train_tfidf.shape[0], train_tfidf.shape[1])) outFile.write('test==> %d, %d \n' % (test_tfidf.shape[0], test_tfidf.shape[1])) with SimpleTimer('time to train', outFile): # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42) clf = SVC(kernel=kernel, C=penalty, degree=degree) clf.fit(train_tfidf, dataTrain.target) baseScore = clf.score(test_tfidf, dataTest.target) baseIter = 5 print 'baseline score %.3f penalty %d' % (baseScore, baseIter) outFile.write('baseline score %.3f base height %d \n' % (baseScore, baseIter)) res = [] with SimpleTimer('number of iter', outFile): for pen in [1, 2, 3, 4, 5]: print 'training for peanalty %f' % pen # clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=itr, random_state=42) clf = SVC(kernel=kernel, C=1.0, degree=pen) clf.fit(train_tfidf, dataTrain.target) score = clf.score(hold_tfidf, holdOut.target) res.append((score, pen)) outFile.write('%.3f %.3f \n' % (pen, score)) res = sorted(res, key=lambda x: x[0], reverse=True) print res[:5] bestPen = res[0][1] print('best number of iter is %.3f' % bestPen) bestClf = SVC(kernel=kernel, C=1.0, degree=bestPen) bestClf.fit(train_tfidf, dataTrain.target) train_predict = bestClf.predict(train_tfidf) predicted = bestClf.predict(test_tfidf) print 'testing score' outFile.write('testing score') outputScores(dataTest.target, predicted, outFile) print 'training score' outFile.write('testing score') outputScores(dataTrain.target, train_predict, outFile) results = predicted == dataTest.target res = [] for i in range(len(results)): if not results[i]: res.append(i) print 'classifier got these wrong:' for i in res[:10]: print dataTest.data[i], dataTest.target[i] outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i])) 
plot_learning_curve(bestClf, 'svm with %s kernel & degree %.3f' % (kernel, bestPen), train_tfidf, dataTrain.target, cv=5, n_jobs=4) '''
def run_decision_tree(training_features, training_labels, test_features, test_labels, passed_parameters = None, headings = None):
    """
    Classifies the data using sklearn's decision tree.
    sklearn does not natively support pruning, so max_depth is tuned instead.

    Parameters
    ----------
    training_features: feature matrix used to tune and train the classifier
    training_labels: labels matching training_features
    test_features: feature matrix used to test the classifier
    test_labels: labels matching test_features
    passed_parameters: grid for GridSearchCV, mapping a parameter name to a
        sequence of candidate values. Defaults to {'max_depth': [None]}
        (an unrestricted tree).
    headings: feature names passed to the graphviz export of the tree

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: value of GridSearchCV.score on the test data

    Side effects
    ------------
    Saves validation-curve and learning-curve images and a PDF of the tree
    under results_location, and prints timing stats, the classification
    report and the confusion matrix.
    """
    time_1 = time.time()
    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    if passed_parameters is None:
        # Every grid entry must be a sequence of candidates; the previous
        # default (a bare None) failed as soon as it was indexed below.
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves (numeric parameters only)
    for param in parameters:
        candidates = parameters[param]
        # None (e.g. an unlimited max_depth) cannot be placed on a numeric axis.
        if candidates[0] is not None and is_number(candidates[0]):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, candidates, cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)
    time_2 = time.time()

    #rebuild a stand-alone tree with the best depth/criterion found
    best_depth = classifier.best_estimator_.max_depth
    estimator = tree.DecisionTreeClassifier(max_depth = best_depth, criterion = classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    # %s rather than %i: the best max_depth may be None, which %i cannot format
    # (integers render identically with either).
    title = 'Learning Curves \n(Decision Tree, max depth=%s)' % best_depth
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - Decision Tree.png'))

    #save the visualization of the decision tree; only the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator, out_file=tree_data, max_depth=5, feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))
    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
# Load the row indices of the training split and the matching appetency labels.
train_indices = np.load(clean_data_dir + data_size + "train_indices.npy")
train_labels = pd.read_csv(labels_dir + data_size + 'appetency.labels', header=None)
train_labels = squeeze(train_labels.values)[train_indices]
# Remap the -1 label to 0 so the labels are {0, 1}.
train_labels[train_labels == -1] = 0

# Disabled experiment: down-sample the 0 class to the size of the 1 class.
#n_true = sum(train_labels == 1)
#train_labels = np.concatenate((train_labels[train_labels == 1], train_labels[train_labels == 0][:n_true]), axis=1)
#train_data = np.concatenate((train_data[train_labels == 1, :], train_data[train_labels == 0, :][:n_true, :]), axis=0)

# Pick the classifier named by `alg`.
# NOTE(review): `clf` stays None for any other value of `alg` and is then
# passed to plot_learning_curve as-is.
clf = None
if alg == "SVM":
    clf = svm.SVC(probability=True, kernel='linear', class_weight='auto')
elif alg == "SGD":
    clf = SGDClassifier(loss='log')
elif alg == "GBM":
    clf = ensemble.GradientBoostingClassifier(max_features=max_features, subsample=subsample, learning_rate=learning_rate)

log("cross_val_score")
# `%` binds tighter than `+`, so data_size is prepended to the formatted text.
# The hyper-parameters are included in the title whichever algorithm was chosen.
plot_title = data_size + "%s n_factors: %i, subsample: %0.2f, learning_rate: %0.4f, max_features: %0.2f" \
    % (alg, n_factors, subsample, learning_rate, max_features)
# 3-fold learning curve scored by ROC AUC.
scores = plot_learning_curve(clf, plot_title, train_data, train_labels, cv=3, verbose=4, scoring='roc_auc')
plt.show()
log("done")
axis=1)
# Encode the 'Loan_Status' column as integer class labels.
encode1 = LabelEncoder()
y = encode1.fit_transform(data['Loan_Status'])
trainx, testx, trainy, testy = train_test_split(
    X, y, random_state=42,
    test_size=0.2)  # split the data into a training set and a cross-validation set
'''model selection'''
t1 = time()
# NOTE(review): every assignment to `model1` below is commented out, so
# `model1` must be defined outside this view or the next statement raises
# NameError.
#model1=RandomForestClassifier(n_estimators=100,min_samples_split=20,max_depth=10,min_samples_leaf=10)
#model1=SVC(C=1,gamma=0.05)
#model1=GradientBoostingClassifier(n_estimators=100,learning_rate=0.01)
model1.fit(trainx, trainy)
# Elapsed fit time, then the score on the held-out 20%.
print(time() - t1)
print(model1.score(testx, testy))
fig = plot_learning_curve(model1, 'gbm', trainx, trainy.astype(int))
fig.show()
# Predict on `test1` and turn the string forms '1'/'0' into 'Y'/'N' for the
# submission file.
predictions = model1.predict(test1)
predictions = predictions.astype(str)
predictions[(predictions == '1')] = 'Y'
predictions[predictions == '0'] = 'N'
sub = pd.DataFrame({
    'Loan_ID': test['Loan_ID'],
    'Loan_Status': predictions
})
sub.to_csv('av.csv', index=False)
def algomain(df):
    """Add engineered features to ``df`` and plot an AdaBoost learning curve.

    ``df`` is modified in place: the indicator columns ``liG2``, ``WhAndQ1``
    and ``WhAndQ0`` and one standardized ``*_scaled`` column per numeric
    source column are added.  A boosted ensemble of depth-2 decision trees is
    then passed to ``plot_learning_curve`` with 10-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns referenced below, including ``class``,
        which is used as the target.

    Returns
    -------
    None
    """
    scaler = preprocessing.StandardScaler()

    # liNum: flag rows with more than two list items.
    # NOTE(review): the original comment said ">= 2" but the code tests "> 2";
    # the code is kept as it was.
    df['liG2'] = (df.liNum > 2).astype(int)

    # Starts with Wh/Who AND ends with Q (both set) / neither set.
    df['WhAndQ1'] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df['WhAndQ0'] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization: (source column, standardized column), in the order the
    # new columns are appended to df.
    scaled_columns = [
        ('popTagsNum', 'popTagsNum_scaled'),
        ('liNum', 'liNum_scaled'),
        ('codeFragNum', 'codeFragNum_scaled'),
        ('avgTI', 'avgTI_scaled'),
        ('totalTI', 'totalTI_scaled'),
        ('titleLength', 'title_scaled'),
        ('bodyLength', 'body_scaled'),
        ('aNum', 'a_scaled'),
        ('strongNum', 'strong_scaled'),
        ('thxNum', 'thx_scaled'),
        ('hourHot', 'hourHot_scaled'),
    ]
    for source, target in scaled_columns:
        # The scaler expects a 2-D (n_samples, n_features) input, hence the
        # one-column frame; ravel() brings the result back to 1-D.  The former
        # second argument (a fitted scaler passed as `y`) had no effect.
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    # NOTE(review): 'totalTI' and 'avgTI' enter the model unscaled even though
    # scaled versions are computed above, and 'liG2' is not used; kept as is.
    train_df = df[['class', 'codeFragNum_scaled', 'liNum_scaled', 'totalTI',
                   'avgTI', 'popTagsNum_scaled', 'startWithWh', 'endWithQ',
                   'WhAndQ1', 'WhAndQ0', 'isweekend', 'cntQ', 'cntA',
                   'body_scaled', 'title_scaled', 'a_scaled', 'strong_scaled',
                   'thx_scaled', 'hourHot_scaled']]
    # `.values` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the target, the rest are features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    n_estimators = 800
    learning_rate = 0.8
    dt = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1)
    ada_real = AdaBoostClassifier(
        base_estimator=dt,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME.R")
    plot_learning_curve(ada_real, 'AdaBoostWithDT', tX, ty, ylim=(0.5, 1.0),
                        cv=10, train_sizes=np.linspace(.1, 1, 10))
def algomain(df):
    """Add engineered features to ``df``, plot a linear-SVC learning curve and
    print the fitted coefficients.

    ``df`` is modified in place: the indicator columns ``liG2``, ``WhAndQ1``
    and ``WhAndQ0`` and one standardized ``*_scaled`` column per numeric
    source column are added.  An ``SVC`` with a linear kernel is passed to
    ``plot_learning_curve``, then fitted on all rows, and its coefficients are
    printed next to the feature names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns referenced below, including ``class``,
        which is used as the target.

    Returns
    -------
    None
    """
    scaler = preprocessing.StandardScaler()

    # liNum: flag rows with more than two list items.
    # NOTE(review): the original comment said ">= 2" but the code tests "> 2";
    # the code is kept as it was.
    df["liG2"] = (df.liNum > 2).astype(int)

    # Starts with Wh/Who AND ends with Q (both set) / neither set.
    df["WhAndQ1"] = ((df.startWithWh == 1) & (df.endWithQ == 1)).astype(int)
    df["WhAndQ0"] = ((df.startWithWh == 0) & (df.endWithQ == 0)).astype(int)

    # Standardization: (source column, standardized column), in the order the
    # new columns are appended to df.
    scaled_columns = [
        ("popTagsNum", "popTagsNum_scaled"),
        ("codeFragNum", "codeFragNum_scaled"),
        ("avgTI", "avgTI_scaled"),
        ("totalTI", "totalTI_scaled"),
        ("titleLength", "title_scaled"),
        ("bodyLength", "body_scaled"),
        ("aNum", "a_scaled"),
        ("strongNum", "strong_scaled"),
        ("thxNum", "thx_scaled"),
        ("dayHot", "dayHot_scaled"),
    ]
    for source, target in scaled_columns:
        # The scaler expects a 2-D (n_samples, n_features) input, hence the
        # one-column frame; ravel() brings the result back to 1-D.  The former
        # second argument (a fitted scaler passed as `y`) had no effect.
        df[target] = scaler.fit_transform(df[[source]]).ravel()

    # NOTE(review): "liNum", "totalTI" and "avgTI" enter the model unscaled and
    # "liG2" is not used; kept as is.
    train_df = df[
        [
            "class",
            "codeFragNum_scaled",
            "liNum",
            "totalTI",
            "avgTI",
            "popTagsNum_scaled",
            "startWithWh",
            "endWithQ",
            "WhAndQ1",
            "WhAndQ0",
            "isweekend",
            "cntQ",
            "cntA",
            "body_scaled",
            "title_scaled",
            "a_scaled",
            "strong_scaled",
            "thx_scaled",
            "dayHot_scaled",
        ]
    ]
    # `.values` replaces DataFrame.as_matrix(), which newer pandas removed.
    train_np = train_df.values
    # Column 0 is the target, the rest are features.
    tX, ty = train_np[:, 1:], train_np[:, 0]

    # A linear kernel is required for `coef_` to be available after fitting.
    estm = SVC(C=0.1, kernel="linear")
    plot_learning_curve(estm, "LinearSVC", tX, ty, ylim=(0.5, 1.0),
                        train_sizes=np.linspace(0.1, 1, 10))
    estm.fit(tX, ty)
    # Call form of print: same output under Python 2, valid under Python 3.
    print(pd.DataFrame({"columns": list(train_df.columns[1:]), "coef": list(estm.coef_.T)}))
# Rows from index m onwards are held out; that held-out part is then cut in
# half: the first half becomes the test set, the second half the validation set.
X_test = X[m:,:]
y_test = y[m:].ravel()
m_val = int(X_test.shape[0] * 0.5)
X_val = X_test[m_val:,:]
y_val = y_test[m_val:]
X_test = X_test[:m_val,:]
y_test = y_test[:m_val]

# Initialise the MLP.
# Note: (20) is a plain int, not a one-element tuple.
nn_clf = MLPClassifier(hidden_layer_sizes=(20),alpha = 0.3,activation='logistic',solver='lbfgs')

# Plot the learning curve for the model using plot_learning_curve as defined in
# the scikit-learn documentation.
# NOTE(review): the curve is computed on the full X, y, i.e. including the rows
# held out above.
plot_learning_curve(nn_clf, "NN Learning Curve", X, y)
plt.show()

# Train the model (X_train / y_train are defined outside this view).
nn_clf.fit(X_train,y_train)

# Validate on the validation set.
acc = nn_clf.score(X_val, y_val)
print("Classifier accuracy on validation = " +str(acc * 100)+"%")

# Test on the test set.
acc = nn_clf.score(X_test, y_test)
print("Classifier accuracy on test = " +str(acc * 100)+"%")

# Save the fitted model to disk.
joblib.dump(nn_clf,"nn_clf.joblib")
def SVM_Learning_Curves(X, Y, datasource, gamma_value):
    """Plot and show the learning curves of an SVC on ``X`` / ``Y``.

    ``datasource`` is appended to the plot title and ``gamma_value`` is
    forwarded as the SVC ``gamma``.  Cross-validation uses 10 shuffled splits
    with 20% held out; the splitter and the classifier share the seed 626.
    """
    heading = "SVM Learning Curves on " + datasource
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=626)
    svc = SVC(gamma=gamma_value, random_state=626)
    # Whatever plot_learning_curve returns only needs to expose show().
    drawn = plot_learning_curve(
        svc,
        heading,
        X,
        Y,
        ylim=(0.0, 1.05),
        cv=splitter,
    )
    drawn.show()
# NOTE(review): this is a fragment; `done`, `steps`, `score`, `alpha`, `i` and
# the history lists are initialised outside this view, presumably by an
# enclosing per-episode loop. Nesting below is reconstructed - confirm.
observation = env.reset()
while not done:
    steps += 1
    action = agent.choose_action(observation)
    observation_, reward, done, info = env.step(action)
    # Store the transition, then let the agent update from its memory.
    agent.remember(observation, action, reward, observation_, done)
    agent.learn()
    observation = observation_
    score += reward
    # Track the agent's alpha at every step so it can be averaged per episode.
    alpha.append(agent.alpha)

score_history.append(score)
alpha_history.append(np.mean(alpha))
# Average over the most recent 100 entries of the score history.
avg_score = np.mean(score_history[-100:])

# Save the models whenever the running average improves.
if avg_score > best_score:
    best_score = avg_score
    agent.save_models()

# Fixed: 'avg score %1.f' (width 1, no decimals) was a typo for '%.1f', the
# format already used for the episode score.
print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
      'steps ', steps, 'alpha ', np.mean(alpha))

# NOTE(review): presumably executed once after all episodes - confirm.
plot_learning_curve(score_history, figure_file_return + '.png',
                    color='lightgreen', avg_color='green', Ylabel='Return')
plot_learning_curve(alpha_history, figure_file_alpha + '.png',
                    color='blue', Ylabel='Temperature alpha')
np.save(figure_file_return, score_history)
def test_learning_curve():
    """Draw and show the learning curve of the module-level ``estimator``.

    Features are the columns labelled 0-4 of the module-level ``data`` frame
    and the target is its 'outcome-class' column.  The curve is evaluated with
    3-fold cross-validation at 20 training sizes between 10% and 99%.
    """
    feature_columns = [0, 1, 2, 3, 4]
    features = data[feature_columns].values
    target = data['outcome-class'].values
    sizes = np.linspace(.1, 0.99, 20)
    figure = plot_learning_curve(
        estimator,
        "50 k-NN learning curve",
        features,
        target,
        cv=3,
        verbose=2,
        train_sizes=sizes,
    )
    figure.show()