def curve_per_subject(subject, data_path, test_labels):
    """Train an LDA seizure detector for one subject, pick an ROC-optimal
    threshold on the training data, then evaluate on the test set.

    Returns (t_list, sn, sp): sensitivity and specificity lists over
    thresholds 0.00-0.99 (Python 2 snippet).
    """
    d = load_train_data(data_path, subject)
    x, y_10m = d['x'], d['y']  # y_10m: one label per 10-minute clip
    n_train_examples = x.shape[0]
    n_timesteps = x.shape[-1]
    print 'n_preictal', np.sum(y_10m)
    print 'n_inetrictal', np.sum(y_10m - 1)
    # reshape_data flattens each clip into per-timestep rows (labels repeated)
    x, y = reshape_data(x, y_10m)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)
    lda = LDA()
    lda.fit(x, y)
    pred_1m = lda.predict_proba(x)[:, 1]
    # average per-timestep probabilities back into one score per clip
    pred_10m = np.reshape(pred_1m, (n_train_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    fpr, tpr, threshold = roc_curve(y_10m, pred_10m)
    # distance of each ROC point to the ideal corner (fpr=0, tpr=1)
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]][-1]
    print opt_threshold
    # ------- TEST ---------------
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']  # NOTE(review): `id` shadows the builtin
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]
    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)  # reuse training-set scaling
    pred_1m = lda.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    y_pred = np.zeros_like(test_labels)
    y_pred[np.where(pred_10m >= opt_threshold)] = 1
    cm = confusion_matrix(test_labels, y_pred)
    print print_cm(cm, labels=['interictal', 'preictal'])
    sn = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])  # sensitivity
    sp = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])  # specificity
    print sn, sp
    # sweep thresholds to trace the full sensitivity/specificity trade-off
    sn, sp = [], []
    t_list = np.arange(0.0, 1.0, 0.01)
    for t in t_list:
        y_pred = np.zeros_like(test_labels)
        y_pred[np.where(pred_10m >= t)] = 1
        cm = confusion_matrix(test_labels, y_pred)
        sn_t = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
        sp_t = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sn.append(sn_t)
        sp.append(sp_t)
    return t_list, sn, sp
def curve_per_subject(subject, data_path, test_labels):
    """Python 3 variant of curve_per_subject: fit LDA per subject, choose an
    ROC-optimal threshold on training data, evaluate on the test set.

    Returns (t_list, sn, sp): sensitivity and specificity lists over
    thresholds 0.00-0.99.
    """
    d = load_train_data(data_path, subject)
    x, y_10m = d['x'], d['y']  # y_10m: one label per 10-minute clip
    n_train_examples = x.shape[0]
    n_timesteps = x.shape[-1]
    print('n_preictal', np.sum(y_10m))
    print('n_inetrictal', np.sum(y_10m - 1))
    # reshape_data flattens each clip into per-timestep rows (labels repeated)
    x, y = reshape_data(x, y_10m)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)
    lda = LDA()
    lda.fit(x, y)
    pred_1m = lda.predict_proba(x)[:, 1]
    # average per-timestep probabilities back into one score per clip
    pred_10m = np.reshape(pred_1m, (n_train_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    fpr, tpr, threshold = roc_curve(y_10m, pred_10m)
    # distance of each ROC point to the ideal corner (fpr=0, tpr=1)
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]][-1]
    print(opt_threshold)
    # ------- TEST ---------------
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']  # NOTE(review): `id` shadows the builtin
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]
    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)  # reuse training-set scaling
    pred_1m = lda.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    y_pred = np.zeros_like(test_labels)
    y_pred[np.where(pred_10m >= opt_threshold)] = 1
    cm = confusion_matrix(test_labels, y_pred)
    print(print_cm(cm, labels=['interictal', 'preictal']))
    sn = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])  # sensitivity
    sp = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])  # specificity
    print(sn, sp)
    # sweep thresholds to trace the full sensitivity/specificity trade-off
    sn, sp = [], []
    t_list = np.arange(0.0, 1.0, 0.01)
    for t in t_list:
        y_pred = np.zeros_like(test_labels)
        y_pred[np.where(pred_10m >= t)] = 1
        cm = confusion_matrix(test_labels, y_pred)
        sn_t = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
        sp_t = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sn.append(sn_t)
        sp.append(sp_t)
    return t_list, sn, sp
def score(train_X, train_y):
    """Hold out 1% of the data, fit LDA on the rest, and return the
    validation log-loss (fixed random_state for reproducibility)."""
    split = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    fit_X, val_X, fit_y, val_y = split
    model = LDA()
    model.fit(fit_X, fit_y)
    val_probs = model.predict_proba(val_X)
    return log_loss(val_y, val_probs)
def train_predict(X, y, Xt, yt=None, c=1):
    """Fit a classifier selected by `c` and return test-set scores.

    c == 1: gradient-boosted trees via the project's xgb_classifier wrapper
            (yt is forwarded for its internal evaluation).
    c == 2: LDA, returns P(class 1) for Xt.
    c == 3: logistic regression, returns P(class 1) for Xt.
    Any other value of `c` falls through and returns None (unchanged).
    """
    # Fix: the original used a mutable default (yt=[]), which is shared
    # across calls; an empty list is now created per call instead.
    if yt is None:
        yt = []
    if c == 1:
        clf = xgb_classifier(num_round=45, eta=0.1, min_child_weight=20,
                             depth=20, subsample=0.1, col=0.7)
        return clf.train_predict(X, y, Xt, yt)
    elif c == 2:
        clf = LDA()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
    elif c == 3:
        clf = LogisticRegression()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
def LDAClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    """Fit LDA (lsqr solver) on the training set, save P(class 1) for the
    test set via saveResult, and return those probabilities."""
    model = LDA(solver='lsqr')
    model.fit(trainData, ravel(trainLabel))
    positive_probs = model.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, positive_probs, 'Proba_sklearn_LDA.csv')
    return positive_probs
def train_predict(X, y, Xt, yt=None, c=1):
    """Fit a classifier selected by `c` and return test-set scores.

    c == 1: gradient-boosted trees via the project's xgb_classifier wrapper
            (num_round=500, eta=0.01; recorded scores:
            First digit touch 0.966262479533, BothStartLoadPhase 0.969428966329,
            HandStart 0.930538668081).
    c == 2: LDA, returns P(class 1) for Xt.
    c == 3: logistic regression, returns P(class 1) for Xt.
    Any other value of `c` falls through and returns None (unchanged).
    """
    # Fix: the original used a mutable default (yt=[]), which is shared
    # across calls; an empty list is now created per call instead.
    if yt is None:
        yt = []
    if c == 1:
        clf = xgb_classifier(num_round=500, eta=0.01, min_child_weight=20,
                             depth=10, subsample=0.1, col=0.7)
        return clf.train_predict(X, y, Xt, yt)
    elif c == 2:
        clf = LDA()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
    elif c == 3:
        clf = LogisticRegression()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
def __call__(self, x, y, inputs, labels): classes = numpy.unique(labels) if len(classes) == 1: if y == classes[0]: return 1 else: return -1 lda = LDA().fit(inputs, labels) prob = lda.predict_proba([x])[0][lda.classes_.tolist().index(y)] return 2 * prob - 1
def train_predict(X, y, Xt, yt=None, c=1):
    """Fit a classifier selected by `c` and return test-set scores.

    c == 1: gradient-boosted trees via the project's xgb_classifier wrapper
            (num_round=500, eta=0.01; recorded scores:
            First digit touch 0.966262479533, BothStartLoadPhase 0.969428966329,
            HandStart 0.930538668081).
    c == 2: LDA, returns P(class 1) for Xt.
    c == 3: logistic regression, returns P(class 1) for Xt.
    Any other value of `c` falls through and returns None (unchanged).
    """
    # Fix: the original used a mutable default (yt=[]), which is shared
    # across calls; an empty list is now created per call instead.
    if yt is None:
        yt = []
    if c == 1:
        clf = xgb_classifier(num_round=500, eta=0.01, min_child_weight=20,
                             depth=10, subsample=0.1, col=0.7)
        return clf.train_predict(X, y, Xt, yt)
    elif c == 2:
        clf = LDA()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
    elif c == 3:
        clf = LogisticRegression()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
def lda_model(x_train, y_train, x_test, y_test):
    """Fit LDA, time training/prediction, and either write test predictions
    to a file (when the global get_test is True) or report multiclass
    log-loss on the validation split (Python 2 snippet).
    """
    global get_test
    print "LDA model learning..."
    start_time = time.time()
    #LDA assumes common variance matrix among classes, while QDA doesn't
    clf = LDA()
    #clf = QDA()
    clf.fit(x_train, y_train)
    learning_time = time.time() - start_time
    print "training time is: {:.5f} seconds.".format(learning_time)
    '''
    #use LDA to do dimensionality reduction, reduce to n_class-1 dimensions
    x_t = clf.transform(x_train)
    print x_train.shape
    print x_t.shape
    print x_train[:3]
    print x_t[:3]
    '''
    print "Model Prediction..."
    #y_predict = clf.predict(x_test)
    start_time = time.time()
    #get probability prediction
    y_prob = clf.predict_proba(x_test)
    prediction_time = time.time() - start_time
    print "prediction time is: {:.5f} seconds.".format(prediction_time)
    if get_test == True:
        #the data is from real test set
        #output to file
        output_result(y_prob)
    else:
        #the test set is split from the train set, compute the loss function value
        encoder = LabelEncoder()
        #encode string label 'Class_1', 'Class_2',... to [0,1,...,8]
        y_true = encoder.fit_transform(y_test)
        #the classe labels in encoder is consistent with the class labels in the classifier
        assert (encoder.classes_ == clf.classes_).all()
        #compute the value for loss function
        score = logloss_mc(y_true, y_prob)
        print( " -- Multiclass logloss on validation set: {:.5f}.".format(score))
# print (pca.explained_variance_ratio_) #---------------------End of Snippet #---------------------Snippet 3, PCA followed by a linear SVM classification # Using PCA pca_clf = svm.SVC(probability=True) pca_clf.fit(PCA_train_data, train_labels) pca_pResults = pca_clf.predict(PCA_test_data) pca_pResults_prob = pca_clf.predict_proba(PCA_test_data) print pca_pResults # Using LDA lda_pResults = lda_clf.predict(test_data) lda_pResults_prob = lda_clf.predict_proba(test_data) #---------------------Check the order of the classes, make sure we use it correctly print 'the class order of lda is: ', lda_clf.classes_ print 'the class order of pca is: ', pca_clf.classes_ #---------------------Training of DBN ''' dbn = DBN( [train_data.shape[1], 30, 30,2], learn_rates = 0.35, learn_rate_decays = 1, epochs = 30, verbose = 1, dropouts = 0.04, )
# print "-----------" # print X[2] # print "-----------" # model = LogisticRegression(penalty='l2', dual=True, tol=0.0001, # C=1, fit_intercept=True, intercept_scaling=1.0, # class_weight=None, random_state=None) model = LDA() print "Trying to construct a LDA classifier" print "Wrote the model for the LDA classifier" # print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc')) print "Retrain on all training data, predicting test labels...\n" # X_dense=X.todense() model.fit(X,y) print "Model Fitted" # X_test_dense=X_test.todense() result = model.predict_proba(X_test)[:,1] print "Model Predicted" output = pd.DataFrame( data={"id":test["id"], "sentiment":result} ) print "Model outputted" # Use pandas to write the comma-separated output file output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_lda.csv'), index=False, quoting=3) print "Wrote results to Bag_of_Words_model.csv"
gmmModels[cls].fit(classFeatures) # ====================== test GMM =========================== gmmScores = np.zeros((testData.shape[0], numClasses)) for cls in xrange(numClasses): gmmScores[:, cls] = gmmModels[cls].score(testFeatures) # =========================== KNN model ======================= knnModel = KNeighborsClassifier(n_neighbors=5) knnModel.fit(trainFeatures, truth) knnScores = knnModel.predict_proba(testFeatures) # =========================== FDA model ======================= fdaModel = LDA() fdaModel.fit(trainFeatures, truth) fdaScores = fdaModel.predict_proba(testFeatures) # ======================== Build hybrid ======================= scores = np.log(gmmScores) + np.log(knnScores) + np.log(fdaScores) targetScores = np.amax(scores, axis = 1) targetClass = np.argmax(scores, axis = 1) outFile = 'output.csv' with open(outFile, 'w') as f: f.write('ISIN, Risk_Stripe\n') for i in xrange(testData.shape[0]): line = 'ISIN{0},Stripe {1}\n' line = line.format(int(testData[i, 0]), int(targetClass[i])) f.write(line) f.close()
# Snippet: per-subject training loop - fit six models (lr1..lr6) per event
# class at three subsampling rates and average their test probabilities;
# models, pred buffers and subsample rates are defined outside this chunk.
X_train = data_preprocess_train(X_train, subject)
X_test = data_preprocess_test(X_test, subject)
for i in range(6):
    y_train = y[:, i]
    print('Train subject %d, class %s' % (subject, cols[i]))
    # three subsampling rates, two models each
    lr1.fit(X_train[::subsample, :], y_train[::subsample])
    lr2.fit(X_train[::subsample2, :], y_train[::subsample2])
    lr3.fit(X_train[::subsample3, :], y_train[::subsample3])
    lr4.fit(X_train[::subsample, :], y_train[::subsample])
    lr5.fit(X_train[::subsample2, :], y_train[::subsample2])
    lr6.fit(X_train[::subsample3, :], y_train[::subsample3])
    pred1[:, i] = lr1.predict_proba(X_test)[:, 1]
    pred2[:, i] = lr2.predict_proba(X_test)[:, 1]
    pred3[:, i] = lr3.predict_proba(X_test)[:, 1]
    pred4[:, i] = lr4.predict_proba(X_test)[:, 1]
    pred5[:, i] = lr5.predict_proba(X_test)[:, 1]
    pred6[:, i] = lr6.predict_proba(X_test)[:, 1]
    # simple average of all six models
    pred[:, i] = (pred1[:, i] + pred2[:, i] + pred3[:, i] + pred4[:, i] + pred5[:, i] + pred6[:, i]) / 6.0
pred_tot.append(pred)
# submission file
submission_file = 'vali1_new_sub.csv'
# create pandas object for sbmission
submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols, data=np.concatenate(pred_tot))
# write file
# Snippet: per-subject training loop - fit four models (lr1..lr4) per event
# class at two subsampling rates and average their test probabilities;
# models and subsample rates are defined outside this chunk.
pred2 = np.empty((X_test.shape[0],6))
pred3 = np.empty((X_test.shape[0],6))
pred4 = np.empty((X_test.shape[0],6))
pred = np.empty((X_test.shape[0],6))
X_train=data_preprocess_train(X_train)
X_test=data_preprocess_test(X_test)
for i in range(6):
    y_train= y[:,i]
    print('Train subject %d, class %s' % (subject, cols_alt[i]))
    # two subsampling rates, two models each
    lr1.fit(X_train[::subsample,:],y_train[::subsample])
    lr2.fit(X_train[::subsample,:],y_train[::subsample])
    lr3.fit(X_train[::subsample2,:],y_train[::subsample2])
    lr4.fit(X_train[::subsample2,:],y_train[::subsample2])
    pred1[:,i] = lr1.predict_proba(X_test)[:,1]
    pred2[:,i] = lr2.predict_proba(X_test)[:,1]
    pred3[:,i] = lr3.predict_proba(X_test)[:,1]
    pred4[:,i] = lr4.predict_proba(X_test)[:,1]
    # simple average of all four models
    pred[:,i]=(pred1[:,i]+pred2[:,i]+pred3[:,i]+pred4[:,i])/4
    #pred[:,i]=(pred1[:,i]+pred2[:,i])/2
#predictions = pred[:, 0:6]
pred_tot.append(pred)
# create pandas object for sbmission
submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols, data=np.concatenate(pred_tot))
# Snippet: compare LDA, decision tree and random forest (accuracy and AUC)
# on a 90/10 split of df, predicting the '2015h' column.
y=df['2015h']
X=df.drop(['2015h'],axis=1)
#build new train and test sets
train,test=train_test_split(df,train_size=.9)
y_train=train['2015h']
x_train=train.drop('2015h',axis=1)
y_test=test['2015h']
x_test=test.drop('2015h',axis=1)
#LDA Accuracy
lda_classifier = LDA(n_components=2)
lda_x_axis = lda_classifier.fit(x_train, y_train).transform(x_train)
lda_classifier.score(x_test, y_test, sample_weight=None)  # NOTE(review): returned score is discarded
#Get AUC for test
proba=pd.DataFrame(lda_classifier.predict_proba(x_test))[1]  # P(class 1)
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)  # NOTE(review): AUC value is discarded
#Decision Tree Accuracy
dt = DecisionTreeClassifier(class_weight='balanced')
dt.fit(x_train,y_train)
dt.score(x_test,y_test)
proba=pd.DataFrame(dt.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)
#Random Forest Accuracy (okay baseline)
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(x_train,y_train)
rf.score(x_test,y_test)
plt.show() # return the training and testing scores on each parameter value return train_scores, test_scores ########FOCUSING ON LOGISTIC REGRESSION AND LDA TEST DATA######################## from sklearn.linear_model import LogisticRegressionCV logregCV = LogisticRegressionCV(cv= 10, solver = 'lbfgs', penalty = 'l2').fit(train_standardized, target) logCV_acc = logregCV.scores_ y_pred = logregCV.predict_proba(test_standardized) ldaC = LDA().fit(train_standardized, target) y_pred = ldaC.predict_proba(test_standardized) ad_fit = ad(n_estimators = 10).fit(train_standardized, target) y_pred = ad_fit.predict_proba(test_standardized) rf_fit = rf(random_state=99).fit(train_standardized, target) splitSizes = list(range(1,10,1)) train_scores, test_scores = calc_params(train_standardized, target, rf_fit, splitSizes, 'min_samples_leaf', 5, metric = 'accuracy') pd.DataFrame(np.array([test_scores, splitSizes]).T, columns = ['Test Recall', 'Minimum Split Size']) nEst = range(1, 51, 10) train_scores, test_scores = calc_params(train_standardized, target, rf_fit, nEst, 'n_estimators', 5, metric = 'accuracy') pd.DataFrame(np.array([test_scores, nEst]).T, columns = ['Test Recall', 'Number of Estimators'])
# Snippet: per-subject test prediction where `cl` selects exactly one of
# six models/subsampling rates per event class; models, pred, subx and
# electrode are defined outside this chunk.
X_test = data_preprocess_test(X_test, subject)
for i in range(6):
    y_train = y[:, i]
    print(('Train subject %d, class %s' % (subject, cols[i])))
    # only the branch matching `cl` runs; each pairs a model with a rate
    if cl == 1:
        lr1.fit(X_train[::subsample, :], y_train[::subsample])
        pred[:, i] = lr1.predict_proba(X_test)[:, 1]
    if cl == 2:
        lr2.fit(X_train[::subsample2, :], y_train[::subsample2])
        pred[:, i] = lr2.predict_proba(X_test)[:, 1]
    if cl == 3:
        lr3.fit(X_train[::subsample3, :], y_train[::subsample3])
        pred[:, i] = lr3.predict_proba(X_test)[:, 1]
    if cl == 4:
        lr4.fit(X_train[::subsample, :], y_train[::subsample])
        pred[:, i] = lr4.predict_proba(X_test)[:, 1]
    if cl == 5:
        lr5.fit(X_train[::subsample2, :], y_train[::subsample2])
        pred[:, i] = lr5.predict_proba(X_test)[:, 1]
    if cl == 6:
        lr6.fit(X_train[::subsample3, :], y_train[::subsample3])
        pred[:, i] = lr6.predict_proba(X_test)[:, 1]
    #pred[:,i]=(pred1[:,i]+pred2[:,i]+pred3[:,i]+pred4[:,i]+pred5[:,i]+pred6[:,i])/6.0
pred_tot.append(pred)
# submission file
submission_file = 'cv/try_sub%d_clf%d_trode%d.csv' % (subx, cl, electrode)
# create pandas object for sbmission
# NOTE(review): this call is cut off here and continues past this chunk
submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols,
# Snippet: drop highly correlated features, then time LDA fit/predict and
# report classification metrics and ROC AUC.  Contains IPython %time magics,
# so this chunk only runs inside an IPython session.
# NOTE(review): Index subtraction with "-" is deprecated pandas; .difference() is the modern form
redundant = corr[c][corr[c].abs() > coefficient].index - pd.Index([c]) - add
remove = remove.union(redundant)
print("For correlation coefficient = ", coefficient)
#print(remove)
#print(add)
train_data = pd.DataFrame(data=train_data_g, columns = df.columns)[df.columns- remove].values
test_data = pd.DataFrame(data=test_data_g, columns = df.columns)[df.columns- remove].values
print("num of featurs = ", train_data.shape[1])
clf = LDA();
# This gets the time in ipython shell.
print("\n\nModelling time:")
%time clf.fit(train_data, train_labels)
print("Modelling time ends\n\n")
print("\n\nprediction time starts:")
%time predicted_labels = clf.predict(test_data)
print("prediction time ends:\n\n")
#print(classification_report(test_labels, clf.predict(test_data)))
print(classification_report(test_labels, predicted_labels))
print("num of featurs = ", train_data.shape[1])
y_true = test_labels;
y_pred_proba = clf.predict_proba(test_data);
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba[:, 1])
roc_auc = auc(fpr, tpr)
print("ROC AUC =", roc_auc)
    # Tail of an ellipse-drawing helper whose definition is outside this
    # chunk: draw a filled 2-sigma covariance ellipse on `splot`.
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled gaussian at 2 standard deviation
    ell = mpl.patches.Ellipse(mean, 2 * v[0]**0.5, 2 * v[1]**0.5, 180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)

# Snippet: evaluate LDA/QDA posteriors on a 2-D grid and plot the LDA
# decision boundary with class scatter and covariance ellipses.
xx, yy = np.meshgrid(np.linspace(4, 8.5, 200), np.linspace(1.5, 4.5, 200))
X_grid = np.c_[xx.ravel(), yy.ravel()]
zz_lda = lda.predict_proba(X_grid)[:, 1].reshape(xx.shape)
zz_qda = qda.predict_proba(X_grid)[:, 1].reshape(xx.shape)
pl.figure()
splot = pl.subplot(1, 2, 1)
pl.contourf(xx, yy, zz_lda > 0.5, alpha=0.5)  # shade the predicted regions
pl.scatter(X[y == 0, 0], X[y == 0, 1], c='b', label=target_names[0])
pl.scatter(X[y == 1, 0], X[y == 1, 1], c='r', label=target_names[1])
pl.contour(xx, yy, zz_lda, [0.5], linewidths=2., colors='k')  # boundary at P=0.5
plot_ellipse(splot, lda.means_[0], lda.covariance_, 'b')
plot_ellipse(splot, lda.means_[1], lda.covariance_, 'r')
pl.legend()
pl.axis('tight')
pl.title('Linear Discriminant Analysis')
splot = pl.subplot(1, 2, 2)
# Snippet: report the best-scoring model, then fit LDA on the 23 best
# features and probe its probabilities against a custom decision threshold
# (Python 2 fragment; X, y, best, X_test, y_test defined elsewhere).
model_scores=pd.DataFrame({'model':model,'score':score})
for i in model_scores.index:
    if model_scores.score[i]==max(model_scores.score):
        print model_scores.score[i]
        print model_scores.model[i]
#using LDA model without feature selection to predict probablilites, look at confusion matrix
#and plot ROC curve Accuracy= .949571
X2=X[best[0:23]]
X_test2=X_test[best[0:23]]
lda= LDA(n_components=2)
lda_x_axis = lda.fit(X2, y).transform(X2)
lda.score(X_test2, y_test, sample_weight=None)  # NOTE(review): score is discarded
y_pred=lda.predict_proba(X_test2)
proba=pd.DataFrame(y_pred)[1]  # P(class 1)
proba.mean()
#play with the predication threshold to see falsenegative/positive trade off
y_pred2=[]
for i in proba:
    if i>.0553:
        y_pred2.append(1)
    else:
        y_pred2.append(0)
#(true negative) (false positive)
#(false negative) (true positive)
#(786)(207)
#(13)(45)
# 1. Linear Discriminant Analysis from sklearn.lda import LDA # loads the library score_train = np.array([]) score_test = np.array([]) for train_index, test_index in kf: CVTrainFeats, CVTestFeats = TrainFeats[train_index], TrainFeats[test_index] CVTrainLabels, CVTestLabels = TrainLabels[train_index], TrainLabels[ test_index] model = LDA() model.fit(CVTrainFeats, CVTrainLabels) score_train = np.append( score_train, metrics.log_loss(CVTrainLabels, model.predict_proba(CVTrainFeats))) score_test = np.append( score_test, metrics.log_loss(CVTestLabels, model.predict_proba(CVTestFeats))) score = metrics.log_loss(TestLabels, model.predict_proba(TestFeats)) # To make sure we're not overfitting print(("Average CV Training Log loss: %.2f" % np.mean(score_train))) print(("Average CV Testing Log loss: %.2f" % np.mean(score_test))) print(("Testing Log loss: %.2f" % score)) print( metrics.confusion_matrix(CVTestLabels, model.predict(CVTestFeats), labels=[1, 0]))
# Snippet: train a Keras net with early stopping/checkpointing, fit an LDA
# (Fisher) baseline, build a 2-D likelihood combining net output with jet
# kinematics, and plot ROC comparisons (many names defined elsewhere).
h = dl.fit(X_train, y_train, batch_size=512, nb_epoch=20, show_accuracy=True,
           validation_data=(X_test, y_test),
           callbacks = [
               EarlyStopping(verbose=True, patience=6, monitor='val_loss'),
               ModelCheckpoint('./BIGDATASLACNet-weights.h5', monitor='val_loss', verbose=True, save_best_only=True)
           ],
           sample_weight=weights[:n_train])
y_dl = dl.predict(X_, verbose=True).ravel()
from sklearn.lda import LDA
lda = LDA()
lda.fit(X_train, y_train)
# lda.fit(X_[selection], y_[selection])
yld = lda.predict_proba(X_)
yld = yld[:, 1]  # P(signal)
# 2-D likelihood over (log jet-mass/substructure likelihood, net output)
DNN_kin = Likelihood2D(np.linspace(-4, 6.2, 6), np.linspace(0, 1, 50))
DNN_kin.fit((np.log(mass_nsj_likelihood + 1e-6)[signal == 1], y_dl[signal == 1]),
            (np.log(mass_nsj_likelihood + 1e-6)[signal == 0], y_dl[signal == 0]),
            weights=(weights[signal == 1], weights[signal == 0]))
likelihood2 = DNN_kin.predict((np.log(mass_nsj_likelihood + 1e-6), y_dl))
add_curve(r'Deep Net', 'orange', calculate_roc(signal, y_dl, weights=weights, bins=1000000), discs)
add_curve(r'Deep Net + $(m_{\mathrm{jet}}, \tau_{21})$', 'black',
          calculate_roc(signal, likelihood2, weights=weights, bins=1000000), discs)
add_curve(r'FLD', 'green',
          calculate_roc(signal[selection], yld[selection], weights=weights[selection], bins=1000000), discs)
fg = ROC_plotter(discs, title=r"$W' \rightarrow WZ$ vs. QCD Tagging comparison -- match $s \longrightarrow b$." + '\n' + r'Jet $p_T\in[200, 1000]$ $\mathrm{GeV},\vert\eta\vert<2$')
fg.savefig(PLOT_DIR % 'dl-roc.pdf')
# -- small windows..
# Snippet: ensemble RF + LDA + logistic regression per event class on
# downsampled training data; average the three probability estimates and
# append a submission frame (lda, scores_tot, idx etc. defined elsewhere).
rf = RandomForestClassifier(n_estimators=150, n_jobs=-1, criterion="entropy", random_state=1)
lr = LogisticRegression()
X_train, scaler = compute_features(X_train)
X_test = compute_features(X_test, scaler)  #pass the learned mean and std to normalized test data
y = np.concatenate(y,axis=0)
scores = np.empty((X_test.shape[0],6))
downsample = 40  # train on every 40th sample to keep fitting cheap
for i in range(6):
    print('Train subject %d, class %s' % (subject, cols[i]))
    rf.fit(X_train[::downsample,:], y[::downsample,i])
    lda.fit(X_train[::downsample,:], y[::downsample,i])
    lr.fit(X_train[::downsample,:], y[::downsample,i])
    # equal-weight average of the three classifiers' P(class 1)
    scores[:,i] = (rf.predict_proba(X_test)[:,1] + lda.predict_proba(X_test)[:,1] + lr.predict_proba(X_test)[:,1])/3.0
scores_tot.append(scores)
idx_tot.append(np.concatenate(idx))
# create pandas object for submission
submission = pd.DataFrame(index=np.concatenate(idx_tot), columns=cols, data=np.concatenate(scores_tot))
# write file
submission.to_csv(submission_file,index_label='id',float_format='%.3f')
    # Interior of a row-wise encoding loop (loop header is outside this
    # chunk): map categorical iPad listing columns to numeric codes in place.
    X_testKaggle.loc[index,'carrier']=0  # tail of a carrier if/else starting outside this chunk
    if X_testKaggle.loc[index,'color'] in goodcolors:
        X_testKaggle.loc[index,'color']=1
    elif X_testKaggle.loc[index,'color'] in badcolors:
        X_testKaggle.loc[index,'color']=-1
    else :
        X_testKaggle.loc[index,'color']=0
    if X_testKaggle.loc[index,'storage'] in storage:
        X_testKaggle.loc[index,'storage']=storage.get(X_testKaggle.loc[index,'storage'])
    else :
        X_testKaggle.loc[index,'storage']=1
    # productline: split iPad mini / iPad Air into their own indicator columns
    if X_testKaggle.loc[index,'productline'] in iPadlines:
        X_testKaggle.loc[index,'productline']=iPadlines.get(X_testKaggle.loc[index,'productline'])
    elif X_testKaggle.loc[index,'productline'] in iPadminilines:
        X_testKaggle.loc[index,'iPadmini']=iPadminilines.get(X_testKaggle.loc[index,'productline'])
        X_testKaggle.loc[index,'productline']=0
    elif X_testKaggle.loc[index,'productline'] in iPadAirlines:
        X_testKaggle.loc[index,'iPadAir']=iPadAirlines.get(X_testKaggle.loc[index,'productline'])
        X_testKaggle.loc[index,'productline']=0
    elif X_testKaggle.loc[index,'productline']=="Unknown":
        X_testKaggle.loc[index,'productline']=0

# Write LDA probability predictions for the encoded test set.
result=pd.DataFrame(testKaggle, columns=['UniqueID','Probability1'])
# NOTE(review): predict_proba returns one column per class - presumably clf
# is binary and a single column is intended here; verify against caller.
y_pred=clf.predict_proba(X_testKaggle)
result['Probability1']=y_pred
np.savetxt("/home/reddowan/Documents/Kaggle edx MIT/resultLDA.csv",result,delimiter=",",fmt='%9f')
# Spatial filtering train print 'Filtering train data ...' trainFeats = np.empty([K.shape[0], m*2]) for i in range(0,K.shape[0]): aux = np.dot( np.dot(W.T, K[i,:,:]), W ) trainFeats[i,:] = ( np.diag(aux) ) / np.trace(aux) # Spatial filtering test print 'Filtering test data ...' testFeats = np.empty([Ktest.shape[0], m*2]) for i in range(0,Ktest.shape[0]): aux = np.dot( np.dot(W.T, Ktest[i,:,:]), W ) testFeats[i,:] = ( np.diag(aux) ) / np.trace(aux) # Classification print 'Classification ...' clf = LDA() clf.fit(trainFeats, labels) predictedProb = clf.predict_proba(testFeats) predictedProb = predictedProb[:,1] # Generate submission submission = {'ID' : testIDs, 'probability' : predictedProb } submission = pd.DataFrame(submission) submission.to_csv(outputFile, index = 0, float_format='%11.6f')
### Testing testing_data = data[testing_idx, :] testing_label = label[testing_idx] # Declare the random forest #crf = RandomForestClassifier(n_estimators=100, n_jobs=n_jobs) #crf = AdaBoostClassifier(n_estimators=100) #crf = LinearSVC() crf = LDA() # Train the classifier crf.fit(training_data, training_label) # Test the classifier pred_labels = crf.predict(testing_data) pred_probs = crf.predict_proba(testing_data) #pred_probs = crf.decision_function(testing_data) # Compute the confusion matrix cm = confusion_matrix(testing_label, pred_labels) # Compute the sensitivity and specificity sens = float(cm[1, 1]) / float(cm[1, 1] + cm[1, 0]) spec = float(cm[0, 0]) / float(cm[0, 0] + cm[0, 1]) sens_fold.append(sens) spec_fold.append(spec) # Compute the roc curve roc_exp = roc_curve(testing_label, pred_probs[:, 1]) auc_exp = roc_auc_score(testing_label, pred_probs[:, 1]) #roc_exp = roc_curve(testing_label, pred_probs) #auc_exp = roc_auc_score(testing_label, pred_probs)
# Snippet: per-subject training loop - fit six models (lr1..lr6) per event
# class at three subsampling rates, average their test probabilities, and
# write the 'vali30' submission (models and buffers defined elsewhere).
X_train=data_preprocess_train(X_train,subject)
X_test=data_preprocess_test(X_test,subject)
for i in range(6):
    y_train= y[:,i]
    print('Train subject %d, class %s' % (subject, cols[i]))
    # three subsampling rates, two models each
    lr1.fit(X_train[::subsample,:],y_train[::subsample])
    lr2.fit(X_train[::subsample2,:],y_train[::subsample2])
    lr3.fit(X_train[::subsample3,:],y_train[::subsample3])
    lr4.fit(X_train[::subsample,:],y_train[::subsample])
    lr5.fit(X_train[::subsample2,:],y_train[::subsample2])
    lr6.fit(X_train[::subsample3,:],y_train[::subsample3])
    pred1[:,i] = lr1.predict_proba(X_test)[:,1]
    pred2[:,i] = lr2.predict_proba(X_test)[:,1]
    pred3[:,i] = lr3.predict_proba(X_test)[:,1]
    pred4[:,i] = lr4.predict_proba(X_test)[:,1]
    pred5[:,i] = lr5.predict_proba(X_test)[:,1]
    pred6[:,i] = lr6.predict_proba(X_test)[:,1]
    # simple average of all six models
    pred[:,i]=(pred1[:,i]+pred2[:,i]+pred3[:,i]+pred4[:,i]+pred5[:,i]+pred6[:,i])/6.0
pred_tot.append(pred)
# submission file
submission_file = 'vali30_new_sub.csv'
# create pandas object for sbmission
submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols, data=np.concatenate(pred_tot))
# write file
submission.to_csv(submission_file,index_label='id',float_format='%.3f')
        # Interior of a file-parsing loop (headers outside this chunk):
        # decode libsvm-style "idx:val" tokens into dense feature vector f.
        for i in range(1,len(tokens)):
            ind = int(tokens[i].split(':')[0])
            val = float(tokens[i].split(':')[1])
            f[ind - 1] = val  # feature indices are 1-based in the input file
        features[dataset].append(f)

# Train either a one-vs-rest bank of binary LDAs (useBinary) or a single
# multiclass LDA, then predict on the test split.
classes = set(labels['train'])
if useBinary:
    prediction=[]
    proba=[]
    #train classifier
    for c in classes:
        # NOTE(review): positional ldasolver lands on LDA's first parameter (solver)
        lda = LDA(ldasolver)
        lda.fit(features['train'],np.array(labels['train'])==c)
        #test classifier
        p = np.array(lda.predict_proba(features['test']))
        proba.append(p[:,1])
    proba=np.transpose(np.array(proba))
    prediction=np.argmax(proba,axis=1)+1  # classes presumably numbered 1..N - verify
else:
    #train classifier
    lda = LDA(ldasolver)
    lda.fit(features['train'],labels['train'])
    #test classifier
    prediction = lda.predict(features['test'])
    proba = lda.predict_proba(features['test'])
    print('Accuracy %.2f%%' % lda.score(features['test'],labels['test']))
#output data
file = open(outputFile,'w')
# Snippet: report completeness/contamination, evaluate the second
# classifier's posterior on a 2-D grid, and start plotting the decision
# boundary (classifiers, X, y, N_plot defined outside this chunk).
completeness, contamination = completeness_contamination(predictions, y_test)
print("completeness", completeness)
print("contamination", contamination)
#------------------------------------------------------------
# Compute the decision boundary
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)
xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71),
                     np.linspace(ylim[0], ylim[1], 81))
# note the (yy, xx) column order fed to the classifier
Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])
Z = Z[:, 1].reshape(xx.shape)  # P(class 1) over the grid
#----------------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2)
# left plot: data and decision boundary
ax = fig.add_subplot(121)
im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2)
im.set_clim(-0.5, 1)
# NOTE(review): this call is cut off here and continues past this chunk
im = ax.imshow(Z, origin='lower', aspect='auto',
kf = cross_validation.KFold(len(TrainLabels), n_folds=5, shuffle = False, random_state = 123) ## ---------------------------------------------------------------------------- # 1. Linear Discriminant Analysis from sklearn.lda import LDA # loads the library score_train = np.array([]) score_test = np.array([]) for train_index, test_index in kf: CVTrainFeats, CVTestFeats = TrainFeats[train_index], TrainFeats[test_index] CVTrainLabels, CVTestLabels = TrainLabels[train_index], TrainLabels[test_index] model = LDA() model.fit(CVTrainFeats, CVTrainLabels) score_train = np.append(score_train,metrics.log_loss(CVTrainLabels, model.predict_proba(CVTrainFeats))) score_test = np.append(score_test,metrics.log_loss(CVTestLabels, model.predict_proba(CVTestFeats))) score = metrics.log_loss(TestLabels, model.predict_proba(TestFeats)) # To make sure we're not overfitting print("Average CV Training Log loss: %.2f" % np.mean(score_train)) print("Average CV Testing Log loss: %.2f" % np.mean(score_test)) print("Testing Log loss: %.2f" % score) print metrics.confusion_matrix(CVTestLabels,model.predict(CVTestFeats),labels = [1,0]) ## ---------------------------------------------------------------------------- # 2. Logistic Regression from sklearn.linear_model import LogisticRegression
# Snippet: ensemble LDA + random forest + logistic regression per event
# class and append the averaged predictions (models, pred1..pred3, ids_tot
# defined outside this chunk).
pred = np.empty((X_test.shape[0],6))
X_train=data_preprocess_train(X_train)
X_test=data_preprocess_test(X_test)
for i in range(6):
    y_train= y[:,i]
    print('Train subject %d, class %s' % (subject, cols[i]))
    # Fit models
    lda.fit(X_train,y_train)
    rf.fit(X_train, y_train)
    lr2.fit(X_train,y_train)
    # Grab predictions
    pred1[:,i] = lda.predict_proba(X_test)[:,1]
    pred2[:,i] = rf.predict_proba(X_test)[:,1]
    pred3[:,i] = lr2.predict_proba(X_test)[:,1]
    # Ensemble!
    pred[:,i]=(pred1[:,i] + pred2[:,i] + pred3[:,i])/3
pred_tot.append(pred)
# submission file
#lda_file = 'lda.csv'
lda_file = 'lda_rf.csv'
# create pandas object for sbmission
# NOTE(review): rebinds `lda` from the classifier to a DataFrame; the call
# is cut off here and continues past this chunk
lda = pd.DataFrame(index=np.concatenate(ids_tot),
def plot_ellipse(splot, mean, cov, color):
    # Draw a filled 2-sigma ellipse for a 2-D Gaussian (mean, cov) on axes `splot`.
    v, w = linalg.eigh(cov)  # eigenvalues (axis variances) and eigenvectors (axis directions)
    u = w[0] / linalg.norm(w[0])  # unit vector along the first principal axis
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled gaussian at 2 standard deviation
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)

# Dense grid over the feature plane for probability contours.
xx, yy = np.meshgrid(np.linspace(4, 8.5, 200), np.linspace(1.5, 4.5, 200))
X_grid = np.c_[xx.ravel(), yy.ravel()]
# Posterior probability of class 1 over the grid for each classifier.
zz_lda = lda.predict_proba(X_grid)[:, 1].reshape(xx.shape)
zz_qda = qda.predict_proba(X_grid)[:, 1].reshape(xx.shape)

pl.figure()
splot = pl.subplot(1, 2, 1)
# Left panel: LDA decision regions (P > 0.5), data points, the 0.5
# decision boundary, and the per-class covariance ellipses.
pl.contourf(xx, yy, zz_lda > 0.5, alpha=0.5)
pl.scatter(X[y == 0, 0], X[y == 0, 1], c='b', label=target_names[0])
pl.scatter(X[y == 1, 0], X[y == 1, 1], c='r', label=target_names[1])
pl.contour(xx, yy, zz_lda, [0.5], linewidths=2., colors='k')
plot_ellipse(splot, lda.means_[0], lda.covariance_, 'b')
plot_ellipse(splot, lda.means_[1], lda.covariance_, 'r')
pl.legend()
pl.axis('tight')
pl.title('Linear Discriminant Analysis')

# Right panel (QDA) follows.
splot = pl.subplot(1, 2, 2)
# Assemble this subject's test matrix: concatenate chunks, keep the ids
# separately, drop the id column, and cast features to float.
X_test = pd.concat(test)
all_ids.append(np.concatenate(idx))
X_test = X_test.drop(['id'], axis=1)
X_test = np.asarray(X_test.astype(float))

# One probability column per label (6 labels) for each classifier.
current_prediction_lda = np.empty((X_test.shape[0], 6))  # number of test samples X number of labels
current_prediction_lr = np.empty((X_test.shape[0], 6))  # number of test samples X number of labels
current_prediction_qda = np.empty((X_test.shape[0], 6))  # number of test samples X number of labels
X_test = data_preprocess_test(X_test)
for i in range(6):
    print 'testing subject_id=', subject_id
    # NOTE(review): the same fitted lr/qda/lda objects fill every label
    # column i here -- presumably they were refit per label upstream;
    # confirm, otherwise all 6 columns are identical.
    current_prediction_lr[:, i] = lr.predict_proba(X_test)[:, 1]
    current_prediction_qda[:, i] = qda.predict_proba(X_test)[:, 1]
    current_prediction_lda[:, i] = lda.predict_proba(X_test)[:, 1]
    # print 'predicted:',current_prediction[:,i]
all_predictions_lda.append(current_prediction_lda)
all_predictions_qda.append(current_prediction_qda)
all_predictions_lr.append(current_prediction_lr)
# Simple average ensemble of the three classifiers.
all_predictions_avg.append((current_prediction_lda + current_prediction_qda + current_prediction_lr) / 3)
print 'testing complete'
print 'ids ', np.concatenate(all_ids).shape
print 'predictions ', np.concatenate(all_predictions_avg).shape
# RNN evaluation: per-sample confidence plus standard metrics.
rnn_pred_proba = np.max(rnn_pre, axis=1)
rnn_pred_proba[1000:] = 0  # NOTE(review): zeroes confidences past index 1000 -- confirm why
#print ('RNN AUC: ',str(metrics.roc_auc_score(y_test,rnn_pre)))
print('RNN ACC: ', str(acc))
print('RNN Recall for each class: ', str(metrics.recall_score(y_test, rnn_pred, pos_label=1, average=None)))
print('RNN F1-score for each class: ', str(metrics.f1_score(y_test, rnn_pred, average=None)))
print('RNN Precesion for each class: ', str(metrics.precision_score(y_test, rnn_pred, average=None)))
# NOTE(review): result discarded unless this runs in a notebook cell.
metrics.confusion_matrix(y_test, rnn_pred)

# -------------------- LDA prediction --------------------
clf = LDA()
clf.fit(x_train, y_train)
lda_pre = clf.predict_proba(x_test)
lda_pred = np.argmax(lda_pre, axis=1)  # predicted class = argmax over class probabilities
print('lda ACC: ', str(metrics.accuracy_score(y_test, lda_pred)))
print('lda Recall for each class: ', str(metrics.recall_score(y_test, lda_pred, average=None)))
print('lda F1-score for each class: ', str(metrics.f1_score(y_test, lda_pred, average=None)))
print('lda Precesion for each class: ', str(metrics.precision_score(y_test, lda_pred, average=None)))
# NOTE(review): result discarded unless this runs in a notebook cell.
metrics.confusion_matrix(y_test, lda_pred)

# -------------------- Naive Bayes prediction --------------------
gnb = GaussianNB()
gnb.fit(x_train, y_train)
Bayes_pre = gnb.predict_proba(x_test)
pred = np.empty((X_test.shape[0], 6)) X_train = data_preprocess_train(X_train) X_test = data_preprocess_test(X_test) for i in range(6): y_train = y[:, i] print('Train subject %d, class %s' % (subject, cols[i])) # Fit models lda.fit(X_train, y_train) rf.fit(X_train, y_train) lr2.fit(X_train, y_train) # Grab predictions pred1[:, i] = lda.predict_proba(X_test)[:, 1] pred2[:, i] = rf.predict_proba(X_test)[:, 1] pred3[:, i] = lr2.predict_proba(X_test)[:, 1] # Ensemble! pred[:, i] = (pred1[:, i] + pred2[:, i] + pred3[:, i]) / 3 pred_tot.append(pred) # submission file #lda_file = 'lda.csv' lda_file = 'lda_rf.csv' # create pandas object for sbmission lda = pd.DataFrame(index=np.concatenate(ids_tot),
X_test=data_preprocess_test(X_test,subject) for i in range(6): y_train= y[:,i] print('Train subject %d, class %s' % (subject, cols[i])) if cl==1: lr1.fit(X_train[::subsample,:],y_train[::subsample]) pred[:,i] = lr1.predict_proba(X_test)[:,1] if cl==2: lr2.fit(X_train[::subsample2,:],y_train[::subsample2]) pred[:,i] = lr2.predict_proba(X_test)[:,1] if cl==3: lr3.fit(X_train[::subsample3,:],y_train[::subsample3]) pred[:,i] = lr3.predict_proba(X_test)[:,1] if cl==4: lr4.fit(X_train[::subsample,:],y_train[::subsample]) pred[:,i] = lr4.predict_proba(X_test)[:,1] if cl==5: lr5.fit(X_train[::subsample2,:],y_train[::subsample2]) pred[:,i] = lr5.predict_proba(X_test)[:,1] if cl==6: lr6.fit(X_train[::subsample3,:],y_train[::subsample3]) pred[:,i] = lr6.predict_proba(X_test)[:,1] #pred[:,i]=(pred1[:,i]+pred2[:,i]+pred3[:,i]+pred4[:,i]+pred5[:,i]+pred6[:,i])/6.0 pred_tot.append(pred) # submission file submission_file = 'cv/try_sub%d_clf%d_trode%d.csv'%(subx,cl,electrode) # create pandas object for sbmission submission = pd.DataFrame(index=np.concatenate(ids_tot), columns=cols,
# Per-class imputation: fill NaNs in each class's training rows with that
# class's per-column mode.
for cls in xrange(numClasses):
    classData = trainData[truth == cls, :]
    # NOTE(review): scipy.stats.mode returns a ModeResult (modes, counts);
    # np.take's flat indexing happens to read from the modes row because
    # ids[1] are column indices < n_cols -- confirm for the installed scipy.
    col_mean = scipy.stats.mode(classData, axis=0)
    ids = np.where(np.isnan(classData))
    classData[ids] = np.take(col_mean, ids[1])
    trainData[truth == cls, :] = classData

# Test-set NaNs are filled with the mode over ALL training rows (test
# labels are unknown, so no per-class fill is possible).
col_mean = scipy.stats.mode(trainData, axis=0)
ids = np.where(np.isnan(testData))
testData[ids] = np.take(col_mean, ids[1])

# Column 0 is the ID; training also drops column 19.
# NOTE(review): testFeatures keeps every column after the ID while
# trainFeatures skips column 19 -- check for a feature-count mismatch.
trainFeatures = np.hstack((trainData[:, 1:19], trainData[:, 20:]))
testFeatures = testData[:, 1:]

# =========================== FDA model =======================
model = LDA()
model.fit(trainFeatures, truth)
scores = model.predict_proba(testFeatures)
targetScores = np.amax(scores, axis=1)   # confidence of the winning class
targetClass = np.argmax(scores, axis=1)  # predicted class index

# Write one "ISIN<id>,Stripe <class>" row per test sample.
outFile = 'output.csv'
with open(outFile, 'w') as f:
    f.write('ISIN, Risk_Stripe\n')
    for i in xrange(testData.shape[0]):
        line = 'ISIN{0},Stripe {1}\n'
        line = line.format(int(testData[i, 0]), int(targetClass[i]))
        f.write(line)
    f.close()  # redundant: the with-block already closes the file
""" print '../Output/gda_results_p'+str(numParticles)+'_L'+str(Lambda)+'_m'+str(multiplier)+'_tjets.txt' f = open('../Output/gda_results_p'+str(numParticles)+'_L'+str(Lambda)+'_m'+str(multiplier)+'_tjets.txt','w') testError = 0 truthFraction = 0 testHSError = 0 testPUError = 0 truthpt = 0 totalpt = 0 HSpt = 0 totalPUpt = 0 totalPUptretained = 0 for i in range(len(test_data)): predictFalse, predictTrue = clf.predict_proba(test_data[i])[0] pt = test_data[i][ptIndex] y = test_truth[i] sign = 1 if predictTrue > 0.5 else -1 prediction = predictTrue * sign margin = prediction*y if y>0: truthFraction +=1 truthpt+=pt if y<0: totalPUpt+=pt if prediction>0: totalPUptretained+=pt if prediction>0: totalpt+=pt
completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1],
#scores3 = np.empty((X_test.shape[0],6)) #scores4 = np.empty((X_test.shape[0],6)) #scores5 = np.empty((X_test.shape[0],6)) downsample = 20 # test SVM for 2 first subjects if subject in subjects: for i in range(6): print('Train subject %d, class %s' % (subject, cols[i])) #rf.fit(X_train[::downsample,:], y[::downsample,i]) lda.fit(X_train[::, :], y[::, i]) #lr.fit(X_train[::downsample,:], y[::downsample,i]) #clf.fit(X_train[::downsample,:], y[::downsample,i]) #scores1[:,i] = rf.predict_proba(X_test)[:,1] scores2[:, i] = lda.predict_proba(X_test)[:, 1] #scores3[:,i] = lr.predict_proba(X_test)[:,1] #scores4[:,i] = clf.predict_proba(X_test)[:,1] #scores5[:,i] = clf.predict(X_test)[:,1] #scores_tot1.append(scores1) scores_tot2.append(scores2) #scores_tot3.append(scores3) #scores_tot4.append(scores4) #scores_tot5.append(scores4) idx_tot.append(np.concatenate(idx)) #%%########### submission file ################################################ submission_file = 'models/model2_ds0_low2_band1_test1.csv' # create pandas object for submission submission = pd.DataFrame(index=np.concatenate(idx_tot),
#scores3 = np.empty((X_test.shape[0],6)) #scores4 = np.empty((X_test.shape[0],6)) #scores5 = np.empty((X_test.shape[0],6)) downsample = 20 # test SVM for 2 first subjects if subject in subjects: for i in range(6): print('Train subject %d, class %s' % (subject, cols[i])) #rf.fit(X_train[::downsample,:], y[::downsample,i]) lda.fit(X_train[::,:], y[::,i]) #lr.fit(X_train[::downsample,:], y[::downsample,i]) #clf.fit(X_train[::downsample,:], y[::downsample,i]) #scores1[:,i] = rf.predict_proba(X_test)[:,1] scores2[:,i] = lda.predict_proba(X_test)[:,1] #scores3[:,i] = lr.predict_proba(X_test)[:,1] #scores4[:,i] = clf.predict_proba(X_test)[:,1] #scores5[:,i] = clf.predict(X_test)[:,1] #scores_tot1.append(scores1) scores_tot2.append(scores2) #scores_tot3.append(scores3) #scores_tot4.append(scores4) #scores_tot5.append(scores4) idx_tot.append(np.concatenate(idx)) #%%########### submission file ################################################ submission_file = 'models/model2_ds0_low2_band1_test1.csv' # create pandas object for submission submission = pd.DataFrame(index=np.concatenate(idx_tot),
def predict(train_X, train_y, test_X):
    """Fit an LDA classifier on (train_X, train_y) and return the class
    probabilities for test_X (one column per class)."""
    model = LDA()
    # sklearn estimators return self from fit(), so fit and score can chain.
    return model.fit(train_X, train_y).predict_proba(test_X)