def model(x_train, x_test, y_train, y_test):
    """Fit LDA and L2 logistic regression and report their ROC AUCs.

    Returns ``(auc_train_1, auc_test_1, auc_train_2, auc_test_2)`` where the
    ``_1`` values belong to the LDA model and the ``_2`` values to the
    logistic regression. All four are NaN when the first column of
    ``x_train`` contains a single distinct value.
    """
    # Degenerate first feature: skip fitting altogether.
    if len(set(x_train[:, 0])) == 1:
        return np.nan, np.nan, np.nan, np.nan

    def _roc_auc(estimator, features, labels):
        # Score with the probability of the second class, positive label 1.
        scores = estimator.predict_proba(features)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=1)
        return metrics.auc(fpr, tpr)

    lda = LinearDiscriminantAnalysis()
    logreg = LogisticRegression(penalty='l2', solver='liblinear', C=1)
    lda.fit(x_train, y_train)
    logreg.fit(x_train, y_train)

    return (
        _roc_auc(lda, x_train, y_train),
        _roc_auc(lda, x_test, y_test),
        _roc_auc(logreg, x_train, y_train),
        _roc_auc(logreg, x_test, y_test),
    )
def fiteando(ResultadosX_train, ResultadosX_test, y_train, y_test, true):
    """Best F1 of an LDA trained on the first ``i`` features, for i in 3..39.

    For every feature count an LDA is fitted on the leading ``i`` columns of
    ``ResultadosX_train``; the precision/recall curve is computed on both
    splits with ``true`` as the positive label and the maximum F1 along each
    curve is recorded.

    Parameters
    ----------
    ResultadosX_train, ResultadosX_test : 2-D arrays sliced as ``[:, 0:i]``.
    y_train, y_test : labels for the two splits.
    true : label treated as positive. The probability column is 0 when
        ``true == 0`` and 1 otherwise, i.e. labels are assumed to be 0/1.

    Returns
    -------
    (F1_train, F1_test) : two lists with one best-F1 value per feature count.
    """

    def _best_f1(labels, scores):
        precision, recall, _ = precision_recall_curve(
            labels, scores, pos_label=true)
        denom = precision + recall
        # precision == recall == 0 used to give 0/0 = NaN, and argmax then
        # selected that NaN as the "best" score. Define F1 as 0 there.
        f1 = np.divide(2 * (precision * recall), denom,
                       out=np.zeros_like(denom, dtype=float),
                       where=denom > 0)
        return f1.max()

    proba_col = 0 if true == 0 else 1
    F1_train = []
    F1_test = []
    for i in range(3, 40):
        train_i = ResultadosX_train[:, 0:i]
        test_i = ResultadosX_test[:, 0:i]
        clf = LinearDiscriminantAnalysis()
        clf.fit(train_i, y_train)
        F1_train.append(
            _best_f1(y_train, clf.predict_proba(train_i)[:, proba_col]))
        F1_test.append(
            _best_f1(y_test, clf.predict_proba(test_i)[:, proba_col]))
    return F1_train, F1_test
class myLDABinary(myModel):
    """Thin adapter around sklearn's LinearDiscriminantAnalysis.

    DataFrame inputs are cast to float32 before being handed to the wrapped
    estimator; other inputs are passed through unchanged.
    """

    def make(self, make_params):
        """Create the wrapped estimator from ``make_params`` and return self."""
        self.model = LinearDiscriminantAnalysis(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the wrapped estimator; ``xtest``/``ytest`` are accepted but unused.

        ``fit_params`` is forwarded as keyword arguments to the estimator's
        ``fit``; ``None`` means no extra arguments.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        fit_params = {} if fit_params is None else fit_params
        if isinstance(xtrain, pd.DataFrame):
            self.model.fit(xtrain.astype('float32'),
                           ytrain.astype('float32'),
                           **fit_params)
        else:
            self.model.fit(xtrain, ytrain, **fit_params)

    def predict(self, xs, threshold=0.5):
        """Return predicted labels; ``threshold`` is accepted but not used."""
        if isinstance(xs, pd.DataFrame):
            return self.model.predict(xs.astype('float32'))
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return class probabilities for ``xs``.

        NOTE(review): the return shape depends on the input type. A
        DataFrame yields only the second probability column, while array
        input yields the full probability matrix. Kept as-is because
        callers may rely on it; confirm whether this is intended.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict_proba(xs.astype('float32'))[:, 1]
        if len(xs.shape) == 1:
            # A single sample given as a flat vector becomes one row.
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
def linear_discriminant_analysis(x_train, y_train, x_test, y_test,
                                 n_components=2, compute_threshold=True):
    '''
    Train Linear Discriminant Analysis (LDA) classifier on x_train and predict on x_test.

    x_train, x_test: DataFrames of shape data x features.
    n_components: Number of components for dimensionality reduction. The
        estimator only accepts values <= min(n_classes - 1, n_features), so
        a larger request is clipped to that bound instead of failing in fit.
    compute_threshold: when exactly True, labels are assigned by comparing
        the second probability column with the threshold returned by
        get_best_thresh on the training data; otherwise model.predict is used.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    if n_components is not None:
        # The default of 2 is too large for a two-class problem; clip it.
        upper = min(len(np.unique(y_train)) - 1, np.shape(x_train)[1])
        n_components = min(n_components, max(1, upper))

    model = LinearDiscriminantAnalysis(priors=None, n_components=n_components)
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train,
                                   plot=False)
    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)
        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh,
                            defs.posCode, defs.negCode)
    else:
        predTest = model.predict(x_test)
    return predTest, metricsCV, model
class Ensamble_LDA(object):
    """LDA over per-channel projections onto two stored templates.

    Each of the ``N_CHANNELS`` channels is projected (dot product) onto the
    normalised ``ERP`` and ``non_ERP`` templates, giving two features per
    channel; the wrapped LDA is trained and queried on those features.
    """

    # Number of channels read from axis 1 of the input.
    N_CHANNELS = 8
    # Probability of the second class above which predict() returns 1.
    DECISION_THRESHOLD = 0.7

    def __init__(self):
        self.model = LDA()
        self.ERP = self._load_unit_template('../erp.npy')
        self.non_ERP = self._load_unit_template('../non_erp.npy')

    @staticmethod
    def _load_unit_template(path):
        """Load a template from ``path`` and scale it in place to unit norm."""
        template = np.load(path)
        template /= np.linalg.norm(template)
        return template

    def _project(self, X):
        """Return the template-projection feature matrix for ``X``."""
        features = []
        for channel in range(self.N_CHANNELS):
            features.append(np.dot(X[:, channel, :], self.ERP))
            features.append(np.dot(X[:, channel, :], self.non_ERP))
        return np.dstack(features)[0]

    def fit(self, X, y, *args, **kwargs):
        """Fit the LDA on the projected features; extra arguments are ignored."""
        self.model.fit(self._project(X), y)

    def predict(self, X):
        """Return 1 when the second-class probability exceeds the threshold."""
        return 1 if self.predict_proba(X) > self.DECISION_THRESHOLD else 0

    def predict_proba(self, X):
        """Return the second-class probability for one trial.

        ``X`` is reshaped to ``(1, 8, SAMPLING_RATE)`` before projection.
        """
        X = X.reshape(1, self.N_CHANNELS, SAMPLING_RATE)
        return self.model.predict_proba(self._project(X))[0][1]

    def __repr__(self):
        return 'Ensamble_LDA'

    __str__ = __repr__
def test_lda_predict():
    """LDA classifies the toy data and rejects invalid solver/shrinkage."""
    for solver, shrinkage in solver_shrinkage:
        msg = "solver %s" % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        # Plain fit/predict on the toy data.
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, msg)

        # Same check on the single-feature data.
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, msg)

        # Probabilities must agree with the labels and with the log version.
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, msg)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1,
                                  8, msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors.
        # LDA shouldn't be able to separate those.
        y_pred3 = clf.fit(X, y3).predict(X)
        assert_true(np.any(y_pred3 != y3), msg)

    # Invalid shrinkages first, then an unknown solver.
    rejected = [
        (dict(solver="lsqr", shrinkage=-0.2231), ValueError),
        (dict(solver="eigen", shrinkage="dummy"), ValueError),
        (dict(solver="svd", shrinkage="auto"), NotImplementedError),
        (dict(solver="dummy"), ValueError),
    ]
    for params, expected_error in rejected:
        clf = LinearDiscriminantAnalysis(**params)
        assert_raises(expected_error, clf.fit, X, y)
def test_lda_predict():
    """LDA classifies the toy data and rejects conflicting covariance options."""
    for solver, shrinkage in solver_shrinkage:
        msg = "solver %s" % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        # Plain fit/predict on the toy data.
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, msg)

        # Same check on the single-feature data.
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, msg)

        # Probabilities must agree with the labels and with the log version.
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, msg)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
                        rtol=1e-6, atol=1e-6, err_msg=msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors.
        # LDA shouldn't be able to separate those.
        y_pred3 = clf.fit(X, y3).predict(X)
        assert np.any(y_pred3 != y3), msg

    # Shrinkage together with the svd solver.
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    # shrinkage and covariance_estimator given at the same time.
    both_set_msg = ("covariance_estimator and shrinkage "
                    "parameters are not None. "
                    "Only one of the two can be set.")
    clf = LinearDiscriminantAnalysis(
        solver="lsqr",
        shrinkage=0.1,
        covariance_estimator=ShrunkCovariance(),
    )
    with pytest.raises(ValueError, match=both_set_msg):
        clf.fit(X, y)

    # test bad solver with covariance_estimator
    clf = LinearDiscriminantAnalysis(
        solver="svd", covariance_estimator=LedoitWolf())
    with pytest.raises(
            ValueError,
            match="covariance estimator is not supported with svd"):
        clf.fit(X, y)

    # test bad covariance estimator
    not_a_covariance = KMeans(n_clusters=2, n_init="auto")
    clf = LinearDiscriminantAnalysis(
        solver="lsqr", covariance_estimator=not_a_covariance)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def train_eval_pca_LDA(args, config, train_xs, train_ys, test_xs, test_ys,
                       reduced_dim):
    """Project standardised data onto leading SVD directions and score an LDA.

    ``args`` and ``config`` are accepted but not used. Returns
    ``(goodness_of_fit, accuracy, auroc)``: the per-component ratio derived
    from the singular values, the test accuracy in percent, and the ROC AUC
    computed from the first probability column with positive label 0.
    """
    # Both splits are standardised with statistics of the training split.
    center = train_xs.mean()
    scale = train_xs.std()
    train_z = (train_xs - center) / scale
    test_z = (test_xs - center) / scale

    n_rows = np.shape(train_z)[0]
    second_moment = train_z.T.dot(train_z) / (n_rows - 1)
    left_vectors, singular_values, _ = np.linalg.svd(second_moment)

    squared = np.square(singular_values + 1e-5)
    # NOTE(review): .round(3) binds to the sum only, so the denominator is
    # rounded and the ratio itself is not. Confirm that this is intended.
    goodness_of_fit = squared / np.sum(squared).round(3)

    basis = left_vectors[:, :reduced_dim]
    pc_score_train = np.matmul(np.asarray(train_z), basis)
    pc_score_test = np.matmul(np.asarray(test_z), basis)

    classifier = LDA()
    classifier.fit(pc_score_train, train_ys)
    predicted = classifier.predict(pc_score_test)
    class0_proba = classifier.predict_proba(pc_score_test)[:, 0]

    fpr, tpr, _ = metrics.roc_curve(test_ys, class0_proba, pos_label=0)
    auroc = metrics.auc(fpr, tpr)
    accuracy = np.mean(np.equal(predicted, test_ys)) * 100
    return goodness_of_fit, accuracy, auroc
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        """Fit the wrapped LDA classifier."""
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Return class probabilities wrapped by ``bundle_predictions``."""
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)
        return predictions_df

    def find_best_params(self):
        """Grid-search the LDA solver on the training CSV and print the winner."""
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        estimator = LinearDiscriminantAnalysis()
        clf = grid_search.GridSearchCV(estimator, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        # Single-argument call form: valid on Python 2 and Python 3 alike.
        print(clf.best_params_)
class LinearDiscriminantAnalysisImpl():
    """Delegating wrapper that builds ``SKLModel`` from stored hyperparameters."""

    def __init__(self, solver='svd', shrinkage=None, priors=None,
                 n_components=None, store_covariance=False, tol=0.0001):
        # Keep the constructor arguments so they can be inspected later.
        self._hyperparams = dict(
            solver=solver,
            shrinkage=shrinkage,
            priors=priors,
            n_components=n_components,
            store_covariance=store_covariance,
            tol=tol,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (without ``y`` when it is None); return self."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        return self._wrapped_model.transform(X)

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)
def run_linear_discriminant_analysis(train, test, ss_split, labels):
    """Fit an LDA, score it on the held-out split, and predict ``test``.

    Returns ``(test_predictions, acc, ll)``: class probabilities for
    ``test``, plus accuracy and log loss measured on the held-out split
    produced by ``hpr.prepData``.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(
        train, test, ss_split, labels)

    clf = LinearDiscriminantAnalysis()
    clf.fit(X_train, y_train)
    print('ML Model: Linear Discriminant Analysis')

    # Hard labels feed the accuracy, probabilities feed the log loss.
    holdout_labels = clf.predict(X_test)
    acc = accuracy_score(y_test, holdout_labels)
    holdout_proba = clf.predict_proba(X_test)
    ll = log_loss(y_test, holdout_proba)

    return clf.predict_proba(test), acc, ll
def lda(np_train_x, np_train_y, np_test_x, np_test_y, verified_num,
        rejected_num, p):
    """Fit an LDA and print error rates for each probability cut-off in ``p``.

    For every cut-off the second-class probability is thresholded into 0/1
    predictions and three figures are printed: the misclassification rate
    over the test set, false negatives divided by ``verified_num``, and
    false positives divided by ``rejected_num``. Nothing is returned.
    """
    model_LDA = LinearDiscriminantAnalysis()
    model_LDA.fit(np_train_x, np_train_y)

    # The probabilities do not depend on the cut-off; compute them once.
    positive_proba = model_LDA.predict_proba(np_test_x)[:, 1]
    truth = np.asarray(np_test_y)
    # The rate is taken over the actual test size, not a fixed 25000.
    n_test = len(np_test_x)

    for prob in p:
        predicted_values_LDA = np.where(positive_proba > prob, 1, 0)
        total_miss_classified_LDA = np.sum(
            np.abs(truth - predicted_values_LDA))
        reject_wrong_LDA = np.count_nonzero(
            (truth == 1) & (predicted_values_LDA == 0))
        verify_wrong_LDA = np.count_nonzero(
            (truth == 0) & (predicted_values_LDA == 1))
        print("\n----------------------Linear Discriminant Analysis prob:",
              prob, "--------------------")
        print("miss-classification rate :",
              total_miss_classified_LDA / n_test,
              "\nFalse negative rate (type1 error) :",
              reject_wrong_LDA / verified_num,
              "\nFalse positive rate (type2 error) :",
              verify_wrong_LDA / rejected_num)
def _get_5_fold_roc_input(self, target_df, field_flag):
    """Collect out-of-fold LDA probabilities for a one-vs-rest ROC.

    Features are read from the columns named by ``self.get_pc_str`` and
    labels from ``field_flag``. Returns ``(estimate_prob, binary_labels)``
    where the positive class is the largest label value.
    """
    n_sample = target_df.shape[0]

    # Assemble the feature matrix one column at a time.
    data_X = np.zeros((n_sample, self._n_feature), dtype=float)
    for feature_idx in range(self._n_feature):
        column_name = self.get_pc_str(feature_idx)
        data_X[:, feature_idx] = target_df[column_name].tolist()
    data_Y = np.array(target_df[field_flag].tolist())

    # convert multi-class to single. only consider the class with largest
    # label num.
    num_classes = np.max(data_Y) + 1
    positive_col = int(num_classes) - 1

    estimate_prob = np.zeros((n_sample, ), dtype=float)
    logger.info('Run 5 fold LDA')
    for train_idx, test_idx in KFold(n_splits=5).split(data_X):
        data_Y_test = data_Y[test_idx]
        lda_obj = LinearDiscriminantAnalysis(n_components=1)
        lda_obj.fit(data_X[train_idx], data_Y[train_idx])
        fold_proba = lda_obj.predict_proba(data_X[test_idx])
        estimate_prob[test_idx] = fold_proba[:, positive_col]
        logger.info(
            f'Num of positive sample in test group {np.sum(data_Y_test)}')

    return estimate_prob, (data_Y == num_classes - 1).astype(int)
def linear_classification(X: np.ndarray, Y: np.ndarray, X_test: np.ndarray,
                          Y_test: np.ndarray, n_folds: int, n_comps_max: int,
                          threshold: float, show_plots: bool, fignum: int,
                          figsize: Tuple[int, int], normalize: bool):
    """PCA followed by LDA, with the PCA size chosen by cross-validation.

    The number of components is the first count whose mean cumulative
    explained variance over the folds reaches ``threshold``. The fitted
    models' predictions are handed to ``analyze_results``; nothing is
    returned.
    """
    # Cumulative explained variance per fold.
    splitter = KFold(n_splits=n_folds)
    cum_var_ratios = np.zeros((n_folds, n_comps_max))
    for fold, (train_inds, _) in enumerate(splitter.split(X)):
        fold_pca = decomposition.PCA(n_components=n_comps_max)
        fold_pca.fit(X[train_inds, :])
        cum_var_ratios[fold, :] = np.cumsum(
            fold_pca.explained_variance_ratio_)

    # Leading zero column so that index k means "k components".
    cum_var_ratios = np.pad(cum_var_ratios, ((0, 0), (1, 0)), 'constant')
    cum_var_means = np.mean(cum_var_ratios, axis=0)
    cum_var_stds = np.std(cum_var_ratios, axis=0)

    # Plot CV explained variance
    if show_plots:
        plt.figure(num=fignum, figsize=figsize)
        plt.errorbar(np.arange(0, n_comps_max + 1), cum_var_means,
                     yerr=cum_var_stds, ecolor='r')
        plt.title('Explained Variance ({:d}-fold CV)'.format(n_folds))
        plt.xlabel('PCs')
        plt.ylabel('Cumulative explained variance')
        plt.xlim(0, n_comps_max)
        plt.ylim(0, 1)
        plt.show()

    # Find number of components based on CV
    reaching = np.where(cum_var_means >= threshold)[0]
    n_comps = reaching[0]
    print('In linear analysis: ')
    print('# of PCs need to explain {:.0f}% variance in x: {}\n'.format(
        threshold * 100, n_comps))

    # PCA model
    pca_model = decomposition.PCA(n_components=n_comps)
    train_scores = pca_model.fit_transform(X)
    if show_plots:
        pca_inspection(X, Y, n_comps)
    test_scores = pca_model.transform(X_test)

    # LDA model, then predictions for both splits
    lda_model = LinearDiscriminantAnalysis()
    lda_model.fit(train_scores, Y)
    Yhat_train = lda_model.predict(train_scores)
    probs_train = lda_model.predict_proba(train_scores)
    Yhat_test = lda_model.predict(test_scores)
    probs_test = lda_model.predict_proba(test_scores)

    # Result analysis
    analyze_results(Y, Yhat_train, probs_train, Y_test, Yhat_test,
                    probs_test, normalize=normalize)
def LDA(self):
    """Pick a PCA size by gini score and return the matching LDA model.

    Reads ``data//train.csv``, splits it, standardises with statistics of
    the training part, and for each candidate PCA size fits PCA + LDA and
    scores the second-class probabilities with ``Evaluation.gini_score``.

    Returns ``(best_model, gini)`` and stores the score in ``self.gini``.

    NOTE(review): the PCA size is selected on the same held-out split that
    is reported as the final score, so that score is optimistic.
    """
    # load data
    df = pd.read_csv('data//train.csv')
    Y = df["target"]
    X = df.drop(['target'], axis=1)
    X_trainval, X_test, Y_trainval, Y_test = train_test_split(
        X, Y, random_state=0)

    # Standarize data
    scaler = StandardScaler().fit(X_trainval)
    X_trainval_transformed = scaler.transform(X_trainval)
    X_test_transformed = scaler.transform(X_test)

    Eva = Evaluation.Evaluation()
    # Start below any real score: with a starting value of 0 a run whose
    # scores were all <= 0 never selected a candidate and crashed on an
    # unbound variable.
    best_score = float('-inf')
    best_model = None
    for C in [10, 20, 30, 40, 50]:
        Data_pca = PCA(n_components=C).fit(X_trainval_transformed)
        X_train_pca = Data_pca.transform(X_trainval_transformed)
        X_test_pca = Data_pca.transform(X_test_transformed)
        lda_model = LinearDiscriminantAnalysis().fit(X_train_pca, Y_trainval)
        prob = lda_model.predict_proba(X_test_pca)[:, 1]
        giniscore = Eva.gini_score(Y_test, prob)
        print("When n_components=", C, ":\nMean score is", giniscore)
        if giniscore > best_score:
            # Keep the winning model instead of refitting it afterwards.
            best_score = giniscore
            best_model = lda_model

    self.gini = best_score
    return best_model, self.gini
def LDA(data, target, train_index):
    """Return the hold-out log loss of a shrinkage LDA.

    The first ``train_index`` rows of ``data`` are split 75/25 together
    with ``target``; the model is fitted on the larger part and its class
    probabilities on the smaller part are scored with ``log_loss``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data.iloc[:train_index, :], target, test_size=0.25)
    # Shrinkage is only implemented for the 'lsqr' and 'eigen' solvers; with
    # the default 'svd' solver fit() raises NotImplementedError.
    clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    # Passing the fitted classes keeps the probability columns aligned even
    # when the hold-out part happens to miss a class.
    ll = log_loss(y_test, y_pred, labels=clf.classes_)
    return ll
def predict_LDA(self, x, y, x_test, y_test):
    """Fit an LDA on ``(x, y)`` and predict ``x_test``.

    Prints the mean agreement between ``y_test`` and the predictions and
    returns ``(predictions, probabilities)``.
    """
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(x, y)

    # Appending to an empty array mirrors how the predictions were stored.
    LDA_predict = np.append(np.array([]), classifier.predict(x_test))
    p = classifier.predict_proba(x_test)

    agreement = np.mean(y_test == LDA_predict)
    print("Lda done", agreement)
    return LDA_predict, p
def lda(df, headers, title):
    """Fit LDA and QDA on the first 80% of ``df`` and evaluate on the rest.

    Both models are trained on ``df[headers]`` against the ``'cho2_b'``
    column, evaluated through ``utils.evaluate`` / ``utils.plot_eval``, and,
    when exactly two features are given, drawn as probability surfaces in
    a two-panel figure. Returns ``(lda, qda)``.

    NOTE(review): training uses 'cho2_b' while evaluation and the "false"
    scatter use 'cho_b'. Left unchanged; confirm which column is intended.
    """
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)

    split_at = int(len(df) * 0.8)
    df_train = df[:split_at].reset_index(drop=True).fillna(0)
    df_test = df[split_at:].reset_index(drop=True).fillna(0)

    lda.fit(df_train[headers], df_train['cho2_b'])
    qda.fit(df_train[headers], df_train['cho2_b'])

    y = df_test['cho_b']
    for name, estimator in (('LDA', lda), ('QDA', qda)):
        y_pred = estimator.predict(df_test[headers])
        utils.evaluate(y, y_pred, 0, name + ' ' + title)
        utils.plot_eval(df_test, y, y_pred, title=name + ' ' + title)

    # plot areas
    if len(headers) == 2:
        cho_true = df_test[df_test['cho2_b'] == True]
        cho_false = df_test[df_test['cho_b'] == False]
        plt.figure(figsize=(12, 8))

        def _panel(position, name, estimator, false_size, true_size,
                   true_marker, y_shift):
            """Draw one scatter plus probability surface into a subplot."""
            plt.subplot(2, 1, position)
            # Per-axes title: suptitle is figure-wide, so the second panel
            # used to overwrite the first panel's heading.
            plt.title(name)
            plt.scatter(cho_false[headers[0]], cho_false[headers[1]],
                        label='CHO false', s=false_size, marker='o')
            plt.scatter(cho_true[headers[0]], cho_true[headers[1]],
                        label='CHO true', s=true_size, marker=true_marker)
            nx, ny = 200, 100
            x_min, x_max = plt.xlim()
            y_min, y_max = plt.ylim()
            xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                                 np.linspace(y_min, y_max, ny))
            Z = estimator.predict_proba(
                np.c_[xx.ravel(), yy.ravel() + y_shift])
            Z = Z[:, 1].reshape(xx.shape)
            plt.pcolormesh(xx, yy, Z, cmap='RdBu',
                           norm=colors.Normalize(0., 1.), zorder=0)
            plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
            plt.legend()

        # The tiny y offset on the LDA grid is carried over unchanged.
        _panel(1, 'LDA', lda, 8, 15, 'o', 1 / 1000000000000)
        _panel(2, 'QDA', qda, 3, 5, 'x', 0)

    return lda, qda
def fiteando(ResultadosX_train, ResultadosX_test, y_train, y_test, true):
    """F1 along the test precision/recall curve of an LDA on 10 features.

    The model is fitted on the first ten columns of ``ResultadosX_train``
    and scored on the same columns of ``ResultadosX_test`` with ``true`` as
    the positive label.

    Returns ``(F1_test, precision, recall, thresholds)``; ``F1_test`` has
    its first entry dropped.
    """
    import numpy as np

    clf = LinearDiscriminantAnalysis()
    clf.fit(ResultadosX_train[:, 0:10], y_train)

    # Score with the probability of the positive class. Column 1 was used
    # unconditionally, which inverts the curve when the positive label is 0.
    # Assumes labels are 0/1.
    proba_col = 0 if true == 0 else 1
    probs_test = clf.predict_proba(ResultadosX_test[:, 0:10])[:, proba_col]

    precision, recall, thresholds = precision_recall_curve(
        y_test, probs_test, pos_label=true)
    denom = precision + recall
    # Define F1 as 0 where precision and recall are both 0 (was 0/0 = NaN).
    F1_test = np.divide(2 * (precision * recall), denom,
                        out=np.zeros_like(denom, dtype=float),
                        where=denom > 0)
    F1_test = F1_test[1:]
    return F1_test, precision, recall, thresholds
def prob_pre_recall(x_fit, x, y_fit, y):
    """Precision, recall, thresholds and F1 of an LDA on the first 10 features.

    The model is fitted on ``x_fit[:, 0:10]`` / ``y_fit`` and the curve is
    computed on ``x[:, 0:10]`` / ``y`` from the second probability column,
    with 1 as the positive label.

    Returns ``(precision, recall, threshold, f1)``.
    """
    import numpy as np

    lda = LinearDiscriminantAnalysis()
    lda.fit(x_fit[:, 0:10], y_fit)
    proba = lda.predict_proba(x[:, 0:10])[:, 1]
    precision, recall, threshold = skm.precision_recall_curve(
        y, proba, pos_label=1)
    denom = precision + recall
    # Define F1 as 0 where precision and recall are both 0 (was 0/0 = NaN).
    f1 = np.divide(2 * precision * recall, denom,
                   out=np.zeros_like(denom, dtype=float),
                   where=denom > 0)
    return precision, recall, threshold, f1
def find_putative(self):
    """Keep putative rows whose first-class LDA probability exceeds a cut-off.

    An LDA is trained on the ``IBD1``/``IBD2`` values against ``DEGREE``;
    each row of ``self.putative`` gets a ``SECOND_PROB`` column holding the
    probability of the classifier's first class, and rows at or below the
    cut-off are dropped.
    """
    train_val, train_lab = self.get_training(
        self.training, "DEGREE", ["IBD1", "IBD2"])
    self.check_error(train_lab, ["2nd", "3rd"], "degree")
    classif = LinearDiscriminantAnalysis().fit(train_val, train_lab)

    def _first_class_probability(row):
        # One prediction per row, using that row's two IBD values.
        return classif.predict_proba([[row.IBD1, row.IBD2]])[0][0]

    self.putative["SECOND_PROB"] = self.putative.apply(
        _first_class_probability, axis=1)

    # NOTE(review): `threshold` is not defined in this method; presumably a
    # module-level name. Verify before relying on it.
    keep = self.putative["SECOND_PROB"] > threshold
    self.putative = self.putative[keep]
def classify_second(self, train_df, put_df):
    """Add one probability column per relationship type to ``put_df``.

    An LDA is trained on the ``HSR``/``N`` values of the ``"2nd"`` degree
    rows of ``train_df`` against ``REL`` and applied to ``put_df``. For
    every name in ``self.second`` a column of that name receives the
    predicted probability of that class. Returns ``put_df``.

    Raises ValueError when a name in ``self.second`` is not one of the
    classes the model was trained on.
    """
    train_val, train_lab = self.get_training(
        train_df[train_df["DEGREE"] == "2nd"], "REL", ["HSR", "N"])
    self.check_error(train_lab, ["AV", "MHS", "PHS", "GP"], "2nd degree")
    classif = LinearDiscriminantAnalysis().fit(train_val, train_lab)
    probs = classif.predict_proba(put_df[["HSR", "N"]].values.tolist())

    # predict_proba columns follow classif.classes_ (sorted labels), not the
    # order of self.second, so look each column up by label.
    column_of = {
        label: col for col, label in enumerate(classif.classes_.tolist())
    }
    for rel in self.second:
        if rel not in column_of:
            raise ValueError(
                "relationship %r is not among the trained classes %r"
                % (rel, sorted(column_of)))
        put_df[rel] = probs[:, column_of[rel]]
    return put_df
def train_eval_LDA(args, config, train_xs, train_ys, test_xs, test_ys):
    """Fit an LDA and return its test accuracy (percent) and ROC AUC.

    ``args`` and ``config`` are accepted but not used. The AUC is computed
    from the first probability column with positive label 0.
    """
    classifier = LDA()
    classifier.fit(train_xs, train_ys)

    predicted = classifier.predict(test_xs)
    class0_proba = classifier.predict_proba(test_xs)[:, 0]

    fpr, tpr, _ = metrics.roc_curve(test_ys, class0_proba, pos_label=0)
    auroc = metrics.auc(fpr, tpr)
    accuracy = np.mean(np.equal(predicted, test_ys)) * 100
    return accuracy, auroc
def LDA_top_k(trn, trn_label, tst, tst_label, num_label, group, top_k):
    """Train/test accuracy of a top-k vote over unified labels.

    For each sample the ``top_k`` highest-probability class columns are
    mapped through ``unify_label`` and the most frequent unified label is
    taken as the prediction. ``num_label`` is accepted but not used.
    Returns ``(trn_acc_unified, tst_acc_unified)``.
    """
    labels_unified = range(len(group))
    clf = LinearDiscriminantAnalysis()
    clf.fit(trn, trn_label)

    def _unified_accuracy(features, true_label):
        predict_probs = clf.predict_proba(features)
        # argsort is ascending, so the last top_k columns are the best ones.
        best_k = np.argsort(predict_probs, axis=1)[:, -top_k:]
        best_k_unified = np.array(
            [unify_label(row, group) for row in best_k]).tolist()
        votes = [[row.count(label) for label in labels_unified]
                 for row in best_k_unified]
        predict_unified = np.array([np.argmax(v) for v in votes])
        hits = np.sum(predict_unified == unify_label(true_label, group))
        return hits / (1.0 * len(predict_unified))

    trn_acc_unified = _unified_accuracy(trn, trn_label)
    tst_acc_unified = _unified_accuracy(tst, tst_label)
    return trn_acc_unified, tst_acc_unified
def do_LDA(model, X_train, Y_train, X_test, Y_test):
    """Reduce the data with LDA, fit ``model`` on it, and return a ROC curve.

    The LDA projection is learned on the training data and applied to both
    splits; ``model`` is then fitted on the projected training data. The
    test score and the binarised labels are printed. Returns ``(fpr, tpr)``
    of the ROC computed over all binarised labels and their scores.
    """
    reducer = LinearDiscriminantAnalysis()
    X_train = reducer.fit_transform(X_train, Y_train)
    X_test = reducer.transform(X_test)

    clf = model.fit(X_train, Y_train)
    scores = clf.predict_proba(X_test)
    print("LDA")
    print(clf.score(X_test, Y_test))

    # Binarise against the model's own class order. A set built from Y_test
    # has no defined order and may miss classes, so its columns need not
    # line up with the predict_proba columns.
    trueLabelsBin = label_binarize(Y_test, classes=clf.classes_)
    print(trueLabelsBin.ravel())
    if scores.shape[1] == 2:
        # Two classes binarise to one column; keep the matching score column.
        scores = scores[:, 1]
    fpr, tpr, _ = roc_curve(trueLabelsBin.ravel(), scores.ravel())
    return fpr, tpr
def train(self):
    """Train a shrinkage LDA, record its metrics, and export the model.

    Fits on ``self.x_train`` / ``self.y_train``, evaluates on the test
    split, saves the ROC plot to ``./static/images/roc_lda.png``, stores the
    scores through ``ModelDetail`` and pickles the model together with the
    test column names. Exceptions propagate to the caller unchanged.
    """
    model_score_dict = dict()
    model_start_time = datetime.datetime.now()
    lda = LinearDiscriminantAnalysis(shrinkage="auto",
                                     solver="lsqr",  # eigen, svd(default)
                                     )
    lda.fit(self.x_train, self.y_train)
    y_pred = lda.predict(self.x_test)
    acc_lda = accuracy_score(self.y_test, y_pred)
    print("Linear Discriminant Analysis Accuracy Score is : ", acc_lda)
    model_end_time = datetime.datetime.now()
    model_running_performance = model_end_time - model_start_time

    # Confusion Matrix
    conf_mat = confusion_matrix(self.y_test, y_pred)

    # ROC Curve
    pred_proba_lda = lda.predict_proba(self.x_test)[::, 1]
    fpr, tpr, _ = metrics.roc_curve(self.y_test, pred_proba_lda)
    auc_lda = metrics.roc_auc_score(self.y_test, pred_proba_lda)
    fig = plt.figure()
    lw = 3
    plt.plot(fpr, tpr,
             label="Linear Discriminant Analysis, auc_lda = " + str(auc_lda))
    plt.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='dashed')
    plt.title('Linear Discriminant Analysis ROC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc=4)
    plt.savefig('./static/images/roc_lda.png')
    # Release the figure; otherwise every call leaves one open.
    plt.close(fig)

    # Assign all score values to dict
    # total_seconds() covers the whole duration; .seconds drops the
    # sub-second and day parts.
    model_score_dict["model_running_performance"] = (
        model_running_performance.total_seconds() / 60)
    model_score_dict["accuracy"] = acc_lda
    model_score_dict["conf_mat"] = conf_mat.tolist()
    model_score_dict["fpr"] = fpr.tolist()
    model_score_dict["tpr"] = tpr.tolist()
    model_score_dict["auc"] = auc_lda

    md = ModelDetail(**{'AlgorithmName': 'Linear Discriminant Analysis',
                        'ModelScoreDict': str(model_score_dict)})
    md.save()

    # Export model
    with open('./HRAnalysis/analysemodels/models/LDA.pkl',
              'wb') as model_file:
        pickle.dump({"columns": self.x_test.columns.tolist(),
                     "model": lda}, model_file)
class LDA(Model):
    """Model wrapper around an svd-solver LinearDiscriminantAnalysis."""

    def __init__(self):
        super().__init__(input_type=NumericalDataTypesEnum.table,
                         output_type=NumericalDataTypesEnum.vector)
        self.__model = LinearDiscriminantAnalysis(solver="svd")

    def predict(self, data: InputData):
        """Return the second-class probability for ``data.features``."""
        class_probabilities = self.__model.predict_proba(data.features)
        return class_probabilities[:, 1]

    def fit(self, data: InputData):
        """Fit on the training part returned by ``train_test_data_setup``."""
        train_part, _ = train_test_data_setup(data=data)
        self.__model.fit(train_part.features, train_part.target)

    def tune(self, data):
        """Placeholder: always returns 1."""
        return 1
def LDA(line_list, temp):
    """
    Classify a group of lines with a pre-trained, temperature-specific LDA.

    :param line_list: list of SAM object; each must support ``len()`` and
        expose ``xs_tag`` and ``gc_content``
    :param temp: temperature; must be one of the values in ``temp_list``,
        otherwise a message is printed and the process exits
    :return: False if any line has a falsy ``xs_tag``; otherwise True only
        when every line's second-class probability is below 0.5
    """
    temp_list = [32, 37, 42, 47, 52, 57]
    coef_list = [[[-0.14494789, 0.18791679, 0.02588474]],
                 [[-0.13364364, 0.22510179, 0.05494031]],
                 [[-0.09006122, 0.25660706, 0.1078303]],
                 [[-0.01593182, 0.24498485, 0.15753649]],
                 [[0.01860365, 0.1750174, 0.17003374]],
                 [[0.03236755, 0.11624593, 0.24306498]]]
    inter_list = [-1.17545204, -5.40436344, -12.45549846,
                  -19.32670233, -20.11992898, -23.98652919]
    class_list = [-1, 1]

    try:
        classfier_index = temp_list.index(temp)
    except ValueError:
        print("The given temperature was not in temp_list:", temp_list)
        sys.exit()

    # Build the feature rows first: a single line without an XS tag decides
    # the result, so no classifier is needed in that case.
    test_list = []
    for sub_line in line_list:
        if not sub_line.xs_tag:
            return False
        # Built-in float: the np.float alias no longer exists in NumPy.
        test_list.append([float(len(sub_line)),
                          sub_line.xs_tag,
                          sub_line.gc_content])

    # Inject the stored coefficients instead of fitting.
    lda_classifer = LinearDiscriminantAnalysis()
    lda_classifer.coef_ = np.asarray(coef_list)[classfier_index]
    lda_classifer.intercept_ = np.asarray(inter_list)[classfier_index]
    lda_classifer.classes_ = np.asarray(class_list)

    lda_prob = lda_classifer.predict_proba(np.asarray(test_list))[:, 1]
    return bool(np.all(lda_prob < 0.5))
def sim():
    """Exploratory run: feature selection, LDA projection, and a scatter plot.

    Loads ``test1.data`` (first column is the target), overwrites feature
    columns 0, 1, 2 and 4 with random integers, runs a few feature-selection
    experiments whose results are not used further, fits a two-component
    LDA and saves a scatter of the projected training data to ``a.png``.
    """
    A = np.loadtxt('test1.data', delimiter=',')
    y = A[:, 0]
    # Remove targets from input data
    A = A[:, 1:]

    for i in [0, 1, 2, 4]:
        for j in range(len(A)):
            A[j][i] = random.randint(0, 100)

    # Feature-selection experiments; outputs are kept only for inspection.
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    sff = sel.fit_transform(A)
    clf = RandomForestClassifier()
    clf = clf.fit(A, y)
    hh = clf.feature_importances_
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(A)
    hh2 = X_new.shape

    lda = LinearDiscriminantAnalysis(n_components=2)
    hh3 = lda.fit(A, y)
    drA = lda.transform(A)

    # Classify in the original feature space and only then project.
    # predict/predict_proba expect the columns the model was fitted on, not
    # the transformed coordinates.
    # NOTE(review): assumes generate_data_set(1, 5) yields rows with the same
    # number of features as A. Confirm.
    Z = generate_data_set(1, 5)
    z_lab = lda.predict(Z)
    z_prob = lda.predict_proba(Z)
    Z = lda.transform(Z)

    plt.figure()
    x = [l[0] for l in drA]
    y = [l[1] for l in drA]
    # Class of every training point, predicted from its original features.
    cls = [int(c) for c in lda.predict(A)]
    plt.scatter(x, y, c=[[1, 0, 0]])
    plt.savefig('a.png')
def main():
    """Read Train/test log.

    Trains an LDA on ``train.csv`` (label column ``species``) and writes
    the class probabilities for ``test.csv`` to ``submission.csv``.
    """
    df = pd.read_csv("train.csv")

    # encode result label
    le = LabelEncoder().fit(df.species)
    labels = le.transform(df.species)
    classes = list(le.classes_)
    print(classes)

    # drop extra field
    df = df.drop(['species', 'id'], axis=1)

    # train/test split using stratified sampling; only the split from the
    # last iteration is used below.
    # NOTE(review): this call form matches the legacy splitter that takes
    # the labels in its constructor. Confirm the import in use.
    sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)
    for train_index, test_index in sss:
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_ids = test.id
    test = test.drop(['id'], axis=1)
    test_predictions = favorite_clf.predict_proba(test)
    print(test_predictions)

    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=classes)
    submission.insert(0, 'id', test_ids)

    # Export Submission
    submission.to_csv('submission.csv', index=False)
class LDAwithYHandling(BaseEstimator, TransformerMixin):
    """LDA that accepts a per-class score matrix as the target.

    ``fit`` reduces each target row to the index of its largest entry and
    trains a plain LDA on those indices; ``score`` is the mean Spearman
    correlation between each target row and the predicted probabilities.
    """

    def __init__(self):
        self.lda = LinearDiscriminantAnalysis()

    @staticmethod
    def maxIndexWithSampling(y):
        """Sample one column index per row, treating the row as weights.

        Returns a float array of shape ``(n_rows, 1)``. A uniform draw is
        compared with the running sum of the row; the first index whose
        running sum exceeds the draw is chosen.
        """
        chosenIndices = np.empty((y.shape[0], 1))
        for i in range(y.shape[0]):
            rand = random.random()
            hits = np.nonzero(rand < np.cumsum(y[i]))[0]
            # If the row sums to less than the draw, fall back to the last
            # index instead of leaving the slot uninitialised.
            chosenIndices[i] = hits[0] if hits.size else y[i].shape[0] - 1
        return chosenIndices

    @staticmethod
    def maxIndex(y):
        """Return the argmax of every row as a float ``(n_rows, 1)`` array."""
        return np.array([np.argmax(row) for row in y],
                        dtype=float).reshape(-1, 1)

    def fit(self, X, y, sample_weight=None):
        """Fit the LDA on the row-wise argmax of ``y``; return self."""
        chosenIndices = np.argmax(y, axis=1)
        self.lda.fit(X, chosenIndices)
        return self

    def score(self, X, y, sample_weight=None):
        """Mean Spearman correlation between ``y`` rows and predictions."""
        y_p = self.predict_proba(X)
        n_samples = X.shape[0]
        correl = 0
        for target_row, predicted_row in zip(y[:n_samples], y_p[:n_samples]):
            correla, _ = stats.spearmanr(target_row, predicted_row)
            correl = correl + correla
        return correl / n_samples

    def predict_proba(self, X):
        """Return the wrapped LDA's class probabilities."""
        return self.lda.predict_proba(X)
def fit(self, X, y, method='self-training', treshold=0.7):
    """Fit an LDA on partially labelled data (``-1`` marks unlabelled rows).

    method:
        'supervised'        train on the labelled rows only;
        'self-training'     repeatedly label unlabelled rows whose predicted
                            probability exceeds ``treshold`` and refit, for
                            at most ``self.max_iter`` rounds;
        'label-propagation' label the unlabelled rows with LabelPropagation,
                            then fit on everything that received a label.
        When there are no unlabelled rows the method is treated as
        'supervised' regardless of the value given.
    treshold: probability a class must exceed to be accepted as a label.

    Sets ``self.classifier``, ``self.means_``, ``self.covariance_`` (and
    ``self.propagated_labels`` for label propagation).

    Raises ValueError for an unknown ``method``.
    """

    def getLabel(p):
        # Index of the first class above the threshold, or -1 for none.
        above = np.where(p > treshold)[0]
        return above[0] if above.size else -1

    def labeled_mask(labels):
        mask = np.ones(len(y), dtype=bool)
        mask[np.where(labels == -1)[0]] = False
        return mask

    yp = copy(y)
    mask = labeled_mask(yp)

    # if there are no unlabeled data
    if len(np.where(yp == -1)[0]) == 0:
        method = 'supervised'

    if method not in ('supervised', 'self-training', 'label-propagation'):
        # Raising a plain string is itself a TypeError; raise a real
        # exception that carries the message.
        raise ValueError('No valid method was given!')

    lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True,
                                     n_components=10)

    if method == 'supervised':
        lda.fit(X[mask, :], yp[mask])  # train with all labeled data
    elif method == 'self-training':
        counter = 0
        while True:
            lda.fit(X[mask, :], yp[mask])
            if len(yp[~mask]) == 0 or counter == self.max_iter:
                break
            probs = lda.predict_proba(X[~mask])
            yp[~mask] = np.fromiter([getLabel(p) for p in probs],
                                    probs.dtype)
            counter += 1
            mask = labeled_mask(yp)
    else:  # 'label-propagation'
        label_prop_model = LabelPropagation(kernel='knn', n_neighbors=10,
                                            alpha=0.9)
        label_prop_model.fit(X, yp)
        probs = label_prop_model.predict_proba(X[~mask])
        yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
        self.propagated_labels = yp
        lda.fit(X[mask, :], yp[mask])

    self.classifier, self.means_, self.covariance_ = (
        lda, lda.means_, lda.covariance_)
def test_lda_predict():
    """LDA classifies the toy data and rejects invalid solver/shrinkage."""
    for solver, shrinkage in solver_shrinkage:
        msg = 'solver %s' % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        # Plain fit/predict on the toy data.
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, msg)

        # Same check on the single-feature data.
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, msg)

        # Probabilities must agree with the labels and with the log version.
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, msg)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
                        rtol=1e-6, atol=1e-6, err_msg=msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors.
        # LDA shouldn't be able to separate those.
        y_pred3 = clf.fit(X, y3).predict(X)
        assert np.any(y_pred3 != y3), msg

    # Invalid shrinkages first, then an unknown solver.
    rejected = [
        (dict(solver="lsqr", shrinkage=-0.2231), ValueError),
        (dict(solver="eigen", shrinkage="dummy"), ValueError),
        (dict(solver="svd", shrinkage="auto"), NotImplementedError),
        (dict(solver="dummy"), ValueError),
    ]
    for params, expected_error in rejected:
        clf = LinearDiscriminantAnalysis(**params)
        assert_raises(expected_error, clf.fit, X, y)
def processTraining(cvtrainx,cvtrainy,cvevalx,prob=False):
    """Vectorise text with TF-IDF, reduce it with truncated SVD and classify with LDA.

    The vectoriser and the SVD are fitted on ``cvtrainx`` only and then
    applied to ``cvevalx``.

    Parameters
    ----------
    cvtrainx : sequence
        Training documents, passed to ``TfidfVectorizer.fit_transform``
        (tokenised by ``mytokenlizer``).
    cvtrainy : sequence
        Training labels, one per training document.
    cvevalx : sequence
        Documents to predict.
    prob : bool
        If true return ``predict_proba`` output, otherwise ``predict`` output.

    Returns
    -------
    The classifier's predictions (or class probabilities) for ``cvevalx``.
    """
    # Debug output. The single-argument call form prints the same on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print(cvtrainx[0])
    #cvevalx=[' '.join(s) for s in cvevalx]
    print(cvevalx[0])

    tfv = TfidfVectorizer(min_df=10, max_features=None,
                          strip_accents='unicode', analyzer=mytokenlizer,
                          ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    cvtrainx = tfv.fit_transform(cvtrainx)
    cvevalx = tfv.transform(cvevalx)

    tsvd = TruncatedSVD(n_components=600, random_state=2016)
    cvtrainx = tsvd.fit_transform(cvtrainx)
    cvevalx = tsvd.transform(cvevalx)

    # Fetch the vocabulary once instead of rebuilding the list for each print.
    feature_names = tfv.get_feature_names()
    print(len(feature_names))
    print(feature_names[0:10])

    clf = LinearDiscriminantAnalysis()
    clf.fit(cvtrainx, cvtrainy)
    if prob:
        predictValue = clf.predict_proba(cvevalx)
    else:
        predictValue = clf.predict(cvevalx)
    return predictValue
# Evaluate the previously built model, then an LDA model, through test_model.
# The single-argument call form of print behaves the same on Python 2 and 3;
# the bare print statement is Python 2 only.
print("Random Forest")
test_model(model)

model_lda = LinearDiscriminantAnalysis()
print("LDA")
test_model(model_lda)

# False: score each sample with decision_function; True: use predict_proba.
use_prediction = False

raw_test_data, test_labels = readDataMultipleFiles([3])
test_data_matrix, test_data_matrices, test_labels, test_labels_binary = buildMatricesAndLabels(raw_test_data, test_labels, scaling_functions)

# Pick the scoring function once; each sample is scored on its own as a
# one-row batch, keeping the first (only) row of the result.
if not use_prediction:
    score_sample = model_lda.decision_function  # score for classes_[1]
else:
    score_sample = model_lda.predict_proba
test_predictions = [score_sample([features])[0] for features in test_data_matrix]

# Sum of the binary label vector of each target.
for i in range(target_count):
    print(sum(test_labels_binary[i]))

thresholds_for_bci = multiclassRoc(test_predictions, test_labels_binary)

# model = SVC(C=1000, kernel="poly", degree=2)
# print "SVM"
# test_model(model)

# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0.pkl", "w")).dump(model_lda)
# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0_mm.pkl", "w")).dump(min_max)
# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0_thresh.pkl", "w")).dump(thresholds_for_bci)

# print model_lda.coef_
def TrialClassificationWithPhysiology(phys_filename, trial_types, plot_results = False):
    """Classify trials as regular/stress from mean IBI and pupil features.

    Loads per-trial means from the .mat file ``phys_filename``, arranges
    them in trial order using ``trial_types`` (0 = regular, 1 = stress),
    mean-centres each feature, fits an LDA classifier on the two features
    and, when ``plot_results`` is true, shows the class-probability maps.

    Parameters
    ----------
    phys_filename : str
        Path of a .mat file holding 'ibi_reg_mean', 'ibi_stress_mean',
        'pupil_reg_mean' and 'pupil_stress_mean'.
    trial_types : array
        One entry per trial; 0 marks a regular trial and 1 a stress trial.
    plot_results : bool
        Whether to draw and show the probability maps.

    Returns
    -------
    (ibi, pupil) : the two mean-centred feature columns, each of shape
    (number of trials, 1).
    """
    stress_inds = np.ravel(np.nonzero(trial_types==1))
    reg_inds = np.ravel(np.nonzero(trial_types==0))
    num_trials = len(trial_types)

    phys_features = dict()
    sp.io.loadmat(phys_filename, phys_features)

    def centred_column(reg_key, stress_key):
        # Scatter the regular/stress means back into trial order, then
        # subtract the NaN-ignoring mean of the column.
        reg_values = np.ravel(phys_features[reg_key])
        stress_values = np.ravel(phys_features[stress_key])
        column = np.zeros([num_trials, 1])
        column[reg_inds] = reg_values.reshape((len(reg_inds), 1))
        column[stress_inds] = stress_values.reshape((len(stress_inds), 1))
        return column - np.nanmean(column)

    ibi = centred_column('ibi_reg_mean', 'ibi_stress_mean')
    pupil = centred_column('pupil_reg_mean', 'pupil_stress_mean')

    # Trial classification with physiological data (shrinkage LDA; an SVC and
    # a logistic regression were tried here previously).
    X_phys = np.hstack((ibi, pupil))
    clf = LinearDiscriminantAnalysis(solver='eigen', shrinkage = 'auto')
    clf.fit(X_phys, trial_types)
    y_pred = clf.predict(X_phys)
    classif_rate = np.mean(y_pred.ravel()==trial_types.ravel())*100

    # 100 x 100 evaluation grid spanning the scaled data range.
    ibi_lo, ibi_hi = 0.8*np.min(ibi), 1.2*np.max(ibi)
    pupil_lo, pupil_hi = 0.8*np.min(pupil), 1.2*np.max(pupil)
    xx, yy = np.meshgrid(np.linspace(ibi_lo, ibi_hi, 100),
                         np.linspace(pupil_lo, pupil_hi, 100))
    probas = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])

    n_classes = np.unique(y_pred).size
    class_labels = ['Regular', 'Stress']
    cmap = plt.get_cmap('bwr')

    if not plot_results:
        return ibi, pupil

    plt.figure()
    for k in range(n_classes):
        plt.subplot(1, n_classes, k+1)
        plt.title(class_labels[k])
        imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)), vmin = 0.1, vmax = 0.9,
                                   extent = (ibi_lo, ibi_hi, pupil_lo, pupil_hi),
                                   origin = 'lower', aspect='auto', cmap = cmap)
        if k==0:
            plt.xlabel('IBI')
            plt.ylabel('Pupil')
        plt.xticks(())
        plt.yticks(())
        plt.axis('tight')
        # Overlay the trials predicted to belong to class k.
        predicted_k = (y_pred == k)
        if predicted_k.any():
            plt.scatter(X_phys[predicted_k, 0], X_phys[predicted_k, 1], marker = 'o', color = 'k')

    # Shared horizontal colour bar; the title below is set on this axes.
    ax = plt.axes([0.15, 0.04, 0.7, 0.05])
    plt.colorbar(imshow_handle, cax = ax, orientation = 'horizontal')
    plt.title('SVM Classification with Physiological Data: %f correct' % (classif_rate))
    plt.show()

    return ibi, pupil
# Monte-Carlo repeated stratified cross-validation of the LDA model.
# Each repetition reshuffles the folds (seeded from the repetition index),
# writes the out-of-fold class-1 probabilities into train_predictions and
# records the resulting log-loss.
mc_logloss = []
mc_train_pred = []
for i_mc in range(params['n_monte_carlo']):
    cv_n = params['cv_n']
    kf = StratifiedKFold(target.values, n_folds=cv_n, shuffle=True, random_state=i_mc ** 3)

    xgboost_rounds = []

    for cv_train_index, cv_test_index in kf:
        X_train, X_test = train[cv_train_index, :], train[cv_test_index, :]
        y_train, y_test = target.iloc[cv_train_index].values, target.iloc[cv_test_index].values

        lda.fit(X_train, y_train)

        # predict
        predicted_results = lda.predict_proba(X_test)[:, 1]
        train_predictions[cv_test_index] = predicted_results

    repetition_logloss = log_loss(target.values, train_predictions)
    print('logloss score ', repetition_logloss)
    mc_logloss.append(repetition_logloss)
    # Store a snapshot: train_predictions is overwritten in place by the next
    # repetition, so appending the object itself would leave every entry
    # aliasing the last repetition and make the mean below meaningless.
    mc_train_pred.append(np.copy(train_predictions))

# Average the out-of-fold predictions over all repetitions.
mc_train_pred = np.mean(np.array(mc_train_pred), axis=0)

mc_logloss_mean.append(np.mean(mc_logloss))
mc_logloss_sd.append(np.std(mc_logloss))
print('The Logloss range is: %.5f to %.5f' %
      (mc_logloss_mean[-1] - mc_logloss_sd[-1], mc_logloss_mean[-1] + mc_logloss_sd[-1]))
# NOTE(review): the stored message says "AUC" but the values are the log-loss
# mean -/+ one standard deviation; the text is left unchanged for consumers.
print_results.append('The AUC range is: %.5f to %.5f' %
                     (mc_logloss_mean[-1] - mc_logloss_sd[-1], mc_logloss_mean[-1] + mc_logloss_sd[-1]))
print('For ', mc_logloss)
# Fit LDA on the two-column `output`, report its predictions and the
# Matthews correlation coefficient against `labels` on the same data.
lda = LinearDiscriminantAnalysis()
lda.fit(output, labels)
print(lda.predict([[-0.8, -1]]))

y_pred = lda.predict(output)
print(labels)
print(y_pred)

mcc = matthews_corrcoef(labels, y_pred)
print("MCC="+str(mcc))

# Plotting LDA contour: evaluate the class-1 probability on a grid covering
# the data and draw its 0.5 level as the decision boundary.
nx, ny = 200, 100
first_column, second_column = output[:, 0], output[:, 1]
x_min, x_max = np.amin(first_column), np.amax(first_column)
y_min, y_max = np.amin(second_column), np.amax(second_column)

grid_x = np.linspace(x_min, x_max, nx)
grid_y = np.linspace(y_min, y_max, ny)
xx, yy = np.meshgrid(grid_x, grid_y)
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)
plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed')

# Plotting LDA means of the first two classes
for class_index in (0, 1):
    plt.plot(lda.means_[class_index][0], lda.means_[class_index][1],
             'o', color='black', markersize=10)

plt.title('LDA with MDS and Gaussian Mixture')

# Plot red and green data
# NOTE(review): row 26 falls in neither slice - confirm whether that is intended.
output_red = output[0:26]
output_green = output[27:52]
for points, colour in ((output_red, 'r'), (output_green, 'g')):
    plt.scatter(points[:, 0], points[:, 1], color=colour)

plt.show()
def _winnerTakeAllRGBA(yPred, cClasses):
    """Turn class probabilities (rows = mesh points) into RGBA colours.

    Each point gets the colour of its most probable class (colours of tied
    classes are summed, as the dot product with the 0/1 winner mask does).
    Alpha is the winning probability divided by the largest probability
    found anywhere in ``yPred``.
    """
    rowMax = yPred.max(axis=1)
    winners = (yPred == rowMax[:, np.newaxis]).astype('float')   # Winner takes all
    rgba = np.dot(winners, cClasses)
    rgba[:, 3] = rowMax / yPred.max()
    return rgba


def _plotProbabilitySurface(subplotCode, rgba, meshShape, Xrr, cValGood, nClasses, title):
    """Draw one probability surface with the data points on top of it."""
    plt.subplot(subplotCode)
    Zplot = rgba.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant function: spread the points randomly along y.
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0, c=cValGood, s=40, zorder=1)
    plt.title(title)
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))


def discriminatePlot(X, y, cVal, titleStr=''):
    """Frederic's Robust Wrapper for discriminant analysis function.

    Performs lda, qda and RF after error checking, generates plots and
    returns cross-validated performance, stderr and base line.

    Parameters
    ----------
    X : np array
        n rows x p parameters.
    y : np array
        Group labels, n rows.
    cVal : np array
        rgb color code for each data point - should be the same for each
        data point belonging to the same group.
    titleStr : str
        Title for plots.

    Returns
    -------
    ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
        Scores are mean fold accuracies in percent; the "SE" values are the
        standard deviation of the fold accuracies in percent. Seven -1
        values are returned when fewer than two classes have MINCOUNT
        samples.
    """
    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data: keep only classes with at
    # least MINCOUNT samples.
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class (RGB of its first sample plus alpha 1)
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Explicit floor division keeps the Python 2 integer result on Python 3.
    nDmax = int(np.fix(np.sqrt(nX // 5)))
    if nDmax < nD:
        print('Warning: Insufficient data for %d parameters. PCA projection to %d dimensions.' % (nD, nDmax))
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        # Test samples whose class survived the training filter. An all-False
        # mask must also skip the fold: scoring on zero samples is meaningless
        # (the previous size check only caught an empty test set).
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if not goodInd.any():
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        ldaScores[iskf] = ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        qdaScores[iskf] = qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        rfScores[iskf] = rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])

        iskf += 1

    if (iskf != cvFolds):
        # Some folds were skipped: keep only the scores actually computed.
        # (ndarray.reshape returns a new array and cannot shrink one, so the
        # former reshape calls raised instead of truncating.)
        cvFolds = iskf
        ldaScores = ldaScores[:cvFolds]
        qdaScores = qdaScores[:cvFolds]
        rfScores = rfScores[:cvFolds]

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print('Error in ldaPlot: labels do not match')

    # Print the coefficients of first 3 DFA
    print('LDA Weights:')
    print('DFA1: %s' % (ldaMod.coef_[0,:],))
    if nClasses > 2:
        print('DFA2: %s' % (ldaMod.coef_[1,:],))
    if nClasses > 3:
        print('DFA3: %s' % (ldaMod.coef_[2,:],))

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)

    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting: the first two discriminant axes span the
    # mesh, any further axis is held at its mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2

    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = np.squeeze(np.ones((nxm,1)))*XrrMean[ix]

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Plot the three surfaces of probability side by side
    meshShape = np.shape(x1)
    plt.figure(facecolor='white', figsize=(10,3))
    _plotProbabilitySurface(131, _winnerTakeAllRGBA(yPredLDA, cClasses), meshShape, Xrr, cValGood, nClasses,
                            '%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    _plotProbabilitySurface(132, _winnerTakeAllRGBA(yPredQDA, cClasses), meshShape, Xrr, cValGood, nClasses,
                            '%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    _plotProbabilitySurface(133, _winnerTakeAllRGBA(yPredRF, cClasses), meshShape, Xrr, cValGood, nClasses,
                            '%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.show()

    # Results
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0

    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print("%s LDA: %.2f (+/- %0.2f) %%" % (titleStr, ldaScore, ldaScoreSE))
    print("%s QDA: %.2f (+/- %0.2f) %%" % (titleStr, qdaScore, qdaScoreSE))
    print("%s RF: %.2f (+/- %0.2f) %%" % (titleStr, rfScore, rfScoreSE))
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
def test_lda_predict_proba(solver, n_classes):
    """Compare LDA class probabilities with the closed-form reference.

    Gaussian blobs sharing one covariance matrix are sampled, LDA is fitted
    on them, and ``predict_proba`` for a single sample is checked against
    the probabilities computed from the generating means and covariance.
    """
    def generate_dataset(n_samples, centers, covariances, random_state=None):
        """Generate a multivariate normal data given some centers and
        covariances"""
        rng = check_random_state(random_state)
        per_class = n_samples // len(centers)
        blobs = [rng.multivariate_normal(mean, cov, size=per_class)
                 for mean, cov in zip(centers, covariances)]
        labels = [[clazz] * per_class for clazz in range(len(centers))]
        return np.vstack(blobs), np.hstack(labels)

    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
    blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
    X, y = generate_dataset(
        n_samples=90000, centers=blob_centers, covariances=blob_stds,
        random_state=42
    )
    lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
                                     shrinkage=None).fit(X, y)
    # check that the empirical means and covariances are close enough to the
    # one used to generate the data
    assert_allclose(lda.means_, blob_centers, atol=1e-1)
    assert_allclose(lda.covariance_, blob_stds[0], atol=1)

    # implement the method to compute the probability given in The Elements
    # of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
    # or LDA?"); the last class serves as the reference class
    precision = linalg.inv(blob_stds[0])
    reference_center = blob_centers[-1]
    alpha_k = []
    alpha_k_0 = []
    for center in blob_centers[:-1]:
        coef = np.dot(precision, (center - reference_center)[:, np.newaxis])
        alpha_k.append(coef)
        alpha_k_0.append(
            np.dot(- 0.5 * (center + reference_center)[np.newaxis, :], coef))

    sample = np.array([[-22, 22]])

    def discriminant_func(sample, coef, intercept, clazz):
        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))

    # exponentiated discriminants of every non-reference class, computed once
    exp_scores = [discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                  for clazz in range(n_classes - 1)]
    normalizer = 1 + sum(exp_scores)

    prob = np.array([float(score / normalizer) for score in exp_scores])
    prob_ref = 1 - np.sum(prob)

    # check the consistency of the computed probability
    # all probabilities should sum to one
    prob_ref_2 = float(1 / normalizer)

    assert prob_ref == pytest.approx(prob_ref_2)
    # check that the probability of LDA are close to the theoretical
    # probabilities
    assert_allclose(lda.predict_proba(sample),
                    np.hstack([prob, prob_ref])[np.newaxis],
                    atol=1e-2)