Пример #1
0
def model(x_train, x_test, y_train, y_test):
    """Compare LDA and logistic regression by ROC AUC on train and test data.

    Both classifiers are fitted on ``x_train``/``y_train``. The ROC curve is
    built from column 1 of ``predict_proba`` with ``pos_label=1``.

    Returns:
        A 4-tuple ``(auc_train_lda, auc_test_lda, auc_train_logreg,
        auc_test_logreg)``. When the first column of ``x_train`` contains
        exactly one distinct value nothing is fitted and all four entries
        are ``np.nan``.
    """
    # Guard: the first feature is constant over the training rows.
    if len(set(x_train[:, 0])) == 1:
        return np.nan, np.nan, np.nan, np.nan

    def _roc_auc(estimator, features, labels):
        # Area under the ROC curve using the probability of class column 1.
        positive_scores = estimator.predict_proba(features)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(labels, positive_scores, pos_label=1)
        return metrics.auc(fpr, tpr)

    lda = LinearDiscriminantAnalysis()
    logreg = LogisticRegression(penalty='l2', solver='liblinear', C=1)
    lda.fit(x_train, y_train)
    logreg.fit(x_train, y_train)

    return (
        _roc_auc(lda, x_train, y_train),
        _roc_auc(lda, x_test, y_test),
        _roc_auc(logreg, x_train, y_train),
        _roc_auc(logreg, x_test, y_test),
    )
Пример #2
0
def fiteando(ResultadosX_train,ResultadosX_test,y_train,y_test,true):
    """Best achievable F1 of an LDA classifier as the feature count grows.

    For every ``i`` in ``range(3, 40)`` an LDA model is fitted on the first
    ``i`` columns of ``ResultadosX_train``. The precision/recall curve is
    computed for the train and the test split, and the largest F1 value along
    each curve is recorded.

    Args:
        ResultadosX_train: 2-D training feature array (sliced by column).
        ResultadosX_test: 2-D test feature array (sliced by column).
        y_train: training labels.
        y_test: test labels.
        true: label treated as the positive class (``pos_label``). When it
            equals 0, probability column 0 is used as the score; otherwise
            column 1.

    Returns:
        ``(F1_train, F1_test)``: two lists with one best-F1 value per
        feature count.
    """
    # Same column choice as before, computed once instead of four times.
    proba_col = 0 if true == 0 else 1

    def _best_f1(clf, features, labels):
        # Largest F1 along the precision/recall curve of `clf` on `features`.
        scores = clf.predict_proba(features)[:, proba_col]
        precision, recall, _ = precision_recall_curve(labels, scores, pos_label=true)
        denom = precision + recall
        # Where precision + recall == 0 the F1 formula is 0/0. Treat it as 0
        # instead of NaN: np.argmax would otherwise select the NaN entry and
        # report NaN as the "best" score.
        f1 = np.divide(2 * (precision * recall), denom,
                       out=np.zeros_like(denom, dtype=float),
                       where=denom > 0)
        return f1[np.argmax(f1)]

    F1_train = []
    F1_test = []
    for i in range(3, 40):
        clf = LinearDiscriminantAnalysis()
        clf.fit(ResultadosX_train[:, 0:i], y_train)
        F1_train.append(_best_f1(clf, ResultadosX_train[:, 0:i], y_train))
        F1_test.append(_best_f1(clf, ResultadosX_test[:, 0:i], y_test))
    return F1_train,F1_test
Пример #3
0
class myLDABinary(myModel):
    """Thin wrapper around ``LinearDiscriminantAnalysis``.

    pandas DataFrames are cast to ``float32`` before being handed to the
    underlying estimator; any other input is forwarded unchanged.
    """

    def make(self, make_params):
        """Create the underlying estimator from ``make_params`` and return self."""
        self.model = LinearDiscriminantAnalysis(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the estimator on ``xtrain``/``ytrain``.

        ``xtest`` and ``ytest`` are accepted but not used. ``fit_params`` is
        forwarded to the estimator's ``fit`` as keyword arguments.
        """
        # A None default avoids sharing one mutable dict between calls.
        fit_params = {} if fit_params is None else fit_params
        if isinstance(xtrain, pd.DataFrame):
            self.model.fit(xtrain.astype('float32'), ytrain.astype('float32'), **fit_params)
        else:
            self.model.fit(xtrain, ytrain, **fit_params)

    def predict(self, xs, threshold=0.5):
        """Return the estimator's class predictions; ``threshold`` is not used."""
        if isinstance(xs, pd.DataFrame):
            return self.model.predict(xs.astype('float32'))
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return class probabilities for ``xs``.

        NOTE(review): a DataFrame input yields only column 1 of the
        probability matrix, while any other input yields the full matrix.
        Kept as-is because callers may rely on either shape; confirm which
        one is intended.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict_proba(xs.astype('float32'))[:, 1]
        if len(xs.shape) == 1:
            # A single sample: promote it to a one-row 2-D array.
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
Пример #4
0
def linear_discriminant_analysis(x_train, y_train, x_test, y_test, n_components=2, compute_threshold=True):
    '''
        Fit a Linear Discriminant Analysis (LDA) classifier on x_train and
        label x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_test: accepted but not used by this function.
        n_components: forwarded to LinearDiscriminantAnalysis.
        compute_threshold: when exactly True, the decision cut-off comes from
            get_best_thresh(y_train, <train probabilities>) and is applied to
            column 1 of the test probabilities; otherwise the classifier's
            own predict() is used.

        Returns (test predictions, cross-validation metrics, fitted model).
    '''
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    lda_clf = LinearDiscriminantAnalysis(priors=None, n_components=n_components)
    # Cross-validation runs on the not-yet-fitted estimator, before the final fit.
    cv_metrics = cross_val_analysis(classifier=lda_clf, x=x_train, y=y_train, plot=False)

    lda_clf.fit(x_train, y_train)

    if compute_threshold is not True:
        return lda_clf.predict(x_test), cv_metrics, lda_clf

    test_proba = lda_clf.predict_proba(x_test)
    train_proba = lda_clf.predict_proba(x_train)
    cutoff = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= cutoff, defs.posCode, defs.negCode)

    return test_labels, cv_metrics, lda_clf
Пример #5
0
class Ensamble_LDA(object):
    """LDA classifier on template-projection features.

    Each of the 8 slices ``X[:, i, :]`` is projected (dot product) onto two
    norm-scaled templates loaded from ``../erp.npy`` and ``../non_erp.npy``.
    The 16 projections are stacked and fed to the LDA model.
    """

    def __init__(self):
        self.model = LDA()
        self.ERP = self._load_unit_norm('../erp.npy')
        self.non_ERP = self._load_unit_norm('../non_erp.npy')

    @staticmethod
    def _load_unit_norm(path):
        # Load an array and divide it in place by its norm.
        template = np.load(path)
        template /= np.linalg.norm(template)
        return template

    def _project(self, X):
        # Build the feature matrix: for every slice, ERP projection first,
        # then non-ERP projection.
        projections = []
        for channel in range(8):
            signal = X[:, channel, :]
            projections.extend((np.dot(signal, self.ERP),
                                np.dot(signal, self.non_ERP)))
        return np.dstack(projections)[0]

    def _positive_proba(self, X):
        # Probability in column 1 for one sample given as a flat/any-shaped array.
        single = X.reshape(1, 8, SAMPLING_RATE)
        return self.model.predict_proba(self._project(single))[0][1]

    def fit(self, X, y, *args, **kwargs):
        """Fit the LDA model on the projected features; extra args are ignored."""
        self.model.fit(self._project(X), y)

    def predict(self, X):
        """Return 1 when the column-1 probability exceeds 0.7, else 0."""
        return 1 if self._positive_proba(X) > 0.7 else 0

    def predict_proba(self, X):
        """Return the column-1 probability for a single sample."""
        return self._positive_proba(X)

    def __str__(self):
        return 'Ensamble_LDA'

    def __repr__(self):
        return 'Ensamble_LDA'
def test_lda_predict():
    # LDA classification smoke test: fit/predict must reproduce the labels of
    # the toy data for every (solver, shrinkage) combination.
    for solver, shrinkage in solver_shrinkage:
        msg = "solver %s" % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        assert_array_equal(clf.fit(X, y).predict(X), y, msg)

        # Single-feature data must work as well.
        assert_array_equal(clf.fit(X1, y).predict(X1), y, msg)

        # Probability estimates agree with the labels, and the log
        # probabilities are consistent with the probabilities.
        proba = clf.predict_proba(X1)
        assert_array_equal((proba[:, 1] > 0.5) + 1, y, msg)
        log_proba = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(log_proba), proba, 8, msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors.
        # LDA shouldn't be able to separate those.
        y_pred3 = clf.fit(X, y3).predict(X)
        assert_true(np.any(y_pred3 != y3), msg)

    # Invalid shrinkage values and an unknown solver must be rejected at fit time.
    rejected_configs = [
        (ValueError, dict(solver="lsqr", shrinkage=-0.2231)),
        (ValueError, dict(solver="eigen", shrinkage="dummy")),
        (NotImplementedError, dict(solver="svd", shrinkage="auto")),
        (ValueError, dict(solver="dummy")),
    ]
    for expected_error, params in rejected_configs:
        clf = LinearDiscriminantAnalysis(**params)
        assert_raises(expected_error, clf.fit, X, y)
def test_lda_predict():
    # LDA classification smoke test: fit/predict must reproduce the labels of
    # the toy data for every (solver, shrinkage) combination.
    for solver, shrinkage in solver_shrinkage:
        msg = "solver %s" % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        assert_array_equal(clf.fit(X, y).predict(X), y, msg)

        # Single-feature data must work as well.
        assert_array_equal(clf.fit(X1, y).predict(X1), y, msg)

        # Probability estimates agree with the labels, and the log
        # probabilities are consistent with the probabilities.
        proba = clf.predict_proba(X1)
        assert_array_equal((proba[:, 1] > 0.5) + 1, y, msg)
        log_proba = clf.predict_log_proba(X1)
        assert_allclose(np.exp(log_proba), proba,
                        rtol=1e-6, atol=1e-6, err_msg=msg)

        # Primarily test for commit 2f34950 -- "reuse" of priors.
        # LDA shouldn't be able to separate those.
        y_pred3 = clf.fit(X, y3).predict(X)
        assert np.any(y_pred3 != y3), "solver %s" % solver

    both_set_message = ("covariance_estimator and shrinkage "
                        "parameters are not None. "
                        "Only one of the two can be set.")
    # (constructor kwargs, expected exception, expected message or None)
    rejected_configs = [
        # shrinkage is not implemented for the svd solver
        (dict(solver="svd", shrinkage="auto"), NotImplementedError, None),
        # shrinkage and covariance_estimator are mutually exclusive
        (dict(solver="lsqr", shrinkage=0.1,
              covariance_estimator=ShrunkCovariance()),
         ValueError, both_set_message),
        # bad solver with covariance_estimator
        (dict(solver="svd", covariance_estimator=LedoitWolf()),
         ValueError, "covariance estimator is not supported with svd"),
        # bad covariance estimator
        (dict(solver="lsqr",
              covariance_estimator=KMeans(n_clusters=2, n_init="auto")),
         ValueError, None),
    ]
    for params, expected_error, expected_message in rejected_configs:
        clf = LinearDiscriminantAnalysis(**params)
        with pytest.raises(expected_error, match=expected_message):
            clf.fit(X, y)
def train_eval_pca_LDA(args, config, train_xs, train_ys, test_xs, test_ys,
                       reduced_dim):
    """Project onto the leading principal directions, then train and score LDA.

    Both splits are standardised with the mean/std of ``train_xs``. The SVD of
    the resulting covariance-style matrix supplies the projection basis, and
    the first ``reduced_dim`` directions are kept. ``args`` and ``config`` are
    accepted but not used here.

    Returns:
        ``(goodness_of_fit, accuracy, auroc)``: per-component variance ratios,
        test accuracy in percent, and the ROC AUC computed from probability
        column 0 with ``pos_label=0``.
    """
    # Standardise with statistics taken from the training split only.
    center, scale = train_xs.mean(), train_xs.std()
    z_train = (train_xs - center) / scale
    z_test = (test_xs - center) / scale

    n_rows = np.shape(z_train)[0]
    cov = z_train.T.dot(z_train) / (n_rows - 1)
    basis, singular_values, _ = np.linalg.svd(cov)

    eigen_values = np.square(singular_values + 1e-5)
    # NOTE(review): `.round(3)` binds to the sum (the denominator), not to the
    # ratios — confirm whether rounding the ratios was intended.
    goodness_of_fit = eigen_values / np.sum(eigen_values).round(3)

    kept_basis = basis[:, :reduced_dim]
    pc_score_train = np.asarray(z_train) @ kept_basis
    pc_score_test = np.asarray(z_test) @ kept_basis

    lda = LDA()
    lda.fit(pc_score_train, train_ys)
    predicted = lda.predict(pc_score_test)
    class0_proba = lda.predict_proba(pc_score_test)[:, 0]

    fpr, tpr, _ = metrics.roc_curve(test_ys, class0_proba, pos_label=0)
    auroc = metrics.auc(fpr, tpr)
    accuracy = np.mean(np.equal(predicted, test_ys)) * 100

    return goodness_of_fit, accuracy, auroc
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis

    Wraps a LinearDiscriminantAnalysis estimator. predict() returns class
    probabilities passed through bundle_predictions().
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        '''Fit the wrapped estimator.'''
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        '''Return the bundled class probabilities for X_test.'''
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df

    def find_best_params(self):
        '''Grid-search the LDA solver on ../data/train.csv and print the winner.

        The data is reduced with select_features() for this predictor's
        animal_type; 'OutcomeType' is the target column.
        '''
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        estimator = LinearDiscriminantAnalysis()
        clf = grid_search.GridSearchCV(estimator, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        # Single-argument call form: prints the same text on Python 2 and 3
        # (the bare print statement is a SyntaxError on Python 3).
        print(clf.best_params_)
Пример #10
0
class LinearDiscriminantAnalysisImpl():
    """Adapter that stores the hyperparameters and delegates to ``SKLModel``."""

    def __init__(self, solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001):
        self._hyperparams = dict(
            solver=solver,
            shrinkage=shrinkage,
            priors=priors,
            n_components=n_components,
            store_covariance=store_covariance,
            tol=tol,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return self; ``y`` is passed on only when given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        return self._wrapped_model.transform(X)

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)
Пример #11
0
def run_linear_discriminant_analysis(train, test, ss_split, labels):
    """Fit LDA on the prepared split and score it on the held-out part.

    The train/hold-out split comes from ``hpr.prepData``.

    Returns:
        ``(test_predictions, acc, ll)``: class probabilities for ``test``,
        plus accuracy and log-loss measured on the held-out split.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(train, test, ss_split,
                                                    labels)

    clf = LinearDiscriminantAnalysis()
    clf.fit(X_train, y_train)
    print('ML Model: Linear Discriminant Analysis')

    # Hold-out evaluation: hard labels for accuracy, probabilities for log-loss.
    holdout_labels = clf.predict(X_test)
    acc = accuracy_score(y_test, holdout_labels)
    holdout_proba = clf.predict_proba(X_test)
    ll = log_loss(y_test, holdout_proba)

    return clf.predict_proba(test), acc, ll
Пример #12
0
def lda(np_train_x, np_train_y, np_test_x, np_test_y, verified_num,
        rejected_num, p):
    """Fit LDA and print error rates for several probability cut-offs.

    For every cut-off in ``p`` a test sample is labelled 1 when its column-1
    probability exceeds the cut-off, else 0. Three rates are printed: the
    overall misclassification rate, wrongly rejected samples (true 1,
    predicted 0) over ``verified_num``, and wrongly verified samples (true 0,
    predicted 1) over ``rejected_num``.

    Args:
        np_train_x, np_train_y: training features and 0/1 labels.
        np_test_x, np_test_y: test features and 0/1 labels.
        verified_num: denominator for the wrongly-rejected rate.
        rejected_num: denominator for the wrongly-verified rate.
        p: iterable of probability cut-offs.
    """
    model_LDA = LinearDiscriminantAnalysis()
    model_LDA.fit(np_train_x, np_train_y)

    # The probabilities do not depend on the cut-off: compute them once.
    positive_proba = model_LDA.predict_proba(np_test_x)[:, 1]
    actual = np.asarray(np_test_y).ravel()
    # Use the real test-set size instead of a hard-coded 25000.
    n_test = len(np_test_x)

    for prob in p:
        predicted_values_LDA = np.where(positive_proba > prob, 1, 0)

        total_miss_classified_LDA = np.abs(actual - predicted_values_LDA).sum()
        reject_wrong_LDA = int(np.sum((actual == 1) & (predicted_values_LDA == 0)))
        verify_wrong_LDA = int(np.sum((actual == 0) & (predicted_values_LDA == 1)))

        print("\n----------------------Linear Discriminant Analysis prob:",
              prob, "--------------------")
        print("miss-classification rate :", total_miss_classified_LDA / n_test,
              "\nFalse negative rate (type1 error) :",
              reject_wrong_LDA / verified_num,
              "\nFalse positive rate (type2 error) :",
              verify_wrong_LDA / rejected_num)
Пример #13
0
    def _get_5_fold_roc_input(self, target_df, field_flag):
        """Produce out-of-fold LDA probabilities for a ROC analysis.

        The feature matrix is assembled from the columns named by
        ``self.get_pc_str(i)`` for ``i < self._n_feature``; the labels come
        from ``target_df[field_flag]``. Only the class with the largest label
        value is treated as positive.

        Returns:
            ``(estimate_prob, binary_labels)``: the 5-fold out-of-fold
            probability of the largest-label class, and an int array that is
            1 where the label equals that class.
        """
        n_sample = target_df.shape[0]
        data_X = np.zeros((n_sample, self._n_feature), dtype=float)
        for column in range(self._n_feature):
            data_X[:, column] = target_df[self.get_pc_str(column)].tolist()
        data_Y = np.array(target_df[field_flag].tolist())

        # convert multi-class to single. only consider the class with largest label num.
        num_classes = np.max(data_Y) + 1
        positive_column = int(num_classes) - 1

        estimate_prob = np.zeros((n_sample, ), dtype=float)
        folds = KFold(n_splits=5)
        logger.info('Run 5 fold LDA')
        for train_idx, test_idx in folds.split(data_X):
            fold_labels_test = data_Y[test_idx]

            lda_obj = LinearDiscriminantAnalysis(n_components=1)
            lda_obj.fit(data_X[train_idx], data_Y[train_idx])

            fold_proba = lda_obj.predict_proba(data_X[test_idx])
            estimate_prob[test_idx] = fold_proba[:, positive_column]

            logger.info(
                f'Num of positive sample in test group {np.sum(fold_labels_test)}')

        return estimate_prob, (data_Y == num_classes - 1).astype(int)
Пример #14
0
def linear_classification(X: np.ndarray, Y: np.ndarray, X_test: np.ndarray,
        Y_test: np.ndarray, n_folds: int, n_comps_max: int, threshold: float, show_plots: bool,
        fignum: int, figsize: Tuple[int, int], normalize: bool):
    """PCA followed by LDA, with the PCA size chosen from cross-validated variance.

    The cumulative explained-variance curve of a ``n_comps_max``-component PCA
    is averaged over ``n_folds`` folds. The smallest number of components
    whose mean cumulative variance reaches ``threshold`` is used for the final
    PCA. LDA is then fitted on the PCA scores of ``X`` and evaluated on
    ``X_test`` through ``analyze_results``.

    Args:
        X, Y: training features and labels.
        X_test, Y_test: test features and labels.
        n_folds: number of K-fold splits for the variance estimate.
        n_comps_max: largest number of principal components considered.
        threshold: required cumulative explained-variance ratio (0..1).
        show_plots: draw the variance curve and call ``pca_inspection``.
        fignum, figsize: forwarded to ``plt.figure`` for the variance plot.
        normalize: forwarded to ``analyze_results``.
    """
    # Create k-folds
    kf = KFold(n_splits=n_folds)
    # PCA - CV (only the training part of each fold is needed)
    cum_var_ratios = np.zeros((n_folds, n_comps_max))
    for i, (train_inds, _) in enumerate(kf.split(X)):
        model = decomposition.PCA(n_components=n_comps_max)
        model.fit(X[train_inds, :])
        cum_var_ratios[i, :] = np.cumsum(model.explained_variance_ratio_)
    # Prepend a zero column so that index k means "k components".
    cum_var_ratios = np.pad(cum_var_ratios, ((0,0),(1,0)), 'constant')
    cum_var_means = np.mean(cum_var_ratios, axis=0)
    cum_var_stds = np.std(cum_var_ratios, axis=0)
    # Plot CV explained variance
    if show_plots:
        plt.figure(num=fignum, figsize=figsize)
        plt.errorbar(np.arange(0, n_comps_max+1), cum_var_means, yerr=cum_var_stds, ecolor='r')
        plt.title('Explained Variance ({:d}-fold CV)'.format(n_folds))
        plt.xlabel('PCs')
        plt.ylabel('Cumulative explained variance')
        plt.xlim(0, n_comps_max)
        plt.ylim(0, 1)
        plt.show()
    # Find number of components based on CV
    reached = np.flatnonzero(cum_var_means >= threshold)
    if reached.size:
        n_comps = int(reached[0])
    else:
        # Previously this case raised IndexError; fall back to the largest
        # number of components that was evaluated.
        n_comps = n_comps_max
        print('Variance threshold not reached; using n_comps_max components')
    print('In linear analysis: ')
    print('# of PCs need to explain {:.0f}% variance in x: {}\n'.format(threshold*100, n_comps))
    # PCA model
    pca_model = decomposition.PCA(n_components=n_comps)
    train_scores = pca_model.fit_transform(X)
    if show_plots:
        pca_inspection(X, Y, n_comps)
    test_scores = pca_model.transform(X_test)
    # LDA model
    lda_model = LinearDiscriminantAnalysis()
    lda_model.fit(train_scores, Y)
    Yhat_train = lda_model.predict(train_scores)
    probs_train = lda_model.predict_proba(train_scores)
    # Predict
    Yhat_test = lda_model.predict(test_scores)
    probs_test = lda_model.predict_proba(test_scores)
    # Result analysis
    analyze_results(Y, Yhat_train, probs_train, Y_test, Yhat_test, probs_test,
            normalize=normalize)
Пример #15
0
    def LDA(self):
        """Pick a PCA size by Gini score, then fit and score the final LDA model.

        Reads ``data//train.csv`` (target column ``target``), splits it with
        ``train_test_split(random_state=0)``, standardises the features, and
        tries PCA with 10..50 components followed by LDA. The component count
        with the highest Gini score is used for the final model.

        Returns:
            ``(LDA_model, self.gini)``: the final fitted LDA model and its
            Gini score, which is also stored on ``self.gini``.

        Raises:
            ValueError: if no candidate produced a comparable Gini score.
        """
        # load data
        df = pd.read_csv('data//train.csv')
        Y = df["target"]
        X = df.drop(['target'], axis=1)
        X_trainval, X_test, Y_trainval, Y_test = train_test_split(
            X, Y, random_state=0)

        # Standardize data
        scaler = StandardScaler().fit(X_trainval)
        X_trainval_transformed = scaler.transform(X_trainval)
        X_test_transformed = scaler.transform(X_test)

        # train LDA model
        # NOTE(review): the component count is selected on the same split that
        # is used for the final score — confirm this is intended.
        Eva = Evaluation.Evaluation()
        # Start below any real score so a best candidate is always recorded,
        # even when every Gini score is <= 0 (previously `best_parameter`
        # stayed unbound in that case).
        best_score = float('-inf')
        best_parameter = None
        for C in [10, 20, 30, 40, 50]:
            Data_pca = PCA(n_components=C).fit(X_trainval_transformed)
            X_train_pca = Data_pca.transform(X_trainval_transformed)
            X_test_pca = Data_pca.transform(X_test_transformed)
            lda_model = LinearDiscriminantAnalysis().fit(
                X_train_pca, Y_trainval)
            prob = lda_model.predict_proba(X_test_pca)[:, 1]
            giniscore = Eva.gini_score(Y_test, prob)
            print("When n_components=", C, ":\nMean score is", giniscore)
            if giniscore > best_score:
                best_score = giniscore
                best_parameter = C

        if best_parameter is None:
            # Only reachable when every score failed the comparison (e.g. NaN).
            raise ValueError("no valid Gini score; cannot select n_components")

        # Get the best model using the best parameter we chose
        Selected_PCA_model = PCA(
            n_components=best_parameter).fit(X_trainval_transformed)
        X_train_pca_best = Selected_PCA_model.transform(X_trainval_transformed)
        X_test_pca_best = Selected_PCA_model.transform(X_test_transformed)

        LDA_model = LinearDiscriminantAnalysis().fit(X_train_pca_best,
                                                     Y_trainval)
        self.gini = Eva.gini_score(
            Y_test,
            LDA_model.predict_proba(X_test_pca_best)[:, 1])
        return LDA_model, self.gini
def LDA(data, target, train_index):
    """Return the hold-out log-loss of a shrinkage LDA classifier.

    The first ``train_index`` rows of ``data`` are split 75/25 together with
    ``target``. The model is fitted on the larger part, and the log-loss of
    its predicted probabilities on the smaller part is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data.iloc[:train_index, :], target, test_size=0.25)
    # Shrinkage is only implemented for the 'lsqr' and 'eigen' solvers; with
    # the default 'svd' solver fit() raises NotImplementedError.
    clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    ll = log_loss(y_test, y_pred)
    return ll
Пример #17
0
 def predict_LDA(self, x, y, x_test, y_test):
     """Fit LDA on (x, y) and predict x_test.

     Prints the fraction of test labels matched and returns
     ``(predictions, probabilities)``. The predictions are appended onto an
     empty array, so they come back flattened.
     """
     classifier = LinearDiscriminantAnalysis()
     classifier.fit(x, y)
     # Appending to an empty array keeps the original flattening/casting.
     LDA_predict = np.append(np.array([]), classifier.predict(x_test))
     p = classifier.predict_proba(x_test)
     print("Lda done", np.mean(y_test == LDA_predict))
     return LDA_predict, p
Пример #18
0
def lda(df, headers, title):
    """Fit LDA and QDA on the first 80% of ``df`` and evaluate on the rest.

    Both models are trained on ``df_train[headers]`` against the ``cho2_b``
    column and evaluated through ``utils.evaluate`` / ``utils.plot_eval``.
    When exactly two feature columns are given, the decision surface of each
    model is drawn in its own subplot.

    Args:
        df: DataFrame with the feature columns plus ``cho_b`` and ``cho2_b``.
        headers: list of feature column names.
        title: suffix for the evaluation/plot titles.

    Returns:
        ``(lda, qda)``: the two fitted models.
    """
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)

    split_at = int(len(df)*0.8)
    df_train = df[:split_at].reset_index(drop=True).fillna(0)
    df_test = df[split_at:].reset_index(drop=True).fillna(0)

    lda.fit(df_train[headers], df_train['cho2_b'])
    qda.fit(df_train[headers], df_train['cho2_b'])

    # NOTE(review): training uses 'cho2_b' but evaluation uses 'cho_b' —
    # confirm that the two different columns are intentional.
    y = df_test['cho_b']
    for prefix, clf in (('LDA ', lda), ('QDA ', qda)):
        y_pred = clf.predict(df_test[headers])
        utils.evaluate(y, y_pred, 0, prefix + title)
        utils.plot_eval(df_test, y, y_pred, title=prefix + title)

    # plot areas
    if len(headers) == 2:
        # NOTE(review): the "true" points are selected on 'cho2_b' and the
        # "false" points on 'cho_b' — confirm.
        cho_true = df_test[df_test['cho2_b'] == True]
        cho_false = df_test[df_test['cho_b'] == False]

        def _draw_panel(position, name, clf, false_style, true_style, y_shift):
            # One subplot: scatter both groups, then overlay the class-1
            # probability surface and its 0.5 contour.
            plt.subplot(2, 1, position)
            # Per-axes title: plt.suptitle is figure-wide, so the second
            # panel used to overwrite the first panel's title.
            plt.title(name)
            plt.scatter(cho_false[headers[0]], cho_false[headers[1]],
                        label='CHO false', **false_style)
            plt.scatter(cho_true[headers[0]], cho_true[headers[1]],
                        label='CHO true', **true_style)

            nx, ny = 200, 100
            x_min, x_max = plt.xlim()
            y_min, y_max = plt.ylim()
            xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                                 np.linspace(y_min, y_max, ny))
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel() + y_shift])
            Z = Z[:, 1].reshape(xx.shape)
            plt.pcolormesh(xx, yy, Z, cmap='RdBu',
                           norm=colors.Normalize(0., 1.), zorder=0)
            plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
            plt.legend()

        plt.figure(figsize=(12, 8))
        # The tiny y offset on the LDA grid is kept from the original code.
        _draw_panel(1, 'LDA', lda,
                    dict(s=8, marker='o'), dict(s=15, marker='o'),
                    1/1000000000000)
        _draw_panel(2, 'QDA', qda,
                    dict(s=3, marker='o'), dict(s=5, marker='x'),
                    0)

    return lda, qda
Пример #19
0
def fiteando(ResultadosX_train,ResultadosX_test,y_train,y_test,true):
    """Precision/recall/F1 curve of LDA trained on the first 10 features.

    Args:
        ResultadosX_train: 2-D training feature array (first 10 columns used).
        ResultadosX_test: 2-D test feature array (first 10 columns used).
        y_train: training labels.
        y_test: test labels.
        true: label treated as the positive class (``pos_label``).

    Returns:
        ``(F1_test, precision, recall, thresholds)``, where ``F1_test`` is the
        F1 curve with its first element dropped.
    """
    clf = LinearDiscriminantAnalysis()
    clf.fit(ResultadosX_train[:,0:10],y_train)
    # Score with the probability of the class named by `true`: column 0 when
    # the positive label is 0, else column 1. Scoring with column 1 while
    # pos_label is 0 yields an inverted curve.
    proba_col = 0 if true == 0 else 1
    probs_test = clf.predict_proba(ResultadosX_test[:,0:10])[:, proba_col]
    precision, recall, thresholds = precision_recall_curve(y_test, probs_test, pos_label=true)
    F1_test = 2 * (precision * recall) / (precision + recall)
    F1_test = F1_test[1:]
    return F1_test, precision, recall, thresholds
Пример #20
0
def prob_pre_recall(x_fit,x,y_fit,y):
	"""Precision/recall/F1 curve of LDA on the first 10 feature columns.

	The model is fitted on ``x_fit``/``y_fit`` and scored on ``x``/``y`` using
	probability column 1 with ``pos_label = 1``.

	Returns ``(precision, recall, threshold, f1)``.
	"""
	classifier = LinearDiscriminantAnalysis()
	classifier.fit(x_fit[:, 0:10], y_fit)

	positive_scores = classifier.predict_proba(x[:, 0:10])[:, 1]
	precision, recall, threshold = skm.precision_recall_curve(
		y, positive_scores, pos_label = 1)

	numerator = 2*precision*recall
	denominator = precision+recall
	f1 = numerator/denominator

	return precision, recall, threshold, f1
Пример #21
0
 def find_putative(self):
     """Keep only putative pairs whose class-0 LDA probability exceeds ``threshold``.

     An LDA classifier is trained on the IBD1/IBD2 values of
     ``self.training`` (label column ``DEGREE``). Column 0 of its
     ``predict_proba`` output is stored in ``self.putative["SECOND_PROB"]``
     and ``self.putative`` is filtered on it.

     NOTE(review): ``threshold`` is not defined in this method; it is
     presumably a module-level name — confirm.
     """
     train_val, train_lab = self.get_training(
         self.training, "DEGREE", ["IBD1", "IBD2"])
     self.check_error(train_lab, ["2nd", "3rd"], "degree")
     classif = LinearDiscriminantAnalysis().fit(
         train_val, train_lab)
     if len(self.putative):
         # One batched call instead of one predict_proba call per row.
         pairs = self.putative[["IBD1", "IBD2"]].values.tolist()
         second_prob = classif.predict_proba(pairs)[:, 0]
     else:
         # Nothing to score; the estimator rejects empty input.
         second_prob = []
     self.putative["SECOND_PROB"] = second_prob
     self.putative = self.putative[
         self.putative["SECOND_PROB"] > threshold]
Пример #22
0
 def classify_second(self, train_df, put_df):
     """Add per-relationship LDA probabilities to ``put_df`` and return it.

     The classifier is trained on the ``HSR``/``N`` values of the rows of
     ``train_df`` whose ``DEGREE`` is ``"2nd"`` (label column ``REL``). For
     every name in ``self.second`` a column is written into ``put_df`` (in
     place) holding the probability column at the same position.

     NOTE(review): this assumes the order of ``self.second`` matches the
     classifier's class order — confirm against where ``self.second`` is set.
     """
     second_degree_rows = train_df[train_df["DEGREE"] == "2nd"]
     train_val, train_lab = self.get_training(
         second_degree_rows, "REL", ["HSR", "N"])
     self.check_error(train_lab, ["AV", "MHS", "PHS", "GP"],
                      "2nd degree")

     classif = LinearDiscriminantAnalysis()
     classif.fit(train_val, train_lab)

     feature_rows = put_df[["HSR", "N"]].values.tolist()
     probs = classif.predict_proba(feature_rows)
     for position, relationship in enumerate(self.second):
         put_df[relationship] = [row[position] for row in probs]
     return put_df
def train_eval_LDA(args, config, train_xs, train_ys, test_xs, test_ys):
    """Train LDA on the raw features and score it on the test split.

    ``args`` and ``config`` are accepted but not used here.

    Returns:
        ``(accuracy, auroc)``: test accuracy in percent and the ROC AUC
        computed from probability column 0 with ``pos_label=0``.
    """
    classifier = LDA()
    classifier.fit(train_xs, train_ys)

    predicted = classifier.predict(test_xs)
    class0_proba = classifier.predict_proba(test_xs)[:, 0]

    fpr, tpr, _ = metrics.roc_curve(test_ys, class0_proba, pos_label=0)
    auroc = metrics.auc(fpr, tpr)
    accuracy = np.mean(np.equal(predicted, test_ys)) * 100

    return accuracy, auroc
Пример #24
0
def LDA_top_k(trn, trn_label, tst, tst_label,num_label,group,top_k):
	"""Group-level accuracy of LDA using a vote among the top-k classes.

	For each sample the ``top_k`` most probable classes are mapped to groups
	with ``unify_label``. The group that occurs most often among them is the
	prediction, and it is compared with the unified true label.
	``num_label`` is accepted but not used.

	Returns ``(train_accuracy, test_accuracy)``.
	"""
	group_ids = range(len(group))
	clf = LinearDiscriminantAnalysis()
	clf.fit(trn, trn_label)

	def _unified_accuracy(samples, true_labels):
		# Vote among the top-k classes of every sample, then score the result.
		probabilities = clf.predict_proba(samples)
		top_classes = np.argsort(probabilities, axis=1)[:,-top_k:]
		unified_rows = np.array(
			[unify_label(row, group) for row in top_classes]).tolist()
		votes = [[row.count(gid) for gid in group_ids] for row in unified_rows]
		winners = np.array([np.argmax(tally) for tally in votes])
		hits = np.sum(winners == unify_label(true_labels, group))
		return hits / (1.0 * len(winners))

	trn_acc_unified = _unified_accuracy(trn, trn_label)
	tst_acc_unified = _unified_accuracy(tst, tst_label)

	return trn_acc_unified,tst_acc_unified
Пример #25
0
def do_LDA(model, X_train, Y_train, X_test, Y_test):
    """Reduce the data with LDA, train ``model`` on it, return a pooled ROC curve.

    The features are first projected with ``LinearDiscriminantAnalysis``
    (fitted on the training data). ``model`` is then fitted on the projected
    training data and its class probabilities on the projected test data are
    compared with the binarized true labels, pooled over all classes.

    Returns:
        ``(fpr, tpr)`` as produced by ``roc_curve``.
    """
    projector = LinearDiscriminantAnalysis()
    X_train = projector.fit_transform(X_train, Y_train)
    X_test = projector.transform(X_test)
    clf = model.fit(X_train, Y_train)
    scores = clf.predict_proba(X_test)
    print("LDA")
    print(clf.score(X_test, Y_test))
    # Binarize in the classifier's own class order so every label column
    # lines up with the matching predict_proba column; set(Y_test) has an
    # arbitrary order and may even miss classes.
    trueLabelsBin = label_binarize(Y_test, classes=clf.classes_)
    if trueLabelsBin.shape[1] == 1:
        # Two classes: label_binarize yields one column, so keep only the
        # probability column of the second class.
        scores = scores[:, 1]
    print(trueLabelsBin.ravel())
    fpr, tpr, rf = roc_curve(trueLabelsBin.ravel(), scores.ravel())
    return fpr, tpr
    def train(self):
        """Train a shrinkage LDA model, record its metrics, and export it.

        Fits on ``self.x_train``/``self.y_train`` and evaluates on
        ``self.x_test``/``self.y_test``. Side effects: prints the accuracy,
        writes the ROC plot to ``./static/images/roc_lda.png``, saves a
        ``ModelDetail`` record with the scores, and pickles the model together
        with the test column names to
        ``./HRAnalysis/analysemodels/models/LDA.pkl``.
        """
        model_score_dict = dict()
        model_start_time = datetime.datetime.now()

        lda = LinearDiscriminantAnalysis(shrinkage="auto",
                                         solver="lsqr",  # eigen, svd(default)
                                         )
        lda.fit(self.x_train, self.y_train)
        y_pred = lda.predict(self.x_test)
        acc_lda = accuracy_score(self.y_test, y_pred)
        print("Linear Discriminant Analysis Accuracy Score is : ", acc_lda)

        model_end_time = datetime.datetime.now()
        model_running_performance = model_end_time - model_start_time

        #Confusion Matrix
        conf_mat = confusion_matrix(self.y_test, y_pred)

        # ROC Curve
        pred_proba_lda = lda.predict_proba(self.x_test)[::, 1]
        fpr, tpr, _ = metrics.roc_curve(self.y_test, pred_proba_lda)
        auc_lda = metrics.roc_auc_score(self.y_test, pred_proba_lda)

        fig = plt.figure()
        try:
            lw = 3
            plt.plot(fpr, tpr, label="Linear Discriminant Analysis, auc_lda = " + str(auc_lda))
            plt.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='dashed')
            plt.title('Linear Discriminant Analysis ROC')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.legend(loc=4)
            plt.savefig('./static/images/roc_lda.png')
        finally:
            # Release the figure so repeated training runs do not pile up
            # open figures.
            plt.close(fig)

        #Assign all score values to dict
        # total_seconds() covers the whole duration; `.seconds` drops days
        # and the sub-second part.
        model_score_dict["model_running_performance"] = (model_running_performance.total_seconds()/60)
        model_score_dict["accuracy"] = acc_lda
        model_score_dict["conf_mat"] = conf_mat.tolist()
        model_score_dict["fpr"] = fpr.tolist()
        model_score_dict["tpr"] = tpr.tolist()
        model_score_dict["auc"] = auc_lda

        md = ModelDetail(**{'AlgorithmName': 'Linear Discriminant Analysis', 'ModelScoreDict': str(model_score_dict)})
        md.save()

        # Export model
        with open('./HRAnalysis/analysemodels/models/LDA.pkl', 'wb') as model_file:
            pickle.dump({"columns": self.x_test.columns.tolist(), "model": lda}, model_file)
Пример #27
0
class LDA(Model):
    """Model node backed by ``LinearDiscriminantAnalysis`` (svd solver).

    Declares table-typed input and vector-typed output.
    """

    def __init__(self):
        super().__init__(input_type=NumericalDataTypesEnum.table,
                         output_type=NumericalDataTypesEnum.vector)
        self.__model = LinearDiscriminantAnalysis(solver="svd")

    def predict(self, data: InputData):
        """Return probability column 1 for ``data.features``."""
        class_probabilities = self.__model.predict_proba(data.features)
        return class_probabilities[:, 1]

    def fit(self, data: InputData):
        """Fit on the training part returned by ``train_test_data_setup``."""
        train_part, _unused = train_test_data_setup(data=data)
        self.__model.fit(train_part.features, train_part.target)

    def tune(self, data):
        """No tuning is implemented; always returns 1."""
        return 1
Пример #28
0
def LDA(line_list, temp):
    """
    Check a group of reads against a hard-coded, temperature-specific linear
    discriminant.

    A ``LinearDiscriminantAnalysis`` object is populated with pre-computed
    coefficients for the requested temperature and applied to one feature row
    per read: ``[len(read), read.xs_tag, read.gc_content]``.

    If ``temp`` is not one of the supported temperatures, a message is printed
    and the process exits through ``sys.exit()``.

    :param line_list: list of SAM object; each item must support ``len()`` and
        expose ``xs_tag`` and ``gc_content``
    :param temp: temperature; one of 32, 37, 42, 47, 52, 57
    :return: True when the predicted probability of class ``1`` is below 0.5
        for every read; False otherwise, or as soon as a read has a falsy
        ``xs_tag``
    """
    temp_list = [32, 37, 42, 47, 52, 57]
    coef_list = [[[-0.14494789, 0.18791679, 0.02588474]],
                 [[-0.13364364, 0.22510179, 0.05494031]],
                 [[-0.09006122, 0.25660706, 0.1078303]],
                 [[-0.01593182, 0.24498485, 0.15753649]],
                 [[0.01860365, 0.1750174, 0.17003374]],
                 [[0.03236755, 0.11624593, 0.24306498]]]
    inter_list = [-1.17545204, -5.40436344, -12.45549846,
                  -19.32670233, -20.11992898, -23.98652919]
    class_list = [-1, 1]
    try:
        classfier_index = temp_list.index(temp)
    except ValueError:
        print("The given temperature was not in temp_list:", temp_list)
        sys.exit()

    # Build the feature rows first so that a read without an XS tag is
    # rejected before any classifier is set up.
    test_list = []
    for sub_line in line_list:
        if not sub_line.xs_tag:
            return False
        # ``np.float`` was only an alias of the builtin and no longer exists
        # in current NumPy releases; the builtin gives the same value.
        test_list.append([float(len(sub_line)), sub_line.xs_tag, sub_line.gc_content])

    coef_array = np.asarray(coef_list)
    inter_array = np.asarray(inter_list)
    class_array = np.asarray(class_list)

    # The estimator is never fitted: its learned attributes are filled in
    # directly from the tables above.
    lda_classifer = LinearDiscriminantAnalysis()
    lda_classifer.coef_ = coef_array[classfier_index]
    lda_classifer.intercept_ = inter_array[classfier_index]
    lda_classifer.classes_ = class_array

    lda_prob = lda_classifer.predict_proba(np.asarray(test_list))[:, 1]
    return all(prob < 0.5 for prob in lda_prob)
def sim():
    """Exploratory LDA run on ``test1.data``; saves a scatter plot to ``a.png``.

    The first column of the file is used as the target and the remaining
    columns as features. Feature columns 0, 1, 2 and 4 are overwritten with
    random integers before any model is fitted. Several intermediate results
    (variance-threshold output, random-forest importances, selected-feature
    shape, predictions) are computed but not returned; they are kept for
    inspection only.
    """
    #save_data('test.data')
    A = np.loadtxt('test1.data', delimiter=',')

    labels = A[:, 0]

    # Remove targets from input data
    A = A[:, 1:]

    # Replace selected feature columns with random integers in [0, 100].
    for col in (0, 1, 2, 4):
        for row in range(len(A)):
            A[row][col] = random.randint(0, 100)

    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    sff = sel.fit_transform(A)

    clf = RandomForestClassifier()
    clf = clf.fit(A, labels)
    hh = clf.feature_importances_
    #jj = clf.predict([[1, 2, 3, 25, 50]])

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(A)
    hh2 = X_new.shape

    #plot(A, y)

    lda = LinearDiscriminantAnalysis(n_components=2)
    hh3 = lda.fit(A, labels)
    drA = lda.transform(A)

    Z = generate_data_set(1, 5)
    # Classify in the original feature space: the model was fitted on the
    # columns of A, so predicting on the projected data would fail with a
    # feature-count mismatch. Project only afterwards.
    z_lab = lda.predict(Z)
    z_prob = lda.predict_proba(Z)
    Z = lda.transform(Z)

    plt.figure()
    x = [point[0] for point in drA]
    y = [point[1] for point in drA]
    # Predicted class per training sample, again from the unprojected
    # features (predict needs a 2-D array in the fitted feature space).
    # NOTE(review): not used for colouring yet; every point is drawn red.
    cls = [int(label) for label in lda.predict(A)]
    plt.scatter(x, y, c=[[1, 0, 0]])
    plt.savefig('a.png')
Пример #30
0
def main():
    """Train LDA on ``train.csv`` and write class probabilities for ``test.csv``.

    Reads ``train.csv`` (columns ``species`` and ``id`` plus features), fits a
    ``LinearDiscriminantAnalysis`` on one stratified training split, predicts
    class probabilities for ``test.csv`` and writes them, keyed by ``id``, to
    ``submission.csv``.
    """
    df = pd.read_csv("train.csv")

    # encode result label
    le = LabelEncoder().fit(df.species)
    labels = le.transform(df.species)
    classes = list(le.classes_)
    print(classes)

    # drop extra field (keyword ``axis``: the positional form is not accepted
    # by current pandas, and the test frame below already uses the keyword)
    df = df.drop(['species', 'id'], axis=1)

    # train/test split using stratified sampling
    # NOTE: every iteration overwrites the previous split, so only the last
    # of the 10 generated splits is used for fitting.
    sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)
    for train_index, test_index in sss:
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # classification algorithm
    # classification(x_train, y_train, x_test, y_test)

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_ids = test.id
    test = test.drop(['id'], axis=1)
    test_predictions = favorite_clf.predict_proba(test)
    print(test_predictions)

    # Format DataFrame: one probability column per class, ids first
    submission = pd.DataFrame(test_predictions, columns=classes)
    submission.insert(0, 'id', test_ids)

    # Export Submission
    submission.to_csv('submission.csv', index=False)
Пример #31
0
class LDAwithYHandling(BaseEstimator, TransformerMixin):
    """LDA wrapper that accepts a per-class score matrix as the target.

    ``fit`` reduces each row of ``y`` to the index of its largest entry and
    trains a ``LinearDiscriminantAnalysis`` on those indices. ``score``
    reports the mean Spearman correlation between the rows of ``y`` and the
    predicted class probabilities.
    """

    def __init__(self):
        self.lda = LinearDiscriminantAnalysis()

    @staticmethod
    def maxIndexWithSampling(y):
        """Sample one column index per row, treating each row as weights.

        A uniform number in [0, 1) is drawn per row and the first index whose
        running sum exceeds it is chosen. If the running sum never exceeds
        the draw (for example a row summing to slightly less than 1), the
        last index is used instead of leaving the slot uninitialised.

        Returns a float array of shape ``(y.shape[0], 1)``.
        """
        chosenIndices = np.empty((y.shape[0], 1))
        for i in range(y.shape[0]):
            rand = random.random()
            rowY = y[i]
            cumSum = 0
            for j in range(rowY.shape[0]):
                cumSum += rowY[j]
                if rand < cumSum:
                    chosenIndices[i] = j
                    break
            else:
                chosenIndices[i] = rowY.shape[0] - 1
        return chosenIndices

    @staticmethod
    def maxIndex(y):
        """Return the arg-max of every row as a float array of shape (n, 1)."""
        y_n = np.empty((y.shape[0], 1))
        for i in range(y.shape[0]):
            y_n[i] = np.argmax(y[i])
        return y_n

    def fit(self, X, y, sample_weight=None):
        """Fit the inner LDA on the arg-max column of each row of ``y``.

        ``sample_weight`` is accepted but not used.
        """
        chosenIndices = np.argmax(y, axis=1)
        self.lda.fit(X, chosenIndices)
        return self

    def score(self, X, y, sample_weight=None):
        """Return the mean row-wise Spearman correlation of ``y`` vs. predictions.

        ``sample_weight`` is accepted but not used.
        """
        y_p = self.predict_proba(X)
        n_samples = X.shape[0]
        correl = 0
        for i in range(n_samples):
            correla, _ = stats.spearmanr(y[i], y_p[i])
            correl = correl + correla
        return correl / n_samples

    def predict_proba(self, X):
        """Return the inner LDA's class probabilities for ``X``."""
        return self.lda.predict_proba(X)
Пример #32
0
 def fit(self, X,y, method='self-training', treshold=0.7):
     """Fit an LDA classifier on partially labeled data.

     Samples whose entry in ``y`` equals -1 are treated as unlabeled. When
     no such sample exists, ``method`` is overridden to ``'supervised'``.

     :param X: feature matrix, indexed by row with a boolean mask
     :param y: label array; -1 marks an unlabeled sample
     :param method: ``'supervised'``, ``'self-training'`` or
         ``'label-propagation'``
     :param treshold: probability a class must exceed for an unlabeled
         sample to receive that class index as its label
     :raises ValueError: if ``method`` is not one of the supported names

     Stores the fitted estimator in ``self.classifier`` and its
     ``means_`` / ``covariance_`` in ``self.means_`` / ``self.covariance_``.
     The label-propagation branch additionally sets
     ``self.propagated_labels``.
     """
     def getLabel(p):
         # Index of the first probability above the threshold, -1 if none.
         above = np.where(p>treshold)[0]
         return above[0] if above.size else -1

     yp = copy(y)
     mask = np.ones(len(y),dtype=bool) #mask of labeled data
     mask[np.where(yp==-1)[0]] = False

     #if there are no unlabeled data
     if mask.all():
         method = 'supervised'

     # Reject unknown methods up front with a real exception; raising a
     # plain string is itself a TypeError.
     if method not in ('supervised', 'self-training', 'label-propagation'):
         raise ValueError('No valid method was given!')

     lda = LinearDiscriminantAnalysis(solver='svd',store_covariance=True, n_components=10)

     if method =='supervised':
         lda.fit(X[mask,:],yp[mask]) #train with all labeled data

     elif method=='self-training':
         counter=0
         while True:
             lda.fit(X[mask,:],yp[mask])
             # Stop when everything is labeled or the iteration cap is hit.
             if len(yp[~mask]) == 0 or counter == self.max_iter:
                 break
             probs = lda.predict_proba(X[~mask])
             yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
             counter+=1
             # Rebuild the mask so newly labeled samples join the next fit.
             mask = np.ones(len(y), dtype=bool)
             mask[np.where(yp==-1)[0]]=False

     else:  # 'label-propagation'
         label_prop_model=LabelPropagation(kernel='knn',n_neighbors=10,alpha=0.9)
         label_prop_model.fit(X,yp)
         probs = label_prop_model.predict_proba(X[~mask])
         yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
         self.propagated_labels = yp

         # NOTE(review): the mask is not rebuilt here, so the LDA is fitted
         # on the originally labeled samples only - confirm this is intended.
         lda.fit(X[mask,:],yp[mask])

     self.classifier, self.means_, self.covariance_ =lda, lda.means_, lda.covariance_
def test_lda_predict():
    # LDA classification on toy data: fit/predict must reproduce the labels
    # for every (solver, shrinkage) pair listed in ``solver_shrinkage``.
    for solver, shrinkage in solver_shrinkage:
        label = 'solver %s' % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        clf.fit(X, y)
        assert_array_equal(clf.predict(X), y, label)

        # Same check on the 1D data set
        clf.fit(X1, y)
        assert_array_equal(clf.predict(X1), y, label)

        # Probability estimates must agree with the labels ...
        proba_1d = clf.predict_proba(X1)
        assert_array_equal((proba_1d[:, 1] > 0.5) + 1, y, label)
        # ... and with the exponentiated log-probabilities
        log_proba_1d = clf.predict_log_proba(X1)
        assert_allclose(np.exp(log_proba_1d), proba_1d,
                        rtol=1e-6, atol=1e-6, err_msg=label)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        clf.fit(X, y3)
        predicted_y3 = clf.predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(predicted_y3 != y3), label

    # Invalid shrinkage values and an unknown solver must be rejected on fit
    rejected_configurations = [
        ({'solver': "lsqr", 'shrinkage': -0.2231}, ValueError),
        ({'solver': "eigen", 'shrinkage': "dummy"}, ValueError),
        ({'solver': "svd", 'shrinkage': "auto"}, NotImplementedError),
        ({'solver': "dummy"}, ValueError),
    ]
    for params, expected_error in rejected_configurations:
        clf = LinearDiscriminantAnalysis(**params)
        assert_raises(expected_error, clf.fit, X, y)
def processTraining(cvtrainx,cvtrainy,cvevalx,prob=False):
    """Vectorise text, reduce it with SVD and classify it with LDA.

    The training texts are turned into TF-IDF features (tokenised by
    ``mytokenlizer``), reduced to 600 components with ``TruncatedSVD`` and
    used to fit a ``LinearDiscriminantAnalysis``. The evaluation texts go
    through the same fitted transformers.

    :param cvtrainx: training documents
    :param cvtrainy: training labels
    :param cvevalx: evaluation documents
    :param prob: when true return ``predict_proba`` output, otherwise
        ``predict`` output
    :return: predictions for ``cvevalx``
    """
    # Show the first raw document of each set.
    print(cvtrainx[0])
    #cvevalx=[' '.join(s) for s in cvevalx]
    print(cvevalx[0])
    tfv = TfidfVectorizer(min_df=10,  max_features=None,
        strip_accents='unicode', analyzer=mytokenlizer,
        ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')

    cvtrainx=tfv.fit_transform(cvtrainx)
    cvevalx=tfv.transform(cvevalx)
    tsvd=TruncatedSVD(n_components=600,random_state=2016)
    cvtrainx=tsvd.fit_transform(cvtrainx)
    cvevalx=tsvd.transform(cvevalx)

    # Fetch the vocabulary once; report its size and first ten entries.
    feature_names = tfv.get_feature_names()
    print(len(feature_names))
    print(feature_names[0:10])

    clf=LinearDiscriminantAnalysis()
    clf.fit(cvtrainx,cvtrainy)
    if prob:
        return clf.predict_proba(cvevalx)
    return clf.predict(cvevalx)
Пример #35
0
# Evaluate the previously built model, then an LDA model, with test_model.
print("Random Forest")
test_model(model)

model_lda = LinearDiscriminantAnalysis()
print("LDA")
test_model(model_lda)

# Score the held-out recording (file 3) sample by sample.
use_prediction = False
raw_test_data, test_labels = readDataMultipleFiles([3])
(test_data_matrix,
 test_data_matrices,
 test_labels,
 test_labels_binary) = buildMatricesAndLabels(raw_test_data, test_labels, scaling_functions)

test_predictions = []
for features in test_data_matrix:
    # use_prediction selects class probabilities; otherwise the raw
    # decision-function output is collected.
    if use_prediction:
        sample_score = model_lda.predict_proba([features])[0]
    else:
        sample_score = model_lda.decision_function([features])[0]
    test_predictions.append(sample_score)

# Number of positive entries per target.
for i in range(target_count):
    print(sum(test_labels_binary[i]))

thresholds_for_bci = multiclassRoc(test_predictions, test_labels_binary)

# model = SVC(C=1000, kernel="poly", degree=2)
# print "SVM"
# test_model(model)

# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0.pkl", "w")).dump(model_lda)
# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0_mm.pkl", "w")).dump(min_max)
# pickle.Pickler(file("U:\\data\\test\\5_targets\\model0_thresh.pkl", "w")).dump(thresholds_for_bci)

# print model_lda.coef_
def TrialClassificationWithPhysiology(phys_filename, trial_types, plot_results = False):
	"""Classify trials as regular/stress from mean IBI and pupil values.

	Loads per-trial means from the .mat file ``phys_filename``, places them
	at the positions given by ``trial_types`` (0 = regular, 1 = stress),
	mean-centres both signals and fits a shrinkage LDA on the two columns.
	When ``plot_results`` is true, the class-probability maps and the
	classified trials are shown.

	Returns the mean-centred ``ibi`` and ``pupil`` column vectors.
	"""
	stress_inds = np.ravel(np.nonzero(trial_types==1))
	reg_inds = np.ravel(np.nonzero(trial_types==0))
	num_trials = len(trial_types)

	# loadmat inserts the file's variables into the supplied dictionary
	phys_features = dict()
	sp.io.loadmat(phys_filename,phys_features)
	ibi_reg_mean = np.ravel(phys_features['ibi_reg_mean'] )
	ibi_stress_mean = np.ravel(phys_features['ibi_stress_mean'])
	pupil_reg_mean = np.ravel(phys_features['pupil_reg_mean'])
	pupil_stress_mean = np.ravel(phys_features['pupil_stress_mean'])

	def per_trial_column(reg_values, stress_values):
		# one value per trial, written at that trial's original position
		column = np.zeros([num_trials, 1])
		column[reg_inds] = reg_values.reshape((len(reg_inds),1))
		column[stress_inds] = stress_values.reshape((len(stress_inds),1))
		return column

	ibi = per_trial_column(ibi_reg_mean, ibi_stress_mean)
	pupil = per_trial_column(pupil_reg_mean, pupil_stress_mean)

	ibi = ibi - np.nanmean(ibi)
	pupil = pupil - np.nanmean(pupil)

	# trial classification with physiological data
	X_phys = np.hstack((ibi, pupil))
	svc = LinearDiscriminantAnalysis(solver='eigen', shrinkage = 'auto')
	svc.fit(X_phys,trial_types)
	y_pred = svc.predict(X_phys)
	classif_rate = np.mean(y_pred.ravel()==trial_types.ravel())*100

	# 100 x 100 evaluation grid spanning the scaled data range
	grid_extent = (0.8*np.min(ibi),1.2*np.max(ibi),0.8*np.min(pupil),1.2*np.max(pupil))
	xx = np.linspace(grid_extent[0],grid_extent[1],100)
	yy = np.linspace(grid_extent[2],grid_extent[3],100)
	xx,yy = np.meshgrid(xx,yy)
	Xfull = np.c_[xx.ravel(), yy.ravel()]
	probas = svc.predict_proba(Xfull)
	n_classes = np.unique(y_pred).size
	class_labels = ['Regular', 'Stress']

	cmap = plt.get_cmap('bwr')

	if not plot_results:
		return ibi, pupil

	plt.figure()
	for k in range(n_classes):
		plt.subplot(1,n_classes,k+1)
		plt.title(class_labels[k])
		imshow_handle = plt.imshow(probas[:,k].reshape((100,100)), vmin = 0.1, vmax = 0.9,extent = grid_extent, origin = 'lower',aspect='auto', cmap = cmap)
		if k==0:
			plt.xlabel('IBI')
			plt.ylabel('Pupil')
		plt.xticks(())
		plt.yticks(())
		plt.axis('tight')
		# overlay the trials predicted as class k
		idx = (y_pred == k)
		if idx.any():
			plt.scatter(X_phys[idx,0], X_phys[idx,1],marker = 'o',color = 'k')
	ax = plt.axes([0.15, 0.04, 0.7, 0.05])
	plt.colorbar(imshow_handle, cax = ax,orientation = 'horizontal')
	plt.title('SVM Classification with Physiological Data: %f correct' % (classif_rate))
	plt.show()

	return ibi, pupil
Пример #37
0
    mc_logloss = []
    mc_train_pred = []
    for i_mc in range(params['n_monte_carlo']):
        cv_n = params['cv_n']
        kf = StratifiedKFold(target.values, n_folds=cv_n, shuffle=True, random_state=i_mc ** 3)

        xgboost_rounds = []

        for cv_train_index, cv_test_index in kf:
            X_train, X_test = train[cv_train_index, :], train[cv_test_index, :]
            y_train, y_test = target.iloc[cv_train_index].values, target.iloc[cv_test_index].values

            lda.fit(X_train, y_train)

            # predict
            predicted_results = lda.predict_proba(X_test)[:, 1]
            train_predictions[cv_test_index] = predicted_results

        print('logloss score ', log_loss(target.values, train_predictions))
        mc_logloss.append(log_loss(target.values, train_predictions))
        mc_train_pred.append(train_predictions)

    mc_train_pred = np.mean(np.array(mc_train_pred), axis=0)

    mc_logloss_mean.append(np.mean(mc_logloss))
    mc_logloss_sd.append(np.std(mc_logloss))
    print('The Logloss range is: %.5f to %.5f' %
          (mc_logloss_mean[-1] - mc_logloss_sd[-1], mc_logloss_mean[-1] + mc_logloss_sd[-1]))
    print_results.append('The AUC range is: %.5f to %.5f' %
                         (mc_logloss_mean[-1] - mc_logloss_sd[-1], mc_logloss_mean[-1] + mc_logloss_sd[-1]))
    print('For ', mc_logloss)
# Fit LDA on the 2-D embedding and report how well it recovers the labels.
lda = LinearDiscriminantAnalysis()
lda.fit(output, labels)
print(lda.predict([[-0.8, -1]]))

y_pred = lda.predict(output)
print(labels)
print(y_pred)
mcc = matthews_corrcoef(labels,y_pred)
print("MCC="+str(mcc))

# Decision boundary: the 0.5 contour of the second class's probability,
# evaluated on a grid covering the data range.
nx, ny = 200, 100
x_min, x_max = np.amin(output[:,0]), np.amax(output[:,0])
y_min, y_max = np.amin(output[:,1]), np.amax(output[:,1])
grid_x = np.linspace(x_min, x_max, nx)
grid_y = np.linspace(y_min, y_max, ny)
xx, yy = np.meshgrid(grid_x, grid_y)
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)
plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed')

# Mark the two class means.
for class_index in (0, 1):
    plt.plot(lda.means_[class_index][0], lda.means_[class_index][1],'o', color='black', markersize=10)
plt.title('LDA with MDS and Gaussian Mixture')

# Red and green groups are taken by row position.
# NOTE(review): row 26 falls in neither slice - confirm this is intended.
output_red = output[0:26]
output_green = output[27:52]
plt.scatter(output_red[:, 0], output_red[:,1], color='r')
plt.scatter(output_green[:, 0], output_green[:, 1],color='g')
plt.show()
Пример #39
0
def _paintProbabilitySurface(subplotCode, yPred, cClasses, meshShape, Xrr, cValGood, nClasses, plotTitle):
    # Draw one winner-takes-all class map (alpha = relative confidence) with the data points on top.
    rowMax = yPred.max(axis=1)
    # 1.0 for the most probable class of each mesh point, 0.0 elsewhere.
    cWinner = (yPred == rowMax[:, np.newaxis]).astype('float')
    Xmc = np.dot(cWinner, cClasses)       # RGBA colour of the winning class
    Xmc[:, 3] = rowMax / yPred.max()      # alpha scaled by the largest probability on the mesh

    plt.subplot(subplotCode)
    Zplot = Xmc.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant axis: spread the points randomly along y.
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title(plotTitle)
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))


def discriminatePlot(X, y, cVal, titleStr=''):
    """Run LDA, QDA and random forest with error checking, plot and score them.

    Classes with fewer than MINCOUNT samples are dropped, the data is
    projected with PCA, the three classifiers are cross-validated, and their
    class-probability surfaces are plotted in the space of the first two
    discriminant functions.

    X        np array, n rows x p parameters
    y        group labels, n rows
    cVal     rgb color code for each data point - should be the same for
             each data point belonging to the same group
    titleStr title for plots

    Returns ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE,
    nClasses (scores in percent; the SE values are standard deviations over
    the evaluated folds). Returns seven -1 values when fewer than two classes
    have enough data.
    """
    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    keptClasses = classes[np.array([n >= MINCOUNT for n in classesCount])]
    goodInd = np.array([b in keptClasses for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Floor division keeps the integer result regardless of the Python version.
    nDmax = int(np.fix(np.sqrt(nX // 5)))
    if nDmax < nD:
        print('Warning: Insufficient data for %s parameters. PCA projection to %s dimensions.' % (nD, nDmax))
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        keptTrainClasses = trainClasses[np.array([n >= MINCOUNTTRAINING for n in trainCount])]
        goodIndTrain = np.array([b in keptTrainClasses for b in yGood[train]])

        # Specify the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        # Only test samples whose class was seen in training can be scored;
        # skip the fold when none is left.
        goodIndTest = np.array([b in trainClasses for b in yGood[test]])
        if not goodIndTest.any():
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        XrTest = Xr[test[goodIndTest]]
        yTest = yGood[test[goodIndTest]]
        ldaScores[iskf] = ldaMod.score(XrTest, yTest)
        qdaScores[iskf] = qdaMod.score(XrTest, yTest)
        rfScores[iskf] = rfMod.score(XrTest, yTest)

        iskf += 1

    if (iskf !=  cvFolds):
        # Some folds were skipped: keep only the scores that were filled in
        # so the unused zero slots do not bias the mean.
        cvFolds = iskf
        ldaScores = ldaScores[:cvFolds]
        qdaScores = qdaScores[:cvFolds]
        rfScores = rfScores[:cvFolds]

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print('Error in ldaPlot: labels do not match')

    # Print the coefficients of first 3 DFA
    print('LDA Weights:')
    print('DFA1: %s' % (ldaMod.coef_[0,:],))
    if nClasses > 2:
        print('DFA2: %s' % (ldaMod.coef_[1,:],))
    if nClasses > 3:
        print('DFA3: %s' % (ldaMod.coef_[2,:],))

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)

    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting: the first two discriminant axes are swept,
    # any further axis is held at its mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = XrrMean[ix]

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Plot the surfaces of probability, one panel per classifier
    meshShape = np.shape(x1)
    plt.figure(facecolor='white', figsize=(10,3))
    _paintProbabilitySurface(131, yPredLDA, cClasses, meshShape, Xrr, cValGood, nClasses,
                             '%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    _paintProbabilitySurface(132, yPredQDA, cClasses, meshShape, Xrr, cValGood, nClasses,
                             '%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    _paintProbabilitySurface(133, yPredRF, cClasses, meshShape, Xrr, cValGood, nClasses,
                             '%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.show()

    # Results
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0

    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print("%s LDA: %.2f (+/- %0.2f) %%" % (titleStr, ldaScore, ldaScoreSE))
    print("%s QDA: %.2f (+/- %0.2f) %%" % (titleStr, qdaScore, qdaScoreSE))
    print("%s RF: %.2f (+/- %0.2f) %%" % (titleStr, rfScore, rfScoreSE))
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
def test_lda_predict_proba(solver, n_classes):
    def generate_dataset(n_samples, centers, covariances, random_state=None):
        """Draw equally sized multivariate-normal blobs, one per center,
        labelled 0..len(centers)-1"""
        rng = check_random_state(random_state)
        per_class = n_samples // len(centers)
        X = np.vstack([rng.multivariate_normal(mean, cov, size=per_class)
                       for mean, cov in zip(centers, covariances)])
        y = np.hstack([[clazz] * per_class
                       for clazz in range(len(centers))])
        return X, y

    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
    shared_covariance = [[10, 10], [10, 100]]
    blob_stds = np.array([shared_covariance] * len(blob_centers))
    X, y = generate_dataset(
        n_samples=90000, centers=blob_centers, covariances=blob_stds,
        random_state=42
    )
    lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
                                     shrinkage=None)
    lda = lda.fit(X, y)
    # the fitted means and covariance must be close to the generating ones
    assert_allclose(lda.means_, blob_centers, atol=1e-1)
    assert_allclose(lda.covariance_, blob_stds[0], atol=1)

    # Reference probabilities following The Elements of Statistical Learning
    # (cf. p.127, Sect. 4.4.5 "Logistic Regression or LDA?"), with the last
    # class as the reference class.
    precision = linalg.inv(blob_stds[0])
    reference_center = blob_centers[-1]
    alpha_k = []
    alpha_k_0 = []
    for center in blob_centers[:-1]:
        direction = np.dot(precision,
                           (center - reference_center)[:, np.newaxis])
        alpha_k.append(direction)
        alpha_k_0.append(
            np.dot(- 0.5 * (center + reference_center)[np.newaxis, :],
                   direction))

    sample = np.array([[-22, 22]])

    def discriminant_func(sample, coef, intercept, clazz):
        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))

    # exponentiated discriminants of the non-reference classes
    exp_terms = [discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                 for clazz in range(n_classes - 1)]
    normaliser = 1 + sum(exp_terms)

    prob = np.array([float(term / normaliser) for term in exp_terms])
    prob_ref = 1 - np.sum(prob)

    # consistency check: the probabilities must sum to one, so the reference
    # class computed directly has to match the complement
    prob_ref_2 = float(1 / normaliser)

    assert prob_ref == pytest.approx(prob_ref_2)
    # the LDA probabilities must be close to the theoretical ones
    expected_proba = np.hstack([prob, prob_ref])[np.newaxis]
    assert_allclose(lda.predict_proba(sample), expected_proba, atol=1e-2)