Example No. 1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


def classify_using_lda(feat1, feat2, num_comp=2):

    n_plus = len(feat1)
    n_minus = len(feat2)

    X = np.concatenate((feat1, feat2), axis=0)
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1

    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    lda = LDA(n_components=min(num_comp, 1))  # two classes allow at most one component
    lda.fit(X, y)

    # LDA yields at most n_classes - 1 discriminant axes, so with two classes
    # transform() returns n_samples x 1, never n_samples x 2.
    X_tr = lda.transform(X)

    print(X_tr.shape, lda.score(X, y))

    # Only one LDA component exists, so pad a constant second coordinate for plotting.
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)

    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')

    plt.ylim(-1, 3)
    plt.show()
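
The capped width is expected: scikit-learn limits LDA to min(n_features, n_classes - 1) components, so a two-class problem always projects onto a single axis. A minimal sketch on synthetic blobs (all names illustrative):

from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X2, y2 = make_blobs(n_samples=60, centers=2, n_features=5, random_state=0)
X3, y3 = make_blobs(n_samples=60, centers=3, n_features=5, random_state=0)

# Two classes -> one discriminant axis, no matter how many features exist.
print(LinearDiscriminantAnalysis(n_components=1).fit_transform(X2, y2).shape)  # (60, 1)
# Three classes -> at most two axes.
print(LinearDiscriminantAnalysis(n_components=2).fit_transform(X3, y3).shape)  # (60, 2)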
Example No. 2
def computing_performance_LDA(in_path=None, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(0, n_times):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
Example No. 3
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert_true(np.any(y_pred3 != y3), "solver %s" % solver)

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
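
The solver/shrinkage pairs exercised above matter most when features outnumber samples: 'lsqr' and 'eigen' accept shrinkage (a float in [0, 1] or 'auto' for a Ledoit-Wolf estimate), while 'svd' does not, which is exactly what the NotImplementedError case checks. A sketch of the practical effect on synthetic data (names illustrative):

from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

# Few samples, many features: the plain covariance estimate is noisy.
X_hd, y_hd = make_classification(n_samples=60, n_features=100, n_informative=5, random_state=0)
plain = LinearDiscriminantAnalysis(solver='lsqr')
shrunk = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
print(cross_val_score(plain, X_hd, y_hd, cv=5).mean())
print(cross_val_score(shrunk, X_hd, y_hd, cv=5).mean())  # shrinkage usually helps here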
Example No. 4
    def tuneSpatialFilters(self):

        print(colors.MAGENTA)
        num_total_spatial_filters = self.all_spatial_filters.shape[0]

        best_mean = 0
        best_num = 0
        best_score = None

        for i in range(num_total_spatial_filters):

            num_filters_to_try = i + 1
            print("trying with first", num_filters_to_try, "spatial filters")
            trial_X = self.extractFeatures(self.epochs, self.all_spatial_filters[:num_filters_to_try])
            lda = LinearDiscriminantAnalysis()
            lda = lda.fit(trial_X, self.y)
            cross_validation_folds = 10
            xval = cross_val_score(lda, trial_X, self.y, cv=cross_validation_folds)
            # print(xval)
            this_mean = xval.mean()
            print("mean", this_mean)
            if this_mean > best_mean:
                best_mean = this_mean
                best_num = num_filters_to_try
                best_score = xval

        print("-----------------------------")
        print("best mean was", best_mean, "with", best_num, "filters used")
        print(best_score)

        print(colors.ENDC)
Example No. 5
def performLDA(data_to_fit, y, numComponent=None):
    data_to_fit_np_t = np.array(data_to_fit).T
    if numComponent is None:
        numComponent = len(data_to_fit_np_t)
    lda_model = LinearDiscriminantAnalysis(n_components=numComponent)
    lda_results = lda_model.fit_transform(data_to_fit_np_t, y)
    return lda_model, lda_results
Example No. 6
    def test(self):
        iris = datasets.load_iris()
        
        X = iris.data
        y = iris.target
        target_names = iris.target_names
        
        pca = PCA(n_components=3)
        X_r = pca.fit(X).transform(X)
        
        lda = LinearDiscriminantAnalysis(n_components=2)  # at most n_classes - 1 = 2 components for iris
        X_r2 = lda.fit(X, y).transform(X)
        
        # Percentage of variance explained for each component
        print('explained variance ratio (first three components): %s'
              % str(pca.explained_variance_ratio_))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
            ax.scatter(X_r[y == i, 0], X_r[y == i, 1], zs=X[y == i, 2], c=c, label=target_name)
        plt.legend()
        plt.title('PCA of IRIS dataset')
            
        fig2 = plt.figure()
        ax = fig2.add_subplot(111, projection='3d')
        for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
            ax.scatter(X_r2[y == i, 0], X_r2[y == i, 1], zs=X[y == i, 2], c=c, label=target_name)
        plt.legend()
        plt.title('LDA of IRIS dataset')
            
        plt.show()
Example No. 7
def visualize_lda2D(X,y):
	"""
	Visualize the separation between classes using the two most discriminant features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
	labels = ['Paid', 'Default']
	# NOTE: with only two classes LDA has a single discriminant axis; recent
	# scikit-learn releases return one column from transform(), so the second
	# component plotted below is only available on older versions.
	lda = LDA(n_components=2, solver='eigen')
	# lda = LDA(n_components=2)
	discriminative_attributes = lda.fit(X, y).transform(X)

	palette = sea.color_palette()
	# plt.plot(discriminative_attributes[:,0][y==0],'sg',label="Paid", alpha=0.5)
	# plt.plot(discriminative_attributes[:,0][y==1],'^r',label="Default", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==0],discriminative_attributes[:,1][y==0],marker='s',color='green',label="Paid", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==1],discriminative_attributes[:,1][y==1],marker='^',color='red',label="Default", alpha=0.5)
	plt.xlabel('First Linear Discriminant')
	plt.ylabel('Second Linear Discriminant')

	leg = plt.legend(loc='upper right', fancybox=True)
	leg.get_frame().set_alpha(0.5)
	plt.title("Linear Discriminant Analysis")
	plt.tight_layout()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/lda.png'.format(output_dir))
Example No. 8
def plot_lda(features, labels):
    """
    Input
        features: features to get LDA and plot
        labels: labels of features
    Description
        plots the LDA of features
    """
    lda = LinearDiscriminantAnalysis(n_components=2)
    new_features = lda.fit(features, labels).transform(features)

    colors = list("rgbykrgbyk")
    markers = list("xxxxxooooo")

    plt.figure(len(genres)) # for all together
    for i, genre in enumerate(genres):
        plt.figure(i) # for one particular genre
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
        plt.title(genre)

        plt.figure(len(genres)) # for all together
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
    plt.legend()
    plt.title('LDA')
    plt.show()
Example No. 9
def main():
    """Read Train/test log."""
    df = pd.read_csv("train.csv")

    # train/test split using stratified sampling
    labels = df['label']
    df = df.drop(['label'], axis=1)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
    for train_index, test_index in sss.split(df.values, labels):
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # classification algorithm
    classification(x_train, y_train, x_test, y_test)

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    print(test_predictions)

    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.tail()
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)
    submission.reset_index()
    submission.tail()

    # Export Submission
    submission.to_csv('submission.csv', index=False)
    submission.tail()
Example No. 10
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df

    def find_best_params(self):
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        lda = LinearDiscriminantAnalysis()
        clf = GridSearchCV(lda, parameters)  # sklearn.model_selection.GridSearchCV
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print(clf.best_params_)
Example No. 11
def test_lda_orthogonality():
    # arrange four classes with their means in a kite-shaped pattern
    # the longer distance should be transformed to the first component, and
    # the shorter distance to the second component.
    means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])

    # We construct perfectly symmetric distributions, so the LDA can estimate
    # precise means.
    scatter = np.array([[0.1, 0, 0], [-0.1, 0, 0], [0, 0.1, 0], [0, -0.1, 0],
                        [0, 0, 0.1], [0, 0, -0.1]])

    X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
    y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])

    # Fit LDA and transform the means
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
    means_transformed = clf.transform(means)

    d1 = means_transformed[3] - means_transformed[0]
    d2 = means_transformed[2] - means_transformed[1]
    d1 /= np.sqrt(np.sum(d1 ** 2))
    d2 /= np.sqrt(np.sum(d2 ** 2))

    # the transformed within-class covariance should be the identity matrix
    assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))

    # the means of classes 0 and 3 should lie on the first component
    assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)

    # the means of classes 1 and 2 should lie on the second component
    assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)
Example No. 12
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
Example No. 13
class LinearDiscriminantAnalysiscls(object):
    """docstring for ClassName"""
    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
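
Typical use of the wrapper, with illustrative array names:

model = LinearDiscriminantAnalysiscls()
model.train_model(train_x, train_y)
preds = model.predict(test_x)        # also kept in model.prediction
print(model.accuracy_score(test_y))  # mean accuracy on the held-out set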
Example No. 14
def plot_lda_only(filename, title, filename_fig):

    df = pd.read_csv(path+filename, names=['x1','x2','y'], header=None)
    fig = plt.figure()
    fig.suptitle(title, fontsize=20)
    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls)-1]].values
    Y = df[columns_ls[len(columns_ls)-1]].values

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0]/w[1]

    xx = np.linspace(-12, 34)
    yy = a*xx-clf_lda.intercept_[0]/w[1]
    plt.plot(xx,yy, color="blue", label ="LDA decision boundary")

    print "Weights W0 %.2f and W1%.2f"%(w[0], w[1])
    plt.text(0, 0, "Y=+1", fontsize=12)
    plt.text(10, -20, "Y=-1", fontsize=12)
    # plt.plot(xx, yy_down, 'k--')
    # plt.plot(xx, yy_up, 'k--')
    # plt.plot(xx,yy,color="black", label ="svm decision boundary")


    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    # fig.savefig(filename_fig)
    # model = LogisticRegression()
    # model.fit(X, Y)
    # w = model.coef_[0]
    # a = -w[0]/w[1]
    #
    # xx = np.linspace(-12, 34)
    # yy = a*xx-model.intercept_[0]/w[1]
    #
    # plt.plot(xx,yy, label ="logistic decision boundary")
    #
    # clf_lda = LinearDiscriminantAnalysis()
    # clf_lda.fit(X, Y)
    # w = clf_lda.coef_[0]
    # a = -w[0]/w[1]
    #
    # xx = np.linspace(-12, 34)
    # yy = a*xx-clf_lda.intercept_[0]/w[1]
    # plt.plot(xx,yy, color="blue", label ="LDA decision boundary")

    # plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
    #         s=80, color='b')
    plt.scatter(X[:, 0], X[:, 1], c=Y)

    plt.axis('tight')
    plt.legend()

    plt.show()
Example No. 15
def feature_distribute_4_projection(channel_length=4, projection='pca'):
    ''' 2-D projection of the four-feature combination for six actions, over all 34 channels '''
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    markers = ['o', '+', 'v', '^', '*', 'x']
    sample_len = 100
    subjects = ['subject_'+str(i+1) for i in range(2)]          # subjects
    # subjects = ['subject_1']
    for subject in subjects:
        title_pre = subject + '_feature_class_'

        channel_num = 34                                        # channels
        # channel_num = 18
        # channel_num = 1
        for channel in range(channel_num):

            feature_list = ['MAV', 'ZC', 'SSC', 'WL']
            # feature_list = ['MAV']

            actions = [i + 1 for i in range(6)]                 # actions
            # actions = [1, 2]
            fig = plt.figure(figsize=(8, 6))
            ax = fig.add_subplot()
            trains = np.array([])
            targets = np.array([], dtype=int)
            for action in actions:
                filename = title_pre + str(action)
                feature = np.load(
                    root_path + '/train1_250_100/' + filename + '.npy')
                train = feature[:sample_len, channel * channel_length : channel * channel_length+4]
                target = np.ones(train.shape[0], dtype=int) * action
                # print(train.shape, target.shape, target[:5])
                trains = np.concatenate((trains, train), axis=None)
                targets = np.concatenate(
                    (targets, target), axis=None)

                # sys.exit(0)
            trains = trains.reshape((-1, 4))
            
            # print trains.shape, targets.shape
            if projection == 'pca':
                pca = PCA(n_components=2)
                X_r = pca.fit(trains).transform(trains)
            elif projection == 'lda':
                lda = LinearDiscriminantAnalysis(n_components=2)
                X_r = lda.fit(trains, targets).transform(trains)

            for action in actions:
                plt.scatter(X_r[targets == action, 0], X_r[targets == action, 1],
                    c=colors[action], marker=markers[action - 1], alpha=0.5, label=action)
            plt.legend()
            plt.title(subject + '-channel_' + str(channel) + '-' + projection + '-TD4')
            
            # plt.show()
            plt.savefig(
                'result/figure/distribute4_proj/' + subject + '-channel_'
                + str(channel) + '-' + projection + '-TD4',
                dpi=120)
            plt.close()
Example No. 16
def doLDA(x,digits,s):
    myLDA = LDA()
    myLDA.fit(x.PCA[:,:s],digits.train_Labels)
    newtest = digits.test_Images - x.centers
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1,labels.shape[0]),digits.test_Labels)
    return errors
Example No. 17
def Train(enhancedGeneSet, classLabels):
    enhancedGeneSet = np.array(enhancedGeneSet)
    classLabels = np.array(classLabels)
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(enhancedGeneSet, classLabels)
    #del enhancedGeneSet
    #del classLabels
    return classifier
Example No. 18
def lda(X, y, n):
	'''
	Return the data projected onto the top n LDA components, together with y.
	'''
	selector = LinearDiscriminantAnalysis(n_components=n)
	selector.fit(X, y)
	return selector.transform(X), y
Example No. 19
    def train_model(self):
        ### Train spectrum data
        # form training data and labels
        X = np.empty((0, self.freq_cutoff), int)
        y = np.empty((0, 1), int)

        data_dir = 'clap_data/claps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [1] * data.shape[0])

        data_dir = 'clap_data/noclaps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [0] * data.shape[0])

        # pca = PCA(n_components=200)
        # X_pca = pca.fit_transform(X)

        # fit the model
        # clf = LogisticRegression(penalty='l1')
        clf = LinearDiscriminantAnalysis()
        clf.fit(X, y)
        preds = clf.predict(X)
        # X_new = clf.transform(X)

        # clf2 = LinearDiscriminantAnalysis()
        # clf2.fit(X_new, y)
        # preds2 = clf2.predict(X_new)

        # print X.shape, X_pca.shape
        print(preds)
        print(np.sum(preds), preds.size)
        # print preds2, np.sum(preds2)

        # save model
        pickle.dump(clf, open(clap_model_dir + clap_classifier_fname, 'wb'))
        self.clap_clf = clf

        ### Train decay data
        X = np.empty((0, self.decay_samples // 10), int)

        data_dir = 'clap_data/claps/decay/'
        for fname in os.listdir(data_dir):
            if fname.endswith('npy'):
                data = np.load("%s%s"% (data_dir, fname))
                print(data.shape, X.shape)
                X = np.append(X, data, axis=0)

        print(X.shape)
        X_avg = np.mean(X, axis=0)
        plt.plot(X_avg)
        plt.show()

        # Average decay data
        np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
Example No. 20
File: util.py  Project: pvigier/sa
def plot_lda(X, y):
    colors = ['b', 'r']
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_r = lda.fit(X, y).transform(X)
    plt.figure()
    for i, c in enumerate(colors):
        plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=str(i))
    plt.legend()
    plt.title('LDA')
Example No. 21
    def _get_lda(self, data, variables):
        domain = Domain(attributes=variables, class_vars=data.domain.class_vars)
        data = data.transform(domain)
        lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
        lda.fit(data.X, data.Y)
        scalings = lda.scalings_[:, :2].T
        if scalings.shape == (1, 1):
            scalings = np.array([[1.], [0.]])
        return scalings
Example No. 22
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
    """
    Tests that if the number of samples equals the number
    of classes, a ValueError is raised.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5]])
    y = np.array(["a", "b"])
    clf = LinearDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="The number of samples must be more"):
        clf.fit(X, y)
Example No. 23
def feature_scaling(feature_matrix,target,reductor=None,scaler=None):
    lda = LDA(n_components=2)    
    minmax = MinMaxScaler(feature_range=(-1,1))
    if not reductor:
        reductor = lda.fit(feature_matrix,target)
    feature_matrix_lda = reductor.transform(feature_matrix)
    if not scaler:
        scaler = minmax.fit(feature_matrix_lda)
    feature_matrix_scaled = scaler.transform(feature_matrix_lda)
    return feature_matrix_scaled,reductor,scaler
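
The reductor/scaler pass-through implements the usual fit-on-train, reuse-on-test pattern; a short usage sketch with illustrative X_train/X_test/y_train/y_test names:

# First call fits the LDA reductor and the MinMax scaler on training data.
train_scaled, reductor, scaler = feature_scaling(X_train, y_train)
# Second call reuses both, so the test set is projected and scaled consistently.
test_scaled, _, _ = feature_scaling(X_test, y_test, reductor=reductor, scaler=scaler)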
Example No. 24
	def self_tune(self, X, y, verbose=False):
		# fix random seed for reproducibility
		seed = 5
		np.random.seed(seed)

		# define k-fold cross validation test harness
		kfold = StratifiedKFold(n_splits=self.tuning_csp_num_folds, shuffle=True, random_state=seed)

		# init scores
		cvscores = {}
		for i in range(1, self.num_spatial_filters):
			cvscores[i + 1] = 0

		num_folds = 0
		for i, (train, test) in enumerate(kfold.split(X, y)):
			num_folds = i + 1
			# calculate CSP spatial filters
			csp = CSP(n_components=self.num_spatial_filters)
			csp.fit(X[train], y[train])

			# try all filters, from the given num down to 2
			# (1 is too often found to be overfitting)
			for j in range(2, self.num_spatial_filters):
				num_filters_to_try = j

				# calculate spatial filters
				csp.n_components = num_filters_to_try
				# apply CSP filters to train data
				tuning_train_LDA_features = np.nan_to_num(csp.transform(X[train]))
				check_X_y(tuning_train_LDA_features, y[train])

				# apply CSP filters to test data
				tuning_test_LDA_features = np.nan_to_num(csp.transform(X[test]))
				check_X_y(tuning_test_LDA_features, y[test])

				# train LDA
				lda = LinearDiscriminantAnalysis()
				prediction_score = lda.fit(tuning_train_LDA_features, y[train]).score(tuning_test_LDA_features, y[test])

				cvscores[num_filters_to_try] += prediction_score

				if verbose:
					print("prediction score", prediction_score, "with", num_filters_to_try, "spatial filters")

		best_num = max(cvscores, key=cvscores.get)
		best_score = cvscores[best_num] / num_folds  # average accuracy across folds
		if verbose:
			print("best num filters:", best_num, "(average accuracy", best_score, ")")
			print("average scores per filter num:")
			for k in cvscores:
				print(k, ":", cvscores[k] / num_folds)

		return [best_num, best_score]
Example No. 25
def assess_embedding(to_vec):
	"""
	Returns LDA classification score and projected data
	"""
	(x_data, y_data) = get_x_y_matrices(to_vec)

	lda = LDA(n_components=2)
	x_prime = lda.fit_transform(x_data, y_data)
	score = lda.score(x_data, y_data)

	return (x_prime.reshape(26, ), y_data, score)
Example No. 26
def test_lda_explained_variance_ratio():
    # Test that the normalized eigenvalues sum to 1
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
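
explained_variance_ratio_ is exposed by the 'eigen' and 'svd' solvers and, when all n_classes - 1 components are kept, sums to roughly 1; a quick check on iris (sketch):

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_iris, y_iris = load_iris(return_X_y=True)
lda_iris = LinearDiscriminantAnalysis(solver='eigen').fit(X_iris, y_iris)
print(lda_iris.explained_variance_ratio_)        # two ratios for three classes
print(lda_iris.explained_variance_ratio_.sum())  # ~1.0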
Example No. 27
def LD(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelLD = LinearDiscriminantAnalysis()
    modelLD.fit(train_desc, np.array(train_labels))
    joblib.dump((modelLD, img_classes, stdSlr), pth + "/ld-bof.pkl", compress=3)
    test(pth, "ld-")
Example No. 28
    def testEvaluateLDA(self, trCList, teCList):
        # LDA object
        clf = LinearDiscriminantAnalysis()
        # fit lda model using training chromosomes
        clf.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))

        predicted = clf.predict(teCList)

        self.confusionMatrix(testGroupings, predicted, 'lda_test')

        # return precision ([0]), recall ([1]) or f1 score ([2]); replace with
        # clf.score(numpy.asarray(teCList), testGroupings) for accuracy
        return precision_recall_fscore_support(testGroupings, predicted, average='weighted')[2]  # fitness for test set
Example No. 29
def PCA_plot(D, TFS, EXPS, A , toEXPS, toTFS):
	A 	= A.T
	pca = sd.PCA(n_components=2)
	X_r = pca.fit(A).transform(A)
	F 	= plt.figure(figsize=(15,10))
	ax 	= F.add_subplot(111)
	y 	= [get_color(toEXPS[i], EXPS, BINARY=True) for i in range(X_r.shape[0]) ]
	lda = LinearDiscriminantAnalysis(n_components=2)
	X_r2 = lda.fit(D, y).transform(D)

	ax.scatter(X_r[:,0], X_r[:,1], c=[get_color(toEXPS[i], EXPS) for i in range(X_r.shape[0])], s=150 )
	plt.show()
Example No. 30
def transformLDA(X,y,xTest):
    
    originalSize = np.size(X,1)
    print("Learning LDA \nProjecting {} features to 1 component".format(originalSize))
    priors = [0.5,0.5]

    clf = LinearDiscriminantAnalysis(solver='svd', n_components=1, priors=priors)
    print(X.shape)
    X = clf.fit_transform(X,y)
    print("True size of X : ", X.shape)

    if len(xTest) > 0:
        xTest = clf.transform(xTest)
    return X,xTest
Example No. 31
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Loading data from Iris
data = load_iris()
# Setting data
x = data.data
# Setting target
y = data.target
# Giving test and train samples
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=12)
# Calling the Linear Discriminant Analysis model
lindiscamodel = LinearDiscriminantAnalysis()
# Calling the Logistic Regression model
logregmodel = LogisticRegression()
# Fitting the train data
lindiscamodel.fit(train_x, train_y)
# Predicting the Test data
liprediction = lindiscamodel.predict(test_x)
# Accuracy for linear discriminant analysis
print("Accuracy for linear discriminant analysis is ",
      accuracy_score(liprediction, test_y))
# Fitting the logistic regression model
logregmodel.fit(train_x, train_y)
# Predicting with the logistic regression model
loprediction = logregmodel.predict(test_x)
# Logistic regression accuracy
Example No. 32
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# print(X_train)
# print(Y_train)

# print("n_components size: %d" % X_train.size)
# print("n_components len: %d" % len(X_train))
# print("n_components index 0: %d" % len(X_train[0]))

# pca = PCA(n_components = (len(X_train)-2))
# pca.fit(X_train)
# X_train = pca.transform(X_train)

# Run LDA

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
# lda.fit(zip(*(shuffle(X_train, Y_train))))


# project training data onto the found axes so we can later plot how it discriminates
X_trans = lda.transform(X_train)
# Transform validation set onto these new axes
X_trans2 = lda.transform(X_validation)

#Make predictions as to what each data row is in the validation set
predictions = lda.predict(X_validation)

#look at accuracy of predictions
for i in range(Y_validation.size):
    if predictions[i] == Y_validation[i] :
Example No. 33
time_test_svm = (time.perf_counter() - start)

# test knn classifier
parameters_knn = {'n_neighbors': [2, 10, 5, 20, 50, 100]}
start = time.perf_counter()
best_knn = get_best_model(KNeighborsClassifier(), parameters_knn, X_train_top,
                          y_train)
time_train_knn = (time.perf_counter() - start)
start = time.perf_counter()
score_test_knn = best_knn.score(X_test_top, y_test)
time_test_knn = (time.perf_counter() - start)

# test LDA classifier
parameters_lda = {'solver': ('svd', 'lsqr', 'eigen')}
start = time.perf_counter()
best_lda = get_best_model(LinearDiscriminantAnalysis(), parameters_lda,
                          X_train_top, y_train)
time_train_lda = (time.perf_counter() - start)
start = time.perf_counter()
score_test_lda = best_lda.score(X_test_top, y_test)
time_test_lda = (time.perf_counter() - start)

# test decision tree classifier
parameters_dtree = {'criterion': ('gini', 'entropy')}
start = time.perf_counter()
best_dtree = get_best_model(DecisionTreeClassifier(), parameters_dtree,
                            X_train_top, y_train)
time_train_dtree = (time.perf_counter() - start)
start = time.perf_counter()
score_test_dtree = best_dtree.score(X_test_top, y_test)
time_test_dtree = (time.perf_counter() - start)
Example No. 34
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, 
                                test_size=validation_size, random_state=seed)

print("X_train :",len(X_train), "\nX_validation :",len(X_validation))
# test harness
#   we split the data into 10 sub-parts (10 folds)
#   the model trains on 9 of them and tests on the remaining fold
#   it repeats this over every existing fold combination

# build the models
#   we do this for several models (6) in order to choose the most performant/relevant one
#   with the same data handling (folds) for each model, to make them comparable

models =[]
models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
models.append(('+ProcheVoisins', KNeighborsClassifier()))
models.append(('ArbreDecision', DecisionTreeClassifier()))
models.append(('NaiveBayes', GaussianNB()))
models.append(('VecteurDeSupport', SVC(gamma='auto')))

# on évalue chaque modele
results = []
names = []
 
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name,cv_results.mean(), cv_results.std())
Example No. 35
def lda(data, target, n_dim):
    clusters = np.unique(target)
    u = data.mean(axis=0)  # overall mean
    Sw, SB = 0, 0          # within-class and between-class scatter
    for c in clusters:
        datai = data[target == c]
        Ni = datai.shape[0]
        ui = datai.mean(axis=0)  # class mean
        Sw += np.mat(datai - ui).T * np.mat(datai - ui)
        SBi = Ni * np.mat(ui - u).T * np.mat(ui - u)
        SB += SBi
    S = np.linalg.inv(Sw) * SB
    eigVals, eigVects = np.linalg.eig(S)  # eigenvalues and eigenvectors
    eigValInd = np.argsort(eigVals)
    eigValInd = eigValInd[:(-n_dim - 1):-1]
    w = eigVects[:, eigValInd]
    data_ndim = np.dot(data, w)

    return data_ndim


if __name__ == '__main__':
    iris = load_iris()
    X = iris.data
    Y = iris.target
    data_1 = lda(X, Y, 2)

    data_2 = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, Y)

    plt.figure(figsize=(8, 4))
    plt.subplot(121)
    plt.title("my_LDA")
    plt.scatter(data_1[:, 0], data_1[:, 1], c=Y)

    plt.subplot(122)
    plt.title("sklearn_LDA")
    plt.scatter(data_2[:, 0], data_2[:, 1], c=Y)
    plt.savefig("LDA.png")
    plt.show()
Example No. 36
# get the directory that contains the MNIST-format images
path = os.path.join("images-mnist")

# pass the directory to the MNIST helper so we can work with the images
data = MNIST(path)

print("Loading dataset")
trainImages, trainLabels = data.load_training(
)  # carregando imagens de treinamento
testImages, testLabels = data.load_testing()  #carregando imagens de teste
print("Dataset is load")

tempoInicial = time.time()

lda = LinearDiscriminantAnalysis()
# define the LDA classifier

print("Training LDA")
lda.fit(trainImages, trainLabels)
ldaResult = lda.predict(testImages)
#print_report(predictions, testLabels)
printResul(ldaResult)
tempoAux = time.time()
tempo(int(tempoAux - tempoInicial))
print(int(tempoAux - tempoInicial))

k = 1
resultKnn = list()

knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
Example No. 37
    number_principal_components = pca.n_components_
    MSE_PCA_train = 1 - pca.explained_variance_ratio_.sum()

    pca_output = "Principal Component Analysis (PCA) \n\n" \
                 "Explained Variance: " + str(round(pca.explained_variance_ratio_.sum(), 3)) + "" \
                                                                                               " \nNumber of Principal Components: " + str(
        number_principal_components) + "\n" + "" \
                                              "MSE (training data): " + str(round(MSE_PCA_train, 3)) + "\n" + line_str
    print(pca_output)

    # calculate the PCs for the test data as well
    principal_components_test = pca.transform(df_test_input_)

    # _______________________________________________________________________________________________________________________
    # TASK 6: Fisher Discriminant
    fisher = LinearDiscriminantAnalysis()
    fisher.fit(df_train_input_, df_train_output['class'].values)
    fisher_components_train = fisher.transform(df_train_input_)
    fisher_components_test = fisher.transform(df_test_input_)

    # Adding Fisher Discriminant Analysis on top of PCA
    fisher_pca = LinearDiscriminantAnalysis()
    fisher_pca.fit(principal_components_train, df_train_output['class'].values)
    fisher_pca_components_train = fisher_pca.transform(
        principal_components_train)
    fisher_pca_components_test = fisher_pca.transform(
        principal_components_test)

    print("\n" + line_str)
    # _______________________________________________________________________________________________________________________
    # TASK 4: MLP
Example No. 38
print(stats.f_oneway(
        compra_sim['durabilid'],
        compra_nao['durabilid'])
     )

print(stats.f_oneway(
        compra_sim['estilo'],
        compra_nao['estilo'])
     )

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X = compra_xls[['durabilid', 'desempenh', 'estilo']]
y = compra_xls['compra'] == 'sim'
print(y)
clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
print(clf.decision_function(X))
print(clf.score(X, y))

y_ = clf.predict(X)

print(clf)
print(clf.score(X, y))
print(clf.coef_, clf.intercept_)


comprapredic = pd.read_csv("comprapredic.csv", header=0, sep=";")
X2 = comprapredic[['durabilid', 'desempenh', 'estilo']]
clf.predict(X2)
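
decision_function above returns the signed distance w·x + b to the separating hyperplane, so its sign reproduces predict for this binary problem; as a quick sketch:

import numpy as np

scores = clf.decision_function(X)
# positive score -> second class (compra == 'sim')
print(np.array_equal(scores > 0, clf.predict(X)))  # True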
Example No. 39
                preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

# %%
# Classification with linear discriminant analysis

# Define a monte-carlo cross-validation generator (reduce variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Assemble a classifier
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Use scikit-learn Pipeline with cross_val_score function
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

a = csp.fit(epochs_data_train, labels)
a = csp.fit_transform(epochs_data_train, labels)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" %
      (np.mean(scores), class_balance))
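
Putting CSP inside the Pipeline matters here: cross_val_score refits the whole pipeline per fold, so the spatial filters are learned on each training split only. Fitting CSP once on all of epochs_data_train (as the two lines after the scoring do) is fine for visualization, but would leak information if its output were scored directly. A sketch of the per-fold equivalent, using the names defined above:

from sklearn.base import clone

fold_scores = []
for train_idx, test_idx in cv.split(epochs_data_train):
    fold_clf = clone(clf)  # a fresh CSP + LDA pair for every fold
    fold_clf.fit(epochs_data_train[train_idx], labels[train_idx])
    fold_scores.append(fold_clf.score(epochs_data_train[test_idx], labels[test_idx]))
print(sum(fold_scores) / len(fold_scores))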
Example No. 40
nnscores = Neural(x1_n, y1)

MAXITER = 20
nn_trnscores = np.tile(nnscores[0], MAXITER)
nn_tstscores = np.tile(nnscores[1], MAXITER)
nn_time = np.tile(nnscores[2], MAXITER)

data1 = np.array(x1_n)
#---------------------->APPLY CLUSTERING
#independent------------>
newkmdata, newemdata, newldadata = [], [], []
km = KMeans(n_clusters=2, random_state=0).fit(data1)
kmdata = km.labels_
em = GM(n_components=2, random_state=0).fit(data1)
emdata = em.predict(data1)
lda = LDA(n_components=2).fit(data1, y1)
data1_lda = lda.transform(data1)

x1_nn = x1_n.tolist()
for i in range(len(x1_nn)):
    newkm = (x1_nn[i])
    kmdatai = int(kmdata[i])
    newkm.extend([kmdatai])
    newkmdata.append(newkm)

x1_nn = x1_n.tolist()
for i in range(len(x1_nn)):
    newem = (x1_nn[i])
    emdatai = int(emdata[i])
    newem.extend([emdatai])
    newemdata.append(newem)
Example No. 41
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
#from sklearn.cluster import AgglomerativeClustering

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianMixture(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
y = train[0::, 0]

acc_dict = {}

for train_index, test_index in sss.split(X, y):
Example No. 42
# In[4]:

# standardization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# # LDA

# In[5]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# In[6]:

# run the classifier
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# In[7]:

y_pred = classifier.predict(X_test)
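
The same flow (standardize, LDA-reduce, classify) can be written as a single scikit-learn Pipeline, which keeps the fit/transform bookkeeping in one object; a minimal sketch assuming the original, pre-reduction X_train/y_train/X_test arrays:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

clf = Pipeline([
    ('scale', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('logreg', LogisticRegression(random_state=0)),
])
clf.fit(X_train, y_train)     # scaler and LDA are fit on training data only
y_pred = clf.predict(X_test)  # identical transforms are applied to the test set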
Example No. 43
# Modeling step: test different algorithms
random_state = 2
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                       random_state=random_state,
                       learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in classifiers:
    cv_results.append(
        cross_val_score(classifier,
                        X_train,
                        y=Y_train,
                        scoring="accuracy",
                        cv=kfold,
                        n_jobs=4))

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
Example No. 44
File: ml.py  Project: fominok/ml
def train_and_test(filename, class_field):
    attrs, classes = prepare_ds(filename, class_field)

    corr_scatter = []

    for cls, color in zip(range(1, 4), ('red', 'green', 'blue')):
        attr_one = attrs[:, 0][classes == cls]
        attr_two = attrs[:, 1][classes == cls]
        p = pearsonr(attr_one, attr_two)
        corr_scatter.append({
            "x": attr_one.tolist(),
            "y": attr_two.tolist(),
            "p": p[0]
        })
        # plt.scatter(x=attr_one, y=attr_two, marker='o', color=color,
        #             label='cls: {:}, pearsonr={:.2f}'.format(cls, p[0]))

    # plt.title('Pearson correlation')
    # plt.xlabel('Elevation, m')
    # plt.ylabel('Slope, num')
    # plt.legend(loc='upper right')
    # plt.show()

    data_train, data_test, class_train, class_test = train_test_split(
        attrs,
        classes,
        test_size=.3,
        random_state=123,
    )

    lda = LDA(n_components=2)
    lda_transform = lda.fit_transform(data_train, class_train)

    lda_scatter = []
    # plt.figure(figsize=(10, 8))
    for cls, color in zip(range(1, 4), ('red', 'green', 'blue')):
        attr_one = lda_transform[:, 0][class_train == cls]
        attr_two = lda_transform[:, 1][class_train == cls]
        lda_scatter.append({"x": attr_one.tolist(), "y": attr_two.tolist()})
        # plt.scatter(x=attr_one, y=attr_two, marker='o', color=color,
        #             label='cls: {:}'.format(cls))

    # plt.xlabel('vec 1')
    # plt.ylabel('vec 2')
    # plt.legend()
    # plt.show()

    lda_clf = LDA()
    lda_clf.fit(data_train, class_train)

    pred_train_lda = lda_clf.predict(data_train)
    print('Classification accuracy on the training set (LDA): {:.2%}'.
          format(metrics.accuracy_score(class_train, pred_train_lda)))

    pred_test_lda = lda_clf.predict(data_test)
    print('Classification accuracy on the test set (LDA): {:.2%}'.
          format(metrics.accuracy_score(class_test, pred_test_lda)))

    qda_clf = QuadraticDiscriminantAnalysis()
    qda_clf.fit(data_train, class_train)

    pred_train_qda = qda_clf.predict(data_train)
    print('Classification accuracy on the training set (QDA): {:.2%}'.
          format(metrics.accuracy_score(class_train, pred_train_qda)))

    pred_test_qda = qda_clf.predict(data_test)
    print('Classification accuracy on the test set (QDA): {:.2%}'.
          format(metrics.accuracy_score(class_test, pred_test_qda)))
    return corr_scatter, lda_scatter
Example No. 45
def compare_svm(paths,
                orientation1,
                orientation2,
                save_path,
                aliases,
                lda_comp,
                pca_comp,
                title,
                c,
                samplingRate=32000):
    if aliases is None:
        text = paths
    else:
        text = aliases

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    for index, path in enumerate(paths):
        for dirpath, _, file_paths in os.walk(path):
            label_names = []
            s = []
            file_paths.sort(key=lambda x: int(x.split('_')[-1]))
            for fp in file_paths:
                file_path = os.path.join(dirpath, fp)
                data = pd.read_csv(file_path, header=None)
                raw_data = data.iloc[:, :-1].values
                Y = data.iloc[:, -1].values
                if raw_data.shape[1] != 1:
                    if lda_comp is not None:
                        lda = LDA(n_components=lda_comp)
                        X = lda.fit_transform(raw_data, Y)
                    elif pca_comp is not None:
                        pca = PCA(n_components=pca_comp)
                        X = pca.fit_transform(raw_data)
                    else:
                        X = raw_data

                    interval_splits = [
                        int(y) for y in file_path.split('_')[-2:]
                    ]
                    for i in range(0, len(interval_splits), 2):
                        first = interval_splits[i] / float(samplingRate)
                        second = interval_splits[i + 1] / float(samplingRate)
                        if interval_splits[i + 1] == 117574:
                            label_names.append(''.join([
                                str('{0:.3f}'.format(first)), 's', '\n-\n',
                                str('{0:.3f}'.format(second)), 's',
                                '\nStim OFF'
                            ]))
                        elif interval_splits[i] == 0:
                            label_names.append(''.join([
                                str('{0:.3f}'.format(first)), 's', '\n-\n',
                                str('{0:.3f}'.format(second)), 's',
                                '\nTrial Start'
                            ]))
                        elif interval_splits[i] == 32066:
                            label_names.append(''.join([
                                str('{0:.3f}'.format(first)), 's', '\n-\n',
                                str('{0:.3f}'.format(second)), 's', '\nStim ON'
                            ]))
                        elif interval_splits[i + 1] == 133606:
                            label_names.append(''.join([
                                str('{0:.3f}'.format(first)), 's', '\n-\n',
                                str('{0:.3f}'.format(second)), 's',
                                '\nTrial END'
                            ]))
                        else:
                            label_names.append(''.join([
                                str('{0:.3f}'.format(first)), 's', '\n-\n',
                                str('{0:.3f}'.format(second)), 's'
                            ]))

                    scaler = StandardScaler()
                    X_norm = scaler.fit_transform(X)
                    new_x = X_norm[np.logical_or(Y == orientation1,
                                                 Y == orientation2)]
                    new_y = Y[np.logical_or(Y == orientation1,
                                            Y == orientation2)]

                    score = 0
                    time = 0
                    skf = StratifiedKFold(n_splits=2, shuffle=True)
                    for x in range(0, 5000):
                        for i, (train,
                                test) in enumerate(skf.split(new_x, new_y)):
                            xtrain, xval = new_x[train], new_x[test]
                            ytrain, yval = new_y[train], new_y[test]
                            clf = Pipeline([('scaler', StandardScaler()),
                                            ('SVM',
                                             svm.SVC(kernel='linear', C=1))])
                            clf.fit(xtrain, ytrain)
                            time += 1
                            score += clf.score(xval, yval)

                    s.append(score / time)

            if c is None:
                color = np.random.rand(3)
                ax1.plot(label_names,
                         s,
                         marker='o',
                         label=text[index],
                         color=color)
            else:
                ax1.plot(label_names,
                         s,
                         marker='o',
                         label=text[index],
                         color=c[index])

    plt.xlabel("Time")
    plt.ylabel("Accuracy")
    ax1.set_ylim(0.3, 1.0)

    plt.legend()
    fig.set_size_inches(28, 12, forward=True)
    if title is not None:
        ax1.set_title(title)
    if save_path is not None:
        plt.savefig(save_path)
    else:
        plt.show()
Example No. 46
    for train_index, test_index in kfold.split(data_opto_SOM,target): 
        

        ## Opto SOM ##
        x_train, x_test = data_opto_SOM[train_index,:],data_opto_SOM[test_index,:]
        y_train,y_test = target[train_index],target[test_index]      
        mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg',max_iter=max_i)
        mul_lr.fit(x_train, y_train)
        score_opto_SOM_LR[n,f] = mul_lr.score(x_test, y_test)*100
        print(mul_lr.score(x_test,y_test))        

        clf = NearestCentroid(metric='euclidean',shrink_threshold=None)  
        clf.fit(x_train,y_train)
        score_opto_SOM_NN[n,f] = clf.score(x_test,y_test)*100
     
        lda = LinearDiscriminantAnalysis(solver='svd')
        lda.fit(x_train,y_train)
        score_opto_SOM_LDA[n,f]=lda.score(x_test,y_test)*100
        print(lda.score(x_test,y_test))

        svm_algo = svm.SVC(decision_function_shape='ovo',kernel='linear')
        svm_algo.fit(x_train,y_train)
        score_opto_SOM_SVM[n,f]=svm_algo.score(x_test,y_test)*100
 
        ## Opto PV ##
        x_train, x_test = data_opto_PV[train_index,:],data_opto_PV[test_index,:]
        y_train,y_test = target[train_index],target[test_index]      
        mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg',max_iter=max_i)
        mul_lr.fit(x_train, y_train)
        score_opto_PV_LR[n,f] = mul_lr.score(x_test, y_test)*100
        print(mul_lr.score(x_test,y_test))        
Example No. 47
                                          data_size=len(train_targets),
                                          input_data=train_data,
                                          width=sample_width)
            filter_feature = GetNonTargetsAverage(train_inputs, train_targets)
            train_inputs = ApplySpecialFilter(train_inputs,
                                              filter_feature,
                                              reshaped=True)
            print(train_inputs.shape)
            # Modeling
            train_targets = np.array(train_targets)
            lsvc = LinearSVC(C=0.01, penalty="l1",
                             dual=False).fit(train_inputs, train_targets)
            sel_model = SelectFromModel(lsvc, prefit=True)
            train_inputs = sel_model.transform(train_inputs)
            print(train_inputs.shape)
            model = LinearDiscriminantAnalysis()
            model.fit(train_inputs, train_targets)

            # Prediction
            test_events = BasicDataProcess.LoadDataFromFile(
                data_dir + "/Test/testEvents.txt")
            test_data = BasicDataProcess.LoadEEGFromFile(data_dir, False)
            test_inputs = PreprocessData(data_dir,
                                         filter_applied=True,
                                         pca_applied=False,
                                         pca_threshold=20,
                                         reshaped=False,
                                         data_size=len(test_events),
                                         input_data=test_data,
                                         width=sample_width)
            test_inputs = ApplySpecialFilter(test_inputs,
Example No. 48
def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir):
    '''
    run baseline classifier: LDA
    Wrapper script to run an LDA classifier on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # read the Rdata file
    ro.r['load'](CV_RDataPath)

    nfolds = np.array(ro.r['n_folds'], dtype = 'int')
    tokeep = np.array(ro.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(ro.r['col_Index'], dtype = 'int')
    col = col - 1
    test_ind =ro.r['Test_Idx']
    train_ind =ro.r['Train_Idx']

    # read the data
    data=ro.r['readRDS'](DataPath)
    data=pd.DataFrame(dgc_to_csr(data).toarray()).T
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep='\t', usecols = col)
    
#     print(len(data))
#     print(labels)
#     print(len(tokeep))
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # normalize data
    data = np.log1p(data)

    Classifier = LinearDiscriminantAnalysis()

    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
    
    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        start=tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time()-start)

        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)

        truelab.extend(y_test.values)
        pred.extend(predicted)
        print(len(pred))

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
#     print(len(tr_time))

    OutputDir = Path(OutputDir)
    os.makedirs(Path(OutputDir),exist_ok=True)
    truelab.to_csv(str(OutputDir / Path("LDA_true.csv")),
                   index = False)
    pred.to_csv(str(OutputDir / Path("LDA_pred.csv")),
                index = False)
    tr_time.to_csv(str(OutputDir / Path("LDA_training_time.csv")),
                   index = False)
    ts_time.to_csv(str(OutputDir / Path("LDA_test_time.csv")),
                   index = False)
Example No. 49
scatter_matrix(dataset)
# pyplot.show()

#split the dataset
array=dataset.values
X=array[:,0:4]
Y=array[:,4]
validation_size=0.2
seed=7
X_train,X_validation,Y_train,Y_validation=train_test_split(X,Y,test_size=validation_size,random_state=seed)

#algorithm spot-check
# logistic regression, linear discriminant analysis, k-nearest neighbors, CART, naive Bayes, SVM
models={}
models['LR']=LogisticRegression()
models['LDA']=LinearDiscriminantAnalysis()
models['KNN']=KNeighborsClassifier()
models['CART']=DecisionTreeClassifier()
models['NB']=GaussianNB()
models['SVM']=SVC()

#evaluate the algorithms
result=[]
for key in models:
    kfold=KFold(n_splits=10,shuffle=True,random_state=seed)
    cv_result=cross_val_score(models[key],X_train,Y_train,cv=kfold,scoring='accuracy')
    result.append(cv_result)
    print('%s:%f(%f)'%(key,cv_result.mean(),cv_result.std()))

#compare the algorithms with box plots
fiq=pyplot.figure()
Example No. 50
    mrnaseq = get_expression(cancer_types[catype])

    #%% Calculate mitotic index
    genes = ["MKI67"] # ki67
    mitindex = pd.DataFrame(columns=["mitindex"]+genes,index=data.columns)
    samples = [s[-35:-19] if (s[-1] in ["A","B"]) else s[-34:-18] for s in mitindex.index]
    mitindex.loc[:,genes] = [list(mrnaseq.loc[genes,s].values) if (s in mrnaseq.columns) else [np.nan]*len(genes) for s in samples]
    mitindex.loc[:,"mitindex"] = mitindex.loc[:,genes].mean(1)
    
    #%% Dimension reduction
    ## Discriminant analysis
    tumcode = [re.findall("(?<=TCGA-[A-Z0-9]{2}-[A-Z0-9]{4}-)[0-9]+",s)[0] for s in data.columns]
    targets = ["CA" if s=="01" else "HE" for s in tumcode]
        
    ## Discriminant analysis
    disc = LinearDiscriminantAnalysis(n_components=2, store_covariance=True)
    principalComponents = disc.fit_transform(data.transpose(),targets)
    expl_var = disc.explained_variance_ratio_
    normcoef = disc.coef_*disc.covariance_.diagonal()
    coef = pd.DataFrame(normcoef.transpose(), index = data.index)
    # Record normalized coefficients
    weights.loc[coef.index,catype] = coef.values.reshape((coef.shape[0],))
    
    # Add an uninformative second component if not generated
    if principalComponents.shape[1]==1:
        principalComponents = np.append(principalComponents,np.random.uniform(size=(principalComponents.shape[0],1)),axis=1)
        expl_var = np.append(expl_var, 0)

    ## Plot
    principalDf = pd.DataFrame(principalComponents,columns=["PCA1","PCA2"],index=data.columns)
    principalDf["source"] = targets
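The padding above works around a general property of LDA: the projection has at most n_classes - 1 dimensions. A minimal sketch with synthetic data (hypothetical shapes) illustrates this:

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 10))             # 100 samples, 10 features
y = np.repeat([0, 1], 50)                  # two classes

lda = LinearDiscriminantAnalysis(n_components=1)
X_proj = lda.fit_transform(X, y)
print(X_proj.shape)                        # (100, 1): at most n_classes - 1 axes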
Exemplo n.º 51
0
def LDA_process(X):
    """Project X (features in all but the last column, labels in the last)
    onto two LDA components; two components require at least three classes."""
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(X[:, :-1], X[:, -1])
    return lda.transform(X[:, :-1])
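A quick usage sketch (synthetic data with hypothetical shapes; three classes are needed for two components):

import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(90, 5))
labels = np.repeat([0, 1, 2], 30)          # three classes -> up to 2 components
X = np.column_stack([features, labels])    # last column holds the label
Y = LDA_process(X)
print(Y.shape)                             # (90, 2)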
Exemplo n.º 52
0
# naive_bayes - Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB
start_time = time.time()
GaussNB = GaussianNB().fit(X_train, y_train)
GaussNB_Label = GaussNB.predict(data_all).reshape(width,
                                                  height).astype(int).transpose(1, 0)
GaussNB_predict_prob = GaussNB.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(GaussNB_predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
print('(GaussNB) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
      % (GaussNB.score(X_train,y_train),GaussNB.score(X_test,y_test),\
         seg_accuracy, (time.time()-start_time)))
# draw classification map
draw(GT_Label, GaussNB_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# discriminant_analysis - linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
start_time = time.time()
LDA = LinearDiscriminantAnalysis().fit(X_train, y_train)
LDA_Label = LDA.predict(data_all).reshape(width,
                                          height).astype(int).transpose(1, 0)
LDA_predict_prob = LDA.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(LDA_predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
print('(LDA) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
      % (LDA.score(X_train,y_train),LDA.score(X_test,y_test),\
         seg_accuracy, (time.time()-start_time)))
# draw classification map
draw(GT_Label, LDA_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Logistic Regression
from sklearn.linear_model import LogisticRegression
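Each block above follows the same recipe: fit on training pixels, predict every pixel, reshape the flat predictions into an image-shaped label map, then smooth via graph-cut post-processing. A stripped-down sketch of the reshape step (synthetic stand-ins for data_all, width, and height):

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(0)
width, height = 4, 3
data_all = rng.random((width * height, 5))        # one feature row per pixel
X_train = data_all[:6]
y_train = np.array([0, 1, 0, 1, 0, 1])

clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
label_map = clf.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
print(label_map.shape)                            # (height, width) label image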
Exemplo n.º 53
0
# (head of this call truncated; X25 is a hypothetical name for the
#  25-component PCA projection implied by acc25 below)
x_train, x_test, y_train, y_test = train_test_split(X25,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=0)

model = svm.SVC(kernel='linear', C=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc25 = accuracy_score(y_test, y_pred)

pca = PCA(n_components=15)
X1 = pca.fit_transform(data1)
Y = np.repeat(range(1, 16), 11)
x_train, x_test, y_train, y_test = train_test_split(X1,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=0)

model = svm.SVC(kernel='linear', C=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc15 = accuracy_score(y_test, y_pred)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
model = LDA()
Y = np.repeat(range(1, 16), 11)
# Note: fitting the LDA projection on all of data1 before splitting leaks
# test information into the features; fit_transform on x_train only is safer.
X2 = model.fit_transform(data1, Y)
x_train, x_test, y_train, y_test = train_test_split(X2, Y, test_size=0.33)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc_lda = accuracy_score(y_test, y_pred)
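A leakage-free variant fits the LDA projection on the training split only (a sketch under the same 15-class, 11-samples-per-class assumption, reusing data1 and Y from above):

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(
    data1, Y, test_size=0.33, random_state=0)
lda = LinearDiscriminantAnalysis()
x_train_lda = lda.fit_transform(x_train, y_train)  # projection learned on train only
x_test_lda = lda.transform(x_test)                 # test data merely projected
clf = LinearDiscriminantAnalysis().fit(x_train_lda, y_train)
print(accuracy_score(y_test, clf.predict(x_test_lda)))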
Exemplo n.º 54
0
for value in [0, 1]:
    # forecast
    yhat = naive_prediction(testX, value)
    # evaluate
    score = accuracy_score(testy, yhat)
    # summarize
    print('Naive=%d score=%.3f' % (value, score))

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    # shuffle=True is required for random_state to take effect in KFold
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 trainX,
                                                 trainy,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
Exemplo n.º 55
0
Y = Y.astype(str)
#my_imputer = SimpleImputer()
#X=my_imputer.fit_transform(X)

validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='lbfgs',max_iter=10000, multi_class='auto'))) 
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    # shuffle=True is required for random_state to take effect in KFold
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
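The X_validation split set aside above is typically used for one final check of the best spot-checked model; a minimal continuation (assuming LDA won the comparison) might be:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

best_model = LinearDiscriminantAnalysis()      # hypothetical winner of the spot-check
best_model.fit(X_train, Y_train)
predictions = best_model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))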
Exemplo n.º 56
0
clever_print('svm with RBF kernel with cv')
svm_rbf_cv = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=CV_SET)
svm_rbf_cv.fit(svm_scaler.transform(data_all_x), data_all_y)


print('accuracy on training and dev')
print(svm_rbf_cv.best_score_)
print('best param')
print(svm_rbf_cv.best_params_)


# LDA QDA------------------------------------

clever_print('LDA analysis')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis().fit(data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, lda.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, lda.predict(data_dev_x)))

clever_print('QDA analysis')

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis().fit(data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, qda.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, qda.predict(data_dev_x)))
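LDA and QDA differ only in whether classes share one covariance matrix; a small synthetic sketch (hypothetical data) shows where QDA's quadratic boundary pulls ahead:

import numpy as np
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
# two classes with equal means but very different covariances
X = np.vstack([rng.normal(0, 1.0, size=(200, 2)),
               rng.normal(0, 4.0, size=(200, 2))])
y = np.repeat([0, 1], 200)

for clf in (LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()):
    clf.fit(X, y)
    print(type(clf).__name__, accuracy_score(y, clf.predict(X)))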

# NN(MLP)------------------------------------
Exemplo n.º 57
0
def __init__(self, n_features):
    """Store the target dimensionality and build the LDA reducer
    (method of a larger wrapper class; the class definition is truncated)."""
    self.n_features = n_features
    self.lda = LinearDiscriminantAnalysis(n_components=n_features)
Exemplo n.º 58
0
    i_iter = -1
    acc_test_nested_iter = np.zeros((cv_nested.get_n_splits(), ))
    for tr_idx_nested, te_idx_nested in cv_split_nested:
        i_iter += 1
        epochs_train_nested = epochs_train[tr_idx_nested, :, :]
        epochs_test_nested = epochs_train[te_idx_nested, :, :]
        labels_train_nested = labels_train[tr_idx_nested]
        labels_test_nested = labels_train[te_idx_nested]
        csp = CSP(n_components=n_components,
                  reg='ledoit_wolf',
                  log=True,
                  cov_est='concat')
        csp.fit(epochs_train_nested, labels_train_nested)
        epochs_train_nested_new = csp.transform(epochs_train_nested)
        epochs_test_nested_new = csp.transform(epochs_test_nested)
        lda = LinearDiscriminantAnalysis()
        lda.fit(epochs_train_nested_new, labels_train_nested)
        # lbl_train_pred_nested = lda.predict(epochs_train_nested_new)
        lbl_test_pred_nested = lda.predict(epochs_test_nested_new)
        acc_test_nested_iter[i_iter] = np.mean(
            lbl_test_pred_nested == labels_test_nested)
    acc_n_components[i_comp] = np.mean(acc_test_nested_iter)
idx1 = np.argmax(acc_n_components)
n_components = list(range_n_components)[idx1]
print('*****************************')
print('n_components=', n_components)
print('acc_nested_max=', acc_n_components[idx1])
print('*****************************')
csp = CSP(n_components=n_components,
          reg='ledoit_wolf',
          log=True,
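The nested loop above hand-rolls what an sklearn Pipeline can express directly; a compact sketch of the CSP-to-LDA chain (assuming MNE is installed and epochs_train/labels_train as above):

from mne.decoding import CSP
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('csp', CSP(n_components=4, reg='ledoit_wolf', log=True, cov_est='concat')),
    ('lda', LinearDiscriminantAnalysis()),
])
scores = cross_val_score(clf, epochs_train, labels_train, cv=5)
print(scores.mean())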
Exemplo n.º 59
0
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
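A short follow-up often printed alongside the confusion matrix (a sketch, with cm, y_test, and y_pred as above):

from sklearn.metrics import accuracy_score

print(cm)                              # rows: true classes, columns: predicted
print(accuracy_score(y_test, y_pred))  # equivalently cm.trace() / cm.sum()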
Exemplo n.º 60
0
    # error-rate and log-loss accumulators for each method
    e_lda = []
    e_clustering = []
    e_co = []
    l_lda = []
    l_clustering = []
    l_co = []

    for index, i in enumerate(num_unlabeled):

        X_labeled = X_train[:num_labeled]
        y_labeled = y_train[:num_labeled]
        X_unlabeled = X_train[num_labeled: num_labeled + num_unlabeled[index]]
        y_unlabeled = y_train[num_labeled: num_labeled + num_unlabeled[index]]

        ###### supervised-LDA ######
        X_trnall = X_train[:num_labeled]
        y_trnall = y_train[:num_labeled]
        clf_lda = LinearDiscriminantAnalysis()
        clf_lda.fit(X_trnall, y_trnall)
        train_predictions = clf_lda.predict(X_test)
        e_lda.append(1 - accuracy_score(y_test, train_predictions))
        # log_loss expects probability estimates, not hard label predictions
        l_lda.append(log_loss(y_test, clf_lda.predict_proba(X_test)))

        ###### SS-Clustering ######
        if num_unlabeled[index] == 0:
            X_trnall = X_train[: num_labeled]
            y_trnall = y_train[: num_labeled]
        else:
            X_trnall, y_trnall = ssclustering(X_labeled, y_labeled, X_unlabeled)
        clf_lda = LinearDiscriminantAnalysis()
        clf_lda.fit(X_trnall, y_trnall)
        train_predictions = clf_lda.predict(X_test)
        e_clustering.append(1 - accuracy_score(y_test, train_predictions))
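The ssclustering helper is project-specific; scikit-learn ships a generic semi-supervised alternative, SelfTrainingClassifier, sketched here under the same labeled/unlabeled split (unlabeled targets are marked with -1):

import numpy as np
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_semi = np.vstack([X_labeled, X_unlabeled])
y_semi = np.concatenate([y_labeled, -np.ones(len(X_unlabeled), dtype=int)])

self_training = SelfTrainingClassifier(LinearDiscriminantAnalysis())
self_training.fit(X_semi, y_semi)                # pseudo-labels the -1 entries
print(1 - self_training.score(X_test, y_test))   # error rate, as above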