Пример #1
0
    def setUp(self):
        """Fit the project NIPALS model and scikit-learn's PLSRegression on
        the same small synthetic data set and log both sets of matrices.

        The training set is created from '.train_set_synthesis.csv', but its
        ``x`` and ``y`` attributes are then overwritten with the hard-coded
        five-sample matrices below before ``autoscale()`` is called.
        """
        n_components = 2
        self.train_set = model.TrainingSet('.train_set_synthesis.csv')

        X = np.array([[1, 1.9], [1.9, 1], [3.8, 4.2], [4, 3.6], [3.6, 4.4]])
        Y = np.array([[1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
        self.train_set.x = X.copy()
        self.train_set.y = Y.copy()

        self.train_set.autoscale()
        # Feed scikit-learn the same (autoscaled) matrices the NIPALS model
        # sees, taken as copies so neither fit can modify the training set.
        X = self.train_set.x.copy()
        Y = self.train_set.y.copy()
        self.nipals = model.nipals(X, Y)

        # max_iter must be an integer: a float such as 1e4 is rejected by the
        # parameter validation of recent scikit-learn releases.
        self.sklearn_pls = sklCD.PLSRegression(n_components=n_components,
                                               scale=True,
                                               max_iter=10000,
                                               tol=1e-6,
                                               copy=True)
        self.sklearn_pls.fit(X, Y)

        # Log each NIPALS matrix next to its scikit-learn counterpart.
        IO.Log.debug('NIPALS x scores', self.nipals.T)
        IO.Log.debug('sklearn x scores', self.sklearn_pls.x_scores_)
        IO.Log.debug('NIPALS x loadings', self.nipals.P)
        IO.Log.debug('sklearn x loadings', self.sklearn_pls.x_loadings_)
        IO.Log.debug('NIPALS x weights', self.nipals.W)
        IO.Log.debug('sklearn x weights', self.sklearn_pls.x_weights_)
        IO.Log.debug('NIPALS y scores', self.nipals.U)
        IO.Log.debug('sklearn y scores', self.sklearn_pls.y_scores_)
        IO.Log.debug('NIPALS y loadings', self.nipals.Q)
        IO.Log.debug('sklearn y loadings', self.sklearn_pls.y_loadings_)
        IO.Log.debug('sklearn y weights', self.sklearn_pls.y_weights_)
Пример #2
0
def plsRegressAnalysis(xTrain, yTrain, xTest=None, yTest=None):
    """Pick the number of PLS latent variables by 5-fold cross-validation.

    Every candidate count from 1 up to ``min(n_samples, n_features) / 3``
    (at least 1) is scored by its root-mean-square error of
    cross-validation (RMSECV); the count with the lowest RMSECV wins.

    Args:
        xTrain: 2-D array of training inputs; indexed positionally with
            ``xTrain[rows, :]``, so a NumPy array is expected.
        yTrain: training targets, indexed positionally with ``yTrain[rows]``.
        xTest: optional test inputs. When omitted, only the
            cross-validation result is returned.
        yTest: test targets; only used when ``xTest`` is given.

    Returns:
        ``(rmsecvBest, lvBest)`` when ``xTest`` is None, otherwise
        ``(yPredict, R2, rmsecvBest, R2P, MSE, lvBest)`` where ``R2`` is the
        training-set R^2 and ``R2P``/``MSE`` are computed on the test set.
    """
    # No random_state: the folds are not shuffled, so a seed has no effect,
    # and recent scikit-learn raises when one is given without shuffle=True.
    kf = model_selection.KFold(n_splits=5)
    nSamples, nFeatures = xTrain.shape
    # Always try at least one latent variable; otherwise lvBest would stay 0
    # for small inputs and the final fit would request zero components.
    lvMax = max(1, int(min(nSamples, nFeatures) / 3))
    lvBest = 0
    rmsecvBest = np.inf
    for lvTemp in range(1, lvMax + 1):
        squaredErrorSum = 0.0
        for trainIdx, testIdx in kf.split(xTrain):
            yFoldTest = yTrain[testIdx]
            yFoldPredict, _ = PLS(xTrain[testIdx, :], yFoldTest,
                                  xTrain[trainIdx, :], yTrain[trainIdx],
                                  lvTemp)
            # Align the prediction with the target's shape first: an (n, 1)
            # prediction minus an (n,) target would broadcast to (n, n).
            residual = (np.asarray(yFoldPredict).reshape(np.shape(yFoldTest))
                        - yFoldTest)
            # Sum of squared errors over every sample (and target column).
            squaredErrorSum += np.sum(np.square(residual))
        RMSECV = np.sqrt(squaredErrorSum / nSamples)
        if RMSECV < rmsecvBest:
            rmsecvBest = RMSECV
            lvBest = lvTemp
    if xTest is None:
        return rmsecvBest, lvBest

    # Refit on the full training set with the selected number of components.
    plsModel = cross_decomposition.PLSRegression(n_components=lvBest)
    plsModel.fit(xTrain, yTrain)
    yPredict = plsModel.predict(xTest)
    yTrainPredict = plsModel.predict(xTrain)
    R2 = sm.r2_score(yTrain, yTrainPredict)
    MSE = sm.mean_squared_error(yTest, yPredict)
    R2P = sm.r2_score(yTest, yPredict)
    return yPredict, R2, rmsecvBest, R2P, MSE, lvBest
Пример #3
0
 def test_pls(self):
     """Check ppdire's PLS fit against scikit-learn's PLSRegression."""
     n_comp = 4

     # Reference model: scikit-learn PLS fitted on a standardised response.
     y_standardised = (self.y - np.mean(self.y)) / np.std(self.y)
     skpls = skc.PLSRegression(n_components=n_comp)
     skpls.fit(self.Xs, y_standardised)

     # Model under test: ppdire with the 'cov' projection-index mode.
     pppls = ppdire(
         projection_index=dicomo,
         pi_arguments={'mode': 'cov'},
         n_components=n_comp,
         square_pi=True,
         optimizer='SLSQP',
         optimizer_options={'maxiter': 500},
     )
     pppls.fit(self.x, self.y)

     # Map the reference prediction back to the scale of self.y, then
     # compare absolute values to three decimals.
     reference_fit = (np.matmul(self.Xs, skpls.coef_) * np.std(self.y)
                      + np.mean(self.y))
     np.testing.assert_almost_equal(np.abs(reference_fit),
                                    np.abs(pppls.fitted_),
                                    decimal=3)
Пример #4
0
def Pls (df, df2, string):
    """Project ``df`` onto two PLS components and report score variances.

    Two separate models are fitted on ``(df, df2)``: a 2-component model
    whose X scores fill the returned frame, and a 10-component model whose
    X-score variances are returned.

    Args:
        df: predictor data.
        df2: response data; its ``.values`` become an extra column.
        string: name of the column that receives ``df2.values``.

    Returns:
        ``(principalDf, variance)``: a DataFrame with columns 'pls 1',
        'pls 2' and ``string``, plus the per-component variance of the
        10-component model's X scores.
    """
    two_component_model = PLSRegression(n_components=2)
    x_scores, _ = two_component_model.fit_transform(df, df2)
    response_values = df2.values
    principalDf = pd.DataFrame(data=x_scores, columns=['pls 1', 'pls 2'])

    ten_component_model = cross_decomposition.PLSRegression(n_components=10)
    ten_component_model.fit(df, df2)
    variance = np.var(ten_component_model.x_scores_, axis=0)

    principalDf[string] = response_values
    return principalDf, variance
Пример #5
0
    def test_PLSRegression(self):
        """PLSRegression driven through a ModelFrame must predict the same
        values as the estimator fitted directly on the raw arrays."""
        n_samples, n_targets, n_features = 1000, 3, 10

        X = np.random.normal(size=n_samples * n_features).reshape(
            (n_samples, n_features))
        # Every target column is 1*X1 + 2*X2 plus noise and an offset of 5.
        coefficient_row = [1, 2] + [0] * (n_features - 2)
        B = np.array([coefficient_row] * n_targets).T
        noise = np.random.normal(size=n_samples * n_targets).reshape(
            (n_samples, n_targets))
        Y = np.dot(X, B) + noise + 5

        # Fit and predict through the ModelFrame accessor.
        frame = pdml.ModelFrame(X, target=Y)
        frame_estimator = frame.cross_decomposition.PLSRegression(
            n_components=3)
        frame.fit(frame_estimator)
        result = frame.predict(frame_estimator)

        # Fit and predict with the estimator used directly.
        direct_estimator = cd.PLSRegression(n_components=3)
        direct_estimator.fit(X, Y)
        expected = direct_estimator.predict(X)

        self.assertIsInstance(result, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(result.values, expected)
Пример #6
0
def mx_PLSRegression(train_x, train_y):
    """Return a PLSRegression with default settings fitted on the data.

    Args:
        train_x: training inputs passed to ``fit``.
        train_y: training targets passed to ``fit``.
    """
    # fit() returns the fitted estimator itself, so it can be returned
    # directly.
    return cross_decomposition.PLSRegression().fit(train_x, train_y)
Пример #7
0
    ngrams = textfeature()

    ## load data
    train_data = ngrams.load_data('../holger_train_judgeyear.csv', index_col=0)
    test_data = ngrams.load_data('../holger_test_judgeyear.csv', index_col=0)
    judge_year_index = ngrams.load_data('../datasets/judge_year2index.pkl',
                                        format='pkl')
    ngram_dict = ngrams.load_data(
        '../datasets/grams_dict2002-2016/grams_dict.pkl', format='pkl')

    bow_feature = ngrams.load_data(
        '../datasets/grams_dict2002-2016/bow_features.pkl', format='pkl')

    model_zoo = Counter()
    model_zoo['OLS'] = linear_model.LinearRegression()
    model_zoo['PLS'] = cross_decomposition.PLSRegression(n=200)
    model_zoo['RF'] = RandomForestRegressor()
    model_zoo['Elastic Net'] = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)
    features = bow_feature
    ngrams.process_data(train_data, judge_year_index, features, istrain=True)
    ngrams.process_data(test_data, judge_year_index, features, istrain=False)
    ngrams.get_train_test(features)
    bow_train, bow_test = ngrams.get_vector()
    X_train, X_test = ngrams.get_tfidf(bow_train, bow_test)
    train_total, test_total = ngrams.combine_data(X_train, X_test)

    cvres = []
    alphas = np.linspace(0.01, 1, 50)
    for a in alphas:
        ela = ngrams.model_pre(model_zoo['Elastic Net'],
                               train_total,
Пример #8
0
# Scree plot: bars show each component's explained-variance ratio, the black
# line shows the running total.
_pca_step = spca.named_steps['pca']
_component_numbers = np.arange(1, _pca_step.n_components_ + 1)
plt.bar(_component_numbers - 0.4, _pca_step.explained_variance_ratio_)
cum_evr = np.cumsum(_pca_step.explained_variance_ratio_)
plt.plot(_component_numbers, cum_evr, color='black')

# ---------------------------------------------------------------------------
# Partial least squares (PLS) regression
# ---------------------------------------------------------------------------

# Pipeline: standardise the inputs, then run PLS regression. The PLS step's
# own scaling is switched off because the scaler step already handles it.
spls = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pls', cross_dec.PLSRegression(scale=False)),
])

# Train a PLS regression model with three components.
spls.set_params(pls__n_components=3)
spls.fit(boroughs, feelings)

# Define shuffled 10-fold splits for cross-validation.
# NOTE(review): KFold(n, n_folds=...) and the 'mean_squared_error' scorer name
# look like the legacy sklearn.cross_validation API -- confirm which
# scikit-learn version `cv` is imported from before running on a current one.
kf = cv.KFold(len(feelings), n_folds=10, shuffle=True)

# Average MSE across folds; the scores are negated before averaging.
mses = cv.cross_val_score(spls, boroughs, feelings,
                          scoring='mean_squared_error', cv=kf)
np.mean(-mses)
Пример #9
0
def PLS_DA(cmode, fold_quantity, components):
    """Cross-validate a PLS-DA lesion classifier on 'feature_data.csv'.

    Each CSV row supplies an integer class label in column 1 and float
    features from column 2 onward. Consecutive rows are merged into one
    sample ("pair") that carries the first row's label. For every fold, one
    one-vs-rest PLS regression (+1 / -1 targets) is fitted per class.

    Args:
        cmode: 'binary' (class 1 vs. the rest), 'multi' (highest-scoring
            class wins) or 'debug' (binary without printing).
        fold_quantity: number of cross-validation folds; -1 selects
            leave-one-out. Must not exceed the number of pairs.
        components: number of PLS components, between 1 and the number of
            pairs.

    Returns:
        For 'binary' and 'debug': ``(acc, sens, spec, f1, x_loadings,
        y_loadings, answers, test_pair_predictions)``; the loadings come
        from the last model fitted. For 'multi' the statistics are printed
        and nothing is returned.

    Raises:
        FlagError: if any argument is outside its allowed range.
    """
    print(
        '''-------------------------------------------------------------------------------
    Gastrointestinal Lesion Classifier (by Willie Wu and Linus Chen): PLS-DA
    -------------------------------------------------------------------------------'''
    )

    # Load data: [label, feature, feature, ...] per CSV row.
    feature_data = []
    with open('feature_data.csv', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            feature_data.append([int(row[1])] + [float(r) for r in row[2:]])
        print('Data read in successfully...')

    # Check input flags.
    pair_count = len(feature_data) // 2
    if cmode not in ['binary', 'multi', 'debug']:
        raise FlagError('Only binary and multi modes are available.')
    if fold_quantity == 0:
        # Zero folds would otherwise surface later as a division by zero.
        raise FlagError('Folds must be nonzero.')
    if fold_quantity > pair_count or fold_quantity < -1:
        raise FlagError('Folds must be >=-1 (LOOCV) and <= length of data.')
    if components > pair_count or components < 1:
        raise FlagError('Components must be > 0 and <= length of data.')

    # Benign and malignant classes; class ids in the data are 1-based.
    classes = ['Hyp (b)', 'Ser (m)', 'Ade (m)']
    # Number of components in the models.
    model = skcd.PLSRegression(components)

    # A "pair" is two consecutive rows combined into one sample.
    pair_starts = range(0, len(feature_data) - 1, 2)
    pair_data = np.array([
        feature_data[i][1:] + feature_data[i + 1][1:] for i in pair_starts
    ])
    pair_answers = np.array([feature_data[i][0] for i in pair_starts])

    # Multiclass prediction is one x-vs-not-x run per class; binary mode only
    # looks at the first run. Row c holds the +1/-1 targets for class c + 1.
    pair_modified_answers = np.array([
        [1 if answer == class_id else -1 for answer in pair_answers]
        for class_id in range(1, len(classes) + 1)
    ])

    if fold_quantity == -1:
        fold_quantity = len(pair_data)

    # Randomly assign pairs to folds of approximately equal size.
    folds = [[] for _ in range(fold_quantity)]
    fold_membership = [i % fold_quantity for i in range(len(pair_data))]
    shuffle(fold_membership)
    for pair_index, fold_index in enumerate(fold_membership):
        folds[fold_index].append(pair_index)

    # Make predictions for each fold and each class.
    test_pair_predictions = []
    test_pair_answers = []
    for test_fold in folds:
        held_out = set(test_fold)
        train_fold = [i for i in range(len(pair_data)) if i not in held_out]

        # The feature matrices are the same for every class in this fold.
        train_pair_data = pair_data[train_fold]
        test_pair_data = pair_data[test_fold]

        fold_test_pair_predictions = []
        for class_targets in pair_modified_answers:
            model.fit(train_pair_data, class_targets[train_fold])
            fold_test_pair_predictions.append(
                model.predict(test_pair_data).flatten())

        # Reorder to (sample, class) so each row holds one sample's scores.
        test_pair_predictions.append(
            np.swapaxes(fold_test_pair_predictions, 0, 1))
        test_pair_answers.append(pair_answers[test_fold])

    if cmode in ['binary', 'debug']:
        # Positive score for the first class means "class 1".
        predictions = [
            1 if ff[0] > 0 else 0 for f in test_pair_predictions for ff in f
        ]
        answers = [1 if aa == 1 else 0 for a in test_pair_answers for aa in a]
        acc, sens, spec, f1 = binary_calc_model_stats(predictions, answers)
        if cmode != 'debug':
            print('F1 Score: {0:.2f}%'.format(round(f1 * 100, 2)))
            print('Accuracy: {0:.2f}%'.format(round(acc * 100, 2)))
            print('Sensitivity: {0:.2f}%'.format(round(sens * 100, 2)))
            print('Specificity: {0:.2f}%'.format(round(spec * 100, 2)))
            print('===================')
        return acc, sens, spec, f1, np.array(model.x_loadings_), np.array(
            model.y_loadings_), answers, test_pair_predictions

    if cmode == 'multi':
        # The class with the highest predicted value wins, decided per
        # sample (row), not per fold.
        predictions = [
            np.argmax(ff) + 1 for f in test_pair_predictions for ff in f
        ]
        # Predictions are in shuffled fold order, so the reference labels
        # must be collected in that same order.
        answers = np.concatenate(test_pair_answers)
        acc, sens, spec, tots = multi_calc_model_stats(predictions, answers)
        print('Total Accuracy: {0:.2f}%'.format(round(tots[0] * 100, 2)))
        print('F1 Score: {0:.2f}%'.format(round(tots[1] * 100, 2)))
        for ac, se, sp, name in zip(acc, sens, spec,
                                    ['Hyperplasic', 'Serrated', 'Adenoma']):
            print(name + ' stats: ')
            print('Accuracy: {0:.2f}%'.format(round(ac * 100, 2)))
            print('Sensitivity: {0:.2f}%'.format(round(se * 100, 2)))
            print('Specificity: {0:.2f}%'.format(round(sp * 100, 2)))
            print('===================')
Пример #10
0

# Indices of the modes whose projected weights are used as PLS targets.
modes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# One row per sample, one column per selected mode.
Y = pc.projectedWeights[0:len(modes), :].T


#### Generating final models

# One reconstructed model per leave-one-out split, in iteration order.
Recon_models = []
for train_index, test_index in loo:
    test_index = test_index[0]
    X_train, X_test = X[train_index], X[test_index]
    # NOTE(review): the model below is fitted on the full X and Y, so the
    # held-out sample is part of the training data and X_train is unused.
    # A training-only fit would need Y[train_index] as well -- confirm which
    # is intended.
    plsr = cross_decomposition.PLSRegression(n_components=5, scale=False)
    plsr.fit(X, Y)

    # Predict the mode weights of the held-out sample. predict() needs a 2-D
    # (n_samples, n_features) input, so a single sample is wrapped as a row.
    y = plsr.predict(np.atleast_2d(X_test)).reshape((len(modes),))

    # Reconstruct the test instance from the predicted weights.
    # Alternative kept from the original script (weights expressed via SD):
    #    P = pc.reconstruct(pc.getWeightsBySD(modes, y), modes)
    P = pc.reconstruct(y, modes)
    # Reshape to rows of three values; the layout is hard-coded for this data
    # set. Floor division keeps the row count an integer, which reshape
    # requires.
    R = P.reshape((np.shape(Data_flat)[1] // 3, 3))
    Recon_models.append(R)
    
Пример #11
0
def pls_inspection(X: np.ndarray, Y: np.ndarray, n_comps: int):
    """Fit a PLS model on one-hot encoded labels and show diagnostic plots.

    Figures produced (matplotlib figure numbers in parentheses):
    cumulative explained variance (3), loadings of up to the first three
    components (4-6), a 2-D scores plot (7) and a 3-D scores plot (8).
    Plots that need more components than were fitted are skipped instead
    of failing with an index error.

    Args:
        X: data matrix passed to the PLS fit.
        Y: class labels; used for colouring and one-hot encoded for the fit.
        n_comps: number of PLS components to fit.
    """
    n_classes = len(np.unique(Y))
    Y_encoded = one_hot_encode(Y)
    model = cross_decomposition.PLSRegression(n_components=n_comps,
                                              scale=False)
    model.fit(X, Y_encoded)

    # Extract information
    scores = model.x_scores_
    loadings = model.x_loadings_
    n_fitted = scores.shape[1]
    # Variance of each score column relative to the total variance of X.
    var_scores = np.var(scores, axis=0)
    var_X = np.sum(np.var(X, axis=0))
    var_ratios = var_scores / var_X
    cum_var_ratios = np.cumsum(var_ratios)

    def axis_label(index):
        """Axis label for the component at zero-based position `index`."""
        return 'PC{} ({:.2f}% explained variance)'.format(
            index + 1, var_ratios[index] * 100)

    # Colormap: one discrete colour band per class.
    cmap = plt.cm.jet
    cmaplist = [cmap(i) for i in range(cmap.N)]
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom map', cmaplist, cmap.N)
    bounds = np.linspace(0, n_classes, n_classes + 1)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    # Explained variance plot (a leading zero makes the curve start at 0).
    plt.figure(num=3, figsize=(8, 6))
    plt.plot(np.pad(cum_var_ratios, (1, 0), 'constant'))
    plt.title('Explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Cumulative explained variance')
    plt.xlim((0, n_comps))
    plt.ylim((0, 1))

    # Loadings plots for the first (up to) three components: figures 4-6.
    for index in range(min(3, loadings.shape[1])):
        plt.figure(num=4 + index, figsize=(8, 6))
        plt.plot(loadings[:, index])
        plt.title('PC{} loadings'.format(index + 1))

    # 2D scores plot (needs at least two components)
    if n_fitted >= 2:
        plt.figure(num=7, figsize=(8, 6))
        scat = plt.scatter(scores[:, 0],
                           scores[:, 1],
                           c=Y,
                           s=2,
                           cmap=cmap,
                           norm=norm)
        cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
        cb.set_label('Classes')
        plt.title('Scores plot (PLS)')
        plt.xlabel(axis_label(0))
        plt.ylabel(axis_label(1))

    # 3D scores plot (needs at least three components)
    if n_fitted >= 3:
        fig = plt.figure(num=8, figsize=(8, 6))
        ax = fig.add_subplot(111, projection='3d')
        scat = ax.scatter(scores[:, 0],
                          scores[:, 1],
                          scores[:, 2],
                          c=Y,
                          s=2,
                          cmap=cmap,
                          norm=norm)
        cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
        cb.set_label('Classes')
        ax.set_title('Scores plot (PLS)')
        ax.set_xlabel(axis_label(0))
        ax.set_ylabel(axis_label(1))
        ax.set_zlabel(axis_label(2))

    plt.show()
# Disabled: RadiusNeighborsRegressor with uniform weights.
#models.append({"name": "1.6.3. RadiusNeighborsRegressor uniform",
#               "model": neighbors.RadiusNeighborsRegressor(weights="uniform")})
# Recorded failure: ZeroDivisionError: Weights sum to zero, can't be normalized
# Disabled: RadiusNeighborsRegressor with distance weights.
#models.append({"name": "1.6.3. RadiusNeighborsRegressor distance",
#               "model": neighbors.RadiusNeighborsRegressor(weights="distance")})
models.append({
    "name": "1.6.3. NearestCentroid",
    "model": neighbors.NearestCentroid(),
})

## 1.7. Gaussian Processes
## Disabled: possibly too slow.
#models.append({"name": "1.7. Gaussian Processes",
#               "model": gaussian_process.GaussianProcess()})

## 1.8. Cross decomposition
models.append({
    "name": "1.8. Cross decomposition PLSRegression",
    "model": cross_decomposition.PLSRegression(),
})
models.append({
    "name": "1.8. Cross decomposition PLSCanonical",
    "model": cross_decomposition.PLSCanonical(),
})
# Disabled: slow.
#models.append({"name": "1.8. Cross decomposition CCA",
#               "model": cross_decomposition.CCA()})

## 1.9. Naive Bayes (for classification?)
# Recorded failure: ValueError: Unknown label type: array
#models.append({"name": "1.9.1. GaussianNB",
#               "model": naive_bayes.GaussianNB()})

# Disabled: did not appear to work for this dataset.
#models.append({"name": "1.9.2. MultinomialNB",
#               "model": naive_bayes.MultinomialNB()})
Пример #13
0
def PLS(xTest, yTest, xTrain, yTrain, nComponents):
    """Fit a PLS regression on the training data and predict the test inputs.

    Args:
        xTest: inputs to predict.
        yTest: unused; kept so existing positional calls keep working.
        xTrain: training inputs.
        yTrain: training targets.
        nComponents: number of PLS components to fit.

    Returns:
        ``(yPredict, plsModel)``: the predictions for ``xTest`` and the
        fitted estimator itself (not its coefficient matrix).
    """
    plsModel = cross_decomposition.PLSRegression(n_components=nComponents)
    plsModel.fit(xTrain, yTrain)
    return plsModel.predict(xTest), plsModel