Example #1
def fit_base_model(classifiers, fully, dummyY, trainx, testx):
    """ Takes a list of classifiers and/or PLS regression and
    does dimension reduction by returning the predictions of the classifiers
    or first two scores of the PLS regression on bootstrapped subsamples of
    the data."""

    trainProbs = []
    testProbs = []

    iterations = 0
    for clf in classifiers:
        for i in range(clf[1]):
            iterations += 1
            print(iterations)
            print(clf[0])
            train_rows = np.random.choice(trainx.shape[0],
                                          round(trainx.shape[0] * base_prop),
                                          replace=True)
            oob_rows = list(set(range(trainx.shape[0])) - set(train_rows))
            print(len(train_rows))
            print(len(oob_rows))
            x = trainx[train_rows, :]
            if clf[0] == 'PLS':
                y = dummyY[train_rows, :]
                mod = PLSRegression().fit(x, y)
                trainscores = mod.transform(trainx)
                testscores = mod.transform(testx)
                trainProbs.append(trainscores[:, 0])
                trainProbs.append(trainscores[:, 1])
                testProbs.append(testscores[:, 0])
                testProbs.append(testscores[:, 1])
            else:
                y = fully[train_rows]
                print('\t Fitting model...')
                mod = clf[0].fit(x, y)
                print('\t Predicting training results...')
                tpreds = mod.predict_proba(trainx)
                trainProbs.append(list(tpreds[:, 1]))
                print('\t Predicting test results...')
                testProbs.append(list(mod.predict_proba(testx)[:, 1]))
                print('\t OOB score: ' + str(log_loss(fully[oob_rows],
                                                      tpreds[oob_rows, :])))
    return trainProbs, testProbs
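
# --- Hedged usage sketch (not from the original project): fit_base_model
# expects (estimator_or_'PLS', n_bootstraps) pairs, a one-hot dummyY for the
# PLS branch, and a module-level base_prop subsample fraction. The data below
# is synthetic and every name besides the function's is illustrative. ---
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

base_prop = 0.8  # fraction of training rows drawn (with replacement) per model
X, y = make_classification(n_samples=300, n_features=10, random_state=0)
trainx, testx, fully = X[:200], X[200:], y[:200]
dummyY = np.eye(2)[fully]  # one-hot targets for the PLS branch

classifiers = [('PLS', 2), (RandomForestClassifier(n_estimators=50), 3)]
trainProbs, testProbs = fit_base_model(classifiers, fully, dummyY, trainx, testx)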
Example #2
def do_pls(X, Y):
    pls2 = PLSRegression(n_components=2)
    pls2.fit(X,Y)
    out = pls2.transform(X)
    print(out)
    print(out.shape)

    plt.title("PLS2")
    plt.xlabel("PL1")
    plt.ylabel("PL2")
    plt.grid()
    plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis')
    plt.savefig('pls.png', dpi=125)
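
# A minimal sketch of calling do_pls, assuming the imports the snippet relies
# on (PLSRegression and matplotlib.pyplot as plt) are already in scope:
from sklearn.datasets import load_iris
X, Y = load_iris(return_X_y=True)
do_pls(X, Y)  # writes pls.png with the first two PLS scores coloured by class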
Example #3
File: canonics.py Project: csxeba/NitaGeo
def pls_approach():
    from sklearn.cross_decomposition import PLSRegression

    (X, Y), cities = pull_xy_data()

    pls = PLSRegression()
    pls.fit(X, Y)

    plsX, plsY = pls.transform(X, Y)

    plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1)

    return "OK What Now?"
Example #4
def hacerPLS(X, Y):
    pls_wild_b = PLSRegression(n_components = 9) 
    pls_wild_b.fit(X,Y)
    Z = pls_wild_b.transform(X)
    scores = list() 
    scores_std = list()
    n_features = np.shape(X)[1]
    
    # sklearn.cross_validation was removed; use sklearn.model_selection instead
    X, X_test_tot, Y, Y_test_tot = model_selection.train_test_split(
        X, Y, test_size=0.5, random_state=0)
    N = np.shape(X)[0]
    
    for num_comp in range(n_features):
        kf = KFold(n_splits=10)  # modern KFold API
        aux_scores = list()
        for train, test in kf.split(X):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
              
            if num_comp == 0:
                # Baseline: predict the training-fold mean (using the test
                # fold's own mean would leak information).
                y_pred = np.mean(y_train) * np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test, y_pred))
            
            else:
                pls_foo = PLSRegression(n_components=num_comp)
                pls_foo.fit(X_train, y_train)
                y_pred = pls_foo.predict(X_test)

                # obtain the score
                this_score = metrics.mean_squared_error(y_test, y_pred)
                aux_scores.append(this_score)
                
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    
    plt.plot(scores)
    plt.xlabel('Components')
    plt.ylabel("$MSE$")
    plt.title("Animals PLS")
    plt.show()
    
    num_comp = np.argmin(scores)

    # Refit with the CV-selected number of components (at least 1)
    pls_pred = PLSRegression(n_components=max(num_comp, 1))
    pls_pred.fit(X, Y)
    y_pred_test = pls_pred.predict(X_test_tot)

    print("MSE test = " + str(metrics.mean_squared_error(Y_test_tot, y_pred_test)))
Example #5
def reduce_PLS(dataframe):
    PLS_file="data/pls_structure.pickle"
    selectedcolumn=[x for x in dataframe.columns if x not in ["id","click","device_id","device_ip"]]
    X=np.array(dataframe[selectedcolumn])
    y=np.array(dataframe["click"])
    if os.path.exists(PLS_file):
        stand_PLS = pickle.load(open(PLS_file, 'rb'))
        print("PLS structure is loaded.")
    else:
        stand_PLS = PLSRegression(n_components=10, scale=True)
        stand_PLS.fit(X, y[:, np.newaxis])
        # Drop the per-sample score matrices so the pickled model stays small
        stand_PLS.y_scores_ = None
        stand_PLS.x_scores_ = None
        pickle.dump(stand_PLS, open(PLS_file, "wb"))
        print("PLS transform structure is stored.")
    T = stand_PLS.transform(X)
    print("PLS transformation is performed.")
    return T
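
# Hedged usage sketch: reduce_PLS expects a dataframe with a "click" target
# plus feature columns ("id", "device_id", "device_ip" are skipped if present)
# and a writable data/ directory for the pickle cache. Data below is synthetic.
import numpy as np
import pandas as pd
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(200, 12),
                  columns=['f%d' % i for i in range(11)] + ['click'])
df['click'] = (df['click'] > 0.5).astype(int)
T = reduce_PLS(df)  # (200, 10) array of PLS scores, cached on first call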
Example #6
class PLS_method(DR_Technique):
    r"""Partial Least Squares dimension reduced subspace

    This computes the reduced subspace by:
    (1) standardizing x and y
    (2) applying 2-blocks regression PLS2 over x and y

    Example:
        >>> DR_model = PLS_method(2, [[0, 1], [0, 1]])
        >>> DR_model.calculate(train_x, train_y)
    """
    def __init__(self, dim_DR, orig_range):
        r"""Args:
            dim_DR: number of dimensions to reduce to
            orig_range: the bounds of the original subspace
        """

        if dim_DR != 0 and isinstance(dim_DR, int):
            super().__init__('PLS', dim_DR, orig_range)
        else:
            raise ValueError('dim_DR must be a nonzero integer for PLS')

        #need to save mean and std for later encode/decode
        self.x_mean = None
        self.x_std = None
        self.y_mean = None
        self.y_std = None

    def calculate(self, train_x, train_y):
        ### assumes train_x and train_y are not standardized
        ############################################
        #Step 1: calc params for later use         #
        ############################################
        self.x_mean = train_x.mean(axis=0)
        self.x_std = train_x.std(axis=0)
        self.x_std[self.x_std == 0.0] = 1.0
        self.y_mean = train_y.mean(axis=0)
        self.y_std = train_y.std(axis=0)
        self.y_std[self.y_std == 0.0] = 1.0
        ############################################
        #Step 2: Create instance and fit PLS       #
        ############################################

        self.Model = PLSRegression(n_components=self.dim_DR)
        #automatically standardizes everything for us
        self.Model.fit(train_x, train_y)

        ############################################
        #Step 3: Determine reduced subspace bounds #
        ############################################

        DR = self.Model.transform(train_x)  #will standardize for us
        self.DR_range = np.c_[np.min(DR, axis=0) * 1.1,
                              np.max(DR, axis=0) * 1.1]  #pad a bit

        print('PLS model created')

    def Decode_X(self, DR_input):
        #####################################################
        #        Convert DR-> orig                          #
        #####################################################
        xhat_n = np.dot(DR_input, self.Model.x_rotations_.T)
        #convert back to original domain
        xhat = (xhat_n * self.x_std) + self.x_mean
        #verify and correct domain boundaries
        return self.Enforce_Bounds(xhat)

    def Encode_X(self, x_set):
        #####################################################
        #       Convert orig->DR                            #
        #####################################################
        #standardize N(0,1)
        X_0 = np.divide(x_set - self.x_mean, self.x_std)
        #convert to DR
        return np.dot(X_0, self.Model.x_weights_)

    def Pred_Y(self, DR_set):
        #####################################################
        #       predict Y from DR                          #
        #####################################################
        #Y=TQ'+F
        yhat_n = np.dot(DR_set, self.Model.y_loadings_.T)
        if len(yhat_n) == 1:
            yhat_n = yhat_n[:, None]
        #destandardize
        return (yhat_n * self.y_std) + self.y_mean
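
# Hedged usage sketch for PLS_method on synthetic data. DR_Technique (and the
# Enforce_Bounds helper used by Decode_X) is assumed to come from the original
# project; everything else here is illustrative.
import numpy as np
rng = np.random.default_rng(0)
train_x = rng.normal(size=(100, 6))
train_y = train_x @ rng.normal(size=(6, 1)) + 0.1 * rng.normal(size=(100, 1))
orig_range = np.c_[train_x.min(axis=0), train_x.max(axis=0)]

DR_model = PLS_method(2, orig_range)   # reduce 6 dimensions to 2
DR_model.calculate(train_x, train_y)
Z = DR_model.Encode_X(train_x)         # original -> reduced coordinates
x_back = DR_model.Decode_X(Z)          # reduced -> original (approximate)
y_hat = DR_model.Pred_Y(Z)             # predict y from the reduced space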
Example #7
    plt.xlim(1,np.amax(nComponents))
    plt.title('PLS Canonical accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('Accuracy')
    plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
    plt.grid(True)

if (0):
    #%% PLS Regression
    nComponents = np.arange(1,nClasses+1)
    plsRegScores = np.zeros((5, len(nComponents)))
    for i,n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain,Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:,i] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)

    
    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain,Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig,xt,labelsTrain,classColors)
    plt.title('First 2 components of projected data')
    

    #%% Plot accuracies for PLSSVD 
    plt.figure()
    for i in range (5):
Example #8
class MspmPartialLeastSquares:
    """
    This module is to construct a partial_least_squares (PLS) model for feature analysis.
    
    Parameters
    ----------
    
    x (n_samples, n_features) – The training input samples
    y (n_samples, n_targets) – The training target samples
    n_components – The number of feature scores
    preprocess (default = True) - whether to standardize the input data

    Attributes
    ----------
    pls - model of PLS
    
    Example
    -------
    >>> from sklearn.datasets import load_iris
    >>> from pypm.models.mspm_partial_least_squares import MspmPartialLeastSquares
    >>> data = load_iris()
    >>> x = data.data
    array([[5.1, 3.5, 1.4, 0.2]...
    >>> y = data.target
    array([0, 0, 0, 0, 0, 0, 0...
    >>> PLS_model = MspmPartialLeastSquares(x, y, 3)
    >>> PLS_model.construct_pls_model()
    >>> Features = PLS_model.extract_pls_feature(x)
    array([[-2.26393268e+00,  1.74075256e-01,  3.62141834e-01]...
    >>> Prediction = PLS_model.pls_predict(x)
    array([[-8.05094197e-02]...
    
    """
    def __init__(self, x, y, n_components, preprocess=True):

        self.x = x
        self.y = y
        self.preprocess = preprocess
        self.n_components = n_components

        if self.preprocess:
            self.Xscaler = preprocessing.StandardScaler().fit(self.x)
            self.x = self.Xscaler.transform(self.x)

    def construct_pls_model(self):
        """
        Function to construct a pls model.
        
        """
        self.pls = PLSRegression(self.n_components)
        self.pls.fit(self.x, self.y)

    def extract_pls_feature(self, x_test):
        """
        Function to extract the PLS features of the given data using the well-trained PLS model.
        
        Parameters
        ----------
        x_test (_, n_features) - The testing samples
        
        """
        if self.preprocess:
            x_test = self.Xscaler.transform(x_test)
        return self.pls.transform(x_test)

    def pls_predict(self, x_test):

        if self.preprocess:
            x_test = self.Xscaler.transform(x_test)
        return self.pls.predict(x_test)
Example #9
    plt.ylabel('1st component')
    elif i == 1:
        plt.ylabel('2nd component')
    else:
        plt.ylabel('3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:],fontsize = 7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
    print("inside the loop")

#Select the number of components using CV
#%%
##PLSR
pls_wild_b = PLSRegression(n_components = 3)
pls_wild_b.fit(X_train_prepro,Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)
print("loadings")

for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pls_wild_b.x_loadings_[:,i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:],fontsize = 7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
    
Example #10
    # sklearn.lda was removed; LDA now lives in sklearn.discriminant_analysis
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    lda = LDA()
    lda.fit(Xtrain,Ytrain)
    
    LDA_centroids = lda.means_    # Centroids of the classes (n_class, n_features)

    Xtrain = lda.transform(Xtrain)
    Xtest = lda.transform(Xtest)
    
# Linear PLS

if (FE_PLS == 1):
    pls2 = PLSRegression(n_components=n_comp)
    pls2.fit(Xtrain, Ytrain_m)
    Xtrain = pls2.transform(Xtrain)
    Xtest = pls2.transform(Xtest)
    

    
# Kernel PLS

if (FE_kPLS == 1):
    # Bandwidth heuristic: sigma from the RMS of the nonzero pairwise distances
    d = pair.pairwise_distances(Xtrain, Xtrain)
    aux = np.triu(d)
    sigma = np.sqrt(np.mean(np.power(aux[aux != 0], 2) * 0.5))
    gamma = 1 / (2 * sigma**2)
    
    ktrain = pair.rbf_kernel(Xtrain,Xtrain,gamma)
    ktest = pair.rbf_kernel(Xtest,Xtrain,gamma)
    kcent = KernelCenterer()
Example #11
var_index = data.columns.values.tolist()

# vector of class responses associated with data
resp = load_data.getResponseMatrix1D()
resp2 = load_data.getResponseMatrix2D()

#### Create object to normalize and un-normalize data
norm_trans = pre.StandardScaler().fit(d)
data_norm = norm_trans.transform(d)

#### Train OPLS
opls = OPLS(2, resp2).fit(data_norm, resp)

#### Train PLS for comparison
pls = PLS(2).fit(data_norm, resp)
pls.rotated_data = pls.transform(data_norm)
pls.responses = resp2

#### Figures
opls.plotProjectionScatterMultiClass(2, labels=["Healthy", "Not Healthy"])
OPLS.plotProjectionScatterMultiClass(pls, 2, labels=["Healthy", "Not Healthy"])

plt.figure()
plt.plot(opls.analysis.coef_[:, 0]**2)
#plt.plot(opls.analysis.coef_[:,1]**2)
plt.title("OPLS Weights")

plt.figure()
plt.plot(pls.coef_[:, 0]**2)
#plt.plot(pls.x_weights_[:,1]**2)
plt.title("PLS Weights")
Example #12
pca = PCA(n_components=j, random_state=np.random.RandomState(0))
pca.fit(x)
x3 = pca.transform(x)
string = "pca_"
pca_column_name = [string + str(i) for i in range(x3.shape[1])]
reduced_df = pd.DataFrame(pca.components_,
                          columns=x.columns,
                          index=pca_column_name)
sig_features = list(set(reduced_df.idxmax(axis=1).values))
print(sig_features)
df_final = x[sig_features]
pca_df = reduced_df[sig_features]

plsca = PLSRegression(n_components=j)
plsca.fit(x, y)
x_pls = plsca.transform(x)
string = "pls_"
x_pls_column_name = [string + str(i) for i in range(x_pls.shape[1])]
plsca_df = pd.DataFrame(plsca.x_weights_)
plsca_trans = plsca_df.transpose()
x_pls_reduced_df = pd.DataFrame(plsca_trans.values,
                                columns=x.columns,
                                index=x_pls_column_name)
pls_sig_features = list(set(x_pls_reduced_df.idxmax(axis=1).values))
print(pls_sig_features)
df_trans.reset_index(['CUSTOMER_KEY'], inplace=True)
pls_final = pd.concat([df_trans[pls_sig_features], df_trans['CUSTOMER_KEY']],
                      axis=1)
y.reset_index(['CUSTOMER_KEY'], inplace=True)
df2 = pd.concat([y, pls_final], axis=1)
df2.set_index('CUSTOMER_KEY', inplace=True)
Example #13
plt.title("PCA")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

lda = LinearDiscriminantAnalysis(n_components=2).fit(x, y)
Y = lda.transform(x)
ax = fig.add_subplot(243)
plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral)
plt.title("lda")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

pls = PLSRegression(n_components=2).fit(x, y)
Y = pls.transform(x)
ax = fig.add_subplot(244)
plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral)
plt.title("PLS")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

Y = manifold.MDS(n_components=2).fit_transform(x)
ax = fig.add_subplot(246)
plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral)
plt.title("mds")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
Example #14
def pls_thing(scenario_data, xcols, ycols, titlestr):
    #PLS Summary Stats
    pls = PLSRegression(n_components=3)
    pls.fit(scenario_data[xcols], scenario_data[ycols])
    k = 0
    transformed_x_full = pls.transform(scenario_data[xcols])
    y = scenario_data[ycols]

    results = pd.DataFrame(columns=('Case Label', 'Explained Variance Ratio',
                                    'RegressionCoefs', 'Regression R^2',
                                    'SpearmanCorr', 'SpearmanPvalue',
                                    'Loadings', 'X Weights', 'X Loadings',
                                    'X Scores'))

    if isinstance(titlestr, list):
        titlestr = ' '.join(titlestr)

    #Linear fits for each individual component
    for c in range(np.shape(pls.x_weights_)[1]):
        x_transformed_1pc = transformed_x_full[:, k].reshape(-1, 1)
        # note: the 'normalize' argument was removed from LinearRegression in
        # recent scikit-learn; standardize the inputs beforehand if needed
        lr = linear_model.LinearRegression(fit_intercept=True)
        lr.fit(x_transformed_1pc, y)
        print('Regression Coefs', lr.coef_)
        print('R^2', lr.score(x_transformed_1pc, y))
        print('Spearman: ', scipy.stats.spearmanr(x_transformed_1pc, y))
        print('Component: ', c)
        results.loc[len(results)] = np.nan
        results.loc[len(results) - 1,
                    'Case Label'] = titlestr + ' Component ' + str(k)
        #        results.loc[len(results)-1,'Explained Variance Ratio'] = pls.explained_variance_ratio_[k]
        results.at[len(results) - 1, 'RegressionCoefs'] = lr.coef_
        results.loc[len(results) - 1,
                    'Regression R^2'] = lr.score(x_transformed_1pc, y)
        results.loc[len(results) - 1, 'SpearmanCorr'] = scipy.stats.spearmanr(
            x_transformed_1pc, y)[0]
        results.loc[len(results) - 1,
                    'SpearmanPvalue'] = scipy.stats.spearmanr(
                        x_transformed_1pc, y)[1]
        results.at[len(results) - 1, 'X Weights'] = pls.x_weights_[:, k]
        results.at[len(results) - 1, 'X Loadings'] = pls.x_loadings_[:, k]
        results.at[len(results) - 1, 'X Scores'] = pls.x_scores_[:, k]

        plt.plot(x_transformed_1pc, y, '*')
        plt.xlabel('Component ' + str(k))
        plt.ylabel('Performance')
        plt.title('PLS ' + titlestr)
        plt.show()
        k += 1
        print(results)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title("PLS PC0 vs PC1 vs Performance " + titlestr, fontsize=14)
    ax.set_xlabel("PC0", fontsize=12)
    ax.set_ylabel("PC1", fontsize=12)
    ax.scatter(transformed_x_full[:, 0],
               transformed_x_full[:, 1],
               s=100,
               c=y,
               marker='*',
               cmap=cm.bwr)
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(transformed_x_full[:, 0],
               transformed_x_full[:, 1],
               transformed_x_full[:, 2],
               s=100,
               c=y,
               marker='*',
               cmap=cm.bwr)
    ax.set_title("PLS PC0 vs PC1 vs PC2 vs Performance " + titlestr,
                 fontsize=14)
    ax.set_xlabel("PC0", fontsize=12)
    ax.set_ylabel("PC1", fontsize=12)
    ax.set_zlabel("PC2", fontsize=12)
    plt.show()

    print(results)
    return results
Example #15
pcr.fit(X_train, y_train)
pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline

pls = PLSRegression(n_components=1)
pls.fit(X_train, y_train)

fig, axes = plt.subplots(1, 2, figsize=(10, 3))
axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth")
axes[0].scatter(
    pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
)
axes[0].set(
    xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
)
axes[0].legend()
axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
axes[1].scatter(
    pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
)
axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS")
axes[1].legend()
plt.tight_layout()
plt.show()

# %%
# As expected, the unsupervised PCA transformation of PCR has dropped the
# second component, i.e. the direction with the lowest variance, despite
# it being the most predictive direction. This is because PCA is a completely
# unsupervised transformation, and results in the projected data having a low
# predictive power on the target.
#
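
# A quick numerical check of the claim above (a sketch; assumes X_train,
# y_train and the PCA import from the surrounding example): the first PCA
# direction carries the variance, the second carries the signal.
import numpy as np
pca_full = PCA(n_components=2).fit(X_train)
proj = pca_full.transform(X_train)
for i in range(2):
    r = np.corrcoef(proj[:, i], y_train)[0, 1]
    print(f"PCA component {i}: |corr with y| = {abs(r):.2f}")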
Example #16
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA, TruncatedSVD

pca = PCA(n_components=8)
pca_feats = [3, 5, 10, 14, 18, 19, 22, 23, 25, 26, 27]

train_pca_df = pd.DataFrame([])
test_pca_df = pd.DataFrame([])
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_pca_df[feat_label] = train_features[feat_label]
    test_pca_df[feat_label] = test_features[feat_label]

pls = PLSRegression(n_components=8)  # this works well for the log reg model
pls.fit(train_pca_df, train_y)
train_feats_pls = pd.DataFrame(pls.transform(train_pca_df),
                               index=train_features.index)
test_feats_pls = pd.DataFrame(pls.transform(test_pca_df),
                              index=test_features.index)

#%% Replace pca feats with new feats
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_features = train_features.drop([feat_label], axis=1)
    test_features = test_features.drop([feat_label], axis=1)
train_features = pd.concat([train_features, train_feats_pls], axis=1)
test_features = pd.concat([test_features, test_feats_pls], axis=1)

#%% Logistic Regression on the initial features

lr = LogisticRegression()
Example #17
def plot_pcr_vs_pls():
    rng = np.random.RandomState(0)
    n_samples = 500
    cov = [[3, 3], [3, 4]]
    X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
    pca = PCA(n_components=2).fit(X)

    plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')
    for i, (comp,
            var) in enumerate(zip(pca.components_, pca.explained_variance_)):
        comp = comp * var  # scale component by its variance explanation power
        plt.plot([0, comp[0]], [0, comp[1]],
                 label=f"Component {i}",
                 linewidth=5,
                 color=f"C{i + 2}")
    plt.gca().set(aspect='equal',
                  title="2-dimensional dataset with principal components",
                  xlabel='first feature',
                  ylabel='second feature')
    plt.legend()
    plt.show()

    y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))

    axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)
    axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')
    axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)
    axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')
    plt.tight_layout()
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    pcr = make_pipeline(StandardScaler(), PCA(n_components=1),
                        LinearRegression())
    pcr.fit(X_train, y_train)
    pca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline

    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(pca.transform(X_test),
                    y_test,
                    alpha=.3,
                    label='ground truth')
    axes[0].scatter(pca.transform(X_test),
                    pcr.predict(X_test),
                    alpha=.3,
                    label='predictions')
    axes[0].set(xlabel='Projected data onto first PCA component',
                ylabel='y',
                title='PCR / PCA')
    axes[0].legend()
    axes[1].scatter(pls.transform(X_test),
                    y_test,
                    alpha=.3,
                    label='ground truth')
    axes[1].scatter(pls.transform(X_test),
                    pls.predict(X_test),
                    alpha=.3,
                    label='predictions')
    axes[1].set(xlabel='Projected data onto first PLS component',
                ylabel='y',
                title='PLS')
    axes[1].legend()
    plt.tight_layout()
    plt.show()

    print(f"PCR r-squared {pcr.score(X_test, y_test):.3f}")
    print(f"PLS r-squared {pls.score(X_test, y_test):.3f}")

    pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
    pca_2.fit(X_train, y_train)
    print(f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}")
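
# A sketch of running the demo (assumes the script-level imports from the
# original scikit-learn example are present: numpy, matplotlib, PCA,
# PLSRegression, train_test_split, make_pipeline, StandardScaler,
# LinearRegression):
if __name__ == '__main__':
    plot_pcr_vs_pls()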
Example #18
class PLS:

    def __init__(self, params):
        self.name = "pls"
        self.model = PLSRegression(n_components=params['n_components'])
        self.target_col = None
        
    def _format_data(self, data_map):  
        if self.target_col is None:
            raise ValueError("Target col is None!")
        order = sorted(list(data_map.keys()))
        X = {k:data_map[k] for k in order if k != self.target_col}
        inputs = np.concatenate([X[k] for k in X], axis=1)
        return inputs

    def fit(self, train_map, target_col, valid_fraction=0.2, use_cv=False):
        print("Formatting data")
        self.target_col = target_col

        y = train_map[target_col].values
        X = self._format_data(train_map)

        splitpoint = int(y.shape[0]*(1-valid_fraction))

        y_valid, X_valid = y[splitpoint:], X[splitpoint:]
        y, X = y[:splitpoint], X[:splitpoint]
        
        if use_cv:
            print('Running grid search')
            param_dist = self.get_hyperparam_ranges()
            self.model = select.GridSearchCV(self.model,
                                     param_grid=param_dist,
                                     cv=3,
                                     n_jobs=2)
        
        print("Fitting PLS")
        self.model.fit(X, y)

        if valid_fraction != 0:
            print("Scoring on validation data")
            r2 = self.model.score(X_valid, y_valid)

            print("R2 for PLS:", r2)
            return r2
        else:
            print("No validation data")
            return 0.0

    def predict(self, data_map):
        X = self._format_data(data_map)
        return self.model.predict(X)

    def get_latents(self, data_map):
        X = self._format_data(data_map)
        return self.model.transform(X)

    def get_save_name(self, model_folder):
        return os.path.join(model_folder, self.name+".joblib")

    def save(self, model_folder):
        name = self.get_save_name(model_folder)
        joblib.dump(self.model, name)

    def load(self, model_folder):
        name = self.get_save_name(model_folder)
        self.model = joblib.load(name)
    
    @classmethod
    def get_hyperparam_ranges(cls):
        params = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 40]}
        return params
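
# Hedged usage sketch (all names illustrative): the class expects a dict of
# row-aligned pandas objects, with the target stored under target_col.
import numpy as np
import pandas as pd
rng = np.random.RandomState(0)
train_map = {
    'feats_a': pd.DataFrame(rng.randn(100, 3)),
    'feats_b': pd.DataFrame(rng.randn(100, 2)),
    'target': pd.DataFrame(rng.randn(100, 1)),
}
model = PLS({'n_components': 2})
r2 = model.fit(train_map, target_col='target', valid_fraction=0.2)
latents = model.get_latents(train_map)  # PLS scores for the inputs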
Example #19
def plsda(df, a, b, n_components=2, mean_center=False, scale=True, **kwargs):
    """
    Partial Least Squares Discriminant Analysis, based on `sklearn.cross_decomposition.PLSRegression`

    Performs a binary group partial least squares discriminant analysis (PLS-DA) on the supplied
    dataframe, selecting the first ``n_components``.

    Sample groups are defined by the selectors ``a`` and ``b`` which are used to select columns
    from the supplied dataframe. The result model is applied to the entire dataset,
    projecting non-selected samples into the same space.

    For more information on PLS regression and the algorithm used, see the `scikit-learn documentation <http://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html>`_.

    :param df: Pandas ``DataFrame`` to perform the analysis on
    :param a: Column selector for group a
    :param b: Column selector for group b
    :param n_components: ``int`` number of components to select
    :param mean_center: ``bool`` mean center the data before performing PLS regression
    :param scale: ``bool`` scale the data (passed through to ``PLSRegression``)
    :param kwargs: additional keyword arguments to `sklearn.cross_decomposition.PLSRegression`
    :return: scores ``DataFrame`` of PLSDA scores n_components x n_samples
             weights ``DataFrame`` of PLSDA weights n_variables x n_components
             loadings ``DataFrame`` of PLSDA loadings n_variables x n_components
    """

    if not sklearn:
        raise ImportError(
            'This library depends on scikit-learn (sklearn) to perform PLS-DA')

    from sklearn.cross_decomposition import PLSRegression

    df = df.copy()

    # We have to zero fill, nan errors in PLSRegression
    df[np.isnan(df)] = 0

    if mean_center:
        mean = np.mean(df.values, axis=0)
        df = df - mean

    sxa, _ = df.columns.get_loc_level(a)
    sxb, _ = df.columns.get_loc_level(b)

    dfa = df.iloc[:, sxa]
    dfb = df.iloc[:, sxb]

    dff = pd.concat([dfa, dfb], axis=1)
    y = np.ones(dff.shape[1])
    y[np.arange(dfa.shape[1])] = 0

    plsr = PLSRegression(n_components=n_components, scale=scale, **kwargs)
    plsr.fit(dff.values.T, y)

    # Apply the generated model to the original data
    x_scores = plsr.transform(df.values.T)

    scores = pd.DataFrame(x_scores.T)
    scores.index = [
        'Latent Variable %d' % (n + 1) for n in range(0, scores.shape[0])
    ]
    scores.columns = df.columns

    weights = pd.DataFrame(plsr.x_weights_)
    weights.index = df.index
    weights.columns = [
        'Weights on Latent Variable %d' % (n + 1)
        for n in range(0, weights.shape[1])
    ]

    loadings = pd.DataFrame(plsr.x_loadings_)
    loadings.index = df.index
    loadings.columns = [
        'Loadings on Latent Variable %d' % (n + 1)
        for n in range(0, loadings.shape[1])
    ]

    return scores, weights, loadings
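
# Hedged usage sketch: plsda expects samples in columns, with a column
# MultiIndex whose first level names the groups selected by a and b
# (the group names and sizes below are illustrative).
import numpy as np
import pandas as pd
rng = np.random.RandomState(0)
cols = pd.MultiIndex.from_product([['Control', 'Treated'], range(5)],
                                  names=['Group', 'Rep'])
df = pd.DataFrame(rng.randn(20, 10), columns=cols)  # 20 variables x 10 samples
scores, weights, loadings = plsda(df, 'Control', 'Treated', n_components=2)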
Example #20
        x3 = lda.transform(x[index_test])
        model.fit(x2, y[index_train])
        predict = model.predict(x3)
        accuracy_lda = metrics.accuracy_score(y[index_test], predict)
        cv_lda[count2] = 1 - accuracy_lda
        count2 += 1
    lda_score[count] = cv_lda.mean()
    lda_std[count] = cv_lda.std()
    # pls
    cv_pls = np.zeros(times)
    count2 = 0
    for train, test in kf.split(index):
        index_train = index[train]
        index_test = index[test]
        pls = PLSRegression(n_components=i).fit(x[index_train], y[index_train])
        x2 = pls.transform(x[index_train])
        x3 = pls.transform(x[index_test])
        model.fit(x2, y[index_train])
        predict = model.predict(x3)
        accuracy_pls = metrics.accuracy_score(y[index_test], predict)
        cv_pls[count2] = 1 - accuracy_pls
        count2 += 1
    pls_score[count] = cv_pls.mean()
    pls_std[count] = cv_pls.std()

    print('Dimensionality: %d' % i)
    print('Classification error after PCA: %0.2f (+/- %0.2f)' % (cv_pca.mean(), cv_pca.std() * 2))
    print('Classification error after MDS: %0.2f (+/- %0.2f)' % (cv_MDS.mean(), cv_MDS.std() * 2))
    print('Classification error after Isomap: %0.2f (+/- %0.2f)' % (cv_Isomap.mean(), cv_Isomap.std() * 2))
    print('Classification error after LDA: %0.2f (+/- %0.2f)' % (cv_lda.mean(), cv_lda.std() * 2))
Example #21
                pc1 = tmp[:,i]
                pc2 = tmp[:,j]
                plt.scatter(pc1, pc2)
            plt.xlabel("PLS Component "+str(i+1))
            plt.ylabel("PLS Component "+str(j+1))
            
    plt.show()
    


##################### MAIN CODE #####################
#### Load data into numpy array
# Keep pandas just for convenience right now
data = load_data.loadDataPandas('../data/SCLC_study_output_filtered_2.csv')
d = data.to_numpy()
var_index = data.columns.values.tolist()

# vector of class responses associated with data
resp = load_data.getResponseMatrix2D()

#### Create object to normalize and un-normalize data
norm_trans = pre.StandardScaler().fit(d)
data_norm = norm_trans.transform(d)
#data_norm, norm_trans = pre.mean_center(d) 
#In-built preprocessing method - TBD

#### Fit a Partial Least Squares model
pls = PLS().fit(data_norm, resp)
pls_trans = pls.transform(data_norm)

plotProjectionScatterMultiClass(pls_trans, resp, 2)
Example #22
pls_components = range(1, 18)

cv_pls = np.array([])
for m in pls_components:
    pls = PLSRegression(n_components=m)
    transformed_college_train_x = pls.fit_transform(college_train_x,
                                                    college_train_y)[0]
    pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x,
                            college_train_y).mean()
    cv_pls = np.append(cv_pls, pls_this_rmse)

min_m = pls_components[np.argmin(cv_pls)]
cv_pls = pd.Series(cv_pls, index=pls_components)
cv_pls.plot(title="PLSRegression Cross Validation")
plt.xlabel("Number of Components (M)")
plt.ylabel("Root Mean Square Error")
if show_plots_flag:
    plt.show()

best_pls = PLSRegression(n_components=min_m)
transformed_college_train_x = best_pls.fit_transform(college_train_x,
                                                     college_train_y)[0]
transformed_college_test_x = best_pls.transform(college_test_x)
lrm = LinearRegression()
lrm.fit(transformed_college_train_x, college_train_y)
print("\nPLSRegression Regression test RMSE (M = " + str(min_m) + ")")
print(rmse(lrm, transformed_college_test_x, college_test_y))
Example #23
    data_x += a

    #Split the feature vector
    for sample in a:
        for i in range(subset):
            fc7_x[i].append(sample[i * offset:(i + 1) * offset])

    #Create the labels refering to the selected data
    data_y += [k] * len(a)

#With PLS the results improve in accuracy and computational time
pls = PLSRegression(n_components=10, scale=True)

for i in range(subset):
    pls.fit(fc7_x[i], data_y)
    fc7_x[i] = pls.transform(fc7_x[i])

fc7_X_train = [None] * subset
fc7_X_test = [None] * subset
fc7_y_train = [None] * subset
fc7_y_test = [None] * subset

#Generate train/test splits for all subsets
for i in range(subset):
    fc7_X_train[i], fc7_X_test[i], fc7_y_train[i], fc7_y_test[
        i] = train_test_split(fc7_x[i],
                              data_y,
                              test_size=0.33,
                              random_state=42)

#Create parameters to choose in the grid search
Example #24
    LDA_centroids = lda.means_    # Centroids of the classes (n_class, n_features)

    Xtrain_LDA = lda.transform(Xtrain)
    Xtest_LDA = lda.transform(Xtest)


# PLS 
if (FE_PLS == 1):
    from sklearn.cross_decomposition import PLSSVD,PLSCanonical,PLSRegression
    pls = PLSRegression(n_components = n_comp)
    
    pls.fit(Xtrain,Ytrain_m)
    
    PLS_weights = pls.x_weights_.T
    
    Xtrain_PLS = pls.transform(Xtrain)
    Xtest_PLS = pls.transform(Xtest)
    
Xtrain = Xtrain_LDA
Xtest = Xtest_LDA
      
#######################################################################
#######################################################################
##                           FEATURE SELECTION
#######################################################################
#######################################################################

from sklearn.ensemble import ExtraTreesClassifier
from sklearn import random_projection
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
Example #25
            #print "yp_t_not ", yp_t_not.shape
            pls.fit(Xp_t, yp_t_not.astype(int))
            yp_new = pls.predict(Xp_t, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            yp_t = yp_t.astype(int)
            #print y_new,y_pred, y_t
            error = ((yp_t - yp_pred) ** 2).sum()
            print("PLS Training error ", float(error) / yp_t.shape[0])
            yp_new = pls.predict(Xp_v, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            #print y_new, y_pred, y_v
            #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
            error = ((yp_v - yp_pred) ** 2).sum()
            print("PLS Validation error ", float(error) / yp_v.shape[0])

            X_new = pls.transform(X)
            rf = RandomForestClassifier(n_estimators=500, max_depth=None,
                                        max_features=int(math.sqrt(n_components)),
                                        min_samples_split=100,
                                        random_state=144, n_jobs=4)
            #print "shapes ", X_new.shape, y.shape
            #print X_new,y
            X_t, X_v, y_t, y_v = tts(X_new, yd, train_size=0.85)

            rf.fit(X_t, y_t)
            print("Random Forest Classifier: ", rf.get_params())
            print("Covariance Classifier Training score: ", rf.score(X_t, y_t))
            print("Covariance Classifier Validation score: ", rf.score(X_v, y_v))
            #print "Class prob: ", zip(rf.predict_proba(X_v), y_v)

            sample_weights = rf.predict_proba(pls.transform(Xp_t))[:, 1]
            print(sample_weights.shape)
            sample_weights = abs(sample_weights - 0.5)
Example #26
plt.show()

# CCA
from sklearn.cross_decomposition import CCA
cca = CCA(n_components=2)
cca.fit(X, Y)
X_cca = cca.transform(X)

plt.plot(X_cca[0:50,0],X_cca[0:50,1],'o',label='setosa')
plt.plot(X_cca[50:100,0],X_cca[50:100,1],'o',label='versicolor')
plt.plot(X_cca[100:150,0],X_cca[100:150,1],'o',label='virginica')
plt.xlim([-8,9])
plt.ylim([-4,4])
plt.title('CCA')
plt.legend(loc='lower right')
plt.show()

# PLS
from sklearn.cross_decomposition import PLSRegression
pls2 = PLSRegression(n_components=2)
pls2.fit(X, Y)
X_pls = pls2.transform(X)

plt.plot(X_pls[0:50,0],X_pls[0:50,1],'o',label='setosa')
plt.plot(X_pls[50:100,0],X_pls[50:100,1],'o',label='versicolor')
plt.plot(X_pls[100:150,0],X_pls[100:150,1],'o',label='virginica')
plt.xlim([-3,3])
plt.ylim([-1,1])
plt.title('PLS')
plt.legend(loc='lower right')
plt.show()
Example #27
    original_dataset = pd.read_csv(settings.TRAIN_FILE)
    target = FeatureColumnsExtractor(settings.TARGET).fit_transform(
        original_dataset).apply(lambda x: np.sqrt(x))

    feature_union = get_feature_union()
    dataset = feature_union.fit_transform(original_dataset, target)
    var_thresh = VarianceThreshold(threshold=0.02)
    dataset = var_thresh.fit_transform(dataset)

    high_corr = HighCorrelationFilter(threshold=0.82)
    dataset = high_corr.fit_transform(dataset)

    n_components = 17
    pls = PLSRegression(n_components=n_components)
    pls.fit(dataset, target)
    dataset_ = pls.transform(dataset)

    estimators = get_estimation_pipeline()
    estimators.fit(dataset_, target)

    original_test_set = pd.read_csv(settings.TEST_FILE)
    # test_set = get_preprocessing_pipeline().fit_transform(original_test_set)
    test_set = feature_union.transform(original_test_set)

    test_set = var_thresh.transform(test_set)
    test_set = high_corr.transform(test_set)

    test_set_ = pls.transform(test_set)

    predictions = estimators.predict(test_set_)
    output = pd.DataFrame({
Example #28
File: PLS.py Project: imatge-upc/VNeAT
    def __fit__(self,
                correctors,
                predictors,
                observations,
                n_jobs=-1,
                *args,
                **kwargs):
        '''Computes the correction and prediction parameters that best fit the observations according to the
            Partial Least Squares method

            Parameters:

                - correctors: NxC (2-dimensional) matrix, representing the covariates, i.e., features that
                    (may) explain a part of the observational data in which we are not interested, where C
                    is the number of correctors and N the number of elements for each corrector.

                - predictors: NxR (2-dimensional) matrix, representing the predictors, i.e., features to be used
                    to try to explain/predict the observations (experimental data), where R is the number of
                    predictors and N the number of elements for each predictor (the latter is ensured to be the
                    same as that in the 'correctors' argument).

                - observations: NxM (2-dimensional) matrix, representing the observational data, i.e., values
                    obtained by measuring the variables of interest, whose behaviour is wanted to be explained
                    by the correctors and predictors, where M is the number of variables and N the number of
                    observations for each variable (the latter is ensured to be the same as those in the
                    'correctors' and the 'predictors' arguments).


                - n_jobs: integer (default -1), indicating the number of threads to be used by the
                    algorithm. If set to -1, all CPUs are used. This will only provide speed-up for M > 1 and
                    sufficiently large problems.

            Returns:

                - Correction parameters: (num_comp+2)*CxM (3-dimensional) matrix, representing the parameters that best fit
                    the correctors to the observations for each variable, where M is the number of variables
                    (same as that in the 'observations' argument) and C is the number of correction parameters
                    for each variable (same as the number of correctors).

                - Regression parameters: ((num_comp+2)*R + 2)xM (3-dimensional) matrix, representing the parameters that best fit
                    the predictors to the corrected observations for each variable, where M is the number of
                    variables (same as that in the 'observations' argument) and R is the number of prediction
                    parameters for each variable (same as the number of predictors).
                    The first dimension correspond to (x_rotations, coef, x_mean, y_mean, num_components)
        '''

        # All-at-once approach
        pls_corr = PLSRegression(n_components=self.num_components_corr,
                                 scale=False)
        pls_pred = PLSRegression(n_components=self.num_components_pred,
                                 scale=False)

        M = observations.shape[1]
        R = predictors.shape[1]

        if correctors.size != 0:
            cparams = np.zeros((R * (self.num_components_pred + 2) + 3, M))
            for n in range(M):
                if np.std(observations[:, n]) == 0:
                    continue
                pls_corr.fit(correctors, observations[:, n])
                observations[:, n] = observations[:, n] - np.dot(
                    pls_corr.transform(correctors), pls_corr.y_loadings_.T)

                cparams[:R * self.num_components_corr,
                        n] = pls_corr.x_rotations_.reshape((-1, ))
                cparams[R * self.num_components_corr:R *
                        (self.num_components_corr + 1),
                        n] = pls_corr.coef_.reshape((-1, ))
                cparams[R * (self.num_components_corr + 1):-2,
                        n] = pls_corr.x_mean_.reshape((-1, ))
                cparams[-3, n] = pls_corr.y_mean_.reshape((-1, ))
                cparams[-2, n] = correctors.shape[1]
                cparams[-1, n] = self.num_components_corr
                cparams = np.concatenate((pls_corr.x_rotations_[np.newaxis],
                                          pls_corr.y_loadings_[np.newaxis],
                                          pls_corr.x_mean_[np.newaxis],
                                          pls_corr.y_mean_[np.newaxis]),
                                         axis=0)
        else:
            cparams = np.asarray([[]])

        if predictors.size != 0:
            pparams = np.zeros(
                ((R + 1) * (self.num_components_pred + 1) + R + 2, M))
            for n in range(M):
                if np.std(observations[:, n]) == 0:
                    pparams[-3, n] = np.mean(observations[:, n]).reshape(
                        (-1, ))
                    continue
                pls_pred.fit(predictors, observations[:, n])
                pparams[:R * self.num_components_pred,
                        n] = pls_pred.x_rotations_.reshape((-1, ))
                pparams[R * self.num_components_pred:(R + 1) *
                        self.num_components_pred,
                        n] = pls_pred.y_rotations_.reshape((-1, ))
                pparams[(R + 1) * self.num_components_pred:(R + 1) *
                        self.num_components_pred + R,
                        n] = pls_pred.coef_.reshape((-1, ))
                pparams[(R + 1) * self.num_components_pred + R:-3,
                        n] = pls_pred.x_mean_.reshape((-1, ))
                pparams[-3, n] = pls_pred.y_mean_.reshape((-1, ))
                pparams[-2, n] = R
                pparams[-1, n] = self.num_components_pred

        else:
            pparams = np.asarray([[]])

        return (cparams, pparams)
Example #29
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_classification
from pls_gpu import PLSGPU
import time


if __name__ == '__main__':
    np.random.seed(12227)

    X, y = make_classification(n_samples=10000, n_features=3000, n_classes=2, n_clusters_per_class=1)

    pls = PLSRegression(n_components=10)
    pls.fit(X, y)
    start = time.time()
    pls.transform(X)
    end = time.time()
    print('Projection time PLS [{:.4f}]'.format(end-start))

    pls_gpu = PLSGPU(pls, batch_size=X.shape[0])
    start = time.time()
    pls_gpu.transform(X)
    end = time.time()
    print('Projection time PLSGPU [{:.4f}]'.format(end - start))
Example #30
xt,yt = plscan.fit_transform(dataTrain,Ytrain)
fig = plt.figure()
util.plotData(fig,xt,labelsTrain,classColors)

u = plscan.x_weights_
plt.quiver(u[0,0],u[1,0],color='k',edgecolor='k',lw=1,scale=0.1,figure=fig)
plt.quiver(-u[1,0],u[0,0],color='k',edgecolor='k',lw=1,scale=0.4,figure=fig)

#%% PLS2
lda = LDA()
nComponents = np.arange(1,nFeatures,8)
pls2Scores = np.zeros((2, len(nComponents)))
for i,n in enumerate(nComponents):
    pls2 = PLSRegression(n_components=n)
    pls2.fit(dataTrain,Ytrain)
    dataTrainT = pls2.transform(dataTrain)
    dataTestT = pls2.transform(dataTest)
    pls2Scores[:,i] = util.classify(dataTrainT,dataTestT,labelsTrain,labelsTest)

pls2 = PLSRegression(n_components=2)
xtPLS,yt = pls2.fit_transform(dataTrain,Ytrain)

uPLS = pls2.x_weights_

#%% Canonical Correlation Analysis
nComponents = np.arange(1,nClasses+1)
cca = CCA(n_components=nClasses)
cca.fit(dataTrain,Ytrain)
dataTrainT = cca.transform(dataTrain)
dataTestT = cca.transform(dataTest)
ccaScores = np.zeros((2, len(nComponents)))
Example #31

# In[136]:


# Split data to train and test on 50-50 ratio
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=None)


# In[137]:


pls = PLSRegression(n_components=27)
# fit against the training targets; the original passed X_test here, which
# only type-checked by accident because the 50-50 split made the shapes match
X_pls = pls.fit_transform(X_train, y_train)
x2 = pls.transform(x)


# In[138]:


x2=pd.DataFrame(x2)
print(x2)
#x2= NormalizeData(x2)
#print(X_pls)
#two_arrays = X_pls
#datapls = np.hstack(two_arrays)
#np.savetxt('lungcancerpls111.csv', datapls, delimiter=',')


# In[139]:
Example #32
            #print "yp_t_not ", yp_t_not.shape
            pls.fit(Xp_t, yp_t_not.astype(int))
            yp_new = pls.predict(Xp_t, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            yp_t = yp_t.astype(int)
            #print y_new,y_pred, y_t
            error = ((yp_t - yp_pred)**2).sum()
            print("PLS Training error ", float(error) / yp_t.shape[0])
            yp_new = pls.predict(Xp_v, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            #print y_new, y_pred, y_v
            #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
            error = ((yp_v - yp_pred)**2).sum()
            print("PLS Validation error ", float(error) / yp_v.shape[0])

            X_new = pls.transform(X)
            rf = RandomForestClassifier(n_estimators=500,
                                        max_depth=None,
                                        max_features=int(
                                            math.sqrt(n_components)),
                                        min_samples_split=100,
                                        random_state=144,
                                        n_jobs=4)
            #print "shapes ", X_new.shape, y.shape
            #print X_new,y
            X_t, X_v, y_t, y_v = tts(X_new, yd, train_size=0.85)

            rf.fit(X_t, y_t)
            print("Random Forest Classifier: ", rf.get_params())
            print("Covariance Classifier Training score: ", rf.score(X_t, y_t))
            print("Covariance Classifier Validation score: ", rf.score(
Example #33
class metamodel():
    def __init__(self,
                 X,
                 y,
                 bounds=None,
                 testfunction=None,
                 reg=None,
                 name='',
                 testPoints=None,
                 MLEP=True,
                 normtype='std',
                 Lambda=0.01,
                 PLS=False,
                 PLS_order=2,
                 **kwargs):

        self.X_orig = copy.deepcopy(X)
        self.y_orig = copy.deepcopy(y)
        self.X = copy.deepcopy(X)
        self.y = copy.deepcopy(y)

        self.testfunction = testfunction
        self.flag_penal = MLEP
        self.bounds = bounds
        self.name = name
        self.n = self.X.shape[0]  # Nr points
        self.k = self.X.shape[1]  # nr dimensions

        self.non_feasible_mc = None
        self.feasible_mc = None
        self.feasible_y_mc = None
        self.non_feasible_y_mc = None
        self.non_feasible = None
        self.feasible = None
        self.feasible_y = None
        self.non_feasible_y = None

        self.Lambda = 0
        self.sigma = 0

        self.normtype = normtype  # 'std' normalizes to unit variance, else to the interval [0, 1]
        self.normRange = []
        self.ynormRange = []
        self.normalizeData()  # normalizes the input data!
        self.PLS = PLS
        self.pls2 = None
        self.PLS_order = PLS_order

        if self.PLS_order > self.X_orig.shape[1]:
            raise ValueError('PLS_order is higher than the dimension of the problem')

        # lower so that it fits to at least a 3**dim grid!
        if self.n > 3**self.PLS_order:
            self.PLS_order = PLS_order
        else:
            self.PLS_order = int(np.floor(np.log(self.n) / np.log(3)))

        if self.PLS:
            # Compute all directions, reduction is done in later step!
            self.pls2 = PLSRegression(n_components=self.PLS_order)
            # if self.k == 1:
            # self.pls2 = PLSRegression(n_components=1)
            # elif self.k == 2:
            # self.pls2 = PLSRegression(n_components=2)
            # elif self.k > 2:
            # self.pls2 = PLSRegression(n_components=3)
            # else:
            # raise ValueError

            self.pls2.fit(self.X, self.y)
            self.X = self.pls2.transform(self.X)
            # self.X = self.PLS_trans(self.X)

        try:
            self.k = self.X.shape[1]
        except:
            self.k = 1
            self.X = self.X.reshape(-1, 1)

        self.theta = np.ones(self.k)
        self.pl = np.ones(self.k) * 2.
        self.sp = sp(self.k)
        self.reg = reg
        # self.updateData()
        # self.updateModel()
        self.thetamin = 1
        self.thetamax = 15
        self.pmin = 1.7
        self.pmax = 2.3
        self.pl = np.ones(self.k) * 2
        self.Lambda_min = 0.01  #1e-2
        self.Lambda_max = 0.1
        self.Lambda = Lambda  #0.1 #0.03
        # regression order

    def PLS_trans(self, X):
        # The PLS regression computes a new basis onto which the data is
        # rotated; solve for the coordinates in that basis.
        bm = self.pls2.x_rotations_  # full rotation
        try:
            Xt = np.linalg.solve(bm, X.T).T
        except np.linalg.LinAlgError:
            print(traceback.format_exc())
            raise
        # Pick out only the first PLS_order components of this vector
        if np.isscalar(X[0]):
            raise ValueError('X must be a 2-dimensional array of points')

        Xt = Xt[:, :self.PLS_order]  # does not work for pointwise data
        return Xt

    def PLS_inv_rot(self, X):
        bm = self.pls2.x_rotations_  # full rotation
        Xr = np.dot(bm, X.T).T
        return Xr

    def normX(self, X):
        '''    
        :param X: An array of points (self.k long) in physical world units
        :return X: An array normed to our model range of [0,1] for each dimension
        '''

        scalar = False
        if np.isscalar(X[0]):
            X = [X]
            scalar = True

        X_norm = np.ones(np.shape(X)) * np.nan
        for i, row in enumerate(X):  # for every row
            for j, elem in enumerate(row):  # for every element in every row
                if self.normtype == 'std':  # with standard deviation one!
                    X_norm[i,
                           j] = (elem -
                                 self.normRange[j][0]) / self.normRange[j][1]
                else:  # in interval [0,1]
                    X_norm[i, j] = (elem - self.normRange[j][0]) / float(
                        self.normRange[j][1] - self.normRange[j][0])

        if scalar:  # unpack
            [X_norm] = X_norm
            return X_norm

        else:
            return X_norm

    def inversenormX(self, X):
        '''
        :param X: An array of points (with self.k elem) in normalized model units
        :return X : An array of real world units
        '''

        scalar = False
        if np.isscalar(X[0]):
            X = [X]
            scalar = True

        X_inv = np.ones(np.shape(X)) * np.nan
        for i, row in enumerate(X):  # for every row
            for j, elem in enumerate(row):  # for every element in every row
                if self.normtype == 'std':
                    X_inv[i,
                          j] = self.normRange[j][0] + elem * self.normRange[j][
                              1]  # x = mu + u*std(X)
                else:
                    X_inv[i, j] = (elem * float(self.normRange[j][1] -
                                                self.normRange[j][0])
                                   ) + self.normRange[j][0]

        if scalar:  # unpack
            [X_inv] = X_inv
            return X_inv
        else:
            return X_inv

    def normy(self, y):
        '''
        :param y: An array of observed values in real-world units
        :return y: A normalized array of model units in the range of [0,1]
        '''
        if self.normtype == 'std':
            return (y - self.ynormRange[0]
                    ) / self.ynormRange[1]  # u = (x-mu)/std(X)
        else:
            return (y - self.ynormRange[0]) / (self.ynormRange[1] -
                                               self.ynormRange[0])

    def inversenormy(self, y):
        '''
        :param y: A normalized array of model units in the range of [0,1]
        :return: An array of observed values in real-world units
        '''
        if self.normtype == 'std':
            return self.ynormRange[0] + y * self.ynormRange[
                1]  # x = mu + u * std(X)
        else:
            return (
                y *
                (self.ynormRange[1] - self.ynormRange[0])) + self.ynormRange[0]

    def normalizeData(self):
        '''
        This function is called when the initial data in the model is set.
        We find the max and min of each dimension and norm that axis to a range of [0,1]
        '''
        # lower and upper bound of data.
        for i in range(self.X.shape[1]
                       ):  # self.k can be smth different if PLS is used!
            if self.normtype == 'std':
                self.normRange.append([
                    np.mean(self.X[:, i]),
                    np.std(self.X[:, i], dtype=np.float64)
                ])
            else:  # determine the intervals
                self.normRange.append([min(self.X[:, i]), max(self.X[:, i])])

        # Normalize data
        self.X = self.normX(self.X)

        if self.normtype == 'std':
            self.ynormRange.append(np.mean(self.y))
            self.ynormRange.append(np.std(self.y, dtype=np.float64))
        else:  # determine the intervals
            self.ynormRange.append(min(self.y))
            self.ynormRange.append(max(self.y))

        for i in range(self.n):
            self.y[i] = self.normy(self.y[i])

    def animate(self, animate=False):
        if animate:

            def init():
                ax.set_xlim([0, 1])
                ax.set_ylim([0, 1])
                ax.set_zlim([0, 250])
                ax.plot_wireframe(X,
                                  Y,
                                  Z,
                                  rstride=3,
                                  cstride=3,
                                  label='Metamodel')
                ax.scatter(spx,
                           spy,
                           self.inversenormy(self.y),
                           color='k',
                           label='Experiments')
                ax.legend(prop={'size': 20})
                if self.testfunction is not None:
                    ax.plot_surface(X,
                                    Y,
                                    ZT,
                                    rstride=3,
                                    cstride=3,
                                    alpha=0.5,
                                    cmap='jet')
                ax.set_xlabel('$X_1$')
                ax.set_ylabel('$X_2$')
                ax.set_zlabel('$\mathbf{G}(X_1, X_2)$')

                # ax.legend()
                return fig,

            def animate(i):
                ax.view_init(elev=10., azim=i)
                return fig,

            # Animate
            anim = animation.FuncAnimation(fig,
                                           animate,
                                           init_func=init,
                                           frames=360,
                                           interval=20,
                                           blit=True)
            # Save
            anim.save(
                r'C:\Users\pettlind\Dropbox\KTH\PhD\Article2\animate\animation.mp4',
                fps=30,
                extra_args=['-vcodec', 'libx264'])

        raise NotImplementedError()

    def plot(self,
             fig=None,
             ax=None,
             labels=False,
             show=True,
             animate=False,
             only_points=False,
             name=None,
             PF=False,
             bounds=None):
        '''
        Plots 2D and 3D models.
        :param show: If True, the plots are displayed at the end of this call.
            If False, plt.show() should be called outside this function.
        :param bounds: optional real-world plotting bounds per input dimension
        :return:
        On missing movie writers for animations, see
        https://stackoverflow.com/questions/13316397/matplotlib-animation-no-moviewriters-available
        '''

        if self.X_orig.shape[1] > 2:  # high-dimensional case: pairwise contour slices
            dim = self.X_orig.shape[1]

            # Multisubplot!

            def comp(x1, x2, x0, bounds):
                '''Compute the model variation in two variables at a time.
                Input:
                x1 - index of the first variable
                x2 - index of the second variable
                bounds - bounds for all variables
                x0 - nominal value for the remaining variables'''

                x = np.linspace(bounds[x1][0], bounds[x1][1], num=20)
                y = np.linspace(bounds[x2][0], bounds[x2][1], num=20)

                X, Y = np.meshgrid(x, y)

                modeldata = np.asarray([np.ravel(X),
                                        np.ravel(Y)]).T  # 2d up to here

                pos = np.arange(dim)
                bol1 = pos == x1
                bol2 = pos == x2
                modeldata_upd = np.ones((modeldata.shape[0], dim)) * np.nan
                test_data = copy.copy(modeldata_upd)

                for ii, xa in enumerate(modeldata):
                    temp = copy.copy(x0)
                    temp[bol1] = xa[0]
                    temp[bol2] = xa[1]
                    modeldata_upd[ii] = copy.copy(temp)
                    test_data[ii] = copy.copy(temp)

                # predict on the normalized, PLS-projected points
                zs = self.predict(self.pls2.transform(
                    self.normX(modeldata_upd)),
                                  norm=False)
                Z = zs.reshape(X.shape)  # non-normed
                zt = self.testfunction(test_data)
                ZT = zt.reshape(X.shape)
                return Z, ZT

            fig, axs = plt.subplots(dim - 1,
                                    dim - 1,
                                    sharex='col',
                                    sharey='row')
            # Plot
            x = np.linspace(0, 1, num=20)
            y = np.linspace(0, 1, num=20)
            X, Y = np.meshgrid(x, y)

            bounds = np.asarray(bounds)
            x0 = 0.5 * bounds[:, 0] + 0.5 * bounds[:, 1]

            num_mat = np.linspace(0, (dim - 1)**2 - 1,
                                  (dim - 1)**2).reshape(dim - 1, dim - 1)
            num_v = []
            for i in range(1, dim):
                for j in range(0, i):
                    Z, ZT = comp(i, j, x0, bounds)
                    num_v.append(num_mat[i - 1, j])
                    # ax = fig.add_subplot(dim - 1, dim - 1, numb)#, projection='3d')
                    # ax.contourf(X, Y, Z, rstride=3, cstride=3, label='Metamodel')
                    # ax.plot_surface(X, Y, ZT, rstride=3, cstride=3, alpha=0.5, cmap='jet')
                    # contour_levels = 10
                    contour_levels = np.linspace(180, 360, 11)
                    CS = axs[i - 1, j].contour(X,
                                               Y,
                                               -Z,
                                               contour_levels,
                                               colors='k',
                                               linestyles='solid',
                                               zorder=2)
                    CT = axs[i - 1, j].contourf(X,
                                                Y,
                                                -ZT,
                                                contour_levels,
                                                cmap='cividis',
                                                zorder=1)
                    axs[i - 1, j].axis('off')

            # Add colorbar
            # Remove empty subplots
            for i in range(0, num_mat.size):
                if not (i == np.asarray(num_v)).any():
                    axs.flat[i].set_visible(False)  # remove these

            # axes = fig.get_axes()[0]
            fig.colorbar(CT, ax=axs.flat)

            # Set common x and y labels
            for ax in axs.flat:
                ax.set(xlabel='x-label', ylabel='y-label')
            # Hide x labels and tick labels for top plots and y ticks for right plots.
            for ax in axs.flat:
                ax.label_outer()

            if show:
                plt.show()

        elif self.k == 2:

            if fig is None:
                fig = plt.figure(figsize=(8, 6))

            # samplePoints = list(zip(*self.inversenormX(self.X_orig)))  # lists of list of every coordiante
            # Create a set of data to plot
            plotgrid = 50
            if bounds is None:
                x = np.linspace(min(self.X[:, 0]),
                                max(self.X[:, 0]),
                                num=plotgrid)
                y = np.linspace(min(self.X[:, 1]),
                                max(self.X[:, 1]),
                                num=plotgrid)
                nor = False

            else:  # boundaries given in real-world units
                x = np.linspace(bounds[0][0], bounds[0][1], num=plotgrid)
                y = np.linspace(bounds[1][0], bounds[1][1], num=plotgrid)

                # Normalize to model units
                for idx, (xp, yp) in enumerate(zip(x, y)):
                    x[idx], y[idx] = self.normX(np.array([xp, yp]))
                nor = False

            X, Y = np.meshgrid(x, y)

            if not only_points:  # evaluate the metamodel at all grid points
                modeldata = np.asarray([np.ravel(X), np.ravel(Y)]).T

                zs = np.array(
                    [self.predict(data, norm=nor) for data in modeldata])
                Z = zs.reshape(X.shape)  # non-normed

            if self.testfunction is not None and self.X_orig.shape[1] == 2:
                testdata = np.array(list(zip(np.ravel(X), np.ravel(Y))))

                if self.PLS:  # rotate according to PLS if True
                    testdata = self.PLS_inv_rot(testdata)

                zt = self.testfunction(self.inversenormX(testdata))
                ZT = zt.reshape(X.shape)

            if ax is None:
                matplotlib.rcParams['font.family'] = "Times New Roman"
                plt.style.use('seaborn-bright')
                fig2 = plt.figure(figsize=(8, 6))
                ax2 = Axes3D(fig2)
                ax2.scatter(self.X[:, 0],
                            self.X[:, 1],
                            self.inversenormy(self.y),
                            color='k',
                            label='Experiments')

                if PF:
                    if self.feasible is not None:  # samples classified via the metamodel in pf()
                        ax2.scatter(self.feasible[:, 0],
                                    self.feasible[:, 1],
                                    self.feasible_y,
                                    color='g',
                                    marker="o",
                                    label='Feasible model')
                        ax2.scatter(self.non_feasible[:, 0],
                                    self.non_feasible[:, 1],
                                    self.non_feasible_y,
                                    color='r',
                                    marker="o",
                                    label='Non Feasible model')

                    if self.feasible_mc is not None:  # Monte Carlo
                        ax2.scatter(self.feasible_mc[:, 0],
                                    self.feasible_mc[:, 1],
                                    self.feasible_y_mc,
                                    color='g',
                                    marker='s',
                                    label='Feasible mc')
                        ax2.scatter(self.non_feasible_mc[:, 0],
                                    self.non_feasible_mc[:, 1],
                                    self.non_feasible_y_mc,
                                    color='r',
                                    marker='s',
                                    label='Non Feasible mc')

                if not only_points:
                    ax2.plot_wireframe(X,
                                       Y,
                                       Z,
                                       rstride=3,
                                       cstride=3,
                                       label='Metamodel')
                    if self.testfunction is not None and self.X_orig.shape[1] == 2:
                        ax2.plot_surface(X,
                                         Y,
                                         ZT,
                                         rstride=3,
                                         cstride=3,
                                         alpha=0.5,
                                         cmap='jet')

                ax2.legend(prop={'size': 20})
                ax2.set_xlabel('$X_1$')
                ax2.set_ylabel('$X_2$')
                ax2.set_zlabel(r'$\mathbf{G}(X_1, X_2)$')
                my_path = os.path.abspath('.')
                plt.savefig(os.path.join(my_path, 'img', name + '.png'),
                            format='png',
                            dpi=1000)

                if show:
                    plt.show()

            else:
                pass


        elif self.k == 1:
            if fig is None:
                fig = plt.figure(figsize=(8, 6))

            # Create a set of data to plot
            plotgrid = 50

            if bounds is None:
                x_vec = np.linspace(self.normRange[0][0],
                                    self.normRange[0][1],
                                    num=plotgrid)
            else:
                xmin, xmax = bounds[0]
                x_vec = np.linspace(xmin, xmax, num=plotgrid)

            # Predict based on the optimized results
            y = np.array([
                self.predict(np.array(x).reshape(1, )) for x in np.ravel(x_vec)
            ])

            plt.plot(x_vec, y, 'ro')

    def pf(self, mu, coe, MC_num, bounds=[], MC=False, PF=False, threshold=0):
        '''
        Computes the probability of failure Pf.

        Input:
        mu - vector of mean values
        coe - vector of coefficients of variation (std = mu * coe)
        MC_num - number of Monte Carlo samples
        bounds - optional truncation bounds per variable
        MC - Bool, if a pure Monte Carlo estimate on the test function is also computed
        PF - Bool, if Pf is estimated on the metamodel
        threshold - failure threshold on the response
        '''
        # Sample points on the surface using MC
        # X = sp(k=self.X_orig.shape[1]).MC(int(MC_num))

        samples = []
        for m, c, bound in zip(mu, coe, bounds):
            vec = np.random.normal(m, m * c, int(MC_num))

            if len(bound) > 0:  # TRUNCATE!
                vec[vec < bound[0]] = bound[0]
                vec[vec > bound[1]] = bound[1]
            samples.append(vec)

        samples = np.asarray(samples).T
        nor = True

        if self.PLS:
            # project the MC samples onto the PLS components
            mtest = self.pls2.transform(self.normX(samples))
        else:
            mtest = self.normX(samples)
        if PF:
            f_vec = np.asarray([self.predict(xs, norm=False)
                                for xs in mtest]).reshape(mtest.shape[0])

            self.feasible = mtest[f_vec > threshold]
            self.non_feasible = mtest[f_vec < threshold]
            self.feasible_y = f_vec[f_vec > threshold]
            self.non_feasible_y = f_vec[f_vec < threshold]

            self.Pf = sum(f_vec < threshold) / float(MC_num)

        if MC:
            f_mc = np.asarray(self.testfunction(samples)).reshape(
                mtest.shape[0])
            self.Mc = sum(f_mc < threshold) / float(MC_num)
            self.feasible_mc = mtest[f_mc > threshold]
            self.non_feasible_mc = mtest[f_mc < threshold]
            self.feasible_y_mc = f_mc[f_mc > threshold]
            self.non_feasible_y_mc = f_mc[f_mc < threshold]

            if np.isnan(f_mc).any():  # Left a sanity check here!
                print('Probably wrong input into aircraft function!')
                raise ValueError()

    def RRMSE_R2(self, k, bounds, n=500):
        '''
        Calculates the relative root-mean-square deviation (RMSD) of the model,
        evaluated at n Latin-hypercube sample points, together with the
        coefficient of determination R^2.
        :param k: number of input dimensions
        :param bounds: real-world bounds per input dimension
        :param n: number of points to sample the model at
        :return: R_sq, RMSD
        '''

        inside = 0
        SS_tot = 0

        # Sample with a Latin hypercube instead of a full grid
        marrays = sp(k=k).rlh(n)
        mravel = np.ones(marrays.shape) * np.nan

        # Scale
        for i in range(k):
            mravel[:, i] = bounds[i][0] + (bounds[i][1] -
                                           bounds[i][0]) * marrays[:, i]

        mtest = copy.deepcopy(mravel)

        if self.PLS:
            # project the sample points onto the PLS components
            mtest = self.pls2.transform(self.normX(mravel))

        f_vec = np.asarray([self.predict(xs, norm=False) for xs in mtest])
        y_vec = self.testfunction(mravel)
        y_bar = np.sum(y_vec) / n  # mean of the true responses

        # https://en.wikipedia.org/wiki/Root-mean-square_deviation
        for f_i, y_i in zip(f_vec, y_vec):
            inside += (f_i - y_i)**2
            SS_tot += (y_i - y_bar)**2

        # https://www.sciencedirect.com/science/article/pii/S1364032115013258?via%3Dihub
        # https://stats.stackexchange.com/questions/260615/what-is-the-difference-between-rrmse-and-rmsre?rq=1
        # https://en.wikipedia.org/wiki/Coefficient_of_determination
        RMSD = np.sqrt(inside / n)
        R_sq = 1 - inside / SS_tot

        if RMSD < 0:  # sanity check; note that R_sq may legitimately be < 0
            raise ValueError('Something off with the error estimate!')

        return R_sq, RMSD
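
A minimal standalone sketch of the two metrics above, using plain NumPy and a
toy surrogate (all names here are illustrative, not part of the class):

import numpy as np

rng = np.random.default_rng(0)

n = 500
x = rng.uniform(0.0, 1.0, size=(n, 2))
y_true = np.sin(3 * x[:, 0]) + x[:, 1] ** 2        # "test function"
y_pred = y_true + rng.normal(0.0, 0.05, size=n)    # imperfect "metamodel"

# RMSD = sqrt(SSE / n) and R^2 = 1 - SSE / SS_tot, as in RRMSE_R2 above
sse = np.sum((y_pred - y_true) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
print('RMSD =', np.sqrt(sse / n), 'R^2 =', 1.0 - sse / ss_tot)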
Example #34
0
File: PLS_NIPALS.py Project: CIMCB/cimcb
class PLS_NIPALS(BaseModel):
    """ Partial least-squares regression using the SIMPLS algorithm.

    Parameters
    ----------
    n_components : int, (default 2)
        Number of components to keep.

    Methods
    -------
    train : Fit model to data.

    test : Apply model to test data.

    evaluate : Evaluate model.

    calc_bootci : Calculate bootstrap intervals for plot_featureimportance.

    plot_featureimportance : Plot coefficient and Variable Importance in Projection (VIP).

    plot_permutation_test : Perform a permutation test and plot.
    """

    parametric = True
    bootlist = [
        "model.vip_", "model.coef_", "model.x_loadings_", "model.x_scores_",
        "Y_pred", "model.pctvar_", "model.y_loadings_", "model.metrics"
    ]  # metrics to bootstrap

    def __init__(self, n_components=2):
        self.model = PLSRegression(
            n_components=n_components)  # Should change this to an empty model
        self.n_component = n_components
        self.k = n_components

        self.__name__ = 'cimcb.model.PLS_NIPALS'
        self.__params__ = {'n_components': n_components}

    def set_params(self, params):
        self.__init__(**params)

    def train(self, X, Y):
        """ Fit the PLS model, save additional stats (as attributes) and return Y predicted values.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Predictor variables, where n_samples is the number of samples and n_features is the number of predictors.

        Y : array-like, shape = [n_samples, 1]
            Response variables, where n_samples is the number of samples.

        Returns
        -------
        y_pred_train : array-like, shape = [n_samples, 1]
            Predicted y score for samples.
        """
        # Fit model
        self.model.fit(X, Y)

        # Calculate pctvar_ (explained variance in Y per component), vip_ and a
        # flattened coef_ for later use

        self.model.pctvar_ = []
        for i in range(self.n_component):
            Y_pred = np.dot(self.model.x_scores_[:, i].reshape(
                -1, 1), self.model.y_loadings_[:, i].reshape(-1, 1).T) * Y.std(
                    axis=0, ddof=1) + Y.mean(axis=0)
            explainedvar = r2_score(Y, Y_pred) * 100
            self.model.pctvar_.append(explainedvar)
        self.model.pctvar_ = np.array(self.model.pctvar_)


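        # Variable Importance in Projection (VIP) for feature i over h components:
        #   VIP_i = sqrt( p * sum_j s_j * (w_ij / ||w_j||)^2 / sum_j s_j )
        # where s_j is the variance explained by component j (diag of T'T Q'Q)
        # and p is the number of features.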
        t = self.model.x_scores_
        w = self.model.x_weights_
        q = self.model.y_loadings_
        p, h = w.shape
        vips = np.zeros((p, ))
        s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1)
        total_s = np.sum(s)
        for i in range(p):
            weight = np.array([(w[i, j] / np.linalg.norm(w[:, j]))**2
                               for j in range(h)])
            vips[i] = np.sqrt(p * (s.T @ weight) / total_s)
        self.model.vip_ = vips
        # Calculate and return Y predicted value
        y_pred_train = self.model.predict(X).flatten()
        self.model.coef_ = self.model.coef_.flatten()

        self.model.y_loadings_ = self.model.y_weights_
        self.model.x_scores = t
        self.Y_pred = y_pred_train  # Y_pred vs. Y_pred_train
        self.Y_true = Y
        self.X = X
        self.Y = Y  # Y vs. Y_true

        self.metrics_key = []
        self.model.eval_metrics_ = []
        bm = binary_evaluation(Y, y_pred_train)
        for key, value in bm.items():
            self.model.eval_metrics_.append(value)
            self.metrics_key.append(key)

        return y_pred_train

    def test(self, X, Y=None):
        """Calculate and return Y predicted value.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Test variables, where n_samples is the number of samples and n_features is the number of predictors.

        Returns
        -------
        y_pred_test : array-like, shape = [n_samples, 1]
            Predicted y score for samples.
        """
        # Convert to X to numpy array if a DataFrame
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = np.array(X)

        # Overwrite x_scores_ from model.fit with using test X (or do model.x_scores_test_) ?
        self.model.x_scores_ = self.model.transform(X)
        # Calculate and return Y predicted value
        y_pred_test = self.model.predict(X).flatten()
        self.Y_pred = y_pred_test

        if Y is not None:
            self.metrics_key = []
            self.model.eval_metrics_ = []
            bm = binary_evaluation(Y, y_pred_test)
            for key, value in bm.items():
                self.model.eval_metrics_.append(value)
                self.metrics_key.append(key)

            self.model.eval_metrics_ = np.array(self.model.eval_metrics_)
        return y_pred_test
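
A hedged usage sketch for the class above (the import path and the toy data are
assumptions, not part of the source):

import numpy as np
from cimcb.model import PLS_NIPALS  # assumed import path

rng = np.random.default_rng(1)
X = rng.normal(size=(60, 20))                  # 60 samples, 20 features
Y = (X[:, 0] + rng.normal(size=60) > 0) * 1    # binary response

model = PLS_NIPALS(n_components=2)
y_train_pred = model.train(X, Y)   # fit and return in-sample predictions
y_test_pred = model.test(X)        # apply to (here, the same) data
print(model.model.vip_[:5])        # VIP scores for the first five features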
Example #35
0
y = dataset["target"]

# Center each feature and scale the variance to be unitary
X = preprocessing.scale(X)

# Compute the variance for each column
print(numpy.var(X, 0).sum())

# Now use PCA using 3 components
pca = PCA(3)
X2 = pca.fit_transform(X)
print(numpy.var(X2, 0).sum())

pls = PLSRegression(3)
pls.fit(X, y)
X2 = pls.transform(X)
print(numpy.var(X2, 0).sum())

# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10

svc = LinearSVC()

for train_inds, test_inds in KFold(n_splits=n_folds).split(X):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Use PCA and then classify using an SVM
    X_train2 = pca.fit_transform(X_train)
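
The scraped example is cut off mid-loop; a minimal self-contained sketch of the
PCA-vs-PLS comparison it appears to implement (the dataset and the error
bookkeeping are illustrative assumptions):

import numpy as np
from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

data = load_breast_cancer()
X = preprocessing.scale(data["data"])
y = data["target"]

pca_error = pls_error = 0
svc = LinearSVC(dual=False)

for train, test in KFold(n_splits=10).split(X):
    # Unsupervised projection with PCA, then classify
    pca = PCA(3).fit(X[train])
    svc.fit(pca.transform(X[train]), y[train])
    pca_error += np.sum(svc.predict(pca.transform(X[test])) != y[test])

    # Supervised projection with PLS (fitted on the training fold only)
    pls = PLSRegression(3).fit(X[train], y[train])
    svc.fit(pls.transform(X[train]), y[train])
    pls_error += np.sum(svc.predict(pls.transform(X[test])) != y[test])

print('PCA misclassifications:', pca_error, 'PLS misclassifications:', pls_error)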
Example #36
0
class PLSClassifier(BaseEstimator, ClassifierMixin):
    __name__ = 'MultiLayeredPLS'

    def __init__(self, estimator=None, n_iter=1500, eps=1e-6, n_comp=10, mode='regression'):
        warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

        self.n_iter = n_iter
        self.eps = eps
        self.n_comp = n_comp
        self.mode = mode
        self.estimator = estimator

        self.estimator_ = None
        self.pls = None

    def fit(self, X, y):
        if X.shape[0] != y.shape[0]:
            raise ValueError()

        if self.estimator is None:
            self.estimator_ = LinearRegression()
        else:
            # clone the user-supplied estimator, not the (still empty) fitted one
            self.estimator_ = sklearn.base.clone(self.estimator)

        self.classes_, target = np.unique(y, return_inverse=True)

        target[target == 0] = -1

        if self.mode == 'canonical':
            self.pls = PLSCanonical(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        elif self.mode == 'regression':
            self.pls = PLSRegression(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        proj_x, proj_y = self.pls.fit_transform(X, target)

        self.estimator_.fit(proj_x, target)

        return self

    def predict_value(self, x):
        resp = self.decision_function(x)
        if resp.ndim == 1:
            ans = np.zeros(resp.shape, dtype=np.int32)
            ans[resp > 0] = self.classes_[1]
            ans[resp <= 0] = self.classes_[0]
        else:
            ans = self.classes_[np.argmax(resp, axis=1)]

        return ans

    def predict_confidence(self, x):
        resp = self.decision_function(x)
        return resp[0]

    def decision_function(self, x):
        x = np.array(x).reshape((1, -1))
        proj = self.pls.transform(x)
        resp = self.estimator_.predict(proj)
        return resp

    def predict_proba(self, x):
        resp = self.decision_function(x)
        # Clamp the decision value to [-1, 1] and map it linearly onto [0, 1]
        resp = np.clip(resp, -1, 1)
        resp = (resp + 1) / 2
        return resp
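
A hedged usage sketch for PLSClassifier above (binary toy data; the names are
illustrative):

import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(100, 30))
y = (X[:, :3].sum(axis=1) > 0).astype(int)

clf = PLSClassifier(n_comp=5, mode='regression').fit(X, y)
print(clf.predict_value(X[0]))        # predicted class label for one sample
print(clf.predict_confidence(X[0]))   # raw decision value for that sample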
features = []
temp = []
for data in MA_data:
    for i in range(1, numFeatures + 1):
        temp = np.append(temp, data[np.where(ReconRank == i)[0][0]])
    if np.shape(features)[0] == 0:
        features = temp
        temp = []
    else:
        features = np.vstack([features, temp])
        temp = []

# PLS Dimension Reduction
pls2 = PLSRegression(n_components=n_components)
pls2.fit(features, MA_label)
XScore = pls2.transform(features)
# XScore = features

# LDA Classification
kf = KFold(n_splits=5)
kf.get_n_splits(XScore)
mean_acc = 0
for train_index, test_index in kf.split(XScore):
    X_train, X_test = XScore[train_index], XScore[test_index]
    y_train, y_test = MA_label[train_index], MA_label[test_index]
    clf = LDA()
    clf.fit(X_train, y_train)
    Y_predict = clf.predict(X_test)
    for i in range(len(Y_predict)):
        print("Y_Predict {} - Y_Test {}".format(Y_predict[i], y_test[i]))
    acc = accuracy_score(y_test, Y_predict)
    mean_acc += acc / kf.get_n_splits()  # accumulate the average fold accuracy
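
Note that the PLS projection above is fitted on the full dataset before
cross-validation, which leaks information into the test folds. A hedged sketch
of a fold-local variant (reusing features, MA_label and n_components from
above):

from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

accs = []
for tr, te in KFold(n_splits=5).split(features):
    # Fit the projection on the training fold only
    pls = PLSRegression(n_components=n_components).fit(features[tr], MA_label[tr])
    clf = LDA().fit(pls.transform(features[tr]), MA_label[tr])
    accs.append(accuracy_score(MA_label[te],
                               clf.predict(pls.transform(features[te]))))
print(sum(accs) / len(accs))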
Example #38
0
    plt.xlim(1, np.amax(nComponents))
    plt.title('PLS Canonical accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'],
               loc='lower right')
    plt.grid(True)

if False:  # disabled block
    #%% PLS Regression
    nComponents = np.arange(1, nClasses + 1)
    plsRegScores = np.zeros((5, len(nComponents)))  # np.alen was removed in NumPy 1.23
    for i, n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain, Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain,
                                           labelsTest)

    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain, Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

    #%% Plot accuracies for PLS Regression
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, plsRegScores[i, :], lw=3)