Example #1
File: snippet.py Project: szabo92/gistable
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV

# Candidate estimators paired with their grid-search parameter grids.
# Note: "mse", "ls" and "lad" are pre-1.2 scikit-learn option names
# ("squared_error" / "absolute_error" in newer releases).
models = [
    [
        DecisionTreeRegressor(),
        {
            "criterion": ["mse", "friedman_mse"],
            "splitter": ["best", "random"],
            "min_samples_split":
            [x for x in range(2, 6)]  # generates a list [2,3,4,5]
        }
    ],
    [
        GradientBoostingRegressor(), {
            "loss": ["ls", "lad", "huber", "quantile"]
        }
    ],
    [GaussianProcessRegressor(), {}],
    [PLSRegression(), {}],
    [AdaBoostRegressor(), {}]
]

# Dataset
train_X = [[5, 3], [9, 1], [8, 6], [5, 4]]
train_Y = [28, 810, 214, 19]
pred_X = [7, 3]

# Train each model individually using grid search
for model in models:
    regressor = model[0]
    param_grid = model[1]

    model = GridSearchCV(regressor, param_grid)
    model.fit(train_X, train_Y)

    # Report the tuned estimator's prediction for the query point
    print(type(regressor).__name__, model.predict([pred_X]))
Example #2
File: cca.py Project: akumar01/ACTIV
def CCA_across_patients(data_files,
                        alg='cca',
                        freq_clustering='cannonical',
                        bin_size=10,
                        window_size=500,
                        post_shift=0,
                        pre_shift=0,
                        band='alpha',
                        pair=(1, 1)):

    # Assemble the set of feature vectors

    # Arguments are passed in units of ms; convert them to sample counts here
    samp_factor = 10
    window_size = int(window_size / samp_factor)
    pre_shift = pre_shift / samp_factor
    post_shift = post_shift / samp_factor

    pre_stim_feature_vector = np.array([])
    post_stim_feature_vector = np.array([])

    for data_file in data_files:

        with h5py.File(data_file, 'r') as f:

            # ERSP time series references
            ERSP_refs = f['cfg_PAINT_cond']['ChanERSP']

            for i in range(ERSP_refs.size):
                # Read at 64-bit precision (converted to 32-bit at the end of the function)
                ERSP = np.zeros((250, 51, 95), dtype=np.float64)
                f[ERSP_refs[i][0]].read_direct(ERSP)

                # Need to exclude the maximum nan padding
                leading_nan_count = np.zeros((51, 95))
                trailing_nan_count = np.zeros((51, 95))
                for j in range(51):
                    for k in range(95):
                        x1, x2 = count_leading_trailing_true(
                            np.isnan(ERSP[:, j, k]))
                        leading_nan_count[j, k] = x1
                        trailing_nan_count[j, k] = x2

                # Select pre and post stimulation
                leading_max = int(np.amax(leading_nan_count))
                trailing_max = int(np.amax(trailing_nan_count))

                pre_window_end = int(1000 / samp_factor - pre_shift)
                post_window_start = int(1000 / samp_factor + post_shift)

                # Ensure that we don't encroach on the nan-padding
                window_size1 = min(window_size, pre_window_end - leading_max)

                window_size2 = min(
                    window_size,
                    int(2500 / samp_factor - trailing_max - post_window_start))

                window_size = int(min(window_size1, window_size2))

                pre_stim = ERSP[pre_window_end -
                                window_size:pre_window_end, :, :]
                post_stim = ERSP[post_window_start:post_window_start +
                                 window_size, :, :]

                # Re-arrange axes so that frequency bins are last
                pre_stim = np.swapaxes(pre_stim, 1, 2)
                post_stim = np.swapaxes(post_stim, 1, 2)

                if freq_clustering == 'cannonical':

                    # Average across canonical frequency bands
                    pre_stim_theta = np.mean(pre_stim[:, :, 0:4], axis=-1)
                    pre_stim_alpha = np.mean(pre_stim[:, :, 4:8], axis=-1)
                    pre_stim_beta = np.mean(pre_stim[:, :, 8:26], axis=-1)
                    pre_stim_gamma = np.mean(pre_stim[:, :, 26::], axis=-1)

                    pre_stim = np.concatenate([
                        pre_stim_theta, pre_stim_alpha, pre_stim_beta,
                        pre_stim_gamma
                    ],
                                              axis=-1)

                    post_stim_theta = np.mean(post_stim[:, :, 0:4], axis=-1)
                    post_stim_alpha = np.mean(post_stim[:, :, 4:8], axis=-1)
                    post_stim_beta = np.mean(post_stim[:, :, 8:26], axis=-1)
                    post_stim_gamma = np.mean(post_stim[:, :, 26::], axis=-1)

                    post_stim = np.concatenate([
                        post_stim_theta, post_stim_alpha, post_stim_beta,
                        post_stim_gamma
                    ],
                                               axis=-1)
                elif freq_clustering == 'equal':
                    # Chop off the lowest frequency bin so we have a non-prime number of bins...
                    pre_stim = pre_stim[..., 1::]
                    post_stim = post_stim[..., 1::]

                    # Average across equal number of frequency bands
                    pre_stim = np.mean(pre_stim.reshape(
                        (pre_stim.shape[0], pre_stim.shape[1], -1, bin_size)),
                                       axis=-1)
                    post_stim = np.mean(post_stim.reshape(
                        (post_stim.shape[0], post_stim.shape[1], -1,
                         bin_size)),
                                        axis=-1)

                    # Collapse
                    pre_stim = pre_stim.reshape(
                        (pre_stim.shape[0],
                         pre_stim.shape[1] * pre_stim.shape[2]))
                    post_stim = post_stim.reshape(
                        (post_stim.shape[0],
                         post_stim.shape[1] * post_stim.shape[2]))

                elif freq_clustering == 'random':

                    # Chop off the lowest frequency bin so we have a non-prime number of bins...
                    pre_stim = pre_stim[..., 1::]
                    post_stim = post_stim[..., 1::]

                    # Average across random collection of frequency bins
                    idxs = np.arange(pre_stim.shape[-1])
                    np.random.shuffle(idxs)
                    idxs = np.split(idxs, int(pre_stim.shape[-1] / bin_size))

                    pre_stim_rand1 = np.mean(pre_stim[:, :, idxs[0]], axis=-1)
                    pre_stim_rand2 = np.mean(pre_stim[:, :, idxs[1]], axis=-1)
                    pre_stim_rand3 = np.mean(pre_stim[:, :, idxs[2]], axis=-1)
                    pre_stim_rand4 = np.mean(pre_stim[:, :, idxs[3]], axis=-1)
                    pre_stim_rand5 = np.mean(pre_stim[:, :, idxs[4]], axis=-1)

                    pre_stim = np.concatenate([
                        pre_stim_rand1, pre_stim_rand2, pre_stim_rand3,
                        pre_stim_rand4, pre_stim_rand5
                    ],
                                              axis=-1)

                    post_stim_rand1 = np.mean(post_stim[:, :, idxs[0]],
                                              axis=-1)
                    post_stim_rand2 = np.mean(post_stim[:, :, idxs[1]],
                                              axis=-1)
                    post_stim_rand3 = np.mean(post_stim[:, :, idxs[2]],
                                              axis=-1)
                    post_stim_rand4 = np.mean(post_stim[:, :, idxs[3]],
                                              axis=-1)
                    post_stim_rand5 = np.mean(post_stim[:, :, idxs[4]],
                                              axis=-1)

                    post_stim = np.concatenate([
                        post_stim_rand1, post_stim_rand2, post_stim_rand3,
                        post_stim_rand4, post_stim_rand5
                    ],
                                               axis=-1)
                elif freq_clustering == 'single_band':

                    if band == 'theta':
                        pre_stim = pre_stim[:, :, 0:4]
                        post_stim = post_stim[:, :, 0:4]
                    elif band == 'alpha':
                        pre_stim = pre_stim[:, :, 4:8]
                        post_stim = post_stim[:, :, 4:8]
                    elif band == 'beta':
                        pre_stim = pre_stim[:, :, 8:26]
                        post_stim = post_stim[:, :, 8:26]
                    elif band == 'gamma':
                        pre_stim = pre_stim[:, :, 26::]
                        post_stim = post_stim[:, :, 26::]
                    elif band == 'topgamma':
                        pre_stim = pre_stim[:, :, 41:51]
                        post_stim = post_stim[:, :, 41:51]
                    elif band == 'all':
                        pass
                elif freq_clustering == 'pairwise':

                    pre_stim = pre_stim[:, :, pair[0]]
                    post_stim = post_stim[:, :, pair[1]]

                # Collapse and append
                if pre_stim_feature_vector.size == 0:
                    pre_stim_feature_vector = np.append(
                        pre_stim_feature_vector, pre_stim.reshape((1, -1)))
                    post_stim_feature_vector = np.append(
                        post_stim_feature_vector, post_stim.reshape((1, -1)))

                    pre_stim_feature_vector = pre_stim_feature_vector.reshape(
                        (1, -1))
                    post_stim_feature_vector = post_stim_feature_vector.reshape(
                        (1, -1))
                else:
                    pre_stim_feature_vector = np.concatenate(
                        [pre_stim_feature_vector,
                         pre_stim.reshape((1, -1))])
                    post_stim_feature_vector = np.concatenate(
                        [post_stim_feature_vector,
                         post_stim.reshape((1, -1))])

    # Convert to 32 bit floating precision
    pre_stim_feature_vector = pre_stim_feature_vector.astype(np.float32)
    post_stim_feature_vector = post_stim_feature_vector.astype(np.float32)

    # Perform a cross-validated canonical correlation analysis (CCA) across all the features

    if alg == 'cca':
        corrmodel = CCA(n_components=1)
        crsval = cross_validate(corrmodel,
                                pre_stim_feature_vector,
                                post_stim_feature_vector,
                                cv=5,
                                return_train_score=True)
        return np.mean(crsval['test_score']), np.mean(crsval['train_score'])

    elif alg == 'pls':
        corrmodel = PLSRegression()
        # Manually cross-validate
        folds = KFold(n_splits=5)
        test_scores = []
        train_scores = []
        for train_index, test_index in folds.split(pre_stim_feature_vector,
                                                   post_stim_feature_vector):
            corrmodel.fit(pre_stim_feature_vector[train_index],
                          post_stim_feature_vector[train_index])
            test_scores.append(
                corrmodel.score(pre_stim_feature_vector[test_index],
                                post_stim_feature_vector[test_index]))
            train_scores.append(
                corrmodel.score(pre_stim_feature_vector[train_index],
                                post_stim_feature_vector[train_index]))
        return np.mean(test_scores), np.mean(train_scores)
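
The cross-validated scoring at the end of this function works because CCA (like PLSRegression) exposes fit and score methods, so sklearn's cross_validate can drive it directly. A minimal, self-contained sketch of that pattern on synthetic data (the array shapes are illustrative, not the ERSP dimensions used above):

import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.model_selection import cross_validate

rng = np.random.RandomState(0)
pre = rng.normal(size=(100, 20))                    # stand-in for pre-stimulus features
post = pre @ rng.normal(size=(20, 20)) + 0.1 * rng.normal(size=(100, 20))  # correlated post-stimulus features

cv_result = cross_validate(CCA(n_components=1), pre, post, cv=5, return_train_score=True)
print(np.mean(cv_result['test_score']), np.mean(cv_result['train_score']))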
Example #3
from sklearn.gaussian_process.kernels import ConstantKernel as C

# sklearn random forest regression (and other regressors)
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
rte = RandomTreesEmbedding()
mr = MLPRegressor(max_iter=1000)
omp = OrthogonalMatchingPursuit()
ran = RANSACRegressor()
tsr = TheilSenRegressor(random_state=42)
br = BayesianRidge(n_iter=300, tol=0.001)
bgm = BayesianGaussianMixture()
knr = KNeighborsRegressor(n_neighbors=5)
rnr = RadiusNeighborsRegressor(radius=1.0)
pls = PLSRegression(n_components=1)
gnb = GaussianNB()
mnb = MultinomialNB()
svl = SVR(kernel='linear')
svr = SVR()
las = Lasso()
en = ElasticNet()
rr = Ridge()
kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

estimators = {
    'LR ': lr,
    'DTR': dtr,
    'RFR': rfr,
    'OMP': omp,
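
The estimators dictionary above is cut off in this example. Assuming the remaining entries follow the same name-to-estimator pattern, a short sketch of how such a mapping is typically fitted and compared on a held-out split (X_train, X_test, y_train, y_test are hypothetical arrays):

for name, est in estimators.items():
    est.fit(X_train, y_train)
    print(name, est.score(X_test, y_test))  # R^2 on the held-out split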
Example #4
plt.xticks(())
plt.yticks(())
plt.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
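
The univariate (PLS1) example stops before the fit. A hedged continuation that mirrors the PLS2 block above (the choice of n_components=3 is illustrative):

pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))  # should be close to [1, 2, 0, ..., 0]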
Example #5
GBDT_params = {'learning_rate': [0.1, 0.1, 0.1, 0.1], 'maxdepth': [2, 3, 2, 2], 'n_estimators': [100, 100, 100, 100]}  # XGBoost shares the same parameters as GBDT
ENANN_params = {'max_iter': [100, 100, 200, 300], 'p': [0.3, 0.5, 0.7, 0.5]}
DFN_params = {'learning_rate':[0.1, 0.1, 0.1, 0.001], 'batch': [300, 400, 300, 400]}
LSTM_params = {'learning_rate':[1e-4, 1e-5, 1e-4, 1e-6], 'depth': [2, 2, 1, 2], 'hidden_number': [256]*4}
RNN_params = {'learning_rate':[0.1, 0.1, 0.1, 0.001], 'depth': [1, 1, 2, 1], 'hidden_number': [256]*4}


# ********************** 2. Run the rolling-window models over the full sample (3/12/24/36-month windows) ********************** #
path = r'..\DataBase\factor'  # directory containing the 96 factors
factorname = [x[1:-4] for x in os.listdir(path)]
riskfree, timeseries, factor = datatransfrom(path)[0], datatransfrom(path)[1], datatransfrom(path)[2]
timeseries2, index = datatransfrom2(path)[0], datatransfrom2(path)[1]
for i in range(4):
    output(window[i],LinearRegression(),'OLS'+str(window[i]),riskfree[i], timeseries)
    FC(window[i], riskfree[i], timeseries, 96,'FC')
    output(window[i], PLSRegression(PLS_params[i]), 'PLS' + str(window[i]), riskfree[i], timeseries)
    output(window[i],Lasso(alpha=lasso_params[i]),'Lasso'+ str(window[i]), riskfree[i], timeseries)
    output(window[i],Ridge(alpha=ridge_params[i]),'Ridge'+str(window[i]),riskfree[i], timeseries)
    output(window[i],ElasticNet(alpha= elasticnet_params['alpha'] [i],l1_ratio= elasticnet_params['l1_ratio'][i]),'ElasticNet'+str(window[i]),riskfree[i], timeseries)
    output(window[i],SVR(kernel=SVR_params['kernel'][i],gamma= SVR_params ['gamma'][i],C= SVR_params ['C'][i] ),'SVR'+str(window[i]),riskfree[i], timeseries)
    output(window[i], GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]), 'GBDT' + str(window[i]),riskfree[i], timeseries)
    output(window[i], XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]), 'XGBOOST' + str(window[i]), riskfree[i], timeseries)
    output(window[i], ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]), 'ENANN' + str(window[i]), riskfree[i], timeseries)
    output(window[i], DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]), 'DFN' + str(window[i]), riskfree[i], timeseries)
    output2(window[i], rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]), 'LSTM'+ str(window[i]) ,riskfree[i], timeseries2)
    output2(window[i], rm.lstmmodule(96,  RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571, lr=RNN_params['learning_rate'][i], ntype='RNN'), 'RNN'+ str(window[i]), riskfree[i], timeseries2)
    modellist = [DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]),
                 ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]),
                 XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]),
                 GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]),
                 PLSRegression(PLS_params[i]),
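
The model list above is cut off mid-definition. Throughout this example, every hyperparameter dictionary holds one value per rolling window, indexed by i. A small hypothetical sketch of that pattern (the window lengths come from the comment above; the alpha values and the commented output() call are illustrative):

window = [3, 12, 24, 36]                  # rolling-window lengths in months
lasso_params = [0.1, 0.05, 0.05, 0.01]    # one alpha per window (illustrative values)
for i in range(len(window)):
    model = Lasso(alpha=lasso_params[i])
    # output(window[i], model, 'Lasso' + str(window[i]), riskfree[i], timeseries)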
Example #6
# %% ###### GRIDSEARCH #######

#### Scalers
scalers = [RobustScaler(), StandardScaler(), MinMaxScaler()]

### Regressions
#cvR=SKF(10).split(feats, data_train['gender'])

#a=list(cvR)

alphas = np.arange(0.001, 10, 0.005)

lasso = LassoCV(alphas=alphas, fit_intercept=True, max_iter=100000)
ridge = RidgeCV(alphas=alphas, fit_intercept=True)
pls = PLSRegression(n_components=10, scale=False, max_iter=1000)
gbr = GradientBoostingRegressor(loss='lad', alpha=0.7)

svr = SVR(kernel='linear', C=3)  # lowercase name so the SVR class is not shadowed
gpr = GaussianProcessRegressor(normalize_y=True,
                               n_restarts_optimizer=50,
                               kernel=RBF())

regressors = [lasso, ridge]

# %% #### Pipeline ####

#cv=SKF(10).split(feats, data_train['site'])
cv = KF(10, shuffle=True)

pipe = Pipeline([('scale', StandardScaler()), ('regress', lasso)])
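
The '###### GRIDSEARCH #######' cell above assembles scaler and regressor candidates plus a pipeline, but stops before the search itself. A hedged sketch of how the pipeline could be searched over those lists (whole pipeline steps can be swapped by naming them in the parameter grid; feats and target are hypothetical stand-ins for the data used here):

from sklearn.model_selection import GridSearchCV

param_grid = {'scale': scalers, 'regress': regressors}
search = GridSearchCV(pipe, param_grid, cv=cv, scoring='r2')
# search.fit(feats, target)
# print(search.best_params_, search.best_score_)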
Example #7
    def stacklearning(self):
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y

        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(n_estimators=100,
                                   max_depth=7,
                                   gamma=0,
                                   colsample_bytree=0.1)
        lgbm = LGBMRegressor(boosting_type='gbdt',
                             num_leaves=367,
                             learning_rate=0.06,
                             feature_fraction=0.14,
                             max_depth=28,
                             min_data_in_leaf=8)
        rgf = RGFRegressor(max_leaf=1211,
                           algorithm="RGF",
                           test_interval=100,
                           loss="LS",
                           verbose=False,
                           l2=0.93,
                           min_samples_leaf=2)
        rf = RandomForestRegressor(max_depth=20,
                                   random_state=0,
                                   n_estimators=56,
                                   min_samples_split=2,
                                   max_features=0.21)
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(n_estimators=384,
                                  max_features=2228,
                                  min_samples_split=0.01,
                                  max_depth=856,
                                  min_samples_leaf=1)
        svr = SVR(gamma=9.5367431640625e-07, epsilon=0.0009765625, C=2048.0)

        #test combination
        desNew = make_pipeline(extdescriptorNew(), rf)
        morNew = make_pipeline(extMorganNew(), rf)
        kotNew = make_pipeline(extklekotaTothNew(), rf)
        macNew = make_pipeline(extMACCSNew(), rf)

        desMac = make_pipeline(extDescriptorMACCS(), rf)
        morMac = make_pipeline(extMorganMACCS(), rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(), rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(), rf)
        des = make_pipeline(extOnlyDescriptor(), rf)
        mor = make_pipeline(extOnlyMorgan(), rf)
        kot = make_pipeline(extOnlyklekotaToth(), rf)
        mac = make_pipeline(extOnlyMACCS(), rf)
        all = make_pipeline(extAll(), rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(), rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(), rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(), rf)

        testDic = {
            "Desc+New": desNew,
            "Mor+New": morNew,
            "kot+New": kotNew,
            "MACCS+New": macNew,
            "Des+MAC": desMac,
            "Morgan+Maccs": morMac,
            "Kot+MACCS": kotMac,
            "mor+kot+New": morKotNew,
            "descriptor": des,
            "morgan": mor,
            "kot": kot,
            "MACCS": mac,
            "All": all,
            "All without "
            "new": allwithoutNew,
            "All without MACCS": allwithoutMaccs,
            "All without Des": allwithoutDes
        }

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        resultDic = {}
        resultDic2 = {}
        for name, model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv, scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name: [RMSETmp, CORRTmP]})
            print(name, RMSETmp, CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata],
                                  meta_regressor=rf,
                                  verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=lgbm,
                                      verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=xgb,
                                      verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=rgf,
                                      verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        for i in np.arange(3, 11, 1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata],
                                          meta_regressor=pls,
                                          verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(
            gamma=9.5367431640625 / 10000000,
            C=1559.4918100725592,
            epsilon=0.0009765625,
        )
        svr = SVR(kernel='rbf',
                  gamma=9.5367431640625e-07,
                  epsilon=0.0009765625,
                  C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=svr,
                                      verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=ext,
                                      verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        # Stochastic Gradient Descent
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1, 10, 100, 1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(), LinearRegression(n_jobs=-1))
        calcACC(lin)

        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata, nbrs, all],
                                      meta_regressor=rf,
                                      verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(
            regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
            meta_regressor=rf,
            verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata, alldata, all],
                                      meta_regressor=rf,
                                      verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(
            regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
            meta_regressor=rf,
            verbose=1)
        testmodel = StackingRegressor(regressors=[alldata, stack1, stack1],
                                      meta_regressor=rf,
                                      verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(
            regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
            meta_regressor=rf,
            verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(
            regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
            meta_regressor=lgbm,
            verbose=1)
        #0.70613
        testmodel = StackingRegressor(
            regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
            meta_regressor=xgb,
            verbose=1)
        #0.71641717
        testmodel = StackingRegressor(
            regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
            meta_regressor=rf,
            verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[
            pcaFeature, alldata, nbrs, ridge, rf, xgb, lgbm, rgf, ext
        ],
                                      meta_regressor=rf,
                                      verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 = make_pipeline(extDescriptor(), rgf)
        pipe8 = make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto', kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc = make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20,
                                     random_state=0,
                                     n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(
            regressors=[pipe1, pipe2, pipe3, xgb, lgbm, rgf, rf],
            meta_regressor=ave,
            verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                                   meta_regressor=rf,
                                   verbose=1)
        #0.70
        stack2 = StackingRegressor(
            regressors=[stack1, alldata, rgf, lgbm, xgb],
            meta_regressor=rf,
            verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2, pipe1],
                                   meta_regressor=ave,
                                   verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                                   meta_regressor=rf,
                                   verbose=1)
        stack2 = StackingRegressor(regressors=[stack1, withoutdesc, lgbm, rgf],
                                   meta_regressor=rf,
                                   verbose=1)
        stack3 = StackingRegressor(regressors=[stack2, pipe1, xgb],
                                   meta_regressor=ave,
                                   verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                                   meta_regressor=rf,
                                   verbose=1)
        stack2 = StackingRegressor(regressors=[stack1, nbrs, pipe1],
                                   meta_regressor=rf,
                                   verbose=1)

        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        St1Scores = cross_validate(stack1, X, y, cv=cv)
        St1Scores['test_score'].mean()**(1 / 2)

        St2Scores = cross_validate(stack2, X, y, cv=cv)
        St2Scores['test_score'].mean()**(1 / 2)

        St3Scores = cross_validate(stack3, X, y, cv=cv)
        St3Scores['test_score'].mean()**(1 / 2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1 / 2)

        lgbmScores = cross_validate(lgbm, X, y, cv=cv)
        lgbmScores['test_score'].mean()**(1 / 2)

        rgfScores = cross_validate(rgf, X, y, cv=cv)
        rgfScores['test_score'].mean()**(1 / 2)

        RFScores = cross_validate(rf, X, y, cv=cv)
        RFScores['test_score'].mean()**(1 / 2)

        scores = cross_validate(stack2, X, y, cv=cv)
        scores['test_score'].mean()**(1 / 2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" %
              (scores['test_score'].mean(), scores['test_score'].std(),
               'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy = (10**(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" %
              calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' %
              calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy = (10**(stack1.predict(exX))).tolist()

        sgd.fit(X, y)
        valy = (10**(sgd.predict(exX))).tolist()

        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X, y)
        valy = (10**(rgf.predict(exX))).tolist()

        nbrs.fit(X, y)
        valy = (10**(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X, y)
        valy = (10**(pipe.predict(exX))).tolist()

        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy = (10**(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" %
              calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' %
              calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy = (10**(lgbm.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" %
              calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' %
              calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
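
This example leans heavily on mlxtend's StackingRegressor. A minimal, self-contained sketch of that API on synthetic data, to make the pattern above easier to follow (the base and meta estimators and their settings are illustrative):

import numpy as np
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 5))
y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.normal(size=200)

stack = StackingRegressor(regressors=[Ridge(alpha=1.0), KNeighborsRegressor(3)],
                          meta_regressor=RandomForestRegressor(n_estimators=50, random_state=0))
stack.fit(X, y)
print(stack.score(X, y))  # in-sample R^2, for illustration only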
Example #8
# Note: this example is written for Python 2 (print statements; csv files opened in 'wb' mode).
def pls_cal(dbfile, maskfile, outpath, which_elem, testfold, nc, normtype=1, mincomp=0, maxcomp=100,
            plstype='mlpy', keepfile=None, removefile=None, cal_dir=None, masterlist_file=None,
            compfile=None, name_sub_file=None, foldfile=None, nfolds=7, seed=None, n_bag=None,
            skscale=False, n_boost=None, max_samples=0.1, n_elems=9):
    plstype_string=plstype    
    if n_bag!=None:
        plstype_string=plstype+'_bag'
    if n_boost!=None:
        plstype_string=plstype+'_boost'
    if skscale==True:
        plstype_string=plstype+'_scale'
    print 'Reading database'
    sys.stdout.flush()
    spectra,comps,spect_index,names,labels,wvl=ccam.read_db(dbfile,compcheck=True,n_elems=n_elems)
    oxides=labels[2:]
    compindex=numpy.where(oxides==which_elem)[0]
    
    print 'Choosing spectra'
    
    which_removed=outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_removed.csv'
    spectra,names,spect_index,comps=ccam.choose_spectra(spectra,spect_index,names,comps,compindex,mincomp=mincomp,maxcomp=maxcomp,keepfile=keepfile,removefile=removefile,which_removed=which_removed)
        
    
    print 'Masking spectra'
    spectra,wvl=ccam.mask(spectra,wvl,maskfile)
    
    print 'Normalizing spectra'
    spectra=ccam.normalize(spectra,wvl,normtype=normtype)
    
    
    print 'Assigning Folds'
    if foldfile!=None:
        #if a fold file is specified, use it
        folds=ccam.folds(foldfile,names)
    else:
        #otherwise, define random folds
        folds=ccam.random_folds(names,nfolds,seed=seed)

    names_nofold=names[(folds==0)]
    spect_index_nofold=spect_index[(folds==0)]
    #write a file containing the samples not assigned to folds
    with open(which_removed,'ab') as writefile:
        writer=csv.writer(writefile,delimiter=',',)
        for i in range(len(names_nofold)):
            writer.writerow([names_nofold[i],spect_index_nofold[i],'No Fold'])
    
    
    #remove spectra that are not assigned to any fold
    spectra=spectra[(folds!=0),:]
    spect_index=spect_index[(folds!=0)]
    names=names[(folds!=0)]
    comps=comps[(folds!=0),:]
    folds=folds[(folds!=0)]
    
    print 'Defining Training and Test Sets'
    spectra_train=spectra[(folds!=testfold)]
    spect_index_train=spect_index[(folds!=testfold)]
    names_train=names[(folds!=testfold)]
    comps_train=comps[(folds!=testfold),compindex]
    folds_train=folds[(folds!=testfold)]
    folds_train_unique=numpy.unique(folds_train)
    
    spectra_test=spectra[(folds==testfold)]
    spect_index_test=spect_index[(folds==testfold)]
    names_test=names[(folds==testfold)]
    comps_test=comps[(folds==testfold),compindex]
    folds_test=folds[(folds==testfold)]
    
    print 'Do Leave One Label Out (LOLO) cross validation with all folds but the test set'
    #define array to hold cross validation predictions and RMSEs
    train_predict_cv=numpy.zeros((len(names_train),nc))
    RMSECV=numpy.zeros(nc)
    
    for i in folds_train_unique:
        print 'Holding out fold #'+str(i)
        
        if skscale==False:
        #mean center those spectra left in
            #X_cv_in1,X_cv_in_mean1=meancenter.ccam_meancenter(spectra_train[(folds_train!=i),:])
            X_cv_in,X_cv_in_mean=ccam.meancenter(spectra_train[(folds_train!=i),:])
            
            #and those left out
            X_cv_out=ccam.meancenter(spectra_train[(folds_train==i),:],X_mean=X_cv_in_mean)[0]   
             
            #mean center compositions left in
            Y_cv_in,Y_cv_in_mean=ccam.meancenter(comps_train[(folds_train!=i)])
        if skscale==True:
            X_cv_in=spectra_train[(folds_train!=i),:]
            X_cv_out=spectra_train[(folds_train==i),:]
            Y_cv_in=comps_train[(folds_train!=i)]
            Y_cv_in_mean=0
       
        #step through each number of components
        for j in range(1,nc+1):
            print 'Training Model for '+str(j)+' components'
            #train the model
            if plstype=='mlpy':
                PLS1model=ccam.mlpy_pls.PLS(j)
                PLS1model.learn(X_cv_in,Y_cv_in)
                    #predict the samples held out
                train_predict_cv[(folds_train==i),j-1]=PLS1model.pred(X_cv_out)+Y_cv_in_mean
                
            if plstype=='sklearn':
                PLS1model=PLSRegression(n_components=j,scale=skscale)
                if n_bag==None and n_boost==None:
                    PLS1model.fit(X_cv_in,Y_cv_in)
                    train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1model.predict(X_cv_out)+Y_cv_in_mean)
                if n_bag!=None:
                    PLS1bagged=ensemble.BaggingRegressor(PLS1model,n_estimators=n_bag,max_samples=max_samples,verbose=1)
                    PLS1bagged.fit(X_cv_in,Y_cv_in)
                    train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1bagged.predict(X_cv_out)+Y_cv_in_mean)
                if n_boost!=None:
                    PLS1boosted=ensemble.AdaBoostRegressor(PLS1model,n_estimators=n_boost)
                    PLS1boosted.fit(X_cv_in,Y_cv_in)
                    train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1boosted.predict(X_cv_out)+Y_cv_in_mean)
    #calculate RMSECV
    for i in range(0,nc):
        sqerr=(train_predict_cv[:,i]-comps_train)**2.0
        RMSECV[i]=numpy.sqrt(numpy.mean(sqerr))
    
    #mean center full model
    if skscale==False:
        X,X_mean=ccam.meancenter(spectra_train)
        X_test=ccam.meancenter(spectra_test,X_mean=X_mean)[0]
        X_all=ccam.meancenter(spectra,X_mean=X_mean)[0]
        
        Y,Y_mean=ccam.meancenter(comps_train)
    if skscale==True:
        X=spectra_train
        X_test=spectra_test
        X_all=spectra
        Y=comps_train
        Y_mean=0
    
    #create arrays for results and RMSEs
    trainset_results=numpy.zeros((len(names_train),nc))
    testset_results=numpy.zeros((len(names_test),nc))
    results=numpy.zeros((len(names),nc))    
    
    RMSEP=numpy.zeros(nc)
    RMSEC=numpy.zeros(nc)
    beta=numpy.zeros((len(X[0,:]),nc))
    Q_res=numpy.zeros((len(X[:,0]),nc))
    T2=numpy.zeros((len(X[:,0]),nc))

    [a,evals,b]=numpy.linalg.svd(numpy.cov(numpy.dot(X,X.transpose())))
    evals=numpy.diag(evals**2)
    if cal_dir!=None:
        print 'Reading cal target data'
        cal_data,cal_wvl,cal_filelist=ccam.read_ccs(cal_dir)
        cal_data,cal_wvl=ccam.mask(cal_data,cal_wvl,maskfile)
        cal_data=ccam.normalize(cal_data,cal_wvl,normtype=normtype)
        if skscale==True:
            cal_data_centered=cal_data
        if skscale==False:
            cal_data_centered=ccam.meancenter(cal_data,X_mean=X_mean)[0]

            
        RMSEP_cal=numpy.zeros(nc)
        RMSEP_cal_good=numpy.zeros(nc)        
        RMSEP_KGAMEDS=numpy.zeros(nc)
        RMSEP_MACUSANITE=numpy.zeros(nc)
        RMSEP_NAU2HIS=numpy.zeros(nc)
        RMSEP_NAU2LOS=numpy.zeros(nc)
        RMSEP_NAU2MEDS=numpy.zeros(nc)
        RMSEP_NORITE=numpy.zeros(nc)
        RMSEP_PICRITE=numpy.zeros(nc)
        RMSEP_SHERGOTTITE=numpy.zeros(nc)
        
        targets,dists,amps,nshots=ccam.target_lookup(cal_filelist,masterlist_file,name_sub_file)
        target_comps=ccam.target_comp_lookup(targets,compfile,which_elem)
        cal_results=numpy.zeros((len(targets),nc))
       
    model_list=[]
    #Now step through each # of components with the full model
    for j in range(1,nc+1):
        print 'Training full model for '+str(j)+' components'
        if plstype=='mlpy':
        
            PLS1model=ccam.mlpy_pls.PLS(j)
            PLS1model.learn(X,Y)
            beta[:,j-1]=PLS1model.beta()
            model_list.append([PLS1model])
            trainset_results[:,j-1]=PLS1model.pred(X)+Y_mean
            testset_results[:,j-1]=PLS1model.pred(X_test)+Y_mean
            results[:,j-1]=PLS1model.pred(X_all)+Y_mean
            if cal_dir != None:
                comps_copy=copy.copy(target_comps)
#                if skscale==True:
#                    cal_results[:,j-1]=PLS1model.pred(cal_data)
#                if skscale==False:
                cal_results[:,j-1]=PLS1model.pred(cal_data_centered)+Y_mean
                RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)
   


        if plstype=='sklearn':
            PLS1model=PLSRegression(n_components=j,scale=skscale)

            if n_bag==None and n_boost==None:
                PLS1model.fit(X,Y)
                T=PLS1model.x_scores_
                #There's probably a more efficient way to calculate T2...
                for k in range(len(X[:,0])):
                    T2[k,j-1]=numpy.dot(T[k,:],numpy.dot(numpy.linalg.inv(numpy.dot(T.transpose(),T)),T[k,:]))
                
                E=X-numpy.dot(PLS1model.x_scores_,PLS1model.x_loadings_.transpose())
                Q_res[:,j-1]=numpy.dot(E,E.transpose()).diagonal()
                
                trainset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X)+Y_mean)
                testset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X_test)+Y_mean)
                results[:,j-1]=numpy.squeeze(PLS1model.predict(X_all)+Y_mean)
                beta[:,j-1]=numpy.squeeze(PLS1model.coef_)
                model_list.append([PLS1model])

                    
                if cal_dir != None:
                    comps_copy=copy.copy(target_comps)
                    cal_results[:,j-1]=numpy.squeeze(PLS1model.predict(cal_data_centered)+Y_mean)
                    RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)
   
            if n_bag!=None:
                PLS1bagged=ensemble.BaggingRegressor(PLS1model,n_estimators=n_bag,max_samples=max_samples,verbose=1)
                PLS1bagged.fit(X,Y)
                trainset_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X)+Y_mean)
                testset_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X_test)+Y_mean)
                results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X_all)+Y_mean)
                beta[:,j-1]=None
                model_list.append([PLS1bagged])
                if cal_dir != None:
                    comps_copy=copy.copy(target_comps)
                    cal_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(cal_data_centered)+Y_mean)
                    RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)
            if n_boost!=None:
                PLS1boosted=ensemble.AdaBoostRegressor(PLS1model,n_estimators=n_boost)
                PLS1boosted.fit(X,Y)
                trainset_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X)+Y_mean)
                testset_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X_test)+Y_mean)
                results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X_all)+Y_mean)
                beta[:,j-1]=None
                model_list.append([PLS1boosted])
                if cal_dir != None:
                    comps_copy=copy.copy(target_comps)
                    cal_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(cal_data_centered)+Y_mean)
                    RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)
   
        RMSEC[j-1]=numpy.sqrt(numpy.mean((trainset_results[:,j-1]-comps_train)**2.0))
        RMSEP[j-1]=numpy.sqrt(numpy.mean((testset_results[:,j-1]-comps_test)**2.0))
        
   
    with open(outpath+which_elem+'_'+plstype_string+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'.pkl','wb') as picklefile:
            pickle.dump(model_list,picklefile)

 #if cal_dir was specified, combine the per-target cal RMSEPs and write the cal target outputs
    if cal_dir!=None:

        
        n_good_cal=numpy.sum(numpy.array([RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE])[:,0]!=0)
        print n_good_cal
        RMSEP_cal=(RMSEP_KGAMEDS+RMSEP_MACUSANITE+RMSEP_NAU2HIS+RMSEP_NAU2LOS+RMSEP_NAU2MEDS+RMSEP_NORITE+RMSEP_PICRITE+RMSEP_SHERGOTTITE)/n_good_cal
        RMSEP_single_cals=[RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE,RMSEP_cal]            
                       
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_caltargets_predict.csv','wb') as writefile:
            writer=csv.writer(writefile,delimiter=',')
            row=['File','Target','Laser Energy','True_Comp']
            row.extend(range(1,nc+1))
            writer.writerow(row)
            for i in range(0,len(targets)):
                row=[cal_filelist[i],targets[i],amps[i],target_comps[i]]
                row.extend(cal_results[i,:])
                writer.writerow(row)
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP_caltargets.csv','wb') as writefile:
            writer=csv.writer(writefile,delimiter=',')
            writer.writerow(['NC','RMSEP Cal Targets (wt.%)'])            
            for i in range(0,nc):
                writer.writerow([i+1,RMSEP_cal[i]])
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal.png',RMSEP_cals=RMSEP_single_cals)
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal_good.png',RMSEP_good=RMSEP_cal_good)
        
    # plot RMSEs
    ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot.png')
    
    
   
   #Write output info to files

    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_Q_res.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=["Sample","Spectrum","Fold","True Comp"]
        row.extend(range(1,nc+1))
        writer.writerow(row)        
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(Q_res[i,:])
            writer.writerow(row)
    with open(outpath+which_elem+'_'+str(mincomp)+'-'+str(maxcomp)+'_quartiles.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=[which_elem]
        writer.writerow(row)
        row=['Min',numpy.percentile(comps[:,compindex],0)]
        writer.writerow(row)
        row=['1st Quartile',numpy.percentile(comps[:,compindex],25)]
        writer.writerow(row)
        row=['Median',numpy.percentile(comps[:,compindex],50)]
        writer.writerow(row)
        row=['3rd Quartile',numpy.percentile(comps[:,compindex],75)]
        writer.writerow(row)
        row=['Max',numpy.percentile(comps[:,compindex],100)]
        writer.writerow(row)

    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_HotellingT2.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=["Sample","Spectrum","Fold","True Comp"]
        row.extend(range(1,nc+1))
        writer.writerow(row)        
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(T2[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSECV.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSECV (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSECV[i]])
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEC.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSEC (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSEC[i]])
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSEP (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSEP[i]])
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_cv_predict.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(train_predict_cv[i,:])
            writer.writerow(row)
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_train_predict.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(trainset_results[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_test_predict.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_test)):
            row=[names_test[i],spect_index_test[i],folds_test[i],comps_test[i]]
            row.extend(testset_results[i,:])
            writer.writerow(row)
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_all_predict.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names)):
            row=[names[i],spect_index[i],folds[i],comps[i,compindex]]
            row.extend(results[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_beta_coeffs.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['wvl']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(wvl)):
            row=[wvl[i]]
            row.extend(beta[i,:])
            writer.writerow(row)        
    
    if skscale==False:
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_meancenters.csv','wb') as writefile:
            writer=csv.writer(writefile,delimiter=',')        
            writer.writerow([which_elem+' mean',Y_mean])
            for i in range(0,len(wvl)):
                row=[wvl[i],X_mean[i]]
                writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_inputinfo.csv','wb') as writefile:
        writer=csv.writer(writefile,delimiter=',')        
        writer.writerow(['Spectral database =',dbfile])
        writer.writerow(['Spectra Kept =',keepfile])
        writer.writerow(['Spectra Removed =',which_removed])
        writer.writerow(['Fold Definition =',foldfile])
        writer.writerow(['Test Fold =',testfold])
        writer.writerow(['Mask File =',maskfile])
        writer.writerow(['Algorithm =',plstype_string])
        writer.writerow(['# of components =',nc])
        writer.writerow(['Normalization Type =',normtype])
        writer.writerow(['Composition Min. =',mincomp])
        writer.writerow(['Composition Max. =',maxcomp])
Example #9
def get_r2(x_learn, x_valid, y_learn, y_valid, regressor='pls'):
    """
    The function Get_R2 takes features and labels from the learning and validation set.

    When using 'pls' as regressor, the MSE is calculated for all LOOCV sets for predicted vs true labels
    (mse = mean_squared_error(y_test_loo, y_pred_loo) for a fixed number of components for PLS regression.
    In the next iteration, the number of components is increased by 1 (number_of_components += 1)
    and the MSE is calculated for this regressor. The loop breaks if i > 9.
    Finally, the model of the single AAindex model with the lowest MSE is chosen.

    When using other regressors the parameters are tuned using GridSearchCV.

    This function returnes performance (R2, (N)RMSE, Pearson's r) and model parameters.
    """
    regressor = regressor.lower()
    mean_squared_error_list = []

    if regressor == 'pls':
        # PLS regression with LOOCV n_components tuning as described by Cadet et al.
        # https://doi.org/10.1186/s12859-018-2407-8
        # https://doi.org/10.1038/s41598-018-35033-y
        # Hyperparameter (N component) tuning of PLS regressor
        for n_comp in range(1, 10):  # n_comp = 1, 2,..., 9
            pls = PLSRegression(n_components=n_comp)
            loo = LeaveOneOut()

            y_pred_loo = []
            y_test_loo = []

            for train, test in loo.split(x_learn):
                x_learn_loo = []
                y_learn_loo = []
                x_test_loo = []

                for j in train:
                    x_learn_loo.append(x_learn[j])
                    y_learn_loo.append(y_learn[j])

                for k in test:
                    x_test_loo.append(x_learn[k])
                    y_test_loo.append(y_learn[k])

                pls.fit(x_learn_loo, y_learn_loo)
                y_pred_loo.append(pls.predict(x_test_loo)[0][0])

            mse = mean_squared_error(y_test_loo, y_pred_loo)

            mean_squared_error_list.append(mse)

        mean_squared_error_list = np.array(mean_squared_error_list)
        # idx = np.where(...) finds best number of components
        idx = np.where(mean_squared_error_list == np.min(mean_squared_error_list))[0][0] + 1

        # Model is fitted with best n_components (lowest MSE)
        best_params = {'n_components': idx}
        regressor_ = PLSRegression(n_components=best_params.get('n_components'))

    # other regression options (CV tuning)
    elif regressor == 'pls_cv':
        params = {'n_components': list(np.arange(1, 10))}  # n_comp = 1, 2,..., 9
        regressor_ = GridSearchCV(PLSRegression(), param_grid=params, iid=False,
                                  cv=5)  # the iid parameter is deprecated and removed in newer scikit-learn versions
    elif regressor == 'rf':
        params = {                 # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073
            'random_state': [42],  # fixed random seed for reproducibility
            'n_estimators': [100, 250, 500, 1000],  # number of individual decision trees in the forest
            'max_features': ['auto', 'sqrt', 'log2']  # 'auto' -> max_features=n_features,
            # 'sqrt' -> max_features=sqrt(n_features), 'log2' -> max_features=log2(n_features)
        }
        regressor_ = GridSearchCV(RandomForestRegressor(), param_grid=params, iid=False, cv=5)

    elif regressor == 'svr':
        params = {                      # similar parameter grid as Xu et al.
            'C': [2 ** 0, 2 ** 2, 2 ** 4, 2 ** 6, 2 ** 8, 2 ** 10, 2 ** 12],  # Regularization parameter
            'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]  # often 1 / n_features or 1 / (n_features * X.var())
        }
        regressor_ = GridSearchCV(SVR(), param_grid=params, iid=False, cv=5)

    elif regressor == 'mlp':
        params = {
            # feedforward network trained via backpropagation – here only using a single hidden layer
            'hidden_layer_sizes': [i for i in range(1, 12)],  # size of the single hidden layer: 1, 2, ..., 11
            'activation': ['relu'],  # rectified linear unit
            'solver': ['adam', 'lbfgs'],  # Adam (a method for stochastic optimization) or limited-memory BFGS
            'learning_rate': ['constant'],  # learning rate schedule; the rate itself is given by 'learning_rate_init'
            'learning_rate_init': [0.001, 0.01, 0.1],  # only used when solver='sgd' or 'adam'
            'max_iter': [1000, 200],  # for stochastic solvers ('sgd', 'adam') this determines the number of epochs
            'random_state': [42]
        }
        regressor_ = GridSearchCV(MLPRegressor(), param_grid=params, iid=False, cv=5)

    else:
        raise SystemError("Did not find the specified regression model among the valid options. See '--help' for "
                          "valid regression model options.")

    regressor_.fit(x_learn, y_learn)  # fit model

    if regressor != 'pls':      # take best parameters for the regressor and the AAindex
        best_params = regressor_.best_params_

    y_pred = []
    for y_p in regressor_.predict(x_valid):  # predict validation entries with fitted model
        y_pred.append(float(y_p))

    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    nrmse = rmse / np.std(y_valid, ddof=1)
    # ranks for Spearman's rank correlation
    y_val_rank = np.array(y_valid).argsort().argsort()
    y_pred_rank = np.array(y_pred).argsort().argsort()
    with warnings.catch_warnings():  # catch the RuntimeWarning raised when an array has no variance, e.g. [2, 2, 2, 2],
        warnings.simplefilter("ignore")  # which would lead to a division by zero in the correlation
        pearson_r = np.corrcoef(y_valid, y_pred)[0][1]
        spearman_rho = np.corrcoef(y_val_rank, y_pred_rank)[0][1]

    return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params
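
A minimal, hypothetical call of get_r2 is sketched below (it is not part of the original file): the random arrays only stand in for the AAindex-encoded sequence features and fitness labels this function normally receives, and the imports used inside get_r2 (numpy, scikit-learn) are assumed to be available in the surrounding module.

import numpy as np

rng = np.random.default_rng(42)
x_learn_demo = rng.random((20, 12)).tolist()   # placeholder encoded learning features
y_learn_demo = rng.random(20).tolist()         # placeholder learning labels
x_valid_demo = rng.random((8, 12)).tolist()    # placeholder encoded validation features
y_valid_demo = rng.random(8).tolist()          # placeholder validation labels

r2, rmse, nrmse, pearson_r, spearman_rho, reg_name, best_params = get_r2(
    x_learn_demo, x_valid_demo, y_learn_demo, y_valid_demo, regressor='pls')
print(reg_name, best_params, round(r2, 3))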
Example #10
0
def plot_processing_results(figure_name, num_PCs, training_error=0, train_conc=False,
                            mixture_error=0, mix_conc=False):
    r2 = []
    rmse = []
    r2_centered = []
    rmse_centered = []
    r2_cscaled = []
    rmse_cscaled = []
    r2_scaled = []
    rmse_scaled = []
    r2_scaled_std = []
    rmse_scaled_std = []
    r2_PLS = []
    rmse_PLS = []
    r2_PLS_scaled = []
    rmse_PLS_scaled = []

    crange = 10**np.linspace(-3, 1, num=25, endpoint=True)
    cratio = crange * 4**4 / 4
    print(cratio)
    for c in crange:
        intensities, concentrations = get_training_data(c, training_error,
                                                         conc_error=train_conc)
        mixture_int, mixture_conc = get_mixture(c, mixture_error,
                                                conc_error=mix_conc)

        pca = PCA(intensities,
                  concentrations,
                  center=False,
                  scale=False,
                  add_constant=True)
        pca.get_PCs_and_regressors(num_PCs)
        prediction = pca.predict(mixture_int)
        r2.append(PCA.get_r2(mixture_conc, prediction))
        rmse.append(PCA.get_rmse(mixture_conc, prediction))

        #centered
        pca_centered = PCA(intensities,
                           concentrations,
                           center=True,
                           scale=False)
        pca_centered.get_PCs_and_regressors(num_PCs)
        prediction = pca_centered.predict(mixture_int)
        r2_centered.append(PCA.get_r2(mixture_conc, prediction))
        rmse_centered.append(PCA.get_rmse(mixture_conc, prediction))
        #centered and scaled sigma^2
        pca_cscaled = PCA(intensities, concentrations, center=True, scale=True)
        pca_cscaled.get_PCs_and_regressors(num_PCs)
        prediction = pca_cscaled.predict(mixture_int)
        r2_cscaled.append(PCA.get_r2(mixture_conc, prediction))
        rmse_cscaled.append(PCA.get_rmse(mixture_conc, prediction))

        #scaled sigma^2
        pca_scaled = PCA(intensities,
                         concentrations,
                         center=False,
                         scale=True,
                         add_constant=True)
        pca_scaled.get_PCs_and_regressors(num_PCs)
        prediction = pca_scaled.predict(mixture_int)
        r2_scaled.append(PCA.get_r2(mixture_conc, prediction))
        rmse_scaled.append(PCA.get_rmse(mixture_conc, prediction))

        #scaled sigma
        pca_scaled_std = PCA(intensities,
                             concentrations,
                             center=False,
                             scale=True,
                             scale_type='std',
                             add_constant=True)
        pca_scaled_std.get_PCs_and_regressors(num_PCs)
        prediction = pca_scaled_std.predict(mixture_int)
        r2_scaled_std.append(PCA.get_r2(mixture_conc, prediction))
        rmse_scaled_std.append(PCA.get_rmse(mixture_conc, prediction))

        #PLS
        PLS = PLSRegression(n_components=num_PCs, scale=False, max_iter=500,
                            tol=1e-06, copy=True, center=True)
        PLS.fit(intensities, concentrations)
        prediction = PLS.predict(mixture_int)
        r2_PLS.append(PCA.get_r2(mixture_conc, prediction))
        rmse_PLS.append(PCA.get_rmse(mixture_conc, prediction))

        #PLS scaled
        PLS_scaled = PLSRegression(n_components=num_PCs, scale=True,
                                   max_iter=500, tol=1e-06, copy=True, center=True)
        PLS_scaled.fit(intensities, concentrations)
        prediction = PLS_scaled.predict(mixture_int)
        r2_PLS_scaled.append(PCA.get_r2(mixture_conc, prediction))
        rmse_PLS_scaled.append(PCA.get_rmse(mixture_conc, prediction))

    r2 = np.array(r2)
    r2_centered = np.array(r2_centered)
    r2_scaled = np.array(r2_scaled)
    r2_cscaled = np.array(r2_cscaled)
    r2_scaled_std = np.array(r2_scaled_std)
    r2_PLS = np.array(r2_PLS)
    r2_PLS_scaled = np.array(r2_PLS_scaled)

    rmse = np.array(rmse)
    rmse_centered = np.array(rmse_centered)
    rmse_scaled = np.array(rmse_scaled)
    rmse_cscaled = np.array(rmse_cscaled)
    rmse_scaled_std = np.array(rmse_scaled_std)
    rmse_PLS = np.array(rmse_PLS)
    rmse_PLS_scaled = np.array(rmse_PLS_scaled)
    """
    plt.figure(0)
    plt.plot(cratio,np.mean(r2,axis=1),'blue')
    plt.plot(cratio,np.mean(r2_centered,axis=1),'og')
    plt.plot(cratio,np.mean(r2_scaled,axis=1),':r')
    plt.plot(cratio,np.mean(r2_cscaled,axis=1),':g')
    plt.plot(cratio,np.mean(r2_scaled_std,axis=1),'^r')
    plt.plot(cratio,np.mean(r2_PLS,axis=1),'orange')
    plt.plot(cratio,np.mean(r2_PLS_scaled,axis=1),marker='s',color='orange',linewidth=0)
    plt.legend(['PCA unprocessed','PCA centered','PCA $\sigma^{2}$ scaled'
                ,'PCA centered and $\sigma^{2}$ scaled', 'PCA $\sigma$ scaled'
                ,'PLS centered', 'PLS centered and $\sigma$ scaled'])
    plt.xscale('log')
    plt.xlabel('max(squared elements)/max(linear elements)')
    plt.ylabel('Average R$^{2}$')
    plt.show()
    """

    plt.figure(1, figsize=(3.5, 3.5))
    Figure_folder = os.path.join(os.path.expanduser("~"), 'Downloads')
    plt.plot(cratio, np.mean(rmse, axis=1), 'blue')
    plt.plot(cratio, np.mean(rmse_centered, axis=1), 'og')
    plt.plot(cratio, np.mean(rmse_scaled, axis=1), ':r')
    plt.plot(cratio, np.mean(rmse_cscaled, axis=1), ':g')
    plt.plot(cratio, np.mean(rmse_scaled_std, axis=1), '^r')
    plt.plot(cratio, np.mean(rmse_PLS, axis=1), color='orange')
    plt.plot(cratio,
             np.mean(rmse_PLS_scaled, axis=1),
             marker='s',
             color='orange',
             linewidth=0)
    plt.legend([
        'PCA unprocessed', 'PCA centered', 'PCA $\sigma^{2}$ scaled',
        'PCA centered and $\sigma^{2}$ scaled', 'PCA $\sigma$ scaled',
        'PLS centered', 'PLS centered and $\sigma$ scaled'
    ],
               loc=4)
    plt.xlabel('Max(squared elements)/max(linear elements)')
    #plt.ylabel('Average RMSE')
    plt.xscale('log')
    plt.yscale('log')
    plt.ylim([6 * 10**-4, 4])
    #plt.xticks([])
    #plt.yticks([])
    figure_file = os.path.join(Figure_folder, figure_name + '.jpg')
    plt.savefig(figure_file, format='jpg')
    plt.close()

    plt.figure(2)
    plt.plot(cratio, r2[:, 2], 'blue')
    plt.plot(cratio, r2_centered[:, 2], 'og')
    plt.plot(cratio, r2_scaled[:, 2], ':r')
    plt.plot(cratio, r2_cscaled[:, 2], ':g')
    plt.plot(cratio, r2_scaled_std[:, 2], '^r')
    plt.plot(cratio, r2_PLS[:, 2], 'orange')
    plt.plot(cratio,
             r2_PLS_scaled[:, 2],
             marker='s',
             color='orange',
             linewidth=0)
    plt.legend([
        'PCA unprocessed', 'PCA centered', 'PCA $\sigma^{2}$ scaled',
        'PCA centered and $\sigma^{2}$ scaled', 'PCA $\sigma$ scaled',
        'PLS centered', 'PLS centered and $\sigma$ scaled'
    ])
    plt.xscale('log')
    plt.xlabel('Max(squared elements)/max(linear elements)')
    plt.ylabel('Average R$^{2}$')
    plt.show()

    plt.figure(3)
    plt.plot(cratio, rmse[:, 2], 'blue')
    plt.plot(cratio, rmse_centered[:, 2], 'og')
    plt.plot(cratio, rmse_scaled[:, 2], ':r')
    plt.plot(cratio, rmse_cscaled[:, 2], ':g')
    plt.plot(cratio, rmse_scaled_std[:, 2], '^r')
    plt.plot(cratio, rmse_PLS[:, 2], color='orange')
    plt.plot(cratio,
             rmse_PLS_scaled[:, 2],
             marker='s',
             color='orange',
             linewidth=0)
    plt.legend([
        'PCA unprocessed', 'PCA centered', 'PCA $\sigma^{2}$ scaled',
        'PCA centered and $\sigma^{2}$ scaled', 'PCA $\sigma$ scaled',
        'PLS centered', 'PLS centered and $\sigma$ scaled'
    ])
    plt.xlabel('Max(squared elements)/max(linear elements)')
    plt.ylabel('Average RMSE')
    plt.xscale('log')
    plt.show()
Example #11
0
x_axis = np.arange(1, np.linalg.matrix_rank(X) + 1)
plt.scatter(x_axis, cummulative_variance_explained)
plt.plot(x_axis, cummulative_variance_explained)
plt.title("Scree Plot")
plt.xlabel("Number of latent vectors used")
plt.ylabel("Percentage of variance explained")
plt.xticks(x_axis, x_axis)
plt.yticks()
plt.show()

# compare to sklearn package results to verify accuracy
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

X = [[1, 5, 10], [2, 4, 8], [3, 4, 8], [4, 5, 10]]
y = [41, 49, 69, 65]

X = StandardScaler().fit_transform(X)  # population stdev
y = StandardScaler().fit_transform(np.array(y).reshape(-1, 1))  # population stdev; the scaler expects a 2D array

pls1 = PLSRegression(n_components=2)
scores = pls1.fit_transform(X, y)
T = pls1.x_scores_
W = pls1.x_weights_
P = pls1.y_loadings_

y_pred = pls1.predict(X)
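
As a small added check (not in the original snippet), the training-set R² of the scikit-learn fit can be printed as a quick sanity check against the manual implementation being verified above:

from sklearn.metrics import r2_score
print("sklearn PLS training R^2:", r2_score(y, y_pred))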
Example #12
0
import pandas as pd
#from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import scipy
import scipy.io as sio
from sklearn.cross_decomposition import PLSRegression

train = sio.loadmat('Train.mat')
test = sio.loadmat('Dev.mat')
X_train = train["final1"][:, 1:620]
y_train = train["final1"][:, 0]
X_test = test["final"][:, 1:620]
y_test = test["final"][:, 0]
pls2 = PLSRegression(n_components=11)
pls2.fit(X_train, y_train)

XX = [[1, 2, 2], [2, 3, 4]]

y_pred = pls2.predict(X_test)

res = [None] * 54
for i in range(54):
    if y_pred[i] < 7:
        res[i] = 1
    elif y_pred[i] < 20:
        res[i] = 2
    else:
        res[i] = 3
Example #13
0
def prediction(x_calib, y_calib, x_valid, y_valid, plot_components=False):
    mse = []
    component = np.arange(1, 30)
    for i in component:
        pls = PLSRegression(n_components=i)
        pls.fit(x_calib, y_calib)
        y_pred = pls.predict(x_valid)

        mse_p = mean_squared_error(y_valid, y_pred)
        mse.append(mse_p)

        comp = 100 * (i + 1) / 30
        stdout.write("\r%d%% completed" % comp)
        stdout.flush()
    stdout.write("\n")

    msemin = np.argmin(mse)
    print("Suggested number of components: ", msemin + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context(('ggplot')):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[msemin],
                     np.array(mse)[msemin],
                     'P',
                     ms=10,
                     mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(xmin=-1)

        plt.show(block=False)
        _ = input("Press [enter] to continue.")  # raw_input is Python 2 only
    pls = PLSRegression(n_components=msemin + 1)
    pls.fit(x_calib, y_calib)

    startTime = time.time()
    y_pred = pls.predict(x_valid)
    endTime = time.time()
    print('Time elapsed: %s seconds' % (endTime - startTime))

    lb = preprocessing.LabelBinarizer()
    score_p = r2_score(y_valid, y_pred)
    mse_p = mean_squared_error(y_valid, y_pred)

    lb.fit_transform(y_valid)

    score = r2_score(y_valid, y_pred)
    print('R2: %5.3f' % score_p)
    print('MSE: %5.3f' % mse_p)

    #print

    pr = lb.inverse_transform(y_pred)
    ac = lb.inverse_transform(y_valid)

    #print type(pr[0])
    #print ac

    sum = 0
    for j in range(len(pr)):
        if np.array_equal(pr[j], ac[j]):
            sum += 1
    print('Accuracy: ' + str((float(sum) / float(len(pr))) * 100) + '%')
Example #14
0
         len(db_comps_trainset[0, :])])
    test_result = numpy.zeros(
        [nc, len(db_comps_testset[:]),
         len(db_comps_testset[0, :])])

    RMSECV = numpy.zeros(
        [nc, len(db_ox_list),
         len(numpy.unique(db_folds_trainset))])

    #loop through each number of components

    jj = numpy.array([0, 2, 3, 4, 5, 6])
    for k in range(1, nc + 1):
        #calculate full training model for the current number of components
        print(k)
        PLS1 = PLSRegression(n_components=k)
        x_train = numpy.transpose(db_spectra_trainset)
        x_train_mean = numpy.mean(x_train, axis=0)
        x_train_meancenter = x_train - numpy.tile(x_train_mean,
                                                  (x_train.shape[0], 1))

        y_train = numpy.transpose(db_comps_trainset[i, :])
        y_train_mean = numpy.mean(y_train)
        y_train_meancenter = y_train - y_train_mean

        x_test = numpy.transpose(db_spectra_testset)
        x_test_meancenter = x_test - numpy.tile(x_train_mean,
                                                (x_test.shape[0], 1))

        y_test = numpy.transpose(db_comps_testset[i, :])
Example #15
0
def make_plots(m,
               data,
               colors,
               names,
               groundtruth=None,
               waves=None,
               sample_size=10,
               ux=0,
               remove_mean=False,
               log_x=False,
               ylim=None,
               res_out='',
               title=None):
    inds_sup_train = np.random.choice(data['X'].shape[0], size=sample_size)
    inds_sup_valid = np.random.choice(data['X_valid'].shape[0],
                                      size=sample_size)
    inds_train_x = np.random.choice(data['X_'].shape[0], size=sample_size)
    inds_train_y = np.random.choice(data['_y'].shape[0], size=sample_size)

    y = np.hstack([data['y'], 1 - data['y'].sum(axis=1, keepdims=True)])
    y_valid = np.hstack(
        [data['y_valid'], 1 - data['y_valid'].sum(axis=1, keepdims=True)])
    y_corners = np.vstack((np.eye(data['y'].shape[1]),
                           np.zeros(data['y'].shape[1]))).astype('float32')

    if waves is None:
        waves = np.arange(data['X'].shape[1])

    if remove_mean:
        _ux = ux
    else:
        _ux = 0

    if log_x:
        f = lambda x: np.exp(x)
    else:
        f = lambda x: x

    force_ylim = False
    if ylim is not None:
        force_ylim = True

    pls_XY = PLSRegression(n_components=8, scale=False)
    pls_XY.fit(data['X'], y)
    pred_train_pls = pls_XY.predict(data['X'])
    pred_train_pls = (pred_train_pls.T / np.sum(pred_train_pls, axis=1)).T
    pred_valid_pls = pls_XY.predict(data['X_valid'])
    pred_valid_pls = (pred_valid_pls.T / np.sum(pred_valid_pls, axis=1)).T
    score_pred_train_pls = KL(pred_train_pls, y)
    score_pred_valid_pls = KL(pred_valid_pls, y_valid)

    pls_YX = PLSRegression(n_components=min(8, y.shape[1]), scale=False)
    pls_YX.fit(y, data['X'])
    gen_train_pls = pls_YX.predict(y)
    gen_valid_pls = pls_YX.predict(y_valid)
    score_gen_train_pls = L2(gen_train_pls, data['X'])
    score_gen_valid_pls = L2(gen_valid_pls, data['X_valid'])

    pred_train = m.predict(x=data['X'], deterministic=True)
    pred_train = np.hstack(
        [pred_train, 1 - pred_train.sum(axis=1, keepdims=True)])
    score_pred_train = KL(pred_train, y)
    pred_valid = m.predict(x=data['X_valid'], deterministic=True)
    pred_valid = np.hstack(
        [pred_valid, 1 - pred_valid.sum(axis=1, keepdims=True)])
    score_pred_valid = KL(pred_valid, y_valid)

    if m.model_type in [1, 2]:
        z2_train = m.getZ2(x=data['X'], y=data['y'], deterministic=True)
        z2_valid = m.getZ2(x=data['X_valid'],
                           y=data['y_valid'],
                           deterministic=True)
        z2_train_mean = z2_train.mean(axis=0)
        z2_valid_mean = z2_valid.mean(axis=0)
        z2_gen_train = z2_train_mean * np.ones_like(z2_train).astype('float32')
        z2_gen_valid = z2_valid_mean * np.ones_like(z2_valid).astype('float32')
        z2_gen_endmembers = z2_train_mean * np.ones(
            (y_corners.shape[0], z2_train.shape[1])).astype('float32')
        gen_train = f(
            _ux + m.generate(y=data['y'][inds_sup_train],
                             z2=z2_gen_train[inds_sup_train],
                             deterministic=True)
        )  # true by default for non-variational, variational default is False
        gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],
                                       z2=z2_gen_valid[inds_sup_valid],
                                       deterministic=True))
        endmembers = f(
            _ux +
            m.generate(y=y_corners, z2=z2_gen_endmembers, deterministic=True))
        if m.variational:
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [
                    f(_ux + m.generate(y=np.atleast_2d(c),
                                       z2=z2_gen_endmembers[idx_c:idx_c + 1],
                                       deterministic=False)).squeeze()
                    for i in range(sample_size)
                ]
                endmembers_dists += [np.asarray(endmembers_dist)]
    else:
        gen_train = f(
            _ux + m.generate(y=data['y'][inds_sup_train], deterministic=True)
        )  # true by default for non-variational, variational default is False
        gen_valid = f(
            _ux +
            m.generate(y=data['y_valid'][inds_sup_valid], deterministic=True))
        endmembers = f(_ux + m.generate(y=y_corners, deterministic=True))
        if m.variational:
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [
                    f(_ux + m.generate(y=np.atleast_2d(c), deterministic=False)
                      ).squeeze() for i in range(sample_size)
                ]
                endmembers_dists += [np.asarray(endmembers_dist)]
    recon_train = f(_ux +
                    m.generate(x=data['X_'][inds_train_x], deterministic=True))
    recon_sup_valid = f(
        _ux +
        m.generate(x=data['X_valid'][inds_sup_valid], deterministic=True))

    fs = 24
    fs_tick = 18

    # change xticks to be names
    p = 100
    plt.plot(p * y[inds_sup_train][0], 'k', lw=2, label='Ground Truth')
    ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_train)
    plt.plot(p * pred_train[inds_sup_train][0], 'r-.', lw=2, label=ssdgm_label)
    pls_label = 'PLS ({:.3f})'.format(score_pred_train_pls)
    plt.plot(p * pred_train_pls[inds_sup_train][0],
             'b-.',
             lw=2,
             label=pls_label)
    plt.plot(p * y[inds_sup_train].T, 'k', lw=2)
    plt.plot(p * pred_train[inds_sup_train].T, 'r-.', lw=2)
    plt.plot(p * pred_train_pls[inds_sup_train].T, 'b-.', lw=2)
    plt.title('Predicting Composition - Training Error', fontsize=fs)
    plt.ylabel('Composition (%)', fontsize=fs)
    ax = plt.gca()
    ax.set_ylim((0, 1 * p))
    ax.set_xticks(np.arange(y.shape[1]))
    ax.set_xticklabels(names, fontsize=fs)
    ax.tick_params(axis='x',
                   direction='out',
                   top=False,
                   length=10,
                   labelsize=fs_tick)
    lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax = plt.gca()
    plt.savefig(res_out + '/comp_train.png',
                additional_artists=[lgd],
                bbox_inches='tight')
    plt.close()

    plt.plot(p * y_valid[inds_sup_valid][0], 'k', lw=2, label='Ground Truth')
    ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_valid)
    plt.plot(p * pred_valid[inds_sup_valid][0], 'r-.', lw=2, label=ssdgm_label)
    pls_label = 'PLS ({:.3f})'.format(score_pred_valid_pls)
    plt.plot(p * pred_valid_pls[inds_sup_valid][0],
             'b-.',
             lw=2,
             label=pls_label)
    plt.plot(p * y_valid[inds_sup_valid].T, 'k', lw=2)
    plt.plot(p * pred_valid[inds_sup_valid].T, 'r-.', lw=2)
    plt.plot(p * pred_valid_pls[inds_sup_valid].T, 'b-.', lw=2)
    plt.title('Predicting Composition - Validation Error', fontsize=fs)
    plt.ylabel('Composition (%)', fontsize=fs)
    ax = plt.gca()
    ax.set_ylim((0, 1 * p))
    ax.set_xticks(np.arange(y.shape[1]))
    ax.set_xticklabels(names, fontsize=fs)
    ax.tick_params(axis='x',
                   direction='out',
                   top=False,
                   length=10,
                   labelsize=fs_tick)
    lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax = plt.gca()
    plt.savefig(res_out + '/comp_valid.png',
                additional_artists=[lgd],
                bbox_inches='tight')
    plt.close()

    plt.plot(waves, f(_ux + data['X'][inds_sup_train]).T, 'k')
    plt.plot(waves, gen_train.T, 'r-.')
    plt.title('Generating Spectra - Training Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out + '/genspectra_train.png')
    plt.close()

    plt.plot(waves, f(_ux + data['X_valid'][inds_sup_valid]).T, 'k')
    plt.plot(waves, gen_valid.T, 'r-.')
    plt.title('Generating Spectra - Validation Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out + '/genspectra_valid.png')
    plt.close()

    if m.variational:
        for endmember, dist, color, name in zip(endmembers, endmembers_dists,
                                                colors, names):
            plt.plot(waves, endmember, color=color, lw=2, label=name)
            plt.plot(waves, dist.T, '-.', color=color, lw=1)
            plt.title('Generating ' + name + ' with Distribution', fontsize=fs)
            plt.xlabel('Channels', fontsize=fs)
            plt.ylabel('Intensities', fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            ax = plt.gca()
            if force_ylim:
                ax.set_ylim(ylim)
            plt.savefig(res_out + '/endmembers_dist_' + name + '.png',
                        additional_artists=[lgd],
                        bbox_inches='tight')
            plt.close()

    for endmember, color, name in zip(endmembers, colors, names):
        plt.plot(waves, endmember, color=color, lw=2, label=name)
        plt.title('Generating ' + name, fontsize=fs)
        plt.xlabel('Channels', fontsize=fs)
        plt.ylabel('Intensities', fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs_tick)
        lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # if m.variational:
        #     plt.gca().set_ylim(ax.get_ylim())
        if force_ylim:
            plt.gca().set_ylim(ylim)
        plt.savefig(res_out + '/endmembers_mean_' + name + '.png',
                    additional_artists=[lgd],
                    bbox_inches='tight')
        plt.close()

    # if m.variational:
    #     for endmember, color, name in zip(endmembers,colors,names):
    #         plt.plot(waves,endmember,color=color,lw=2,label=name)
    #     for endmember_dist, color in zip(endmembers_dists,colors):
    #         plt.plot(waves,endmember_dist.T,'-.',color=color,lw=1)
    #     plt.title('Generating Endmembers with Distributions', fontsize=fs)
    #     plt.xlabel('Channels', fontsize=fs)
    #     plt.ylabel('Intensities', fontsize=fs)
    #     plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    #     lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    #     ax = plt.gca()
    #     if force_ylim:
    #         ax.set_ylim(ylim)
    #     plt.savefig(res_out+'/endmembers_dist.png',additional_artists=[lgd],bbox_inches='tight')
    #     plt.close()

    # for endmember, color, name in zip(endmembers,colors,names):
    #     plt.plot(waves,endmember,color=color,lw=2,label=name)
    # plt.title('Generating Endmembers', fontsize=fs)
    # plt.xlabel('Channels', fontsize=fs)
    # plt.ylabel('Intensities', fontsize=fs)
    # plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    # lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    # if m.variational:
    #     plt.gca().set_ylim(ax.get_ylim())
    # if force_ylim:
    #     plt.gca().set_ylim(ylim)
    # plt.savefig(res_out+'/endmembers_means.png',additional_artists=[lgd],bbox_inches='tight')
    # plt.close()

    # for endmember, color, name in zip(endmembers,colors,names):
    #     plt.plot(waves,endmember,color=color,lw=2,label=name)
    # for endmember, color, name in zip(groundtruth,colors,names):
    #     plt.plot(waves,endmember[:len(waves)],color=color,lw=6,alpha=0.4)
    # score_gen_endmembers = L2(endmembers,groundtruth[:,:len(waves)])
    # if title is None:
    #     plt.title('Generating Endmembers with Ground Truth ({:.3f})'.format(score_gen_endmembers), fontsize=fs)
    # else:
    #     plt.title(title+' ({:.3f})'.format(score_gen_endmembers), fontsize=fs)
    # plt.xlabel('Channels', fontsize=fs)
    # plt.ylabel('Intensities', fontsize=fs)
    # plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    # lgd = plt.legend(loc='lower right', fontsize=fs)
    # # lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    # if m.variational:
    #     plt.gca().set_ylim(ax.get_ylim())
    # if force_ylim:
    #     plt.gca().set_ylim(ylim)
    # plt.savefig(res_out+'/endmembers_means_with_groundtruth.png',additional_artists=[lgd],bbox_inches='tight')
    # plt.close()

    if groundtruth is not None:
        score_gen_endmembers = L2(endmembers, groundtruth[:, :len(waves)])
        for endmember, gt, color, name in zip(endmembers, groundtruth, colors,
                                              names):
            plt.plot(waves, endmember, color=color, lw=2, label=name)
            plt.plot(waves, gt[:len(waves)], color=color, lw=6, alpha=0.4)
            score_gen_endmember = L2(endmember, gt[:len(waves)])
            plt.title(
                'Generating ' + name +
                ' with Ground Truth ({:.3f})'.format(score_gen_endmember),
                fontsize=fs)
            plt.xlabel('Channels', fontsize=fs)
            plt.ylabel('Intensities', fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            # if m.variational:
            #     plt.gca().set_ylim(ax.get_ylim())
            if force_ylim:
                plt.gca().set_ylim(ylim)
            plt.savefig(res_out + '/endmembers_mean_with_groundtruth_' + name +
                        '.png',
                        additional_artists=[lgd],
                        bbox_inches='tight')
            plt.close()

    plt.plot(waves, f(_ux + data['X_'][inds_train_x]).T, 'k')
    plt.plot(waves, recon_train.T, 'r-.')
    plt.title('Reconstructing Spectra - Training Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out + '/recon_train.png')
    plt.close()

    plt.plot(waves, f(_ux + data['X_valid'][inds_sup_valid]).T, 'k')
    plt.plot(waves, recon_sup_valid.T, 'r-.')
    plt.title('Reconstructing Spectra - Validation Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out + '/recon_valid.png')
    plt.close()

    if m.model_type in [1, 2]:
        # need to use vertical lines to denote edges of datasets
        # write dataset i in middle of range on xlabel
        for i in range(z2_train.shape[1]):
            plt.plot(z2_train[:, i], 'r-.')
            plt.title('Nuisance Variable ' + str(i) + ' - Training',
                      fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out + '/nuisance_train_' + str(i) + '.png')
            plt.close()

            plt.plot(z2_valid[:, i], 'r-.')
            ax = plt.gca()
            ylim = ax.get_ylim()
            # should make this general if possible
            plt.plot([1866, 1866], [-5, 5], 'k--')
            plt.plot([1866 + 1742, 1866 + 1742], [-5, 5], 'k--')
            # plt.plot([1866+1742+1746,1866+1742+1746],[-5,5],'k--')
            ax.set_ylim(ylim)
            plt.title('Nuisance Variable ' + str(i) + ' - Validation',
                      fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out + '/nuisance_valid_' + str(i) + '.png')
            plt.close()
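
# Added note: make_plots above relies on two scoring helpers, KL and L2, that are not shown
# in this snippet. The sketch below illustrates what they are assumed to compute (a mean
# Kullback-Leibler divergence between true and predicted composition vectors, and a mean
# squared difference between spectra); the original project may define them differently.
def KL_assumed(pred, true, eps=1e-12):
    pred = np.clip(np.asarray(pred, dtype=float), eps, None)
    true = np.clip(np.asarray(true, dtype=float), eps, None)
    return float(np.mean(np.sum(true * np.log(true / pred), axis=1)))


def L2_assumed(a, b):
    return float(np.mean((np.asarray(a, dtype=float) - np.asarray(b, dtype=float)) ** 2))
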
def save_model(path, aaindex_r2_list, learning_set, validation_set, threshold=5, regressor='pls',
               no_fft=False, train_on_all=False):
    """
    Function Save_Model saves the best -s THRESHOLD models as 'Pickle' files (pickle.dump),
    which can be loaded again for doing predictions. Also, in Save_Model included is the def cross_validation
    -based computing of the k-fold CV performance of the n component-optimized model on all data
    (learning + validation set); by default  k  is 5 (n_samples = 5).
    Plots of the CV performance for the t best models are stored inside the folder CV_performance.
    """
    regressor = regressor.lower()
    try:
        os.mkdir('CV_performance')
    except FileExistsError:
        pass
    try:
        os.mkdir('Pickles')
    except FileExistsError:
        pass

    try:
        os.remove('CV_performance/_CV_Results.txt')
    except FileNotFoundError:
        pass
    file = open('CV_performance/_CV_Results.txt', 'w')
    file.write('5-fold cross-validated performance of top models for validation set across all data.\n\n')
    if no_fft:
        file.write("No FFT used in this model construction, performance represents"
                   " model accuracies on raw encoded sequence data.\n\n")
    file.close()

    for t in range(threshold):
        try:
            idx = aaindex_r2_list[t][0]
            parameter = aaindex_r2_list[t][7]

            # Estimating the CV performance of the n_component-fitted model on all data
            xy_learn = XY(full_path(idx), learning_set)
            xy_test = XY(full_path(idx), validation_set)
            if no_fft is False:
                x_test, y_test, _ = xy_test.get_x_and_y()
                x_learn, y_learn, _ = xy_learn.get_x_and_y()
            else:
                _, y_test, x_test = xy_test.get_x_and_y()
                _, y_learn, x_learn = xy_learn.get_x_and_y()

            x = np.concatenate([x_learn, x_test])
            y = np.concatenate([y_learn, y_test])

            if regressor == 'pls' or regressor == 'pls_cv':
                # n_components according to lowest MSE for validation set
                regressor_ = PLSRegression(n_components=parameter.get('n_components'))

            elif regressor == 'rf':
                regressor_ = RandomForestRegressor(
                    random_state=parameter.get('random_state'),
                    n_estimators=parameter.get('n_estimators'),
                    max_features=parameter.get('max_features')
                )

            elif regressor == 'svr':
                regressor_ = SVR(C=parameter.get('C'), gamma=parameter.get('gamma'))

            elif regressor == 'mlp':
                regressor_ = MLPRegressor(
                    hidden_layer_sizes=parameter.get('hidden_layer_sizes'),
                    activation=parameter.get('activation'),
                    solver=parameter.get('solver'),
                    learning_rate=parameter.get('learning_rate'),
                    learning_rate_init=parameter.get('learning_rate_init'),
                    max_iter=parameter.get('max_iter'),
                    random_state=parameter.get('random_state')
                )

            else:
                raise SystemError("Did not find the specified regression model among the valid options. "
                                  "See '--help' for valid regression model options.")

            # perform 5-fold cross-validation on all data (on X and Y)
            n_samples = 5
            y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples)

            r_squared = r2_score(y_test_total, y_predicted_total)
            rmse = np.sqrt(mean_squared_error(y_test_total, y_predicted_total))
            stddev = np.std(y_test_total, ddof=1)
            nrmse = rmse / stddev
            pearson_r = np.corrcoef(y_test_total, y_predicted_total)[0][1]
            # ranks for Spearman correlation
            y_test_total_rank = np.array(y_test_total).argsort().argsort()
            y_predicted_total_rank = np.array(y_predicted_total).argsort().argsort()
            spearman_rho = np.corrcoef(y_test_total_rank, y_predicted_total_rank)[0][1]

            with open('CV_performance/_CV_Results.txt', 'a') as f:
                f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format(
                    regressor.upper(), parameter, idx[:-4]))
                f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};'
                        ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho))

            figure, ax = plt.subplots()
            ax.scatter(y_test_total, y_predicted_total, marker='o', s=20, linewidths=0.5, edgecolor='black')
            ax.plot([min(y_test_total) - 1, max(y_test_total) + 1],
                    [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=2)
            ax.legend([
                '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format(
                    round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3))
                + r'$\rho$ = {}'.format(str(round(spearman_rho, 3)))
            ])
            ax.set_xlabel('Measured')
            ax.set_ylabel('Predicted')
            plt.savefig('CV_performance/' + idx[:-4] + '_' + str(n_samples) + '-fold-CV.png', dpi=250)
            plt.close('all')

            if train_on_all:
                # fit on all available data (learning + validation set; FFT or noFFT is defined already above)
                regressor_.fit(x, y)
            else:
                # fit (only) on full learning set (FFT or noFFT is defined already above)
                regressor_.fit(x_learn, y_learn)

            file = open(os.path.join(path, 'Pickles/'+idx[:-4]), 'wb')
            pickle.dump(regressor_, file)
            file.close()

        except IndexError:
            break

    return ()
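
# Added sketch (hypothetical, not part of the original file): how a model pickled by
# save_model above could be loaded again for predictions. The file name and the feature
# matrix are placeholders; real inputs must be encoded exactly as during training
# (same AAindex, same FFT/no-FFT choice).
def load_pickled_model_example(pickle_path='Pickles/ANDN920101'):
    with open(pickle_path, 'rb') as model_file:
        loaded_regressor = pickle.load(model_file)
    x_new = np.zeros((1, 100))  # placeholder feature matrix with an assumed feature dimension
    return loaded_regressor.predict(x_new)
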
def prediction(X_calib, Y_calib, X_valid, Y_valid, plot_components=False):
    # Run PLS with a variable number of components (1 to 39) and calculate the MSE
    mse = []
    component = np.arange(1, 40)
    for i in component:
        pls = PLSRegression(n_components=i)
        # Fit
        pls.fit(X_calib, Y_calib)
        # Prediction
        Y_pred = pls.predict(X_valid)

        mse_p = mean_squared_error(Y_valid, Y_pred)
        mse.append(mse_p)

        comp = 100 * (i + 1) / 40
        # Trick to update status on the same line
        stdout.write("\r%d%% completed" % comp)
        stdout.flush()
    stdout.write("\n")

    # Calculate and print the position of minimum in MSE
    msemin = np.argmin(mse)
    print("Suggested number of components: ", msemin + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context(('ggplot')):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[msemin],
                     np.array(mse)[msemin],
                     'P',
                     ms=10,
                     mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(xmin=-1)

        plt.show()

    # Run PLS with suggested number of components
    pls = PLSRegression(n_components=msemin + 1)
    pls.fit(X_calib, Y_calib)
    Y_pred = pls.predict(X_valid)

    # Calculate and print scores
    score_p = r2_score(Y_valid, Y_pred)
    mse_p = mean_squared_error(Y_valid, Y_pred)
    sep = np.std(Y_pred[:, 0] - Y_valid)
    rpd = np.std(Y_valid) / sep
    bias = np.mean(Y_pred[:, 0] - Y_valid)

    print('R2: %5.3f' % score_p)
    print('MSE: %5.3f' % mse_p)
    print('SEP: %5.3f' % sep)
    print('RPD: %5.3f' % rpd)
    print('Bias: %5.3f' % bias)

    # Plot regression and figures of merit
    rangey = max(Y_valid) - min(Y_valid)
    rangex = max(Y_pred) - min(Y_pred)

    z = np.polyfit(Y_valid, Y_pred, 1)
    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(Y_pred, Y_valid, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * Y_valid, Y_valid, c='blue', linewidth=1)
        ax.plot(Y_valid, Y_valid, color='green', linewidth=1)
        plt.xlabel('Predicted')
        plt.ylabel('Measured')
        plt.title('Prediction')

        # Print the scores on the plot
        plt.text(
            min(Y_pred) + 0.05 * rangex,
            max(Y_valid) - 0.1 * rangey, 'R$^{2}=$ %5.3f' % score_p)
        plt.text(
            min(Y_pred) + 0.05 * rangex,
            max(Y_valid) - 0.15 * rangey, 'MSE: %5.3f' % mse_p)
        plt.text(
            min(Y_pred) + 0.05 * rangex,
            max(Y_valid) - 0.2 * rangey, 'SEP: %5.3f' % sep)
        plt.text(
            min(Y_pred) + 0.05 * rangex,
            max(Y_valid) - 0.25 * rangey, 'RPD: %5.3f' % rpd)
        plt.text(
            min(Y_pred) + 0.05 * rangex,
            max(Y_valid) - 0.3 * rangey, 'Bias: %5.3f' % bias)
        plt.show()
Example #18
0
File: OPS.py Project: ramongonze/ops
def regressionVector(X, Y, numLV):

    mdl = PLSRegression(n_components=numLV).fit(X, Y)
    coefs = abs(mdl.coef_)
    return coefs[:, 0]
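
A brief, hypothetical usage sketch (not part of OPS.py): the returned vector of absolute PLS regression coefficients can be used to rank variables by their apparent informativeness, which is how OPS-style variable selection typically uses it. Note that the indexing coefs[:, 0] assumes an older scikit-learn in which coef_ has shape (n_features, n_targets).

import numpy as np

X_demo = np.random.rand(40, 15)   # made-up predictor matrix
Y_demo = np.random.rand(40, 1)    # made-up response vector
informative_vector = regressionVector(X_demo, Y_demo, numLV=3)
variable_ranking = np.argsort(informative_vector)[::-1]  # most informative variables first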
Example #19
0
    r2 = []
    nRMSE = []
    maps = []

    for i in tqdm(range(Boots)):
        # select random number
        idx = np.random.choice(N, N, replace=True)
        idx2 = list(set(range(N)) - set(idx))
        # select samples using idx
        x_train = np.array(x.loc[idx, :])
        x_val = np.array(x.loc[idx2, :])
        y_train = np.array(y[idx])
        y_val = np.array(y[idx2])

        # PLSR model
        trainPLSR = PLSRegression(n_components=bestComp)
        trainPLSR.fit(x_train, y_train)

        # predict
        predictt = trainPLSR.predict(x_val)
        predictt = unlist(predictt)

        # predict to map
        mapp = trainPLSR.predict(r_data)
        mapp = unlist(mapp)

        # backtransform maps to 3D array
        mapp = mapp.reshape(img[:, :, 0].shape)

        # get accuracies
        R2 = (np.corrcoef(predictt, y_val)[0, 1])**2
# autoscaling
if do_autoscaling:
    autoscaled_Xtrain = (Xtrain - Xtrain.mean(axis=0)) / Xtrain.std(axis=0,
                                                                    ddof=1)
    autoscaled_ytrain = (ytrain - ytrain.mean()) / ytrain.std(ddof=1)
    autoscaled_Xtest = (Xtest - Xtrain.mean(axis=0)) / Xtrain.std(axis=0,
                                                                  ddof=1)
else:
    autoscaled_Xtrain = Xtrain.copy()
    autoscaled_ytrain = ytrain.copy()
    autoscaled_Xtest = Xtest.copy()
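
# Added note (not in the original snippet): sklearn's StandardScaler performs a very similar
# autoscaling, but it divides by the population standard deviation (ddof=0), whereas the code
# above uses the sample standard deviation (ddof=1), so the two differ slightly for small
# training sets. A commented-out equivalent sketch:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(Xtrain)
# autoscaled_Xtrain_alt = scaler.transform(Xtrain)
# autoscaled_Xtest_alt = scaler.transform(Xtest)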

if regression_method_flag == 1:  # Ordinary Least Squares
    regression_model = LinearRegression()
elif regression_method_flag == 2:  # Partial Least Squares with constant component
    regression_model = PLSRegression(n_components=pls_component_number)
elif regression_method_flag == 3:  # Partial Least Squares
    pls_components = np.arange(
        1,
        min(
            np.linalg.matrix_rank(autoscaled_Xtrain) + 1,
            max_pls_component_number + 1), 1)
    r2all = list()
    r2cvall = list()
    for pls_component in pls_components:
        pls_model_in_cv = PLSRegression(n_components=pls_component)
        pls_model_in_cv.fit(autoscaled_Xtrain, autoscaled_ytrain)
        calculated_y_in_cv = np.ndarray.flatten(
            pls_model_in_cv.predict(autoscaled_Xtrain))
        estimated_y_in_cv = np.ndarray.flatten(
            model_selection.cross_val_predict(pls_model_in_cv,
Example #21
0
def optimise_pls_cv(X, y, n_comp, plot_components=True):
    '''Run PLS including a variable number of components, up to n_comp,
       and calculate MSE '''
    mse = []
    component = np.arange(1, n_comp + 1)
    for i in range(1, n_comp + 1):
        pls = PLSRegression(n_components=i)
        # Cross-validation
        y_cv = cross_val_predict(pls, X, y, cv=10)
        mse.append(mean_squared_error(y, y_cv))
        comp = 100 * (i) / n_comp
        # Trick to update status on the same line
        stdout.write("\r%d%% completed" % comp)
        stdout.flush()
    stdout.write("\n")
    # Calculate and print the position of minimum in MSE
    msemin = np.argmin(mse)
    print("Suggested number of components: ", msemin + 1)
    stdout.write("\n")
    if plot_components is True:
        with plt.style.context(('ggplot')):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[msemin],
                     np.array(mse)[msemin],
                     'P',
                     ms=10,
                     mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(left=-1)
        plt.show()
    # Define PLS object with optimal number of components
    pls_opt = PLSRegression(n_components=msemin + 1)
    # Fit to the entire dataset
    pls_opt.fit(X, y)
    y_c = pls_opt.predict(X)
    # Cross-validation
    y_cv = cross_val_predict(pls_opt, X, y, cv=10)
    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
    # Calculate mean squared error for calibration and cross validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)
    print('R2 calib: %5.3f' % score_c)
    print('R2 CV: %5.3f' % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)
    # Plot regression and figures of merit
    # rangey = max(y) - min(y)
    # rangex = max(y_c) - min(y_c)
    # Fit a line to the CV vs response
    # z = np.polyfit(y, y_c, 1)
    # with plt.style.context(('ggplot')):
    #     fig, ax = plt.subplots(figsize=(9, 5))
    #     ax.scatter(y_c, y, c='red', edgecolors='k')
    #     #Plot the best fit line
    #     ax.plot(np.polyval(z,y), y, c='blue', linewidth=1)
    #     #Plot the ideal 1:1 line
    #     ax.plot(y, y, color='green', linewidth=1)
    #     plt.title('$R^{2}$ (CV): '+str(score_cv))
    #     plt.xlabel('Predicted $^{\circ}$Brix')
    #     plt.ylabel('Measured $^{\circ}$Brix')
    #     plt.show()
    return pls_opt, mse, y_c, y_cv
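
A short, hypothetical call of optimise_pls_cv (not in the original snippet), using random arrays as stand-ins for real spectra and reference values; since the function cross-validates with cv=10, the data needs at least 10 samples.

import numpy as np

X_demo = np.random.rand(60, 200)   # 60 made-up spectra with 200 channels
y_demo = np.random.rand(60)        # made-up reference values
pls_model, mse_per_component, y_calibration, y_crossval = optimise_pls_cv(
    X_demo, y_demo, n_comp=10, plot_components=False)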
Example #22
0
def final_train(x, y, x_test, y_test, out_list, mn, age_group_all):
    model = []
    best_score = []

    if mn == 'LAD':
        print(out_list)
        [C_list,
         score_list] = zip(*[(item[6]['C'], item[5]) for item in out_list])
        C_final = np.median(C_list)
        best_score = np.mean(score_list)
        print('in final LAD')
        print('para', C_list, C_final, 'score', score_list, best_score)
        model = LAD(epsilon=0.0,
                    tol=0.0001,
                    C=C_final,
                    loss='epsilon_insensitive',
                    fit_intercept=True,
                    intercept_scaling=1.0,
                    dual=True,
                    verbose=0,
                    random_state=None,
                    max_iter=10000)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RFR':
        [n_est_list, score_list] = zip(*[(item[6]['n_estimators'], item[5])
                                         for item in out_list])
        n_est = int(np.median(n_est_list))
        best_score = np.mean(score_list)
        print('in final RFR')
        print('n_est_list', n_est_list, n_est, 'score', score_list, best_score)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [n_est]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'PLSR':
        [n_comp_list, score_list] = zip(*[(item[6]['n_components'], item[5])
                                          for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final PLSR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list,
              best_score)
        pls_reg = PLSRegression()
        params = {'n_components': [n_comp]}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RR':
        from sklearn.linear_model import Ridge, RidgeCV
        [n_comp_list,
         score_list] = zip(*[(item[6]['alpha'], item[5]) for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final RR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list,
              best_score)
        ridge = Ridge()
        params = {'alpha': [n_comp]}
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RVM':
        from skrvm import RVR
        print('in final RVM')
        model = RVR(kernel='linear')
        model.fit(x, y)
        best_score = 0
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'COMB':
        print('IN COMB')
        group_lad = dict()
        from mord import LAD
        from sklearn.ensemble import RandomForestRegressor

        print('shapes', x.shape, y.shape)

        lad1 = LAD(epsilon=0.0,
                   tol=0.0001,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1,
                                 param_grid=params,
                                 cv=5,
                                 scoring='neg_mean_absolute_error',
                                 verbose=0)
        broad_lad.fit(x, y)

        for ages in age_group_all:
            # print('ages', ages)
            idx_grp = list()
            for item in ages:  # for every age in the age group collect the training data by getting the indices
                for idx, val in enumerate(y):
                    if val == item:
                        idx_grp.append(idx)

            key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
            x_samples_train = x[idx_grp]
            y_samples_train = y[idx_grp]
            # print('y_samples_train', y_samples_train)

            lad2 = LAD(epsilon=0.0,
                       tol=0.0001,
                       loss='epsilon_insensitive',
                       fit_intercept=True,
                       intercept_scaling=1.0,
                       dual=True,
                       verbose=0,
                       random_state=None,
                       max_iter=10000)
            params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
            specific_lad = GridSearchCV(lad2,
                                        param_grid=params2,
                                        cv=5,
                                        scoring='neg_mean_absolute_error',
                                        verbose=0)
            specific_lad.fit(x_samples_train, y_samples_train)
            group_lad[key_age_grp] = specific_lad

        pred_all = make_predictions(x, broad_lad, group_lad)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(pred_all, y)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))

        best_score = model.best_score_
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(mn, model, pred_all_test, y_test)

    return model, best_score, pred_var
Example #23
0
import os
import copy
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn import metrics
import pickle

os.chdir("D:/11. Programming/ML/01. FabWideSimulation6/")
pls = PLSRegression(n_components=6, scale=False, max_iter=500, copy=True)
lamda_PLS = 0.1
Tgt = np.array([0, 50])
A_p1 = np.array([[0.5, -0.2], [0.25, 0.15]])
d_p1 = np.array([[0.1, 0], [0.05, 0]])
C_p1 = np.transpose(
    np.array([[0, 0.5, 0.05, 0, 0.15, 0], [0.085, 0, 0.025, 0.2, 0, 0]]))

L1 = 0.55 * np.identity(2)
L2 = 0.75 * np.identity(2)
I = np.identity(2)

N = 120
DoE_Queue = []


def sampling_up():
    u1_p1 = np.random.normal(0.4, np.sqrt(0.2))
    u2_p1 = np.random.normal(0.6, np.sqrt(0.2))
    u_p1 = np.array([u1_p1, u2_p1])
    return u_p1
Example #24
0
def train(m, x_train, y_train, x_test, y_test):
    print('training', m)
    model = []
    pred_var = {}

    if m == 'LAD':
        from mord import LAD
        lad = LAD(epsilon=0.0,
                  tol=0.0001,
                  loss='epsilon_insensitive',
                  fit_intercept=True,
                  intercept_scaling=1.0,
                  dual=True,
                  verbose=0,
                  random_state=None,
                  max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(lad,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)

        y_train = y_train.astype(float).round()
        y_train = y_train.astype(int)

        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LAD grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'MCLog':  # this class is not available
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial',
                                 max_iter=10000,
                                 solver='newton-cg',
                                 fit_intercept=True)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(mcl,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] MCLog grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LogAT':  # takes quite some time
        from mord import LogisticAT
        lat = LogisticAT()
        params = {"alpha": np.linspace(0, 1, 5)}
        model = GridSearchCV(lat,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LogAT grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        svm = LinearSVC()
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LinearSVC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {"n_estimators": [10, 100, 500, 1000]}
        model = GridSearchCV(rfc, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'Lasso':
        from sklearn.linear_model import Lasso
        from sklearn.linear_model import LassoCV
        svm = Lasso()
        params = {"alpha": [10]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))
        # model = LassoCV(n_alphas=10, cv=5, verbose=3)
        # model.fit(x_train, y_train)
        # print("[INFO] Lasso path search best parameter: {}".format(model.alpha_))

    elif m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RR':
        from sklearn.linear_model import Ridge, RidgeCV
        ridge = Ridge()
        params = {
            'alpha':
            [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        }
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        print("[INFO] Ridge Regression grid search best parameters: {}".format(
            model.best_params_))
        # model = RidgeCV(alphas=(0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), cv=5)
        # model.fit(x_train, y_train)
        # print("[INFO] Ridge Regression grid search best parameters: {}".format(model.alpha_))
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        pls_reg = PLSRegression()
        params = {
            'n_components': [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20
            ]
        }
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        # pdb.set_trace()
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        print("[INFO] PLS Regression grid search best parameters: {}".format(
            model.best_params_))
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        model = RVR(kernel='linear')
        # avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(model, x_train, y_train, x_test, y_test, loss='mse',
        #                                                             num_rounds=3, random_seed=123)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

        # print('Average expected loss: %.3f' % avg_expected_loss)
        # print('Average bias: %.3f' % avg_bias)
        # print('Average variance: %.3f' % avg_var)

    elif m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor()
        # params = {"criterion": ["mse", "mae"], "min_samples_split": [10, 20, 40], "max_depth": [2],
        #           "min_samples_leaf": [20, 40, 100], "max_leaf_nodes": [5, 20, 100]}
        # params = {"max_depth": [2,4,6]}
        # model = GridSearchCV(dtr, param_grid=params, cv=5, verbose=0)

        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'COMB':
        from sklearn.ensemble import RandomForestRegressor
        from mord import LAD
        from group_pred import create_age_groups
        print('IN COMB')
        group_lad = dict()
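        # COMB is a two-stage scheme: a broad LAD model fit on all ages plus LAD models per age group;
        # their stacked predictions (via make_predictions) are then fed to a RandomForestRegressor.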

        print('shapes', x_train.shape, y_train.shape)

        lad1 = LAD(epsilon=0.0,
                   tol=0.0001,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1,
                                 param_grid=params,
                                 cv=5,
                                 scoring='neg_mean_absolute_error',
                                 verbose=0)

        y_train_r = y_train.astype(float).round()
        y_train_r = y_train_r.astype(int)

        broad_lad.fit(x_train, y_train_r)

        age_group_all = create_age_groups(y_train_r, 10, 5)

        for ages in age_group_all:
            # print('ages', ages)
            idx_grp = list()
            for item in ages:  # for every age in the age group collect the training data by getting the indices
                for idx, val in enumerate(y_train_r):
                    if val == item:
                        idx_grp.append(idx)

            print('group info', ages, len(idx_grp))
            if len(idx_grp) > 5:
                key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
                x_samples_train = x_train[idx_grp]
                y_samples_train = y_train_r[idx_grp]
                # print('y_samples_train', y_samples_train)

                lad2 = LAD(epsilon=0.0,
                           tol=0.0001,
                           loss='epsilon_insensitive',
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           dual=True,
                           verbose=0,
                           random_state=None,
                           max_iter=10000)
                params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
                specific_lad = GridSearchCV(lad2,
                                            param_grid=params2,
                                            cv=5,
                                            scoring='neg_mean_absolute_error',
                                            verbose=0)
                specific_lad.fit(x_samples_train, y_samples_train)
                group_lad[key_age_grp] = specific_lad

        print('len_groups', len(group_lad))
        pred_all = make_predictions(x_train, broad_lad, group_lad)

        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model_2 = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model_2.fit(pred_all, y_train)

        # lad = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive', fit_intercept=True,
        #            intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=10000)
        # params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        # model_2 = GridSearchCV(lad, param_grid=params, cv=5, scoring='neg_mean_absolute_error', verbose=0)
        # model_2.fit(pred_all, y_train_r)

        train_var = predict(m, model_2, pred_all, y_train)
        print("[INFO] RFR grid search best parameters: {}".format(
            model_2.best_params_))

        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(m, model_2, pred_all_test, y_test)
        model = [broad_lad, group_lad, model_2]
    else:
        print('unknown model')

    if m in ('RVM', 'DTR'):  # these models are fit without grid search, so no best_score_/best_params_
        return model, 0, 0, pred_var, train_var
    elif m == 'COMB':
        return model, model_2.best_score_, model_2.best_params_, pred_var, train_var
    else:
        return model, model.best_score_, model.best_params_, pred_var, train_var
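All branches above end in the same 5-tuple return. A minimal, hedged sketch of consuming it is shown below; the enclosing dispatcher's name (here called train_model) and its argument order are not visible in this excerpt and are assumptions.

# Hedged sketch: train_model is a hypothetical name for the dispatcher above.
model, best_score, best_params, pred_var, train_var = train_model(
    'RR', x_train, y_train, x_test, y_test)
print('[INFO] best CV score: {}, best params: {}'.format(best_score, best_params))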
# Autoscaling
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

if method_name == 'pls':
    # Optimize the number of components via cross-validation (CV)
    components = []  # empty list to collect the candidate numbers of components
    r2_in_cv_all = []  # empty list to collect the cross-validated r2 for each number of components
    for component in range(
            1,
            min(np.linalg.matrix_rank(autoscaled_x_train),
                max_number_of_principal_components) + 1):
        # PLS
        model = PLSRegression(n_components=component)  # declare the PLS model
        estimated_y_in_cv = pd.DataFrame(
            cross_val_predict(
                model, autoscaled_x_train, autoscaled_y_train,
                cv=fold_number))  # compute cross-validated predictions and convert them to a DataFrame
        estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean()  # rescale back to the original units
        r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv)  # compute r2
        print(component, r2_in_cv)  # print the number of components and r2
        r2_in_cv_all.append(r2_in_cv)  # store r2
        components.append(component)  # store the number of components

    # Plot the cross-validated r2 against the number of components and take the number that maximizes it
    optimal_component_number = sample_functions.plot_and_selection_of_hyperparameter(
        components, r2_in_cv_all, 'number of components', 'cross-validated r2')
    print('\nOptimal number of components selected by CV :', optimal_component_number)
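Once the optimal number of components has been selected by cross-validation, the model would normally be refit on the full autoscaled training set and applied to the test data. A minimal sketch, not part of the original snippet, using only names defined above:

# Sketch: refit with the CV-selected number of components and rescale predictions.
model = PLSRegression(n_components=optimal_component_number)
model.fit(autoscaled_x_train, autoscaled_y_train)
estimated_y_test = model.predict(autoscaled_x_test) * y_train.std() + y_train.mean()  # shape (n_samples, 1)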
Example #26
0
# fill empty values in the dataset
X = X.fillna(method='ffill')
# print("..........................")
# print(X.isnull().any())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

plsr = PLSRegression(n_components=2,
                     scale=True,
                     max_iter=500,
                     tol=1e-06,
                     copy=True)

# plsr = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# plsr
# model = plsr.fit(X_train, y_train)
# forest
model = plsr.fit(X_train, y_train.ravel())

pred = model.predict(X_test)

# accuracy
# print(model.score(X_test, y_test))
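The commented-out score call hints at how the fit would be evaluated. A short, hedged sketch using scikit-learn metrics (not in the original snippet):

# Sketch: explicit regression metrics on the held-out split.
from sklearn.metrics import r2_score, mean_absolute_error
print('R2 :', r2_score(y_test, pred))
print('MAE:', mean_absolute_error(y_test, pred))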
Example #27
0
def pls_cal(dbfile,maskfile,outpath,which_elem,nc,normtype=1,mincomp=0,maxcomp=100,keepfile=None,removefile=None,cal_dir=None,masterlist_file=None,compfile=None,name_sub_file=None,testsetfile=None,nfolds=5,seed=None,skscale=False,max_samples=0.1,n_elems=9):
    plstype_string='sklearn'
    plstype='sklearn'
    if skscale==True:
        plstype_string=plstype+'_scale'
    print('Reading database')
    sys.stdout.flush()
    spectra,comps,spect_index,names,labels,wvl=ccam.read_db(dbfile,compcheck=True,n_elems=n_elems)
    oxides=labels[2:]
    compindex=numpy.where(oxides==which_elem)[0]
    
    print('Choosing spectra')
    
    which_removed=outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_removed.csv'
    spectra,names,spect_index,comps=ccam.choose_spectra(spectra,spect_index,names,comps,compindex,mincomp=mincomp,maxcomp=maxcomp,keepfile=keepfile,removefile=removefile,which_removed=which_removed)
        
    
    print('Masking spectra')
    spectra,wvl=ccam.mask(spectra,wvl,maskfile)
    
    print('Normalizing spectra')
    spectra=ccam.normalize(spectra,wvl,normtype=normtype)
    
    print('Removing Test Set')
    if testsetfile!=None:
         data=pandas.read_csv(testsetfile,header=None)
         
         testnames=numpy.array(data.iloc[:,0])
         testind=numpy.in1d(names,testnames)
         trainind=numpy.in1d(names,testnames,invert=True)
         names_test=names[testind]
         spectra_test=spectra[testind]
         spect_index_test=spect_index[testind]
         comps_test=comps[testind,compindex]
         
         plot.subplot(2,3,1)  # panel 1 for the test set; the fold histograms use panels 2..nfolds+1
         plot.hist(comps_test,bins=20,range=[min(comps[:,compindex]),max(comps[:,compindex])])
         plot.xlabel(which_elem+' wt.%',fontsize=20)
         plot.ylabel('# of samples',fontsize=20)
         plot.title('Test Set',fontsize=23)
         
         names_train=names[trainind]
         traintest=numpy.zeros_like(names)
         traintest[trainind]='Train'
         traintest[testind]='Test'
         spectra_train=spectra[trainind]
         spect_index_train=spect_index[trainind]
         names_train=names[trainind]
         comps_train=comps[trainind,compindex]
         
         
    print('Assigning Folds')
    
        #if a fold file is specified, use it
    #    folds=ccam.folds(foldfile,names)
    #else:
        #otherwise, define random folds
    #    folds=ccam.random_folds(names,nfolds,seed=seed)

    names_unique,uniqueindex=numpy.unique(names_train,return_index=True)
    comps_unique_train=comps_train[uniqueindex]


    names_unique_sorted=names_unique[comps_unique_train.argsort()]
    folds=list(range(1,nfolds+1))
    while len(folds)<len(names_unique_sorted):
        folds.extend(range(1,nfolds+1))
    folds_train=numpy.zeros(len(names_train))
    for i in range(len(names_unique_sorted)):
        print(names_unique_sorted[i])
        print(folds[i])
        folds_train[numpy.in1d(names_train,names_unique_sorted[i])]=int(folds[i])

    names_nofold=names[(folds_train==0)]
    spect_index_nofold=spect_index[(folds_train==0)]
    #write a file containing the samples not assigned to folds
    with open(which_removed,'a',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        for i in range(len(names_nofold)):
            writer.writerow([names_nofold[i],spect_index_nofold[i],'No Fold'])
    
    
    #remove spectra that are not assigned to any fold
    spectra_train=spectra_train[(folds_train!=0),:]
    spect_index_train=spect_index_train[(folds_train!=0)]
    names_train=names_train[(folds_train!=0)]
    comps_train=comps_train[(folds_train!=0)]
    folds_train=folds_train[(folds_train!=0)]
    
    
    

    
    print('Do Leave One Label Out (LOLO) cross validation with all folds but the test set')
    #define array to hold cross validation predictions and RMSEs
    train_predict_cv=numpy.zeros((len(names_train),nc))
    RMSECV=numpy.zeros(nc)
    
    for i in numpy.array(range(nfolds))+1:
        
        plot.subplot(2,3,i+1)  # panels 2..nfolds+1; panel 1 holds the test-set histogram
        plot.hist(comps_train[(folds_train==i)],bins=20,range=[min(comps[:,compindex]),max(comps[:,compindex])])
        plot.xlabel(which_elem+' wt.%')
        plot.ylabel('# of samples')
        plot.title('Fold '+str(i))
        print('Holding out fold #'+str(i))
        
        if skscale==False:
        #mean center those spectra left in
            #X_cv_in1,X_cv_in_mean1=meancenter.ccam_meancenter(spectra_train[(folds_train!=i),:])
            X_cv_in,X_cv_in_mean=ccam.meancenter(spectra_train[(folds_train!=i),:])
            
            #and those left out
            X_cv_out=ccam.meancenter(spectra_train[(folds_train==i),:],X_mean=X_cv_in_mean)[0]   
             
            #mean center compositions left in
            Y_cv_in,Y_cv_in_mean=ccam.meancenter(comps_train[(folds_train!=i)])
        if skscale==True:
            X_cv_in=spectra_train[(folds_train!=i),:]
            X_cv_out=spectra_train[(folds_train==i),:]
            Y_cv_in=comps_train[(folds_train!=i)]
            Y_cv_in_mean=0
       
        #step through each number of components
        for j in range(1,nc+1):
            print('Training Model for '+str(j)+' components')
            #train the model
            PLS1model=PLSRegression(n_components=j,scale=skscale)
            PLS1model.fit(X_cv_in,Y_cv_in)
            train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1model.predict(X_cv_out)+Y_cv_in_mean)
               
    plot.tight_layout()
    fig=plot.gcf()
    fig.savefig(outpath+which_elem+'_'+str(mincomp)+'-'+str(maxcomp)+'_fold_hist.png',dpi=600)
    fig.clf()  
    
    #calculate RMSECV
    for i in range(0,nc):
        sqerr=(train_predict_cv[:,i]-comps_train)**2.0
        RMSECV[i]=numpy.sqrt(numpy.mean(sqerr))
    
    #mean center full model
    if skscale==False:
        X,X_mean=ccam.meancenter(spectra_train)
        X_test=ccam.meancenter(spectra_test,X_mean=X_mean)[0]
        X_all=ccam.meancenter(spectra,X_mean=X_mean)[0]
        
        Y,Y_mean=ccam.meancenter(comps_train)
    if skscale==True:
        X=spectra_train
        X_test=spectra_test
        X_all=spectra
        Y=comps_train
        Y_mean=0
    
    #create arrays for results and RMSEs
    trainset_results=numpy.zeros((len(names_train),nc))
    testset_results=numpy.zeros((len(names_test),nc))
    results=numpy.zeros((len(names),nc))    
    
    RMSEP=numpy.zeros(nc)
    RMSEC=numpy.zeros(nc)
    beta=numpy.zeros((len(X[0,:]),nc))
    Q_res=numpy.zeros((len(X[:,0]),nc))
    T2=numpy.zeros((len(X[:,0]),nc))

    [a,evals,b]=numpy.linalg.svd(numpy.cov(numpy.dot(X,X.transpose())))
    evals=numpy.diag(evals**2)
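    #eigen-decomposition of the spectral covariance; a, evals and b are computed here but not referenced again below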
    #set up variables for cal target calculation    
    if cal_dir!=None:
        print('Reading cal target data')
        cal_data,cal_wvl,cal_filelist=ccam.read_ccs(cal_dir)
        cal_data,cal_wvl=ccam.mask(cal_data,cal_wvl,maskfile)
        cal_data=ccam.normalize(cal_data,cal_wvl,normtype=normtype)
        if skscale==True:
            cal_data_centered=cal_data
        if skscale==False:
            cal_data_centered=ccam.meancenter(cal_data,X_mean=X_mean)[0]

            
        RMSEP_cal=numpy.zeros(nc)
        RMSEP_cal_good=numpy.zeros(nc)        
        RMSEP_KGAMEDS=numpy.zeros(nc)
        RMSEP_MACUSANITE=numpy.zeros(nc)
        RMSEP_NAU2HIS=numpy.zeros(nc)
        RMSEP_NAU2LOS=numpy.zeros(nc)
        RMSEP_NAU2MEDS=numpy.zeros(nc)
        RMSEP_NORITE=numpy.zeros(nc)
        RMSEP_PICRITE=numpy.zeros(nc)
        RMSEP_SHERGOTTITE=numpy.zeros(nc)
        
        targets,dists,amps,nshots=ccam.target_lookup(cal_filelist,masterlist_file,name_sub_file)
        target_comps=ccam.target_comp_lookup(targets,compfile,which_elem)
        cal_results=numpy.zeros((len(targets),nc))
       
    model_list=[]
    #Now step through each # of components with the full model
    for j in range(1,nc+1):
        print('Training full model for '+str(j)+' components')
        
        
        PLS1model=PLSRegression(n_components=j,scale=skscale)

        
        PLS1model.fit(X,Y)
        T=PLS1model.x_scores_
        #There's probably a more efficient way to calculate T2...
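        #T2[k] = t_k (T^T T)^-1 t_k^T : Hotelling's T-squared leverage of sample k in the PLS score space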
        for k in range(len(X[:,0])):
            T2[k,j-1]=numpy.dot(T[k,:],numpy.dot(numpy.linalg.inv(numpy.dot(T.transpose(),T)),T[k,:]))
        
        E=X-numpy.dot(PLS1model.x_scores_,PLS1model.x_loadings_.transpose())
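        #Q residuals: per-sample squared reconstruction error left outside the j-component PLS subspace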
        Q_res[:,j-1]=numpy.dot(E,E.transpose()).diagonal()
        
        trainset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X)+Y_mean)
        testset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X_test)+Y_mean)
        results[:,j-1]=numpy.squeeze(PLS1model.predict(X_all)+Y_mean)
        beta[:,j-1]=numpy.squeeze(PLS1model.coef_)  # PLS regression coefficients (attribute is coef_ in scikit-learn)
        model_list.append([PLS1model])

                
        if cal_dir != None:
            cal_results[:,j-1]=numpy.squeeze(PLS1model.predict(cal_data_centered)+Y_mean)
            RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)


        RMSEC[j-1]=numpy.sqrt(numpy.mean((trainset_results[:,j-1]-comps_train)**2.0))
        RMSEP[j-1]=numpy.sqrt(numpy.mean((testset_results[:,j-1]-comps_test)**2.0))
        
   
    #pickle the PLS model    
    with open(outpath+which_elem+'_'+plstype_string+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'.pkl','wb') as picklefile:
            pickle.dump(model_list,picklefile)

   
    if cal_dir!=None:
        n_good_cal=numpy.sum(numpy.array([RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE])[:,0]!=0)
        print(n_good_cal)
        RMSEP_cal=(RMSEP_KGAMEDS+RMSEP_MACUSANITE+RMSEP_NAU2HIS+RMSEP_NAU2LOS+RMSEP_NAU2MEDS+RMSEP_NORITE+RMSEP_PICRITE+RMSEP_SHERGOTTITE)/n_good_cal
        RMSEP_single_cals=[RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE,RMSEP_cal]            
                       
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_caltargets_predict.csv','w',newline='') as writefile:
            writer=csv.writer(writefile,delimiter=',')
            row=['File','Target','Laser Energy','True_Comp']
            row.extend(range(1,nc+1))
            writer.writerow(row)
            for i in range(0,len(targets)):
                row=[cal_filelist[i],targets[i],amps[i],target_comps[i]]
                row.extend(cal_results[i,:])
                writer.writerow(row)
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP_caltargets.csv','w',newline='') as writefile:
            writer=csv.writer(writefile,delimiter=',')
            writer.writerow(['NC','RMSEP Cal Targets (wt.%)'])            
            for i in range(0,nc):
                writer.writerow([i+1,RMSEP_cal[i]])
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal.png',RMSEP_cals=RMSEP_single_cals)
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal_good.png',RMSEP_good=RMSEP_cal_good)
        
    # plot RMSEs
    ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot.png')
    
    
   
   #Write output info to files

    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_Q_res.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=["Sample","Spectrum","Fold","True Comp"]
        row.extend(range(1,nc+1))
        writer.writerow(row)        
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(Q_res[i,:])
            writer.writerow(row)
    with open(outpath+which_elem+'_'+str(mincomp)+'-'+str(maxcomp)+'_quartiles.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=[which_elem]
        writer.writerow(row)
        row=['Min',numpy.percentile(comps[:,compindex],0)]
        writer.writerow(row)
        row=['1st Quartile',numpy.percentile(comps[:,compindex],25)]
        writer.writerow(row)
        row=['Median',numpy.percentile(comps[:,compindex],50)]
        writer.writerow(row)
        row=['3rd Quartile',numpy.percentile(comps[:,compindex],75)]
        writer.writerow(row)
        row=['Max',numpy.percentile(comps[:,compindex],100)]
        writer.writerow(row)

    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_HotellingT2.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=["Sample","Spectrum","Fold","True Comp"]
        row.extend(range(1,nc+1))
        writer.writerow(row)        
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(T2[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSECV.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSECV (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSECV[i]])
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEC.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSEC (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSEC[i]])
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        writer.writerow(['NC','RMSEP (wt.%)'])            
        for i in range(0,nc):
            writer.writerow([i+1,RMSEP[i]])
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_cv_predict.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(train_predict_cv[i,:])
            writer.writerow(row)
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_train_predict.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Fold','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_train)):
            row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]]
            row.extend(trainset_results[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_test_predict.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names_test)):
            row=[names_test[i],spect_index_test[i],comps_test[i]]
            row.extend(testset_results[i,:])
            writer.writerow(row)
    
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_all_predict.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['Sample','Spectrum','Set','True_Comp']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(names)):
            row=[names[i],spect_index[i],traintest[i],comps[i,compindex]]
            row.extend(results[i,:])
            writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_beta_coeffs.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        row=['wvl']
        row.extend(range(1,nc+1))
        writer.writerow(row)
        for i in range(0,len(wvl)):
            row=[wvl[i]]
            row.extend(beta[i,:])
            writer.writerow(row)        
    
    if skscale==False:
        with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_meancenters.csv','w',newline='') as writefile:
            writer=csv.writer(writefile,delimiter=',')        
            writer.writerow([which_elem+' mean',Y_mean])
            for i in range(0,len(wvl)):
                row=[wvl[i],X_mean[i]]
                writer.writerow(row)
            
    with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_inputinfo.csv','w',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')        
        writer.writerow(['Spectral database =',dbfile])
        writer.writerow(['Spectra Kept =',keepfile])
        writer.writerow(['Spectra Removed =',which_removed])
        writer.writerow(['Test Set File =',testsetfile])
        writer.writerow(['Mask File =',maskfile])
        writer.writerow(['Algorithm =',plstype_string])
        writer.writerow(['# of components =',nc])
        writer.writerow(['Normalization Type =',normtype])
        writer.writerow(['Composition Min. =',mincomp])
        writer.writerow(['Composition Max. =',maxcomp])
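The pickled model_list stores one single-element list per component count, so it is indexed by nc-1 when reused. A hedged sketch of reloading it for new spectra follows; the pickle path, the new spectra, and the availability of X_mean / Y_mean (written to the *_meancenters.csv file above) are all assumptions, and it mirrors the skscale==False workflow.

# Sketch only: reload the pickled PLS models and predict already masked/normalized spectra.
import pickle
with open(pls_pickle_path, 'rb') as f:   # pls_pickle_path: hypothetical path to the .pkl written above
    model_list = pickle.load(f)
chosen_nc = 5                            # hypothetical choice, e.g. read off the RMSECV curve
pls = model_list[chosen_nc - 1][0]
new_centered = ccam.meancenter(new_spectra, X_mean=X_mean)[0]
predictions = numpy.squeeze(pls.predict(new_centered) + Y_mean)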
Example #28
0
# regr = LinearRegression()
# regr.fit(X_reduced,y)
# ypc=regr.predict(X_reduced)
# print('The R2 score is: ', r2_score(ypc,y))

result = np.genfromtxt("final_pca_reduced_data_with_label.txt",
                       dtype=None,
                       delimiter=',',
                       skip_header=1)
result = result.astype('float')
print(result.shape)

X_reduced = result[:, 0:2]
y = result[:, 2]

pls = PLSRegression()
pls.fit(X_reduced, y)

# y_test_pred=pls.predict(X_test)
# print('The R2 score is: ', r2_score(y_test_pred,Y_test))

print('Starting gathering testing samples')

testfilescount = 6
testfilesnames = [
    '2013-06', '2014-07', '2015-08', '2013-01', '2014-12', '2015-02'
]
final_mae = []
final_mse = []
final_r2 = []
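The lists final_mae, final_mse, and final_r2 are set up for per-file metrics, but the evaluation loop itself lies outside this excerpt. One clearly hypothetical sketch of such a loop (the file naming pattern and column layout are assumptions carried over from the training file):

# Hypothetical sketch only; the original loop is not shown here.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
for name in testfilesnames:
    data = np.genfromtxt(name + '.txt', dtype=None, delimiter=',', skip_header=1).astype('float')
    X_t, y_t = data[:, 0:2], data[:, 2]
    y_hat = pls.predict(X_t)
    final_mae.append(mean_absolute_error(y_t, y_hat))
    final_mse.append(mean_squared_error(y_t, y_hat))
    final_r2.append(r2_score(y_t, y_hat))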
Example #29
0
    def __init__(self, A, d, C, seed):
        self.pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True)
        np.random.seed(seed)
        self.A = A
        self.d = d
        self.C = C
Example #30
0
    def bgap_pred_all_features(self):
        df = pd.read_csv(path + '/ML/data/dft_data_with_features.csv',
                         sep='\t')
        df = df.drop([
            'StructuredFormula', 'A1', 'A1_frac', 'A2', 'A2_frac', 'B1',
            'B1_frac', 'B2', 'B2_frac', 'O', 'O_frac', 'atom_numO',
            'mend_numO', 'atomic_rO', 'O_X', 'M_O', 'V_O', 'therm_con_O',
            'polarizability_O', 'lattice_const_O', 'Row_O', 'Group_O', 'nO',
            'rO'
        ],
                     axis=1)
        df_x = df.drop(['Ehull', 'Bandgap'], axis=1)
        df_y = df[['Bandgap']]
        algo_dict_mse = {
            'DT': [],
            'SVR': [],
            'PLS': [],
            'EN': [],
            'KNN': [],
            'RAND': [],
            'GBR': []
        }
        algo_dict_mae = {
            'DT': [],
            'SVR': [],
            'PLS': [],
            'KNN': [],
            'RAND': [],
            'GBR': []
        }
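        # Note: 'EN' exists only in algo_dict_mse and no ElasticNet pipeline is added below, so that entry stays empty.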
        for i in range(20):
            X_train, X_test, y_train, y_test = train_test_split(
                df_x, df_y.values.ravel(), test_size=0.2, random_state=i)
            pipelines = []
            pipelines.append(('DT',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('DT', DecisionTreeRegressor())])))
            pipelines.append(
                ('SVR', Pipeline([('Scaler', StandardScaler()),
                                  ('SVR', SVR())])))  #
            pipelines.append(('PLS',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('PLS', PLSRegression())])))
            pipelines.append(('KNN',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('KNN', KNeighborsRegressor())])))
            pipelines.append(('RAND',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('RAND', RandomForestRegressor())])))
            pipelines.append(
                ('GBR',
                 Pipeline([('Scaler', StandardScaler()),
                           ('GBR', GradientBoostingRegressor())])))

            results = []
            names = []
            for name, model in pipelines:
                # cv = KFold(n_splits=10, random_state=10)
                cv = LeaveOneOut()
                cv_results_mse = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_squared_error')
                cv_results_mae = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_absolute_error')
                msg_mse = "%s: MSE %f (%f)" % (name, cv_results_mse.mean(),
                                               cv_results_mse.std())
                msg_mae = "%s: MAE %f (%f)" % (name, cv_results_mae.mean(),
                                               cv_results_mae.std())
                print(msg_mse)
                print(msg_mae)
                algo_dict_mse[name].append(np.sqrt(-1 * cv_results_mse.mean()))
                algo_dict_mae[name].append(-1 * cv_results_mae.mean())
            print('\n')
        print('DT LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['DT']).mean(),
               np.array(algo_dict_mae['DT']).mean(),
               np.array(algo_dict_mae['DT']).std()))
        print('SVR LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['SVR']).mean(),
               np.array(algo_dict_mae['SVR']).mean(),
               np.array(algo_dict_mae['SVR']).std()))
        print('PLS LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['PLS']).mean(),
               np.array(algo_dict_mae['PLS']).mean(),
               np.array(algo_dict_mae['PLS']).std()))
        print('KNN LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['KNN']).mean(),
               np.array(algo_dict_mae['KNN']).mean(),
               np.array(algo_dict_mae['KNN']).std()))
        print('RAND LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['RAND']).mean(),
               np.array(algo_dict_mae['RAND']).mean(),
               np.array(algo_dict_mae['RAND']).std()))
        print('GBR LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
              (np.array(algo_dict_mse['GBR']).mean(),
               np.array(algo_dict_mae['GBR']).mean(),
               np.array(algo_dict_mae['GBR']).std()))
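For a side-by-side comparison, the per-algorithm lists could be collapsed into a small summary table. A hedged sketch, not part of the original method:

# Sketch: mean RMSE/MAE per algorithm across the 20 random splits (empty entries skipped).
summary = pd.DataFrame({
    'RMSE': {k: np.mean(v) for k, v in algo_dict_mse.items() if v},
    'MAE': {k: np.mean(v) for k, v in algo_dict_mae.items() if v},
})
print(summary)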