示例#1
0
def regtree(par, *data):
    """Fit an SSRS regression tree for one parameter vector and return its RMSE.

    Args:
        par: Indexable with at least five entries, read positionally as
            ``max_features``, ``min_samples_split``, ``min_samples_leaf``,
            ``min_weight_fraction_leaf`` and ``max_leaf_nodes`` (the last
            one is cast to ``int``).
        *data: Exactly four items, unpacked as
            ``X_train, X_test, Y_train, Y_test``.

    Returns:
        The first value returned by ``SSRS.RMSEcal`` for the test-set
        predictions. The same value is printed before returning.
    """
    X_train, X_test, Y_train, Y_test = data

    tree_options = dict(
        max_features=par[0],
        min_samples_split=par[1],
        min_samples_leaf=par[2],
        min_weight_fraction_leaf=par[3],
        max_leaf_nodes=int(par[4]),
    )
    tree_model = tree.DecisionTreeRegressor(**tree_options)
    fit_model = linear_model.LinearRegression()

    # Only the first returned value (the test-set predictions) is used here;
    # the remaining four are discarded.  ``Field`` is read from module scope.
    Yp, _, _, _, _ = SSRS.RegressionTree(
        X_train, X_test, Y_train, Y_test, tree_model, fit_model, Field,
        doFitSelection=0, doMultiBand=1)

    rmse, _ = SSRS.RMSEcal(Yp, Y_test)
    print(rmse)
    return rmse
示例#2
0
# Response matrix.
Y = UCData
# Predictor columns: 1..50 plus every third column from 62 up to 77.
# Both ranges are turned into lists before concatenation because ``range``
# objects cannot be added with ``+`` on Python 3 (on Python 2 this is a no-op).
attrind = np.array(list(range(1, 51)) + list(range(62, 78, 3)))
# Keep the field names aligned with the selected columns.
Field = [Field[i] for i in attrind.tolist()]
X = AttrData[:, attrind]
# Replace NaN attribute values with 0 before scaling.
X[np.isnan(X)] = 0
scaler = preprocessing.StandardScaler().fit(X)
Xn = scaler.fit_transform(X)

### cluster
# Interactive-style model selection: every assignment replaces the previous
# one, so only the MeanShift instance reaches SSRS.Cluster below.
model = KMeans(init='k-means++', n_clusters=6, n_init=10, max_iter=1000)
model = AffinityPropagation(preference=-150, verbose=True)
#model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True)
# The bandwidth is estimated from X (not the standardised Xn).
model = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100),
                  bin_seeding=True)

# Clustering is run on X (not the standardised Xn).
label = SSRS.Cluster(X, model)

### classification
# Same pattern as above: only the last classifier (SGDClassifier) is used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()

# NOTE(review): XXn is not defined in this fragment -- confirm whether Xn
# was intended.
Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model)
SSRS.plotErrorMap(label, Tp)

### regression
# Only the last assignment (DecisionTreeRegressor) takes effect.
regModel = linear_model.LinearRegression()
#regModel=svm.SVC()
regModel = KNeighborsRegressor(n_neighbors=10)
regModel = tree.DecisionTreeRegressor()
示例#3
0
# Standardise X; fit_transform re-fits the existing scaler on X.
Xn=scaler.fit_transform(X)


### cluster
# Interactive-style model selection: every assignment replaces the previous
# one, so only the MeanShift instance reaches SSRS.Cluster below.
model = KMeans(init='k-means++', n_clusters=6, n_init=10, max_iter=1000)
model = AffinityPropagation(preference=-150,verbose=True)
#model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True)
# The bandwidth is estimated from X (not the standardised Xn).
model = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100), bin_seeding=True)

# Clustering is run on X (not the standardised Xn).
label=SSRS.Cluster(X, model)

### classification
# Same pattern as above: only the last classifier (SGDClassifier) is used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()

# NOTE(review): XXn is not defined in this fragment -- confirm whether Xn
# was intended.
Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model)
SSRS.plotErrorMap(label, Tp)


### regression
# Only the last assignment (GaussianNB) takes effect.
# NOTE(review): GaussianNB is a classifier in scikit-learn -- confirm it is
# really meant to be passed as the regression model.
regModel=linear_model.LinearRegression()
#regModel=svm.SVC()
regModel=KNeighborsRegressor(n_neighbors=10)
regModel = tree.DecisionTreeRegressor()
regModel = GaussianNB()

# 0.2 is passed positionally; its meaning is defined by SSRS.RegressionLearn
# (presumably the test fraction -- TODO confirm).
rmse_band,Yp,Ytest=SSRS.RegressionLearn(X,XXn,0.2,regModel)

示例#4
0
## plot tree
regModel.fit(X2_train, Y2_train)
# Output directory: the second assignment (Windows drive) overrides the
# first (macOS mount); keep whichever matches the current machine last.
savedir=r"/Volumes/wrgroup/Kuai/USGSCorr/figure_tree/"
savedir=r"Y:\Kuai\USGSCorr\figure_tree\\"

# Export the fitted tree as Graphviz source and render it to PNG.
# The render command is given the same path the .dot file was written to;
# a bare "tree.dot" would only resolve if the working directory happened to
# be savedir.  The PNG is placed next to the .dot file.
dotfile = savedir + "tree.dot"
pngfile = savedir + "tree.png"
# NOTE(review): feature names here are indexed by attrsel, while the node
# plots below use attrind -- confirm both select the same columns.
with open(dotfile, 'w') as f:
    tree.export_graphviz(regModel, out_file=f,
                         feature_names=[Field[i] for i in attrsel],
                         label='none', node_ids=True)
os.system('dot -Tpng "%s" -o "%s"' % (dotfile, pngfile))

# One box plot per tree node for the training samples, titled with the
# string SSRS.traverseTree returns for that node and saved into savedir.
regTree=regModel.tree_
feature_names=[Field[i] for i in attrind]
Xin=X2_train
Yin=Y2_train
string,nodeind,leaf,label=SSRS.traverseTree(regTree,feature_names,Xin)
for i in range(0,regTree.node_count):
    plt.figure()
    plt.boxplot(Yin[nodeind[i],:])
    plt.title(string[i],fontsize=8)
    #plt.tight_layout()
    plt.savefig(savedir+"Train_node%i"%i)
    plt.close()

# Same per-node box plots for the test samples.
# NOTE(review): as written in this fragment the figures are neither saved
# nor closed -- the loop may be cut off here.
Xin=X2_test
Yin=Y2_test
string,nodeind,leaf,label=SSRS.traverseTree(regTree,feature_names,Xin)
for i in range(0,regTree.node_count):
    plt.figure()
    plt.boxplot(Yin[nodeind[i],:])
    plt.title(string[i],fontsize=8)
示例#5
0
def _fit_regression_tree(X_train, X_test, Y_train, Y_test, predSel, **tree_kwargs):
    """Fit one SSRS regression tree on the predictor columns ``predSel``.

    ``tree_kwargs`` are forwarded to ``tree.DecisionTreeRegressor``.
    Returns ``(Yp, Yptrain)``, the first two values returned by
    ``SSRS.RegressionTree``.
    """
    regTreeModel = tree.DecisionTreeRegressor(**tree_kwargs)
    fitModel = linear_model.LinearRegression()
    predName = [Field[jj] for jj in predSel]
    Yp, Yptrain, _, _, _ = SSRS.RegressionTree(
        X_train[:, predSel], X_test[:, predSel], Y_train, Y_test,
        regTreeModel, fitModel, predName,
        doFitSelection=0, doMultiBand=1)
    return Yp, Yptrain


def _train_test_rmse(X_train, X_test, Y_train, Y_test, predSel):
    """Return ``(train_rmse, test_rmse)`` for a tree with ``max_leaf_nodes=20``."""
    Yp, Yptrain = _fit_regression_tree(
        X_train, X_test, Y_train, Y_test, predSel, max_leaf_nodes=20)
    train_rmse, _ = SSRS.RMSECal(Yptrain, Y_train)
    test_rmse, _ = SSRS.RMSECal(Yp, Y_test)
    return train_rmse, test_rmse


def testModel(predListTest):
    """Evaluate each predictor subset under four train/test schemes.

    Reads ``Xn``, ``dist``, ``nind``, ``indvalid`` and ``Field`` from module
    scope and loads the HUC table from a hard-coded ``.mat`` path.

    Args:
        predListTest: Sequence of predictor selections; each entry is used
            both to index columns of ``Xn`` and to look up names in ``Field``.

    Returns:
        Tuple ``(testErr, trainErr1_huc2, testErr1_huc2, trainErr1_huc2_rt,
        testErr1_huc2_rt, trainErr2_huc2, testErr2_huc2, hucTab)``:

        * ``testErr``: shape ``(nmodel, 5, 50)`` -- test RMSE per model,
          test-size fraction and random split.
        * ``*_huc2_rt``: shape ``(nmodel, 18)`` -- random training split,
          tested on one HUC2 code.
        * ``*Err1_huc2``: shape ``(nmodel, 18)`` -- leave one HUC2 code out.
        * ``*Err2_huc2``: shape ``(nmodel, 18 * 17)`` -- leave an ordered
          pair of HUC2 codes out; ``hucTab`` holds the zero-based ``(i, j)``
          pair for each column.
    """
    nmodel = len(predListTest)
    nit = 50
    nhuc = 18
    ind = range(0, nind)

    # Load the HUC table once, before the expensive loops, so a missing file
    # fails immediately instead of after the first test has finished.
    mat = sio.loadmat(r"E:\work\SSRS\data\IDhuc_mb_4949.mat")
    huc2 = mat["IDhuc"][indvalid, 1]

    # test 1: different size of training and test
    testsize = [0.2, 0.4, 0.5, 0.6, 0.7]
    # Dimensions follow len(testsize) and nit so they cannot drift apart.
    testErr = np.ones([nmodel, len(testsize), nit])
    for k in range(nit):
        print(k)
        # With an integer random_state the splits depend only on
        # (fraction, k), so they are computed once per k rather than once
        # per model.
        splits = [
            cross_validation.train_test_split(
                Xn, dist, ind, test_size=frac, random_state=k)
            for frac in testsize
        ]
        for j in range(nmodel):
            for i, split in enumerate(splits):
                X_train, X_test, Y_train, Y_test = split[:4]
                Yp, _ = _fit_regression_tree(
                    X_train, X_test, Y_train, Y_test, predListTest[j],
                    max_leaf_nodes=20, min_samples_leaf=20)
                rmse, _ = SSRS.RMSECal(Yp, Y_test)
                testErr[j, i, k] = rmse

    # test 2: use 1 HUC2 as test
    # NOTE(review): the result arrays have no axis for k, so every pass
    # overwrites the previous one and only the last random split is kept.
    # Confirm whether an average over k was intended.
    testErr1_huc2_rt = np.ones([nmodel, nhuc])
    trainErr1_huc2_rt = np.ones([nmodel, nhuc])
    for k in range(nit):
        print(k)
        # Only the training part of the random split is used; the test part
        # is replaced below by all samples of one HUC2 code.
        split = cross_validation.train_test_split(
            Xn, dist, ind, test_size=0.2, random_state=k)
        X_train, Y_train = split[0], split[2]
        for i in range(nhuc):
            ind_test = np.where(huc2 == i + 1)[0]
            X_test = Xn[ind_test, :]
            Y_test = dist[ind_test, :]
            for j in range(nmodel):
                trainErr1_huc2_rt[j, i], testErr1_huc2_rt[j, i] = \
                    _train_test_rmse(X_train, X_test, Y_train, Y_test,
                                     predListTest[j])

    # test 3: leave out 1 HUC2 one time
    testErr1_huc2 = np.ones([nmodel, nhuc])
    trainErr1_huc2 = np.ones([nmodel, nhuc])
    for i in range(nhuc):
        ind_test = np.where(huc2 == i + 1)[0]
        ind_train = np.where(huc2 != i + 1)[0]
        X_train = Xn[ind_train, :]
        X_test = Xn[ind_test, :]
        Y_train = dist[ind_train, :]
        Y_test = dist[ind_test, :]
        for j in range(nmodel):
            trainErr1_huc2[j, i], testErr1_huc2[j, i] = \
                _train_test_rmse(X_train, X_test, Y_train, Y_test,
                                 predListTest[j])

    # test 4: leave out 2 HUC2 one time
    # Ordered pairs are enumerated, so (i, j) and (j, i) both get a column.
    npair = nhuc * (nhuc - 1)
    testErr2_huc2 = np.ones([nmodel, npair])
    trainErr2_huc2 = np.ones([nmodel, npair])
    hucTab = np.ones([npair, 2])
    n = -1
    for i in range(nhuc):
        print(i)
        for j in range(nhuc):
            if i == j:
                continue
            n = n + 1
            hucTab[n, 0] = i
            hucTab[n, 1] = j
            ind_test = np.where((huc2 == i + 1) | (huc2 == j + 1))[0]
            ind_train = np.where((huc2 != i + 1) & (huc2 != j + 1))[0]
            X_train = Xn[ind_train, :]
            X_test = Xn[ind_test, :]
            Y_train = dist[ind_train, :]
            Y_test = dist[ind_test, :]
            for k in range(nmodel):
                trainErr2_huc2[k, n], testErr2_huc2[k, n] = \
                    _train_test_rmse(X_train, X_test, Y_train, Y_test,
                                     predListTest[k])
    return testErr,trainErr1_huc2,testErr1_huc2,\
           trainErr1_huc2_rt,testErr1_huc2_rt,\
           trainErr2_huc2,testErr2_huc2,hucTab
示例#6
0
# Drop the rows listed in indnan from X, Y and indvalid so they stay aligned.
X = np.delete(X, indnan, 0)
Y = np.delete(Y, indnan, 0)
indvalid = np.delete(indvalid, indnan, 0)

# Standardise X; fit_transform re-fits the scaler that was just fitted.
scaler = preprocessing.StandardScaler().fit(X)
Xn = scaler.fit_transform(X)
# nind is assigned twice (rows of Y, then rows of X); the second wins.
[nind, nband] = Y.shape
[nind, nattr] = X.shape

# Try KMeans with 2..9 clusters on Y and record the silhouette score of each.
score_cluster = np.zeros(8)
for i in range(2, 10):
    print(i)
    nc = i
    model = KMeans(init='k-means++', n_clusters=nc, n_init=10, max_iter=1000)
    label, center = SSRS.Cluster(Y, model, doplot=0)
    # Offset by 2 so that 2 clusters maps to index 0.
    score_cluster[i - 2] = metrics.silhouette_score(Y, label)
plt.plot(range(2, 10), score_cluster, '-*')

## cluster
# Final clustering of Y with a fixed cluster count; this overwrites the
# nc, model, label and center left behind by the loop above.
nc = 6
model = KMeans(init='k-means++',
               n_clusters=nc,
               n_init=15,
               max_iter=1000,
               tol=1e-15,
               verbose=True)
label, center = SSRS.Cluster(Y, model, doplot=0)

## PCA
# One component per column of Y.
pca = PCA(n_components=nband)
示例#7
0
# Standardise X; fit_transform re-fits the scaler that was just fitted.
scaler = preprocessing.StandardScaler().fit(X)
Xn = scaler.fit_transform(X)
# nind is assigned twice (rows of Y, then rows of X); the second wins.
nind, nband = Y.shape
nind, nattr = X.shape

# ----------------------------------------------------------------
# Clustering of Y with KMeans
# ----------------------------------------------------------------
nc = 6
model = KMeans(
    init='k-means++', n_clusters=nc, n_init=15,
    max_iter=1000, tol=1e-15, verbose=True)
label, center = SSRS.Cluster(Y, model, doplot=0)

# PCA with one component per column of Y; the cluster centres are projected
# with the same fitted transform.
pca = PCA(n_components=nband)
pca.fit(Y)
Ypca = pca.transform(Y)
Cpca = pca.transform(center)
# Flip the sign of the first component for both projections.
Ypca[:, 0] = -Ypca[:, 0]
Cpca[:, 0] = -Cpca[:, 0]

# Rename the clusters via SSRS.Cluster_rename.
ythe = np.array([0])
label, Cpca, center = SSRS.Cluster_rename(label, ythe, Cpca, center)

# Plot the clusters after renaming.
SSRS.Cluster_plot(Y, label, center)
示例#8
0
# Disabled alternative model (neural-network regressor), kept for reference:
# nn = Regressor(
#     layers=[
#         Layer("Sigmoid", units=200),
#         Layer("Sigmoid", units=200),
#         Layer("Linear")],
#     learning_rate=0.1,
#     n_iter=200,verbose=1)

# Fixed 80/20 split of the standardised predictors and the targets.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    Xn, Y, test_size=0.2, random_state=0)

# Fit regModel through SSRS.Regression, then report the test and training
# RMSE and the correlation of the first target column.
Yp, Yptrain, regModelList = SSRS.Regression(
    X_train, X_test, Y_train, Y_test,
    multiband=1, regModel=regModel, doplot=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
print(rmse_train)
print(np.corrcoef(Yp[:, 0], Y_test[:, 0]))

# Regression tree with hand-picked hyper-parameters, read positionally from
# par: max_features, max_depth, min_samples_split, min_samples_leaf,
# min_weight_fraction_leaf.
par = [1.0, 16, 6, 20, 0.15]
regTreeModel = tree.DecisionTreeRegressor(
    max_features=par[0],
    max_depth=par[1],
    min_samples_split=par[2],
    min_samples_leaf=par[3],
    min_weight_fraction_leaf=par[4],
    max_leaf_nodes=18)
fitModel = linear_model.LinearRegression()
Yp, Yptrain, regTreeModel, fitModelList, predind = SSRS.RegressionTree(
    X_train, X_test, Y_train, Y_test,
    regTreeModel, fitModel, Field, doFitSelection=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
示例#9
0
# Per row, the position (0..14) of the largest value among columns 15..29 of Y.
Y2=np.argmax(Y[:,15:30],axis=1)

## Regression
# Interactive-style model selection: every assignment replaces the previous
# one, so only RandomForestRegressor is used below.
regModel=linear_model.LinearRegression()
#regModel=svm.SVC()
regModel=KNeighborsRegressor(n_neighbors=20)
regModel=tree.DecisionTreeRegressor()
regModel=GaussianNB()
regModel=sklearn.linear_model.SGDRegressor()
regModel=RandomForestRegressor()

# Fixed 80/20 split; the target has two columns, Y1 and Y2.
# NOTE(review): Y1 is not defined in this fragment.
X_train,X_test,Y_train,Y_test = cross_validation.train_test_split(\
        Xn,np.column_stack((Y1,Y2)),test_size=0.2,random_state=0)

# This call unpacks five values from SSRS.Regression.
# NOTE(review): confirm this matches the return signature of the SSRS
# version in use.
Yp,rmse,rmse_train,rmse_band,rmse_band_train=SSRS.Regression\
    (X_train,X_test,Y_train,Y_test,multiband=1,regModel=regModel,doplot=0)
print(rmse)
print(rmse_train)

## Classification
# Same pattern as above: only the last classifier (RandomForestClassifier)
# is used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()
model=sklearn.ensemble.RandomForestClassifier()

# 10-fold cross-validated classification of Y1, followed by an error map.
Yin=Y1
Tp = SSRS.Classification_cross(Xn, T=Yin, nfold=10, model=model)
SSRS.plotErrorMap(Yin, Tp)
# The two expressions below are evaluated but their results are not stored
# or printed; they only show a value when run in an interactive session.
# Root-mean-square difference between true and predicted labels:
np.sqrt(((Yin - Tp) ** 2).mean())
# Count of predictions within 2 of the true label, divided by a hard-coded
# 4627 (presumably the number of samples -- TODO confirm).
np.count_nonzero(np.abs(Yin-Tp)<2)/4627.