Пример #1
0
def main():
    """Train an XGBoost regressor on Morgan fingerprints and export predictions.

    Pipeline, as visible in this function:

    1. Query ``solar.db`` for the ``'dip'`` property and unpack SMILES
       strings, compound ids and target values.
    2. Build molecules and Morgan fingerprints, then let
       ``refine_compounds`` post-process the lists using the molecules that
       failed fingerprinting.
    3. Split compound ids / targets 80/20 (``random_state=0``) and look the
       fingerprints up again by id for each split.
    4. Fit the regressor through ``modelfit``, compute cross-validated
       predictions on the training split, and predict both splits.
    5. Write all predictions to ``dip_xgb_train_test.csv`` and print the
       test RMSE.

    Returns
    -------
    None
    """
    qresult = connect_db('solar.db', 'dip')
    smiles, compounds, gaps = get_data(qresult)
    mols = get_mols(smiles)
    fps_morgan, failed_mols = get_fingerprints(mols)
    # NOTE(review): the return value is ignored, so this presumably prunes
    # the lists in place for molecules that failed -- confirm in the helper.
    refine_compounds(compounds, mols, gaps, failed_mols)

    compound_array = np.array(compounds)
    gaps_array = np.array(gaps)
    # The split is done on compound ids (not on fingerprints) so that the
    # ids can be written next to the predictions in the output CSV.
    train_id, test_id, y_train, y_test = train_test_split(compound_array,
                                                          gaps_array,
                                                          test_size=0.20,
                                                          random_state=0)
    train_fps = get_fp_from_id(compounds, fps_morgan, train_id)
    test_fps = get_fp_from_id(compounds, fps_morgan, test_id)

    xgb1 = XGBRegressor(n_estimators=2000,
                        learning_rate=0.03,
                        max_depth=7,
                        colsample_bytree=0.6,
                        nthread=8,
                        scale_pos_weight=1,
                        gamma=0,
                        random_state=0,
                        subsample=0.6,
                        min_child_weight=3,
                        early_stopping_rounds=10,
                        reg_alpha=1)
    modelfit(xgb1, train_fps, y_train)

    y_pred_cv = cvp(xgb1, train_fps, y_train, cv=4, n_jobs=8)
    y_train_pred = xgb1.predict(train_fps)
    y_pred_test = xgb1.predict(test_fps)

    train_df = pd.DataFrame({
        'id': pd.Series(train_id),
        'dip_exp': pd.Series(y_train),
        'dip_cv': pd.Series(y_pred_cv),
        'dip_gbdt': pd.Series(y_train_pred),
    })
    train_df['Group'] = 'Train'

    # Test rows have no cross-validated prediction; the plain test-set
    # prediction is stored in 'dip_cv' too so both frames share columns.
    test_df = pd.DataFrame({
        'id': pd.Series(test_id),
        'dip_exp': pd.Series(y_test),
        'dip_cv': pd.Series(y_pred_test),
        'dip_gbdt': pd.Series(y_pred_test),
    })
    test_df['Group'] = 'Test'

    result_df = pd.concat([train_df, test_df])
    result_df.to_csv('dip_xgb_train_test.csv')

    # scikit-learn's convention is (y_true, y_pred).
    test_err = mean_squared_error(y_test, y_pred_test)
    # Root of the MSE, printed with four decimals ('{:4f}' only set a
    # minimum width of 4, which never had any effect).
    print('Test error: {:.4f}'.format(np.sqrt(test_err)))
Пример #2
0
 def scan_fit(self,X,y):
     """Fit every scanning estimator and return their stacked class vectors.

     ``self._sample_slicer(X, y)`` supplies the sliced samples, their labels
     and the number of scan rounds per original sample. Each estimator in
     ``self.estimators`` is fitted on the sliced data. Its per-slice output
     comes from ``cvp`` when ``self.k_fold > 1``, otherwise from the
     estimator's ``oob_decision_function_`` (NaN entries are overwritten in
     place with ``1 / n_classes``). That output is reshaped to one row per
     original sample, and the rows of all estimators are concatenated
     horizontally.

     Side effect: sets ``self.n_classes`` to the number of distinct labels
     in ``y``.
     """
     self.n_classes = len(np.unique(y))
     sliced_X, sliced_y, rounds_per_sample = self._sample_slicer(X, y)
     # Width of one sample's vector for a single estimator.
     vector_width = rounds_per_sample * self.n_classes

     def class_vector(est):
         # Fit first: the OOB branch reads an attribute of the fitted estimator.
         est.fit(sliced_X, sliced_y)
         if self.k_fold > 1:  # cross-validated output
             probs = cvp(est, sliced_X, sliced_y, cv=self.k_fold, n_jobs=-1)
         else:  # out-of-bag output
             probs = est.oob_decision_function_
             # Fall back to the uniform value wherever the OOB entry is NaN.
             probs[np.isnan(probs)] = 1. / self.n_classes
         return probs.reshape((len(X), vector_width))

     return np.hstack([class_vector(est) for est in self.estimators])
Пример #3
0
    def predict(self, X, y):
        """
        Lazily produce one ``cvp`` result per internal model.

        Every model in ``self.models`` is handed to ``cvp`` together with
        ``X`` and ``y`` and ``cv=12``; the results are yielded in model
        order, and nothing is computed until the generator is consumed.

        Parameters
        ----------

        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Yields
        ------
        The value returned by ``cvp`` for each model.

        """
        yield from (cvp(estimator, X, y, cv=12) for estimator in self.models)
Пример #4
0
    def predict(self, X, y):
        """
        Generate the ``cvp`` output of each internal model, one at a time.

        The models are taken from ``self.models`` in order; each one is
        evaluated with ``cvp(model, X, y, cv=12)`` only when the generator
        is advanced.

        Parameters
        ----------

        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Yields
        ------
        The value returned by ``cvp`` for the current model.

        """
        for estimator in self.models:
            model_predictions = cvp(estimator, X, y, cv=12)
            yield model_predictions
Пример #5
0
print(set_sizes[0])
print('here', set_sizes[nrows] * 0.7)

# Split sizes are derived from set_sizes[nrows]: the first 70% of that many
# rows (head) train the models, the last 30% (tail) evaluate them.
n_train = int(set_sizes[nrows] * 0.7)
n_test = int(set_sizes[nrows] * 0.3)

X_train = X.head(n_train)
X_test = X.tail(n_test)

Y_train = Y.head(n_train)
Y_test = Y.tail(n_test)

# NOTE(review): `minibatches` is not a scikit-learn argument, so this
# LinearRegression presumably comes from another library -- confirm import.
ne_lr = LinearRegression(minibatches=None)
Y2 = pd.to_numeric(Y, downcast='float')
print("here", type((Y2)))

print(type(Y_train))

ne_lr.fit(X_train, pd.to_numeric(Y_train, downcast='float'))

print(ne_lr)

y_pred = ne_lr.predict(X_test)

# Hold-out mean squared error of the first model.
res = mean_squared_error(Y_test, y_pred)
print("results: ", res)

lin = linear_model.LinearRegression()

lin.fit(X_train, Y_train)

predictedCV = cvp(lin, X, Y, cv=10)
# The label promises an RMSE, so take the square root of the MSE
# (previously the raw MSE was printed under the "rmse" label).
print("rmse cross val", mean_squared_error(Y, predictedCV) ** 0.5)
Пример #6
0
    return data


def make_matrix(l):
    """Build a ROW x COL integer matrix from (row, column, value) entries.

    The matrix starts as all zeros. For every entry in ``l``, the cell at
    ``[entry[0]][entry[1]]`` is set to ``entry[2]``; a later entry for the
    same cell overwrites an earlier one.
    """
    grid = np.zeros((ROW, COL), dtype=int)
    for entry in l:
        row, col, value = entry[0], entry[1], entry[2]
        grid[row][col] = value
    return grid


if __name__ == "__main__":
    train = read_csv(TRAIN)
    gender = read_csv(GENDER)
    year = read_csv(YEAR)

    # X1 is the ROW x COL matrix built from the training entries; X2 is its
    # transpose, so the two tasks use opposite axes as samples.
    X1 = make_matrix(train)
    X2 = X1.T
    # Targets are the first column of each label file.
    Y1 = np.asarray(gender).T[0]
    Y2 = np.asarray(year).T[0]

    # Task 1: 10-fold cross-validated score on the first target.
    clf1 = logr()
    scores = cvs(clf1, X1, Y1, cv=10)
    print("Min CV error: {}".format(1 - max(scores)))

    # Task 2: cross-validated predictions for the second target, scored by
    # MSE against a constant predictor.
    clf2 = logr(solver="saga", multi_class="multinomial")
    pred = cvp(clf2, X2, Y2, cv=10)
    mse1 = mse(Y2, pred)
    # Force a float array: full_like would otherwise inherit Y2's dtype and
    # silently truncate the mean when Y2 holds integers.
    mse2 = mse(Y2, np.full_like(Y2, np.mean(Y2), dtype=float))
    print("Regression MSE: {}".format(mse1))
    print("Naive MSE: {}".format(mse2))
Пример #7
0
# Scikit-Image, Pillow, OpenCV, etc. can be used to make certain patterns
# (closed loops, for example) stand out more.

# From here on, the outputs below have not been verified (hardware limits:
# the run takes too long).
# Multilabel classification
# Usually each instance is assigned to exactly one class; here the classifier
# should emit several labels per instance.
# Note: not every classifier supports multilabel classification.
from sklearn.neighbors import KNeighborsClassifier
y_train_large = (y_train >= 7)  # True for labels greater than or equal to 7
y_train_odd = (y_train % 2 == 1)  # True for odd labels
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()  # default: n_neighbors = 5
knn_clf.fit(X_train, y_multilabel)
# print(knn_clf.predict([some_digit])) # [[False False]]: 2 is neither >= 7 nor odd
# Compute the F1 score. The classifier is trained on y_multilabel, so the
# cross-validated predictions must be produced from, and scored against,
# y_multilabel as well (not the single-label y_train).
y_train_knn_pred = cvp(knn_clf, X_train, y_multilabel, cv=3)
print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))

# Multioutput classification
# A generalization of multilabel classification in which each label can
# itself be multiclass (take more than two values).
# Illustrated by the example below: build a system that removes noise from
# images.
# Note: the classifier output is multilabel (one label per pixel, with pixel
# intensities ranging from 0 to 255).
# First, add noise to the clean images to create the training and test sets.
rnd = np.random.RandomState(42)  # RandomState lives in np.random, not in np
# The size argument is the full (n_samples, 784) tuple; with the parenthesis
# misplaced, 784 was passed as the dtype argument instead.
noise_train = rnd.randint(0, 100, (len(X_train), 784))
noise_test = rnd.randint(0, 100, (len(X_test), 784))
X_train_mod = X_train + noise_train
X_test_mod = X_test + noise_test
# Targets are the original clean images.
y_train_mod = X_train
y_test_mod = X_test
Пример #8
0
    def fit(self, X_train, y_train):
        """Grow the cascade level by level until accuracy stops improving.

        Each level consists of one fresh estimator per entry of
        ``self.params_list`` (instances of the class of
        ``self.base_estimator``). The first level is trained on ``X_train``;
        every later level is trained on the previous level's stacked class
        vectors concatenated in front of ``X_train``. A level is kept only if
        its accuracy (``self.evaluate`` on the averaged class vectors) is
        strictly higher than the best so far; the first level that fails
        this test is discarded and training stops.

        Per-estimator class vectors come from ``cvp`` when
        ``self.k_fold > 1``, otherwise from ``oob_decision_function_`` with
        NaN entries replaced in place by ``1 / n_classes``.
        NOTE(review): the argmax over axis 1 requires ``cvp`` to return a
        2-D (samples x classes) array -- confirm how ``cvp`` is configured.

        Parameters
        ----------
        X_train : array-like
            Training features; must be usable with ``np.hstack``.
        y_train : array-like
            Training labels.

        Side effects
        ------------
        Sets ``self.classes``, ``self.n_classes``, ``self.estimators_levels``
        and ``self.max_accuracy``; prints one progress line per kept level.
        """
        # Unique labels: column i of a class vector maps back to classes[i].
        self.classes = np.unique(y_train)
        self.n_classes = len(self.classes)
        self.estimators_levels = []
        klass = self.base_estimator.__class__

        def train_level(features):
            """Fit a new level on `features`; return (stacked vectors, accuracy)."""
            estimators = [klass(**params) for params in self.params_list]
            self.estimators_levels.append(estimators)
            predictions = []
            for estimator in estimators:
                estimator.fit(features, y_train)
                if self.k_fold > 1:  # use cv
                    predict_ = cvp(estimator,
                                   features,
                                   y_train,
                                   cv=self.k_fold,
                                   n_jobs=-1)
                else:  # use oob
                    predict_ = estimator.oob_decision_function_
                    # fill default value if meet nan
                    predict_[np.isnan(predict_)] = 1. / self.n_classes
                predictions.append(predict_)
            # Vote by averaging the class vectors of all estimators.
            averaged = np.array(predictions).mean(axis=0)
            y_pre = self.classes.take(np.argmax(averaged, axis=1), axis=0)
            return np.hstack(predictions), self.evaluate(y_pre, y_train)

        # first level
        attr_to_next_level, self.max_accuracy = train_level(X_train)

        # cascade step
        while True:
            print('level {}, CV accuracy: {}'.format(
                len(self.estimators_levels), self.max_accuracy))
            X_train_step = np.hstack((attr_to_next_level, X_train))
            attr_to_next_level, accuracy = train_level(X_train_step)
            if accuracy > self.max_accuracy:
                self.max_accuracy = accuracy
            else:
                # The new level did not help: drop it and stop growing.
                self.estimators_levels.pop()
                break