Example #1
File: models.py  Project: atanna/neptune
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split

def fit_and_count_av_score(self, X, Y, cv=3, n_estimators=100, test_size=0.4):
    # Hold out a test split and fit the wrapped model on the rest
    # (train_test_split/cross_val_score now live in sklearn.model_selection,
    # formerly sklearn.cross_validation).
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
    self.fit(X_train, y_train, n_estimators)
    # Average ROC AUC over cv folds of the held-out split.
    auc_scores = cross_val_score(self.M, X_test, y_test, cv=cv, n_jobs=-1,
                                 scoring='roc_auc')
    y_pred = self.predict(X_test)
    print("roc_auc_score", auc_scores.mean())
    # bac_metric and self.bin are project-local helpers (balanced accuracy
    # and probability binarization); they are not part of scikit-learn.
    print("bac_score: ", bac_metric(y_test, y_pred))
    print(classification_report(y_test, self.bin(y_pred, 0)))
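For reference, a minimal self-contained sketch of the same evaluation pattern; the synthetic dataset and RandomForestClassifier below are illustrative stand-ins for the project's wrapper class (self.M, self.fit, and self.bin are not shown in the snippet):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Stand-ins for the project's data and wrapped model.
X, Y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print("roc_auc:", cross_val_score(clf, X_test, y_test, cv=3,
                                  scoring='roc_auc').mean())
print(classification_report(y_test, clf.predict(X_test)))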
Example #2
File: models.py  Project: atanna/neptune
from sklearn.feature_selection import (SelectFdr, SelectFpr, SelectFwe,
                                       SelectPercentile)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)

def preprocess_bin_cl(self, X, Y, list_clf=None, n_estimators=50, test_size=0.4):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)

    # Candidate pipelines: the bare classifier plus one variant per
    # feature-selection step (PCA/LDA/LinearSVC variants left disabled).
    if list_clf is None:
        list_clf = [
            self._get_clf(n_estimators, []),
            self._get_clf(n_estimators, [("SelectFpr", SelectFpr())]),
            self._get_clf(n_estimators, [("SelectFwe", SelectFwe())]),
            self._get_clf(n_estimators, [("SelectFdr", SelectFdr())]),
            self._get_clf(n_estimators, [("SelectPercentile", SelectPercentile())]),
            # self._get_clf(n_estimators, [("PCA", PCA())]),
            # self._get_clf(n_estimators, [("LDA", LDA())]),
            # self._get_clf(n_estimators, [("linearSVC", LinearSVC())]),
        ]

    best_auc_clf, best_auc, best_auc__bac = None, 0, 0
    best_bac_clf, best_bac, best_bac__auc = None, 0, 0
    # Grid for the random-forest step, which the pipelines name "RF".
    parameters = {'RF__criterion': ('gini', 'entropy'),
                  'RF__max_features': ('auto', 'sqrt'),
                  'RF__max_depth': (50, 100, None)}

    for cl in list_clf:
        # Extend the grid with parameters for whichever selection step
        # this pipeline actually contains.
        param = dict(parameters)
        for step in cl.steps:
            if step[0] in {"SelectFpr", "SelectFwe", "SelectFdr"}:
                param["{}__alpha".format(step[0])] = (0.01, 0.05, 0.1)
            if step[0] == "SelectPercentile":
                param["SelectPercentile__percentile"] = (10, 15, 20, 40, 80)

        gs = GridSearchCV(cl, param,
                          scoring='roc_auc',
                          n_jobs=-1).fit(X_train, y_train)
        cl = gs.best_estimator_
        y_pred = cl.predict_proba(X_test)[:, 1]
        auc_cv = cross_val_score(cl, X_test, y_test, scoring="roc_auc",
                                 cv=3, n_jobs=-1)
        auc = roc_auc_score(y_test, y_pred)
        bac = bac_metric(y_test, y_pred)  # project-local balanced-accuracy helper
        # Feature count surviving the pipeline's transforms; note this relies
        # on the old scikit-learn API, in which Pipeline.transform also passed
        # through the final forest.
        print("n_params: {}  ({})".format(cl.transform(X[0:1]).shape[1],
                                          X.shape[1]))
        print("steps: {steps}\n"
              "auc: {auc} {auc_cv_mean} ({auc_cv})\nbac: {bac}\n"
              .format(steps=cl.steps,
                      auc=auc,
                      auc_cv_mean=auc_cv.mean(),
                      auc_cv=auc_cv,
                      bac=bac))
        # Rank candidates by the cross-validated AUC, not the single-split score.
        auc = auc_cv.mean()
        if auc > best_auc:
            best_auc_clf, best_auc, best_auc__bac = cl, auc, bac
        if bac > best_bac:
            best_bac_clf, best_bac, best_bac__auc = cl, bac, auc

    print("best_auc_clf:")
    print("steps:{} auc: {}  bac: {}"
          .format([step[0] for step in best_auc_clf.steps],
                  best_auc,
                  best_auc__bac))

    print("best_bac_clf:")
    print("steps:{} auc: {}  bac: {}"
          .format([step[0] for step in best_bac_clf.steps],
                  best_bac__auc,
                  best_bac))

    # Keep the AUC winner unless the BAC winner gains more BAC than it
    # gives up in AUC.
    d_bac = best_bac - best_auc__bac
    d_auc = best_auc - best_bac__auc
    if d_bac - d_auc > 0:
        self.best_clf = best_bac_clf
    else:
        self.best_clf = best_auc_clf
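The _get_clf helper is project code whose body is not shown; from the step names and the "RF__" grid keys it appears to build a Pipeline of an optional selection step followed by a random forest named "RF". A minimal sketch of that inferred shape on illustrative data (every name and value below is an assumption, not the project's implementation):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFpr
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# Hypothetical reconstruction of one _get_clf pipeline.
X, Y = make_classification(n_samples=500, n_features=30, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

pipe = Pipeline([("SelectFpr", SelectFpr()),
                 ("RF", RandomForestClassifier(n_estimators=50))])
param = {"RF__criterion": ("gini", "entropy"),
         "SelectFpr__alpha": (0.01, 0.05, 0.1)}
gs = GridSearchCV(pipe, param, scoring="roc_auc", n_jobs=-1).fit(X_train, y_train)
print(gs.best_params_)
print("test accuracy:", gs.best_estimator_.score(X_test, y_test))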