예제 #1
0
    def fast_objective(
        max_depth,
        max_leaf,
        l1,
        l2,
        min_samples_leaf,
        learning_rate,
    ):
        """Train a FastRGFClassifier and return its validation ROC-AUC.

        Integer-valued hyperparameters typically arrive as floats from a
        Bayesian optimizer, so they are cast to ``int`` before use.

        :return: ROC-AUC score on the validation split (higher is better).
        """
        # int() already guarantees the type here; the former
        # `assert type(x) == int` checks were redundant and would be
        # stripped under `python -O`, so they are removed.
        max_leaf = int(max_leaf)
        max_depth = int(max_depth)
        min_samples_leaf = int(min_samples_leaf)

        model = FastRGFClassifier(
            max_leaf=max_leaf,
            max_depth=max_depth,
            l1=l1,
            l2=l2,
            min_samples_leaf=min_samples_leaf,
            learning_rate=learning_rate,
        )
        # NOTE(review): train_m/label_m/train_val/label_val come from the
        # enclosing scope -- confirm they are defined before this is called.
        model.fit(train_m, label_m)
        pred_proba = model.predict_proba(train_val)
        # Score on the positive-class probability column.
        return roc_auc_score(label_val, pred_proba[:, 1])
예제 #2
0
def train_model(X_train, y_train, params):
    """Fit a FastRGFClassifier on the training data.

    :param X_train: training feature matrix
    :param y_train: training labels
    :param params: dict with keys ``l1``, ``l2``, ``learning_rate``,
        ``max_leaf``, ``max_depth`` and ``min_samples_leaf`` (the last
        three are coerced to int)
    :return: the fitted model
    """
    model = FastRGFClassifier(
        max_leaf=int(params["max_leaf"]),
        max_depth=int(params["max_depth"]),
        l1=params["l1"],
        l2=params["l2"],
        min_samples_leaf=int(params["min_samples_leaf"]),
        learning_rate=params["learning_rate"],
    )
    model.fit(X_train, y_train)
    return model
예제 #3
0
def run_model(X_train, y_train, X_val, y_val, params):
    """Fit a FastRGFClassifier and evaluate it on a validation split.

    :param params: dict of hyperparameters; ``max_leaf``, ``max_depth``
        and ``min_samples_leaf`` are coerced to int
    :return: (positive-class probabilities on X_val, ROC-AUC score)
    """
    hyperparams = {
        "l1": params["l1"],
        "l2": params["l2"],
        "learning_rate": params["learning_rate"],
        "max_leaf": int(params["max_leaf"]),
        "max_depth": int(params["max_depth"]),
        "min_samples_leaf": int(params["min_samples_leaf"]),
    }
    model = FastRGFClassifier(**hyperparams)
    model.fit(X_train, y_train)

    pred_proba = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, pred_proba)
    return pred_proba, score
예제 #4
0
 def __init__(self, task, fast=False):
     """Select the scoring metric and RGF estimator for *task*.

     :param task: 'classification' selects ROC-AUC + an RGF classifier;
         anything else selects neg-MSE + an RGF regressor.
     :param fast: use the FastRGF variant instead of the standard one.
     """
     if task == 'classification':
         self.task = "classification"
         self.metric = 'roc_auc'
         # FastRGF trades some accuracy for speed on large datasets.
         self.model = (FastRGFClassifier() if fast
                       else RGFClassifier(loss="Log"))
     else:
         self.task = "regression"
         self.metric = 'neg_mean_squared_error'
         self.model = (FastRGFRegressor() if fast
                       else RGFRegressor(loss="LS", normalize=True))
     # Populated later by the fit/evaluate workflow.
     self.X_train = None
     self.X_test = None
     self.y_train = None
     self.y_test = None
     self.grid_search = None
     self.y_predict = None
     self.test_score = None
예제 #5
0
    def fit(self, x_train: np.ndarray, y_train: np.ndarray,
            x_valid: np.ndarray, y_valid: np.ndarray, config: dict,
            **kwargs) -> Tuple[RGFModel, dict]:
        """Fit a FastRGF model and score it on the validation split.

        :param config: expects ``config["model"]["model_params"]`` (kwargs
            for the estimator) and ``config["model"]["train_params"]["mode"]``
            ("regression" selects FastRGFRegressor, anything else the
            classifier).
        :return: (fitted model, {"valid_score": validation score})
        """

        def _sanitize(arr: np.ndarray) -> np.ndarray:
            # FastRGF cannot handle inf/NaN: map both to the -999.0
            # sentinel and force float32. Extracted because the same
            # cleaning was previously copy-pasted four times.
            return (pd.DataFrame(arr)
                    .replace([np.inf, -np.inf], np.nan)
                    .fillna(-999.0)
                    .values.astype("float32"))

        model_params = config["model"]["model_params"]
        mode = config["model"]["train_params"]["mode"]
        if mode == "regression":
            model = FastRGFRegressor(**model_params)
        else:
            model = FastRGFClassifier(**model_params)

        # NOTE(review): sanitizing y with the -999 sentinel also rewrites
        # missing *labels*, not just features -- confirm that is intended.
        model.fit(_sanitize(x_train), _sanitize(y_train))

        best_score = {"valid_score": model.score(_sanitize(x_valid),
                                                 _sanitize(y_valid))}

        return model, best_score
예제 #6
0
def train_classifiers(X_data, y):
    """
    Train several classifier families via grid search and report quality.

    Each family is delegated to ``train_single_classifier_type`` with its
    own hyperparameter grid.

    :param X_data: feature matrix
    :param y: target labels
    :return: (model, grid) pairs for SVM, XGBoost, Random Forest,
        Extra Trees, LGBM, RGF and FastRGF
    """
    # Split the dataset into Train and Test
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed)

    # Support-vector machine over a coarse C/gamma/kernel grid.
    svm = SVC()
    svm_params = {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf']
    }
    svm_model, svm_grid = train_single_classifier_type(svm, "SVM", svm_params,
                                                       X_train, X_test,
                                                       y_train, y_test)

    # k-nearest neighbours.
    # NOTE(review): knn_model/knn_grid are trained but NOT included in the
    # return statement below -- confirm whether dropping them is intentional.
    knn = KNeighborsClassifier()
    knn_params = {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1]
    }
    knn_model, knn_grid = train_single_classifier_type(knn, "KNN", knn_params,
                                                       X_train, X_test,
                                                       y_train, y_test)

    # Train the XGboost Model for Classification
    xgb_model = xgb.XGBClassifier()

    # brute force scan for all parameters, here are the tricks
    # usually max_depth is 6,7,8
    # learning rate is around 0.05, but small changes may make big diff
    # tuning min_child_weight subsample colsample_bytree can have
    # much fun of fighting against overfit
    # n_estimators is how many round of boosting
    # finally, ensemble xgboost with multiple seeds may reduce variance
    xgb_parameters = {
        'nthread': [4],  # when use hyperthread, xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # so called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators':
        [5, 100,
         1000],  # number of trees, change it to 1000 for better results
        'missing': [-999],
        'seed': [1337]
    }

    train_model1, xgb_grid = train_single_classifier_type(
        xgb_model, "XGBoost", xgb_parameters, X_train, X_test, y_train, y_test)

    # Random forest ensemble.
    rfc = RandomForestClassifier()

    rfc_parameters = {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    }

    rfc_model, rfc_grid = train_single_classifier_type(rfc, "Random Forest",
                                                       rfc_parameters, X_train,
                                                       X_test, y_train, y_test)

    # Extremely-randomized trees ensemble.
    ext = ExtraTreesClassifier()

    ext_parameters = {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    }

    ext_model, ext_grid = train_single_classifier_type(ext, "Extra Trees",
                                                       ext_parameters, X_train,
                                                       X_test, y_train, y_test)

    # LightGBM gradient boosting.
    lgbm = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # Updated from 'nthread'
        silent=True)
    # Create parameters to search
    lgbm_parameters = {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # Updated from 'seed'
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    }
    lgbm_model, lgbm_grid = train_single_classifier_type(
        lgbm, "LGBM", lgbm_parameters, X_train, X_test, y_train, y_test)

    # Regularized Greedy Forest.
    rgf = RGFClassifier()
    rgf_parameters = {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    }

    rgf_model, rgf_grid = train_single_classifier_type(rgf, "RGF",
                                                       rgf_parameters, X_train,
                                                       X_test, y_train, y_test)

    # FastRGF: faster approximate variant of RGF.
    frgf = FastRGFClassifier()
    frgf_parameters = {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    }

    frgf_model, frgf_grid = train_single_classifier_type(
        frgf, "FRGF", frgf_parameters, X_train, X_test, y_train, y_test)

    # KNN results are intentionally (?) absent here -- see NOTE above.
    return svm_model, svm_grid, \
           train_model1, xgb_grid, \
           rfc_model, rfc_grid, \
           ext_model, ext_grid, \
           lgbm_model, lgbm_grid, \
           rgf_model, rgf_grid, \
           frgf_model, frgf_grid
# Benchmark RGF vs FastRGF vs sklearn GradientBoosting on the iris dataset.
iris = datasets.load_iris()
rng = check_random_state(0)
# Shuffle samples with a fixed-seed permutation (applied to X and y alike).
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# NOTE(review): each model below is scored on its own training data, so the
# printed scores measure fit capacity/speed, not generalization.
start = time.time()
clf = RGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = FastRGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = GradientBoostingClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("Gradient Boosting: {} sec".format(end - start))
print("score: {}".format(score))
예제 #8
0
                       n_estimators=200,
                       learning_rate=0.2,
                       max_depth=15,
                       scale_pos_weight=1.5,
                       gamma=1))),
])

# TF-IDF features (30k-term vocabulary, df in [4, 0.99]) feeding a
# one-vs-rest FastRGF classifier.
rgf_pipeline_cnt = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words,
                              min_df=4,
                              max_features=30000,
                              max_df=0.99)),
    ('clf', OneVsRestClassifier(FastRGFClassifier(n_estimators=500,
                                                  max_depth=6,
                                                  min_samples_leaf=10))),
])

# Identical configuration to rgf_pipeline_cnt: TF-IDF features into a
# one-vs-rest FastRGF classifier.
rgf_pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words,
                              min_df=4,
                              max_features=30000,
                              max_df=0.99)),
    ('clf', OneVsRestClassifier(FastRGFClassifier(n_estimators=500,
                                                  max_depth=6,
                                                  min_samples_leaf=10))),
])

# In[92]: