def fast_objective(
    max_depth,
    max_leaf,
    l1,
    l2,
    min_samples_leaf,
    learning_rate,
):
    # The optimizer proposes floats, so cast the integer-valued
    # hyperparameters before handing them to the model.
    max_leaf = int(max_leaf)
    max_depth = int(max_depth)
    min_samples_leaf = int(min_samples_leaf)

    model = FastRGFClassifier(
        max_leaf=max_leaf,
        max_depth=max_depth,
        l1=l1,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
    )
    model.fit(train_m, label_m)
    pred_proba = model.predict_proba(train_val)
    score = roc_auc_score(label_val, pred_proba[:, 1])
    return score
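The signature of fast_objective is shaped for a Bayesian optimizer, which proposes a float for every hyperparameter. A minimal sketch of wiring it into the bayes_opt package follows; the search bounds are illustrative assumptions, and train_m, label_m, train_val, label_val are assumed to already be in scope.

# A minimal sketch, assuming the bayes_opt package; bounds are placeholders.
from bayes_opt import BayesianOptimization

pbounds = {
    "max_depth": (3, 10),          # cast to int inside fast_objective
    "max_leaf": (100, 2000),       # cast to int inside fast_objective
    "l1": (0.0, 1.0),
    "l2": (0.1, 1000.0),
    "min_samples_leaf": (2, 20),   # cast to int inside fast_objective
    "learning_rate": (0.01, 0.5),
}

optimizer = BayesianOptimization(f=fast_objective, pbounds=pbounds, random_state=1)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best AUC found and the parameters that produced it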
def train_model(X_train, y_train, params):
    l1 = params["l1"]
    l2 = params["l2"]
    learning_rate = params["learning_rate"]
    max_leaf = int(params["max_leaf"])
    max_depth = int(params["max_depth"])
    min_samples_leaf = int(params["min_samples_leaf"])
    model = FastRGFClassifier(
        max_leaf=max_leaf,
        max_depth=max_depth,
        l1=l1,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
    )
    model.fit(X_train, y_train)
    return model
def run_model(X_train, y_train, X_val, y_val, params):
    l1 = params["l1"]
    l2 = params["l2"]
    learning_rate = params["learning_rate"]
    max_leaf = int(params["max_leaf"])
    max_depth = int(params["max_depth"])
    min_samples_leaf = int(params["min_samples_leaf"])
    model = FastRGFClassifier(
        max_leaf=max_leaf,
        max_depth=max_depth,
        l1=l1,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
    )
    model.fit(X_train, y_train)
    pred_proba = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, pred_proba)
    return pred_proba, score
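For illustration, run_model takes a plain dict of hyperparameters; the values below are arbitrary placeholders rather than tuned settings.

# Hypothetical usage; parameter values are placeholders, not tuned settings.
params = {
    "l1": 0.5, "l2": 10.0, "learning_rate": 0.1,
    "max_leaf": 1000, "max_depth": 6, "min_samples_leaf": 10,
}
pred_proba, score = run_model(X_train, y_train, X_val, y_val, params)
print("validation AUC: {:.4f}".format(score))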
def __init__(self, task, fast=False):
    if task == "classification":
        self.metric = "roc_auc"
        self.task = "classification"
        if fast:
            self.model = FastRGFClassifier()
        else:
            self.model = RGFClassifier(loss="Log")
    else:
        self.metric = "neg_mean_squared_error"
        self.task = "regression"
        if fast:
            self.model = FastRGFRegressor()
        else:
            self.model = RGFRegressor(loss="LS", normalize=True)
    self.X_test = None
    self.X_train = None
    self.y_test = None
    self.y_train = None
    self.grid_search = None
    self.y_predict = None
    self.test_score = None
def fit(self, x_train: np.ndarray, y_train: np.ndarray,
        x_valid: np.ndarray, y_valid: np.ndarray,
        config: dict, **kwargs) -> Tuple[RGFModel, dict]:
    model_params = config["model"]["model_params"]
    mode = config["model"]["train_params"]["mode"]
    if mode == "regression":
        model = FastRGFRegressor(**model_params)
    else:
        model = FastRGFClassifier(**model_params)

    # FastRGF cannot handle infinities or NaNs, so map them to a sentinel.
    x_train = (pd.DataFrame(x_train)
               .replace([np.inf, -np.inf], np.nan)
               .fillna(-999.0)
               .values.astype("float32"))
    y_train = (pd.DataFrame(y_train)
               .replace([np.inf, -np.inf], np.nan)
               .fillna(-999.0)
               .values.astype("float32"))
    model.fit(x_train, y_train)

    x_valid = (pd.DataFrame(x_valid)
               .replace([np.inf, -np.inf], np.nan)
               .fillna(-999.0)
               .values.astype("float32"))
    y_valid = (pd.DataFrame(y_valid)
               .replace([np.inf, -np.inf], np.nan)
               .fillna(-999.0)
               .values.astype("float32"))
    best_score = {"valid_score": model.score(x_valid, y_valid)}
    return model, best_score
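For reference, a hypothetical config matching the keys this method reads could look like the following; all parameter values are placeholders.

# Hypothetical config illustrating the structure fit() expects.
config = {
    "model": {
        "model_params": {
            "max_leaf": 1000,   # placeholder values
            "l2": 0.1,
            "learning_rate": 0.1,
        },
        "train_params": {"mode": "classification"},
    }
}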
def train_classifiers(X_data, y):
    """
    Trains several classifiers and reports model quality.

    :param X_data: feature matrix
    :param y: target labels
    :return: trained models and their grid-search objects
    """
    # Split the dataset into train and test
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=seed)

    svm = SVC()
    svm_params = {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf']
    }
    svm_model, svm_grid = train_single_classifier_type(
        svm, "SVM", svm_params, X_train, X_test, y_train, y_test)

    knn = KNeighborsClassifier()
    knn_params = {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1]
    }
    knn_model, knn_grid = train_single_classifier_type(
        knn, "KNN", knn_params, X_train, X_test, y_train, y_test)

    # Train the XGBoost model for classification.
    # Brute-force scan over the parameters; a few rules of thumb:
    # max_depth is usually 6-8, and the learning rate sits around 0.05,
    # though small changes can make a big difference. Tuning
    # min_child_weight, subsample and colsample_bytree helps fight
    # overfitting; n_estimators is the number of boosting rounds.
    # Finally, ensembling XGBoost over multiple seeds may reduce variance.
    xgb_model = xgb.XGBClassifier()
    xgb_parameters = {
        'nthread': [4],  # with hyperthreading, xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # the so-called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators': [5, 100, 1000],  # number of boosting rounds
        'missing': [-999],
        'seed': [1337]
    }
    train_model1, xgb_grid = train_single_classifier_type(
        xgb_model, "XGBoost", xgb_parameters,
        X_train, X_test, y_train, y_test)

    rfc = RandomForestClassifier()
    rfc_parameters = {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    }
    rfc_model, rfc_grid = train_single_classifier_type(
        rfc, "Random Forest", rfc_parameters,
        X_train, X_test, y_train, y_test)

    ext = ExtraTreesClassifier()
    ext_parameters = {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    }
    ext_model, ext_grid = train_single_classifier_type(
        ext, "Extra Trees", ext_parameters,
        X_train, X_test, y_train, y_test)

    lgbm = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # updated from 'nthread'
        silent=True)
    # Create parameters to search
    lgbm_parameters = {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # updated from 'seed'
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    }
    lgbm_model, lgbm_grid = train_single_classifier_type(
        lgbm, "LGBM", lgbm_parameters, X_train, X_test, y_train, y_test)

    rgf = RGFClassifier()
    rgf_parameters = {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    }
    rgf_model, rgf_grid = train_single_classifier_type(
        rgf, "RGF", rgf_parameters, X_train, X_test, y_train, y_test)

    frgf = FastRGFClassifier()
    frgf_parameters = {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    }
    frgf_model, frgf_grid = train_single_classifier_type(
        frgf, "FRGF", frgf_parameters, X_train, X_test, y_train, y_test)

    # The KNN pair was trained above but missing from the original return.
    return svm_model, svm_grid, \
        knn_model, knn_grid, \
        train_model1, xgb_grid, \
        rfc_model, rfc_grid, \
        ext_model, ext_grid, \
        lgbm_model, lgbm_grid, \
        rgf_model, rgf_grid, \
        frgf_model, frgf_grid
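train_single_classifier_type itself is not shown here. A plausible minimal sketch, assuming it wraps scikit-learn's GridSearchCV and matches the call sites above (estimator, display name, parameter grid, then the four data splits), might be:

# A minimal sketch, assuming a GridSearchCV-based helper; the cv and n_jobs
# settings are illustrative choices, not confirmed by the source.
from sklearn.model_selection import GridSearchCV

def train_single_classifier_type(estimator, name, param_grid,
                                 X_train, X_test, y_train, y_test):
    # Exhaustive grid search with 5-fold cross-validation on the train split.
    grid = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    # Report held-out quality for the refit best estimator.
    print("{}: best params {}, test score {:.4f}".format(
        name, grid.best_params_, best_model.score(X_test, y_test)))
    return best_model, grid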
import time

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import check_random_state
from rgf.sklearn import RGFClassifier, FastRGFClassifier

iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

start = time.time()
clf = RGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = FastRGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = GradientBoostingClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("Gradient Boosting: {} sec".format(end - start))
print("score: {}".format(score))
            n_estimators=200,
            learning_rate=0.2,
            max_depth=15,
            scale_pos_weight=1.5,
            gamma=1))),
])

rgf_pipeline_cnt = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words, min_df=4,
                              max_features=30000, max_df=.99)),
    ('clf', OneVsRestClassifier(
        FastRGFClassifier(n_estimators=500,
                          max_depth=6,
                          min_samples_leaf=10))),
])

rgf_pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words, min_df=4,
                              max_features=30000, max_df=.99)),
    ('clf', OneVsRestClassifier(
        FastRGFClassifier(n_estimators=500,
                          max_depth=6,
                          min_samples_leaf=10))),
])
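A usage sketch under stated assumptions: X_train is an iterable of raw documents and y_train is a binary indicator matrix with one column per label, which is what OneVsRestClassifier expects for multilabel targets.

# Hypothetical usage; X_train/y_train shapes are assumptions, not from the source.
rgf_pipeline_tfidf.fit(X_train, y_train)
y_pred = rgf_pipeline_tfidf.predict(X_test)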