Example #1
def test_gridsearch_crossval(
    model=SVC(random_state=0),
    return_model=False,
    param_grid=None,
    opt_score=0.9298,
    assertions=True,
    scoring=None,
    verbose=False,
):
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations WITHOUT a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(X_train, y_train, scoring=scoring, verbose=False)

    # Compare with default model without hyperopt
    default = SVC(random_state=0)
    default.fit(X_train, y_train)

    default_score = round(default.score(X_test, y_test), 4)
    gs_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score, '| GridSearch Score:', gs_score)

    if assertions:
        assert (gs_score == opt_score)

    if return_model:
        return gs
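
This test presumably relies on imports like the following; the GridSearch here matches hypopt's validation-set/cross-validation grid search (an assumption based on the constructor and fit signature), not scikit-learn's GridSearchCV.

# Assumed imports for the snippet above (hypopt is a guess from the API usage).
from hypopt import GridSearch
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC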
Example #2
class Prediction:
    def __init__(self, data, model, prefix, param_grid=[]):
        self.train_df, self.test_df = data
        self.model = model
        self.param_grid = param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        # Evaluate each parameter combination on the validation set and keep the best.
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val),
                                        self.y_val)
            print(score)
            # Save if best (lower MAE is better); update the running best so
            # later, worse grids do not overwrite it.
            if score < best_score:
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        print('Training...')
        self.gscv = GridSearchCV(self.model,
                                 self.param_grid,
                                 scoring='neg_mean_absolute_error',
                                 verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train,
                     self.y_train,
                     self.X_val,
                     self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val),
                                         self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        # Average the cross-validated scores (scoring is negated MAE in sklearn).
        cv_scores = cross_val_score(
            self.model,
            self.X,
            self.y,
            cv=cv,
            scoring='neg_mean_absolute_error'
        )
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        if self.param_grid == []:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',',
                      index=False)
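
A minimal usage sketch for the class above, assuming a training DataFrame with a 'precio' target column and a matching test DataFrame; the file paths, model, and grid below are illustrative placeholders, not from the original project.

# Hypothetical usage of the Prediction class; paths, model, and grid are placeholders.
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

train_df = pd.read_csv('data/train.csv')                      # must contain the 'precio' target
test_df = pd.read_csv('data/test.csv').drop(columns=['id'])   # same feature columns as training

pred = Prediction((train_df, test_df),
                  model=GradientBoostingRegressor(),
                  prefix='gbr-',
                  param_grid={'n_estimators': [100, 200], 'max_depth': [3, 5]})
pred.gridSearchTrain()  # GridSearchCV with neg_mean_absolute_error
pred.save()             # pickles the fitted search object
pred.submit()           # writes '<prefix>-<score>.csv' with id/target columns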
Example #3
# print('feature counts: {0}'.format(len(features)))

X_train = vect_val.transform(X_train)
X_val = vect_val.transform(X_val)
print('******** GridSearch ********')
param_grid = {
    'n_estimators': [40, 60, 80, 100, 120],
    'learning_rate': [0.1, 0.15, 0.2],
    'max_depth': [6, 7, 8, 9, 10]
}
scorer = make_scorer(f2)

gs = GridSearch(model=GradientBoostingClassifier())
gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring=scorer)
print('params: ', gs.get_best_params())
print('Test Score for Optimized Parameters:', gs.score(X_val, y_val))

# print('******** GradientBoostingClassifier ********')
# gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, max_depth=7)
# gb, preds_train, preds = train_and_predict(gb, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** AdamBoostingClassifier ********')
# ada = AdaBoostClassifier()
# ada, preds_train, preds = train_and_predict(ada, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** XgBoostClassifier ********')
# xgb = XGBClassifier()
# xgb, preds_train, preds = train_and_predict(xgb, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
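
The f2 metric passed to make_scorer above is not defined in this fragment; a common reading is the F-beta score with beta=2, and the sketch below shows one plausible definition (an assumption, not the original code).

# Assumption: 'f2' denotes the F-beta score with beta=2 (recall-weighted F-score).
from sklearn.metrics import fbeta_score, make_scorer

def f2(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)

scorer = make_scorer(f2)  # equivalently: make_scorer(fbeta_score, beta=2)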
Example #4
# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.3, 
    random_state = 0,
    stratify = y_train,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150], 
    'gamma': [0.001, 0.0001], 
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model = SVC(random_state=0), param_grid=param_grid, parallelize=False)
# You can choose the metric to optimize (f1, auc_roc, accuracy, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, X_val, y_val, scoring = 'f1')

# Compare with default model without hyperopt
default = SVC(random_state=0)
_ = default.fit(X_train, y_train)
print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))

Example #5
default = MLPClassifier(max_iter=50, random_state=0)
default.fit(X_train, y_train)
test_score = round(default.score(X_test, y_test), 4)
val_score = round(default.score(X_val, y_val), 4)
print('\nTEST SCORE (default parameters):', test_score)
print('VALIDATION SCORE (default parameters):', val_score)

# In[5]:


gs_val = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                    param_grid=param_grid,
                    parallelize=False)
print("Grid-search using a validation set.\n", "-" * 79)
get_ipython().magic(
    u"time gs_val.fit(X_train, y_train, X_val, y_val, scoring = 'accuracy')")
test_score = round(gs_val.score(X_test, y_test), 4)
val_score = round(gs_val.score(X_val, y_val), 4)
print('\nTEST SCORE (hyper-parameter optimization with validation set):',
      test_score)
print('VALIDATION SCORE (hyper-parameter optimization with validation set):',
      val_score)

# In[6]:

gs_cv = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                   param_grid=param_grid,
                   cv_folds=6)
print(
    "\n\nLet's see how long grid-search takes to run when we don't use a validation set."
)
print("Grid-search using cross-validation.\n", "-" * 79)
Example #6
def test_regression(
    model=SVR(),
    return_model=False,
    param_grid=None,
    gs_score=.4532,
    assertions=True,
    scoring=None,
    verbose=False,
):
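    # Note: load_boston was deprecated and later removed from scikit-learn (1.2+);
    # this snippet targets an older scikit-learn release.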
    from sklearn.datasets import load_boston

    data = load_boston()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.1,
        random_state=0,
    )

    # Create a validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.1,
        random_state=0,
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations using a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(
        X_train,
        y_train,
        X_val,
        y_val,
        scoring=scoring,
        verbose=True,
    )

    # Compare with default model without hyperopt
    default = model
    default.fit(X_train, y_train)

    default_score = round(default.score(X_test, y_test), 4)
    gridsearch_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score, '| GridSearch Score:',
              gridsearch_score)

    if assertions:
        assert (default_score == .0175)
        assert (gridsearch_score is not None)

    if return_model:
        return gs
Example #7
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    np.random.seed(0)
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)
    logger = logging.getLogger('myapp')
    hdlr = logging.FileHandler('classifiers.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)
    if classifier.lower() == 'svm':  # best: C=50, gamma=0.0001, kernel=rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  #120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  #120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:  # otherwise assume a neural network (multi-layer perceptron)
        # best: activation=tanh, hidden_layer_sizes=(20, 20), learning_rate=adaptive, solver=lbfgs
        model = MLPClassifier(max_iter=100)
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20), (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }
    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    # Fit without a validation set first (hypopt then falls back to cross-validation)
    # and log the resulting test-set score.
    tuned_model.fit(train, truth)
    score = tuned_model.score(test, test_truth)
    logger.warning(classifier + ' ' + datatype + ' validate    ' + str(score))
    # Refit using the explicit validation set for hyper-parameter selection.
    tuned_model.fit(train, truth, validate, validate_truth)
    score = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth,
                              prediction,
                              target_names=target_names))
    logger.warning(classifier + ' ' + datatype + '    ' + str(prediction))
    return
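
The target_names above appear to match the eight classes of the UCI Mice Protein Expression dataset. Below is a hedged sketch of how the function might be called; the pre-split arrays and the 'protein' datatype label are placeholders, not from the original project.

# Hypothetical call: fits the chosen model twice (without, then with the explicit
# validation set) and prints/logs the test-set results.
classifier('svm',
           X_tr, y_tr,      # training features / labels
           X_val, y_val,    # validation features / labels
           X_te, y_te,      # test features / labels
           'protein')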