示例#1
0
def test_ec_cross_val_score(X, y):
    """Check ExtendedClassifier.cross_val_score against scikit-learn.

    The value returned by the wrapper must equal the mean of sklearn's
    ``cross_val_score`` for the same estimator and ``cv=3``, and the same
    value must be recorded in the model profile under ``'cv_score'``.
    """
    estimator = LogisticRegression(solver='liblinear')
    wrapped = ExtendedClassifier(estimator)

    actual = wrapped.cross_val_score(X, y, cv=3)
    expected = cross_val_score(estimator, X, y, cv=3).mean()

    assert actual == expected
    assert wrapped.profile['cv_score'] == actual
示例#2
0
def test_ec_params_strategy():
    """Check that assigning ``params_strategy`` switches the classifier params.

    With ``'best'`` the wrapped classifier must report the ``C`` from
    ``best_params``; with ``'init'`` it must report the ``C`` it was
    constructed with.
    """
    init_params = {'C': 0.57}
    best_params = {'C': 1.25}

    wrapped = ExtendedClassifier(LogisticRegression(**init_params))
    # Hand the model its own copy so the expectation dict stays untouched.
    wrapped.best_params = dict(best_params)

    for strategy, expected in (('best', best_params), ('init', init_params)):
        wrapped.params_strategy = strategy
        assert wrapped.clf.get_params()['C'] == expected['C']
示例#3
0
def test_ec_grid_search_cv(X, y):
    """Check ExtendedClassifier.grid_search_cv against scikit-learn.

    The best parameters stored in the model profile under
    ``'gs_best_params'`` must equal those found by a plain ``GridSearchCV``
    run with the same estimator, grid and ``cv=2``.
    """
    estimator = SVC(gamma='scale')
    wrapped = ExtendedClassifier(estimator)
    search_space = {'C': [0.01, 0.1, 1, 10]}

    # The result is unpacked into two values (so it must be a 2-item
    # iterable), but the values themselves are not asserted on here.
    _best_params, _best_score = wrapped.grid_search_cv(X, y, search_space, cv=2)

    reference = GridSearchCV(estimator, search_space, cv=2)
    reference.fit(X, y)

    assert reference.best_params_ == wrapped.profile['gs_best_params']
示例#4
0
def test_ec_init(clf, name):
    """Check the state of a freshly constructed ExtendedClassifier.

    ``clf`` is the estimator to wrap and ``name`` is the value expected in
    ``last_step_name`` for it.
    """
    wrapped = ExtendedClassifier(clf)
    expected_type = clf.__class__

    # The wrapped estimator keeps its type and its constructor parameters.
    assert isinstance(wrapped.clf, expected_type)
    # Nothing has been profiled or tuned yet.
    assert wrapped.profile == {}
    assert wrapped.init_params == clf.get_params()
    assert wrapped.best_params is None
    assert wrapped._params_strategy == 'init'
    assert wrapped.last_step_name == name
def cross_validate_voting(X_train, y_train, pipes, grids, kfolds):
    """Cross-validate a hard-voting ensemble of the three stored pipelines.

    The ensemble combines ``pipes['logreg']``, ``pipes['forest']`` and
    ``pipes['svc']``, so those entries must already exist. ``kfolds`` is
    forwarded as the ``cv`` setting. ``grids`` is not used by this function.

    Returns the result of ``ExtendedClassifier.cross_validate``.
    """
    members = [(key, pipes[key]) for key in ('logreg', 'forest', 'svc')]
    ensemble = VotingClassifier(members, voting='hard')
    return ExtendedClassifier.cross_validate(
        ensemble, X_train, y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/voting',
        serialize_to=r'models/voting.pickle',
    )
示例#6
0
def test_ec_dump_profile_to_log(tmpdir):
    """Check that ``_dump_profile_to_log`` writes the expected log file.

    After filling the profile's timestamp, scoring and score entries, a file
    named ``123_abc_0.456.log`` must exist in ``tmpdir``.
    """
    wrapped = ExtendedClassifier(LogisticRegression())

    profile_entries = (
        ('cv_timestamp', str(123)),
        ('cv_scoring', 'abc'),
        ('cv_score', 0.456),
    )
    for key, value in profile_entries:
        wrapped.profile[key] = value

    wrapped._dump_profile_to_log(tmpdir)

    expected_log = tmpdir.join('123_abc_0.456.log')
    assert expected_log.isfile()
def cross_validate_logreg(X_train, y_train, pipes, grids, kfolds):
    """Build and cross-validate the LogisticRegression pipeline.

    Side effects: stores the pipeline in ``pipes['logreg']`` and its
    parameter grid in ``grids['logreg']``.

    The grid is searched with 3-fold CV (``sklearn_gscv_kws``), while
    ``kfolds`` is forwarded as the ``cv`` setting in ``sklearn_cvs_kws``.

    Returns the result of ``ExtendedClassifier.cross_validate``.
    """
    pipes['logreg'] = make_pipeline(SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                                                           mode_cols=['Embarked']),
                                    DataFrameDummifier(),
                                    LogisticRegression(solver='liblinear'))
    # Single, narrow grid for C. A wider grid used to be assigned to the same
    # key immediately before this one and was always overwritten, so that
    # dead assignment has been removed.
    grids['logreg'] = {'logisticregression__C': [0.6, 0.75, 0.8, 0.85, 0.9]}

    logreg = ExtendedClassifier.cross_validate(pipes['logreg'], X_train, y_train, grids['logreg'],
                                               sklearn_gscv_kws={'cv': 3},
                                               sklearn_cvs_kws={'cv': kfolds},
                                               param_strategy='best',
                                               logdir_path=r'logs/models/logreg',
                                               serialize_to=r'models/logreg.pickle')
    return logreg
def cross_validate_svc(X_train, y_train, pipes, grids, kfolds):
    """Build and cross-validate the SVC pipeline.

    Side effects: stores the pipeline in ``pipes['svc']`` and a parameter
    grid in ``grids['svc']``. The grid is only recorded; it is not passed to
    ``ExtendedClassifier.cross_validate`` here. ``kfolds`` is forwarded as
    the ``cv`` setting.

    Returns the result of ``ExtendedClassifier.cross_validate``.
    """
    imputer = SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                                     mode_cols=['Embarked'])
    pipes['svc'] = make_pipeline(
        imputer,
        DataFrameDummifier(),
        SVC(kernel='linear', C=0.1, probability=False),
    )

    grids['svc'] = {
        'svc__C': [0.001, 0.01, 0.1, 1, 10],
        'svc__gamma': [0.001, 0.01, 0.1, 1],
    }

    return ExtendedClassifier.cross_validate(
        pipes['svc'], X_train, y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/svc',
        serialize_to=r'models/svc.pickle',
    )
def cross_validate_forest(X_train, y_train, pipes, grids, kfolds, random_search=False):
    """Build and cross-validate the RandomForestClassifier pipeline.

    Side effect: stores the pipeline in ``pipes['forest']``. ``grids`` is not
    used by this function. ``kfolds`` is forwarded as the ``cv`` setting.

    When ``random_search`` is true, a ``RandomizedSearchCV`` is fitted first
    and its search space, fit time and best parameters are printed. Those
    results are informational only; the cross-validation below still uses
    the pipeline's initial parameters.

    Returns the result of ``ExtendedClassifier.cross_validate``.
    """
    pipes['forest'] = make_pipeline(
        CategoricalToString(),
        SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                               mode_cols=['Embarked']),
        OrdinalEncoder(cols=['Title', 'Deck', 'Embarked'],
                       handle_unknown='impute'),
        RandomForestClassifier(bootstrap=True,
                               max_depth=70,
                               max_features='auto',
                               min_samples_leaf=4,
                               min_samples_split=10,
                               n_estimators=64,
                               random_state=RANDOM_SEED),
    )

    if random_search:
        tree_counts = [int(v) for v in np.linspace(start=10, stop=500, num=10)]
        # Evenly spaced depths plus None as an extra candidate.
        depth_options = [int(v) for v in np.linspace(10, 110, num=11)] + [None]

        search_space = {
            'randomforestclassifier__n_estimators': tree_counts,
            'randomforestclassifier__max_features': ['auto', 'sqrt'],
            'randomforestclassifier__max_depth': depth_options,
            'randomforestclassifier__min_samples_split': [2, 5, 10],
            'randomforestclassifier__min_samples_leaf': [1, 2, 4],
            'randomforestclassifier__bootstrap': [True, False],
        }
        pprint.pprint(search_space)

        searcher = RandomizedSearchCV(pipes['forest'], search_space, n_iter=50, cv=3,
                                      verbose=0, random_state=42)
        started = time.time()
        searcher.fit(X_train, y_train)
        elapsed = time.time() - started
        print('randsearch.fit execution time:', elapsed)
        pprint.pprint(searcher.best_params_)

    return ExtendedClassifier.cross_validate(
        pipes['forest'], X_train, y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/forest',
        serialize_to=r'models/forest.pickle',
    )
示例#10
0
from pprint import pprint
from titanic.modelling import ExtendedClassifier

# Display settings: no limit on the number of columns or rows shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# NOTE(review): -1 looks like the legacy pandas value for "unlimited" column
# width; newer pandas releases may expect None instead - confirm against the
# pinned pandas version before changing.
pd.options.display.max_colwidth = -1

# Default figure size for all plots.
plt.rcParams['figure.figsize'] = [15, 4.5]

# ## Deserialize models

# In[3]:

# Load each serialized model, keyed by the name of its estimator.
models = {
    'LogisticRegression': ExtendedClassifier.deserialize(r'../models/logreg.pickle'),
    'RandomForestClassifier': ExtendedClassifier.deserialize(r'../models/forest.pickle'),
    'SVC': ExtendedClassifier.deserialize(r'../models/svc.pickle'),
    'VotingClassifier': ExtendedClassifier.deserialize(r'../models/voting.pickle'),
}
models

# ## Compare models

# In[4]:

# Collect the stored cross-validation score of every model.
scores = {label: loaded.profile['cv_score'] for label, loaded in models.items()}
scores

# In[5]: