def test_ec_cross_val_score(X, y):
    """Check ExtendedClassifier.cross_val_score against scikit-learn.

    The value returned by the wrapper must equal the mean of
    ``cross_val_score`` computed for the same estimator with ``cv=3``, and
    the same value must be stored in ``model.profile['cv_score']``.

    ``X`` and ``y`` are supplied by the caller (presumably pytest fixtures
    defined elsewhere -- confirm).
    """
    estimator = LogisticRegression(solver='liblinear')
    wrapped = ExtendedClassifier(estimator)

    # Run the wrapper first, then the plain scikit-learn reference.
    actual = wrapped.cross_val_score(X, y, cv=3)
    fold_scores = cross_val_score(estimator, X, y, cv=3)
    expected = fold_scores.mean()

    assert actual == expected
    assert wrapped.profile['cv_score'] == actual
def test_ec_params_strategy():
    """Check that switching ``params_strategy`` swaps the estimator's params.

    With ``best_params`` set on the wrapper, selecting ``'best'`` must make
    the wrapped estimator report the best ``C``; selecting ``'init'`` must
    make it report the ``C`` it was constructed with.
    """
    init_params = {'C': 0.57}
    best_params = {'C': 1.25}

    wrapped = ExtendedClassifier(LogisticRegression(**init_params))
    # Hand the wrapper its own copy so the expectation dict stays separate.
    wrapped.best_params = dict(best_params)

    expectations = (('best', best_params), ('init', init_params))
    for strategy, expected in expectations:
        wrapped.params_strategy = strategy
        assert wrapped.clf.get_params()['C'] == expected['C']
def test_ec_grid_search_cv(X, y):
    """Check ExtendedClassifier.grid_search_cv against GridSearchCV.

    The best parameters recorded in ``model.profile['gs_best_params']`` must
    equal ``best_params_`` of a ``GridSearchCV`` fitted on the same
    estimator, grid and ``cv=2``.
    """
    estimator = SVC(gamma='scale')
    wrapped = ExtendedClassifier(estimator)
    search_space = {'C': [0.01, 0.1, 1, 10]}

    # The return value is unpacked as a pair; the two values themselves are
    # not asserted on in this test.
    method_best_params, method_best_score = wrapped.grid_search_cv(
        X, y, search_space, cv=2)

    reference = GridSearchCV(estimator, search_space, cv=2)
    reference.fit(X, y)

    assert reference.best_params_ == wrapped.profile['gs_best_params']
def test_ec_init(clf, name):
    """Check the state of a freshly constructed ExtendedClassifier.

    ``clf`` is the estimator to wrap and ``name`` the expected value of
    ``last_step_name`` (both are presumably supplied by a parametrization
    that is not visible here -- confirm).
    """
    wrapped = ExtendedClassifier(clf)

    # The wrapper holds an estimator of the same class as the one passed in.
    assert isinstance(wrapped.clf, clf.__class__)

    # Nothing has been profiled or tuned yet.
    assert wrapped.profile == {}
    assert wrapped.init_params == clf.get_params()
    assert wrapped.best_params is None

    # The initial strategy is 'init'.
    assert wrapped._params_strategy == 'init'
    assert wrapped.last_step_name == name
def cross_validate_voting(X_train, y_train, pipes, grids, kfolds):
    """Cross-validate a hard-voting ensemble of the three pipelines.

    Builds a ``VotingClassifier`` from ``pipes['logreg']``,
    ``pipes['forest']`` and ``pipes['svc']`` and passes it to
    ``ExtendedClassifier.cross_validate`` with the 'init' parameter
    strategy, ``kfolds`` as the cross-validation splitter, and fixed log and
    pickle paths.

    ``grids`` is accepted but not used by this function.

    Returns whatever ``ExtendedClassifier.cross_validate`` returns.
    """
    members = [(key, pipes[key]) for key in ('logreg', 'forest', 'svc')]
    ensemble = VotingClassifier(members, voting='hard')
    return ExtendedClassifier.cross_validate(
        ensemble,
        X_train,
        y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/voting',
        serialize_to=r'models/voting.pickle',
    )
def test_ec_dump_profile_to_log(tmpdir):
    """Check that _dump_profile_to_log writes a log named after the profile.

    After setting ``cv_timestamp``, ``cv_scoring`` and ``cv_score`` in the
    profile, dumping to ``tmpdir`` must produce the file
    ``123_abc_0.456.log`` there.
    """
    wrapped = ExtendedClassifier(LogisticRegression())

    profile_entries = (
        ('cv_timestamp', str(123)),
        ('cv_scoring', 'abc'),
        ('cv_score', 0.456),
    )
    for key, value in profile_entries:
        wrapped.profile[key] = value

    wrapped._dump_profile_to_log(tmpdir)

    expected_log = tmpdir.join('123_abc_0.456.log')
    assert expected_log.isfile()
def cross_validate_logreg(X_train, y_train, pipes, grids, kfolds):
    """Cross-validate LogisticRegression pipeline.

    Side effects: stores the pipeline in ``pipes['logreg']`` and the search
    grid in ``grids['logreg']``, so both caller-owned dicts are mutated.

    The pipeline imputes 'Age'/'Fare' (median) and 'Embarked' (mode),
    dummifies the frame and fits a liblinear ``LogisticRegression``.  The
    grid is passed to ``ExtendedClassifier.cross_validate`` together with
    ``cv=3`` for the grid search, ``kfolds`` for the cross-validation
    scoring, and the 'best' parameter strategy.

    Returns whatever ``ExtendedClassifier.cross_validate`` returns.
    """
    pipes['logreg'] = make_pipeline(
        SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                               mode_cols=['Embarked']),
        DataFrameDummifier(),
        LogisticRegression(solver='liblinear'),
    )

    # Only this fine grid is ever used.  An earlier coarse grid
    # (0.01 ... 10) used to be assigned to the same key and overwritten
    # right away, so that dead assignment has been removed.
    grids['logreg'] = {
        'logisticregression__C': [0.6, 0.75, 0.8, 0.85, 0.9],
    }

    logreg = ExtendedClassifier.cross_validate(
        pipes['logreg'],
        X_train,
        y_train,
        grids['logreg'],
        sklearn_gscv_kws={'cv': 3},
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='best',
        logdir_path=r'logs/models/logreg',
        serialize_to=r'models/logreg.pickle',
    )
    return logreg
def cross_validate_svc(X_train, y_train, pipes, grids, kfolds):
    """Cross-validate the linear-kernel SVC pipeline.

    Side effects: stores the pipeline in ``pipes['svc']`` and a ``C``/
    ``gamma`` grid in ``grids['svc']``.  The grid is only recorded; it is
    not passed to ``ExtendedClassifier.cross_validate``, which runs with the
    'init' parameter strategy and ``kfolds`` as the splitter.

    Returns whatever ``ExtendedClassifier.cross_validate`` returns.
    """
    imputer = SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                                     mode_cols=['Embarked'])
    dummifier = DataFrameDummifier()
    classifier = SVC(kernel='linear', C=0.1, probability=False)
    pipes['svc'] = make_pipeline(imputer, dummifier, classifier)

    grids['svc'] = {
        'svc__C': [0.001, 0.01, 0.1, 1, 10],
        'svc__gamma': [0.001, 0.01, 0.1, 1],
    }

    return ExtendedClassifier.cross_validate(
        pipes['svc'],
        X_train,
        y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/svc',
        serialize_to=r'models/svc.pickle',
    )
def _random_search_forest(pipeline, X_train, y_train):
    """Run an exploratory RandomizedSearchCV on ``pipeline`` and print it."""
    random_grid = {
        'randomforestclassifier__n_estimators':
            [int(x) for x in np.linspace(start=10, stop=500, num=10)],
        'randomforestclassifier__max_features': ['auto', 'sqrt'],
        # None is included as a candidate alongside the explicit depths.
        'randomforestclassifier__max_depth':
            [int(x) for x in np.linspace(10, 110, num=11)] + [None],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
        'randomforestclassifier__min_samples_leaf': [1, 2, 4],
        'randomforestclassifier__bootstrap': [True, False],
    }
    pprint.pprint(random_grid)

    randsearch = RandomizedSearchCV(pipeline, random_grid, n_iter=50, cv=3,
                                    verbose=0, random_state=42)

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    randsearch.fit(X_train, y_train)
    elapsed = time.perf_counter() - start

    print('randsearch.fit execution time:', elapsed)
    pprint.pprint(randsearch.best_params_)


def cross_validate_forest(X_train, y_train, pipes, grids, kfolds,
                          random_search=False):
    """Cross-validate RandomForestClassifier pipeline.

    Side effect: stores the pipeline in ``pipes['forest']``.  ``grids`` is
    accepted but not used by this function.

    When ``random_search`` is true, a ``RandomizedSearchCV`` is fitted on
    the pipeline first and its grid, fit time and best parameters are
    printed.  Those results are informational only: the final
    cross-validation always runs with the 'init' parameter strategy, i.e.
    the hyper-parameters hard-coded below.

    Returns whatever ``ExtendedClassifier.cross_validate`` returns.
    """
    pipes['forest'] = make_pipeline(
        CategoricalToString(),
        SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                               mode_cols=['Embarked']),
        OrdinalEncoder(cols=['Title', 'Deck', 'Embarked'],
                       handle_unknown='impute'),
        RandomForestClassifier(
            bootstrap=True,
            max_depth=70,
            max_features='auto',
            min_samples_leaf=4,
            min_samples_split=10,
            n_estimators=64,
            random_state=RANDOM_SEED,
        ),
    )

    if random_search:
        _random_search_forest(pipes['forest'], X_train, y_train)

    forest = ExtendedClassifier.cross_validate(
        pipes['forest'],
        X_train,
        y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/forest',
        serialize_to=r'models/forest.pickle',
    )
    return forest
from pprint import pprint

from titanic.modelling import ExtendedClassifier

# Remove the row/column display limits and set the default figure size.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# NOTE(review): pandas 1.0 deprecated -1 here in favour of None -- confirm
# against the pandas version this notebook is pinned to before changing it.
pd.options.display.max_colwidth = -1
plt.rcParams['figure.figsize'] = [15, 4.5]


# ## Deserialize models

# In[3]:


# Load each previously serialized model, keyed by estimator name.
models = {
    'LogisticRegression': ExtendedClassifier.deserialize(
        r'../models/logreg.pickle'),
    'RandomForestClassifier': ExtendedClassifier.deserialize(
        r'../models/forest.pickle'),
    'SVC': ExtendedClassifier.deserialize(r'../models/svc.pickle'),
    'VotingClassifier': ExtendedClassifier.deserialize(
        r'../models/voting.pickle'),
}
models


# ## Compare models

# In[4]:


# Cross-validation score recorded in each model's profile.
scores = {
    label: loaded.profile['cv_score'] for label, loaded in models.items()
}
scores


# In[5]: