import sys
import warnings
from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             f1_score, roc_auc_score)
from sklearn.model_selection import (RandomizedSearchCV, StratifiedKFold,
                                     cross_val_predict)
from sklearn.pipeline import Pipeline

# read_file, feature_importance, and roc are project-local helpers; they are
# assumed to be defined or imported elsewhere in this module.


def evaluate_model(dataset, save_file, random_state, pipeline_components,
                   pipeline_parameters, n_combos, label):
    features, labels, feature_names = read_file(dataset, label)

    # Create a temporary folder to cache the transformers of the pipeline
    cachedir = mkdtemp()
    memory = Memory(location=cachedir, verbose=0)  # older joblib: cachedir=

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        cv = StratifiedKFold(n_splits=10, shuffle=True,
                             random_state=random_state)

        # Flatten the per-step parameter grids into the 'step__param' keys
        # that RandomizedSearchCV expects
        hyperparameters = {}
        for step, params in pipeline_parameters.items():
            for param, pvals in params.items():
                hyperparameters[step + '__' + param] = pvals

        pipeline = Pipeline(pipeline_components, memory=memory)

        # Run randomized search CV to tune the hyperparameter settings
        est = RandomizedSearchCV(estimator=pipeline,
                                 param_distributions=hyperparameters,
                                 n_iter=n_combos, cv=cv,
                                 random_state=random_state, refit=True,
                                 error_score=0.0)
        est.fit(features, labels)
        best_est = est.best_estimator_

        # Generate cross-validated predictions for each data point using
        # the best estimator
        cv_predictions = cross_val_predict(estimator=best_est, X=features,
                                           y=labels, cv=cv)

        # Get cross-validated probabilities (or decision-function scores)
        # when the estimator supports them
        skip = False
        if getattr(best_est, 'predict_proba', None):
            method = 'predict_proba'
        elif getattr(best_est, 'decision_function', None):
            method = 'decision_function'
        else:
            skip = True

        if not skip:
            cv_probabilities = cross_val_predict(estimator=best_est,
                                                 X=features, y=labels,
                                                 method=method, cv=cv)
            if method == 'predict_proba':
                cv_probabilities = cv_probabilities[:, 1]

        accuracy = accuracy_score(labels, cv_predictions)
        macro_f1 = f1_score(labels, cv_predictions, average='macro')
        balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)

        # ROC AUC is only defined when probability-like scores exist;
        # without this guard, cv_probabilities would be unbound when the
        # estimator supports neither prediction method
        if skip:
            roc_auc = -1
        else:
            try:
                roc_auc = roc_auc_score(labels, cv_probabilities)
            except ValueError as ve:
                print('roc_auc_score: %s' % str(ve))
                roc_auc = -1

        # Every pipeline step except the last is a preprocessor
        preprocessor_classes = [p[0] for p in pipeline_components[:-1]]
        preprocessor_param_string = 'default'
        for preprocessor_class in preprocessor_classes:
            if preprocessor_class in pipeline_parameters:
                preprocessor_param_string = ','.join([
                    '{}={}'.format(
                        parameter,
                        '|'.join([x.strip() for x in str(value).split(',')]))
                    for parameter, value in
                    pipeline_parameters[preprocessor_class].items()
                ])

        classifier_class = pipeline_components[-1][0]
        param_string = ','.join(
            ['{}={}'.format(p, v) for p, v in est.best_params_.items()])

        out_text = '\t'.join([
            dataset.split('/')[-1].split('.')[0],
            ','.join(preprocessor_classes),
            preprocessor_param_string,
            classifier_class,
            param_string,
            str(random_state),
            str(accuracy),
            str(macro_f1),
            str(balanced_accuracy),
            str(roc_auc),
        ])
        print(out_text)
        with open(save_file, 'a') as out:
            out.write(out_text + '\n')
        sys.stdout.flush()

        # Write feature importances
        est_name = classifier_class
        feature_importance(save_file, best_est, est_name, feature_names,
                           features, labels, random_state,
                           ','.join(preprocessor_classes),
                           preprocessor_param_string, classifier_class,
                           param_string)

        # Write ROC curves
        if not skip:
            roc(save_file, best_est, labels, cv_probabilities, random_state,
                ','.join(preprocessor_classes), preprocessor_param_string,
                classifier_class, param_string)

    # Delete the temporary cache before exiting
    rmtree(cachedir)
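# A minimal sketch of how evaluate_model might be invoked. The dataset path,
# output file, and pipeline below are illustrative assumptions, and the
# project-local helpers (read_file, feature_importance, roc) must be
# available for this to run.
if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler

    evaluate_model(
        dataset='data/d_heart.csv',   # hypothetical input path
        save_file='results.tsv',      # hypothetical output file
        random_state=42,
        pipeline_components=[('StandardScaler', StandardScaler()),
                             ('LogisticRegression', LogisticRegression())],
        pipeline_parameters={'LogisticRegression':
                             {'C': [0.01, 0.1, 1.0, 10.0]}},
        n_combos=4,
        label='class')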
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# FlexOneVsRestClassifier, single_balanced_accuracy_score, and this
# balanced_accuracy_score variant (which accepts average= and zero_default=,
# unlike scikit-learn's) are project-local and assumed to be imported
# elsewhere. X_train_clean, y_train, uuid_groups, and the first bounds dict
# come from an earlier, truncated part of the source; the opening of the
# first tune_hyperparams call is reconstructed from the matching call below.
clf.tune_hyperparams(X=X_train_clean,
                     y=y_train,
                     bounds=bounds,
                     metric=single_balanced_accuracy_score,
                     init_points=10,
                     n_iter=15,
                     groups=uuid_groups)

# Baseline: one Gaussian naive Bayes estimator per label, one-vs-rest
nb_clf = GaussianNB()
clf = FlexOneVsRestClassifier(nb_clf, n_estimators=y_train.shape[1])
clf.fit(X_train_clean, y_train)
y_pred = clf.predict(X_test_clean)
print("Balanced accuracy NB: ",
      balanced_accuracy_score(y_test.T, y_pred, average="macro",
                              zero_default=0))

# Logistic regression, tuning the regularization strength C per label
lr_clf = LogisticRegression(solver="lbfgs", tol=1e-3, max_iter=500)
bounds = {"C": (0.0001, 1)}
clf = FlexOneVsRestClassifier(lr_clf, n_estimators=y_train.shape[1])
clf.tune_hyperparams(X=X_train_clean,
                     y=y_train,
                     bounds=bounds,
                     metric=single_balanced_accuracy_score,
                     init_points=6,
                     n_iter=9,
                     groups=uuid_groups)
dump(clf.get_params(), "params_separated_lr.joblib")
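# A minimal sketch of what a per-label metric like
# single_balanced_accuracy_score could compute. The real project-local
# implementation is not shown in this source, so the name (suffixed with
# _sketch to avoid shadowing the real one), signature, and zero_default
# fallback below are assumptions for illustration only.
import numpy as np


def single_balanced_accuracy_score_sketch(y_true, y_pred, zero_default=0):
    """Balanced accuracy (mean per-class recall) for one binary label."""
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    recalls = []
    for cls in (0, 1):
        mask = y_true == cls
        if not mask.any():
            # Degenerate fold with a missing class: fall back to a fixed
            # score so the tuner still receives a finite value (assumed)
            return zero_default
        recalls.append((y_pred[mask] == cls).mean())
    return float(np.mean(recalls))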
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from metrics import balanced_accuracy_score  # project-local metric module

dataset = pd.read_csv('d_heart.csv')
X = StandardScaler().fit_transform(dataset.drop('class', axis=1))
y = dataset['class']
X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.25, shuffle=False)

clf = DecisionTreeClassifier(max_depth=4, criterion='gini').fit(X_t, y_t)
print('train score:', balanced_accuracy_score(y_t, clf.predict(X_t)))
print('test score:', balanced_accuracy_score(y_v, clf.predict(X_v)))
print('feature importances:', clf.feature_importances_)
print('argsort: ', np.argsort(clf.feature_importances_))
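# np.argsort ranks indices from least to most important. As a small
# follow-up sketch (using only names already defined above), the column
# names can be printed most-important-first:
feature_names = dataset.drop('class', axis=1).columns
for i in np.argsort(clf.feature_importances_)[::-1]:
    print(feature_names[i], clf.feature_importances_[i])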
import time

from joblib import dump, load

# clf, start, X_train/X_test, and y_train/y_test come from an earlier,
# truncated part of this script. The opening of the tune_hyperparams call
# is also truncated in the source; only its closing argument survives:
#     ..., int_params=int_params)

dump(clf.get_params(), "params_separated.joblib")
param_dict = load("params_separated.joblib")
print(param_dict)

preprocess = time.time()
print("Preprocess: {}".format(preprocess - start))

clf.fit(X_train, y_train, ignore_nan=True)
fit_time = time.time()
print("Fit time: {}".format(fit_time - preprocess))

y_pred = clf.predict(X_test)
y_pred_bias = clf.predict(X_train)
pred_time = time.time()
print("Prediction time: {}".format(pred_time - fit_time))

print("Balanced accuracy: ",
      balanced_accuracy_score(y_test, y_pred, average="macro",
                              zero_default=0))
print("Balanced accuracy bias:",
      balanced_accuracy_score(y_train, y_pred_bias, average="macro",
                              zero_default=0))
score_time = time.time()
print("Score time: {}".format(score_time - pred_time))