def model(timestamps, predictors, classes, classifier=None,
          prediction_attribute='predict_proba', hyperparams=None,
          roc_bounds=None, verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It must support boolean-mask indexing.
    classifier : sklearn classifier class (not an instance); it is called
                 as classifier(**hyperparams) once per held-out year. The
                 resulting object should have the attributes "fit" and
                 "predict_proba" at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method.
    prediction_attribute: Name of the attribute of the classifier that
                          returns the probability of belonging to the
                          positive class. Column 1 of its output is used.
    roc_bounds : [min, max] values to use when computing partial AUC.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List of ROC AUC values (floats), one per evaluated year, in
               increasing year order.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.

    Notes
    -----
    Years inside the [first year, min(last year, 2014)] range that have no
    rows at all are skipped: there is nothing to evaluate on, so no
    classifier or AUC is recorded for them.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    years = timestamps.map(lambda x: x.year)
    start = years.min()
    stop = min(years.max(), 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Shrink the plotting area horizontally on both figures.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = dict()
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = years != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        # A gap year has no held-out rows; predicting on an empty frame
        # would fail, so skip it instead of aborting the whole run.
        if not test_indices.any():
            continue

        clf = classifier(**hyperparams)
        # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # The 4th element returned by the viz helpers is used as the AUC.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax,
                          bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the second-to-last line on each axes is assumed to
        # be the curve just drawn by viz -- confirm against viz.roc /
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax
hyperparams=hyperparams, roc_bounds=partial_auc_bounds, verbose=args.verbose) # Plotting ## ROC ### Make ROC yearly lines more transparent c = roc_ax.get_lines() for line in c: line.set_alpha(.7) ### Plot ROC curve for EPA model fpr, tpr, threshes, auc_roc = viz.roc( epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235, ax=roc_ax, block_show=False, bounds=partial_auc_bounds, mark_threshes=[235.0]) ### Format the EPA line auc_roc = float(auc_roc) epa_line = roc_ax.get_lines()[-3] epa_line.set_color([0, 0, 0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.5f}'.format(auc_roc)) roc_ax.get_lines()[-2].set_label('EPA Model @ 235.0') ### Prettify the axis roc_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
def model(timestamps, predictors, classes, classifier=None, hyperparams=None,
          verbose=False):
    '''
    Creates several GBMs using leave-one-year-out cross validation.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It must support boolean-mask indexing.
    classifier : sklearn classifier class (not an instance); it is called
                 as classifier(**hyperparams) once per held-out year. The
                 resulting object should have the attributes "fit" and
                 "predict_proba" at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs   : Dictionary of (year, classifier) pairs, where the classifier
             is the model found by leaving the specified year out of the
             training set.
    roc_ax : the matplotlib axes object containing all of the ROC curves.
    pr_ax  : the matplotlib axes object containing all of the PR curves.

    Notes
    -----
    Years inside the [first year, min(last year, 2014)] range that have no
    held-out rows are skipped: there is nothing to evaluate on, so no
    classifier is recorded for them.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    years = timestamps.map(lambda x: x.year)
    start = years.min()
    stop = min(years.max(), 2014)  # do not include 2015

    roc_ax = plt.subplots(1)[1]
    pr_ax = plt.subplots(1)[1]

    clfs = dict()

    for yr in range(start, stop + 1):
        # Train on every row strictly before or after the held-out year;
        # everything else is the test set.
        train_indices = np.array((years < yr) | (years > yr))
        test_indices = ~train_indices

        # A gap year has no held-out rows; predicting on an empty frame
        # would fail, so skip it instead of aborting the whole run.
        if not test_indices.any():
            continue

        clf = classifier(**hyperparams)
        # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        # Column 1 is taken as the positive-class probability.
        predictions = clf.predict_proba(
            predictors.loc[test_indices, :])[:, 1]

        # The 4th element returned by the viz helpers is used as the AUC.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the second-to-last line on each axes is assumed to
        # be the curve just drawn by viz -- confirm against viz.roc /
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.4f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.4f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, roc_ax, pr_ax
hyperparams = { # Parameters that effect computation 'n_estimators':250, 'max_depth':5, # Misc parameters 'n_jobs':-1, 'verbose':False } clfs, roc_ax, pr_ax = model(timestamps, predictors, classes, classifier=sklearn.ensemble.RandomForestClassifier, hyperparams=hyperparams) # Add the EPA model to the ROC and PR curves, prettify c = roc_ax.get_lines() for line in c: line.set_alpha(.75) auc_roc = viz.roc(epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235, ax=roc_ax, block_show=False)[3] auc_roc = float(auc_roc) epa_line = roc_ax.get_lines()[-2] epa_line.set_color([0,0,0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.4f}'.format(auc_roc)) roc_ax.legend(loc=4) roc_ax.grid(True, which='major') c = pr_ax.get_children() for line in c: line.set_alpha(.75) auc_pr = viz.precision_recall(epa_model_df['Drek_Prediction'],
def model(timestamps, predictors, classes, classifier=None,
          prediction_attribute='predict_proba', hyperparams=None,
          roc_bounds=None, verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It must support boolean-mask indexing.
    classifier : sklearn classifier class (not an instance); it is called
                 as classifier(**hyperparams) once per held-out year. The
                 resulting object should have the attributes "fit" and
                 "predict_proba" at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method.
    prediction_attribute: Name of the attribute of the classifier that
                          returns the probability of belonging to the
                          positive class. Column 1 of its output is used.
    roc_bounds : [min, max] values to use when computing partial AUC.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List of ROC AUC values (floats), one per evaluated year, in
               increasing year order.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.

    Notes
    -----
    Years inside the [first year, min(last year, 2014)] range that have no
    rows at all are skipped: there is nothing to evaluate on, so no
    classifier or AUC is recorded for them.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    years = timestamps.map(lambda x: x.year)
    start = years.min()
    stop = min(years.max(), 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Shrink the plotting area horizontally on both figures.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = dict()
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = years != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        # A gap year has no held-out rows; predicting on an empty frame
        # would fail, so skip it instead of aborting the whole run.
        if not test_indices.any():
            continue

        clf = classifier(**hyperparams)
        # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # The 4th element returned by the viz helpers is used as the AUC.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax,
                          bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the second-to-last line on each axes is assumed to
        # be the curve just drawn by viz -- confirm against viz.roc /
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax
classifier=sklearn.ensemble.RandomForestClassifier, hyperparams=hyperparams, roc_bounds=partial_auc_bounds, verbose=args.verbose) # Plotting ## ROC ### Make ROC yearly lines more transparent c = roc_ax.get_lines() for line in c: line.set_alpha(.7) ### Plot ROC curve for EPA model fpr, tpr, threshes, auc_roc = viz.roc(epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235, ax=roc_ax, block_show=False, bounds=partial_auc_bounds, mark_threshes=[235.0]) ### Format the EPA line auc_roc = float(auc_roc) epa_line = roc_ax.get_lines()[-3] epa_line.set_color([0,0,0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.5f}'.format(auc_roc)) roc_ax.get_lines()[-2].set_label('EPA Model @ 235.0') ### Prettify the axis roc_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) roc_ax.set_aspect('auto') # we are going to be zooming around, set it to auto