Пример #1
0
def model(timestamps,
          predictors,
          classes,
          classifier=None,
          prediction_attribute='predict_proba',
          hyperparams=None,
          roc_bounds=None,
          verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    For every year from the earliest year found in `timestamps` up to
    min(latest year, 2014), a new classifier is instantiated, fit on all
    samples that are NOT in that year, and scored on the samples that are.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps.
                 Each element should have a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It is indexed with a boolean numpy mask, so it must
                 support that kind of indexing.
    classifier : sklearn classifier class (it is called as
                 classifier(**hyperparams) once per year); instances
                 should have the attributes "fit" and "predict_proba"
                 at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method. Defaults to an empty dictionary.
    prediction_attribute:
                 Name of the attribute of the classifier that returns
                 the probability of belonging to the positive class.
                 Column 1 of its output is used as the prediction.
    roc_bounds : [min, max] values to use when computing partial AUC.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List with one ROC AUC value (float) per held-out year, in
               ascending year order, as reported by viz.roc.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Only the year of each timestamp matters for the fold assignment.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = min(timestamps.max(), 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Leave empty space on the right-hand side of both figures.
    # NOTE(review): presumably room for a legend placed outside the axes
    # by the caller -- confirm.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = {}
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = timestamps != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        clf = classifier(**hyperparams)
        # `.loc` with a boolean mask replaces the `.ix` indexer, which was
        # deprecated in pandas 0.20 and removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])

        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # Only the fourth element returned by the viz helpers (the AUC)
        # is used here.
        auc_roc = viz.roc(predictions,
                          classes[test_indices],
                          block_show=False,
                          ax=roc_ax,
                          bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions,
                                      classes[test_indices],
                                      block_show=False,
                                      ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the curve just drawn is assumed to be the
        # second-to-last line on each axes (the viz helpers presumably add
        # one more line after it) -- confirm against viz.roc and
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)

            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax
Пример #2
0
        hyperparams=hyperparams,
        roc_bounds=partial_auc_bounds,
        verbose=args.verbose)

    # Plotting
    ## ROC
    ### Make ROC yearly lines more transparent
    # Every line currently on the ROC axes gets alpha 0.7; the EPA curve is
    # added afterwards and is given its own alpha below.
    c = roc_ax.get_lines()
    for line in c:
        line.set_alpha(.7)

    ### Plot ROC curve for EPA model
    # The positive class is an E. coli reading above 235, and the same 235.0
    # value is passed as the threshold to mark on the curve.
    fpr, tpr, threshes, auc_roc = viz.roc(
        epa_model_df['Drek_Prediction'],
        epa_model_df['Escherichia.coli'] > 235,
        ax=roc_ax,
        block_show=False,
        bounds=partial_auc_bounds,
        mark_threshes=[235.0])
    ### Format the EPA line
    auc_roc = float(auc_roc)
    # NOTE(review): assumes viz.roc left the EPA curve third from the end of
    # the axes' lines and the marker for the 235.0 threshold second from the
    # end -- confirm against viz.roc.
    epa_line = roc_ax.get_lines()[-3]
    epa_line.set_color([0, 0, 0])
    epa_line.set_ls('--')
    epa_line.set_linewidth(3)
    epa_line.set_alpha(.85)
    epa_line.set_label('EPA Model - AUC: {0:.5f}'.format(auc_roc))
    roc_ax.get_lines()[-2].set_label('EPA Model @ 235.0')

    ### Prettify the axis
    # loc=2 is 'upper left': the legend's upper-left corner is anchored at
    # (1.05, 1) in axes coordinates, i.e. just outside the right edge.
    roc_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
def model(timestamps, predictors, classes, classifier=None, hyperparams=None, verbose=False):
    '''
    Creates several GBMs using leave-one-year-out cross validation.

    For every year from the earliest year found in `timestamps` up to
    min(latest year, 2014), a new classifier is instantiated, fit on all
    samples strictly before or after that year, and scored on the rest.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps.
                 Each element should have a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It is indexed with a boolean numpy mask, so it must
                 support that kind of indexing.
    classifier : sklearn classifier class (it is called as
                 classifier(**hyperparams) once per year); instances
                 should have the attributes "fit" and "predict_proba"
                 at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method. Defaults to an empty dictionary.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs   : Dictionary of (year, classifier) pairs, where the classifier
             is the model found by leaving the specified year out of the
             training set.
    roc_ax : the matplotlib axes object containing all of the ROC curves.
    pr_ax  : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Only the year of each timestamp matters for the fold assignment.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = min(timestamps.max(), 2014)  # do not include 2015

    roc_ax = plt.subplots(1)[1]
    pr_ax = plt.subplots(1)[1]

    clfs = {}

    for yr in range(start, stop + 1):
        train_indices = np.array((timestamps < yr) | (timestamps > yr))
        # The held-out set is everything not used for training; compute the
        # complement once instead of on every use.
        test_indices = ~train_indices

        clf = classifier(**hyperparams)
        # `.loc` with a boolean mask replaces the `.ix` indexer, which was
        # deprecated in pandas 0.20 and removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])

        clfs[yr] = clf

        # Column 1 of predict_proba is used as the prediction score.
        predictions = clf.predict_proba(predictors.loc[test_indices, :])[:, 1]

        # Only the fourth element returned by the viz helpers (the AUC)
        # is used here.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the curve just drawn is assumed to be the
        # second-to-last line on each axes (the viz helpers presumably add
        # one more line after it) -- confirm against viz.roc and
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(str(yr) + ' - AUC: {0:.4f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(str(yr) + ' - AUC: {0:.4f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)

            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, roc_ax, pr_ax
    # Keyword arguments forwarded to the RandomForestClassifier constructor
    # by model().
    hyperparams = {
        # Parameters that affect computation
        'n_estimators':250, 'max_depth':5,
        # Misc parameters
        'n_jobs':-1, 'verbose':False
    }
    # Leave-one-year-out cross validation; returns the per-year classifiers
    # and the axes holding the per-year ROC and PR curves.
    clfs, roc_ax, pr_ax = model(timestamps, predictors, classes,
                                classifier=sklearn.ensemble.RandomForestClassifier,
                                hyperparams=hyperparams)

    # Add the EPA model to the ROC and PR curves, prettify
    # Every line currently on the ROC axes gets alpha 0.75; the EPA curve is
    # added afterwards and is given its own alpha below.
    c = roc_ax.get_lines()
    for line in c:
        line.set_alpha(.75)

    # The positive class is an E. coli reading above 235; only the fourth
    # element returned by viz.roc (the AUC) is kept.
    auc_roc = viz.roc(epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235,
                      ax=roc_ax, block_show=False)[3]
    auc_roc = float(auc_roc)
    # NOTE(review): assumes viz.roc left the EPA curve second from the end of
    # the axes' lines -- confirm against viz.roc.
    epa_line = roc_ax.get_lines()[-2]
    epa_line.set_color([0,0,0])
    epa_line.set_ls('--')
    epa_line.set_linewidth(3)
    epa_line.set_alpha(.85)
    epa_line.set_label('EPA Model - AUC: {0:.4f}'.format(auc_roc))
    # loc=4 places the legend in the lower-right corner of the axes.
    roc_ax.legend(loc=4)
    roc_ax.grid(True, which='major')

    # Unlike the ROC axes above, this iterates over ALL children of the PR
    # axes (get_children), not only its lines (get_lines).
    c = pr_ax.get_children()
    for line in c:
        line.set_alpha(.75)

    auc_pr = viz.precision_recall(epa_model_df['Drek_Prediction'],
def model(timestamps, predictors, classes,
          classifier=None,
          prediction_attribute='predict_proba',
          hyperparams=None,
          roc_bounds=None,
          verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    For every year from the earliest year found in `timestamps` up to
    min(latest year, 2014), a new classifier is instantiated, fit on all
    samples that are NOT in that year, and scored on the samples that are.

    ROC and PR curves are plotted as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps.
                 Each element should have a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
                 It is indexed with a boolean numpy mask, so it must
                 support that kind of indexing.
    classifier : sklearn classifier class (it is called as
                 classifier(**hyperparams) once per year); instances
                 should have the attributes "fit" and "predict_proba"
                 at the least. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams: Dictionary of hyper parameters to pass to the
                 classifier method. Defaults to an empty dictionary.
    prediction_attribute:
                 Name of the attribute of the classifier that returns
                 the probability of belonging to the positive class.
                 Column 1 of its output is used as the prediction.
    roc_bounds : [min, max] values to use when computing partial AUC.
    verbose    : True if the clf.feature_importances_ should be printed

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List with one ROC AUC value (float) per held-out year, in
               ascending year order, as reported by viz.roc.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Only the year of each timestamp matters for the fold assignment.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = min(timestamps.max(), 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Leave empty space on the right-hand side of both figures.
    # NOTE(review): presumably room for a legend placed outside the axes
    # by the caller -- confirm.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = {}
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = timestamps != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        clf = classifier(**hyperparams)
        # `.loc` with a boolean mask replaces the `.ix` indexer, which was
        # deprecated in pandas 0.20 and removed in pandas 1.0.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])

        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # Only the fourth element returned by the viz helpers (the AUC)
        # is used here.
        auc_roc = viz.roc(predictions,
                          classes[test_indices],
                          block_show=False,
                          ax=roc_ax,
                          bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions,
                                      classes[test_indices],
                                      block_show=False,
                                      ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # NOTE(review): the curve just drawn is assumed to be the
        # second-to-last line on each axes (the viz helpers presumably add
        # one more line after it) -- confirm against viz.roc and
        # viz.precision_recall.
        roc_ax.get_lines()[-2].set_label(str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)

            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print('  {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax
                                          classifier=sklearn.ensemble.RandomForestClassifier,
                                          hyperparams=hyperparams,
                                          roc_bounds=partial_auc_bounds,
                                          verbose=args.verbose)

    # Plotting
    ## ROC
    ### Make ROC yearly lines more transparent
    # Every line currently on the ROC axes gets alpha 0.7; the EPA curve is
    # added afterwards and is given its own alpha below.
    c = roc_ax.get_lines()
    for line in c:
        line.set_alpha(.7)

    ### Plot ROC curve for EPA model
    # The positive class is an E. coli reading above 235, and the same 235.0
    # value is passed as the threshold to mark on the curve.
    fpr, tpr, threshes, auc_roc = viz.roc(epa_model_df['Drek_Prediction'],
                                          epa_model_df['Escherichia.coli'] > 235,
                                          ax=roc_ax, block_show=False,
                                          bounds=partial_auc_bounds,
                                          mark_threshes=[235.0])
    ### Format the EPA line
    auc_roc = float(auc_roc)
    # NOTE(review): assumes viz.roc left the EPA curve third from the end of
    # the axes' lines and the marker for the 235.0 threshold second from the
    # end -- confirm against viz.roc.
    epa_line = roc_ax.get_lines()[-3]
    epa_line.set_color([0,0,0])
    epa_line.set_ls('--')
    epa_line.set_linewidth(3)
    epa_line.set_alpha(.85)
    epa_line.set_label('EPA Model - AUC: {0:.5f}'.format(auc_roc))
    roc_ax.get_lines()[-2].set_label('EPA Model @ 235.0')

    ### Prettify the axis
    # loc=2 is 'upper left': the legend's upper-left corner is anchored at
    # (1.05, 1) in axes coordinates, i.e. just outside the right edge.
    roc_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    roc_ax.set_aspect('auto')  # we are going to be zooming around, set it to auto