示例#1
0
def get_roc_auc(labels, score, verbose=True):
    """Return the area under the ROC curve.

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier (e.g. the positive-class
        column of clf.predict_proba(...) in sklearn)
    verbose : boolean
        iff True, prints area under the curve

    Returns
    -------
    float
        area under the curve

    """
    # Both inputs go through utils.check_col before being scored.
    labels = utils.check_col(labels, argument_name='labels')
    score = utils.check_col(score, argument_name='score')
    auc_score = roc_auc_score(labels, score)
    if verbose:
        # Parenthesized single-argument form prints the same text on
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print('ROC AUC: {}'.format(auc_score))
    return auc_score
示例#2
0
def get_roc_auc(labels, score, verbose=True):
    """Return the area under the ROC curve.

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier (e.g. the positive-class
        column of clf.predict_proba(...) in sklearn)
    verbose : boolean
        iff True, prints area under the curve

    Returns
    -------
    float
        area under the curve

    """
    # Both inputs go through utils.check_col before being scored.
    labels = utils.check_col(labels, argument_name='labels')
    score = utils.check_col(score, argument_name='score')
    auc_score = roc_auc_score(labels, score)
    if verbose:
        # Parenthesized single-argument form prints the same text on
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print('ROC AUC: {}'.format(auc_score))
    return auc_score
示例#3
0
def plot_prec_recall(labels, score, title='Prec/Recall', verbose=True):
    """Plot precision and recall against the share of population scored.

    The x axis is the fraction of records whose score is at or above each
    threshold; precision is drawn in blue on the left axis and recall in
    red on a twin right axis.

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier
    title : str
        title of plot
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    y_true = utils.check_col(labels, argument_name='labels')
    y_score = utils.check_col(score, argument_name='score')
    # adapted from Rayid's prec/recall code
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    # Drop the trailing point of each curve so both line up one-to-one
    # with the thresholds used for the x axis below.
    precision = precision[:-1]
    recall = recall[:-1]

    total_scored = float(len(y_score))
    # Fraction of all scores that are >= each threshold.
    pct_above = np.array(
        [len(y_score[y_score >= cutoff]) / total_scored
         for cutoff in thresholds])

    fig = plt.figure()
    precision_ax = plt.gca()
    precision_ax.plot(pct_above, precision, 'b')
    precision_ax.set_xlabel('percent of population')
    precision_ax.set_ylabel('precision', color='b')

    # Second y axis sharing the same x axis, for recall.
    recall_ax = precision_ax.twinx()
    recall_ax.plot(pct_above, recall, 'r')
    recall_ax.set_ylabel('recall', color='r')

    plt.title(title)
    if verbose:
        fig.show()
    return fig
示例#4
0
def plot_prec_recall(labels, score, title='Prec/Recall', verbose=True):
    """Plot precision and recall against the share of population scored.

    The x axis is the fraction of records whose score is at or above each
    threshold; precision is drawn in blue on the left axis and recall in
    red on a twin right axis.

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier
    title : str
        title of plot
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    y_true = utils.check_col(labels, argument_name='labels')
    y_score = utils.check_col(score, argument_name='score')
    # adapted from Rayid's prec/recall code
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    # Drop the trailing point of each curve so both line up one-to-one
    # with the thresholds used for the x axis below.
    precision = precision[:-1]
    recall = recall[:-1]

    total_scored = float(len(y_score))
    # Fraction of all scores that are >= each threshold.
    pct_above = np.array(
        [len(y_score[y_score >= cutoff]) / total_scored
         for cutoff in thresholds])

    fig = plt.figure()
    precision_ax = plt.gca()
    precision_ax.plot(pct_above, precision, 'b')
    precision_ax.set_xlabel('percent of population')
    precision_ax.set_ylabel('precision', color='b')

    # Second y axis sharing the same x axis, for recall.
    recall_ax = precision_ax.twinx()
    recall_ax.plot(pct_above, recall, 'r')
    recall_ax.set_ylabel('recall', color='r')

    plt.title(title)
    if verbose:
        fig.show()
    return fig
示例#5
0
    def test_check_col(self):
        """Exercise utils.check_col on accepted and rejected inputs."""
        flat = np.array([1, 2, 3, 4])
        # Inputs that must be accepted: 1-d array, single-column 2-d array,
        # plain list, and pandas Series.
        accepted = (
            flat,
            np.array([[1.0], [2], [3], [4]]),
            [3.0, 2.0, 1.8],
            pd.Series(flat),
        )
        for candidate in accepted:
            self.assertTrue(utils.is_nd(utils.check_col(candidate)))

        # Inputs that must raise ValueError: None, a string, and a
        # two-column 2-d array.
        rejected = (None, "lalala", np.array([[1, 2], [3, 4]]))
        for candidate in rejected:
            self.assertRaises(ValueError, utils.check_col, candidate)

        # n_rows equal to the length passes; a different n_rows raises.
        utils.check_col(flat, n_rows=4)
        self.assertRaises(ValueError, utils.check_col, flat, n_rows=5)
示例#6
0
def normalize(col, mean=None, stddev=None, return_fit=False):
    """Generate a normalized column.

    Subtracts the mean and divides by the standard deviation.

    Parameters
    ----------
    col : np.ndarray
    mean : float or None
        Mean to use for fit. If None, the mean of col is used
    stddev : float or None
        Standard deviation to use for fit. If None, the standard deviation
        of col (np.std) is used. A standard deviation of 0 is treated as a
        scale of 1, so a constant column is centered but not rescaled
    return_fit : boolean
        If True, returns tuple of fitted col, mean, and standard dev of fit.
        If False, only returns fitted col

    Returns
    -------
    np.ndarray or (np.ndarray, float, float)
        When return_fit is True the returned standard deviation is the
        value that was passed in or computed, even if it is 0.

    """
    # see infonavit for applying to different set than we fit on
    # https://github.com/dssg/infonavit-public/blob/master/pipeline_src/preprocessing.py#L99
    # Logic is from sklearn StandardScaler, but I didn't use sklearn because
    # I want to pass in mean and stddev rather than a fitted StandardScaler
    # https://github.com/scikit-learn/scikit-learn/blob/a95203b/sklearn/preprocessing/data.py#L276
    col = utils.check_col(col)

    if mean is None:
        mean = np.mean(col)
    if stddev is None:
        stddev = np.std(col)
    # Dividing by a zero standard deviation would yield NaN/inf; fall back
    # to a scale of 1 so zero-variance data is only centered.
    scale = 1.0 if stddev == 0 else stddev
    res = (col - mean) / scale
    if return_fit:
        return (res, mean, stddev)
    return res
示例#7
0
def normalize(col, mean=None, stddev=None, return_fit=False):
    """Generate a normalized column.

    Subtracts the mean and divides by the standard deviation.

    Parameters
    ----------
    col : np.ndarray
    mean : float or None
        Mean to use for fit. If None, the mean of col is used
    stddev : float or None
        Standard deviation to use for fit. If None, the standard deviation
        of col (np.std) is used. A standard deviation of 0 is treated as a
        scale of 1, so a constant column is centered but not rescaled
    return_fit : boolean
        If True, returns tuple of fitted col, mean, and standard dev of fit.
        If False, only returns fitted col

    Returns
    -------
    np.ndarray or (np.ndarray, float, float)
        When return_fit is True the returned standard deviation is the
        value that was passed in or computed, even if it is 0.

    """
    # see infonavit for applying to different set than we fit on
    # https://github.com/dssg/infonavit-public/blob/master/pipeline_src/preprocessing.py#L99
    # Logic is from sklearn StandardScaler, but I didn't use sklearn because
    # I want to pass in mean and stddev rather than a fitted StandardScaler
    # https://github.com/scikit-learn/scikit-learn/blob/a95203b/sklearn/preprocessing/data.py#L276
    col = utils.check_col(col)

    if mean is None:
        mean = np.mean(col)
    if stddev is None:
        stddev = np.std(col)
    # Dividing by a zero standard deviation would yield NaN/inf; fall back
    # to a scale of 1 so zero-variance data is only centered.
    scale = 1.0 if stddev == 0 else stddev
    res = (col - mean) / scale
    if return_fit:
        return (res, mean, stddev)
    return res
示例#8
0
def plot_on_timeline(col, verbose=True):
    """Plot the entries of a column as points along a time axis.

    Every point is drawn at y = 0, so only its position in time varies.

    Parameters
    ----------
    col : np.array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    # http://stackoverflow.com/questions/1574088/plotting-time-in-python-with-matplotlib
    if is_nd(col):
        col = col.astype(datetime)
    date_numbers = matplotlib.dates.date2num(col)
    # One zero per date keeps all markers on a single horizontal line.
    baseline = [0] * len(date_numbers)
    fig = plt.figure()
    plt.plot_date(date_numbers, baseline)
    if verbose:
        plt.show()
    return fig
示例#9
0
def plot_box_plot(col, title=None, verbose=True):
    """Draw a box plot of a single feature column.

    Parameters
    ----------
    col : np.array
    title : str or None
        title of the plot; no title is set when this is None or empty
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    values = utils.check_col(col)

    fig = plt.figure()
    boxplot(values)
    if title:
        plt.title(title)
    # TODO: add col_name to the graph
    if verbose:
        plt.show()
    return fig
示例#10
0
def plot_box_plot(col, title=None, verbose=True):
    """Draw a box plot of a single feature column.

    Parameters
    ----------
    col : np.array
    title : str or None
        title of the plot; no title is set when this is None or empty
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    values = utils.check_col(col)

    fig = plt.figure()
    boxplot(values)
    if title:
        plt.title(title)
    # TODO: add col_name to the graph
    if verbose:
        plt.show()
    return fig
示例#11
0
def plot_simple_histogram(col, verbose=True):
    """Makes a histogram of values in a column

    String-like columns (object, byte-string or unicode dtype) get one bar
    per distinct value, labelled with that value; any other column is
    binned into 50 bins with np.histogram.

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    override_xticks = False
    # dtype chars: 'O' object, 'S' byte string, 'U' unicode string. 'U' is
    # included so unicode columns are counted per category instead of being
    # handed to np.histogram, which cannot bin strings.
    if col.dtype.char in ('O', 'S', 'U'):
        counts = Counter(col)
        categories = sorted(counts.keys())
        hist = [counts[cat] for cat in categories]
        # One unit-wide bin per category.
        bins = np.arange(len(categories) + 1)
        override_xticks = True
    else:
        hist, bins = np.histogram(col, bins=50)
    # Bars take 70% of a bin width and are centered on the bin midpoints.
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    f = plt.figure()
    plt.bar(center, hist, align='center', width=width)
    if override_xticks:
        # Label each bar with its category instead of a numeric position.
        plt.xticks(center, categories)
    if verbose:
        plt.show()
    return f
示例#12
0
def plot_on_timeline(col, verbose=True):
    """Plot the entries of a column as points along a time axis.

    Every point is drawn at y = 0, so only its position in time varies.

    Parameters
    ----------
    col : np.array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    # http://stackoverflow.com/questions/1574088/plotting-time-in-python-with-matplotlib
    if is_nd(col):
        col = col.astype(datetime)
    date_numbers = matplotlib.dates.date2num(col)
    # One zero per date keeps all markers on a single horizontal line.
    baseline = [0] * len(date_numbers)
    fig = plt.figure()
    plt.plot_date(date_numbers, baseline)
    if verbose:
        plt.show()
    return fig
示例#13
0
def plot_simple_histogram(col, verbose=True):
    """Makes a histogram of values in a column

    String-like columns (object, byte-string or unicode dtype) get one bar
    per distinct value, labelled with that value; any other column is
    binned into 50 bins with np.histogram.

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    override_xticks = False
    # dtype chars: 'O' object, 'S' byte string, 'U' unicode string. 'U' is
    # included so unicode columns are counted per category instead of being
    # handed to np.histogram, which cannot bin strings.
    if col.dtype.char in ('O', 'S', 'U'):
        counts = Counter(col)
        categories = sorted(counts.keys())
        hist = [counts[cat] for cat in categories]
        # One unit-wide bin per category.
        bins = np.arange(len(categories) + 1)
        override_xticks = True
    else:
        hist, bins = np.histogram(col, bins=50)
    # Bars take 70% of a bin width and are centered on the bin midpoints.
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    f = plt.figure()
    plt.bar(center, hist, align='center', width=width)
    if override_xticks:
        # Label each bar with its category instead of a numeric position.
        plt.xticks(center, categories)
    if verbose:
        plt.show()
    return f
示例#14
0
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab with pprint_sa

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)
    crosstab_rows = []
    for col1_val in col1_unique:
        # Positions where col1 takes this value, and the col2 values there.
        loc_col1_val = np.where(col1 == col1_val)[0]
        cnt = Counter(col2[loc_col1_val])
        # Counter yields 0 for values it never saw, so no membership test
        # is needed (dict.has_key does not exist on Python 3).
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)
    col_names = ['col1_value'] + [
        '{}'.format(col2_val) for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#15
0
def distance_from_point(lat_origin, lng_origin, lat_col, lng_col):
    """Compute how far each record lies from a fixed origin point.

    Both coordinate columns are validated with utils.check_col and the
    result of the `distance` helper is returned unchanged.

    Parameters
    ----------
    lat_origin : number
    lng_origin : number
    lat_col : np.ndarray
    lng_col : np.ndarray

    Returns
    -------
    np.ndarray

    """
    latitudes = utils.check_col(lat_col, argument_name='lat_col')
    longitudes = utils.check_col(lng_col, argument_name='lng_col')
    return distance(lat_origin, lng_origin, latitudes, longitudes)
示例#16
0
def distance_from_point(lat_origin, lng_origin, lat_col, lng_col):
    """Compute how far each record lies from a fixed origin point.

    Both coordinate columns are validated with utils.check_col and the
    result of the `distance` helper is returned unchanged.

    Parameters
    ----------
    lat_origin : number
    lng_origin : number
    lat_col : np.ndarray
    lng_col : np.ndarray

    Returns
    -------
    np.ndarray

    """
    latitudes = utils.check_col(lat_col, argument_name='lat_col')
    longitudes = utils.check_col(lng_col, argument_name='lng_col')
    return distance(lat_origin, lng_origin, latitudes, longitudes)
示例#17
0
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab with pprint_sa

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)
    crosstab_rows = []
    for col1_val in col1_unique:
        # Positions where col1 takes this value, and the col2 values there.
        loc_col1_val = np.where(col1 == col1_val)[0]
        cnt = Counter(col2[loc_col1_val])
        # Counter yields 0 for values it never saw, so no membership test
        # is needed (dict.has_key does not exist on Python 3).
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)
    col_names = ['col1_value'] + [
        '{}'.format(col2_val) for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#18
0
 def __init__(self,
              M,
              labels,
              clfs=[{
                  'clf': RandomForestClassifier
              }],
              subsets=[{
                  'subset': s_i.SubsetNoSubset
              }],
              cvs=[{
                  'cv': KFold
              }],
              trials=None):
     """Store the data and the clf/subset/cv configurations.

     Parameters
     ----------
     M : np.ndarray, structured array, convertible object, or None
         Feature matrix. A plain (non-structured) ndarray must be
         2-dimensional and gets column names 'f0', 'f1', ...; anything
         else is passed through utils.check_consistent and then cast
         with utils.cast_np_sa_to_nd. If None, only col_names (None) is
         set and M/labels are not assigned by this method
     labels : column-like
         Labels; checked against the number of rows of M
     clfs : list of dict
         Each dict needs a 'clf' key holding a BaseEstimator subclass
     subsets : list of dict
         Each dict needs a 'subset' key holding a s_i.BaseSubsetIter
         subclass
     cvs : list of dict
         Each dict needs a 'cv' key holding a _PartitionIterator subclass
     trials : object or None
         Stored as-is. clfs, subsets and cvs are validated with
         utils.check_arguments only when this is None

     Raises
     ------
     ValueError
         If M is a plain ndarray that is not 2-dimensional

     """
     # NOTE: the list defaults are shared between calls; this method does
     # not mutate them.
     if M is not None:
         if utils.is_nd(M) and not utils.is_sa(M):
             # nd_array, short circuit the usual type checking and coercion
             if M.ndim != 2:
                 raise ValueError('Expected 2-dimensional array for M')
             self.M = M
             # range (not the Python 2-only xrange) so this also runs on
             # Python 3.
             self.col_names = ['f{}'.format(i) for i in range(M.shape[1])]
             self.labels = utils.check_col(labels,
                                           n_rows=M.shape[0],
                                           argument_name='labels')
         else:
             # M is either a structured array or something that should
             # be converted
             (M, self.labels) = utils.check_consistent(
                 M, labels, col_argument_name='labels')
             self.col_names = M.dtype.names
             self.M = utils.cast_np_sa_to_nd(M)
     else:
         self.col_names = None
     if trials is None:
         clfs = utils.check_arguments(
             clfs, {'clf': lambda clf: issubclass(clf, BaseEstimator)},
             optional_keys_take_lists=True,
             argument_name='clfs')
         subsets = utils.check_arguments(
             subsets,
             {'subset': lambda subset: issubclass(subset,
                                                  s_i.BaseSubsetIter)},
             optional_keys_take_lists=True,
             argument_name='subsets')
         cvs = utils.check_arguments(
             cvs, {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
             optional_keys_take_lists=True,
             argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
示例#19
0
 def __init__(
         self, 
         M, 
         labels, 
         clfs=[{'clf': RandomForestClassifier}], 
         subsets=[{'subset': s_i.SubsetNoSubset}], 
         cvs=[{'cv': KFold}],
         trials=None):
     """Store the data and the clf/subset/cv configurations.

     Parameters
     ----------
     M : np.ndarray, structured array, convertible object, or None
         Feature matrix. A plain (non-structured) ndarray must be
         2-dimensional and gets column names 'f0', 'f1', ...; anything
         else is passed through utils.check_consistent and then cast
         with utils.cast_np_sa_to_nd. If None, only col_names (None) is
         set and M/labels are not assigned by this method
     labels : column-like
         Labels; checked against the number of rows of M
     clfs : list of dict
         Each dict needs a 'clf' key holding a BaseEstimator subclass
     subsets : list of dict
         Each dict needs a 'subset' key holding a s_i.BaseSubsetIter
         subclass
     cvs : list of dict
         Each dict needs a 'cv' key holding a _PartitionIterator subclass
     trials : object or None
         Stored as-is. clfs, subsets and cvs are validated with
         utils.check_arguments only when this is None

     Raises
     ------
     ValueError
         If M is a plain ndarray that is not 2-dimensional

     """
     # NOTE: the list defaults are shared between calls; this method does
     # not mutate them.
     if M is not None:
         if utils.is_nd(M) and not utils.is_sa(M):
             # nd_array, short circuit the usual type checking and coercion
             if M.ndim != 2:
                 raise ValueError('Expected 2-dimensional array for M')
             self.M = M
             # range (not the Python 2-only xrange) so this also runs on
             # Python 3.
             self.col_names = ['f{}'.format(i) for i in range(M.shape[1])]
             self.labels = utils.check_col(
                     labels, 
                     n_rows=M.shape[0], 
                     argument_name='labels')
         else:    
             # M is either a structured array or something that should
             # be converted
             (M, self.labels) = utils.check_consistent(
                     M, 
                     labels, 
                     col_argument_name='labels')
             self.col_names = M.dtype.names
             self.M = utils.cast_np_sa_to_nd(M)
     else:
         self.col_names = None
     if trials is None:
         clfs = utils.check_arguments(
                 clfs, 
                 {'clf': lambda clf: issubclass(clf, BaseEstimator)},
                 optional_keys_take_lists=True,
                 argument_name='clfs')
         subsets = utils.check_arguments(
                 subsets,
                 {'subset': lambda subset: issubclass(subset, s_i.BaseSubsetIter)},
                 optional_keys_take_lists=True,
                 argument_name='subsets')
         cvs = utils.check_arguments(
                 cvs,
                 {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
                 optional_keys_take_lists=True,
                 argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
示例#20
0
def plot_kernel_density(col, verbose=True):
    """Plot a kernel density estimate of a column over its histogram.

    The bandwidth is picked by a cross-validated grid search over 30
    values between 0.1 and 1.0.

    From:
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # address pass entire matrix
    # TODO respect missing_val
    # TODO what does n do?
    col = utils.check_col(col)
    lowest, highest = min(col), max(col)
    # 1000 evenly spaced evaluation points spanning the observed range.
    x_grid = np.linspace(lowest, highest, 1000)

    bandwidth_grid = {'bandwidth': np.linspace(0.1, 1.0, 30)}
    # 20-fold cross-validation
    search = GridSearchCV(KernelDensity(), bandwidth_grid, cv=20)
    # The estimator is fitted on a single-column 2-d view of the data.
    search.fit(col[:, None])

    kde = search.best_estimator_
    # score_samples output is exponentiated to get density values.
    pdf = np.exp(kde.score_samples(x_grid[:, None]))

    fig, ax = plt.subplots()
    ax.plot(
        x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
    # NOTE(review): `normed` was replaced by `density` in newer matplotlib
    # releases; confirm which version this project targets.
    ax.hist(col, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
    ax.legend(loc='upper left')
    ax.set_xlim(lowest, highest)
    if verbose:
        plt.show()
    return fig
示例#21
0
def plot_kernel_density(col, verbose=True):
    """Plot a kernel density estimate of a column over its histogram.

    The bandwidth is picked by a cross-validated grid search over 30
    values between 0.1 and 1.0.

    From:
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # address pass entire matrix
    # TODO respect missing_val
    # TODO what does n do?
    col = utils.check_col(col)
    lowest, highest = min(col), max(col)
    # 1000 evenly spaced evaluation points spanning the observed range.
    x_grid = np.linspace(lowest, highest, 1000)

    bandwidth_grid = {'bandwidth': np.linspace(0.1, 1.0, 30)}
    # 20-fold cross-validation
    search = GridSearchCV(KernelDensity(), bandwidth_grid, cv=20)
    # The estimator is fitted on a single-column 2-d view of the data.
    search.fit(col[:, None])

    kde = search.best_estimator_
    # score_samples output is exponentiated to get density values.
    pdf = np.exp(kde.score_samples(x_grid[:, None]))

    fig, ax = plt.subplots()
    ax.plot(
        x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
    # NOTE(review): `normed` was replaced by `density` in newer matplotlib
    # releases; confirm which version this project targets.
    ax.hist(col, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
    ax.legend(loc='upper left')
    ax.set_xlim(lowest, highest)
    if verbose:
        plt.show()
    return fig
示例#22
0
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the
    column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary with pprint_sa

    Returns
    -------
    np.ndarray
        structured array with columns 'col_name' (the value) and 'count',
        ordered by value
    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # items() exists on both Python 2 and 3 (iteritems is Python 2 only);
    # sorting on item[0] orders the rows by value, not by count.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
示例#23
0
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the
    column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary with pprint_sa

    Returns
    -------
    np.ndarray
        structured array with columns 'col_name' (the value) and 'count',
        ordered by value
    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # items() exists on both Python 2 and 3 (iteritems is Python 2 only);
    # sorting on item[0] orders the rows by value, not by count.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
示例#24
0
def generate_bin(col, num_bins):
    """Generates a column of categories, where each category is a bin.

    Each value is mapped to int((x - min) / (max - min) * num_bins), so
    the minimum lands in bin 0 and the maximum lands in bin num_bins.

    Parameters
    ----------
    col : np.ndarray
    num_bins : int
        Number of equal-width intervals the range of col is divided into

    Returns
    -------
    list of int
        Bin index for each entry of col. If every entry of col has the
        same value, all entries are assigned to bin 0

    Examples
    --------
    >>> M = np.array([0.1, 3.0, 0.0, 1.2, 2.5, 1.7, 2])
    >>> generate_bin(M, 3)
    [0, 3, 0, 1, 2, 1, 2]

    """
    col = utils.check_col(col)

    minimum = float(min(col))
    maximum = float(max(col))
    distance = maximum - minimum
    if distance == 0:
        # A constant column has zero range; dividing by it would produce
        # NaN and make int() raise, so put everything in the first bin.
        return [0] * len(col)
    return [int((x - minimum) / distance * num_bins) for x in col]
示例#25
0
def plot_simple_histogram(col, verbose=True):
    """Draw a 50-bin histogram of the values in a column.

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    values = utils.check_col(col)
    heights, edges = np.histogram(values, bins=50)
    # Bars sit on the midpoint of each bin and take 70% of a bin width.
    bar_width = 0.7 * (edges[1] - edges[0])
    midpoints = (edges[:-1] + edges[1:]) / 2
    figure = plt.figure()
    plt.bar(midpoints, heights, align='center', width=bar_width)
    if verbose:
        plt.show()
    return figure
示例#26
0
def generate_bin(col, num_bins):
    """Generates a column of categories, where each category is a bin.

    Each value is mapped to int((x - min) / (max - min) * num_bins), so
    the minimum lands in bin 0 and the maximum lands in bin num_bins.

    Parameters
    ----------
    col : np.ndarray
    num_bins : int
        Number of equal-width intervals the range of col is divided into

    Returns
    -------
    list of int
        Bin index for each entry of col. If every entry of col has the
        same value, all entries are assigned to bin 0

    Examples
    --------
    >>> M = np.array([0.1, 3.0, 0.0, 1.2, 2.5, 1.7, 2])
    >>> generate_bin(M, 3)
    [0, 3, 0, 1, 2, 1, 2]

    """
    col = utils.check_col(col)

    minimum = float(min(col))
    maximum = float(max(col))
    distance = maximum - minimum
    if distance == 0:
        # A constant column has zero range; dividing by it would produce
        # NaN and make int() raise, so put everything in the first bin.
        return [0] * len(col)
    return [int((x - minimum) / distance * num_bins) for x in col]