Example #1
import numpy as np
from statsmodels.nonparametric import kde


def kde_confidence_intervals(array, kernel="gau"):
    '''Calculate the lower and upper limits of the array,
    smoothing it first with a Gaussian (default) kernel.

    input:
    array (1d np.array)
        The values to estimate the errors of.
    kernel="gau"
        The kernel to use. Passed on to statsmodels; allowed values are:
        "biw" for biweight
        "cos" for cosine
        "epa" for Epanechnikov
        "gau" for Gaussian
        "tri" for triangular
        "triw" for triweight
        "uni" for uniform

    returns:
    lowerlim, upperlim bounding the central 68.27% of the distribution
    '''
    array = np.asarray(array)
    assert array.ndim == 1
    dens = kde.KDEUnivariate(array)
    # FFT evaluation is only available for the Gaussian kernel.
    dens.fit(kernel=kernel, fft=(kernel == "gau"))
    # dens.icdf is the inverse CDF sampled on a uniform probability grid,
    # so its percentiles approximate the quantiles of the smoothed density.
    low = np.percentile(dens.icdf, 15.865)
    up = np.percentile(dens.icdf, 84.135)
    return low, up
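
A quick usage sketch (my addition, on synthetic data): for a standard normal sample the limits should land near -1 and +1, since the central 68.27% of a Gaussian spans one standard deviation on either side of the mean.

import numpy as np

# Synthetic check: draws from N(0, 1), so the returned limits
# should come out close to -1 and +1.
rng = np.random.default_rng(0)
sample = rng.normal(size=5000)
low, up = kde_confidence_intervals(sample)
print(low, up)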
Example #2
def plot_kde(data, ax, title=None, color='r', fill_bt=True):
    """
    Plot a smoothed (by kernel density estimate) histogram.
    :type data: numpy array
    :param data: An array containing the data to be plotted

    :type ax: matplotlib.Axes
    :param ax: The Axes object to draw to

    :type title: str
    :param title: The plot title

    :type color: str
    :param color: The color of the histogram line and fill. Note that the fill
                  will be plotted with an alpha of 0.35.

    :type fill_bt: bool
    :param fill_bt: Specify whether to fill the area beneath the histogram line
    """
    # np.float was removed in NumPy 1.24; cast with the builtin float instead.
    data = np.asarray(data, dtype=float)
    e = kde.KDEUnivariate(data)
    e.fit()
    ax.plot(e.support, e.density, color=color, alpha=0.9, linewidth=2.25)
    if fill_bt:
        ax.fill_between(e.support,
                        e.density,
                        alpha=.35,
                        zorder=1,
                        antialiased=True,
                        color=color)
    if title is not None:
        t = ax.set_title(title)
        t.set_y(1.05)
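
A minimal usage sketch (assumed setup, not part of the original example):

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical data; any 1D sample works.
rng = np.random.default_rng(0)
fig, ax = plt.subplots()
plot_kde(rng.standard_normal(1000), ax, title='Smoothed histogram', color='b')
plt.show()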
Example #3
def kde_smallest_interval(array, kernel="gau", area=68.27):
    '''Calculate the lower and upper limits of the array such that
    `area` percent of the values (default 68.27) lie within the smallest
    possible spread. For small arrays we need the kde.
    Returns the min and max bounds.'''
    array = np.array(array)
    assert array.ndim == 1, "The array needs to be 1D to estimate the errors!"

    dens = kde.KDEUnivariate(array)
    # FFT evaluation is only available for the Gaussian kernel.
    dens.fit(kernel=kernel, fft=(kernel == "gau"))
    icdf = dens.icdf

    npoints = icdf.shape[0]
    # make sure kde samples dense enough
    assert npoints >= 100

    # find the minimum interval now
    # np.int was removed in NumPy 1.24; the builtin int works here.
    spread = int(np.round(npoints * (area / 100.)))
    optimumidx = np.argmin(icdf[spread:] - icdf[:-spread])
    # return min/max bounds
    return icdf[optimumidx], icdf[optimumidx + spread]
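
A usage sketch with synthetic skewed data (my assumption, not from the source): unlike the symmetric percentile interval of Example #1, the smallest interval hugs the dense region of the distribution.

import numpy as np

rng = np.random.default_rng(0)
skewed = rng.exponential(scale=1.0, size=2000)
lo, hi = kde_smallest_interval(skewed)
print(lo, hi)  # bounds sit near zero, where the exponential density peaks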
Example #4
def binary_jitter(x, jitter_amount=.05):
    '''
    Add jitter to a 0/1 vector of data for plotting.
    '''
    jitters = np.random.rand(*x.shape) * jitter_amount
    x_jittered = x + np.where(x == 1, -1, 1) * jitters
    return x_jittered

# First plot the Switch / No Switch dots vs distance to a safe well. Add jitter.
plt.plot(df['dist'], binary_jitter(df['switch'], .1), '.', alpha=.1)
# Now use the model to plot probability of switching vs distance (the green line).
sorted_dist = np.sort(df['dist'])
argsorted_dist = list(np.argsort(df['dist']))
predicted = model1.predict()[argsorted_dist]
plt.plot(sorted_dist, predicted, lw=2)

kde_sw = kde.KDEUnivariate(df['dist'][df['switch'] == 1])
kde_nosw = kde.KDEUnivariate(df['dist'][df['switch'] == 0])

kde_sw.fit()
kde_nosw.fit()

plt.plot(kde_sw.support, kde_sw.density, label='Switch')
plt.plot(kde_nosw.support, kde_nosw.density, color='red', label='No Switch')
plt.xlabel('Distance (meters)')
plt.legend(loc='best')

model2 = logit('switch ~ I(dist / 100.) + arsenic', data=df).fit()
print(model2.summary())

margeff = model2.get_margeff(at='mean')
print(margeff.summary())
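
The snippet uses a model1 that is fitted elsewhere; a plausible reconstruction (an assumption, chosen to match the distance-only probability curve plotted above) would be:

from statsmodels.formula.api import logit

# Assumed: a distance-only logit, consistent with the
# probability-vs-distance line in the first plot.
model1 = logit('switch ~ I(dist / 100.)', data=df).fit()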
Example #5

#considering two features: Inducted and Total Games
def binary_jitter(x, jitter_amount=.05):
    '''
    Add jitter to a 0/1 vector of data for plotting.
    '''
    jitters = np.random.rand(*x.shape) * jitter_amount
    x_jittered = x + np.where(x == 1, -1, 1) * jitters
    return x_jittered


# First plot the Inducted / Not Inducted dots vs total games played. Add jitter.
plt.plot(df['totalgames'], binary_jitter(df['inducted'], .1), '.', alpha=.1)
# Now use the model to plot probability of induction vs games
sorted_dist = np.sort(df['totalgames'])
argsorted_dist = list(np.argsort(df['totalgames']))
predicted = model1.predict()[argsorted_dist]
plt.plot(sorted_dist, predicted, lw=2)

kde_sw = kde.KDEUnivariate(df['totalgames'][df['inducted'] == 1])
kde_nosw = kde.KDEUnivariate(df['totalgames'][df['inducted'] == 0])

kde_sw.fit()
kde_nosw.fit()

plt.plot(kde_sw.support, kde_sw.density, label='Inducted')
plt.plot(kde_nosw.support, kde_nosw.density, color='red', label='Not Inducted')
plt.xlabel('Games')
plt.legend(loc='best')
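
A quick check of binary_jitter on toy input (my addition): zeros are nudged upward and ones downward, by at most jitter_amount.

import numpy as np

x = np.array([0, 0, 1, 1])
print(binary_jitter(x, .1))  # zeros land in [0, 0.1), ones in (0.9, 1]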
Example #6
def make_rank_df(email_df):
    # (Reconstructed header: the original snippet begins mid-function.
    #  The signature follows the calls below; rank_df as a pandas
    #  DataFrame indexed like email_df is an assumption.)
    n_emails = email_df.shape[0]
    rank_results = []
    rank_df = pd.DataFrame(index=email_df.index)
    for e in range(n_emails):
        weights_rank = rank_email(email_df, e)
        rank_results.append(weights_rank)

    rank_df['rank'] = rank_results

    return rank_df


train_ranks = make_rank_df(train_df)
print(train_ranks)

threshold = train_ranks['rank'].median()
test_ranks = make_rank_df(test_df)

train_kde = kde.KDEUnivariate(train_ranks['rank'])
train_kde.fit()
test_kde = kde.KDEUnivariate(test_ranks['rank'])
test_kde.fit()

plt.figure(figsize=(8, 6))
plt.fill(train_kde.support,
         train_kde.density,
         color='steelblue',
         alpha=.7,
         label='Train')
plt.fill(test_kde.support,
         test_kde.density,
         color='red',
         alpha=.7,
         label='Test')
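
The snippet stops before the figure is finished; a minimal completion (assumed, not from the original) marks the median-rank cutoff computed above and adds the legend:

# `threshold` is the median train rank computed earlier.
plt.axvline(threshold, color='k', linestyle='--', label='Threshold')
plt.xlabel('rank')
plt.legend(loc='best')
plt.show()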