def kde_confidence_intervals(array, kernel="gau"):
    '''Calculate the lower and upper limits of the central 68.27% of *array*,
    smoothing it first with a kernel density estimate.

    Parameters
    ----------
    array : 1d array-like
        The values to estimate the errors of.
    kernel : str, default "gau"
        Kernel passed to statsmodels' KDEUnivariate.fit. Allowed values:
        "biw" (biweight), "cos" (cosine), "epa" (Epanechnikov),
        "gau" (Gaussian), "tri" (triangular), "triw" (triweight),
        "uni" (uniform).

    Returns
    -------
    (lowerlim, upperlim) : tuple of float
        The 15.865th and 84.135th percentiles of the smoothed distribution.
    '''
    array = np.array(array)
    assert len(array.shape) == 1
    dens = kde.KDEUnivariate(array)
    # BUG FIX: `kernel` was previously never forwarded to fit(), so the
    # argument was silently ignored and a Gaussian kernel was always used.
    # statsmodels' FFT code path supports only the Gaussian kernel, so
    # disable it for any other choice.
    dens.fit(kernel=kernel, fft=(kernel == "gau"))
    # dens.icdf samples the inverse CDF on a regular grid, so taking
    # percentiles of it recovers quantiles of the smoothed distribution.
    low = np.percentile(dens.icdf, 15.865)
    up = np.percentile(dens.icdf, 84.135)
    return low, up
def plot_kde(data, ax, title=None, color='r', fill_bt=True):
    """
    Plot a smoothed (by kernel density estimate) histogram.
    :type data: numpy array
    :param data: An array containing the data to be plotted
    :type ax: matplotlib.Axes
    :param ax: The Axes object to draw to
    :type title: str
    :param title: The plot title
    :type color: str
    :param color: The color of the histogram line and fill. Note that the
        fill will be plotted with an alpha of 0.35.
    :type fill_bt: bool
    :param fill_bt: Specify whether to fill the area beneath the
        histogram line
    """
    if isinstance(data, list):
        data = np.asarray(data)
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the documented replacement (same as float64).
    e = kde.KDEUnivariate(data.astype(float))
    e.fit()
    ax.plot(e.support, e.density, color=color, alpha=0.9, linewidth=2.25)
    if fill_bt:
        ax.fill_between(e.support, e.density, alpha=.35, zorder=1,
                        antialiased=True, color=color)
    if title is not None:
        t = ax.set_title(title)
        # Nudge the title upward so it clears the axes frame.
        t.set_y(1.05)
def kde_smallest_interval(array, kernel="gau", area=68.27):
    '''Calculate the lower and upper limits of the array in such a way that
    `area` percent (default=68.27) of the values lie in as small a spread as
    possible. For small arrays we need the kde.

    Parameters
    ----------
    array : 1d array-like
        The values to estimate the interval from.
    kernel : str, default "gau"
        Kernel passed to statsmodels' KDEUnivariate.fit.
    area : float, default 68.27
        Percentage of the distribution the interval must contain.

    Returns
    -------
    (min, max) : tuple of float
        Bounds of the smallest interval containing `area` percent.
    '''
    array = np.array(array)
    assert len(array.shape) == 1, "The array needs to be 1D to \
estimate the errors!"
    dens = kde.KDEUnivariate(array)
    # BUG FIX: `kernel` was previously never forwarded to fit(), so the
    # argument was silently ignored. FFT only supports the Gaussian kernel.
    dens.fit(kernel=kernel, fft=(kernel == "gau"))
    icdf = dens.icdf
    npoints = icdf.shape[0]
    # make sure kde samples dense enough
    assert npoints >= 100
    # find the minimum interval now
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the documented replacement.
    spread = int(np.round(npoints * (area / 100.)))
    # Sliding-window search: for each start index, the interval that covers
    # `spread` grid points of the inverse CDF; pick the narrowest one.
    optimumidx = np.argmin(icdf[spread:] - icdf[:-spread])
    # return min/max bounds
    return icdf[optimumidx], icdf[optimumidx + spread]
# NOTE(review): this chunk originally began with the bare body of
# binary_jitter (its `def` line was lost); the header below is restored
# verbatim from the identical definition elsewhere in this file.
def binary_jitter(x, jitter_amount=.05):
    '''
    Add jitter to a 0/1 vector of data for plotting.
    '''
    jitters = np.random.rand(*x.shape) * jitter_amount
    # Jitter 1s downward and 0s upward so points stay inside [0, 1].
    x_jittered = x + np.where(x == 1, -1, 1) * jitters
    return x_jittered

# First plot the Switch / No Switch dots vs distance to a safe well. Add jitter.
plt.plot(df['dist'], binary_jitter(df['switch'], .1), '.', alpha=.1)

# Now use the model to plot probability of switching vs distance (the green line).
sorted_dist = np.sort(df['dist'])
argsorted_dist = list(np.argsort(df['dist']))
predicted = model1.predict()[argsorted_dist]
plt.plot(sorted_dist, predicted, lw=2)

# Kernel density estimates of distance, split by switch decision.
kde_sw = kde.KDEUnivariate(df['dist'][df['switch'] == 1])
kde_nosw = kde.KDEUnivariate(df['dist'][df['switch'] == 0])
kde_sw.fit()
kde_nosw.fit()
plt.plot(kde_sw.support, kde_sw.density, label='Switch')
plt.plot(kde_nosw.support, kde_nosw.density, color='red', label='No Switch')
plt.xlabel('Distance (meters)')
plt.legend(loc='best')

# Second model: distance (per 100 m) plus arsenic concentration.
model2 = logit('switch ~ I(dist / 100.) + arsenic', data=df).fit()
# BUG FIX: Python-2 `print x` statements are syntax errors in Python 3;
# converted to print() calls (the rest of this file already uses them).
print(model2.summary())
margeff = model2.get_margeff(at='mean')
print(margeff.summary())
# considering two features: Inducted and Total Games
def binary_jitter(x, jitter_amount=.05):
    '''
    Add jitter to a 0/1 vector of data for plotting.
    '''
    noise = np.random.rand(*x.shape) * jitter_amount
    # push 1s down and 0s up so the jittered dots stay between 0 and 1
    offset_sign = np.where(x == 1, -1, 1)
    return x + offset_sign * noise

# First plot the Inducted / Not Inducted dots vs games to a safe well. Add jitter.
plt.plot(df['totalgames'], binary_jitter(df['inducted'], .1), '.', alpha=.1)

# Now use the model to plot probability of induction vs games
games = df['totalgames']
sorted_dist = np.sort(games)
argsorted_dist = list(np.argsort(games))
predicted = model1.predict()[argsorted_dist]
plt.plot(sorted_dist, predicted, lw=2)

# Smoothed game-count distributions for each induction outcome.
inducted_games = df['totalgames'][df['inducted'] == 1]
not_inducted_games = df['totalgames'][df['inducted'] == 0]
kde_sw = kde.KDEUnivariate(inducted_games)
kde_nosw = kde.KDEUnivariate(not_inducted_games)
kde_sw.fit()
kde_nosw.fit()
plt.plot(kde_sw.support, kde_sw.density, label='Inducted')
plt.plot(kde_nosw.support, kde_nosw.density, color='red', label='Not Inducted')
plt.xlabel('Games')
plt.legend(loc='best')
# NOTE(review): this chunk starts mid-function -- the enclosing `def`
# (which binds n_emails, rank_email, email_df, rank_results and rank_df)
# is outside this view, so the loop and `return` below are function body.
for e in range(n_emails):
    # Score email `e`; rank_email's semantics are defined elsewhere in
    # the project -- presumably a weighted spam-rank score. TODO confirm.
    weights_rank = rank_email(email_df, e)
    rank_results.append(weights_rank)
# Attach one rank per email and hand the frame back to the caller.
rank_df['rank'] = rank_results
return rank_df

# --- Script section: compare rank distributions of train vs. test sets ---
train_ranks = make_rank_df(train_df)
print(train_ranks)
# Median train rank; presumably used downstream as a classification
# cutoff -- verify against later code.
threshold = train_ranks['rank'].median()
test_ranks = make_rank_df(test_df)

# Kernel density estimates of the two rank distributions.
train_kde = kde.KDEUnivariate((train_ranks['rank']))
train_kde.fit()
test_kde = kde.KDEUnivariate(test_ranks['rank'])
test_kde.fit()

# Overlay the filled density curves for a visual train/test comparison.
plt.figure(figsize=(8, 6))
plt.fill(train_kde.support, train_kde.density,
         color='steelblue', alpha=.7, label='Train')
plt.fill(test_kde.support, test_kde.density,
         color='red', alpha=.7, label='Test')