def _kde_plot(
    values: ndarray, grid: ndarray, axes: Axes, bw: Union[float, str] = "scott"
) -> None:
    """Calculate KDE for observed spacings.

    Parameters
    ----------
    values: ndarray
        the values used to compute (fit) the kernel density estimate
    grid: ndarray
        the grid of values over which to evaluate the computed KDE curve
    axes: pyplot.Axes
        the current axes object to be modified
    bw: Union[float, str]
        the bandwidth argument for statsmodels KDEUnivariate `.fit`

    Notes
    -----
    We do this manually because we want to ensure consistency of the KDE
    calculation and remove Seaborn's control over the process, while also
    avoiding inconsistent behaviours like
    https://github.com/mwaskom/seaborn/issues/938 and
    https://github.com/mwaskom/seaborn/issues/796
    """
    values = values[values > 0]  # prevent floating-point bad behaviour
    kde = KDE(values)
    kde.fit(kernel="gau", bw=bw, cut=0)
    evaluated = np.empty_like(grid)
    for i, _ in enumerate(evaluated):
        evaluated[i] = kde.evaluate(grid[i])
    kde_curve = axes.plot(grid, evaluated, label="Kernel Density Estimate")
    plt.setp(kde_curve, color="black")
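# A minimal usage sketch for `_kde_plot` on hypothetical data; it assumes
# numpy/matplotlib plus statsmodels' KDEUnivariate imported as KDE, as the
# function above does.
import numpy as np
import matplotlib.pyplot as plt

spacings = np.random.exponential(scale=1.0, size=500)  # hypothetical spacings
grid = np.linspace(0.01, 5.0, 200)
fig, ax = plt.subplots()
ax.hist(spacings, bins=40, density=True, alpha=0.3, label="observed")
_kde_plot(spacings, grid, ax)  # overlays the black KDE curve
ax.legend()
plt.show()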
def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
    """Univariate Kernel Density Estimation with Statsmodels."""
    from statsmodels.nonparametric.kde import KDEUnivariate

    kde = KDEUnivariate(self.data)
    kde.fit(bw=bandwidth, **kwargs)
    return kde.evaluate(x_grid)
def compute_entropy(U):
    HGauss0 = 0.5 + 0.5 * np.log(2 * np.pi)
    nSingVals = U.shape[1]
    H = np.empty(nSingVals, dtype='float64')
    for iBasisVector in range(nSingVals):
        kde = KDE(np.abs(U[:, iBasisVector]))
        kde.fit(gridsize=1000)
        pdf = kde.density
        x = kde.support
        dx = x[1] - x[0]

        # Calculate the Gaussian entropy
        pdfMean = nansum(x * pdf) * dx
        with np.errstate(invalid='ignore'):
            sigma = np.sqrt(nansum(((x - pdfMean) ** 2) * pdf) * dx)
        HGauss = HGauss0 + np.log(sigma)

        # Calculate the V-matrix entropy
        pdf_pos = pdf > 0
        HVMatrix = -np.sum(xlogy(pdf[pdf_pos], pdf[pdf_pos])) * dx

        # The returned entropy is the difference between the V-matrix entropy
        # and the entropy of a Gaussian of similar width (sigma)
        H[iBasisVector] = HVMatrix - HGauss
    return H
def draw_hist_and_kde(sample, grid, true_pdf):
    # histogram (`density=True` replaces the deprecated `normed=True`)
    plt.hist(sample, 20, range=(grid.min(), grid.max()), density=True,
             label='histogram')
    # kernel density estimate
    kernel_density = KDEUnivariate(sample)
    kernel_density.fit()
    plt.plot(grid, kernel_density.evaluate(grid), color='green',
             linewidth=2, label='kde')
    # true density
    plt.plot(grid, true_pdf(grid), color='red', linewidth=2, alpha=0.3,
             label='true pdf')
    plt.legend()
    plt.show()
class EstimatedKDE(object):
    """Univariate KDE wrapper exposing mode, median, and pdf."""

    eps = 0.05
    points = 10000

    def __init__(self):
        super(EstimatedKDE, self).__init__()
        self.dist = None

    def fit(self, data):
        self.min = np.min(data)
        self.max = np.max(data)
        self.mean = np.mean(data)
        self.std = np.std(data)
        self.dist = KDEUnivariate(data)
        self.dist.fit()
        return self

    def mode(self):
        x = np.linspace(self.min, self.max, self.points)
        y = self.dist.evaluate(x)
        return x[np.argmax(y)]

    def median(self):
        # `icdf` is evaluated on an evenly spaced probability grid, so the
        # median is the middle entry; a fixed index of 50 is only correct
        # when the grid happens to have 101 points.
        icdf = self.dist.icdf
        return icdf[len(icdf) // 2]

    def pdf(self, x):
        return self.dist.evaluate(x)
def gen_kde_pdf(distribution, bounds=None, kde_width=None):
    ## boundary correction for KDE: reflect the data about each bound
    if bounds is None:
        print("\t setting bounds to max value")
        var_min, var_max = min(distribution), max(distribution)
    else:
        distribution = distribution[np.where(
            (distribution > bounds[0]) & (distribution < bounds[1]))]
        var_min, var_max = bounds[0], bounds[1]

    lower = var_min - abs(distribution - var_min)
    upper = var_max + abs(distribution - var_max)
    merge = np.concatenate([lower, upper, distribution])

    if kde_width is None:
        print("... setting kde_width")
        kde_width = S_MAD(distribution) / 2.

    KDE_MERGE = KDEUnivariate(merge)
    KDE_MERGE.fit(bw=kde_width)

    # rescale so the pdf integrates to one over [var_min, var_max]
    SCALE = np.divide(1., integrate.quad(KDE_MERGE.evaluate, var_min, var_max)[0])
    return lambda X: SCALE * KDE_MERGE.evaluate(X)
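# A minimal usage sketch for `gen_kde_pdf`. `S_MAD` is assumed to be a
# scaled median-absolute-deviation helper from the original code base; a
# plausible stand-in is defined here for illustration only.
import numpy as np
from scipy import integrate
from statsmodels.nonparametric.kde import KDEUnivariate

def S_MAD(x):
    # MAD scaled to be consistent with a normal standard deviation
    return 1.4826 * np.median(np.abs(x - np.median(x)))

data = np.random.beta(2, 5, size=2000)   # bounded support on [0, 1]
pdf = gen_kde_pdf(data, bounds=(0.0, 1.0))
mass, _ = integrate.quad(pdf, 0.0, 1.0)  # should be close to 1
print(mass)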
def kde_1d(signal, x_grid=None):
    """
    Return the 1d KDE of a vector signal (created 01/24/2015).

    Note: the returned estimate is normalized to *sum* to 1 over the grid
    points (a discrete pmf); to integrate to 1 like a pdf it would instead
    need to be divided by the sum times the grid spacing.

    References:
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    http://glowingpython.blogspot.com/2012/08/kernel-density-estimation-with-scipy.html

    Usage
    -----
    >>> x = np.linspace(0, 1, 401)
    >>> kde, x = tw.kde_1d(signal, x)
    >>> plt.plot(x, kde)
    >>> plt.grid('on')
    """
    from statsmodels.nonparametric.kde import KDEUnivariate

    kde = KDEUnivariate(signal)
    kde.fit()

    if x_grid is None:
        x_grid = np.linspace(0, 1, 401)

    # KDE estimate, normalized to sum to 1 over the grid
    kde_est = kde.evaluate(x_grid)
    kde_est /= kde_est.sum()
    return kde_est, x_grid
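# Answering the docstring's normalization question with a short check:
# dividing by sum() gives a pmf over grid points, while dividing by
# (sum() * grid spacing) gives a curve that integrates to ~1 like a pdf.
import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

signal = np.random.normal(0.5, 0.1, size=1000)
x_grid = np.linspace(0, 1, 401)
dx = x_grid[1] - x_grid[0]

kde = KDEUnivariate(signal)
kde.fit()
density = kde.evaluate(x_grid)

pmf = density / density.sum()          # sums to 1 (what kde_1d returns)
pdf = density / (density.sum() * dx)   # integrates to ~1
print(pmf.sum(), np.trapz(pdf, x_grid))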
def gaussian_density_estimation(samples, weights, grid, bw=0.1):
    """
    Kernel density estimation with a Gaussian kernel.

    Parameters
    ----------
    samples : np.ndarray
        Array of sample values.
    weights : np.ndarray
        Array of sample weights. If None, unweighted KDE will be performed.
    grid : np.ndarray
        Grid points at which the KDE function should be evaluated.
    bw : float
        Bandwidth parameter for kernel density estimation. Corresponds to
        sigma in the case of a Gaussian kernel.

    Returns
    -------
    np.ndarray
        The probability density values at the supplied grid points.
    """
    # KDE for fine-grained optimization
    kde = KDEUnivariate(samples)
    kde.fit(weights=weights, bw=bw, fft=False)

    # evaluate the pdf on a grid for use in SGOOP
    # TODO: area under curve between points instead of pdf at point
    return kde.evaluate(grid)
def reweight(rc, metad_traj, cv_columns, v_minus_c_col, rc_bins=20, kt=2.5):
    """
    Reweight a biased MD trajectory to the unbiased probability along a
    given reaction coordinate, using the rbias column from COLVAR to
    perform reweighting per Tiwary and Parrinello.
    """
    # read in parameters from sgoop object
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # calculate rc observable for each frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # calculate frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    # fit weighted KDE with statsmodels method
    kde = KDEUnivariate(colvar_rc)
    kde.fit(weights=norm_weights, bw=0.05, fft=False)

    # evaluate the pdf on a grid for use in SGOOP
    grid = np.linspace(colvar_rc.min(), colvar_rc.max(), num=rc_bins)
    pdf = kde.evaluate(grid)
    pdf = pdf / pdf.sum()

    return pdf, grid
def setupClass(cls):
    cls.x = x = KDEWResults['x']
    weights = KDEWResults['weights']
    res1 = KDE(x)
    res1.fit(kernel=cls.kernel_name, weights=weights, fft=False)
    cls.res1 = res1
    cls.res_density = KDEWResults[cls.res_kernel_name]
def fit_kde(x, grid):
    kde = KDEUnivariate(x)
    kde.fit()
    return kde.evaluate(grid)
def pdf(self, token, years, bw=5, *args, **kwargs):
    """
    Estimate a density function from a token's ratio series.

    Args:
        token (str)
        years (iter)
        bw (int)

    Returns:
        OrderedDict {year: density}
    """
    series = self.clean_series(token, *args, **kwargs)

    # Use the ratio values as weights.
    weights = np.array(list(series.values()))

    # Fit the density estimate.
    density = KDEUnivariate(list(series.keys()))
    density.fit(fft=False, weights=weights, bw=bw)

    samples = OrderedDict()
    for year in years:
        samples[year] = density.evaluate(year)[0]

    return samples
def setup_class(cls):
    cls.decimal_density = 2  # low accuracy because binning is different
    res1 = KDE(Xi)
    res1.fit(kernel="gau", fft=True, bw="silverman")
    cls.res1 = res1
    rfname2 = os.path.join(curdir, 'results', 'results_kde_fft.csv')
    cls.res_density = np.genfromtxt(open(rfname2, 'rb'))
def _mode(data):
    modes = np.zeros([data.shape[0]])
    for i in range(data.shape[0]):
        kde = KDE(data[i, :])
        kde.fit(gridsize=2000)
        modes[i] = kde.support[np.argmax(kde.density)]
    return modes
def _reduce_mode(x):
    if len(x) == 0:
        return np.nan
    x = np.asarray(x, dtype=np.float64)
    kde = KDE(x)
    kde.fit(gridsize=2000)
    return kde.support[np.argmax(kde.density)]
def reduce_mode(x):
    kde = KDE(x)
    kde.fit(gridsize=2000)
    # the mode is the support point with maximal estimated density
    return kde.support[np.argmax(kde.density)]
def compute_kde(data, test_x):
    data = data.flatten()
    test_x = test_x.flatten()
    kde = KDEUnivariate(data)
    kde.fit(kernel="gau", bw="silverman")
    dens = kde.evaluate(test_x)
    return dens, None
def setup_class(cls):
    cls.x = x = KDEWResults['x']
    weights = KDEWResults['weights']
    res1 = KDE(x)
    # the default bandwidth was "scott" when the reference values were computed
    res1.fit(kernel=cls.kernel_name, weights=weights, fft=False, bw="scott")
    cls.res1 = res1
    cls.res_density = KDEWResults[cls.res_kernel_name]
def weighted_kernel_density_1d(values, weights, bw='silverman', plot=False):
    from statsmodels.nonparametric.kde import KDEUnivariate

    kden = KDEUnivariate(values)
    kden.fit(weights=weights, bw=bw, fft=False)
    if plot:
        import matplotlib.pyplot as plt
        plt.plot(kden.support, [kden.evaluate(xi) for xi in kden.support], 'o-')
    return kden
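# A minimal usage sketch for `weighted_kernel_density_1d` with hypothetical
# data: up-weighting the positive half of the sample shifts the estimated
# density to the right (weighted fits in statsmodels require fft=False, as
# the function above already sets).
import numpy as np

values = np.random.normal(0, 1, 500)
weights = np.where(values > 0, 2.0, 1.0)  # up-weight positive values
kden = weighted_kernel_density_1d(values, weights, plot=True)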
def setup_class(cls):
    res1 = KDE(Xi)
    weights = np.linspace(1, 100, 200)
    res1.fit(kernel="gau", gridsize=50, weights=weights, fft=False, bw="silverman")
    cls.res1 = res1
    rfname = os.path.join(curdir, 'results', 'results_kde_weights.csv')
    cls.res_density = np.genfromtxt(open(rfname, 'rb'), skip_header=1)
def calcKDE(kd_bw=0.1):
    """KDE of the (module-level) `nao_rn` series using statsmodels,
    evaluated on the `x_kde` grid."""
    kde = KDEUnivariate(nao_rn)
    kde.fit(bw=kd_bw)
    return kde.evaluate(x_kde)
def test_kde_bw_positive():
    # GH 6679
    x = np.array([
        4.59511985, 4.59511985, 4.59511985, 4.59511985, 4.59511985,
        4.59511985, 4.59511985, 4.59511985, 4.59511985, 4.59511985,
        5.67332327, 6.19847872, 7.43189192
    ])
    kde = KDE(x)
    kde.fit()
    assert kde.bw > 0
def find_outiers_kde(x):
    x_scaled = scale(list(map(float, x)))
    kde = KDEUnivariate(x_scaled)
    kde.fit(bw="scott", fft=True)
    pred = kde.evaluate(x_scaled)
    # flag the n lowest-density points, where n is the count of points
    # whose estimated density falls below 0.5
    n = sum(pred < 0.5)
    outlierindices = np.asarray(pred).argsort()[:n]
    outliervalue = np.asarray(x)[outlierindices]
    return outlierindices, outliervalue
def mag_dist(dval):
    """
    Plot the magnitude distribution for targets.

    .. codeauthor:: Mikkel N. Lund <*****@*****.**>
    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """
    logger = logging.getLogger('dataval')
    logger.info('Plotting Magnitude distribution...')

    fig, ax = plt.subplots(figsize=plt.figaspect(0.5))
    fig.subplots_adjust(left=0.14, wspace=0.3, top=0.94, bottom=0.155, right=0.96)

    colors = ['r', 'b', 'g']  # TODO: What if there are more than three?
    for k, cadence in enumerate(dval.cadences):
        star_vals = dval.search_database(select='todolist.tmag', search=f'cadence={cadence:d}')
        if star_vals:
            tmags = np.array([star['tmag'] for star in star_vals])
            kde = KDE(tmags)
            kde.fit(gridsize=1000)
            ax.fill_between(kde.support, 0, kde.density / np.max(kde.density),
                            color=colors[k], alpha=0.3, label=f'{cadence:d}s cadence')

    ax.set_ylim(bottom=0)
    ax.set_xlabel('TESS magnitude')
    ax.set_ylabel('Normalised Density')
    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_minor_locator(MultipleLocator(1))
    ax.legend(frameon=False, loc='upper left', borderaxespad=0,
              handlelength=2.5, handletextpad=0.4)

    fig.savefig(os.path.join(dval.outfolder, 'mag_dist'))
    if not dval.show:
        plt.close(fig)
def normalize_data(img, contrast='T1'):
    '''
    Normalizes 3D images via KDE and clamping.

    Params:
        - img: 3D image
        - contrast: 'T1' or 'T2'

    Returns:
        - normalized image
    '''
    CONTRAST = 1 if contrast == 'T1' else 0

    if len(np.nonzero(img)[0]) == 0:
        return img

    # take the nonzero voxel intensities (not their indices) and keep
    # everything up to the 99th percentile
    flat = img.flatten()
    tmp = flat[np.nonzero(flat)]
    q = np.percentile(tmp, 99.)
    tmp = tmp[tmp <= q]
    tmp = np.asarray(tmp, dtype=float).reshape(-1, 1)

    GRID_SIZE = 80
    bw = float(q) / GRID_SIZE

    kde = KDEUnivariate(tmp)
    kde.fit(kernel='gau', bw=bw, gridsize=GRID_SIZE, fft=True)
    X = 100. * kde.density
    Y = kde.support

    # local maxima of the estimated density: peak heights H at positions p
    idx = argrelextrema(X, np.greater)
    idx = np.asarray(idx, dtype=int)
    H = X[idx][0]
    p = Y[idx][0]

    if CONTRAST == 1:
        # normalize by the highest-intensity peak, then clamp
        T1_CLAMP_VALUE = 1.25
        x = p[-1]
        normalized_img = img / x
        normalized_img[normalized_img > T1_CLAMP_VALUE] = T1_CLAMP_VALUE
    else:
        # normalize by the tallest peak, then clamp
        T2_CLAMP_VALUE = 3.5
        x = np.amax(H)
        j = np.where(H == x)
        x = p[j]
        if len(x) > 1:
            x = x[0]
        normalized_img = img / x
        normalized_img[normalized_img > T2_CLAMP_VALUE] = T2_CLAMP_VALUE

    normalized_img /= normalized_img.max()
    return normalized_img
def bootstrap_stats(
        args: Dict[str, Any],
        out_q: Optional[mp.Queue] = None) -> Union[None, Dict[str, Any]]:
    r'''
    Computes statistics and KDEs of data via sampling with replacement.

    Arguments:
        args: dictionary of arguments. Possible keys are:
            data - data to resample
            name - name prepended to returned keys in result dict
            weights - array of weights matching length of data to use for weighted resampling
            n - number of times to resample data
            x - points at which to compute the kde values of resampled data
            kde - whether to compute the kde values at x-points for resampled data
            mean - whether to compute the means of the resampled data
            std - whether to compute standard deviation of resampled data
            c68 - whether to compute the width of the absolute central 68.2 percentile of the resampled data
        out_q: if using multiprocessing, the result dictionary is placed in the provided queue

    Returns:
        Result dictionary if `out_q` is `None`, else `None`.
    '''
    out_dict, mean, std, c68, boot = {}, [], [], [], []
    name = '' if 'name' not in args else args['name']
    weights = None if 'weights' not in args else args['weights']
    if 'n' not in args: args['n'] = 100
    if 'kde' not in args: args['kde'] = False
    if 'mean' not in args: args['mean'] = False
    if 'std' not in args: args['std'] = False
    if 'c68' not in args: args['c68'] = False

    # KDEUnivariate requires float64 data
    if args['kde'] and args['data'].dtype != 'float64':
        data = np.array(args['data'], dtype='float64')
    else:
        data = args['data']

    len_d = len(data)
    np.random.seed()
    for i in range(args['n']):
        points = np.random.choice(data, len_d, replace=True, p=weights)
        if args['kde']:
            kde = KDEUnivariate(points)
            kde.fit()
            boot.append([kde.evaluate(x) for x in args['x']])
        if args['mean']: mean.append(np.mean(points))
        if args['std']: std.append(np.std(points, ddof=1))
        if args['c68']: c68.append(np.percentile(np.abs(points), 68.2))

    if args['kde']: out_dict[f'{name}_kde'] = boot
    if args['mean']: out_dict[f'{name}_mean'] = mean
    if args['std']: out_dict[f'{name}_std'] = std
    if args['c68']: out_dict[f'{name}_c68'] = c68
    if out_q is not None:
        out_q.put(out_dict)
        return None
    return out_dict
def kde_param(distribution, x0):
    ### kde_param tries to ensure correct handling of multimodal distributions
    ### compute the kernel density estimate
    KDE = KDEUnivariate(distribution)
    KDE.fit(bw=np.std(distribution) / 3.0)

    # Powell's method has been working well for locating the nearest mode
    result = scipy.optimize.minimize(lambda x: -1 * KDE.evaluate(x),
                                     x0=x0, method='Powell')
    return {'result': float(result['x']), 'kde': KDE}
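# A minimal usage sketch for `kde_param` on a hypothetical bimodal sample:
# Powell's search started near zero should settle on the mode closest to x0.
import numpy as np

sample = np.concatenate([np.random.normal(0, 1, 700),
                         np.random.normal(5, 0.5, 300)])
fit = kde_param(sample, x0=0.5)
print(fit['result'])  # typically lands near 0, the mode nearest x0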
def calc_bayes_factor(prior_samples, posterior_samples, x=0):
    '''Returns the Savage-Dickey density ratio of prior to posterior
    density at `x`: values > 1 indicate the posterior places *less*
    density on `x` than the prior did, i.e. the data shifted support
    away from `x`.
    '''
    kde = KDEUnivariate(prior_samples)
    kde.fit()
    prior_density_at_x = kde.evaluate([x])

    kde = KDEUnivariate(posterior_samples)
    kde.fit()
    posterior_density_at_x = kde.evaluate([x])

    BF_prior_post = prior_density_at_x / posterior_density_at_x
    return BF_prior_post[0]
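# A minimal usage sketch with simulated MCMC draws: with a N(0, 1) prior
# and a posterior concentrated near 2, the density at zero drops after
# seeing the data, so the returned ratio should come out well above 1.
import numpy as np

prior_draws = np.random.normal(0.0, 1.0, size=5000)
posterior_draws = np.random.normal(2.0, 0.3, size=5000)
print(calc_bayes_factor(prior_draws, posterior_draws, x=0))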
def sample_pdf(catalog, parameter, pdf_fun, params, bounds):
    ## catalog: pd.DataFrame() input catalog with arbitrary distribution function
    ## pdf_fun: desired distribution of the sample
    ## bounds: parameter range over which to draw the sample
    param_span = np.linspace(min(catalog[parameter]), max(catalog[parameter]), 100)

    print("... determine master KDE")
    KDE = KDEUnivariate(catalog[parameter])
    KDE.fit(bw=np.std(catalog[parameter]) / 3)
    KDE_FUN = interp1d(param_span, KDE.evaluate(param_span))

    ## rescale within the bounds
    NORM = np.divide(
        1.,
        integrate.quad(
            KDE.evaluate, bounds[0], bounds[1],
            points=param_span[np.where((param_span > bounds[0]) & (param_span < bounds[1]))],
            limit=200)[0])

    ##########################################
    N = len(catalog[catalog[parameter].between(*bounds)])

    ############################################
    ### we need the scale from the other function
    result, kde_fun = determine_scale(catalog, parameter, pdf_fun, params, bounds=bounds)

    sample = np.random.uniform(0.0, 1.0, len(catalog)) * len(catalog) * NORM * KDE_FUN(catalog[parameter])
    boo_array = sample < result['x'] * pdf_fun(catalog[parameter], *params)

    return catalog[boo_array & catalog[parameter].between(
        bounds[0], bounds[1], inclusive=True)].copy()
def reweight(rc, metad_traj, cv_columns, v_minus_c_col, rc_bins=20, kt=2.5, kde=False):
    """
    Reweight a biased MD trajectory to the unbiased probability along a
    given reaction coordinate, using the rbias column from COLVAR to
    perform reweighting per Tiwary and Parrinello.
    """
    # read in parameters from sgoop object
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # calculate rc observable for each frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # calculate frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    if kde:
        # KDE for fine-grained optimization
        kde = KDEUnivariate(colvar_rc)
        kde.fit(weights=norm_weights, bw=0.1, fft=False)

        # evaluate the pdf on a grid for use in SGOOP
        # TODO: area under curve between points instead of pdf at point
        grid = np.linspace(colvar_rc.min(), colvar_rc.max(), num=rc_bins)
        pdf = kde.evaluate(grid)
        return pdf, grid

    # histogram density for coarse optimization
    hist, bin_edges = np.histogram(colvar_rc, weights=norm_weights, bins=rc_bins,
                                   density=True, range=(colvar_rc.min(), colvar_rc.max()))

    # set grid points to the centers of the bins
    bin_width = bin_edges[1] - bin_edges[0]
    grid = bin_edges[:-1] + bin_width / 2
    pdf = hist
    return pdf, grid
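# A minimal usage sketch for `reweight`, assuming `metad_traj` is a pandas
# DataFrame read from a PLUMED COLVAR file; the column names 'cv1', 'cv2'
# and 'metad.rbias' are hypothetical stand-ins.
import numpy as np
import pandas as pd

metad_traj = pd.DataFrame({
    'cv1': np.random.normal(0, 1, 1000),
    'cv2': np.random.normal(0, 1, 1000),
    'metad.rbias': np.random.uniform(0, 5, 1000),  # V - c(t), in kT-compatible units
})
rc = np.array([0.8, 0.6])  # trial reaction-coordinate coefficients
pdf, grid = reweight(rc, metad_traj, ['cv1', 'cv2'], 'metad.rbias',
                     rc_bins=50, kde=True)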
def empiricalPDF(data):
    """
    Evaluate a probability density function using kernel density
    estimation for input data.

    :param data: :class:`numpy.ndarray` of data values.

    :returns: PDF values at the data points.
    """
    LOG.debug("Calculating empirical PDF")
    sortedmax = np.sort(data)
    kde = KDEUnivariate(sortedmax)
    kde.fit()
    try:
        res = kde.evaluate(sortedmax)
    except MemoryError:
        res = np.zeros(len(sortedmax))
    return res
def kde_statsmodels_u(data, grid, **kwargs):
    """
    Univariate Kernel Density Estimation with Statsmodels.

    Parameters
    ----------
    data : numpy.array
        Data points used to compute the density estimator. Has `n x 1`
        dimensions, representing n points.
    grid : numpy.array
        Points at which the density will be estimated. Has `m x 1`
        dimensions, representing m points.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions.
    """
    kde = KDEUnivariate(data)
    kde.fit(**kwargs)
    return kde.evaluate(grid)
ln_par, ln_lo, ln_up = bootstrap_fit(
    stats.lognorm, resid, n_iter=n_bs, quant=q
)
hc_par, hc_lo, hc_up = bootstrap_fit(
    stats.halfcauchy, resid, n_iter=n_bs, quant=q
)
gam_par, gam_lo, gam_up = bootstrap_fit(
    stats.gamma, resid, n_iter=n_bs, quant=q
)

##################################################################
hc = stats.halfcauchy(*stats.halfcauchy.fit(resid))
lg = stats.lognorm(*stats.lognorm.fit(resid))

dens = KDEUnivariate(resid)
dens.fit()
ecdf = ECDF(resid)

##################################################################
# prepare X axes for plotting
ex = ecdf.x
x = np.linspace(min(resid), max(resid), 2000)

##################################################################
# Fit a Landau distribution with ROOT
if HAS_ROOT:
    root_hist = rootpy.plotting.Hist(100, 0, np.pi)
    root_hist.fill_array(resid)
plt.title('Logit Residuals')

# Additional diagnostic plots
fig = plt.figure(figsize=(18, 9), dpi=1600)
a = .2

fig.add_subplot(221, facecolor="#DBDBDB")
# The kernel density estimator, just like the one used above, gives a
# smoothed density plot of the predictions; the y-axis is on the density
# scale rather than a probability scale, so the shape is what matters here.
kde_res = KDEUnivariate(res.predict())
kde_res.fit()
# the "support" is the grid of points over which the density is evaluated
plt.plot(kde_res.support, kde_res.density)
plt.fill_between(kde_res.support, kde_res.density, alpha=a)
plt.title("Distribution of our Predictions")

# show that predicted survival probabilities are much lower
# for males than for females
fig.add_subplot(222, facecolor="#DBDBDB")
plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
plt.grid(True, which='major', axis='x')
plt.xlabel("Predicted chance of survival")
plt.ylabel("Gender Bool")
def setup_class(cls):
    res1 = KDE(Xi)
    res1.fit(kernel="gau", fft=False, bw="silverman")
    cls.res1 = res1
    cls.res_density = KDEResults["gau_d"]
def kde_statsmodels_u(x, x_grid, bandwidth=0.2, **kwargs):
    """Univariate Kernel Density Estimation with Statsmodels."""
    kde = KDEUnivariate(x)
    kde.fit(bw=bandwidth, **kwargs)
    return kde.evaluate(x_grid)
def draw_logit_regression(df, kind):
    formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked)'
    # the ~ plays the role of an = sign, relating the response to the features of our dataset
    results = {}  # a results dictionary to hold our regression results for easy analysis later
    y, x = dmatrices(formula, data=df, return_type='dataframe')
    model = sm.Logit(y, x)
    res = model.fit()
    results['Logit'] = [res, formula]
    with open("logit_result.txt", "w") as w:
        print(res.summary(), file=w)
    if kind == 1:
        return results

    # Plot predictions vs actual
    plt.figure(figsize=(18, 4))
    plt.subplot(121, facecolor="#DBDBDB")
    # generate predictions from our fitted model
    ypred = res.predict(x)
    plt.plot(x.index, ypred, 'bo', x.index, y, 'mo', alpha=.25)
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit predictions, Blue: \nFitted/predicted values: Red')
    plt.savefig("1.eps")

    # Residuals (Logit results expose deviance residuals as `resid_dev`)
    plt.subplot(122, facecolor="#DBDBDB")
    plt.plot(res.resid_dev, 'r-')
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit Residuals')
    plt.savefig("2.eps")

    fig = plt.figure(figsize=(18, 9), dpi=1600)
    a = .2

    # Below are examples of more advanced plotting.
    # If it looks strange, check out the tutorial above.
    fig.add_subplot(221, facecolor="#DBDBDB")
    kde_res = KDEUnivariate(res.predict())
    kde_res.fit()
    plt.plot(kde_res.support, kde_res.density)
    plt.fill_between(kde_res.support, kde_res.density, alpha=a)
    plt.title("Distribution of our Predictions")

    fig.add_subplot(222, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
    plt.grid(True, which='major', axis='x')
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Gender Bool")
    plt.title("The Change of Survival Probability by Gender (1 = Male)")

    fig.add_subplot(223, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x['C(Pclass)[T.3]'], alpha=a)
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Class Bool")
    plt.grid(True, which='major', axis='x')
    plt.title("The Change of Survival Probability by Lower Class (1 = 3rd Class)")

    fig.add_subplot(224, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x.Age, alpha=a)
    plt.grid(True, linewidth=0.15)
    plt.title("The Change of Survival Probability by Age")
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Age")
    plt.savefig("prediction.eps")
def lfdr(p_values, pi0, trunc=True, monotone=True, transf="probit",
         adj=1.5, eps=np.power(10.0, -8)):
    """
    Estimate the local FDR / posterior error probability from p-values,
    following bioconductor/qvalue.
    """
    p = np.array(p_values)

    # To compare against the bioconductor/qvalue reference implementation:
    # import rpy2.robjects as robjects
    # from rpy2.robjects import pandas2ri
    # pandas2ri.activate()
    # density = robjects.r('density')
    # smoothspline = robjects.r('smooth.spline')
    # predict = robjects.r('predict')

    # Check inputs
    lfdr_out = p
    rm_na = np.isfinite(p)
    p = p[rm_na]

    if min(p) < 0 or max(p) > 1:
        raise click.ClickException("p-values not in valid range [0,1].")
    elif pi0 < 0 or pi0 > 1:
        raise click.ClickException("pi0 not in valid range [0,1].")

    # Local FDR method for both probit and logit transformations
    if transf == "probit":
        p = np.maximum(p, eps)
        p = np.minimum(p, 1 - eps)
        x = scipy.stats.norm.ppf(p, loc=0, scale=1)

        # R-like implementation
        bw = bw_nrd0(x)
        myd = KDEUnivariate(x)
        myd.fit(bw=adj * bw, gridsize=512)
        splinefit = sp.interpolate.splrep(myd.support, myd.density)
        y = sp.interpolate.splev(x, splinefit)
        # R reference:
        # myd = density(x, adjust = 1.5)
        # mys = smoothspline(x = myd.rx2('x'), y = myd.rx2('y'))
        # y = predict(mys, x).rx2('y')

        lfdr = pi0 * scipy.stats.norm.pdf(x) / y
    elif transf == "logit":
        x = np.log((p + eps) / (1 - p + eps))

        # R-like implementation
        bw = bw_nrd0(x)
        myd = KDEUnivariate(x)
        myd.fit(bw=adj * bw, gridsize=512)
        splinefit = sp.interpolate.splrep(myd.support, myd.density)
        y = sp.interpolate.splev(x, splinefit)
        # R reference:
        # myd = density(x, adjust = 1.5)
        # mys = smoothspline(x = myd.rx2('x'), y = myd.rx2('y'))
        # y = predict(mys, x).rx2('y')

        dx = np.exp(x) / np.power(1 + np.exp(x), 2)
        lfdr = (pi0 * dx) / y
    else:
        raise click.ClickException("Invalid local FDR method.")

    if trunc:
        lfdr[lfdr > 1] = 1
    if monotone:
        lfdr = lfdr[p.ravel().argsort()]
        for i in range(1, len(x)):
            if lfdr[i] < lfdr[i - 1]:
                lfdr[i] = lfdr[i - 1]
        lfdr = lfdr[scipy.stats.rankdata(p, "min") - 1]

    lfdr_out[rm_na] = lfdr
    return lfdr_out
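# `bw_nrd0` is referenced above but not defined in this snippet; it is
# assumed to mirror R's bw.nrd0 (Silverman's rule of thumb) so that the KDE
# matches the bioconductor/qvalue reference. A sketch consistent with R's
# definition:
import numpy as np

def bw_nrd0(x):
    if len(x) < 2:
        raise ValueError("need at least 2 data points")
    hi = np.std(x, ddof=1)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    lo = min(hi, iqr / 1.349)
    if lo == 0:
        # R's fallbacks when both spread measures are degenerate
        lo = hi or abs(np.asarray(x)[0]) or 1.0
    return 0.9 * lo * len(x) ** -0.2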