def stratified_bayesian_blocks(x, p0=0.01, min_bin_width=0.01):
    """Create histogram bin edges for mixed continuous/categorical data.

    The unique values of ``x`` are grouped into strata by how often they
    occur (``bayesian_blocks`` is run on the multiplicities).  Within each
    stratum the data are normalized and regularized, ``bayesian_blocks`` is
    run again, and the edges from all strata are merged.

    Parameters
    ----------
    x : array_like
        Data to be binned.
    p0 : float, optional
        Forwarded as ``p0`` to every ``bayesian_blocks`` call.  (It used to
        be ignored in favour of a hard-coded 0.01, which is still the
        default.)
    min_bin_width : float, optional
        An edge is dropped when it lies less than this far above the edge
        that precedes it in the merged, sorted list.

    Returns
    -------
    numpy.ndarray
        Sorted bin edges.
    """
    # Create strata from the multiplicity of each distinct value.
    values, multiplicity = np.unique(x, return_counts=True)
    strata_edges = bayesian_blocks(multiplicity, p0=p0)

    data_bins = []
    for lower, upper in zip(strata_edges[:-1], strata_edges[1:]):
        # Select the data pertaining only to this stratum.
        # NOTE(review): the upper bound is exclusive for every stratum,
        # including the last one -- confirm that this is intended.
        sel = (lower <= multiplicity) & (multiplicity < upper)
        strata_data = np.repeat(values[sel], multiplicity[sel])

        # Normalize and regularize the data within the stratum.
        strata_data = _regularize(_normalize(strata_data))

        data_bins.append(bayesian_blocks(strata_data, p0=p0))

    # Collect the bins from all strata together.
    data_bins = np.sort(np.concatenate(data_bins))

    # Clean up with the min_bin_width heuristic; the first edge always stays.
    too_close = (data_bins[1:] - min_bin_width) < data_bins[:-1]
    too_close = np.r_[False, too_close]
    return data_bins[~too_close]
def stratified_bayesian_blocks(x, p0=0.01, min_bin_width=0.01):
    """Create histogram bin edges for mixed continuous/categorical data.

    The unique values of ``x`` are grouped into strata by how often they
    occur (``bayesian_blocks`` is run on the multiplicities).  Within each
    stratum the data are normalized and regularized, ``bayesian_blocks`` is
    run again, and the edges from all strata are merged.

    Parameters
    ----------
    x : array_like
        Data to be binned.
    p0 : float, optional
        Forwarded as ``p0`` to every ``bayesian_blocks`` call.  (It used to
        be ignored in favour of a hard-coded 0.01, which is still the
        default.)
    min_bin_width : float, optional
        An edge is dropped when it lies less than this far above the edge
        that precedes it in the merged, sorted list.

    Returns
    -------
    numpy.ndarray
        Sorted bin edges.
    """
    # Create strata from the multiplicity of each distinct value.
    values, multiplicity = np.unique(x, return_counts=True)
    strata_edges = bayesian_blocks(multiplicity, p0=p0)

    data_bins = []
    for lower, upper in zip(strata_edges[:-1], strata_edges[1:]):
        # Select the data pertaining only to this stratum.
        # NOTE(review): the upper bound is exclusive for every stratum,
        # including the last one -- confirm that this is intended.
        sel = (lower <= multiplicity) & (multiplicity < upper)
        strata_data = np.repeat(values[sel], multiplicity[sel])

        # Normalize and regularize the data within the stratum.
        strata_data = _regularize(_normalize(strata_data))

        data_bins.append(bayesian_blocks(strata_data, p0=p0))

    # Collect the bins from all strata together.
    data_bins = np.sort(np.concatenate(data_bins))

    # Clean up with the min_bin_width heuristic; the first edge always stays.
    too_close = (data_bins[1:] - min_bin_width) < data_bins[:-1]
    too_close = np.r_[False, too_close]
    return data_bins[~too_close]
def test_duplicate_events():
    """Repeated event times must bin like unique times carrying counts."""
    times = np.random.random(100)
    # The last 20 samples repeat the first 20.
    times[80:] = times[:20]

    counts = np.ones_like(times)
    counts[:20] += 1

    from_duplicates = bayesian_blocks(times)
    from_counts = bayesian_blocks(times[:80], counts[:80])

    assert_allclose(from_duplicates, from_counts)
def test_duplicate_events():
    """Repeated event times must bin like unique times carrying counts."""
    times = np.random.random(100)
    # The last 20 samples repeat the first 20.
    times[80:] = times[:20]

    counts = np.ones_like(times)
    counts[:20] += 1

    with catch_warnings(AstroMLDeprecationWarning):
        from_duplicates = bayesian_blocks(times)
        from_counts = bayesian_blocks(times[:80], counts[:80])

    assert_allclose(from_duplicates, from_counts)
def ProbRemoval(data, h, rMags):
    """Pick one redshift per object after prior-weighted random removal.

    For each object the samples in column 1 of ``data[i]`` are randomly
    thinned: a sample is kept when a uniform random number does not exceed
    the prior weight interpolated at that sample (weights are scaled so the
    largest is 1).  The surviving samples are binned with ``bayesian_blocks``
    and the centre of the most populated bin is reported.

    Reads the module-level arrays ``rMids`` and ``zMids``.

    Parameters
    ----------
    data : sequence of 2-D arrays
        Output from DCDT; only column 1 of each entry is used.
    h : sequence
        Output from makePrior; ``h[rN]`` is interpolated over ``zMids``.
    rMags : sequence
        rMags from the catalog, same length as ``data``.

    Returns
    -------
    list
        One location per object.
    """
    z = []
    for i, fits in enumerate(data):
        z_fits = fits[:, 1]

        # Index of the nearest rMag with a defined zDist, capped at 239.
        rN = np.searchsorted(rMids, rMags[i])
        if rN > 239:
            print(rMags[i])
            rN = 239

        # Values of zDist at the fitted redshifts, normalized so that they
        # can be used as keep-probabilities.
        ws = np.interp(z_fits, zMids, h[rN])
        ws = ws / np.max(ws)

        ran = np.random.rand(len(ws))
        msk = ran > ws  # True where ran exceeds the prior probability
        zs = np.ma.masked_array(z_fits, mask=msk).compressed()

        bins = bayesian_blocks(zs, fitness='events', p0=0.25)
        histo = np.histogram(zs, bins)

        try:
            nMax = np.argmax(histo[0])
            loc = (histo[1][nMax] + histo[1][nMax + 1]) / 2.
        except (ValueError, IndexError):
            # No usable peak bin (e.g. empty histogram): fall back to the
            # median of the surviving samples.
            loc = np.percentile(zs, 50)
        z.append(loc)
    return z
def gaussfit_sky(values, p_thresh=0.65, plot=False, **extras):
    """Fit a gaussian to the lower part of a histogram of the sky values.

    The histogram bins are estimated using Bayesian blocks.  Only bins whose
    centre lies below the ``p_thresh`` percentile of ``values`` take part in
    the fit.

    Parameters
    ----------
    values : numpy.ndarray
        Sky values.
    p_thresh : float, optional
        Percentile below which the gaussian is fitted.  Values up to 1 are
        read as a fraction (the default 0.65 means the 65th percentile);
        larger values are read as a percentage.
    plot : bool, optional
        When true, draw the histogram, the fitted gaussian and the threshold.
    **extras
        Accepted and ignored.

    Returns
    -------
    tuple
        Central value and estimate of the standard deviation per pixel
        (the fitted ``mu`` and ``sigma``).
    """
    bins = bayesian_blocks(values)
    print(len(bins), bins)
    cbin = (bins[1:] + bins[:-1]) / 2
    hist = np.histogram(values, bins=bins, range=(bins.min(), bins.max()),
                        density=True)

    # np.percentile expects 0-100, while the default argument is a fraction.
    percent = p_thresh * 100 if p_thresh <= 1 else p_thresh
    val_thresh = np.percentile(values, percent)
    # Compare bin centres with the threshold *value*, not with the
    # percentile argument itself.
    lower = cbin < val_thresh

    def gauss(x, *p):
        A, mu, sigma = p
        return A * np.exp(-(x - mu)**2 / (2. * sigma**2))

    # p0 is the initial guess for the fitting coefficients (A, mu and sigma)
    p0 = [np.max(hist[0]), values.mean(), values.std()]
    coeff, _ = curve_fit(gauss, cbin[lower], hist[0][lower], p0=p0)

    if plot:
        print(len(hist[1]), len(hist[0]), type(coeff))
        pl.figure()
        pl.plot(cbin, hist[0], color='b')
        # gauss takes the coefficients as separate positional arguments.
        pl.plot(cbin, gauss(cbin, *coeff), color='r')
        pl.axvline(val_thresh)

    return coeff[1], coeff[2]
def ProbRemoval(data, h, rMags):
    """Pick one redshift per object after prior-weighted random removal.

    For each object the samples in column 1 of ``data[i]`` are randomly
    thinned: a sample is kept when a uniform random number does not exceed
    the prior weight interpolated at that sample (weights are scaled so the
    largest is 1).  The surviving samples are binned with ``bayesian_blocks``
    and the centre of the most populated bin is reported.

    Reads the module-level arrays ``rMids`` and ``zMids``.

    Parameters
    ----------
    data : sequence of 2-D arrays
        Output from DCDT; only column 1 of each entry is used.
    h : sequence
        Output from makePrior; ``h[rN]`` is interpolated over ``zMids``.
    rMags : sequence
        rMags from the catalog, same length as ``data``.

    Returns
    -------
    list
        One location per object.
    """
    z = []
    for i, fits in enumerate(data):
        z_fits = fits[:, 1]

        # Index of the nearest rMag with a defined zDist, capped at 239.
        rN = np.searchsorted(rMids, rMags[i])
        if rN > 239:
            print(rMags[i])
            rN = 239

        # Values of zDist at the fitted redshifts, normalized so that they
        # can be used as keep-probabilities.
        ws = np.interp(z_fits, zMids, h[rN])
        ws = ws / np.max(ws)

        ran = np.random.rand(len(ws))
        msk = ran > ws  # True where ran exceeds the prior probability
        zs = np.ma.masked_array(z_fits, mask=msk).compressed()

        bins = bayesian_blocks(zs, fitness='events', p0=0.25)
        histo = np.histogram(zs, bins)

        try:
            nMax = np.argmax(histo[0])
            loc = (histo[1][nMax] + histo[1][nMax + 1]) / 2.
        except (ValueError, IndexError):
            # No usable peak bin (e.g. empty histogram): fall back to the
            # median of the surviving samples.
            loc = np.percentile(zs, 50)
        z.append(loc)
    return z
def gaussfit_sky(values, p_thresh=0.65, plot=False, **extras):
    """Fit a gaussian to the lower part of a histogram of the sky values.

    The histogram bins are estimated using Bayesian blocks.  Only bins whose
    centre lies below the ``p_thresh`` percentile of ``values`` take part in
    the fit.

    Parameters
    ----------
    values : numpy.ndarray
        Sky values.
    p_thresh : float, optional
        Percentile below which the gaussian is fitted.  Values up to 1 are
        read as a fraction (the default 0.65 means the 65th percentile);
        larger values are read as a percentage.
    plot : bool, optional
        When true, draw the histogram, the fitted gaussian and the threshold.
    **extras
        Accepted and ignored.

    Returns
    -------
    tuple
        Central value and estimate of the standard deviation per pixel
        (the fitted ``mu`` and ``sigma``).
    """
    bins = bayesian_blocks(values)
    print(len(bins), bins)
    cbin = (bins[1:] + bins[:-1]) / 2
    hist = np.histogram(values, bins=bins, range=(bins.min(), bins.max()),
                        density=True)

    # np.percentile expects 0-100, while the default argument is a fraction.
    percent = p_thresh * 100 if p_thresh <= 1 else p_thresh
    val_thresh = np.percentile(values, percent)
    # Compare bin centres with the threshold *value*, not with the
    # percentile argument itself.
    lower = cbin < val_thresh

    def gauss(x, *p):
        A, mu, sigma = p
        return A * np.exp(-(x - mu)**2 / (2. * sigma**2))

    # p0 is the initial guess for the fitting coefficients (A, mu and sigma)
    p0 = [np.max(hist[0]), values.mean(), values.std()]
    coeff, _ = curve_fit(gauss, cbin[lower], hist[0][lower], p0=p0)

    if plot:
        print(len(hist[1]), len(hist[0]), type(coeff))
        pl.figure()
        pl.plot(cbin, hist[0], color='b')
        # gauss takes the coefficients as separate positional arguments.
        pl.plot(cbin, gauss(cbin, *coeff), color='r')
        pl.axvline(val_thresh)

    return coeff[1], coeff[2]
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Populates module-level state with a parallax density estimate built
    from the strictly positive entries of ``t['plx']``.  The two KDE methods
    only do work while ``plx_kde`` is still ``None``; the ``blocks`` method
    recomputes ``plx_bins_blocks`` and ``plx_hist_blocks`` on every call.

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination (options: scipy_kde, sklearn_kde,
        blocks).  Any other value only prints a help message.

    Returns
    -------
    None
    """
    global plx_kde
    global plx_bins_blocks
    global plx_hist_blocks

    # Strings are compared by value; identity ("is") only works by accident
    # for interned literals.
    if method == 'scipy_kde':
        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            positive = t['plx'][t['plx'] > 0.0]
            if bandwidth is None:
                plx_kde = gaussian_kde(positive)
            else:
                plx_kde = gaussian_kde(positive, bw_method=bandwidth)

    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel': 'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)

            positive = t['plx'][t['plx'] > 0.0]
            if c.kde_subset:
                # Fit on a random subset of at most 5000 parallaxes.
                plx_ran = np.copy(positive)
                np.random.shuffle(plx_ran)
                plx_kde.fit(plx_ran[0:5000, np.newaxis])
            else:
                plx_kde.fit(positive[:, np.newaxis])

    elif method == 'blocks':
        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        sample = t['plx'][t['plx'] > 0.0][0:nbins]
        bins = bayesian_blocks(sample)
        # ``normed`` no longer exists in numpy.histogram; ``density`` gives
        # a properly normalized density for unequal bin widths.
        hist, bins = np.histogram(sample, bins=bins, density=True)

        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")

    else:
        print("You must include a valid method")
        print("Options: scipy_kde, sklearn_kde or blocks")

    return
def histogram(a, bins=10, range=None, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specified how bins are computed, the parameters are the same
    as numpy.histogram().

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' (or 'knuths') : use Knuth's rule to determine bins
        'scotts' (or 'scott') : use Scott's rule to determine bins
        'freedman' (or 'freedmans') : use the Freedman-diaconis rule to
        determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram. See `normed` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes above.

    See Also
    --------
    numpy.histogram
    astroML.plotting.hist
    """
    a = np.asarray(a)

    # Only strings are bin codes.  Checking the type first keeps array-like
    # `bins` away from ``in`` / ``==``, which are ambiguous for ndarrays.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '%s'" % bins)

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            a = a[(a >= range[0]) & (a <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(a)
        elif bins in ('knuth', 'knuths'):
            _, bins = knuth_bin_width(a, True)
        elif bins in ('scott', 'scotts'):
            _, bins = scotts_bin_width(a, True)
        else:
            _, bins = freedman_bin_width(a, True)

    return np.histogram(a, bins, range, **kwargs)
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specified how bins are computed, the parameters are the same
    as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scott' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified
        (or None), then the current active axes will be used.

    *args, **kwargs :
        further positional and keyword arguments are forwarded to the axes'
        ``hist`` method; they are described in pylab.hist().

    Returns
    -------
    Whatever ``ax.hist`` returns.

    Raises
    ------
    ValueError
        If `bins` is a string that is not a recognized bin code.
    """
    x = np.asarray(x)

    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()

    # Only strings are bin codes.  Checking the type first keeps array-like
    # `bins` away from ``in``, which is ambiguous for ndarrays.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '%s'" % bins)

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            _, bins = knuth_bin_width(x, True)
        elif bins in ('scott', 'scotts'):
            _, bins = scotts_bin_width(x, True)
        else:
            _, bins = freedman_bin_width(x, True)

    # Extra positional arguments used to be dropped silently.
    return ax.hist(x, bins, range, *args, **kwargs)
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specified how bins are computed, the parameters are the same
    as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scott' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified
        (or None), then the current active axes will be used.

    *args, **kwargs :
        further positional and keyword arguments are forwarded to the axes'
        ``hist`` method; they are described in pylab.hist().

    Returns
    -------
    Whatever ``ax.hist`` returns.

    Raises
    ------
    ValueError
        If `bins` is a string that is not a recognized bin code.
    """
    x = np.asarray(x)

    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()

    # Only strings are bin codes.  Checking the type first keeps array-like
    # `bins` away from ``in``, which is ambiguous for ndarrays.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '%s'" % bins)

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            _, bins = knuth_bin_width(x, True)
        elif bins in ('scott', 'scotts'):
            _, bins = scotts_bin_width(x, True)
        else:
            _, bins = freedman_bin_width(x, True)

    # Extra positional arguments used to be dropped silently.
    return ax.hist(x, bins, range, *args, **kwargs)
def test_single_change_point():
    """Two uniform samples with different rates give one interior edge near 1."""
    np.random.seed(0)
    first_part = np.random.random(100)
    second_part = 1 + np.random.random(200)
    sample = np.concatenate([first_part, second_part])

    with catch_warnings(AstroMLDeprecationWarning):
        edges = bayesian_blocks(sample)

    assert_(len(edges) == 3)
    assert_allclose(edges[1], 1, rtol=0.02)
def test_single_change_point():
    """Two uniform samples with different rates give one interior edge near 1."""
    np.random.seed(0)
    first_part = np.random.random(100)
    second_part = 1 + np.random.random(200)
    sample = np.concatenate([first_part, second_part])

    edges = bayesian_blocks(sample)

    assert_(len(edges) == 3)
    assert_allclose(edges[1], 1, rtol=0.02)
def test_measures_fitness_heteroscedastic():
    """Check the 'measures' fitness with per-point errors against fixed edges."""
    np.random.seed(1)
    grid = np.linspace(0, 1, 11)
    signal = np.exp(-0.5 * (grid - 0.5) ** 2 / 0.01 ** 2)
    errors = 0.02 + 0.02 * np.random.random(len(signal))
    noisy = np.random.normal(signal, errors)

    edges = bayesian_blocks(grid, noisy, errors, fitness='measures')

    assert_allclose(edges, [0, 0.45, 0.55, 1])
def test_regular_events():
    """Check the 'regular_events' fitness on ticks with a rate change at t=5."""
    np.random.seed(0)
    dt = 0.01
    early_ticks = np.unique(np.random.randint(0, 500, 100))
    late_ticks = np.unique(np.random.randint(500, 1000, 200))
    tick_times = dt * np.concatenate([early_ticks, late_ticks])

    edges = bayesian_blocks(tick_times, fitness='regular_events', dt=dt)

    assert_(len(edges) == 3)
    assert_allclose(edges[1], 5, rtol=0.05)
def test_measures_fitness_heteroscedastic():
    """Check the 'measures' fitness with per-point errors against fixed edges."""
    np.random.seed(1)
    grid = np.linspace(0, 1, 11)
    signal = np.exp(-0.5 * (grid - 0.5)**2 / 0.01**2)
    errors = 0.02 + 0.02 * np.random.random(len(signal))
    noisy = np.random.normal(signal, errors)

    with catch_warnings(AstroMLDeprecationWarning):
        edges = bayesian_blocks(grid, noisy, errors, fitness='measures')

    assert_allclose(edges, [0, 0.45, 0.55, 1])
def test_measures_fitness_homoscedastic():
    """Check the 'measures' fitness with one shared error against fixed edges."""
    np.random.seed(0)
    grid = np.linspace(0, 1, 11)
    signal = np.exp(-0.5 * (grid - 0.5) ** 2 / 0.01 ** 2)
    error = 0.05
    noisy = np.random.normal(signal, error)

    with catch_warnings(AstroMLDeprecationWarning):
        edges = bayesian_blocks(grid, noisy, error, fitness='measures')

    assert_allclose(edges, [0, 0.45, 0.55, 1])
def DualTreePeakProbs(data, flagMultimodal=False, saveBBlocks=False):
    '''
    Creates a Bayesian blocks histogram of the set of values found for each
    parameter for each object. The peak probability value is taken to be the
    centre of the block with highest value.

    Inputs:
        data: DualTree output array of size (#objects,mcIts,#params)
        flagMultimodal: also report, per object and parameter, whether the
            histogram has more than one interior local maximum
        saveBBlocks: also return [bins, histogram] per object and parameter

    Output:
        Neither flag set: peak probability parameter values for each object,
        a NumPy array of size (#objects,#parameters).
        flagMultimodal only: [allPeakLocs, multimo]
        saveBBlocks only: [allPeakLocs, bBlocks]
        Both flags: [allPeakLocs, multimo, bBlocks]
        (allPeakLocs is a plain nested list in those three cases.)

        If a histogram has no bins to take a maximum over, the object and
        parameter indices are printed and that raw histogram is returned
        instead.
    '''
    allPeakLocs = []  # for all objects
    multimo = []
    bBlocks = []
    for i, samples in enumerate(data):
        peakLocs = []  # for this object, each parameter
        myMultimo = []
        myBBlocks = []
        for j in range(len(data[0][0])):
            column = samples[:, j]
            bins = bayesian_blocks(column, fitness='events', p0=0.05)
            histo = np.histogram(column, bins)

            # Optional Bayesian Block Histogram storage
            if saveBBlocks:
                myBBlocks.append([bins, histo])

            try:
                nMax = np.argmax(histo[0])
            except ValueError:
                # argmax of an empty histogram; hand the histogram back for
                # inspection, as the original debugging exit did.
                print(i, j)
                return histo
            peakLocs.append((histo[1][nMax] + histo[1][nMax + 1]) / 2.)

            # Optional check for possible multimodality. Not remotely
            # rigorous: counts interior bins higher than both neighbours.
            if flagMultimodal:
                higher_than_left = histo[0][1:-1] > histo[0][:-2]
                higher_than_right = histo[0][1:-1] > histo[0][2:]
                nPeaks = np.sum(higher_than_left & higher_than_right)
                myMultimo.append(bool(nPeaks > 1))

        if flagMultimodal:
            multimo.append(myMultimo)
        if saveBBlocks:
            bBlocks.append(myBBlocks)
        allPeakLocs.append(peakLocs)

    if flagMultimodal and saveBBlocks:
        return [allPeakLocs, multimo, bBlocks]
    if flagMultimodal:
        return [allPeakLocs, multimo]
    if saveBBlocks:
        return [allPeakLocs, bBlocks]
    return np.array(allPeakLocs)
def run_bayesianblocks(times, counts, p0):
    """Run Bayesian Blocks on binned counts and re-sum the counts per block.

    ``bayesian_blocks`` is called with ``fitness="events"``.  For every pair
    of consecutive edges the entries of ``counts`` whose time lies inside
    the block (both edges inclusive) are summed.

    Returns the tuple ``(edges, binned_counts)`` where ``binned_counts`` is
    a list with one sum per block.
    """
    edges = bayesian_blocks(times, counts, fitness="events", p0=p0)

    binned_counts = [
        np.sum(counts[np.where((left <= times) & (times <= right))])
        for left, right in zip(edges[:-1], edges[1:])
    ]
    return edges, binned_counts
def DualTreePeakProbs(data, flagMultimodal=False, saveBBlocks=False):
    '''
    Creates a Bayesian blocks histogram of the set of values found for each
    parameter for each object. The peak probability value is taken to be the
    centre of the block with highest value.

    Inputs:
        data: DualTree output array of size (#objects,mcIts,#params)
        flagMultimodal: also report, per object and parameter, whether the
            histogram has more than one interior local maximum
        saveBBlocks: also return [bins, histogram] per object and parameter

    Output:
        Neither flag set: peak probability parameter values for each object,
        a NumPy array of size (#objects,#parameters).
        flagMultimodal only: [allPeakLocs, multimo]
        saveBBlocks only: [allPeakLocs, bBlocks]
        Both flags: [allPeakLocs, multimo, bBlocks]
        (allPeakLocs is a plain nested list in those three cases.)

        If a histogram has no bins to take a maximum over, the object and
        parameter indices are printed and that raw histogram is returned
        instead.
    '''
    allPeakLocs = []  # for all objects
    multimo = []
    bBlocks = []
    for i, samples in enumerate(data):
        peakLocs = []  # for this object, each parameter
        myMultimo = []
        myBBlocks = []
        for j in range(len(data[0][0])):
            column = samples[:, j]
            bins = bayesian_blocks(column, fitness='events', p0=0.05)
            histo = np.histogram(column, bins)

            # Optional Bayesian Block Histogram storage
            if saveBBlocks:
                myBBlocks.append([bins, histo])

            try:
                nMax = np.argmax(histo[0])
            except ValueError:
                # argmax of an empty histogram; hand the histogram back for
                # inspection, as the original debugging exit did.
                print(i, j)
                return histo
            peakLocs.append((histo[1][nMax] + histo[1][nMax + 1]) / 2.)

            # Optional check for possible multimodality. Not remotely
            # rigorous: counts interior bins higher than both neighbours.
            if flagMultimodal:
                higher_than_left = histo[0][1:-1] > histo[0][:-2]
                higher_than_right = histo[0][1:-1] > histo[0][2:]
                nPeaks = np.sum(higher_than_left & higher_than_right)
                myMultimo.append(bool(nPeaks > 1))

        if flagMultimodal:
            multimo.append(myMultimo)
        if saveBBlocks:
            bBlocks.append(myBBlocks)
        allPeakLocs.append(peakLocs)

    if flagMultimodal and saveBBlocks:
        return [allPeakLocs, multimo, bBlocks]
    if flagMultimodal:
        return [allPeakLocs, multimo]
    if saveBBlocks:
        return [allPeakLocs, bBlocks]
    return np.array(allPeakLocs)
def test_regular_events():
    """Check the 'regular_events' fitness on ticks with a rate change at t=5."""
    np.random.seed(0)
    dt = 0.01
    early_ticks = np.unique(np.random.randint(0, 500, 100))
    late_ticks = np.unique(np.random.randint(500, 1000, 200))
    tick_times = dt * np.concatenate([early_ticks, late_ticks])

    with catch_warnings(AstroMLDeprecationWarning):
        edges = bayesian_blocks(tick_times, fitness='regular_events', dt=dt)

    assert_(len(edges) == 3)
    assert_allclose(edges[1], 5, rtol=0.05)
def chckfile(fle, func):
    """Load cached Bayesian-block bin edges into ``ab.bins``, or compute them.

    If ``fle`` exists it is unpickled and stored in ``ab.bins`` as a float64
    array.  Otherwise the edges are computed with ``bayesian_blocks`` from
    ``ab.wts.flat``, stored in ``ab.bins`` and written out through
    ``ab.pckle``.  The resulting bins are printed.

    Parameters
    ----------
    fle : str
        Path of the pickle file holding the bin edges.
    func :
        Unused; kept so existing callers keep working.
    """
    if os.path.isfile(fle):
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # cache files that this program wrote itself.
        # The context manager closes the file even if unpickling fails.
        with open(fle, 'rb') as bayesfile:
            bypkl = pickle.load(bayesfile)
        ab.bins = np.asarray(bypkl, dtype=np.float64)
    else:
        ww = ab.wts.flat
        ab.bins = bayesian_blocks(ww)  # array of optimal bin_edges
        ab.pckle(ab.bins, fle)

    # NOTE(review): assumed to run after either branch -- confirm.
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('bins** %s' % (ab.bins,))
def _size_bins(self,hist,bin_tool,**kwargs): """Wrapper for astroML routines to choose optimal bin widths.""" if bin_tool == 'freedman': _,bins = density_estimation.freedman_bin_width(hist,return_bins=True) elif bin_tool == 'scotts': _,bins = density_estimation.scotts_bin_width(hist,return_bins=True) elif bin_tool == 'knuth': _,bins = density_estimation.knuth_bin_width(hist,return_bins=True, disp=False) elif bin_tool == 'blocks': bins = density_estimation.bayesian_blocks(hist,**kwargs) elif type(bin_tool) == type(int()) or type(bin_tool) == type(np.int64()) or type(bin_tool) == type(np.int32()): bins=bin_tool else: self.logger.warning("Unrecognized bin_tool option. Using Freedman-Diaconis rule.") _,bins = density_estimation.freedman_bin_width(hist,return_bins=True) return bins
def histogram(self, data, bin_width='knuth', weights=None, density=None,
              norm=None, ax=None, **kwargs):
    """
    Plots a histogram.

    Parameters
    ----------
    data : list or array
        Data to plot.
    bin_width : {'knuth', 'bayesian'} or float, optional
        Automatically determine the bin width using Knuth's rule
        (2006physics...5197K), with Bayesian blocks (2013ApJ...764..167S),
        or manually, choosing a floating point value.
    weights : array, optional
        An array of weights, of the same shape as `data`.  Each value in
        `data` only contributes its associated weight towards the bin count
        (instead of 1).  If `density` is True, the weights are normalized,
        so that the integral of the density over the range remains 1.
    density : bool, optional
        If False, the result will contain the number of samples in each bin.
        If True, the result is the value of the probability *density*
        function at the bin, normalised such that the *integral* over the
        range is 1.  Note that the sum of the histogram values will not be
        equal to 1 unless bins of unity width are chosen; it is not a
        probability *mass* function.
    norm : int or float
        Custom normalisation; the histogram values are divided by it.
    ax : `matplotlib.axes.Axes`, optional
        Axes instance.  Defaults to ``self.axes[0]``.
    **kwargs
        Forwarded to ``ax.plot``.

    Raises
    ------
    TypeError
        If `ax` is given but is not an `Axes` instance.
    ValueError
        If `bin_width` is neither a number nor a supported rule name.
    """
    # Axes instance:
    if ax is None:
        ax = self.axes[0]
    elif not isinstance(ax, Axes):
        raise TypeError('ax must be of type `matplotlib.axes.Axes`')

    # Convert lists and other sequences; leave ndarray subclasses alone.
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if bin_width == 'knuth':
        _, bins = knuth_bin_width(data, return_bins=True)
    elif bin_width == 'bayesian':
        bins = bayesian_blocks(data)
    elif isinstance(bin_width, (int, float, np.integer, np.floating)):
        bins = np.arange(data.min(), data.max(), bin_width)
    else:
        raise ValueError('bin_width must be a number, or one of'
                         '(`knuth`, `bayesian`)')

    # Ensure padding with empty bins: one extra edge below, two above.
    dx = np.diff(bins).min()
    bins = np.pad(bins, (1, 2), mode='linear_ramp',
                  end_values=(bins[0] - dx, bins[-1] + 2 * dx))

    # Calculate histogram:
    counts, bins = np.histogram(data, bins, weights=weights, density=density)

    if norm:
        # Plain counts are integers, and an in-place true division cannot
        # store its float result in an integer array.
        counts = counts / norm

    # Plot data at the bin centres:
    ax.plot(bins[:-1] + np.diff(bins) / 2, counts,
            drawstyle='steps-mid', **kwargs)
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Populates module-level state with a parallax density estimate built
    from the strictly positive entries of ``t['plx']``.  The two KDE methods
    only do work while ``plx_kde`` is still ``None``; the ``blocks`` method
    recomputes ``plx_bins_blocks`` and ``plx_hist_blocks`` on every call.

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination (options: scipy_kde, sklearn_kde,
        blocks).  Any other value only prints a help message.

    Returns
    -------
    None
    """
    global plx_kde
    global plx_bins_blocks
    global plx_hist_blocks

    # Strings are compared by value; identity ("is") only works by accident
    # for interned literals.
    if method == 'scipy_kde':
        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            positive = t['plx'][t['plx'] > 0.0]
            if bandwidth is None:
                plx_kde = gaussian_kde(positive)
            else:
                plx_kde = gaussian_kde(positive, bw_method=bandwidth)

    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel': 'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)

            positive = t['plx'][t['plx'] > 0.0]
            if c.kde_subset:
                # Fit on a random subset of at most 5000 parallaxes.
                plx_ran = np.copy(positive)
                np.random.shuffle(plx_ran)
                plx_kde.fit(plx_ran[0:5000, np.newaxis])
            else:
                plx_kde.fit(positive[:, np.newaxis])

    elif method == 'blocks':
        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        sample = t['plx'][t['plx'] > 0.0][0:nbins]
        bins = bayesian_blocks(sample)
        # ``normed`` no longer exists in numpy.histogram; ``density`` gives
        # a properly normalized density for unequal bin widths.
        hist, bins = np.histogram(sample, bins=bins, density=True)

        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")

    else:
        print("You must include a valid method")
        print("Options: scipy_kde, sklearn_kde or blocks")

    return
def plot_stacked_timeseries_histogram(total_counts, itemized_counts=None, ax=None):
    r"""Create a time series histogram with stacked counts labeled by category.

    Convenience function for methods from
    `astroML.density_estimation.bayesian_blocks`.

    Args:
        total_counts (collections.Counter): Total counts by time.
            Example: total_counts.items() = [(1, 5), (2, 4), ...]
                where day 1 had 5 total counts, day 2 had 4 total counts...
        itemized_counts (optional, dict): `dict` of `collections.Counter`.
            If `None` (default), histogram is not stacked.
            Keys: `hashable` label for each type of event. To preserve key
                order, use `collections.OrderedDict`.
            Values: `collections.Counter` counts by time.
            Example: itemized_counts = dict(a=counter_a, b=counter_b)
                where counter_a.items() = [(1, 1), (2, 1), ...]
                and   counter_b.items() = [(1, 4), (2, 3), ...]
            Required: The `total_counts` must equal the sum of all
                `itemized_counts`
        ax (optional, matplotlib.Axes): Axes instance on which to add the
            plot. If `None` (default), an axes instance is created.

    Returns:
        ax (matplotlib.axes): Axes instance for the plot.

    Raises:
        AssertionError: If `total_counts` does not equal the sum of all
            `itemized_counts`.
        ValueError: If `total_counts` is empty.

    See Also:
        astroML.density_estimation.bayesian_blocks

    Notes:
        * This simple implementation assumes that the times are not
            regularly spaced and that the data are counts of events.
        * Example call with ax=`None`:
            ax = plot_stacked_timeseries_histogram(
                total_counts=total_counts, itemized_counts=itemized_counts,
                ax=None)
            ax.legend(loc='upper left')
            plt.show()
        * Example call with ax defined:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax = plot_stacked_timeseries_histogram(
                total_counts=total_counts, itemized_counts=itemized_counts,
                ax=ax)
            ax.legend(loc='upper left')
            plt.show()

    """
    # Check input.
    if itemized_counts is not None:
        summed_itemized_counts = collections.Counter()
        for counter in itemized_counts.values():
            summed_itemized_counts.update(counter)
        if not total_counts == summed_itemized_counts:
            raise AssertionError(
                "`total_counts` must equal the sum of all `itemized_counts`.")
    if not total_counts:
        # zip(*...) below would otherwise fail with an opaque unpack error.
        raise ValueError("`total_counts` must contain at least one time.")

    # Calculate histogram bins.
    (times, counts) = zip(*total_counts.items())
    bin_edges = astroML_dens.bayesian_blocks(
        t=times, x=counts, fitness='events')

    # Create plot.
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    if itemized_counts is None:
        ax.hist(
            list(total_counts.elements()), bins=bin_edges, stacked=False,
            rwidth=1.0, label=None, color=sns.color_palette()[0])
    else:
        # A real list, so every stacked dataset gets its own label; a
        # dict_keys view is not a sequence of labels on Python 3.
        keys = list(itemized_counts)
        ax.hist(
            [list(itemized_counts[key].elements()) for key in keys],
            bins=bin_edges, stacked=True, rwidth=1.0, label=keys,
            color=sns.husl_palette(n_colors=len(keys)))
    return ax
def histogram(self, data, bin_width='knuth', weights=None, density=None,
              norm=None, ax=None, **kwargs):
    """
    Plots a histogram.

    Parameters
    ----------
    data : list or array
        Data to plot.
    bin_width : {'knuth', 'bayesian'} or float, optional
        Automatically determine the bin width using Knuth's rule
        (2006physics...5197K), with Bayesian blocks (2013ApJ...764..167S),
        or manually, choosing a floating point value.
    weights : array, optional
        An array of weights, of the same shape as `data`.  Each value in
        `data` only contributes its associated weight towards the bin count
        (instead of 1).  If `density` is True, the weights are normalized,
        so that the integral of the density over the range remains 1.
    density : bool, optional
        If False, the result will contain the number of samples in each bin.
        If True, the result is the value of the probability *density*
        function at the bin, normalised such that the *integral* over the
        range is 1.  Note that the sum of the histogram values will not be
        equal to 1 unless bins of unity width are chosen; it is not a
        probability *mass* function.
    norm : int or float
        Custom normalisation; the histogram values are divided by it.
    ax : `matplotlib.axes.Axes`, optional
        Axes instance.  Defaults to ``self.axes[0]``.
    **kwargs
        Forwarded to ``ax.plot``.

    Raises
    ------
    TypeError
        If `ax` is given but is not an `Axes` instance.
    ValueError
        If `bin_width` is neither a number nor a supported rule name.
    """
    # Axes instance:
    if ax is None:
        ax = self.axes[0]
    elif not isinstance(ax, Axes):
        raise TypeError('ax must be of type `matplotlib.axes.Axes`')

    # Convert lists and other sequences; leave ndarray subclasses alone.
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if bin_width == 'knuth':
        _, bins = knuth_bin_width(data, return_bins=True)
    elif bin_width == 'bayesian':
        bins = bayesian_blocks(data)
    elif isinstance(bin_width, (int, float, np.integer, np.floating)):
        bins = np.arange(data.min(), data.max(), bin_width)
    else:
        raise ValueError('bin_width must be a number, or one of'
                         '(`knuth`, `bayesian`)')

    # Ensure padding with empty bins: one extra edge below, two above.
    dx = np.diff(bins).min()
    bins = np.pad(bins, (1, 2), mode='linear_ramp',
                  end_values=(bins[0] - dx, bins[-1] + 2 * dx))

    # Calculate histogram:
    counts, bins = np.histogram(data, bins, weights=weights, density=density)

    if norm:
        # Plain counts are integers, and an in-place true division cannot
        # store its float result in an integer array.
        counts = counts / norm

    # Plot data at the bin centres:
    ax.plot(bins[:-1] + np.diff(bins) / 2, counts,
            drawstyle='steps-mid', **kwargs)
def _drop_narrow_bins(edges, min_rel_width=0.05):
    """Keep each left edge whose bin is wider than ``min_rel_width`` times
    that edge's absolute value, and always keep the last edge."""
    kept = [edges[a] for a in range(len(edges) - 1)
            if abs((edges[a + 1] - edges[a]) / edges[a]) > min_rel_width]
    return kept + [edges[-1]]


def create_scatter_hist(data, sigcutx, sigcuty, paramx, paramy, range_x,
                        range_y, dataset_id, frequencies):
    """Create the figure with eta and V histograms and a scatter plot.

    The figure is saved as ``ds<dataset_id>_scatter_hist.png`` and closed.

    Parameters
    ----------
    data : sequence of rows
        Each row is indexed as ``row[0]`` (identifier), ``row[1]`` (x value),
        ``row[2]`` (y value) and ``row[3]`` (label matched against
        ``frequencies``).
    sigcutx, sigcuty : float
        Thresholds on x and y.  Threshold lines are drawn unless both are 0.
    paramx, paramy : sequence
        ``(loc, scale)`` of the Gaussian drawn over each histogram.
    range_x, range_y : array_like
        Points at which the Gaussians are evaluated.
    dataset_id :
        Used in the output file name.
    frequencies : list
        Labels to plot.  Sorted in place (a side effect visible to the
        caller); replaced by ``["TN", "TP", "FN", "FP"]`` when it contains
        "TP".

    Returns
    -------
    numpy.ndarray
        Sorted unique identifiers of the rows with ``row[1] > sigcutx`` and
        ``row[2] > sigcuty``.
    """
    print('plotting figure: scatter histogram plot')
    frequencies.sort()
    if "TP" in frequencies:
        # if the data is classified, ensure the "frequencies" are correct
        frequencies = ["TN", "TP", "FN", "FP"]
    if "stable" in frequencies:
        freq_labels = [name.replace("_", " ") for name in frequencies]
    else:
        freq_labels = frequencies

    # Setting up the plot
    nullfmt = NullFormatter()  # no labels
    fontP = FontProperties()
    fontP.set_size('large')
    col = make_colours(frequencies)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]
    fig = plt.figure(1, figsize=(12, 12))
    axScatter = fig.add_subplot(223, position=rect_scatter)
    plt.xlabel(r'$\eta_{\nu}$', fontsize=28)
    plt.ylabel(r'$V_{\nu}$', fontsize=28)
    axHistx = fig.add_subplot(221, position=rect_histx)
    axHisty = fig.add_subplot(224, position=rect_histy)

    # Plotting data - scatter plot, one series per label
    for i, label in enumerate(frequencies):
        xdata_var = [row[1] for row in data if row[3] == label]
        ydata_var = [row[2] for row in data if row[3] == label]
        if label == 'stable':
            axScatter.scatter(xdata_var, ydata_var, color='0.75', s=10.,
                              zorder=1)
        else:
            axScatter.scatter(xdata_var, ydata_var, color=col[i], s=10.,
                              zorder=5)

    # The histograms use only the non-variable population when the labels
    # identify one, otherwise every row.
    if 'stable' in frequencies or 'TN' in frequencies:
        selected = [row for row in data if row[3] in ('stable', 'FP', 'TN')]
    else:
        selected = list(data)
    x = [row[1] for row in selected]
    y = [row[2] for row in selected]

    # Plotting histograms with bayesian blocks binning
    binsx = _drop_narrow_bins(density_estimation.bayesian_blocks(x))
    binsy = _drop_narrow_bins(density_estimation.bayesian_blocks(y))
    # ``normed`` has been removed from matplotlib's hist; ``density`` is the
    # replacement.
    axHistx.hist(x, bins=binsx, density=True, histtype='stepfilled',
                 color='b')
    axHisty.hist(y, bins=binsy, density=True, histtype='stepfilled',
                 orientation='horizontal', color='b')
    axScatter.legend(freq_labels, loc=4, prop=fontP)

    # Plotting lines representing thresholds (unless no thresholds)
    if sigcutx != 0 or sigcuty != 0:
        axHistx.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--')
        axHisty.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--')
        axScatter.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--')
        axScatter.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--')

    # Plotting the Gaussian fits
    # NOTE(review): assumed to be drawn regardless of the thresholds --
    # confirm.
    fit = norm.pdf(range_x, loc=paramx[0], scale=paramx[1])
    axHistx.plot(range_x, fit, 'k:', linewidth=2)
    fit2 = norm.pdf(range_y, loc=paramy[0], scale=paramy[1])
    axHisty.plot(fit2, range_y, 'k:', linewidth=2)

    # Final plot settings
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)
    axHistx.axes.yaxis.set_ticklabels([])
    axHisty.axes.xaxis.set_ticklabels([])
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Integer axis limits enclosing all rows; ticks are labelled as powers
    # of ten.
    all_x = [row[1] for row in data]
    all_y = [row[2] for row in data]
    xmin = int(min(all_x) - 1)
    xmax = int(max(all_x)) + 1
    ymin = int(min(all_y) - 1)
    ymax = int(max(all_y)) + 1
    xvals = list(range(xmin, xmax))
    xtxts = [r'$10^{' + str(a) + '}$' for a in xvals]
    yvals = list(range(ymin, ymax))
    ytxts = [r'$10^{' + str(a) + '}$' for a in yvals]
    axScatter.set_xlim([xmin, xmax])
    axScatter.set_ylim([ymin, ymax])
    axScatter.set_xticks(xvals)
    axScatter.set_xticklabels(xtxts, fontsize=20)
    axScatter.set_yticks(yvals)
    axScatter.set_yticklabels(ytxts, fontsize=20)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    plt.savefig('ds' + str(dataset_id) + '_scatter_hist.png')

    # find all the variable candidates
    candidate_ids = {row[0] for row in data
                     if row[1] > sigcutx and row[2] > sigcuty}
    IdTrans = np.sort(list(candidate_ids), axis=0)
    plt.close()
    return IdTrans
def MakeBlocks(self, p0):
    """Bin the stored events with Bayesian Blocks and record the result.

    ``self.evts`` is handed to ``bayesian_blocks`` together with ``p0``
    (forwarded unchanged as that function's ``p0`` keyword).  Three
    attributes are then set on the instance:

    * ``bins``     -- the edges returned by ``bayesian_blocks``
    * ``bType``    -- the tag ``"bb"`` identifying this binning scheme
    * ``binWidth`` -- ``diff`` of the stored edges
    """
    block_edges = bayesian_blocks(self.evts, p0=p0)
    self.bins = block_edges
    self.bType = "bb"
    # Computed from the stored attribute so it always matches ``self.bins``.
    self.binWidth = diff(self.bins)
def MakeBlocks(self, p0):
    """Compute Bayesian Blocks bins for ``self.evts`` and cache them.

    Parameters
    ----------
    p0
        Passed straight through to ``bayesian_blocks`` as its ``p0``
        keyword argument.

    Side effects
    ------------
    Sets ``self.bins`` to the edges returned by ``bayesian_blocks``,
    ``self.bType`` to ``"bb"`` and ``self.binWidth`` to ``diff(self.bins)``.
    """
    edges = bayesian_blocks(self.evts, p0=p0)
    self.bins = edges
    self.bType = "bb"
    self.binWidth = diff(self.bins)
def _drop_narrow_bins(edges, min_rel_width=0.05):
    """Remove the left edge of every bin that is narrow relative to that edge.

    A bin ``[lo, hi]`` keeps its left edge only when
    ``abs((hi - lo) / lo) > min_rel_width``.  The final edge is always kept.

    Returns a plain list of the surviving edges.
    """
    kept = [
        lo for lo, hi in zip(edges[:-1], edges[1:])
        if abs((hi - lo) / lo) > min_rel_width
    ]
    return kept + [edges[-1]]


def create_scatter_hist(data, sigcutx, sigcuty, paramx, paramy, range_x, range_y, dataset_id, frequencies):
    """Plot a scatter diagram with marginal histograms and return candidate ids.

    The figure has a central scatter plot of ``row[1]`` against ``row[2]``
    (one series per entry of ``frequencies``) plus a histogram of each
    variable along the top and right-hand side, binned with Bayesian Blocks.
    It is written to ``'ds<dataset_id>_scatter_hist.png'`` in the current
    working directory and then closed.

    Parameters
    ----------
    data : sequence of rows
        Each row is indexed as ``row[0]`` (identifier), ``row[1]`` (x value),
        ``row[2]`` (y value) and ``row[3]`` (label matched against
        ``frequencies``).  Tick labels are rendered as powers of ten, so the
        values are presumably log10 quantities -- confirm with the caller.
    sigcutx, sigcuty : number
        Thresholds on the x and y values.  Dashed threshold lines are drawn
        unless both are zero.
    paramx, paramy : sequence
        ``[0]`` and ``[1]`` are used as ``loc`` and ``scale`` of the Gaussian
        curves overplotted on the histograms.
    range_x, range_y : array-like
        Abscissae at which the Gaussian curves are evaluated.
    dataset_id
        Used (via ``str``) to build the output file name.
    frequencies : list of str
        Series labels.  NOTE: this list is sorted in place.  If it contains
        ``"TP"`` the series are replaced by ``["TN", "TP", "FN", "FP"]``.

    Returns
    -------
    numpy.ndarray
        Sorted unique ``row[0]`` values of all rows with
        ``row[1] > sigcutx`` and ``row[2] > sigcuty``.
    """
    print('plotting figure: scatter histogram plot')
    frequencies.sort()
    if "TP" in frequencies:
        # Classified data: always use the full, fixed set of class labels.
        frequencies = ["TN", "TP", "FN", "FP"]
    if "stable" in frequencies:
        freq_labels = [name.replace("_", " ") for name in frequencies]
    else:
        freq_labels = frequencies

    # Setting up the plot
    nullfmt = NullFormatter()  # no labels
    fontP = FontProperties()
    col = make_colours(frequencies)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]
    fig = plt.figure(1, figsize=(12, 12))
    axScatter = fig.add_subplot(223, position=rect_scatter)
    axScatter.set_xlabel(r'$\eta_{\nu}$', fontsize=28)
    axScatter.set_ylabel(r'$V_{\nu}$', fontsize=28)
    axHistx = fig.add_subplot(221, position=rect_histx)
    axHisty = fig.add_subplot(224, position=rect_histy)

    # Plotting data - scatter plot, one series per label
    for i, freq in enumerate(frequencies):
        xdata_var = [row[1] for row in data if row[3] == freq]
        ydata_var = [row[2] for row in data if row[3] == freq]
        if freq == 'stable':
            # Stable sources are drawn in grey underneath everything else.
            axScatter.scatter(xdata_var, ydata_var, color='0.75', s=10., zorder=1)
        else:
            axScatter.scatter(xdata_var, ydata_var, color=col[i], s=10., zorder=5)

    # The histograms use only the 'stable'/'FP'/'TN' rows when such labels
    # are in play; otherwise they use every row.
    if 'stable' in frequencies or 'TN' in frequencies:
        hist_rows = [row for row in data if row[3] in ('stable', 'FP', 'TN')]
    else:
        hist_rows = list(data)
    x = [row[1] for row in hist_rows]
    y = [row[2] for row in hist_rows]

    # Plotting histograms with bayesian blocks binning
    binsx = _drop_narrow_bins(density_estimation.bayesian_blocks(x))
    binsy = _drop_narrow_bins(density_estimation.bayesian_blocks(y))
    # ``density=True`` replaces the ``normed`` keyword, which matplotlib
    # deprecated in 2.1 and removed in 3.1.
    axHistx.hist(x, bins=binsx, density=True, histtype='stepfilled', color='b')
    axHisty.hist(y, bins=binsy, density=True, histtype='stepfilled',
                 orientation='horizontal', color='b')
    axScatter.legend(freq_labels, loc=4, prop=fontP)

    # Plotting lines representing thresholds (unless no thresholds)
    if sigcutx != 0 or sigcuty != 0:
        axHistx.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--')
        axHisty.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--')
        axScatter.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--')
        axScatter.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--')

    # Plotting the Gaussian fits
    fit = norm.pdf(range_x, loc=paramx[0], scale=paramx[1])
    axHistx.plot(range_x, fit, 'k:', linewidth=2)
    fit2 = norm.pdf(range_y, loc=paramy[0], scale=paramy[1])
    axHisty.plot(fit2, range_y, 'k:', linewidth=2)

    # Final plot settings
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)
    axHistx.axes.yaxis.set_ticklabels([])
    axHisty.axes.xaxis.set_ticklabels([])

    # Axis limits come from ALL rows, not only the histogram subset.
    all_x = [row[1] for row in data]
    all_y = [row[2] for row in data]
    xmin = int(min(all_x) - 1)
    xmax = int(max(all_x) + 1)
    ymin = int(min(all_y) - 1)
    ymax = int(max(all_y) + 1)
    xvals = range(xmin, xmax)
    xtxts = [r'$10^{' + str(a) + '}$' for a in xvals]
    yvals = range(ymin, ymax)
    ytxts = [r'$10^{' + str(a) + '}$' for a in yvals]
    axScatter.set_xlim([xmin, xmax])
    axScatter.set_ylim([ymin, ymax])
    axScatter.set_xticks(xvals)
    axScatter.set_xticklabels(xtxts, fontsize=20)
    axScatter.set_yticks(yvals)
    axScatter.set_yticklabels(ytxts, fontsize=20)
    # Keep the marginal histograms aligned with the scatter axes.
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    plt.savefig('ds' + str(dataset_id) + '_scatter_hist.png')
    plt.close()

    # Find all the variable candidates: rows above both thresholds,
    # de-duplicated by identifier (identifiers are assumed hashable).
    candidate_ids = dict.fromkeys(
        row[0] for row in data if row[1] > sigcutx and row[2] > sigcuty
    )
    IdTrans = np.sort(list(candidate_ids), axis=0)
    return IdTrans