def update_map(indicator):
    """Return a USA county choropleth figure for one indicator column.

    The values of ``all_counties[indicator]`` are coloured using 20
    equal-width bins over the fixed range 0 to 100 (21 binning endpoints).

    Parameters
    ----------
    indicator : str
        Column of the module-level ``all_counties`` table to plot; it is
        also used in the figure title.
    """
    county_fips = all_counties['fips_county'].tolist()
    county_values = all_counties[indicator].tolist()
    endpoints = np.histogram_bin_edges(county_values, bins=20, range=[0, 100])
    return ff.create_choropleth(
        fips=county_fips,
        values=county_values,
        scope=['usa'],
        title=f'{indicator} by county',
        binning_endpoints=endpoints.tolist(),
        showlegend=True,
    )
def digitize_non_genes_data(col):
    """Map a numeric column onto small non-negative integer levels.

    NaN entries are replaced by -1 before digitizing, so they fall below
    every bin edge and end up in the lowest level. The bin edges themselves
    are computed from the column with NaN replaced by its smallest non-NaN
    value, using at most 14 equal-width bins.

    Parameters
    ----------
    col : array_like
        One-dimensional numeric data, possibly containing NaN.

    Returns
    -------
    numpy.ndarray
        Integer level per element, shifted so that the minimum is 0.
    """
    filled = numpy.nan_to_num(col, nan=-1)
    # Never use more than 15 distinct levels.
    n_levels = min(len(set(filled)), 15)
    # A column with a single distinct value would otherwise request 0 bins,
    # which numpy rejects.
    n_bins = max(n_levels - 1, 1)
    # nanmin ignores NaN; ndarray.min() would return NaN here and leave the
    # NaNs in place, making the bin range non-finite.
    lowest = numpy.nanmin(col)
    edges = numpy.histogram_bin_edges(
        numpy.nan_to_num(col, nan=lowest), bins=n_bins)
    digits = numpy.digitize(filled, edges)
    return digits - digits.min()
def test_histogram_multiple(self, long_df, triple_args):
    """Check ``Hist`` output for every (a, s) group against ``np.histogram``.

    The reference bin edges come from the whole ``"x"`` column, so every
    group is compared on the same bins.
    """
    stat = Hist()
    result = stat(long_df, *triple_args)

    shared_edges = np.histogram_bin_edges(long_df["x"], "auto")

    for (a_val, s_val), group in result.groupby(["a", "s"]):
        in_group = (long_df["a"] == a_val) & (long_df["s"] == s_val)
        expected_counts, expected_edges = np.histogram(
            long_df.loc[in_group, "x"], bins=shared_edges)

        assert_array_equal(group["y"], expected_counts)
        assert_array_equal(group["space"], np.diff(expected_edges))
def to_operation(self, inputs, outputs, **kwargs):
    """Build the Map -> ReduceByKey -> Map node chain that averages values.

    The ``outputs`` argument is ignored: it is immediately replaced by
    ``self.output_vars()``. Every node receives ``**kwargs`` unchanged.

    When ``self.values['binned']`` is truthy, keys are digitized against
    equal-width bin edges built from ``self.values['min']``,
    ``self.values['max']`` and ``self.values['bins']``; otherwise the
    ``inputs['Bin']`` / ``inputs['Value']`` entries are used directly.

    Returns
    -------
    list
        The three graph nodes, in execution order.
    """
    outputs = self.output_vars()

    if not self.values['binned']:
        map_outputs = [self.name()+'_map_count']
        reduce_outputs = [self.name()+'_reduce_count']

        def mean(d):
            # d maps key -> (sum, count); return sorted keys and their means.
            res = {k: v[0]/v[1] for k, v in d.items()}
            keys, values = zip(*sorted(res.items()))
            return np.array(keys), np.array(values)

        return [
            gn.Map(name=self.name()+'_map',
                   inputs=[inputs['Value']], outputs=map_outputs,
                   func=lambda a: (a, 1), **kwargs),
            gn.ReduceByKey(name=self.name()+'_reduce',
                           inputs=[inputs['Bin']]+map_outputs,
                           outputs=reduce_outputs,
                           reduction=lambda cv, v: (cv[0]+v[0], cv[1]+v[1]),
                           **kwargs),
            gn.Map(name=self.name()+'_mean',
                   inputs=reduce_outputs, outputs=outputs,
                   func=mean, **kwargs),
        ]

    lower, upper = self.values['min'], self.values['max']
    bins = np.histogram_bin_edges(np.arange(lower, upper),
                                  bins=self.values['bins'],
                                  range=(lower, upper))
    map_outputs = [self.name()+'_bin', self.name()+'_map_count']
    reduce_outputs = [self.name()+'_reduce_count']

    def func(k, v):
        # Key becomes the bin index; value becomes a (sum, count) seed.
        return np.digitize(k, bins), (v, 1)

    def mean(d):
        # Start every bin edge at 0 so bins without data still appear.
        res = {edge: 0 for edge in bins}
        for k, v in d.items():
            try:
                res[bins[k]] = v[0]/v[1]
            except IndexError:
                # NOTE(review): np.digitize yields len(bins) for keys at or
                # above the last edge; those entries are dropped here.
                pass
        keys, values = zip(*sorted(res.items()))
        return np.array(keys), np.array(values)

    return [
        gn.Map(name=self.name()+'_map',
               inputs=inputs, outputs=map_outputs,
               func=func, **kwargs),
        gn.ReduceByKey(name=self.name()+'_reduce',
                       inputs=map_outputs, outputs=reduce_outputs,
                       reduction=lambda cv, v: (cv[0]+v[0], cv[1]+v[1]),
                       **kwargs),
        gn.Map(name=self.name()+'_mean',
               inputs=reduce_outputs, outputs=outputs,
               func=mean, **kwargs),
    ]
def get_bin_edges(query_return_df, label_template):
    """Compute histogram bin edges, bar centers and labels for ``VALUE``.

    Edges use the Freedman-Diaconis estimator on a random resample of the
    non-NaN values (sized by the number of unique ``GEOID`` entries), with
    the overall min and max always included. If the kurtosis reported by
    ``pearson_kurtosis`` is 10 or more, the edges at or above the 95th
    percentile are collapsed into one final wide bin.

    Parameters
    ----------
    query_return_df : DataFrame-like with ``VALUE`` and ``GEOID`` columns.
    label_template : callable taking ``(lower_edge, upper_edge)`` and
        returning the label for that bin.

    Returns
    -------
    tuple
        ``(edges, bar_centers, labels)``.
    """
    def _labels_for(edges):
        return [label_template(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]

    values = query_return_df['VALUE'].values
    values = values[~np.isnan(values)]

    # Resample down to an annual sample size, because the sample size feeds
    # into the bin-width estimate. Done once rather than averaged over
    # several resamples, for efficiency.
    n_draws = len(query_return_df['GEOID'].unique()) - 2
    sampled_data = list(np.random.choice(values, size=n_draws, replace=True))

    # Force the extremes into the sample so the edges span the full range.
    sampled_data.append(max(values))
    sampled_data.append(min(values))

    # Freedman-Diaconis estimator
    bin_edges = np.histogram_bin_edges(sampled_data, bins='fd')
    bar_centers = [
        round((lo + hi) / 2.0, 2)
        for lo, hi in zip(bin_edges[:-1], bin_edges[1:])
    ]

    # Kurtosis selects the binning strategy: near 0 is close to normal,
    # while some of the data reach values around 80.
    k = pearson_kurtosis(values)
    if k < 10:
        return bin_edges, bar_centers, _labels_for(bin_edges)

    # Heavy tails: keep the regular edges below the 95th percentile and
    # close the sequence with the overall maximum edge.
    threshold_95 = np.quantile(values, 0.95)
    edges_threshold = list(bin_edges[bin_edges < threshold_95])
    edges_threshold.append(max(bin_edges))

    # Reuse the regular centers so the final wide bar is not stretched.
    # NOTE(review): this slice can return more centers than there are bars
    # (len(edges_threshold) - 1); confirm what the caller expects.
    bar_centers_threshold = bar_centers[0:len(edges_threshold) + 1]

    return edges_threshold, bar_centers_threshold, _labels_for(edges_threshold)
def test_histogram_bin_edges_execution(setup):
    """Compare tensor ``histogram_bin_edges`` results with NumPy's."""
    rng = np.random.RandomState(0)

    data = rng.randint(10, size=(20, ))
    data_t = tensor(data, chunk_size=6)

    # explicit range
    for bounds in [(0, 10), (3, 11), (3, 7)]:
        actual = histogram_bin_edges(data_t, range=bounds).execute().fetch()
        np.testing.assert_array_equal(
            actual, np.histogram_bin_edges(data, range=bounds))

    single = rng.randint(10, size=(1, ))
    single_t = tensor(single)
    empty = rng.randint(10, size=(0, ))
    empty_t = tensor(empty)

    cases = [(data_t, data), (single_t, single), (empty_t, empty),
             (sort(data_t), data)]
    for t, r in cases:
        # integer count and every named estimator
        estimators = [
            10, 'stone', 'auto', 'doane', 'fd', 'rice', 'scott', 'sqrt',
            'sturges'
        ]
        for bins in estimators:
            actual = histogram_bin_edges(t, bins=bins).execute().fetch()
            np.testing.assert_array_equal(
                actual, np.histogram_bin_edges(r, bins=bins))

        # explicit edges, given as a list and as a tensor
        explicit = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
        for bins in explicit:
            actual = histogram_bin_edges(t, bins=bins).execute().fetch()
            np.testing.assert_array_equal(
                actual, np.histogram_bin_edges(r, bins=[0, 4, 8]))

    # defaults: the lazy result must report the right shape as well
    data = np.arange(5)
    lazy_edges = histogram_bin_edges(tensor(data, chunk_size=3))
    actual = lazy_edges.execute().fetch()
    reference = np.histogram_bin_edges(data)
    assert lazy_edges.shape == reference.shape
    np.testing.assert_array_equal(actual, reference)
def test_bin_precision(self):
    """Ensure the RDF bin edges match NumPy's equal-width edges to 1e-6."""
    n_bins, lower, upper = 500, 0, 50

    rdf = freud.density.RDF(bins=n_bins, r_max=upper, r_min=lower)

    # The data passed to NumPy is irrelevant: the explicit range fixes the
    # edges.
    reference = np.histogram_bin_edges(
        np.array([0], dtype=np.float32), bins=n_bins, range=[lower, upper])

    npt.assert_allclose(rdf.bin_edges, reference, atol=1e-6)
def histogram_bin_edges(a, bins=10, range=None, weights=None, log=False):
    """Compute histogram bin edges with type-aware handling and a size cap.

    Behaviour by input:

    * ``bins`` already an array (per ``is_array``): returned unchanged.
    * datetime64 data: edges are computed on the offsets from the minimum
      and converted back to datetimes (``log`` is not forwarded).
    * bool / str data: the unique values are returned as the edges.
    * integer / floating data: delegated to ``np.histogram_bin_edges``,
      optionally in log10 space.

    Results with more than ``n_bins_limit`` edges are replaced by
    ``n_bins_limit`` evenly spaced edges over the same span.

    Parameters
    ----------
    a : array_like
        Input data.
    bins : int, str or array
        Forwarded to ``np.histogram_bin_edges``.
    range : tuple, optional
        ``(lower, upper)``; defaults to the data min and max. In log mode it
        is forwarded unchanged to the computation on ``log10(a)``.
    weights : array_like, optional
        Forwarded to ``np.histogram_bin_edges``.
    log : bool
        If true, compute the edges on ``log10(a)`` and return ``10 ** edges``.

    Raises
    ------
    NotImplementedError
        For dtypes other than those listed above.
    """
    if is_array(bins):
        return bins

    a = a if isinstance(a, np.ndarray) else np.asarray(a)

    if np.issubdtype(a.dtype, np.datetime64):
        bins = histogram_bin_edges((a - np.min(a)).astype(float), bins, range, weights)
        time_unit = np.datetime_data(a.dtype)[0]
        bins = np.min(a) + bins.astype(f"timedelta64[{time_unit}]")
        return bins
    elif is_integer(a):
        pass
    elif is_floating(a):
        pass
    elif np.issubdtype(a.dtype, np.bool_) or np.issubdtype(a.dtype, np.str_):
        # Not implemented yet that bins affect nothing
        bins = np.unique(a)
        return bins
    else:
        raise NotImplementedError(f"Unexpected type: {a.dtype}")

    try:
        if log:
            log_a = np.log10(a)
            # Drop NaN (negative inputs) and also +/-inf: log10(0) is -inf,
            # which would make the bin range non-finite and make numpy raise.
            log_a = log_a[np.isfinite(log_a)]
            bins = 10 ** histogram_bin_edges(log_a, bins, range, weights, log=False)
        else:
            if range is None:
                range = (np.min(a), np.max(a))
            if (
                (range[1] - range[0] < 1e7)
                or (bins is not None)
            ):
                bins = np.histogram_bin_edges(a, bins, range, weights)
            else:
                warnings.warn(
                    "It may cause memory leak and hang"
                    f" due to significantly different between range first and second: {range}\n"
                    f"Use bin size {n_bins_limit}"
                )
                bins = np.linspace(*range, n_bins_limit)

        if bins.size > n_bins_limit:
            warnings.warn(f"Huge bin size {bins.size} -> {n_bins_limit}")
            bins = np.linspace(np.min(bins), np.max(bins), n_bins_limit)
    except MemoryError as e:
        # Fall back to a fixed number of evenly spaced edges.
        warnings.warn(f"Encountered MemoryError: {e}")
        bins = np.linspace(np.min(a), np.max(a), n_bins_limit)

    return bins
def setHistogram(
    self,
    data: np.ndarray,
    bins: Union[int, str] = "auto",
    min_bins: int = 16,
    max_bins: int = 128,
) -> None:
    """Draw 'data' as a histogram.

    The bin edges span the 5th to 95th percentile of the data. If the
    requested binning gives more edges than 'max_bins' or fewer than
    'min_bins', the edges are recomputed with that limit as the bin count.

    Args:
        data: hist data
        bins: passed to np.histogram_bin_edges
        min_bins: minimum number of bins
        max_bins: maximum number of bins
    """
    vmin, vmax = np.percentile(data, 5), np.percentile(data, 95)
    limits = (vmin, vmax)

    barset = QtCharts.QBarSet("histogram")
    barset.setColor(sequential[1])
    barset.setLabelColor(light_theme["text"])

    bin_edges = np.histogram_bin_edges(data, bins=bins, range=limits)
    # The comparisons use the number of edges, not the number of bins.
    n_edges = bin_edges.size
    if n_edges > max_bins:
        bin_edges = np.histogram_bin_edges(data, bins=max_bins, range=limits)
    elif n_edges < min_bins:
        bin_edges = np.histogram_bin_edges(data, bins=min_bins, range=limits)

    hist, edges = np.histogram(data, bins=bin_edges)
    barset.append(list(hist))

    self.series.clear()
    self.series.append(barset)

    # Two x axes are updated: one in bar-index units, one in data units.
    self._xaxis.setRange(-0.5, hist.size - 0.5)
    self.xaxis.setRange(edges[0], edges[-1])
    self.yaxis.setRange(0, np.amax(hist))
    self.yaxis.applyNiceNumbers()
def find_centroids(self):
    """Return the coordinates of the local maxima of the event image.

    This (crude) algorithm lays a Freedman-Diaconis grid over the "X" and
    "Y" columns of the file's first extension and, in each grid cell, takes
    the most frequently occurring (x, y) pair. The picks are ordered by
    their count, highest first.

    Returns
    -------
    tuple of numpy.ndarray
        ``(centroid_xs, centroid_ys)`` as two 1D arrays.
    """
    with fits.open(f"{self.file_path}") as file:
        # load in RA/Dec coordinates
        xs = np.array(file[1].data["X"])
        ys = np.array(file[1].data["Y"])

    # determine x and y bins
    xbinedges = np.histogram_bin_edges(xs, bins='fd')
    ybinedges = np.histogram_bin_edges(ys, bins='fd')

    image_maxes = []
    for ylo, yhi in zip(ybinedges[:-1], ybinedges[1:]):
        for xlo, xhi in zip(xbinedges[:-1], xbinedges[1:]):
            # Cells are half-open: lower edge included, upper edge excluded.
            in_cell = (xs < xhi) & (xs >= xlo) & (ys < yhi) & (ys >= ylo)
            pair_counts = Counter(zip(xs[in_cell], ys[in_cell]))
            # most_common(1) is empty for an empty cell, so nothing is added.
            image_maxes.extend(pair_counts.most_common(1))

    # Highest count first; ties keep their scan order (stable sort).
    image_maxes.sort(key=lambda entry: entry[1], reverse=True)

    centroid_xs = np.array([pair[0] for pair, _count in image_maxes])
    centroid_ys = np.array([pair[1] for pair, _count in image_maxes])
    return centroid_xs, centroid_ys
def generate_plot(context, trips):
    """Save a histogram of trip lengths and log an asset materialization.

    Trip length is ``end_time - start_time`` in minutes, binned into 15
    equal-width bins. The figure is written to ``trip_lengths.png`` and then
    closed, so repeated runs do not accumulate open pyplot figures.

    Parameters
    ----------
    context
        Object providing ``log_event``.
    trips
        Table-like object with ``start_time`` and ``end_time`` columns.
    """
    minute_lengths = [
        x.total_seconds() / 60 for x in trips.end_time - trips.start_time
    ]
    bin_edges = np.histogram_bin_edges(minute_lengths, 15)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.set(title="Trip lengths", xlabel="Minutes", ylabel="Count")
        ax.hist(minute_lengths, bins=bin_edges)
        fig.savefig("trip_lengths.png")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

    context.log_event(
        AssetMaterialization(asset_key="trip_dist_plot",
                             description="Distribution of trip lengths."))
def plot_min_distances(turbines, distances, title='', factors=None, quantiles=None):
    """Plot closest-turbine distance against binned rotor diameter.

    Turbines without a rotor diameter (NaN ``t_rd``) are left out. Rotor
    diameters are grouped into 15 equal bins between 5 and 155. The figure
    shows a strip plot of the distances below 500, one line per distance
    factor (factor times bin center) and one line per distance quantile in
    each bin.

    Parameters
    ----------
    turbines : object with a ``t_rd`` array of rotor diameters.
    distances : distance to the closest turbine; multiplied by 1e3 before
        plotting (presumably km to m -- confirm with the caller).
    title : str, optional
        Figure title; omitted when empty.
    factors : sequence of float, optional
        Defaults to ``DISTANCE_FACTORS``.
    quantiles : sequence of float, optional
        Defaults to ``(0.05, 0.1, 0.2, 0.3)``.

    Returns
    -------
    The created matplotlib figure.
    """
    # Infinite distances become NaN.
    distances = distances.where(distances < np.inf)

    has_diameter = ~np.isnan(turbines.t_rd)
    rotor_diameters_m = turbines.t_rd[has_diameter]
    min_distances_m = distances[has_diameter] * 1e3

    bin_edges = np.histogram_bin_edges(rotor_diameters_m, bins=15, range=(5, 155))
    bin_idcs = np.digitize(rotor_diameters_m, bin_edges)
    bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2.

    fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

    below_cutoff = min_distances_m < 500
    # np.digitize indices start at 1 for the first bin.
    strip_x = bin_centers[bin_idcs[below_cutoff] - 1]
    ax = sns.stripplot(x=strip_x, y=min_distances_m[below_cutoff],
                       jitter=.4, size=1, color='k')

    factor_colors = '#c72321', '#fbd7a9', '#f0c220', '#7a6952',
    factors = factors or DISTANCE_FACTORS
    for color, factor in zip(factor_colors, factors):
        ax.plot(factor * bin_centers, label=f'{factor:.2f}x', color=color)

    quantile_colors = '#246b71', '#6a9395', '#84bcbf', '#9bdade'
    quantiles = quantiles or (0.05, 0.1, 0.2, 0.3)
    for q, color in zip(quantiles, quantile_colors):
        per_bin = pd.Series(min_distances_m).groupby(bin_idcs).quantile(q=q)
        ax.plot(per_bin.values, label=f"{int(q * 100)}% quantile", color=color)

    ax.set_ylim(0, 500)
    ax.set_xlim(0, 11)
    plt.ylabel('Distance to closest turbine [m]')
    plt.xlabel('Rotor diameter [m]')
    if title:
        plt.title(title)
    plt.legend()
    plt.grid()

    return fig
def lr_histogram(lrs, y, bins=20, ax=plt):
    """
    plots the 10log lrs

    Two semi-transparent, density-normalised histograms are drawn on shared
    bin edges, one for each group returned by ``util.Xy_to_Xn``.

    :param lrs: likelihood ratios
    :param y: labels used to split the log10 LRs into the two groups
    :param bins: forwarded to ``np.histogram_bin_edges``
    :param ax: a matplotlib Axes, or the ``pyplot`` module (default), in
        which case the current axes are used
    """
    log_lrs = np.log10(lrs)
    bins = np.histogram_bin_edges(log_lrs, bins=bins)
    points0, points1 = util.Xy_to_Xn(log_lrs, y)

    # The pyplot module offers hist() but not the Axes-style set_xlabel() /
    # set_ylabel(); resolve it to the current axes so the default works.
    if not hasattr(ax, 'set_xlabel'):
        ax = ax.gca()

    ax.hist(points0, bins=bins, alpha=.25, density=True)
    ax.hist(points1, bins=bins, alpha=.25, density=True)
    ax.set_xlabel('10log likelihood ratio')
    ax.set_ylabel('count')
def histogram(xin, bins=100, norm=False, logx=True, logbase=10, density=False):
    """Histogram ``xin`` on linear or logarithmically spaced bins.

    Parameters
    ----------
    xin : numpy.ndarray
        Input samples. With ``logx`` only the strictly positive ones are used.
    bins : int
        With ``logx=False`` this is forwarded to ``np.histogram_bin_edges``.
        With ``logx=True`` it is the number of log-spaced *edges*, so the
        result has ``bins - 1`` bins.
    norm : bool
        Divide the counts by their integral over the bin widths.
    logx : bool
        Use edges evenly spaced in log space (base ``logbase``).
    logbase
        Passed to ``get_log_pow`` to get the log and power functions.
    density : bool
        Forwarded to ``np.histogram``.

    Returns
    -------
    tuple
        ``(values, edges)``.
    """
    if not logx:
        xi = xin
        bin_arr = np.histogram_bin_edges(xi, bins=bins)
    else:
        flog, fpow = get_log_pow(logbase)
        xi = xin[xin > 0]
        bin_arr = fpow(np.linspace(flog(xi.min()), flog(xi.max()), bins))

    counts, edges = np.histogram(xi, bins=bin_arr, density=density)
    if norm:
        area = np.dot(counts, np.diff(edges))
        counts = counts / area
    return counts, edges
def define_bins_from_samples(self, samples):
    """Recompute ``self.bins`` from sample data.

    For each of the ``self.n_vars`` variables (columns of ``samples``) the
    equal-width edges for ``self.bin_sizes[v]`` bins are computed, and only
    the interior edges are stored (first and last edge dropped).

    Parameters
    ----------
    samples : numpy.ndarray or array_like
        2D data, one column per variable. Anything that is not exactly an
        ``ndarray`` is converted with ``np.array``.
    """
    if type(samples) is not np.ndarray:
        samples = np.array(samples)

    self.bins = []
    self.bins.extend(
        np.histogram_bin_edges(samples[:, v], bins=self.bin_sizes[v])[1:-1]
        for v in range(self.n_vars)
    )
def freedman_diaconis(x: np.ndarray) -> np.ndarray:
    """
    Bin edges from the Freedman-Diaconis (FD) rule.

    The bin width grows with the interquartile range (IQR) and shrinks with
    the cube root of the sample size. It can be too conservative for small
    datasets but works well for large ones, and the IQR makes it robust to
    outliers.

    :param x: np.ndarray
        The 1-dimensional x-data to bin.
    :return: np.ndarray
        The bins edges computed using the FD method.
    """
    fd_edges = np.histogram_bin_edges(x, bins='fd')
    return fd_edges
def fit(self, eps=0.9, min_samples=3):
    """Cluster ``self.X`` with DBSCAN, scaling ``eps`` by a histogram width.

    ``eps`` is multiplied by the width of the first 'auto' histogram bin of
    ``self.x`` before it is handed to the clusterer.

    Parameters
    ----------
    eps : float
        Neighbourhood radius, in units of the 'auto' bin width.
    min_samples : int
        Forwarded to the clusterer.

    Returns
    -------
    dict
        ``n_clusters`` (noise label -1 not counted), ``n_noise`` (number of
        points labelled -1) and ``labels``.
    """
    # Compute the bin width once; it is both the scale and a debug value.
    bin_width = np.diff(np.histogram_bin_edges(self.x, bins='auto'))[0]
    eps *= bin_width

    print(bin_width)
    print(median_absolute_deviation(self.x))

    # Everything is passed by keyword: assuming skDBSCAN is scikit-learn's
    # DBSCAN, parameters after ``eps`` are keyword-only in current releases.
    db = skDBSCAN(eps=eps,
                  min_samples=min_samples,
                  metric='euclidean',
                  metric_params=None,
                  algorithm='auto',
                  leaf_size=30,
                  p=None,
                  n_jobs=None).fit(self.X)
    labels = db.labels_

    # @Note: Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)

    return {"n_clusters": n_clusters, "n_noise": n_noise, "labels": labels}
def compute_quantization_parameters(self, ranges='q99', clip=True, center=False):
    """ Make bins for int8 quantization and convert value-stats.

    Stores ``qnt_ranges``, ``qnt_bins`` (255 edges for 254 bins),
    ``qnt_clip`` and ``qnt_center`` on the instance, then derives the
    quantized statistics (``qnt_min``, ``qnt_max``, ``qnt_mean``,
    ``qnt_std``, quantiles) and ``qnt_error``, the mean absolute
    restoration error divided by ``v_std``.

    Parameters
    ----------
    ranges : str or sequence of two numbers
        Ranges to quantize data to. Available options are:
            - `q95`, `q99`, `q999` to clip data to respective quantiles.
            - `same` keep the same range of data.
        A named option gives a symmetric range ``(-r, +r)``. An explicit
        two-element sequence (tuple or list) is used as given.
    clip : bool
        Whether to clip data to selected ranges.
    center : bool
        Whether to make data have 0-mean before quantization.
    """
    ranges_dict = {
        'q95': min(abs(self.v_q05), abs(self.v_q95)),
        'q99': min(abs(self.v_q01), abs(self.v_q99)),
        'q999': min(abs(self.v_q001), abs(self.v_q999)),
        'same': max(abs(self.v_min), abs(self.v_max)),
    }
    # Only strings are looked up: testing a list for dict membership raises
    # TypeError because lists are unhashable.
    if isinstance(ranges, str) and ranges in ranges_dict:
        half_width = ranges_dict[ranges]
        ranges = (-half_width, +half_width)

    if center:
        ranges = tuple(item - self.v_mean for item in ranges)

    self.qnt_ranges = ranges
    # No data is needed because the range is explicit. The builtin float is
    # used because the np.float alias was removed from NumPy.
    self.qnt_bins = np.histogram_bin_edges([], bins=254, range=ranges).astype(float)
    self.qnt_clip = clip
    self.qnt_center = center

    # Compute quantized statistics
    quantized_tc = self.quantize(self.trace_container)
    self.qnt_min, self.qnt_max = self.quantize(self.v_min), self.quantize(self.v_max)
    self.qnt_mean, self.qnt_std = np.mean(quantized_tc), np.std(quantized_tc)
    self.qnt_q001, self.qnt_q01, self.qnt_q05 = np.quantile(
        quantized_tc, [0.001, 0.01, 0.05])
    self.qnt_q999, self.qnt_q99, self.qnt_q95 = np.quantile(
        quantized_tc, [0.999, 0.99, 0.95])

    # Estimate difference after quantization: shift the quantized values by
    # 127 so they can index the bin-edge array.
    quantized_tc += 127
    restored_tc = self.qnt_bins[quantized_tc]
    self.qnt_error = np.mean(np.abs(restored_tc - self.trace_container)) / self.v_std
def __call__(self, loaded_data_tuple, metadata, random_state):
    """Discretize ``loaded_data_tuple.data`` into bin indices.

    Edges come from ``np.histogram_bin_edges`` with ``self.bins``. When
    ``self.bins`` is a scalar (a count or an estimator name) the lowest edge
    is dropped, so the minimum falls into bin 0. With ``self.use_one_hot``
    the indices are expanded along a new last axis of size
    ``len(bin_edges) + 1``.

    Parameters
    ----------
    loaded_data_tuple
        Object with a ``data`` array, accepted by ``replace``.
    metadata, random_state
        Unused here.

    Returns
    -------
    A copy of ``loaded_data_tuple`` (via ``replace``) with the discretized
    data.
    """
    # The original passed the *builtin* ``range`` type as the histogram
    # range, which numpy cannot unpack for scalar/str bins. Use the
    # instance's ``range`` attribute when it has one, else the data range.
    # NOTE(review): confirm the transform is meant to expose ``self.range``.
    value_range = getattr(self, 'range', None)
    bin_edges = np.histogram_bin_edges(
        loaded_data_tuple.data, self.bins, value_range)
    if np.isscalar(self.bins):
        bin_edges = bin_edges[1:]

    data = np.digitize(loaded_data_tuple.data, bin_edges, right=True)

    if self.use_one_hot:
        flat = np.reshape(data, -1)
        one_hot = np.zeros((flat.size, len(bin_edges) + 1), data.dtype)
        # Set one column per element in a single vectorized assignment.
        one_hot[np.arange(flat.size), flat] = 1
        data = np.reshape(one_hot, data.shape + (one_hot.shape[-1], ))

    return replace(loaded_data_tuple, data=data)
def equal_number_FD(x: np.ndarray) -> np.ndarray:
    """
    Equal-count bin edges, with the bin count taken from the FD rule.

    The number of bins is the one the Freedman-Diaconis estimator would
    use, but the edges are read off the sorted data at evenly spaced ranks,
    so each bin holds about the same number of points. The last rank equals
    ``len(x)``, one past the final index, so ``np.interp`` clamps it to the
    maximum of ``x``.

    :param x: np.ndarray
        The 1-dimensional x-data to bin.
    :return: np.ndarray
        The bins edges computed using the equal-N method.
    """
    n_points = len(x)
    n_edges = len(np.histogram_bin_edges(x, bins='fd'))
    ranks = np.linspace(0, n_points, n_edges)
    return np.interp(ranks, np.arange(n_points), np.sort(x))
def plot_sim_data(sims_data=None, normalize=True, bin_width=None):
    """Overlay histograms of several simulation outputs on one axes.

    All series share one bin width. When ``bin_width`` is not given, the
    Freedman-Diaconis width of every series is computed and the largest is
    used. A constant series (all values equal) is drawn as a single bar
    instead of a histogram.

    Parameters
    ----------
    sims_data : dict
        Maps series name to a 1D array of values. Required despite the
        ``None`` default.
    normalize : bool
        Draw densities instead of counts (a constant series gets height 1.0).
    bin_width : float, optional
        Common bin width; derived from the data when falsy.

    Returns
    -------
    tuple
        ``(fig, ax)``.
    """
    if not bin_width:
        # First-bin width of each series under the FD rule; keep the widest.
        bin_width = max(
            edges[1] - edges[0]
            for edges in (np.histogram_bin_edges(sim_data, bins='fd')
                          for sim_data in sims_data.values())
        )

    fig, ax = plt.subplots()
    prop_iter = iter(plt.rcParams['axes.prop_cycle'])

    for sim_name, sim_data in sims_data.items():
        color = next(prop_iter)['color']
        if not all(sim_data == sim_data[0]):
            lowest, highest = min(sim_data), max(sim_data)
            # Extend the stop by one width so the last edge reaches the
            # maximum; np.arange excludes its stop, which would otherwise
            # drop the largest samples from the histogram.
            input_bin_edges = np.arange(lowest, highest + bin_width, bin_width)
            ax.hist(sim_data,
                    bins=input_bin_edges,
                    density=normalize,
                    color=color,
                    alpha=0.5,
                    label=sim_name)
        else:
            # Constant input: one bar at that value.
            height = 1.0 if normalize else len(sim_data)
            ax.bar(sim_data[0], height, bin_width,
                   color=color, alpha=0.5, label=sim_name)

    ax.set_ylabel('PDF (%)')
    ax.set_xlabel('Value')
    return fig, ax
def fit(self, x: Union[list, np.ndarray], y=None):
    """Learn per-column bin edges and the matching bit patterns.

    For each column in ``self.columns`` (all columns when unset) the edges
    are either the unique quantiles of the data (``self.quantile_based``) or
    equal-width edges between the column's minimum and maximum. Minimum and
    maximum come from ``self.min_value`` / ``self.max_value`` via
    ``self._validate_value`` and fall back to the data when that result is
    falsy. ``possible_values_[c]`` holds, for ``n`` bits, the ``n + 1``
    rows ``[1]*k + [0]*(n - k)`` for ``k = 0..n``.

    Parameters
    ----------
    x : list or numpy.ndarray
        Training data; 1D input is treated as a single column.
    y : ignored

    Returns
    -------
    self
    """
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    n_columns = x.shape[1]

    # parameter validation
    if not self.columns:
        self.columns = list(range(n_columns))
    self.min_value_by_column_ = self._validate_value(self.min_value)
    self.max_value_by_column_ = self._validate_value(self.max_value)
    if isinstance(self.n_bits, list):
        self.n_bits_by_column_ = self.n_bits
    else:
        self.n_bits_by_column_ = [self.n_bits] * n_columns

    # fitting: derive missing limits from the data
    if not self.min_value_by_column_:
        self.min_value_by_column_ = [np.min(x[:, c]) for c in self.columns]
    if not self.max_value_by_column_:
        self.max_value_by_column_ = [np.max(x[:, c]) for c in self.columns]

    self.possible_values_ = dict()
    self.bins_ = dict()
    for position, c in enumerate(self.columns):
        # Bit counts are looked up by column id; limits by position in
        # self.columns.
        n_bits = self.n_bits_by_column_[c]
        if self.quantile_based:
            levels = np.linspace(0, 1, n_bits + 1)
            self.bins_[c] = np.unique(
                np.quantile(x[:, c], levels, interpolation='higher'))
        else:
            limits = (self.min_value_by_column_[position],
                      self.max_value_by_column_[position])
            self.bins_[c] = np.histogram_bin_edges([], bins=n_bits, range=limits)

        patterns = [ones * [1] + (n_bits - ones) * [0]
                    for ones in range(n_bits + 1)]
        self.possible_values_[c] = np.array(patterns)

    return self
def discretize_continous_label(labels, bins='sturges', verbose=False):
    """Turn continuous labels into integer bin indices.

    Parameters
    ----------
    labels : array_like
        Continuous values to discretize.
    bins : int, str or sequence
        Forwarded to ``np.histogram_bin_edges``. The default 'sturges' is
        conservative for fairly large datasets (N > 1000).
    verbose : bool
        Print the global histogram and the per-bin counts.

    Returns
    -------
    numpy.ndarray
        Bin index per label. The lowest edge is left out of the digitizing
        step and intervals are closed on the right, so the minimum maps to 0
        and the maximum to the last bin.
    """
    edges = np.histogram_bin_edges(labels, bins=bins)
    if verbose:
        print('Global histogram:\n',
              np.histogram(labels, bins=edges, density=False),
              flush=True)

    upper_edges = edges[1:]
    discretization = np.digitize(labels, upper_edges, right=True)
    if verbose:
        print('Bin Counts after discretization:\n',
              np.bincount(discretization),
              flush=True)

    return discretization
def _hist_bins(sm):
    """Return histogram bin edges for the data of the mappable ``sm``.

    Edges are chosen with the 'auto' estimator within the mappable's color
    limits. With a ``LogNorm`` the computation runs on the natural log of
    the data and limits, and the edges are exponentiated back. Integer data
    gets integer-width bins centered on the integers.
    """
    data = sm.get_array()
    is_log = isinstance(sm.norm, mpl.colors.LogNorm)

    if is_log:
        data = np.log(data)
        limits = np.log(sm.get_clim())
    else:
        limits = sm.get_clim()

    bins = np.histogram_bin_edges(data, "auto", limits)

    # Only reachable for linear norms: np.log always yields floats.
    if data.dtype.kind in "iu":
        binsize = max(round(bins[1] - bins[0]), 1)
        bins = np.arange(data.min(), data.max() + binsize + .5, binsize) - .5

    return np.exp(bins) if is_log else bins
def _get_hist(_width):
    """Return a scaled density histogram of ``array`` and its bin centers.

    Parameters
    ----------
    _width : float or 'auto'
        Bin width. With ``'auto'`` the edges come from numpy's 'auto'
        estimator and are padded with five extra edges, 0.1 apart, on each
        side. Otherwise edges of width ``_width`` are laid out from six
        widths below ``array_range[0]`` up to (but not including) five
        widths above ``array_range[1]``.

    Returns
    -------
    tuple
        ``(h, centers)``: the density histogram divided by 100, and the
        midpoints of consecutive edges.
    """
    if _width == 'auto':
        _edges = np.histogram_bin_edges(array, 'auto').tolist()
        # Pad both sides. The left padding must count down from 5 to 1:
        # the earlier ``range(-5, 0, -1)`` was empty, so no left edges were
        # ever added.
        left_pad = [_edges[0] - 0.1 * i for i in range(5, 0, -1)]
        right_pad = [_edges[-1] + 0.1 * i for i in range(1, 6)]
        _edges = left_pad + _edges + right_pad
    else:
        _edges = np.arange(array_range[0] - _width * 6,
                           array_range[1] + _width * 5,
                           _width)

    h, edges = np.histogram(array, bins=_edges, density=True)
    h /= 100.

    # Averaging neighbouring edges gives the bin centers.
    return h, np.convolve(edges, [1, 1], 'valid') / 2
def array2TH1F(array, bins, name = 'historam', title = 'histogram'):
    """Fill a ROOT ``TH1F`` from the values of ``array``.

    Parameters
    ----------
    array : iterable of numbers
        Values to fill into the histogram, one ``Fill`` call each.
    bins : sequence
        Bin edges. An empty sequence selects numpy's 'auto' edges.
    name, title : str
        Passed to the ``TH1F`` constructor.

    Returns
    -------
    The filled ``ROOT.TH1F``.
    """
    if len(bins) != 0:
        edges = np.array(bins, dtype=float)
    else:
        # if binning is not specified, use numpy's auto bin finder
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
        edges = np.histogram_bin_edges(array, 'auto')

    n_bins = len(edges) - 1
    histo = ROOT.TH1F(name, title, n_bins, edges)
    for value in array:
        histo.Fill(value)
    return histo
def _init_numpy(
    self, obj, bins="auto", range=None, weights=None, threads=1, overflow=True
):
    """Fill ``_counts``, ``_edges`` and ``_errors`` from array-like data.

    Parameters
    ----------
    obj : array_like
        Values to histogram. Date-like input is converted with
        ``convert_dates`` and the x axis is flagged as a date axis.
    bins : int, str or array_like
        Bin specification. A string with exactly two commas, e.g.
        ``"50,0,10"``, is read ROOT-style as ``nbins,low,high``. Any other
        string is a numpy estimator name, except for integer-typed data,
        which gets unit-width bins centered on the integers.
    range : tuple, optional
        ``(low, high)`` limits.
    weights : array_like, optional
        Per-entry weights. When given, errors are the square root of the
        histogram of squared weights; otherwise the square root of the
        counts.
    threads, overflow
        Forwarded to ``histogramdd_wrapper``.
    """
    # convert ROOT-like "50,0,10" to equivalent of np.linspace(0,10,51)
    if isinstance(bins, str) and (bins.count(",") == 2):
        nbins, low, high = bins.split(",")
        range = (float(low), float(high))
        bins = int(nbins)

    if is_datelike(obj):
        obj = convert_dates(obj)
        self._metadata["date_axes"] = ["x"]

    if isinstance(bins, str):
        # if binning integers, binning choice is easy
        if hasattr(obj, "dtype") and ("int" in str(obj.dtype)):
            mini, maxi = obj.min(), obj.max()
            bins = np.linspace(mini - 0.5, maxi + 0.5, maxi - mini + 2)
        else:
            bins = np.histogram_bin_edges(obj, bins, range)

    if weights is not None:
        # np.asarray avoids a copy when possible. np.array(..., copy=False)
        # raises under NumPy 2 whenever a copy is required (e.g. a list).
        weights = np.asarray(weights)

    if is_datelike(bins):
        bins = convert_dates(bins)
        self._metadata["date_axes"] = ["x"]

    counts, (edges,) = histogramdd_wrapper(
        (obj,), (bins,), (range,), weights, overflow, threads,
    )

    if weights is not None:
        sumw2, _ = histogramdd_wrapper(
            (obj,), (bins,), (range,), weights ** 2, overflow, threads,
        )
        errors = sumw2 ** 0.5
    else:
        errors = counts ** 0.5

    self._counts = counts
    self._edges = edges
    self._errors = errors
def digitize_star_along_stream(data, instream, bins=10):
    """digitize_star_along_stream.

    Bin the selected stars along ``phi1`` and average their sky position
    in each occupied bin.

    Parameters
    ----------
    data: SkyCoord or (Q)Table
        Must provide ``phi1`` plus ``ra`` / ``dec`` (for a SkyCoord these
        are taken from ``data.icrs``).
    instream: array_like
        index / bool array selecting the stars to use
    bins: int
        number of equal-width ``phi1`` bins

    Returns
    -------
    df: (bins, 3) ndarray
        columns ra, dec, dec_err. Rows are filled consecutively, one per
        occupied bin; any remaining rows stay NaN.

    Raises
    ------
    TypeError
        if ``data`` is neither a SkyCoord nor a (Q)Table

    Notes
    -----
    dec_err estimated as bin_width / sqrt(numpoints)
    works with output of select_stars_in_an_arm

    """
    if isinstance(data, SkyCoord):
        dataphi1 = data.phi1
        ra, dec = data.icrs.ra, data.icrs.dec
    elif isinstance(data, (Table, QTable)):
        dataphi1 = data["phi1"]
        ra, dec = data["ra"], data["dec"]
    else:
        raise TypeError("data is not a SkyCoord or (Q)Table")

    phi1_selected = dataphi1[instream]
    bin_edges = np.histogram_bin_edges(phi1_selected, bins=bins)  # binning
    # Digitize against all but the last edge, giving bin numbers 1..bins.
    binnums = np.digitize(phi1_selected, bin_edges[:-1])
    occupied = np.unique(binnums)

    print(len(phi1_selected), len(binnums))
    print(bins, len(occupied))

    avg_ra = np.full(bins, np.nan)
    avg_dec = np.full(bins, np.nan)
    avg_dec_err = np.full(bins, np.nan)
    widths = np.diff(bin_edges)

    for row, binnum in enumerate(occupied):
        members = binnums == binnum
        avg_ra[row] = ra[instream][members].mean().value
        avg_dec[row] = dec[instream][members].mean().value
        # width/sqrt(numpoints)
        avg_dec_err[row] = widths[row] / np.sqrt(members.sum())

    return np.c_[avg_ra, avg_dec, avg_dec_err]
def compute_2D_pspec(self):
    """Bin the 2D power spectrum by |k| and store the result on the instance.

    Runs ``cosmo_FFT2`` and ``compute_k_2D`` first. The real part of every
    ``ps_data`` pixel is then added to the |k| bin it falls in, averaged
    per bin and divided by the area ``Lx * Ly``. Sets ``self.kmodes`` (the
    lower edge of each bin) and ``self.pk``.

    Returns
    -------
    tuple
        ``(self.kmodes[1:], self.pk[1:])`` -- the first bin is left out.
    """
    self.cosmo_FFT2()
    self.compute_k_2D()

    bin_edges = np.histogram_bin_edges(np.sort(self.k_del), bins=self.nbins)
    self.kmodes = bin_edges[:self.nbins]

    # There is always one more edge than there are bins.
    n_bins = len(bin_edges) - 1
    a = np.zeros(n_bins)   # summed real power per bin
    c = np.zeros_like(a)   # number of pixels per bin

    n_rows, n_cols = self.data1.shape[0], self.data1.shape[1]
    for i in range(n_rows):
        # Pixel offset from the centre times the k step gives k units.
        kx = (i - (n_rows / 2)) * self.delta_kx
        for j in range(n_cols):
            ky = (j - (n_cols / 2)) * self.delta_ky
            kmag = np.sqrt((kx**2) + (ky**2))
            # Bins are open on the left and closed on the right, so a pixel
            # sitting exactly on the lowest edge is not binned.
            for k in range(n_bins):
                if bin_edges[k] < kmag <= bin_edges[k + 1]:
                    a[k] += np.real(self.ps_data[i, j])
                    c[k] += 1
                    break

    # No NaNs and no division by zero: bins with a NaN sum or without any
    # pixels become 0 / 1.
    needs_reset = np.isnan(a) | (c == 0)
    a[needs_reset] = 0
    c[needs_reset] = 1

    T_tilde = a / c
    volume = self.Lx * self.Ly
    self.pk = T_tilde / volume  # [mk^2*Mpc^2]

    return self.kmodes[1:], self.pk[1:]
def ece(probs, labels, n_bins=30):
    '''
    probs has shape [n_examples, n_classes], labels holds one class index
    per example -> np.float

    Computes the Expected Calibration Error (ECE) in a simple form.

    Only the probability of the predicted class (the arg-max) is used. These
    probabilities are put into uniform bins over [0, 1]. For bin i we take
    the average predicted probability p_i and the accuracy a_i, and the
    bin's calibration error is |p_i - a_i|. The result is the sum of those
    errors weighted by the fraction of examples in each bin.
    '''
    n_examples, n_classes = probs.shape
    rows = range(n_examples)

    # The prediction is the class with the highest probability.
    preds = np.argmax(probs, axis=1)
    confidences = probs[rows, preds]

    # hits[i] is 1 when example i was predicted correctly, else 0.
    onehot_labels = np.eye(n_classes)[labels]
    hits = onehot_labels[rows, preds]

    # Upper edges of n_bins uniform bins on [0, 1]; the leading 0 is dropped
    # so np.digitize returns 0-based bin numbers.
    upper_edges = np.histogram_bin_edges([], bins=n_bins, range=(0., 1.))[1:]
    bin_ids = np.digitize(confidences, upper_edges)

    conf_sums = np.bincount(bin_ids, minlength=n_bins, weights=confidences)
    conf_sums = conf_sums.astype(np.float32)
    # eps keeps empty bins from dividing by zero
    counts = np.bincount(bin_ids, minlength=n_bins) \
        + np.finfo(conf_sums.dtype).eps

    mean_conf = conf_sums / counts
    mean_acc = np.bincount(bin_ids, weights=hits, minlength=n_bins) / counts
    bin_weight = counts / float(n_examples)

    return np.sum(np.abs(mean_acc - mean_conf) * bin_weight)