示例#1
0
def update_map(indicator):
    """Build a county-level choropleth figure for one indicator column.

    Reads the module-level ``all_counties`` table: its ``fips_county`` column
    supplies the FIPS codes and its ``indicator`` column supplies the values.
    The colour bins are 20 equal-width intervals over the fixed range
    [0, 100]; the data only passes through ``np.histogram_bin_edges`` and does
    not influence the edges because an explicit range is given.

    :param indicator: name of the column of ``all_counties`` to plot
    :return: whatever ``ff.create_choropleth`` returns
    """
    indicator_values = all_counties[indicator].tolist()
    edges = np.histogram_bin_edges(indicator_values, bins=20, range=[0, 100])
    return ff.create_choropleth(
        fips=all_counties['fips_county'].tolist(),
        values=indicator_values,
        scope=['usa'],
        title=f'{indicator} by county',
        binning_endpoints=edges.tolist(),
        showlegend=True,
    )
示例#2
0
文件: util.py 项目: stjordanis/cancer
def digitize_non_genes_data(col):
    """Map a numeric column onto small ordinal bin indices starting at 0.

    NaN entries are replaced by -1 before digitizing. The bin edges are
    computed on a copy of the column in which NaN is replaced by the smallest
    non-NaN value, so missing entries do not stretch the binned range. The
    number of bins is ``min(n_unique, 15) - 1`` where ``n_unique`` counts the
    distinct values after the -1 substitution.

    :param col: 1-D numeric data (ndarray or anything ``numpy.nan_to_num``
        accepts)
    :return: integer array of bin indices, shifted so the smallest is 0
    :raises ValueError: propagated from NumPy when fewer than two distinct
        values are present (zero bins requested)
    """
    not_nan = numpy.nan_to_num(col, nan=-1)
    n_unique = min(len(set(not_nan)), 15)
    # nanmin instead of col.min(): ndarray.min() propagates NaN, which made
    # the fill value NaN and the histogram range non-finite for plain arrays.
    fill_value = numpy.nanmin(col)
    edge = numpy.histogram_bin_edges(numpy.nan_to_num(col, nan=fill_value),
                                     bins=n_unique - 1)
    digits = numpy.digitize(not_nan, edge)
    return digits - digits.min()
示例#3
0
    def test_histogram_multiple(self, long_df, triple_args):
        """Check grouped ``Hist`` output against ``np.histogram`` per group.

        The reference edges are computed once on the whole ``x`` column, then
        every (``a``, ``s``) group of the output is compared with a histogram
        of that group's ``x`` values on those shared edges.
        """
        stat = Hist()
        result = stat(long_df, *triple_args)
        shared_edges = np.histogram_bin_edges(long_df["x"], "auto")

        for group_key, group_out in result.groupby(["a", "s"]):
            a_val, s_val = group_key
            in_group = (long_df["a"] == a_val) & (long_df["s"] == s_val)
            counts, edges = np.histogram(long_df.loc[in_group, "x"],
                                         bins=shared_edges)
            assert_array_equal(group_out["y"], counts)
            assert_array_equal(group_out["space"], np.diff(edges))
示例#4
0
    def to_operation(self, inputs, outputs, **kwargs):
        """Build the Map / ReduceByKey / Map node chain that averages values.

        The ``outputs`` argument is ignored: it is immediately replaced by
        ``self.output_vars()``.

        When ``self.values['binned']`` is truthy, keys are digitized against
        equal-width edges spanning ``self.values['min']``..``self.values['max']``
        and the final step reports one average per edge (0 for edges that
        received no data). Otherwise ``inputs['Value']`` is reduced by
        ``inputs['Bin']`` and averaged per key.

        :param inputs: node inputs (indexed with 'Value' and 'Bin' in the
            unbinned case, passed through whole in the binned case)
        :param outputs: unused, see above
        :param kwargs: forwarded to every ``gn`` node constructor
        :return: list of the three nodes, in execution order
        """
        outputs = self.output_vars()
        prefix = self.name()

        def add_pairs(cv, v):
            # Running (sum, count) accumulator.
            return (cv[0] + v[0], cv[1] + v[1])

        reduce_outputs = [prefix + '_reduce_count']

        if self.values['binned']:
            low, high = self.values['min'], self.values['max']
            edges = np.histogram_bin_edges(np.arange(low, high),
                                           bins=self.values['bins'],
                                           range=(low, high))
            map_outputs = [prefix + '_bin', prefix + '_map_count']

            def func(k, v):
                return np.digitize(k, edges), (v, 1)

            def mean(d):
                # Start every edge at 0 so empty bins still appear.
                res = dict.fromkeys(edges, 0)
                for k, v in d.items():
                    try:
                        res[edges[k]] = v[0] / v[1]
                    except IndexError:
                        # Index past the last edge: dropped.
                        pass

                keys, values = zip(*sorted(res.items()))
                return np.array(keys), np.array(values)

            map_node = gn.Map(name=prefix + '_map', inputs=inputs,
                              outputs=map_outputs, func=func, **kwargs)
            reduce_inputs = map_outputs
        else:
            map_outputs = [prefix + '_map_count']

            def mean(d):
                res = {k: v[0] / v[1] for k, v in d.items()}
                keys, values = zip(*sorted(res.items()))
                return np.array(keys), np.array(values)

            map_node = gn.Map(name=prefix + '_map', inputs=[inputs['Value']],
                              outputs=map_outputs, func=lambda a: (a, 1),
                              **kwargs)
            reduce_inputs = [inputs['Bin']] + map_outputs

        nodes = [
            map_node,
            gn.ReduceByKey(name=prefix + '_reduce', inputs=reduce_inputs,
                           outputs=reduce_outputs, reduction=add_pairs,
                           **kwargs),
            gn.Map(name=prefix + '_mean', inputs=reduce_outputs,
                   outputs=outputs, func=mean, **kwargs),
        ]
        return nodes
示例#5
0
def get_bin_edges(query_return_df, label_template):
    """Compute histogram edges, bar centers and bar labels for ``VALUE``.

    A random resample of the non-NaN values (size: number of distinct
    ``GEOID`` entries minus 2, plus the true max and min) is binned with the
    Freedman-Diaconis rule. If the kurtosis of all values is below 10 the
    edges are returned as they are; otherwise only the edges below the 95th
    percentile are kept and the overall maximum edge is appended, giving one
    wide final bin.

    :param query_return_df: table with ``VALUE`` and ``GEOID`` columns
    :param label_template: callable ``(low_edge, high_edge) -> label``
    :return: ``(edges, bar_centers, labels)``
    """
    values = query_return_df['VALUE'].values
    values = values[~np.isnan(values)]

    # The sample size feeds the bin-width rule, so the data is resampled down
    # to one draw per GEOID; the resampling is done once only, for speed.
    n_draws = len(query_return_df['GEOID'].unique()) - 2
    sample = list(np.random.choice(values, size=n_draws, replace=True))

    # Keep the true extremes in the sample so the bin range is right.
    sample += [max(values), min(values)]

    # Freedman Diaconis Estimator
    bin_edges = np.histogram_bin_edges(sample, bins='fd')
    bar_centers = [
        round((low + high) / 2.0, 2)
        for low, high in zip(bin_edges[:-1], bin_edges[1:])
    ]

    # k = 0 is close to a normal distribution; some of the data reach k = 80
    k = pearson_kurtosis(values)
    if k < 10:
        labels = [
            label_template(low, high)
            for low, high in zip(bin_edges[:-1], bin_edges[1:])
        ]
        return bin_edges, bar_centers, labels

    # Heavy-tailed data: keep the regular edges below the 95th percentile and
    # close the sequence with the overall maximum edge.
    threshold_95 = np.quantile(values, 0.95)
    edges_threshold = list(bin_edges[bin_edges < threshold_95])
    edges_threshold.append(max(bin_edges))

    # NOTE(review): this keeps up to len(edges) + 1 centers while only
    # len(edges) - 1 labels are produced below -- confirm this is intended.
    bar_centers_threshold = bar_centers[0:len(edges_threshold) + 1]

    label_fmt_threshold = [
        label_template(low, high)
        for low, high in zip(edges_threshold[:-1], edges_threshold[1:])
    ]
    return edges_threshold, bar_centers_threshold, label_fmt_threshold
示例#6
0
def test_histogram_bin_edges_execution(setup):
    """Compare tensor ``histogram_bin_edges`` with NumPy's on several inputs.

    Covers explicit ranges, an integer bin count, every named estimator,
    explicit edges given as a list and as a tensor, and the default call.
    """
    rs = np.random.RandomState(0)

    raw = rs.randint(10, size=(20, ))
    a = tensor(raw, chunk_size=6)

    # Explicit range, default bin count.
    for range_ in [(0, 10), (3, 11), (3, 7)]:
        actual = histogram_bin_edges(a, range=range_).execute().fetch()
        wanted = np.histogram_bin_edges(raw, range=range_)
        np.testing.assert_array_equal(actual, wanted)

    raw2 = rs.randint(10, size=(1, ))
    b = tensor(raw2)
    raw3 = rs.randint(10, size=(0, ))
    c = tensor(raw3)
    cases = [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]
    for t, r in cases:
        # Bin count and named estimators.
        estimators = [
            10, 'stone', 'auto', 'doane', 'fd', 'rice', 'scott', 'sqrt',
            'sturges'
        ]
        for bins in estimators:
            actual = histogram_bin_edges(t, bins=bins).execute().fetch()
            wanted = np.histogram_bin_edges(r, bins=bins)
            np.testing.assert_array_equal(actual, wanted)

        # Explicit edges, as a plain list and as a chunked tensor.
        explicit_edges = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
        for bins in explicit_edges:
            actual = histogram_bin_edges(t, bins=bins).execute().fetch()
            wanted = np.histogram_bin_edges(r, bins=[0, 4, 8])
            np.testing.assert_array_equal(actual, wanted)

        # Default arguments on a small arange (repeated on every iteration).
        small_raw = np.arange(5)
        small = tensor(small_raw, chunk_size=3)
        bin_edges = histogram_bin_edges(small)
        actual = bin_edges.execute().fetch()
        wanted = np.histogram_bin_edges(small_raw)
        assert bin_edges.shape == wanted.shape
        np.testing.assert_array_equal(actual, wanted)
示例#7
0
 def test_bin_precision(self):
     """RDF bin edges must match NumPy's equal-width edges within 1e-6."""
     n_bins, lower, upper = 500, 0, 50
     rdf = freud.density.RDF(bins=n_bins, r_max=upper, r_min=lower)
     # The sample array is a placeholder; the explicit range fixes the edges.
     placeholder = np.array([0], dtype=np.float32)
     reference = np.histogram_bin_edges(placeholder,
                                        bins=n_bins,
                                        range=[lower, upper])
     npt.assert_allclose(rdf.bin_edges, reference, atol=1e-6)
示例#8
0
def histogram_bin_edges(a, bins=10, range=None, weights=None, log=False):
    """Return histogram bin edges for ``a``, with dtype-specific handling.

    * If ``bins`` already is an array (per ``is_array``) it is returned as is.
    * datetime64 data is binned on its offset from the minimum and the edges
      are converted back to datetimes.
    * bool / str data: the sorted distinct values are returned and ``bins``
      is ignored.
    * integer / floating data: edges come from ``np.histogram_bin_edges``
      (optionally in log10 space), capped at the module-level
      ``n_bins_limit`` points.

    :param a: input data; converted with ``np.asarray`` if not an ndarray
    :param bins: bin count, estimator name, or explicit edges
    :param range: optional ``(low, high)``; defaults to the data min/max
    :param weights: forwarded to ``np.histogram_bin_edges``
    :param log: compute the edges on ``log10(a)`` and return ``10 ** edges``
    :return: array of bin edges
    :raises NotImplementedError: for any other dtype
    """
    if is_array(bins):
        return bins

    if not isinstance(a, np.ndarray):
        a = np.asarray(a)

    if np.issubdtype(a.dtype, np.datetime64):
        origin = np.min(a)
        offsets = histogram_bin_edges((a - origin).astype(float), bins, range,
                                      weights)
        time_unit = np.datetime_data(a.dtype)[0]
        return origin + offsets.astype(f"timedelta64[{time_unit}]")

    if not (is_integer(a) or is_floating(a)):
        if np.issubdtype(a.dtype, np.bool_) or np.issubdtype(a.dtype, np.str_):
            # Binning is not implemented for these dtypes: every distinct
            # value becomes an edge.
            return np.unique(a)
        raise NotImplementedError(f"Unexpected type: {a.dtype}")

    try:
        if log:
            log_a = np.log10(a)
            log_a = log_a[~np.isnan(log_a)]
            bins = 10 ** histogram_bin_edges(log_a, bins, range, weights,
                                             log=False)
        else:
            if range is None:
                range = (np.min(a), np.max(a))

            span_is_small = range[1] - range[0] < 1e7
            if span_is_small or bins is not None:
                bins = np.histogram_bin_edges(a, bins, range, weights)
            else:
                warnings.warn(
                    "It may cause memory leak and hang"
                    f" due to significantly different between range first and second: {range}\n"
                    f"Use bin size {n_bins_limit}"
                )
                bins = np.linspace(*range, n_bins_limit)

        # Cap the number of edges, keeping the same outer limits.
        if bins.size > n_bins_limit:
            warnings.warn(f"Huge bin size {bins.size} -> {n_bins_limit}")
            bins = np.linspace(np.min(bins), np.max(bins), n_bins_limit)
    except MemoryError as e:
        warnings.warn(f"Encountered MemoryError: {e}")
        bins = np.linspace(np.min(a), np.max(a), n_bins_limit)

    return bins
示例#9
0
文件: histogram.py 项目: djdt/pewpew
    def setHistogram(
        self,
        data: np.ndarray,
        bins: Union[int, str] = "auto",
        min_bins: int = 16,
        max_bins: int = 128,
    ) -> None:
        """Replace the chart contents with a histogram of ``data``.

        Binning is restricted to the 5th..95th percentile range of ``data``.
        If the requested binning yields more than ``max_bins`` edges, or fewer
        than ``min_bins`` edges, the edges are recomputed with that limit as
        the bin count.

        Args:
            data: values to histogram
            bins: passed to np.histogram_bin_edges
            min_bins: lower limit compared against the number of bin edges
            max_bins: upper limit compared against the number of bin edges
        """
        value_range = (np.percentile(data, 5), np.percentile(data, 95))

        barset = QtCharts.QBarSet("histogram")
        barset.setColor(sequential[1])
        barset.setLabelColor(light_theme["text"])

        bin_edges = np.histogram_bin_edges(data, bins=bins, range=value_range)
        forced_count = None
        if bin_edges.size > max_bins:
            forced_count = max_bins
        elif bin_edges.size < min_bins:
            forced_count = min_bins
        if forced_count is not None:
            bin_edges = np.histogram_bin_edges(data,
                                               bins=forced_count,
                                               range=value_range)

        hist, edges = np.histogram(data, bins=bin_edges)
        barset.append(list(hist))

        self.series.clear()
        self.series.append(barset)

        # _xaxis spans the bar indices, xaxis the data values.
        self._xaxis.setRange(-0.5, hist.size - 0.5)
        self.xaxis.setRange(edges[0], edges[-1])
        self.yaxis.setRange(0, np.amax(hist))
        self.yaxis.applyNiceNumbers()
    def find_centroids(self):
        """Return the most frequent (X, Y) event position of every grid cell.

        The ``X`` and ``Y`` columns of HDU 1 of ``self.file_path`` are binned
        on a Freedman-Diaconis grid. For each non-empty cell the single most
        common coordinate pair is kept; the pairs are then ordered by how
        often they occur, most frequent first.

        Cells are half-open (``low <= value < high``), so events lying exactly
        on the last edge of an axis are not assigned to any cell.

        :return: ``(xs, ys)`` as two 1D numpy arrays
        """
        with fits.open(f"{self.file_path}") as hdus:
            xs = np.array(hdus[1].data["X"])
            ys = np.array(hdus[1].data["Y"])

            x_edges = np.histogram_bin_edges(xs, bins='fd')
            y_edges = np.histogram_bin_edges(ys, bins='fd')

            cell_peaks = []
            for y_low, y_high in zip(y_edges[:-1], y_edges[1:]):
                for x_low, x_high in zip(x_edges[:-1], x_edges[1:]):
                    # events falling inside this cell
                    in_cell = ((xs < x_high) & (xs >= x_low)
                               & (ys < y_high) & (ys >= y_low))
                    pairs = list(zip(xs[in_cell], ys[in_cell]))

                    # most_common(1) is empty for an empty cell, so nothing
                    # is recorded for it
                    cell_peaks.extend(Counter(pairs).most_common(1))

            # entries are ((x, y), count); order by count, descending
            cell_peaks = sorted(cell_peaks,
                                key=lambda peak: peak[1],
                                reverse=True)

            centroid_xs = np.array([coords[0] for coords, _ in cell_peaks])
            centroid_ys = np.array([coords[1] for coords, _ in cell_peaks])
            return centroid_xs, centroid_ys
示例#11
0
def generate_plot(context, trips):
    """Save a 15-bin histogram of trip durations and log the asset event.

    Durations are ``trips.end_time - trips.start_time`` converted to minutes.
    The figure is written to ``trip_lengths.png`` and an
    ``AssetMaterialization`` is reported through ``context.log_event``.
    """
    durations = trips.end_time - trips.start_time
    minute_lengths = [delta.total_seconds() / 60 for delta in durations]
    bin_edges = np.histogram_bin_edges(minute_lengths, 15)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set(title="Trip lengths", xlabel="Minutes", ylabel="Count")
    ax.hist(minute_lengths, bins=bin_edges)
    fig.savefig("trip_lengths.png")

    materialization = AssetMaterialization(
        asset_key="trip_dist_plot",
        description="Distribution of trip lengths.")
    context.log_event(materialization)
示例#12
0
def plot_min_distances(turbines,
                       distances,
                       title='',
                       factors=None,
                       quantiles=None):
    """Plot closest-turbine distance against binned rotor diameter.

    Turbines with a NaN ``t_rd`` are dropped and infinite distances are
    masked. Rotor diameters are grouped into 15 bins over (5, 155); distances
    are multiplied by 1e3 (presumably km -> m, confirm against the caller) and
    only those below 500 are drawn as a strip plot. On top, one line per
    factor (``factor * bin_center``) and one line per per-bin distance
    quantile are drawn.

    :param turbines: object exposing ``t_rd`` (rotor diameters)
    :param distances: distance to the closest turbine, aligned with turbines
    :param title: optional figure title
    :param factors: multipliers for the reference lines; defaults to the
        module-level ``DISTANCE_FACTORS``
    :param quantiles: quantiles to draw; defaults to (0.05, 0.1, 0.2, 0.3)
    :return: the matplotlib figure
    """
    distances = distances.where(distances < np.inf)
    has_diameter = ~np.isnan(turbines.t_rd)
    rotor_diameters_m = turbines.t_rd[has_diameter]
    min_distances_not_nan = distances[has_diameter] * 1e3

    bin_edges = np.histogram_bin_edges(rotor_diameters_m,
                                       bins=15,
                                       range=(5, 155))
    bin_idcs = np.digitize(rotor_diameters_m, bin_edges)
    bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2.

    fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

    below_cut_off = min_distances_not_nan < 500
    # digitize is 1-based for in-range values, hence the -1
    strip_x = bin_centers[bin_idcs[below_cut_off] - 1]
    ax = sns.stripplot(x=strip_x,
                       y=min_distances_not_nan[below_cut_off],
                       jitter=.4,
                       size=1,
                       color='k')

    factor_colors = '#c72321', '#fbd7a9', '#f0c220', '#7a6952',
    factors = factors or DISTANCE_FACTORS
    for color, factor in zip(factor_colors, factors):
        ax.plot(factor * bin_centers, label=f'{factor:.2f}x', color=color)

    quantile_colors = '#246b71', '#6a9395', '#84bcbf', '#9bdade'
    quantiles = quantiles or (0.05, 0.1, 0.2, 0.3)
    distances_by_bin = pd.Series(min_distances_not_nan).groupby(bin_idcs)
    for q, color in zip(quantiles, quantile_colors):
        ax.plot(distances_by_bin.quantile(q=q).values,
                label=f"{int(q * 100)}% quantile",
                color=color)

    ax.set_ylim(0, 500)
    ax.set_xlim(0, 11)

    plt.ylabel('Distance to closest turbine [m]')
    plt.xlabel('Rotor diameter [m]')

    if title:
        plt.title(title)

    plt.legend()
    plt.grid()

    return fig
def lr_histogram(lrs, y, bins=20, ax=plt):
    """
    Plot overlaid, density-normalised histograms of the log10 LRs.

    The log10 LRs are split into two groups by ``util.Xy_to_Xn(log_lrs, y)``
    and both groups are drawn on shared bin edges computed from all values.

    :param lrs: likelihood ratios; ``np.log10`` is applied before binning
    :param y: labels used by ``util.Xy_to_Xn`` to split the values
    :param bins: bin specification forwarded to ``np.histogram_bin_edges``
    :param ax: plot target: an Axes-like object (``set_xlabel`` /
        ``set_ylabel``) or the pyplot module (``xlabel`` / ``ylabel``)
    """
    log_lrs = np.log10(lrs)

    bins = np.histogram_bin_edges(log_lrs, bins=bins)
    points0, points1 = util.Xy_to_Xn(log_lrs, y)
    ax.hist(points0, bins=bins, alpha=.25, density=True)
    ax.hist(points1, bins=bins, alpha=.25, density=True)

    # The default target is the pyplot module, which has xlabel()/ylabel()
    # but no set_xlabel()/set_ylabel(); calling the Axes setters on it raised
    # AttributeError.
    if hasattr(ax, 'set_xlabel'):
        ax.set_xlabel('10log likelihood ratio')
        ax.set_ylabel('count')
    else:
        ax.xlabel('10log likelihood ratio')
        ax.ylabel('count')
示例#14
0
def histogram(xin, bins=100, norm=False, logx=True, logbase=10, density=False):
    """Histogram ``xin`` on linear or logarithmically spaced edges.

    With ``logx`` only the strictly positive values are used and ``bins``
    edge points are spaced evenly in log space between their min and max
    (log/power functions come from ``get_log_pow(logbase)``). Without
    ``logx`` the edges come from ``np.histogram_bin_edges(xin, bins=bins)``.

    :param xin: input values
    :param bins: number of edge points (log mode) or bin spec (linear mode)
    :param norm: divide the counts by their integral over the bin widths
    :param logx: use logarithmically spaced edges
    :param logbase: base handed to ``get_log_pow``
    :param density: forwarded to ``np.histogram``
    :return: ``(counts, edges)``
    """
    if not logx:
        samples = xin
        edges_in = np.histogram_bin_edges(samples, bins=bins)
    else:
        flog, fpow = get_log_pow(logbase)
        samples = xin[xin > 0]
        log_low, log_high = flog(samples.min()), flog(samples.max())
        edges_in = fpow(np.linspace(log_low, log_high, bins))

    counts, edges = np.histogram(samples, bins=edges_in, density=density)
    if norm:
        area = np.dot(counts, np.diff(edges))
        counts = counts / area
    return counts, edges
示例#15
0
    def define_bins_from_samples(self, samples):
        """Recompute ``self.bins`` from a 2-D sample set.

        For each of the ``self.n_vars`` columns, equal-width edges are built
        with ``self.bin_sizes[v]`` as the bin spec, and only the interior
        edges (first and last removed) are stored.

        :param samples: samples indexed as ``samples[:, v]``; anything whose
            exact type is not ``np.ndarray`` is passed through ``np.array``
        """
        if type(samples) is not np.ndarray:
            samples = np.array(samples)

        self.bins = []
        for var in range(self.n_vars):
            column = samples[:, var]
            edges = np.histogram_bin_edges(column, bins=self.bin_sizes[var])
            # drop the two outer edges
            self.bins.append(edges[1:-1])
示例#16
0
def freedman_diaconis(x: np.ndarray) -> np.ndarray:
    """
    Bin edges from the Freedman-Diaconis rule.

    The bin width scales with the interquartile range (IQR) and with the
    inverse cube root of the sample size. The IQR makes the rule robust to
    outliers; it tends to be too conservative on small samples and works well
    on large ones.

    :param x: np.ndarray
        The 1-dimensional x-data to bin.
    :return: np.ndarray
        The bins edges computed using the FD method.
    """
    fd_edges = np.histogram_bin_edges(x, bins='fd')
    return fd_edges
示例#17
0
    def fit(self, eps=0.9, min_samples=3):
        """Run DBSCAN on ``self.X`` with ``eps`` scaled by a data-driven width.

        ``eps`` is multiplied by the width of the first 'auto' histogram bin
        of ``self.x``. That width and the median absolute deviation of
        ``self.x`` are printed for inspection.

        :param eps: multiplier applied to the bin width
        :param min_samples: forwarded to DBSCAN
        :return: dict with ``n_clusters`` (noise label excluded), ``n_noise``
            and the raw ``labels``
        """
        first_bin_width = np.diff(
            np.histogram_bin_edges(self.x, bins='auto'))[0]
        eps *= first_bin_width
        print(first_bin_width)
        print(median_absolute_deviation(self.x))

        model = skDBSCAN(eps,
                         min_samples,
                         metric='euclidean',
                         metric_params=None,
                         algorithm='auto',
                         leaf_size=30,
                         p=None,
                         n_jobs=None)
        labels = model.fit(self.X).labels_

        # Label -1 marks noise and is not counted as a cluster.
        n_clusters = len(set(labels))
        if -1 in labels:
            n_clusters -= 1
        n_noise = list(labels).count(-1)
        return {"n_clusters": n_clusters, "n_noise": n_noise, "labels": labels}
示例#18
0
    def compute_quantization_parameters(self,
                                        ranges='q99',
                                        clip=True,
                                        center=False):
        """ Make bins for int8 quantization and convert value-stats.

        Stores the chosen range, the 255 bin edges (254 equal-width bins) and
        the ``clip`` / ``center`` flags on the instance, then recomputes the
        ``qnt_*`` statistics from ``self.quantize`` and the mean absolute
        restoration error relative to ``self.v_std``.

        Parameters
        ----------
        ranges : str or sequence of two numbers
            Ranges to quantize data to. Available options are:
                - `q95`, `q99`, `q999`: symmetric range whose half-width is
                  the smaller absolute value of the respective quantile pair.
                - `same`: symmetric range whose half-width is the larger of
                  ``abs(v_min)`` and ``abs(v_max)``.
            A two-element sequence is used as given.
        clip : bool
            Stored as ``qnt_clip``; presumably read by ``self.quantize`` to
            clip data to the selected ranges.
        center : bool
            If True, ``v_mean`` is subtracted from both range limits. Also
            stored as ``qnt_center``.
        """
        ranges_dict = {
            'q95': min(abs(self.v_q05), abs(self.v_q95)),
            'q99': min(abs(self.v_q01), abs(self.v_q99)),
            'q999': min(abs(self.v_q001), abs(self.v_q999)),
            'same': max(abs(self.v_min), abs(self.v_max)),
        }
        # Only strings can name a preset. Testing a list for membership in a
        # dict raises TypeError (unhashable), which broke explicit ranges
        # passed as lists.
        if isinstance(ranges, str) and ranges in ranges_dict:
            half_width = ranges_dict[ranges]
            ranges = (-half_width, +half_width)

        if center:
            ranges = tuple(item - self.v_mean for item in ranges)

        self.qnt_ranges = ranges
        # np.float64 replaces the np.float alias, which NumPy 1.24 removed;
        # both denote the same dtype.
        self.qnt_bins = np.histogram_bin_edges(None, bins=254,
                                               range=ranges).astype(np.float64)
        self.qnt_clip = clip
        self.qnt_center = center

        # Compute quantized statistics
        quantized_tc = self.quantize(self.trace_container)
        self.qnt_min = self.quantize(self.v_min)
        self.qnt_max = self.quantize(self.v_max)
        self.qnt_mean = np.mean(quantized_tc)
        self.qnt_std = np.std(quantized_tc)
        self.qnt_q001, self.qnt_q01, self.qnt_q05 = np.quantile(
            quantized_tc, [0.001, 0.01, 0.05])
        self.qnt_q999, self.qnt_q99, self.qnt_q95 = np.quantile(
            quantized_tc, [0.999, 0.99, 0.95])

        # Estimate difference after quantization: shift the codes by 127 so
        # they index the bin-edge array, then compare the looked-up edges
        # with the original values.
        quantized_tc += 127
        restored_tc = self.qnt_bins[quantized_tc]
        self.qnt_error = np.mean(
            np.abs(restored_tc - self.trace_container)) / self.v_std
示例#19
0
 def __call__(self, loaded_data_tuple, metadata, random_state):
     """Replace ``loaded_data_tuple.data`` by bin indices (optionally one-hot).

     Bin edges come from ``np.histogram_bin_edges`` with ``self.bins``. When
     ``self.bins`` is a scalar, the first edge is dropped before digitizing.
     Values are digitized with ``right=True``. If ``self.use_one_hot`` is
     set, a trailing axis of length ``len(bin_edges) + 1`` holds the one-hot
     encoding of each index.

     :param loaded_data_tuple: object with a ``data`` field, accepted by
         ``replace``
     :param metadata: unused here
     :param random_state: unused here
     :return: copy of ``loaded_data_tuple`` with ``data`` replaced
     """
     # The builtin ``range`` type used to be passed here, which NumPy cannot
     # unpack into (low, high) and which raised TypeError for scalar bins.
     # An optional ``range`` attribute is honoured instead; None lets NumPy
     # derive the range from the data.
     value_range = getattr(self, 'range', None)
     bin_edges = np.histogram_bin_edges(loaded_data_tuple.data, self.bins,
                                        value_range)
     if np.isscalar(self.bins):
         bin_edges = bin_edges[1:]
     data = np.digitize(loaded_data_tuple.data, bin_edges, right=True)
     if self.use_one_hot:
         n_classes = len(bin_edges) + 1
         flat = np.reshape(data, -1)
         one_hot = np.zeros((flat.size, n_classes), data.dtype)
         # one fancy-indexed assignment instead of a Python loop
         one_hot[np.arange(flat.size), flat] = 1
         data = np.reshape(one_hot, data.shape + (n_classes, ))
     return replace(loaded_data_tuple, data=data)
示例#20
0
def equal_number_FD(x: np.ndarray) -> np.ndarray:
    """
    Equal-count bin edges, with the bin count taken from the FD rule.

    The number of bins is the one the Freedman-Diaconis rule yields for ``x``;
    the edges are then read off the sorted data at evenly spaced ranks, so
    that every bin holds about the same number of points.

    :param x: np.ndarray
        The 1-dimensional x-data to bin.
    :return: np.ndarray
        The bins edges computed using the equal-N method.
    """
    n_points = len(x)
    n_bins = len(np.histogram_bin_edges(x, bins='fd')) - 1
    ranks = np.linspace(0, n_points, n_bins + 1)
    # Ranks beyond the last index are clamped by np.interp to the maximum.
    return np.interp(ranks, np.arange(n_points), np.sort(x))
示例#21
0
def plot_sim_data(sims_data=None, normalize=True, bin_width=None):
    """Overlay one histogram (or a single bar) per simulation input.

    When ``bin_width`` is not given, it is the largest first-bin width among
    the Freedman-Diaconis binnings of all inputs. Inputs whose values are all
    equal are drawn as one bar at that value (height 1.0 when normalising,
    else the sample count); all others are drawn as histograms on
    ``np.arange(min, max, bin_width)`` edges.

    :param sims_data: mapping of series name -> array of values
    :param normalize: plot densities instead of counts
    :param bin_width: common bin width; derived from the data when falsy
    :return: ``(fig, ax)``
    """
    if not bin_width:
        first_bin_widths = []
        for sim_data in sims_data.values():
            edges = np.histogram_bin_edges(sim_data, bins='fd')
            first_bin_widths.append(edges[1] - edges[0])
        bin_width = max(first_bin_widths)

    fig, ax = plt.subplots()

    prop_iter = iter(plt.rcParams['axes.prop_cycle'])

    for sim_name, sim_data in sims_data.items():
        is_constant = all(sim_data == sim_data[0])
        if is_constant:
            # A constant input has no spread to bin: draw a single bar.
            bar_height = 1.0 if normalize else len(sim_data)
            ax.bar(sim_data[0],
                   bar_height,
                   bin_width,
                   color=next(prop_iter)['color'],
                   alpha=0.5,
                   label=sim_name)
        else:
            input_bin_edges = np.arange(min(sim_data), max(sim_data),
                                        bin_width)
            ax.hist(sim_data,
                    bins=input_bin_edges,
                    density=normalize,
                    color=next(prop_iter)['color'],
                    alpha=0.5,
                    label=sim_name)

    ax.set_ylabel('PDF (%)')
    ax.set_xlabel('Value')

    return fig, ax
示例#22
0
    def fit(self, x: Union[list, np.ndarray], y=None):
        """Learn per-column bin edges and the table of possible bit patterns.

        ``x`` is converted to an ndarray and 1-D input is treated as a single
        column. If ``self.columns`` is empty, all columns are used. Per-column
        minima/maxima come from ``self._validate_value`` and fall back to the
        data when that yields something falsy.

        For every selected column ``c``:

        * ``bins_[c]`` holds either the distinct quantile edges
          (``quantile_based``) or equal-width edges between the column's
          min and max;
        * ``possible_values_[c]`` holds the ``n_bits + 1`` rows of ``k`` ones
          followed by ``n_bits - k`` zeros, for ``k = 0..n_bits``.

        :param x: input data, 1-D or 2-D
        :param y: ignored
        :return: ``self``
        """
        if not isinstance(x, np.ndarray):
            x = np.array(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]

        # parameter validation
        if not self.columns:
            self.columns = list(range(n_columns))

        self.min_value_by_column_ = self._validate_value(self.min_value)
        self.max_value_by_column_ = self._validate_value(self.max_value)

        self.n_bits_by_column_ = (self.n_bits
                                  if isinstance(self.n_bits, list) else
                                  [self.n_bits] * n_columns)

        # fitting: derive missing limits from the data
        if not self.min_value_by_column_:
            self.min_value_by_column_ = [
                np.min(x[:, col]) for col in self.columns
            ]
        if not self.max_value_by_column_:
            self.max_value_by_column_ = [
                np.max(x[:, col]) for col in self.columns
            ]

        self.possible_values_ = dict()
        self.bins_ = dict()
        for pos, col in enumerate(self.columns):
            # n_bits is looked up by column id, the limits by position.
            n_bits = self.n_bits_by_column_[col]
            if self.quantile_based:
                # NOTE(review): NumPy >= 1.22 renames this keyword to
                # `method`; kept as written here.
                quantile_edges = np.quantile(x[:, col],
                                             np.linspace(0, 1, n_bits + 1),
                                             interpolation='higher')
                self.bins_[col] = np.unique(quantile_edges)
            else:
                limits = (self.min_value_by_column_[pos],
                          self.max_value_by_column_[pos])
                self.bins_[col] = np.histogram_bin_edges([],
                                                         bins=n_bits,
                                                         range=limits)
            patterns = [
                n_ones * [1] + (n_bits - n_ones) * [0]
                for n_ones in range(n_bits + 1)
            ]
            self.possible_values_[col] = np.array(patterns)
        return self
示例#23
0
 def discretize_continous_label(labels, bins='sturges', verbose=False):
     """Turn continuous labels into integer bin indices.

     Edges are estimated with ``np.histogram_bin_edges``; 'sturges' is a
     conservative choice for fairly large datasets (N > 1000). The first edge
     is dropped and intervals are closed on the right, so the minimum value
     lands in bin 0 and the maximum in the last bin.

     :param labels: continuous label values
     :param bins: bin spec forwarded to ``np.histogram_bin_edges``
     :param verbose: print the global histogram and the per-bin counts
     :return: array of bin indices, one per label
     """
     edges = np.histogram_bin_edges(labels, bins=bins)
     if verbose:
         print('Global histogram:\n',
               np.histogram(labels, bins=edges, density=False),
               flush=True)

     upper_edges = edges[1:]
     discretization = np.digitize(labels, upper_edges, right=True)
     if verbose:
         print('Bin Counts after discretization:\n',
               np.bincount(discretization),
               flush=True)
     return discretization
示例#24
0
def _hist_bins(sm):
    """Return histogram bin edges for the data of a scalar mappable.

    Edges are computed with the 'auto' estimator over the mappable's colour
    limits. With a ``LogNorm`` the data and limits are binned in natural-log
    space and the edges are exponentiated back. Integer data gets
    integer-width bins centred on the integers (edges at ``k - 0.5``).
    """
    data = sm.get_array()
    use_log = isinstance(sm.norm, mpl.colors.LogNorm)
    if use_log:
        data = np.log(data)
        limits = np.log(sm.get_clim())
    else:
        limits = sm.get_clim()

    bins = np.histogram_bin_edges(data, "auto", limits)

    # Integer kinds only occur without log: np.log yields floats.
    if data.dtype.kind in "iu":
        binsize = max(round(bins[1] - bins[0]), 1)
        upper = data.max() + binsize + .5
        bins = np.arange(data.min(), upper, binsize) - .5

    return np.exp(bins) if use_log else bins
示例#25
0
 def _get_hist(_width):
     """Return scaled density values and bin centers for ``array``.

     ``array`` and ``array_range`` are read from the enclosing scope.

     With ``_width == 'auto'`` the edges come from NumPy's 'auto' estimator
     and are padded with five extra edges, 0.1 apart, on each side. Otherwise
     edges are spaced ``_width`` apart from ``array_range[0] - 6 * _width``
     up to (excluding) ``array_range[1] + 5 * _width``.

     :param _width: bin width, or the string 'auto'
     :return: ``(h, centers)`` where ``h`` is the density histogram divided
         by 100 and ``centers`` are the midpoints of adjacent edges
     """
     if _width == 'auto':
         _edges = np.histogram_bin_edges(array, 'auto').tolist()
         # range(-5, 0, -1) is empty, so the left padding was never added;
         # range(5, 0, -1) gives the ascending offsets -0.5 .. -0.1 that
         # mirror the right padding.
         left_pad = [_edges[0] - 0.1 * i for i in range(5, 0, -1)]
         right_pad = [_edges[-1] + 0.1 * i for i in range(1, 6)]
         _edges = left_pad + _edges + right_pad
     else:
         _edges = np.arange(array_range[0] - _width * 6,
                            array_range[1] + _width * 5, _width)
     h, edges = np.histogram(array, bins=_edges, density=True)
     h /= 100.
     # Averaging neighbouring edges yields the bin centers.
     return h, np.convolve(edges, [1, 1], 'valid') / 2
示例#26
0
def array2TH1F(array, bins, name='historam', title='histogram'):
    """Fill a ROOT ``TH1F`` with the values of ``array``.

    :param array: iterable of values to fill
    :param bins: bin edges; when empty, NumPy's 'auto' estimator
        (``numpy.histogram_bin_edges``) chooses them from ``array``
    :param name: ROOT object name
    :param title: histogram title
    :return: the filled ``ROOT.TH1F``
    """
    if len(bins) == 0:
        edges = np.histogram_bin_edges(array, 'auto')
    else:
        edges = np.array(bins, dtype=float)

    histo = ROOT.TH1F(name, title, len(edges) - 1, edges)
    for value in array:
        histo.Fill(value)
    return histo
示例#27
0
    def _init_numpy(
        self, obj, bins="auto", range=None, weights=None, threads=1, overflow=True
    ):
        """Fill ``_counts``, ``_edges`` and ``_errors`` from array-like data.

        :param obj: values to histogram; date-like input is converted with
            ``convert_dates`` and flagged in ``self._metadata``
        :param bins: bin count, explicit edges, an estimator name, or a
            ROOT-like ``"nbins,low,high"`` string
        :param range: optional ``(low, high)``; overwritten when ``bins`` is
            a ROOT-like string
        :param weights: optional per-entry weights
        :param threads: forwarded to ``histogramdd_wrapper``
        :param overflow: forwarded to ``histogramdd_wrapper``

        Errors are ``sqrt(sum of squared weights)`` when weights are given,
        otherwise ``sqrt(counts)``.
        """
        # convert ROOT-like "50,0,10" to equivalent of np.linspace(0,10,51)
        if isinstance(bins, str) and (bins.count(",") == 2):
            nbins, low, high = bins.split(",")
            range = (float(low), float(high))
            bins = int(nbins)

        if is_datelike(obj):
            obj = convert_dates(obj)
            self._metadata["date_axes"] = ["x"]

        if isinstance(bins, str):

            # if binning integers, binning choice is easy: one unit-width bin
            # per integer between the min and max
            if hasattr(obj, "dtype") and ("int" in str(obj.dtype)):
                mini, maxi = obj.min(), obj.max()
                bins = np.linspace(mini - 0.5, maxi + 0.5, maxi - mini + 2)
            else:
                bins = np.histogram_bin_edges(obj, bins, range)

        if weights is not None:
            # np.asarray copies only when needed. np.array(..., copy=False)
            # raises ValueError on NumPy 2 whenever a copy is required, for
            # example for list input.
            weights = np.asarray(weights)

        if is_datelike(bins):
            bins = convert_dates(bins)
            self._metadata["date_axes"] = ["x"]

        counts, (edges,) = histogramdd_wrapper(
            (obj,), (bins,), (range,), weights, overflow, threads,
        )

        if weights is not None:
            sumw2, _ = histogramdd_wrapper(
                (obj,), (bins,), (range,), weights ** 2, overflow, threads,
            )
            errors = sumw2 ** 0.5
        else:
            errors = counts ** 0.5

        self._counts = counts
        self._edges = edges
        self._errors = errors
示例#28
0
def digitize_star_along_stream(data, instream, bins=10):
    """Average ra/dec of stream stars in bins of phi1.

    Parameters
    ----------
    data: SkyCoord or (Q)Table
        source of ``phi1``, ``ra`` and ``dec``
    instream: array_like
        index / bool array selecting the stream stars
    bins: int
        number of phi1 bins; also the length of the output columns

    Returns
    -------
    df: (n, 3) ndarray
        columns ra, dec, dec_err; rows without data stay NaN

    Raises
    ------
    TypeError
        if ``data`` is neither a SkyCoord nor a (Q)Table

    Notes
    -----
    dec_err estimated as bin_width / sqrt(numpoints)
    works with output of select_stars_in_an_arm

    """
    if isinstance(data, SkyCoord):
        dataphi1 = data.phi1
        ra, dec = data.icrs.ra, data.icrs.dec
    elif isinstance(data, (Table, QTable)):
        dataphi1 = data["phi1"]
        ra, dec = data["ra"], data["dec"]
    else:
        raise TypeError("data is not a SkyCoord or (Q)Table")

    phi1_stream = dataphi1[instream]

    bin_edges = np.histogram_bin_edges(phi1_stream, bins=bins)
    # Without the last edge, the maximum falls into the final bin.
    binnums = np.digitize(phi1_stream, bin_edges[:-1])
    occupied = np.unique(binnums)

    print(len(phi1_stream), len(binnums))
    print(bins, len(occupied))

    avg_ra, avg_dec, avg_dec_err = (np.full(bins, np.nan) for _ in range(3))

    bin_widths = np.diff(bin_edges)
    ra_stream, dec_stream = ra[instream], dec[instream]
    for row, binnum in enumerate(occupied):
        members = binnums == binnum
        avg_ra[row] = ra_stream[members].mean().value
        avg_dec[row] = dec_stream[members].mean().value
        # NOTE(review): the width is looked up by row position, not by bin
        # number -- the two differ when a bin is empty; confirm intended.
        avg_dec_err[row] = bin_widths[row] / np.sqrt(members.sum())

    return np.c_[avg_ra, avg_dec, avg_dec_err]
示例#29
0
    def compute_2D_pspec(self):
        """Bin the 2-D power spectrum by |k| and return the binned average.

        After ``cosmo_FFT2`` and ``compute_k_2D`` have run, ``self.k_del`` is
        split into ``self.nbins`` equal-width bins. For every pixel ``(i, j)``
        of ``self.data1`` the wavenumber magnitude is computed from the pixel
        offset to the array centre, and the real part of ``self.ps_data`` is
        accumulated in the bin with ``edge[k] < |k| <= edge[k + 1]``. Bin
        averages are divided by ``Lx * Ly`` and stored in ``self.pk``; the
        lower bin edges are stored in ``self.kmodes``.

        :return: ``(kmodes[1:], pk[1:])`` -- the first bin is dropped
        """
        self.cosmo_FFT2()
        self.compute_k_2D()

        bin_edges = np.histogram_bin_edges(np.sort(self.k_del),
                                           bins=self.nbins)
        # one more edge than there are bins
        n_bins = len(bin_edges) - 1

        self.kmodes = bin_edges[:self.nbins]

        power_sum = np.zeros(n_bins)
        # number of pixels accumulated in each bin
        pixel_count = np.zeros_like(power_sum)

        n_rows, n_cols = self.data1.shape[0], self.data1.shape[1]
        half_rows, half_cols = n_rows / 2, n_cols / 2
        for i in range(n_rows):
            # pixel offset times the k spacing gives k in physical units
            kx = (i - half_rows) * self.delta_kx
            for j in range(n_cols):
                ky = (j - half_cols) * self.delta_ky
                kmag = np.sqrt((kx**2) + (ky**2))
                for k in range(n_bins):
                    if bin_edges[k] < kmag <= bin_edges[k + 1]:
                        power_sum[k] += np.real(self.ps_data[i, j])
                        pixel_count[k] += 1
                        break

        # NaN sums and empty bins become 0 / 1 so the division below is safe.
        unusable = np.isnan(power_sum) | (pixel_count == 0)
        power_sum[unusable] = 0
        pixel_count[unusable] = 1

        T_tilde = power_sum / pixel_count

        volume = self.Lx * self.Ly

        self.pk = T_tilde / volume  # [mk^2*Mpc^2]

        return self.kmodes[1:], self.pk[1:]
示例#30
0
def ece(probs, labels, n_bins=30):
    '''
    Compute a simple Expected Calibration Error (ECE).

    probs has shape [n_examples, n_classes]; labels holds one integer class
    index per example. Only the probability of the predicted (arg-max) class
    is used. Those probabilities are put into n_bins uniform bins on [0, 1];
    for bin i the average predicted probability p_i and the accuracy a_i are
    computed, and the result is the sum of |p_i - a_i| weighted by the share
    of examples falling into the bin.

    A probability of exactly 1.0 is placed in an extra bin after the last
    regular one.
    '''
    n_examples, n_classes = probs.shape
    rows = range(n_examples)

    # prediction = class with the highest probability
    preds = np.argmax(probs, axis=1)
    confidences = probs[rows, preds]

    # 1.0 where the prediction equals the label, else 0.0
    onehot_labels = np.eye(n_classes)[labels]
    accuracies = onehot_labels[rows, preds]

    # uniform bins on [0, 1]; the leading 0. edge is dropped so that only
    # upper edges remain
    bin_upper_edges = np.histogram_bin_edges([], bins=n_bins,
                                             range=(0., 1.))[1:]
    bin_of_example = np.digitize(confidences, bin_upper_edges)

    confidence_sums = np.bincount(bin_of_example,
                                  minlength=n_bins,
                                  weights=confidences).astype(np.float32)

    # eps keeps the divisions below finite for empty bins
    eps = np.finfo(confidence_sums.dtype).eps
    total_per_bin = np.bincount(bin_of_example, minlength=n_bins) + eps

    avg_prob_per_bin = confidence_sums / total_per_bin
    accuracies_per_bin = np.bincount(bin_of_example,
                                     weights=accuracies,
                                     minlength=n_bins) / total_per_bin
    bin_weight = total_per_bin / float(n_examples)

    calibration_gap = np.abs(accuracies_per_bin - avg_prob_per_bin)
    return np.sum(calibration_gap * bin_weight)