Пример #1
0
def bin_edges_f(bin_method, mags_cols_cl):
    '''
    Build the histogram bin edges of every photometric dimension of the
    cluster region.

    Parameters
    ----------
    bin_method : str
        Name of the binning rule: one of the numpy estimators ('auto', 'fd',
        'doane', 'scott', 'rice', 'sturges', 'sqrt'), or 'fixed', 'knuth',
        'blocks', 'man'.
    mags_cols_cl : sequence
        'mags_cols_cl[0]' holds the magnitude arrays and 'mags_cols_cl[1]'
        the color arrays.

    Returns
    -------
    list
        One array of edges per dimension: all magnitudes first, then all
        colors (in the same order in which they are read). The list is empty
        when 'bin_method' is none of the names above.
    '''
    def _all_dims(mag_rule, col_rule):
        # Magnitudes go first, colors after, each group in its input order.
        edges = [mag_rule(mag) for mag in mags_cols_cl[0]]
        edges.extend(col_rule(col) for col in mags_cols_cl[1])
        return edges

    def _histogram_rule(bins):
        # 'bins' is either a numpy estimator name or a number of bins.
        return lambda arr: np.histogram(arr, bins=bins)[1]

    def _width_rule(width):
        # Bins of (roughly) the given width, never fewer than two of them.
        return lambda arr: np.histogram(
            arr, bins=int(max(2, (max(arr) - min(arr)) / width)))[1]

    def _knuth_rule(arr):
        return knuth_bin_width(arr, return_bins=True, quiet=True)[1]

    if bin_method in (
            'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt'):
        by_name = _histogram_rule(bin_method)
        return _all_dims(by_name, by_name)

    if bin_method == 'fixed':
        # Based on Bonatto & Bica (2007) 377, 3, 1301-1323 but using larger
        # values than theirs (0.25 for colors and 0.5 for magnitudes): here
        # the widths are 1 for magnitudes and 0.5 for colors.
        return _all_dims(_width_rule(1.), _width_rule(.5))

    if bin_method == 'knuth':
        return _all_dims(_knuth_rule, _knuth_rule)

    if bin_method == 'blocks':
        return _all_dims(bayesian_blocks, bayesian_blocks)

    # TODO this method is currently hidden from the params file.
    # To be used when #325 is implemented. Currently used to test
    # multi-dimensional likelihoods.
    #
    # For 4 to 6 dimensions the rule below appears to be a somewhat reasonable
    # rule of thumb for the number of bins for each dimension.
    # There is a trade-off between many small bins, which better match
    # features of the observed cluster but benefit larger mass values, and
    # fewer large bins, which better match masses but lose the finer details
    # of the cluster.
    if bin_method == 'man':
        d = len(mags_cols_cl[0]) + len(mags_cols_cl[1])
        b_num = [15, 10, 7][d - 4]
        by_count = _histogram_rule(int(b_num))
        return _all_dims(by_count, by_count)

    return []
Пример #2
0
def test_knuth_bin_width(N=10000, rseed=0):
    """Exercise ``knuth_bin_width`` on ``N`` Gaussian draws from a seeded
    ``RandomState``: pinned number of edges, consistent width with and
    without ``return_bins``, and rejection of two-dimensional input."""
    generator = np.random.RandomState(rseed)
    sample = generator.randn(N)

    width, edges = knuth_bin_width(sample, return_bins=True)
    assert_allclose(len(edges), 59)

    # Asking for the width alone must give the very same number.
    width_only = knuth_bin_width(sample)
    assert width == width_only

    # Only one-dimensional data is accepted.
    with pytest.raises(ValueError):
        knuth_bin_width(generator.rand(2, 10))
Пример #3
0
def test_knuth_bin_width(N=10000, rseed=0):
    """Exercise ``knuth_bin_width`` on ``N`` Gaussian draws from a seeded
    ``default_rng`` generator: pinned number of edges, consistent width with
    and without ``return_bins``, and rejection of two-dimensional input."""
    generator = np.random.default_rng(rseed)
    sample = generator.standard_normal(N)

    width, edges = knuth_bin_width(sample, return_bins=True)
    assert_allclose(len(edges), 58)

    # Asking for the width alone must give the very same number.
    width_only = knuth_bin_width(sample)
    assert width == width_only

    # Only one-dimensional data is accepted.
    with pytest.raises(ValueError):
        knuth_bin_width(generator.random((2, 10)))
Пример #4
0
def histogram(a, bins=10, range=None, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specifying how bins are computed, the parameters are the same
    as numpy.histogram().

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram, as returned by numpy.histogram().
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.

    See Also
    --------
    numpy.histogram
    astroML.plotting.hist
    """
    a = np.asarray(a)

    # Only a string can name a rule. Checking the type first keeps
    # array-valued `bins` out of the string comparisons, where the
    # element-wise result has no single truth value.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'scotts', 'freedman'):
            raise ValueError("unrecognized bin code: '{}'".format(bins))

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            a = a[(a >= range[0]) & (a <= range[1])]

        if bins == 'blocks':
            bins = astropy_stats.bayesian_blocks(a)
        elif bins == 'knuth':
            bins = astropy_stats.knuth_bin_width(a, True)[1]
        elif bins == 'scotts':
            bins = astropy_stats.scott_bin_width(a, True)[1]
        else:
            bins = astropy_stats.freedman_bin_width(a, True)[1]

    return np.histogram(a, bins, range, **kwargs)
Пример #5
0
def knuth_bw_selector(dat_list):
    """Selects the kde bandwidth using Knuth's rule implemented in Astropy
    If Knuth's rule raises an error, Scott's rule is used

    Parameters
    ----------
    dat_list : list
        List of data arrays that will be used to generate a kde

    Returns
    -------
    bw_mean : float
        Mean of the bandwidths obtained for the data arrays in dat_list
    """

    bw_list = []
    for dat in dat_list:
        try:
            bw = astrostats.knuth_bin_width(dat)
        except Exception:
            # Fall back on Scott's rule for any failure of Knuth's rule, but
            # let KeyboardInterrupt / SystemExit propagate (a bare ``except``
            # would swallow them too).
            print('Using Scott Rule!!')
            bw = astrostats.scott_bin_width(dat)
        bw_list.append(bw)
    return np.mean(bw_list)
        
Пример #6
0
 def f(x, mode):
     """Return histogram bin edges for ``x``.

     ``mode == "knuth"`` uses Knuth's rule from astropy; any other value is
     handed to ``np.histogram_bin_edges`` as its ``bins`` argument.
     """
     if mode != "knuth":
         return np.histogram_bin_edges(x, bins=mode)

     # https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html
     from astropy.stats import knuth_bin_width
     return knuth_bin_width(x, return_bins=True)[1]
Пример #7
0
    def cmd_hist(self, upper, lower, colour, cmd_colour):
        """Draw a histogram of ``colour`` on a new figure.

        The bins have the width given by Knuth's rule and run from the
        smallest value up past the largest one. ``cmd_colour`` is used as
        the plot title; ``upper`` and ``lower`` are not used here.
        """
        plt.figure()

        width = stats.knuth_bin_width(colour)
        # Adding one width to the stop value keeps the maximum inside the
        # last bin.
        edges = np.arange(min(colour), max(colour) + width, width)

        plt.hist(colour, bins=edges, label='Binned Data')
        plt.title(cmd_colour)
Пример #8
0
def prepObsMass(obs_mass, bin_edges):
    """
    Obtain the histogram of the observed cluster masses.

    Parameters
    ----------
    obs_mass : array_like
        Observed masses.
    bin_edges : str, int or array_like
        'knuth' selects Knuth's rule and 'block' selects Bayesian blocks to
        define the edges. Any other value is passed unchanged as the 'bins'
        argument of np.histogram.

    Returns
    -------
    list
        [bin_edges, cl_histo]: the edges used and the counts per bin.
    """
    # Only a string can name a rule. Comparing an array of edges against a
    # string is element-wise and has no single truth value.
    if isinstance(bin_edges, str):
        if bin_edges == 'knuth':
            bin_edges = knuth_bin_width(
                obs_mass, return_bins=True, quiet=True)[1]
        elif bin_edges == 'block':
            bin_edges = bayesian_blocks(obs_mass)

    # Obtain histogram for observed cluster.
    cl_histo, bin_edges = np.histogram(obs_mass, bins=bin_edges)

    return [bin_edges, cl_histo]
Пример #9
0
def get_bin_sizes_xy(x, y, algo='scott'):
    """Get data-driven bin edges and widths for two variables, to lower the
    bias due to binning.

    Parameters
    ----------
    x, y : array-like
        Data to be binned; each one is processed independently.
    algo : str
        Rule used for both variables: 'scott', 'knuth' or 'freedman'.

    Returns
    -------
    bins_x, bins_y : ndarray
        Bin edges for ``x`` and ``y``.
    width_x, width_y : float
        Bin widths for ``x`` and ``y``.

    Raises
    ------
    NotImplementedError
        If ``algo`` is not one of the supported rules.
    """
    from astropy.stats import freedman_bin_width, scott_bin_width, knuth_bin_width
    logger.info(" > Get smart bin sizes in 2D")

    rules = {
        'scott': scott_bin_width,
        'knuth': knuth_bin_width,
        'freedman': freedman_bin_width,
    }
    rule = rules.get(algo) if isinstance(algo, str) else None
    if rule is None:
        raise NotImplementedError("use scott, knuth or freedman")

    logger.info("use %s rule of thumb", algo)
    width_x, bins_x = rule(x, return_bins=True)
    width_y, bins_y = rule(y, return_bins=True)

    return bins_x, bins_y, width_x, width_y
Пример #10
0
def calc_bin(data, mode):
    """Return the bin count suggested by the binning rule named ``mode``.

    Parameters
    ----------
    data : sequence
        Data to be binned; must support ``len``.
    mode : str
        One of "sqrt", "sturges", "scott", "freedman" or "knuth".

    Returns
    -------
    float or int
        "sqrt" and "sturges" return the raw (non-rounded) value of the
        formula. The other modes return the length of the array of bin edges
        given by the corresponding ``*_bin_width`` function.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the names above.
    """
    n = len(data)
    if mode == "sqrt":
        return np.sqrt(n)
    if mode == "sturges":
        return np.log2(n) + 1

    if mode == "scott":
        rule = scott_bin_width
    elif mode == "freedman":
        rule = freedman_bin_width
    elif mode == "knuth":
        rule = knuth_bin_width
    else:
        # Fail with a clear message instead of an UnboundLocalError at the
        # return statement.
        raise ValueError("unrecognized binning mode: {!r}".format(mode))

    # NOTE(review): this counts edges, which is one more than the number of
    # bins -- confirm that callers expect that.
    width, edges = rule(data, return_bins=True)
    return len(edges)
Пример #11
0
    def cmd_kde(self, upper, lower, colour):
        """Show a normalised histogram of ``colour`` with a Gaussian KDE
        drawn on top.

        The histogram bins have the width given by Knuth's rule; the same
        number is handed to ``gaussian_kde`` as ``bw_method``. The KDE is
        evaluated on 500 points spanning the data range extended by 1.0 on
        each side. ``upper`` and ``lower`` are not used here.
        """
        plt.figure()

        print(colour)
        width = stats.knuth_bin_width(colour)
        edges = np.arange(min(colour), max(colour) + width, width)
        grid = np.linspace(colour.min() - 1.0, colour.max() + 1.0, 500)
        # NOTE(review): a scalar ``bw_method`` is taken by scipy as the KDE
        # factor, not as an absolute bandwidth -- confirm that passing the
        # bin width here is intended.
        density = gaussian_kde(colour, bw_method=width)

        plt.plot(grid, density(grid), 'k', lw=2, label='KDE')
        plt.hist(colour, bins=edges, density=True, label='Binned Data')
        plt.legend()
        plt.xlabel('$(J-K)_0$')
        plt.ylabel('Normalised Density')
        plt.show()
Пример #12
0
def plotHistWithKnuth(data, axis, x_label=""):
    """
    Plot a normalised histogram of the data, binned with Knuth's rule, with
    the best-fit normal distribution drawn on top.

    data: data to be plotted. It must be a pandas Series.
    axis: matplotlib axis instance to draw on.
    x_label: label for the x axis. The default "" means that the name of the
        series is used.
    """
    from scipy.stats import norm

    # Obtain the bins using Knuth's rule
    dx, bins = knuth_bin_width(data, return_bins=True)

    # Plot the bins
    axis.hist(data, bins, density=True)
    # Obtain the gaussian distribution that best fits the data.
    mu, sigma = norm.fit(data)
    x = np.linspace(round(data.min()), round(data.max()), 100)  # points to draw
    y = norm.pdf(x, mu, sigma)  # value of the gaussian at those points
    axis.plot(x, y, 'r--', linewidth=2)

    # Print some valuable info
    axis.text(1.0,
              1.15,
              'Número total de trayectorias = ' + str(data.count()),
              verticalalignment='top',
              horizontalalignment='right',
              transform=axis.transAxes,
              fontsize=20)

    # The backslashes are escaped explicitly: '\m' and '\s' are invalid
    # escape sequences, while '\n' must remain a real line break.
    axis.text(1.0,
              0.9,
              '$\\mu =${0:.3f} \n $\\sigma =${1:.3f}'.format(mu, sigma),
              verticalalignment='top',
              horizontalalignment='right',
              transform=axis.transAxes,
              fontsize=20)
    # Use the given axis name or, by default, the name of the data series
    if x_label:
        axis.set_xlabel(x_label)
    else:
        axis.set_xlabel(data.name)
    axis.grid()

    axis.set_ylabel('Número de trayectorias (normalizado)')
Пример #13
0
def knuth_bin_width(data, return_bins=False, disp=True):
    r"""Compute the optimal histogram bin width with Knuth's rule [1]_

    This is a thin wrapper: the work is delegated to
    ``astropy_stats.knuth_bin_width``.

    Parameters
    ----------
    data : array-like, ndim=1
        observed (one-dimensional) data
    return_bins : bool (optional)
        if True, then return the bin edges
    disp : bool (optional)
        accepted in the signature but not used by this wrapper

    Returns
    -------
    dx : float
        optimal bin width. Bins are measured starting at the first data point.
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal number of bins is the value M which maximizes the function

    .. math::
        F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2})
        - M\log\Gamma(\frac{1}{2})
        - \log\Gamma(\frac{2n+M}{2})
        + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2})

    where :math:`\Gamma` is the Gamma function, :math:`n` is the number of
    data points, :math:`n_k` is the number of measurements in bin :math:`k`.

    References
    ----------
    .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms".
           arXiv:0605197, 2006

    See Also
    --------
    KnuthF
    freedman_bin_width
    scotts_bin_width
    """
    result = astropy_stats.knuth_bin_width(data, return_bins)
    return result
Пример #14
0
def knuth_bin_width(data, return_bins=False, disp=True):
    r"""Compute the optimal histogram bin width with Knuth's rule [1]_

    This is a thin wrapper: the work is delegated to
    ``astropy_stats.knuth_bin_width``.

    Parameters
    ----------
    data : array-like, ndim=1
        observed (one-dimensional) data
    return_bins : bool (optional)
        if True, then return the bin edges
    disp : bool (optional)
        accepted in the signature but not used by this wrapper

    Returns
    -------
    dx : float
        optimal bin width. Bins are measured starting at the first data point.
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal number of bins is the value M which maximizes the function

    .. math::
        F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2})
        - M\log\Gamma(\frac{1}{2})
        - \log\Gamma(\frac{2n+M}{2})
        + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2})

    where :math:`\Gamma` is the Gamma function, :math:`n` is the number of
    data points, :math:`n_k` is the number of measurements in bin :math:`k`.

    References
    ----------
    .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms".
           arXiv:0605197, 2006

    See Also
    --------
    KnuthF
    freedman_bin_width
    scotts_bin_width
    """
    result = astropy_stats.knuth_bin_width(data, return_bins)
    return result
Пример #15
0
def get_bin_sizes_x(x, algo='scott'):
    """Get data-driven bin edges and widths for one variable, to lower the
    bias due to binning.

    Parameters
    ----------
    x : array-like
        Data to be binned.
    algo : str
        Rule to use: 'scott', 'knuth', 'freedman' or 'blocks'.

    Returns
    -------
    bins_x : ndarray
        Bin edges.
    width_x : float or ndarray
        Bin width. For 'blocks' the bins have unequal widths, so this is the
        array of differences between consecutive edges.

    Raises
    ------
    NotImplementedError
        If ``algo`` is not one of the supported rules.
    """
    from astropy.stats import freedman_bin_width, scott_bin_width, knuth_bin_width, bayesian_blocks
    logger.info(" > Get smart bin sizes in 1D")

    if algo == 'scott':
        logger.info("use scott rule of thumb")
        width_x, bins_x = scott_bin_width(x, return_bins=True)
    elif algo == 'knuth':
        logger.info("use knuth rule of thumb")
        width_x, bins_x = knuth_bin_width(x, return_bins=True)
    elif algo == 'freedman':
        logger.info("use freedman rule of thumb")
        width_x, bins_x = freedman_bin_width(x, return_bins=True)
    elif algo == 'blocks':
        logger.info("use bayesian blocks rule of thumb")
        # bayesian_blocks() has no ``return_bins`` option and returns only
        # the edges, so the widths are derived from them here.
        bins_x = bayesian_blocks(x)
        width_x = bins_x[1:] - bins_x[:-1]
    else:
        raise NotImplementedError("use scott, knuth, freedman or blocks")

    return bins_x, width_x
Пример #16
0
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specifying how bins are computed, the parameters are the same
    as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scott' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins
        ('knuths', 'scotts' and 'freedmans' are accepted as aliases.)

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified,
        then the current active axes will be used.

    **kwargs :
        other keyword arguments are described in pylab.hist(). `weights` is
        dropped, with a warning, when `bins` is a string.

    Returns
    -------
    Whatever ``ax.hist`` returns.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.
    """
    # NOTE(review): extra positional arguments (*args) are accepted but are
    # not forwarded to ax.hist -- confirm whether that is intended.
    named_rule = isinstance(bins, str)

    if named_rule and "weights" in kwargs:
        warnings.warn("weights argument is not supported: it will be ignored.")
        kwargs.pop('weights')

    x = np.asarray(x)

    if 'ax' in kwargs:
        ax = kwargs.pop('ax')
    else:
        # import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        ax = plt.gca()

    # Only a string can name a rule. Checking the type first keeps
    # array-valued `bins` out of the string comparisons, where the
    # element-wise result has no single truth value.
    if named_rule:
        if bins not in ('blocks',
                        'knuth', 'knuths',
                        'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '{}'".format(bins))

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            dx, bins = knuth_bin_width(x, True)
        elif bins in ('scott', 'scotts'):
            dx, bins = scott_bin_width(x, True)
        else:
            dx, bins = freedman_bin_width(x, True)

    return ax.hist(x, bins, range, **kwargs)
Пример #17
0
def match(dataCm):
    """Performs the Match calculation in Eq. 1 of Breivik & Larson (2018)

    Parameters
    ----------
    dataCm : list
        List of two cumulative data sets for a single parameter

    Returns
    -------
    match : float
        log10(1 - overlap) of the two normalised histograms, where the
        overlap is sum(h1 * h2) / sqrt(sum(h1**2) * sum(h2**2)); 1e-9 when
        the bin width is below 1e-7

    binwidth : float
        Binwidth of histograms used for match computation
    """
    # COMPUTE THE BINWIDTH FOR THE MOST COMPLETE DATA SET:
    # NOTE: THIS WILL BE THE BINWIDTH FOR BOTH HISTOGRAMS
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", message="divide by zero encountered in double_scalars")
        try:
            bw, binEdges = astroStats.knuth_bin_width(np.array(dataCm[0]),
                                                      return_bins=True)
        except Exception:
            # Knuth's rule failed: fall back on Scott's rule.
            bw, binEdges = astroStats.scott_bin_width(np.array(dataCm[0]),
                                                      return_bins=True)
    # Do not let the bins get narrower than 1e-4.
    if bw < 1e-4:
        bw = 1e-4
        binEdges = np.arange(binEdges[0], binEdges[-1], bw)

    # BIN THE DATA:
    histo1 = astroStats.histogram(dataCm[0], bins=binEdges, density=True)[0]
    histo2 = astroStats.histogram(dataCm[1], bins=binEdges, density=True)[0]

    # COMPUTE THE MATCH:
    numerator = np.sum(histo1 * histo2)
    norm1 = np.sum(histo1 * histo1)
    norm2 = np.sum(histo2 * histo2)

    binwidth = binEdges[1] - binEdges[0]
    if binwidth < 1e-7:
        # Return the plain scalar: indexing it (as was done before) raised
        # a TypeError in this branch.
        match_value = 1e-9
    else:
        match_value = np.log10(1 - numerator / np.sqrt(norm1 * norm2))

    return match_value, binwidth
Пример #18
0
 def knuth_bandwidth_determination(self, bw_selection='min'):
     """Set ``self.bw`` from the Knuth bin widths of ``self.data``.

     One width is computed for each row of ``self.data.T``; the widths are
     then reduced with the ndarray method named by ``bw_selection``
     (min, max or mean).
     """
     widths = []
     for data_set in self.data.T:
         widths.append(knuth_bin_width(data_set))
     bandwidths = np.asarray(widths)

     reduce_widths = getattr(bandwidths, bw_selection)
     self.bw = reduce_widths()
     return None
Пример #19
0
    def calc_bins_intervals(self, nbins=101, precision=None):
        r"""
        Calculate histogram bins and the matching intervals for each axis in
        `self._gb_axes`.

        The result is stored in `self._categoricals` as a tuple of
        `(axis, pd.CategoricalIndex)` pairs; nothing is returned.

        nbins: int, str, array-like
            If int, use np.histogram to calculate the bin edges.
            If str and nbins == "knuth", use `astropy.stats.knuth_bin_width`
            to calculate optimal bin widths.
            If str and nbins != "knuth", use `np.histogram(data, bins=nbins)`
            to calculate bins.
            If array-like, treat as bins.
            An iterable whose length equals the number of axes is read as one
            bin specification per axis; any other value is used for every
            axis.

        precision: int or None
            Number of decimals the bin edges are rounded to before the
            intervals are built. If None, default to 5.
        """
        data = self.data
        bins = {}
        intervals = {}

        if precision is None:
            precision = 5

        gb_axes = self._gb_axes

        if isinstance(nbins, (str, int)) or (hasattr(nbins, "__iter__")
                                             and len(nbins) != len(gb_axes)):
            # Single parameter for `nbins`: use it for every axis.
            nbins = {k: nbins for k in gb_axes}

        elif len(nbins) == len(gb_axes):
            # Passed one bin spec per axis
            nbins = {k: v for k, v in zip(gb_axes, nbins)}

        else:
            msg = f"Unrecognized `nbins`\ntype: {type(nbins)}\n bins:{nbins}"
            raise ValueError(msg)

        for k in self._gb_axes:
            b = nbins[k]
            # Numpy and Astropy don't like NaNs when calculating bins.
            # Infinities in bins (typically from log10(0)) also create problems.
            d = data.loc[:, k].replace([-np.inf, np.inf], np.nan).dropna()

            # String specs are matched case-insensitively.
            if isinstance(b, str):
                b = b.lower()

            if isinstance(b, str) and b == "knuth":
                # Referencing the name raises NameError when it was never
                # defined in this module.
                try:
                    assert knuth_bin_width
                except NameError:
                    raise NameError("Astropy is unavailable.")

                dx, b = knuth_bin_width(d, return_bins=True)

            else:
                try:
                    b = np.histogram_bin_edges(d, b)
                except MemoryError:
                    # Clip the extremely large values and extremely small outliers.
                    lo, up = d.quantile([0.0005, 0.9995])
                    b = np.histogram_bin_edges(d.clip(lo, up), b)
                except AttributeError:
                    # Fall back on np.histogram and keep only its edges.
                    c, b = np.histogram(d, b)

            # The edges must be unique and free of NaNs.
            assert np.unique(b).size == b.size
            try:
                assert not np.isnan(b).any()
            except TypeError:
                assert not b.isna().any()

            b = b.round(precision)

            # One right-closed interval per pair of consecutive edges.
            zipped = zip(b[:-1], b[1:])
            i = [pd.Interval(*b0b1, closed="right") for b0b1 in zipped]

            bins[k] = b
            #             intervals[k] = pd.IntervalIndex(i)
            intervals[k] = pd.CategoricalIndex(i)

        # NOTE(review): `bins` is built here but is neither stored nor
        # returned in this method -- confirm whether it should be kept.
        bins = tuple(bins.items())
        intervals = tuple(intervals.items())
        #         self._intervals = intervals
        self._categoricals = intervals
Пример #20
0
def splitEnv(cluster, turn_off, isoch_phot, low_env_perc=50):
    """
    Split the stars in 'cluster' into single and binary systems, according
    to whether each star lies closer to the lower envelope of the (rotated)
    sequence or to the binary envelope derived from it.

    Returns the cluster (unchanged), the tuple (l_envelope, b_envelope) of
    interpolated envelopes, and the boolean masks of single and binary
    systems.

    TODO

    1. implement iterative outliers removal for the estimation of the MSRL
    2. don't use binary envelope. Instead use the following method:
      - estimate the MSRL
      - divide it in (rotated) magnitude bins
      - for each bin, count how many members there are below the MSRL
      - estimate the
    """

    # Rotation angle estimated from the best fit isochrone, applied around
    # the maximum of each dimension
    theta = rotIsoch(turn_off, isoch_phot)
    origin = (cluster[0].max(), cluster[1].max())
    cluster_rot = rotate(theta, cluster.T, origin).T
    rot_0, rot_1 = cluster_rot[0], cluster_rot[1]

    # Edges along the rotated sequence (Knuth's rule), without those in the
    # brightest portion
    knuth_edges = knuth_bin_width(rot_1, return_bins=True, quiet=True)[1]
    knuth_edges = knuth_edges[knuth_edges > np.percentile(rot_1, .1)]
    # Add resolution to the low mass region: the last bin is replaced by
    # five evenly spaced edges
    bin_edges = list(knuth_edges[:-2]) + list(
        np.linspace(knuth_edges[-2], knuth_edges[-1], 5))

    # Lower envelope: for every non-empty bin, the 'low_env_perc' percentile
    # of the first rotated dimension, paired with the middle of the bin
    lower_env_rot = []
    for low, high in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (rot_1 > low) & (rot_1 <= high)
        if in_bin.sum() > 0:
            lower_env_rot.append([
                np.percentile(rot_0[in_bin], low_env_perc),
                (low + high) * .5])

    # Rotate the lower envelope back to its original position
    lower_env = rotate(-theta, lower_env_rot, origin).T

    # Extend the envelope to lower magnitudes: fit a line to the last three
    # points and extrapolate it by 1 mag
    slope_icpt = np.polyfit(lower_env[0][-3:], lower_env[1][-3:], deg=1)
    x_ext = lower_env[0][-1] + 1
    y_ext = np.polyval(slope_icpt, x_ext)
    lower_env = np.array([list(lower_env[0]) + [x_ext],
                          list(lower_env[1]) + [y_ext]])

    # Binary envelope: the envelope magnitude combined with itself
    mag_l, col_l = lower_env
    mag_binar = clusterHandle.mag_combine(mag_l, mag_l)

    # Generate extra points
    l_envelope = isochHandle.interp(lower_env)
    b_envelope = isochHandle.interp(np.array([mag_binar, col_l]))

    # Minimum distance to each envelope, for all the stars
    min_dist_l = cdist(cluster.T, l_envelope.T).min(1)
    min_dist_b = cdist(cluster.T, b_envelope.T).min(1)

    # If delta_d>0 then min_dist_l>min_dist_b, and the star is closer to the
    # binary sequence
    delta_d = min_dist_l - min_dist_b

    # Split systems
    binar_msk = delta_d >= 0
    single_msk = ~binar_msk

    return cluster, (l_envelope, b_envelope), single_msk, binar_msk
Пример #21
0
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specifying how bins are computed, the parameters are the same
    as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scott' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins
        ('knuths', 'scotts' and 'freedmans' are accepted as aliases.)

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified,
        then the current active axes will be used.

    **kwargs :
        other keyword arguments are described in pylab.hist(). `weights` is
        dropped, with a warning, when `bins` is a string.

    Returns
    -------
    Whatever ``ax.hist`` returns.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.
    """
    # NOTE(review): extra positional arguments (*args) are accepted but are
    # not forwarded to ax.hist -- confirm whether that is intended.
    named_rule = isinstance(bins, str)

    if named_rule and "weights" in kwargs:
        warnings.warn("weights argument is not supported: it will be ignored.")
        kwargs.pop('weights')

    x = np.asarray(x)

    if 'ax' in kwargs:
        ax = kwargs.pop('ax')
    else:
        # import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        ax = plt.gca()

    # Only a string can name a rule. Checking the type first keeps
    # array-valued `bins` out of the string comparisons, where the
    # element-wise result has no single truth value.
    if named_rule:
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '%s'" % bins)

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            dx, bins = knuth_bin_width(x, True, disp=False)
        elif bins in ('scott', 'scotts'):
            dx, bins = scott_bin_width(x, True)
        else:
            dx, bins = freedman_bin_width(x, True)

    return ax.hist(x, bins, range, **kwargs)
Пример #22
0
def bin_edges_f(bin_method,
                mags_cols_cl,
                lkl_manual_bins=None,
                nbins=None,
                min_bins=2,
                max_bins=50):
    """
    Obtain bin edges for each photometric dimension using the cluster region
    diagram. The 'bin_edges' list will contain all magnitudes first, and then
    all colors (in the same order in which they are read).

    Parameters
    ----------
    bin_method : str
        Binning rule: a numpy estimator name ('auto', 'fd', 'doane', 'scott',
        'rice', 'sturges', 'sqrt'), or 'optm', 'fixed', 'knuth', 'blocks',
        'blocks-max', 'manual', 'man'. Any other value gives an empty list.
    mags_cols_cl : sequence
        'mags_cols_cl[0]' holds the magnitude arrays and 'mags_cols_cl[1]'
        the color arrays.
    lkl_manual_bins : sequence, optional
        Used by 'manual' only: element 0 is the number of bins for every
        magnitude, element i + 1 the number of bins for the i-th color.
    nbins : int, optional
        Used by 'optm' only: number of bins for the colors; magnitudes use
        twice as many.
    min_bins, max_bins : int
        Minimum and maximum number of bins (cells) allowed per dimension.

    Returns
    -------
    list
        One array of bin edges per dimension.
    """
    bin_edges = []
    if bin_method in ('auto', 'fd', 'doane', 'scott', 'rice', 'sturges',
                      'sqrt'):

        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=bin_method)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=bin_method)[1])

    elif bin_method == 'optm':
        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=nbins * 2)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=nbins)[1])

    elif bin_method == 'fixed':
        # Based on Bonatto & Bica (2007) 377, 3, 1301-1323 but using larger
        # values than those used there (0.25 for colors and 0.5 for magnitudes)
        for mag in mags_cols_cl[0]:
            b_num = int(round(max(2, (max(mag) - min(mag)) / 1.)))
            bin_edges.append(np.histogram(mag, bins=b_num)[1])
        for col in mags_cols_cl[1]:
            b_num = int(round(max(2, (max(col) - min(col)) / .5)))
            bin_edges.append(np.histogram(col, bins=b_num)[1])

    elif bin_method == 'knuth':
        for mag in mags_cols_cl[0]:
            bin_edges.append(
                knuth_bin_width(mag, return_bins=True, quiet=True)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(
                knuth_bin_width(col, return_bins=True, quiet=True)[1])

    elif bin_method == 'blocks':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for mag in mags_cols_cl[0]:
                bin_edges.append(bayesian_blocks(mag))
            for col in mags_cols_cl[1]:
                bin_edges.append(bayesian_blocks(col))

    elif bin_method == 'blocks-max':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for mag in mags_cols_cl[0]:
                bin_edges.append(slpitArr(bayesian_blocks(mag)))
            for col in mags_cols_cl[1]:
                bin_edges.append(slpitArr(bayesian_blocks(col), 1.))

    elif bin_method == 'manual':
        for mag in mags_cols_cl[0]:
            bin_edges.append(
                np.histogram(mag, bins=int(lkl_manual_bins[0]))[1])
        for i, col in enumerate(mags_cols_cl[1]):
            bin_edges.append(
                np.histogram(col, bins=int(lkl_manual_bins[i + 1]))[1])

    # TODO this method is currently hidden from the params file.
    # To be used when #325 is implemented. Currently used to test
    # multi-dimensional likelihoods.
    #
    # For 4 to 6 dimensions the rule below appears to be a somewhat reasonable
    # rule of thumb for the number of bins for each dimension.
    # There is a trade-off between a large number of smaller bins which
    # better match features of the observed cluster but benefits larger
    # mass values, and fewer larger bins which better match masses but losing
    # finer details of the cluster.
    elif bin_method == 'man':
        d = len(mags_cols_cl[0]) + len(mags_cols_cl[1])
        b_num = [15, 10, 7][d - 4]
        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=int(b_num))[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=int(b_num))[1])

    # Keep the number of cells per dimension within [min_bins, max_bins].
    # The number of bins is the number of edges minus 1, so N bins need
    # N + 1 edges.
    for i, be in enumerate(bin_edges):
        if len(be) - 1 < min_bins:
            # print("  WARNING too few bins in histogram, use 'min_bins'")
            be = bin_edges[i] = np.linspace(be[0], be[-1], min_bins + 1)
        if len(be) - 1 > max_bins:
            # print("  WARNING too many bins in histogram, use 'max_bins'")
            bin_edges[i] = np.linspace(be[0], be[-1], max_bins + 1)

    return bin_edges
Пример #23
0
def _draw_samples(input_data, n_events, resamples, dist_name):
    """Build the nominal sample and its resamples for ``dist_name``.

    The global NumPy random state is re-seeded with a fixed, per-distribution
    seed, so repeated calls with the same arguments draw the same numbers.

    Returns:
        ``(data_nom, resample_list, do_log)``: the nominal 1-D sample, a 2-D
        array with one resample per row, and whether the histograms should
        use a logarithmic axis.
    """
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        return data_nom, resample_list, False

    if dist_name == '2LP':
        np.random.seed(33)
        # (sampler, sampler parameters, fraction of n_events): two Laplace
        # peaks on top of a uniform component.
        components = (
            (np.random.laplace, {'loc': 90, 'scale': 5}, 0.65),
            (np.random.laplace, {'loc': 110, 'scale': 1.5}, 0.25),
            (np.random.uniform, {'low': 80, 'high': 120}, 0.10),
        )
        data_nom = np.concatenate([
            draw(size=int(n_events * frac), **params)
            for draw, params, frac in components
        ])
        resample_list = np.concatenate([
            draw(size=(resamples, int(n_events * frac)), **params)
            for draw, params, frac in components
        ], axis=1)
        return data_nom, resample_list, False

    # Every other name samples from ``input_data``; only the seed differs.
    np.random.seed({'jPT': 11, 'DY': 200}.get(dist_name, 1))
    data_nom = np.random.choice(input_data, size=n_events, replace=False)
    resample_list = np.random.choice(input_data,
                                     size=(resamples, n_events),
                                     replace=True)
    return data_nom, resample_list, True


def _binwidth_hist(data, bins, ax, do_log, alpha=0.5):
    """Draw a filled, bin-width-scaled histogram of ``data`` on ``ax``."""
    return skh_plt.hist(x=data,
                        histtype='stepfilled',
                        bins=bins,
                        errorbars=False,
                        alpha=alpha,
                        log=do_log,
                        scale='binwidth',
                        err_type='gaussian',
                        ax=ax)


def _mean_resample_errors(resample_list, hists):
    """Return, for each histogram, the mean ``err_li`` over all resamples."""
    per_hist = [[] for _ in hists]
    for sample in resample_list:
        for errs, hist in zip(per_hist, hists):
            errs.append(err_li(sample, hist))
    return [np.mean(errs) for errs in per_hist]


def comp_study(input_data,
               n_events,
               xlims=None,
               resamples=100,
               dist_name='2Gauss',
               bb_dir=None):
    """Compare nine binning rules on one sample and save two summary plots.

    A nominal sample and ``resamples`` resamples are drawn, the nominal sample
    is histogrammed with each rule on a 3x3 grid, and every rule is scored
    with ``rough`` (plotted as W_n, "Wiggles") and with the resample-averaged
    ``err_li`` (plotted as E-hat, "Average Error"). The scores are shown as a
    scatter plot and as a stacked ranking bar chart.

    Args:
        input_data: Values to sample from with ``np.random.choice``. Not used
            when ``dist_name`` is ``'Gauss'`` or ``'2LP'``, which are
            generated synthetically.
        n_events: Size of the nominal sample and of each resample.
        xlims: Optional sequence; when truthy, the first and last Bayesian
            Blocks edges are replaced by ``xlims[0]`` and ``xlims[-1]``.
        resamples: Number of resamples used for the average error.
        dist_name: ``'Gauss'``, ``'2LP'``, ``'jPT'`` or ``'DY'``; any other
            value (including the default) samples ``input_data`` with seed 1.
        bb_dir: Project root; the plots are written to
            ``<bb_dir>/plots/bin_comp``. Defaults to the original author's
            checkout location.

    Returns:
        None. Writes ``hists_<dist_name>_<n_events>.pdf`` and
        ``metric_<dist_name>_<n_events>.pdf``.
    """
    if bb_dir is None:
        bb_dir = '/Users/brianpollack/Coding/BayesianBlocks'
    plot_dir = os.path.join(bb_dir, 'plots', 'bin_comp')

    data_nom, resample_list, do_log = _draw_samples(input_data, n_events,
                                                    resamples, dist_name)

    fig_hist, axes_hist = plt.subplots(3,
                                       3,
                                       sharex=True,
                                       sharey=False,
                                       constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    axes_hist[1][0].set_ylabel('Entries/Bin Width', fontsize=20)

    _, bk = knuth_bin_width(data_nom, return_bins=True)

    # (panel title, bins, axes, extra keyword arguments for rough).
    # NOTE(review): rough() receives plot=False for some rules and its own
    # default for the others; that split is kept exactly as it was -- confirm
    # whether it is intentional.
    rules = [
        ('Sturges', 'sturges', axes_hist[0][0], {'plot': False}),
        ('Doane', 'doane', axes_hist[0][1], {}),
        ('Scott', 'scott', axes_hist[0][2], {}),
        ('Freedman Diaconis', 'fd', axes_hist[1][0], {}),
        ('Knuth', bk, axes_hist[1][1], {'plot': False}),
        ('Rice', 'rice', axes_hist[1][2], {}),
        ('Sqrt(N)', 'sqrt', axes_hist[2][0], {'plot': False}),
    ]

    hists_bw = []
    for title, bins, ax, _ in rules:
        ax.set_title(title)
        hists_bw.append(_binwidth_hist(data_nom, bins, ax, do_log))

    hists = [np.histogram(data_nom, bins=rule[1]) for rule in rules]
    r_list = [rough(bw, **rule[3]) for bw, rule in zip(hists_bw, rules)]
    # Errors on the nominal sample are collected alongside the other metrics
    # but are not plotted below.
    elis_list = [err_li(data_nom, hist) for hist in hists]
    avg_eli_list = _mean_resample_errors(resample_list, hists)

    # Equal-population bins, tuned against the seven fixed rules above.
    ax_ep = axes_hist[2][1]
    ax_ep.set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = _binwidth_hist(data_nom, bep, ax_ep, do_log)
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = np.mean([err_li(sample, hist_ep)
                          for sample in resample_list])

    # Bayesian Blocks; the optimizer sees only the seven fixed rules, so the
    # equal-population scores are appended to the lists afterwards.
    ax_bb = axes_hist[2][2]
    ax_bb.set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = _binwidth_hist(data_nom, bb, ax_bb, do_log, alpha=1)
    hist_bb = np.histogram(data_nom, bins=bb)
    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = np.mean([err_li(sample, hist_bb)
                          for sample in resample_list])

    r_list.extend([r_ep, r_bb])
    avg_eli_list.extend([avg_eli_ep, avg_eli_bb])
    elis_list.extend([eli_ep, eli_bb])

    # Save through the figure object: the helpers called above are defined
    # elsewhere and could leave a different figure current.
    fig_hist.savefig(
        os.path.join(plot_dir, f'hists_{dist_name}_{n_events}.pdf'))

    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]

    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    ax_scatter, ax_rank = axes_metric

    for label, avg_err, wiggles in zip(xs, avg_eli_list, r_list):
        if label == 'BB':
            # Bayesian Blocks is highlighted with a large black star.
            ax_scatter.scatter(avg_err,
                               wiggles,
                               label=label,
                               s=400,
                               marker='*',
                               c='k')
        else:
            ax_scatter.scatter(avg_err, wiggles, label=label, s=200)
    ax_scatter.set_ylabel(r'$W_n$ (Wiggles)')
    ax_scatter.set_xlabel(r'$\hat{E}$ (Average Error)')
    ax_scatter.grid()
    ax_scatter.legend(ncol=1, bbox_to_anchor=(1.05, 1.15), loc='upper left')
    ax_scatter.set_title(f'{dist_name} Distribution, N={n_events}',
                         fontsize=22)

    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')

    # Stacked bars: roughness rank at the bottom, error rank on top. The last
    # bar (BB) is drawn fully opaque.
    bars = ax_rank.bar(xs,
                       rank_rough,
                       0.35,
                       label=r'$W_n$ Ranking',
                       alpha=0.5)
    bars[-1].set_alpha(1)
    bars = ax_rank.bar(xs,
                       rank_avg_eli,
                       0.35,
                       bottom=rank_rough,
                       label=r'$\hat{E}$ Ranking',
                       alpha=0.5)
    bars[-1].set_alpha(1)
    ax_rank.legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    ax_rank.set_xlabel('Binning Method')
    ax_rank.set_ylabel('Rank')
    fig_metric.savefig(
        os.path.join(plot_dir, f'metric_{dist_name}_{n_events}.pdf'))