Example #1
def test_histogram_bin_range_raises(bins, hist_range):
    data = da.random.random(10, chunks=2)
    with pytest.raises(ValueError) as info:
        da.histogram(data, bins=bins, range=hist_range)
    err_msg = str(info.value)
    assert 'bins' in err_msg
    assert 'range' in err_msg
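This test takes bins and hist_range as arguments, so the original source almost certainly carries a pytest.mark.parametrize decorator that was dropped in extraction. A minimal sketch of such a parametrization (the parameter combinations below are illustrative assumptions, not copied from the source):

import dask.array as da
import pytest

# Hypothetical parametrization: da.histogram needs bins as an iterable,
# or an integer bins together with an explicit range.
@pytest.mark.parametrize("bins, hist_range", [
    (None, None),    # neither given
    (10, None),      # integer bins but no range
    (None, (0, 1)),  # range but no bins
])
def test_histogram_bin_range_raises(bins, hist_range):
    data = da.random.random(10, chunks=2)
    with pytest.raises(ValueError) as info:
        da.histogram(data, bins=bins, range=hist_range)
    assert 'bins' in str(info.value)
    assert 'range' in str(info.value)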
Example #3
def test_histogram_normed_deprecation():
    x = da.arange(10)
    with pytest.raises(ValueError) as info:
        da.histogram(x, bins=[1, 2, 3], normed=True)

    assert 'density' in str(info.value)
    assert 'deprecated' in str(info.value).lower()
Example #5
def test_histogram_extra_args_and_shapes():
    # Check for extra args and shapes
    bins = np.arange(0, 1.01, 0.01)
    v = da.random.random(100, chunks=10)
    data = [(v, bins, da.ones(100, chunks=v.chunks) * 5),
            (da.random.random(
                (50, 50), chunks=10), bins, da.ones((50, 50), chunks=10) * 5)]

    for v, bins, w in data:
        # normed (deprecated alias of density)
        assert_eq(
            da.histogram(v, bins=bins, normed=True)[0],
            np.histogram(v, bins=bins, normed=True)[0])

        # density
        assert_eq(
            da.histogram(v, bins=bins, density=True)[0],
            np.histogram(v, bins=bins, density=True)[0])

        # weights
        assert_eq(
            da.histogram(v, bins=bins, weights=w)[0],
            np.histogram(v, bins=bins, weights=w)[0])

        assert_eq(
            da.histogram(v, bins=bins, weights=w, density=True)[0],
            np.histogram(v, bins=bins, weights=w, density=True)[0])
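Aside from the test itself, a quick sketch of what density=True means for da.histogram: the returned values are normalized so that the histogram integrates to one over the bin widths, whereas plain counts sum to the number of samples.

import dask.array as da
import numpy as np

v = da.random.random(1000, chunks=100)
edges_in = np.linspace(0, 1, 11)

counts, edges = da.histogram(v, bins=edges_in)
dens, _ = da.histogram(v, bins=edges_in, density=True)

assert counts.sum().compute() == 1000                     # raw counts
widths = np.diff(edges)
assert abs((dens.compute() * widths).sum() - 1.0) < 1e-9  # integrates to 1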
Example #6
def _do_ctp_validation(data, adef, out_size, idxs):
    """ Calculate CTP validation (included in CTTH plot). """

    # detected ctth mask
    detected_clouds = da.logical_and(data['caliop_cma'] == 1,
                                     data['imager_cma'] == 1)
    detected_height = da.logical_and(detected_clouds,
                                     np.isfinite(data['imager_cth']))
    # find pps low and caliop low
    low_clouds_c = gc.get_calipso_low_clouds(data['caliop_cflag'])
    detected_low_c = np.logical_and(detected_height, low_clouds_c)
    low_clouds_pps = da.where(data['imager_ctp'] > 680., 1, 0)
    detected_low_pps = da.logical_and(detected_height, low_clouds_pps)

    # pattern: CALIOP_SEVIRI
    cld_cld_a = da.logical_and(detected_low_c == 1, detected_low_pps == 1)
    clr_cld_b = da.logical_and(detected_low_c == 0, detected_low_pps == 1)
    cld_clr_c = da.logical_and(detected_low_c == 1, detected_low_pps == 0)
    clr_clr_d = da.logical_and(detected_low_c == 0, detected_low_pps == 0)

    cld_cld_a = cld_cld_a.astype(np.int64)
    clr_cld_b = clr_cld_b.astype(np.int64)
    cld_clr_c = cld_clr_c.astype(np.int64)
    clr_clr_d = clr_clr_d.astype(np.int64)

    a, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=cld_cld_a,
                        density=False)
    b, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=clr_cld_b,
                        density=False)
    c, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=cld_clr_c,
                        density=False)
    d, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=clr_clr_d,
                        density=False)

    scu = ScoreUtils(a, b, c, d)
    scores = dict()
    scores['CTP low clouds POD'] = [
        scu.pod_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    scores['CTP low clouds FAR'] = [
        scu.far_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    scores['CTP low clouds POFD'] = [
        scu.pofd_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    # scores['Heidke low clouds'] = [scu.heidke().reshape(adef.shape),0, 1, 'rainbow']

    return scores
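The four da.histogram calls above implement a scatter-add: because idxs holds integer target-cell indices and bins=out_size with range=(0, out_size) puts exactly one cell in each bin, the weighted histogram sums each 0/1 mask per output grid cell. A self-contained sketch of the same trick on toy data (all names illustrative):

import dask.array as da
import numpy as np

out_size = 4                                  # number of target grid cells
idxs = da.from_array(np.array([0, 1, 1, 3, 3, 3]), chunks=3)
hits = da.from_array(np.array([1, 0, 1, 1, 0, 1]), chunks=3)

# One bin per cell: bin i covers [i, i+1), so this sums hits per cell.
per_cell, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                           weights=hits, density=False)
print(per_cell.compute())  # [1 1 0 2]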
Example #7
def do_ctp_validation(data, adef, out_size, idxs):
    """ Scores: low clouds detection """
    # detected ctth mask
    detected_clouds = da.logical_and(data['caliop_cma'] == 1,
                                     data['imager_cma'] == 1)
    detected_height = da.logical_and(detected_clouds,
                                     np.isfinite(data['imager_cth']))
    # find pps low and caliop low
    low_clouds_c = get_calipso_low_clouds(data['caliop_cflag'])
    detected_low_c = np.logical_and(detected_height, low_clouds_c)
    low_clouds_pps = da.where(data['imager_ctp'] > 680., 1, 0)
    detected_low_pps = da.logical_and(detected_height, low_clouds_pps)

    # pattern: CALIOP_SEVIRI
    cld_cld_a = da.logical_and(detected_low_c == 1, detected_low_pps == 1)
    clr_cld_b = da.logical_and(detected_low_c == 0, detected_low_pps == 1)
    cld_clr_c = da.logical_and(detected_low_c == 1, detected_low_pps == 0)
    clr_clr_d = da.logical_and(detected_low_c == 0, detected_low_pps == 0)

    cld_cld_a = cld_cld_a.astype(np.int64)
    clr_cld_b = clr_cld_b.astype(np.int64)
    cld_clr_c = cld_clr_c.astype(np.int64)
    clr_clr_d = clr_clr_d.astype(np.int64)

    a, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=cld_cld_a,
                        density=False)
    b, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=clr_cld_b,
                        density=False)
    c, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=cld_clr_c,
                        density=False)
    d, _ = da.histogram(idxs,
                        bins=out_size,
                        range=(0, out_size),
                        weights=clr_clr_d,
                        density=False)

    # n = a + b + c + d
    # n2d = N.reshape(adef.shape)

    # scores = [hitrate(a, d, n).reshape(adef.shape),
    # 0.7, 1, 'rainbow'] # hitrate low PPS
    pod_low = a / (a + c)
    far_low = c / (a + c)
    scores = dict()
    scores['POD low clouds'] = [pod_low.reshape(adef.shape), 0.2, 1, 'rainbow']
    scores['FAR low clouds'] = [far_low.reshape(adef.shape), 0.2, 1, 'rainbow']

    return scores
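For reference, with the contingency counts as laid out above (a = cloud/cloud hits, b = clear/cloud false alarms, c = cloud/clear misses, d = clear/clear correct negatives), the conventional scores are POD = a / (a + c) and FAR = b / (a + b). The line far_low = c / (a + c) therefore computes the miss rate, 1 - POD, rather than the textbook false alarm ratio; the ScoreUtils-based variant in Example #6 presumably wraps the standard definitions.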
Example #8
def plot_hist(size_data, shape_data, range=(0, 10), bins=100):
    size_hist, size_edges = da.histogram(size_data, bins=bins, range=range)
    shape_hist, shape_edges = da.histogram(shape_data, bins=80, range=(0, 0.9))
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(7, 4))
    ax1.semilogy(size_edges[:-1], size_hist.compute(), drawstyle='steps-mid')
    ax1.set_xlabel('FWHM/(2*sqrt(2*ln(2))) [pixels]')
    ax2.set_xlabel('Symmetry')
    ax2.semilogy(shape_edges[:-1], shape_hist.compute(), drawstyle='steps-mid')
    fig.savefig('size_symmetry_hist.png', format='png', dpi=300)
    plt.show()
Example #9
def coarsen(fmask):

    # rename
    v1min, v1max, dv1 = lon_bins[0], lon_bins[-1], dl
    v2min, v2max, dv2 = lat_bins[0], lat_bins[-1], dl
    i1max = int(np.rint((v1max - v1min) / dv1)) + 1
    i2max = int(np.rint((v2max - v2min) / dv2)) + 1

    # meshgrid lon/lat, note: need transposing
    fmask = fmask.to_dataset()
    fmask['lon'] = (1. * fmask['longitude'] +
                    0. * fmask['latitude']).transpose()
    fmask['lat'] = (0. * fmask['longitude'] +
                    1. * fmask['latitude']).transpose()
    # need rechunking
    fmask = fmask.chunk(chunks)

    def get_index(v1, v2):
        ''' This function provides the index of (v1,v2) coupled value position
            in the 2D histogram array
            '''
        i1 = np.maximum(np.floor((v1 - v1min) / dv1) + 1, 0)
        i1 = np.minimum(i1, i1max)
        i2 = np.maximum(np.floor((v2 - v2min) / dv2) + 1, 0)
        i2 = np.minimum(i2, i2max)
        return i1 + i2 * (i1max + 1)

    # sum QA over coarse grid cells
    v12 = da.map_blocks(get_index,
                        fmask['lon'].data,
                        fmask['lat'].data,
                        dtype='float')
    h, lbins = da.histogram(v12,
                            bins=np.arange(-.5,
                                           (i1max + 1) * (i2max + 1) + 0.5,
                                           1.),
                            weights=fmask['QA'].data)
    H = h.compute()
    # compute the number of points per grid cells
    hnorm, lbins = da.histogram(v12,
                                bins=np.arange(-.5,
                                               (i1max + 1) * (i2max + 1) + 0.5,
                                               1.))
    Hnorm = 1. * hnorm.compute()
    Hnorm[np.where(Hnorm == 0)] = np.nan
    # average the mask over coarse grid cells
    H = (H / Hnorm).reshape((i1max + 1, i2max + 1), order='F')

    cmask = xr.Dataset()
    #cmask['QA'] = (('longitude', 'latitude'), H[1:-1,1:-1].transpose())
    cmask['QA'] = (('longitude', 'latitude'), H[1:-1, 1:-1])
    cmask.coords['longitude'] = (('longitude'), lon_center)
    cmask.coords['latitude'] = (('latitude'), lat_center)

    return cmask
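coarsen reduces a 2D binning problem to 1D: get_index maps each (lon, lat) pair to one flat integer cell index, then a weighted histogram gives per-cell sums and an unweighted one gives per-cell counts, whose ratio is the per-cell mean. A minimal sketch of the same sum/count pattern on toy 1D data (names illustrative):

import dask.array as da
import numpy as np

# Average values per unit-width cell on a 4-cell grid.
pos = da.from_array(np.array([0.1, 0.2, 1.5, 2.9]), chunks=2)
values = da.from_array(np.array([1.0, 3.0, 5.0, 7.0]), chunks=2)

cell = da.floor(pos).astype(int)        # flat cell index per sample
edges = np.arange(-0.5, 4.0, 1.0)       # one bin per cell index 0..3

sums, _ = da.histogram(cell, bins=edges, weights=values)
counts, _ = da.histogram(cell, bins=edges)
mean = sums / da.where(counts == 0, np.nan, counts)
print(mean.compute())  # [2. 5. 7. nan]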
Example #10
def test_histogram():
    # Test for normal, flattened input
    n = 100
    v = da.random.random(n, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    (a1, b1) = da.histogram(v, bins=bins)
    (a2, b2) = np.histogram(v, bins=bins)

    # Check if the sum of the bins equals the number of samples
    assert a2.sum(axis=0) == n
    assert a1.sum(axis=0) == n
    assert_eq(a1, a2)
    assert same_keys(da.histogram(v, bins=bins)[0], a1)
Example #11
def test_histogram():
    # Test for normal, flattened input
    n = 100
    v = da.random.random(n, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    (a1, b1) = da.histogram(v, bins=bins)
    (a2, b2) = np.histogram(v, bins=bins)

    # Check if the sum of the bins equals the number of samples
    assert a2.sum(axis=0) == n
    assert a1.sum(axis=0) == n
    assert eq(a1, a2)
    assert same_keys(da.histogram(v, bins=bins)[0], a1)
Example #12
def _mask_sums_with_nan_if_not_skipna(self, skipna, data, out_size, sums):
    if not skipna:
        nans = np.isnan(data)
        nan_sums, _ = da.histogram(self.idxs[nans], bins=out_size,
                                   range=(0, out_size))
        sums = da.where(nan_sums > 0, np.nan, sums)
    return sums
Example #13
def process(input_path, pedestal_path, output_path):
    reader = TIOReader(input_path)
    wf_calib = WaveformCalibrator(
        pedestal_path, reader.n_pixels, reader.n_samples
    )

    wfs = get_da(reader, wf_calib)

    mean, std, mean_pix, std_pix, (hist, edges) = da.compute(
        wfs.mean(),
        wfs.std(),
        wfs.mean(axis=(0, 2)),
        wfs.std(axis=(0, 2)),
        da.histogram(wfs, bins=1000, range=(-10, 10))
    )

    np.savez(
        output_path,
        mean=mean,
        std=std,
        mean_pix=mean_pix,
        std_pix=std_pix,
        hist=hist,
        edges=edges
    )
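Bundling several reductions into one da.compute call, as above, lets dask share the underlying wfs blocks across all five outputs instead of re-reading the input once per statistic. A minimal sketch of the pattern, with no assumptions about TIOReader:

import dask.array as da

x = da.random.random((1000, 100), chunks=(100, 100))

# One graph evaluation yields all three results; each chunk of x is
# materialized once and reused by every reduction.
mean, std, (hist, edges) = da.compute(
    x.mean(),
    x.std(),
    da.histogram(x, bins=50, range=(0, 1)),
)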
Example #14
def van_hove_distinct(onset, frame, bins, box=None, use_dask=True, comp=False, bincount=True):
    r"""
    Compute the distinct part of the Van Hove autocorrelation function.

    .. math::
      G(r, t) = \sum_{i, j} \delta(|\vec r_i(0) - \vec r_j(t)| - r)
    """
    if box is None:
        box = onset.box.diagonal()
    dimension = len(box)
    N = len(onset)
    if use_dask:
        onset = darray.from_array(onset, chunks=(500, dimension)).reshape(1, N, dimension)
        frame = darray.from_array(frame, chunks=(500, dimension)).reshape(N, 1, dimension)
        dist = ((pbc_diff(onset, frame, box)**2).sum(axis=-1)**0.5).ravel()
        if np.diff(bins).std() < 1e-6:
            dx = bins[1] - bins[0]
            hist = darray.bincount((dist // dx).astype(int), minlength=(len(bins) - 1))
        else:
            hist = darray.histogram(dist, bins=bins)[0]
        return hist.compute() / N
    else:
        if comp:

            dx = bins[1] - bins[0]
            minlength = len(bins) - 1

            def f(x):
                d = (pbc_diff(x, frame, box)**2).sum(axis=-1)**0.5
                return np.bincount((d // dx).astype(int), minlength=minlength)[:minlength]
            hist = sum(f(x) for x in onset)
        else:
            dist = (pbc_diff(onset.reshape(1, -1, 3), frame.reshape(-1, 1, 3), box)**2).sum(axis=-1)**0.5
            hist = histogram(dist, bins=bins)[0]
        return hist / N
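The use_dask fast path exploits uniform bins: when every bin has the same width dx and binning starts at zero, the bin index of a value v is simply int(v // dx), so a bincount reproduces the histogram without a per-value edge search. A small NumPy-only check of that equivalence on toy data:

import numpy as np

rng = np.random.default_rng(0)
dist = rng.random(10000) * 5.0
bins = np.arange(0.0, 5.0 + 1e-9, 0.5)  # uniform bins starting at zero
dx = bins[1] - bins[0]

n = len(bins) - 1
via_bincount = np.bincount((dist // dx).astype(int), minlength=n)[:n]
via_histogram, _ = np.histogram(dist, bins=bins)
assert (via_bincount == via_histogram).all()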
Example #15
def get_hists(yearStr, monStr, varT, binVals, binWidth, maxValue):
        
        vars=[varT, 'seg_length', 'region_flag', 'ssh_flag']
        #if (monStr=='12'):
        #    dataOutPath=dataPathIS2+releaseStr+'/'+runStr+'/raw/'
        #else:
        #    dataOutPath=dataPathIS2+releaseStr+'/'+runStr+'/raw/'
        print(dataOutPath)
        dFbeams = cF.getProcessedATL10ShotdataNCDF(dataOutPath, yearStr=yearStr, ssh_mask=1, monStr=monStr, dayStr=dayStr, vars=vars, fNum=fNum, beamStr=beam)
        print('Got data')
        dFbeams=dFbeams.where(dFbeams[varT]>0.0, drop=True)
        dFbeams=dFbeams.where(dFbeams[varT]<30, drop=True)
        dFbeams=dFbeams.where(~np.isnan(dFbeams[varT]), drop=True)
        dFbeams=dFbeams.where(dFbeams.seg_length>4, drop=True)

        dFbeams=dFbeams.where(dFbeams.seg_length<200, drop=True)
        
        vals=dFbeams[varT][np.isin(dFbeams.region_flag, regions)]
        segs=dFbeams['seg_length'][np.isin(dFbeams.region_flag, regions)]

        weights=segs/segs.sum().values

        #counts[r, m]=vals.count().values
        meansT=(vals*segs).sum().values/segs.sum().values

        h, bins = da.histogram(vals.data, bins=size(binVals)-1, range=[0, maxValue], weights=weights.data)
        #histVals[m]=h
        histValsT=h.compute()

        return histValsT, meansT
Example #16
def compute_scaling(df,
                    region1,
                    region2=None,
                    dmin=int(1e1),
                    dmax=int(1e7),
                    n_bins=50):

    import dask.array as da

    if region2 is None:
        region2 = region1

    distbins = numutils.logbins(dmin, dmax, N=n_bins)
    areas = contact_areas(distbins, region1, region2)

    df = df[(df["pos1"] >= region1[0])
            & (df["pos1"] < region1[1])
            & (df["pos2"] >= region2[0])
            & (df["pos2"] < region2[1])]
    dists = (df["pos2"] - df["pos1"]).values

    if isinstance(dists, da.Array):
        obs, _ = da.histogram(dists[(dists >= dmin) & (dists < dmax)],
                              bins=distbins)
    else:
        obs, _ = np.histogram(dists[(dists >= dmin) & (dists < dmax)],
                              bins=distbins)

    return distbins, obs, areas
Example #17
def test_histogram_return_type():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Check if return type is same as hist
    bins = np.arange(0, 11, 1, dtype='i4')
    assert eq(da.histogram(v * 10, bins=bins)[0],
              np.histogram(v * 10, bins=bins)[0])
Example #18
File: common.py  Project: sfu-db/dataprep
def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""

    if isinstance(srs_dtype, Continuous):

        counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        centers = (edges[:-1] + edges[1:]) / 2

        return counts, centers, edges

    elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than the values_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)

        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
Example #19
def test_histogram_return_type():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Check if return type is same as hist
    bins = np.arange(0, 11, 1, dtype='i4')
    assert_eq(da.histogram(v * 10, bins=bins)[0],
              np.histogram(v * 10, bins=bins)[0])
Example #20
def calc_hist(srs: dd.Series, bins: int,
              orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum().compute() / len(srs) * 100, 1)

    data = srs.dropna().values
    minv, maxv = data.min().compute(), data.max().compute()

    hist_arr, bins_arr = da.histogram(data, range=[minv, maxv], bins=bins)
    hist_arr = hist_arr.compute()
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
        "pct": hist_arr / orig_df_len * 100,
    })
    return hist_df, miss_pct
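Unlike Example #13, this helper triggers several independent .compute() calls: one for the missing-value count, one each for the min and max, and one for the histogram, so the underlying column is scanned more than once. Routing them through a single dask.compute call would likely share those scans.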
Example #21
def test_histogram_bins_range_with_nan_array():
    # Regression test for issue #3977
    v = da.from_array(np.array([-2, np.nan, 2]), chunks=1)
    (a1, b1) = da.histogram(v, bins=10, range=(-3, 3))
    (a2, b2) = np.histogram(v, bins=10, range=(-3, 3))
    assert_eq(a1, a2)
    assert_eq(b1, b2)
Example #23
def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""

    if is_dtype(detect_dtype(srs, dtype), Continuous()):

        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        centers = (edges[:-1] + edges[1:]) / 2

        return counts, centers, edges

    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        # Dask array's unique is way slower than the values_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)

        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
Example #24
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values
    return data
Example #25
def calc_cat_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    bins: int,
    nrows: int,
    nuniq: Optional[dd.core.Scalar] = None,
) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    df
        groupby-count on the categorical column as a dataframe
    bins
        number of bins for the category length frequency histogram
    nrows
        number of rows before dropping null values
    nuniq
        number of unique values in the column
    """
    # pylint: disable=too-many-locals
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }
    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }
    # letter stats
    # computed on groupby-count:
    # compute the statistic for each group then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter, "len_hist": hist}
Example #26
def test_histogram_alternative_bins_range():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Other input
    (a1, b1) = da.histogram(v, bins=10, range=(0, 1))
    (a2, b2) = np.histogram(v, bins=10, range=(0, 1))
    assert eq(a1, a2)
    assert eq(b1, b2)
Example #28
def getHist2(imgs, bins=np.arange(-2, 20, 0.05)):
    """ get intensity histogram from a stack of imgs """

    if isinstance(imgs, dask.array.Array):
        H = da.histogram(imgs[da.isfinite(imgs)], bins=bins)[0]
        return H
    else:
        H = np.histogram(imgs[np.isfinite(imgs)], bins)[0]
        return np.asarray(H)
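A usage sketch for the helper above (assuming getHist2 and its imports are in scope): the dask branch returns a lazy array that the caller must still compute, while the NumPy branch returns concrete counts.

import dask.array as da
import numpy as np

imgs = da.from_array(np.array([[0.5, np.nan],
                               [np.inf, 1.5]]), chunks=1)
H = getHist2(imgs)             # lazy dask array; nothing computed yet
assert H.compute().sum() == 2  # only the two finite pixels are counted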
Example #29
def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size,
                                      statistic):
    if not skipna:
        nans = np.isnan(data)
        nan_bins, _ = da.histogram(self.idxs[nans],
                                   bins=out_size,
                                   range=(0, out_size))
        statistic = da.where(nan_bins > 0, np.nan, statistic)
    return statistic
Example #30
def test_histogram_extra_args_and_shapes():
    # Check for extra args and shapes
    bins = np.arange(0, 1.01, 0.01)
    v = da.random.random(100, chunks=10)
    data = [(v, bins, da.ones(100, chunks=v.chunks) * 5),
            (da.random.random((50, 50), chunks=10), bins, da.ones((50, 50), chunks=10) * 5)]

    for v, bins, w in data:
        # density
        assert_eq(da.histogram(v, bins=bins, density=True)[0],
                  np.histogram(v, bins=bins, density=True)[0])

        # weights
        assert_eq(da.histogram(v, bins=bins, weights=w)[0],
                  np.histogram(v, bins=bins, weights=w)[0])

        assert_eq(da.histogram(v, bins=bins, weights=w, density=True)[0],
                  np.histogram(v, bins=bins, weights=w, density=True)[0])
Example #31
File: analysis.py  Project: TCvanLeth/PyHAD
def histogram(a, **kwargs):
    y, bins = da.histogram(a.task, **kwargs)
    w = bins[1:] - bins[:-1]
    x = (bins[1:] + bins[:-1]) * 0.5

    x = Index(x, a.name, a.attrs)
    y = type(a)(y, coords=[x], name='number')
    w = type(a)(w, coords=[x], name='binwidth', attrs=a.attrs)
    return x, y, w
Example #32
    def get_sum(self, data, mask_all_nan=False):
        """Calculate sums for each bin with drop-in-a-bucket resampling.

        Parameters
        ----------
        data : Numpy or Dask array
        mask_all_nan : boolean (optional)
            Mask bins that have only NaN results, default: False

        Returns
        -------
        data : Numpy or Dask array
            Bin-wise sums in the target grid
        """
        LOG.info("Get sum of values in each location")
        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()
        # Remove NaN values from the data when used as weights
        weights = da.where(np.isnan(data), 0, data)

        # Rechunk indices to match the data chunking
        if weights.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, weights.chunks)

        # Calculate the sum of the data falling to each bin
        out_size = self.target_area.size
        sums, _ = da.histogram(self.idxs,
                               bins=out_size,
                               range=(0, out_size),
                               weights=weights,
                               density=False)

        if mask_all_nan:
            nans = np.isnan(data)
            nan_sums, _ = da.histogram(self.idxs[nans],
                                       bins=out_size,
                                       range=(0, out_size))
            counts = self.get_count().ravel()
            sums = da.where(nan_sums == counts, np.nan, sums)

        return sums.reshape(self.target_area.shape)
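Note how the mask_all_nan branch differs from the skipna helpers in Examples #12 and #29: here a target cell is blanked only when every sample landing in it was NaN (nan_sums == counts), whereas the skipna=False helpers blank a cell as soon as any contributing sample is NaN (nan_sums > 0).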
Example #33
def calc_cat_stats(srs: dd.Series,
                   bins: int,
                   nrows: int,
                   nuniq: Optional[dd.core.Scalar] = None) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    nrows
        number of rows before dropping null values
    bins
        number of bins for the category length frequency histogram
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq":
        nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }
    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }
    # letter stats
    letter = {
        "Count": srs.str.count(r"[a-zA-Z]").sum(),
        "Lowercase Letter": srs.str.count(r"[a-z]").sum(),
        "Space Separator": srs.str.count(r"[ ]").sum(),
        "Uppercase Letter": srs.str.count(r"[A-Z]").sum(),
        "Dash Punctuation": srs.str.count(r"[-]").sum(),
        "Decimal Number": srs.str.count(r"[0-9]").sum(),
    }

    return {
        "stats": stats,
        "len_stats": leng,
        "letter_stats": letter,
        "len_hist": hist
    }
Example #34
def save_histograms(df, h_list, wp_list, out_hdf):
    """
    Creates histograms for each working point and saves them into a dataframe
    """
    print("Saving histograms...")

    for h in h_list:

        ## For SumET this should only be plotted once!
        if h.name == "Tight_Final_SumET":
            hist, _ = da.histogram(
                df[h.name], bins=h.nbins, range=h.range, density=True
            )
            hist = hist.compute()
            hists = pd.DataFrame(
                data=hist,
                index=h.centers,
                columns=[h.name],
            )

        ## All other histograms are plotted per working point
        else:
            hists = []
            for wp in wp_list:
                hist, _ = da.histogram(
                    df[wp + "_" + h.name], bins=h.nbins, range=h.range, density=True
                )
                hists.append(hist.compute())

            hists = pd.DataFrame(
                data=np.vstack(hists).T,
                index=h.centers,
                columns=[wp + "_" + h.name for wp in wp_list],
            )

        ## Print and save
        key = h.name
        hists.to_hdf(out_hdf, h.name)
        print(" - " + key, "\n")
Example #35
def dasky_histogram(a, bins=10, **kwargs):
    """Enhanced histogram for dask arrays.
    The range keyword is ignored. Reads the data at most two times - once to
    determine best bins (if required), and second time to actually calculate
    the histogram.

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed
    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-Diaconis rule to determine bins
    Other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram. See `normed` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    See Also
    --------
    numpy.histogram,
    astroML.plotting.hist
    """
    if not isinstance(a, da.Array):
        raise TypeError('the given array has to be a dask.Array')
    if a.ndim != 1:
        a = a.flatten()

    # Check string bins first so array-valued bins are never compared to a str.
    if isinstance(bins, str):
        if bins == 'scotts':
            _, bins = dasky_scotts_bin_width(a, True)
        elif bins == 'freedman':
            _, bins = dasky_freedman_bin_width(a, True)
        else:
            raise ValueError("unrecognized bin code: '%s'" % bins)
    elif not np.iterable(bins):
        with ProgressBar():
            kwargs['range'] = da.compute(a.min(), a.max())

    h, bins = da.histogram(a, bins=bins, **kwargs)
    with ProgressBar():
        return h.compute(), bins
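dasky_scotts_bin_width is referenced but not shown. A hypothetical sketch of what a dask port of astroML's Scott's-rule helper could look like, assuming the standard rule of bin width 3.5 * sigma / n**(1/3) (an illustration, not the project's actual implementation):

import dask.array as da
import numpy as np

def dasky_scotts_bin_width(data, return_bins=False):
    # Scott's rule: near-optimal width for roughly Gaussian data.
    sigma, amin, amax = da.compute(data.std(), data.min(), data.max())
    n = data.size
    dx = 3.5 * sigma / n ** (1.0 / 3)
    if return_bins:
        nbins = max(1, int(np.ceil((amax - amin) / dx)))
        bins = amin + dx * np.arange(nbins + 1)
        return dx, bins
    return dx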
Example #36
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Computations for a numerical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    bins
        number of bins in the bar chart
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    ## if cfg.insight.missing_enable:
    data["npres"] = srs.shape[0]

    ## if cfg.insight.infinity_enable:
    is_inf_srs = srs.isin({np.inf, -np.inf})
    data["ninf"] = is_inf_srs.sum()

    # remove infinite values
    srs = srs[~is_inf_srs]

    ## if cfg.hist_enable or config.insight.uniform_enable or cfg.insight.normal_enable:
    ## bins = cfg.hist_bins
    data["hist"] = da.histogram(srs, bins=bins, range=[srs.min(), srs.max()])

    ## if cfg.insight.uniform_enable:
    data["chisq"] = chisquare(data["hist"][0])

    ## if cfg.insight.normal_enable
    data["norm"] = normaltest(data["hist"][0])

    ## if cfg.insight.negative_enable:
    data["nneg"] = (srs < 0).sum()

    ## if cfg.insight.skew_enabled:
    data["skew"] = skewtest(data["hist"][0])

    ## if cfg.insight.unique_enabled:
    data["nuniq"] = srs.nunique()

    ## if cfg.insight.zero_enabled:
    data["nzero"] = (srs == 0).sum()

    return data
Example #38
def hist1d_from_mpa_data(file_,
                         xchannel,
                         nxbins=1024,
                         chunk_size=TYPICAL_DASK_CHUNK):
    with h5py.File(file_, "r") as f:
        config = f["CFG"]
        xmin = 0
        try:
            xmax = config[xchannel].attrs["range"]
        except KeyError:
            xmax = INVALID_ADC_VALUE - 1
        events = f["EVENTS"]
        xdata = da.from_array(events[xchannel], chunks=chunk_size)
        binned, ex = da.histogram(xdata, nxbins, range=(xmin, xmax))
        with DaskProgressBar():
            binned = binned.compute()
    return binned, ex
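Note that binned.compute() runs inside the with h5py.File(...) block: da.from_array wraps the live h5py dataset without copying it, so the graph has to be executed before the file handle closes.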
Example #39
def test_histogram_alternative_bins_range():
    v = da.random.random(100, chunks=10)
    (a1, b1) = da.histogram(v, bins=10, range=(0, 1))
    (a2, b2) = np.histogram(v, bins=10, range=(0, 1))
    assert_eq(a1, a2)
    assert_eq(b1, b2)