Example #1
def similarity_threshold(fine_image_t0):  # , st_dev):
    fine_image_t0 = da.where(fine_image_t0 == 0, np.nan, fine_image_t0)
    st_dev = da.nanstd(fine_image_t0, axis=1)  # new
    sim_threshold = st_dev * 2 / numberClass
    print("Done similarity threshold!", sim_threshold)

    return sim_threshold
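A minimal usage sketch for the snippet above (hedged: it assumes numberClass is a module-level integer, as the function implies, and that zero pixels mark no-data):

import numpy as np
import dask.array as da

numberClass = 4  # assumed module-level global used inside similarity_threshold
fine_image_t0 = da.from_array(np.random.rand(100, 100), chunks=(50, 50))
threshold = similarity_threshold(fine_image_t0)   # lazy result
print(threshold.compute()[:5])                    # one threshold per row (std taken along axis=1)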
Example #2
def _hotspots_dask_numpy(raster, kernel):

    # apply kernel to raster values
    mean_array = convolve_2d(raster.data, kernel / kernel.sum())

    # calculate z-scores
    global_mean = da.nanmean(raster.data)
    global_std = da.nanstd(raster.data)

    # zero-division check commented out because it would trigger an early compute of global_std
    # if global_std == 0:
    #     raise ZeroDivisionError(
    #         "Standard deviation of the input raster values is 0."
    #     )

    z_array = (mean_array - global_mean) / global_std

    _func = partial(_calc_hotspots_numpy)
    pad_h = kernel.shape[0] // 2
    pad_w = kernel.shape[1] // 2

    out = z_array.map_overlap(_func,
                              depth=(pad_h, pad_w),
                              boundary=np.nan,
                              meta=np.array(()))
    return out
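The core of the example above is standardising the smoothed raster against global NaN-aware statistics. A stripped-down sketch of just that step, leaving out the kernel convolution and the map_overlap plumbing:

import numpy as np
import dask.array as da

raster = da.from_array(np.random.rand(500, 500), chunks=(128, 128))
global_mean = da.nanmean(raster)
global_std = da.nanstd(raster)
z = (raster - global_mean) / global_std   # still lazy; nothing has been computed yet
print(z[:2, :2].compute())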
Example #3
File: utils.py Project: TNonet/lmdec
def get_array_moments(
        array: da.core.Array,
        mean: bool = True,
        std: bool = True,
        std_method: str = 'binom',
        axis: int = 0
) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """ Computes specified array_moments

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from
    mean : bool
        Flag whether to compute mean of "array" along "axis"
    std : bool
        Flag whether to compute std of "array" along "axis"
    std_method : str
        Method used to compute standard deviation.

        Possible methods are:
            'norm'  ==> Normal distribution standard deviation. See np.std
            'binom' ==> Binomial standard deviation:
                        sqrt(2*p*(1-p)), where p = "mean"/2
    axis : int
        Axis to compute mean and std along.

    Returns
    -------
    array_mean : da.core.array, optional
        If "mean" is false, returns None
        Otherwise returns the array mean
    array_std: da.core.array, optional
        If "std" is false, returns None
        Otherwise returns the array std
    """
    array_mean = None
    array_std = None

    if mean:
        array_mean = da.nanmean(array, axis=axis)

    if std:
        if std_method == 'binom':
            u = array_mean if mean else da.nanmean(array, axis=axis)
            u /= 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented ')

    array_mean, array_std = persist(array_mean, array_std)

    return array_mean, array_std
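A hedged usage sketch for get_array_moments, assuming a genotype-style matrix coded 0/1/2 so the binomial estimate sqrt(2*p*(1-p)) with p = mean/2 is meaningful:

import numpy as np
import dask.array as da

genotypes = da.from_array(
    np.random.randint(0, 3, size=(1000, 50)).astype(float), chunks=(250, 50))
col_mean, col_std = get_array_moments(
    genotypes, mean=True, std=True, std_method='binom', axis=0)
print(col_mean.compute()[:5])
print(col_std.compute()[:5])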
Example #4
 def _calculate_summary_statistics(self):
     data = self._lazy_data()
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
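The pattern worth copying here is that all seven reductions are handed to a single da.compute call, so Dask can share intermediates and walk the data once instead of seven times. A standalone sketch of the same idea:

import numpy as np
import dask.array as da

data = da.from_array(np.random.rand(1000, 1000), chunks=(250, 250))
raveled = data.ravel()
mean, std, median = da.compute(
    da.nanmean(data), da.nanstd(data), da.percentile(raveled, [50]))
print(mean, std, median)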
Example #5
File: lazy.py Project: mwalls/hyperspy
 def _calculate_summary_statistics(self):
     data = self._lazy_data()
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #6
def dasky_scotts_bin_width(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    freedman_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)

    dx = 3.5 * sigma * 1. / (n**(1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
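A hedged usage sketch for the helper above (Scott's rule, delta = 3.5 * sigma / n**(1/3)):

import numpy as np
import dask.array as da

samples = da.from_array(np.random.normal(size=100000), chunks=10000)
width, edges = dasky_scotts_bin_width(samples, return_bins=True)
print(width, len(edges))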
Example #7
def dasky_scotts_bin_width(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    freedman_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)

    dx = 3.5 * sigma * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
Example #8
def test_nan():
    x = np.array([[1, np.nan, 3, 4], [5, 6, 7, np.nan], [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(np.nanprod(x), da.nanprod(d))
Example #9
def test_nan():
    x = np.array([[1, np.nan, 3, 4], [5, 6, 7, np.nan], [9, 10, 11, 12]])
    d = da.from_array(x, blockshape=(2, 2))

    assert eq(np.nansum(x), da.nansum(d))
    assert eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert eq(np.nanvar(x), da.nanvar(d))
    assert eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    with ignoring(AttributeError):
        assert eq(np.nanprod(x), da.nanprod(d))
Example #10
 def _calculate_summary_statistics(self, rechunk=True):
     if rechunk is True:
         # Use dask auto rechunk instead of HyperSpy's one, what should be
         # better for these operations
         rechunk = "dask_auto"
     data = self._lazy_data(rechunk=rechunk)
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #11
File: lazy.py Project: woozey/hyperspy
 def _calculate_summary_statistics(self, rechunk=True):
     if rechunk is True:
         # Use dask auto rechunk instead of HyperSpy's one, what should be
         # better for these operations
         rechunk = "dask_auto"
     data = self._lazy_data(rechunk=rechunk)
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #12
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(nanprod(x), da.nanprod(d))
Example #13
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, blockshape=(2, 2))

    assert eq(np.nansum(x), da.nansum(d))
    assert eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert eq(np.nanvar(x), da.nanvar(d))
    assert eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    with ignoring(AttributeError):
        assert eq(np.nanprod(x), da.nanprod(d))
Example #14
def _scott_bw_dask(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.

    """
    if not isinstance(data, da.Array):
        raise TypeError("Expected a dask array")

    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)
    dx = 3.5 * sigma * n**(-1.0 / 3.0)
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = max(1, np.ceil((mx - mn) / c_dx))
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
Example #15
def test_make_snp_array_case_normal(shape, threshold):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr,
                                     mean=True,
                                     std=True,
                                     std_method='norm',
                                     dtype='float')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
Example #16
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
Example #17
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2, ))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
Example #18
def test_make_snp_array_case_normal(shape, max_value, mask_nans):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.randint(0, max_value, size=shape)
    if mask_nans:
        arr[arr == max_value - 1] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr,
                                     mean=True,
                                     std=True,
                                     std_method='norm',
                                     mask_nan=mask_nans,
                                     dtype='int8')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
Example #19
def xsapr_clutter(files,
                  clutter_thresh_min=0.0002,
                  clutter_thresh_max=1.5,
                  radius=1,
                  write_radar=True,
                  out_file=None,
                  use_dask=False):
    """
    X-SAPR Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for X-SAPR clutter calculation.

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold above which values are considered clutter, as long as
        they are also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold below which values are considered clutter, as long as
        they are also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will
        also be flagged as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        Location and filename to write the radar object to,
        if write_radar is True.
    use_dask : bool
        Use dask instead of running statistics for the calculation
        (good for running in parallel).
    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated.
        This radar only has the clutter field, but maintains all
        other radar specifications.

    """
    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(file)
            reflect_array = deepcopy(radar.fields['reflectivity']['data'])
            del radar

            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except TypeError:
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields['reflectivity']['data']
                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                    run_stats.push(reflect_array)
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except TypeError:
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
    else:
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields['reflectivity']['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except TypeError:
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [
            delayed(get_reflect_array)(file, first_shape) for file in files
        ]
        array = [
            da.from_delayed(a, shape=first_shape, dtype=float) for a in arrays
        ]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)

    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and(clutter_values > clutter_thresh_min,
                       clutter_values < clutter_thresh_max))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_radar.add_field('xsapr_clutter',
                            clutter_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
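The dask branch above follows a reusable pattern: wrap a per-file loader in dask.delayed, promote each result with da.from_delayed, stack the volumes, and reduce with the NaN-aware functions. A stripped-down sketch of that pattern; load_volume and the .npy paths are hypothetical stand-ins for pyart.io.read and the radar files:

import numpy as np
import dask.array as da
from dask import delayed

def load_volume(path, shape):
    """Hypothetical loader: return an array of `shape`, or NaNs on failure."""
    try:
        return np.load(path)
    except OSError:
        return np.full(shape, np.nan)

shape = (360, 500)
paths = ['vol0.npy', 'vol1.npy', 'vol2.npy']
stack = da.stack(
    [da.from_delayed(delayed(load_volume)(p, shape), shape=shape, dtype=float)
     for p in paths], axis=0)
mean = np.array(da.nanmean(stack, axis=0))
stdev = np.array(da.nanstd(stack, axis=0))
clutter_values = stdev / mean   # same ratio used by xsapr_clutter above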
Example #20
File: ds.py Project: elaeon/ML
    def stadistics(self):
        headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max", "nonzero", "nonan", "unique", "dtype"]
        self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
        table = []
        for group, (dtype, _) in self.dtypes.fields.items():
            values = dict()
            values["dtype"] = dtype
            values["group"] = group
            darray = self.data[group].da
            if dtype == np.dtype(float) or dtype == np.dtype(int):
                da_mean = da.around(darray.mean(), decimals=3)
                da_std = da.around(darray.std(), decimals=3)
                da_min = da.around(darray.min(), decimals=3)
                da_max = da.around(darray.max(), decimals=3)
                result = dask.compute([da_mean, da_std, da_min, da_max])[0]
                values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
                values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
                values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
                values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
                if len(self.shape[group]) == 1:
                    da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                    result = da_percentile.compute()
                    values["25%"] = result[0]
                    values["50%"] = result[1]
                    values["75%"] = result[2]
                else:
                    values["25%"] = "-"
                    values["50%"] = "-"
                    values["75%"] = "-"
                values["nonzero"] = da.count_nonzero(darray).compute()
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                values["unique"] = "-"
            else:
                values["mean"] = "-"
                values["std dev"] = "-"
                values["min"] = "-"
                values["max"] = "-"
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
                values["nonzero"] = "-"
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
                values["unique"] = vunique

            row = []
            for column in headers:
                row.append(values[column])
            table.append(row)

        print("# rows {}".format(self.shape[0]))
        return tabulate(table, headers)
Example #21
def tall_clutter(files,
                 config,
                 clutter_thresh_min=0.0002,
                 clutter_thresh_max=0.25,
                 radius=1,
                 write_radar=True,
                 out_file=None,
                 use_dask=False):
    """
    Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for the clutter calculation.
    config : str
        String representing the configuration for the radar.
        Such possible configurations are listed in default_config.py

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold above which values are considered clutter, as long as
        they are also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold below which values are considered clutter, as long as
        they are also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will
        also be flagged as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        Location and filename to write the radar object to,
        if write_radar is True.
    use_dask : bool
        Use dask instead of running statistics for the calculation. This
        will reduce run time.

    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated.
        This radar only has the clutter field, but maintains all
        other radar specifications.

    """
    field_names = get_field_names(config)
    refl_field = field_names["reflectivity"]
    vel_field = field_names["velocity"]
    ncp_field = field_names["normalized_coherent_power"]

    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(
                file, include_fields=[refl_field, ncp_field, vel_field])
            reflect_array = deepcopy(radar.fields[refl_field]['data'])
            ncp = radar.fields[ncp_field]['data']
            height = radar.gate_z["data"]
            up_in_the_air = height > 2000.0
            the_mask = np.logical_or.reduce(
                (ncp < 0.8, reflect_array.mask, up_in_the_air))
            reflect_array = np.ma.masked_where(the_mask, reflect_array)
            del radar
            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except (TypeError, OSError):
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields[refl_field]['data']
                ncp = deepcopy(radar.fields[ncp_field]['data'])
                #reflect_array = np.ma.masked_where(ncp < 0.7, reflect_array)

                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                    run_stats.push(reflect_array)
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except (TypeError, OSError):
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values_no_mask = clutter_values.filled(clutter_thresh_max + 1)
    else:
        cluster = LocalCluster(n_workers=20, processes=True)
        client = Client(cluster)
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields[refl_field]['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except (TypeError, OSError):
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [
            delayed(get_reflect_array)(file, first_shape) for file in files
        ]
        array = [
            da.from_delayed(a, shape=first_shape, dtype=float) for a in arrays
        ]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        count = np.array(da.sum(da.isfinite(array), axis=0))
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values = np.ma.masked_where(
            np.logical_or(clutter_values.mask, count < 20), clutter_values)
        # Masked arrays can suck
        clutter_values_no_mask = clutter_values.filled(
            (clutter_thresh_max + 1))

    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and.reduce((
            clutter_values_no_mask > clutter_thresh_min,
            clutter_values_no_mask < clutter_thresh_max,
        )))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_array = clutter_array.filled(0)
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_value_dict = _clutter_to_dict(clutter_values)
    clutter_value_dict["long_name"] = "Clutter value (std. dev/mean Z)"
    clutter_value_dict["standard_name"] = "clutter_value"
    clutter_radar.add_field('ground_clutter',
                            clutter_dict,
                            replace_existing=True)
    clutter_radar.add_field('clutter_value',
                            clutter_value_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
Example #22
 def read_geno(bfile, freq_thresh, threads, check=False, max_memory=None,
               usable_snps=None, normalize=False, prefix='my_geno',
               thinning=None):
     chunks = (10000, 10000)
     # set Cache to protect memory spilling
     if max_memory is not None:
         available_memory = max_memory
     else:
         available_memory = psutil.virtual_memory().available
     cache = Chest(available_memory=available_memory)
     (bim, fam, g) = read_plink(bfile)  # read the files using pandas_plink
     g_std = da.nanstd(g, axis=1)
     if check:
         with ProgressBar():
             print('Removing invariant sites')
             idx = (g_std != 0).compute(cache=cache)
         g = g[idx, :]
         bim = bim[idx].copy().reset_index(drop=True)
         bim.i = bim.index.tolist()
         g_std = g_std[idx]
         del idx
         gc.collect()
     if usable_snps is not None:
         print('Restricting genotype to user specified variants')
         idx = sorted(bim[bim.snp.isin(usable_snps)].i.values)
         g = g[idx, :]
         bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
         bim.i = bim.index.tolist()
     mafs = g.sum(axis=1) / (2 * g.shape[0]) if freq_thresh > 0 else None
     # Filter MAF
     if freq_thresh > 0:
         print('Filtering MAFs smaller than', freq_thresh)
         print('    Genotype matrix shape before', g.shape)
         assert freq_thresh < 0.5
         good = (mafs < (1 - float(freq_thresh))) & (mafs > float(
             freq_thresh))
         with ProgressBar():
             with dask.config.set(pool=ThreadPool(threads)):
                 good, mafs = dask.compute(good, mafs, cache=cache)
         g = g[good, :]
         print('    Genotype matrix shape after', g.shape)
         bim = bim[good]
         bim['mafs'] = mafs[good]
         bim.reset_index(drop=True, inplace=True)
         bim.i = bim.index.tolist()
         del good
         gc.collect()
     if not is_transposed(g, bim.shape[0], fam.shape[0]):
         g = g.T
     if normalize:
         print('Normalizing to mean 0 and sd 1')
         mean = da.nanmean(g.T, axis=1)
         g = (g - mean) / g_std
     if thinning is not None:
         print("Thinning genotype to %d variants" % thinning)
         idx = np.linspace(0, g.shape[1], num=thinning, dtype=int,
                           endpoint=False)
         bim = bim.reindex(index=idx)
         g = g[:, idx].rechunk('auto')
         bim['i'] = range(thinning)
     h5 = '%s.hdf5' % prefix
     if not os.path.isfile(h5):
         with ProgressBar(), h5py.File(h5) as hd5:
             print("Sending processed genotype to HDF5")
             chroms = sorted(bim.chrom.unique().astype(int))
             gr = bim.groupby('chrom')
             for chrom in chroms:
                 df = gr.get_group(str(chrom))
                 ch = g[:, df.i.values]
                 ch = ch.rechunk(estimate_chunks(ch.shape, threads,
                                                 memory=available_memory))
                 print('\tChromosome %s: %d individuals %d  variants' % (
                     chrom, ch.shape[0], ch.shape[1]))
                 hd5.create_dataset('/%s' % chrom,  data=ch.compute())
                 del ch
             del gr
     return g, h5, bim, fam #g, bim, fam
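The normalisation step near the end of read_geno (mean 0, sd 1 per variant) boils down to the following; a minimal sketch on a toy individuals x variants matrix, independent of the plink-specific plumbing:

import numpy as np
import dask.array as da

g = da.from_array(
    np.random.randint(0, 3, size=(200, 50)).astype(float), chunks=(100, 50))
col_mean = da.nanmean(g, axis=0)
col_std = da.nanstd(g, axis=0)
g_norm = (g - col_mean) / col_std          # mean 0, sd 1 per variant
print(da.nanmean(g_norm, axis=0).compute()[:3])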