def _check_array(self, X):
    """Validate ``X`` before fitting and log how long validation took.

    pandas DataFrames are reduced to their underlying values, dask
    DataFrames are rejected, 32/64-bit integer input is promoted to the
    float type of the same width, and NumPy results are wrapped into a
    dask array chunked along the first axis.

    Raises
    ------
    TypeError
        If ``X`` is a dask DataFrame.
    ValueError
        If the validated data contains a null or infinite value.
    """
    started = tic()

    if isinstance(X, dd.DataFrame):
        raise TypeError("Cannot fit on dask.dataframe due to unknown "
                        "partition lengths.")
    if isinstance(X, pd.DataFrame):
        X = X.values

    # Promote integer input to the float dtype of matching width.
    if X.dtype == 'int32':
        X = X.astype('float32')
    elif X.dtype == 'int64':
        X = X.astype('float64')

    X = check_array(
        X,
        accept_dask_dataframe=False,
        accept_unknown_chunks=False,
        accept_sparse=False,
    )

    if isinstance(X, np.ndarray):
        # Split the rows into cpu_count() pieces (at least one row each)
        # while keeping the last axis in a single chunk.
        rows_per_chunk = max(1, len(X) // cpu_count())
        X = da.from_array(X, chunks=(rows_per_chunk, X.shape[-1]))

    # Evaluate both reductions with a single compute() call.
    bad = (da.isnull(X).any(), da.isinf(X).any())
    (flags,) = compute(bad)
    if any(flags):
        raise ValueError("Input contains NaN, infinity or a value too "
                         "large for dtype('float64').")

    finished = tic()
    logger.info("Finished check_array in %0.2f s", finished - started)
    return X
def _check_array(self, X):
    """Validate ``X`` before fitting.

    pandas DataFrames are reduced to their values, dask DataFrames are
    converted with ``to_dask_array(lengths=True)``, 32/64-bit integer
    data is promoted to the float type of the same width, and NumPy
    results are wrapped into a dask array chunked along the first axis.

    Raises
    ------
    ValueError
        If the validated data contains a null or infinite value.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    elif isinstance(X, dd.DataFrame):
        X = X.to_dask_array(lengths=True)

    X = check_array(
        X,
        accept_dask_dataframe=False,
        accept_unknown_chunks=False,
        accept_sparse=False,
        remove_zero_chunks=True,
    )

    # Promote integer data to the float dtype of matching width.
    for int_name, float_name in (("int32", "float32"), ("int64", "float64")):
        if X.dtype == int_name:
            X = X.astype(float_name)
            break

    if isinstance(X, np.ndarray):
        # Split the rows into cpu_count() pieces (at least one row each)
        # while keeping the last axis in a single chunk.
        n_rows = max(1, len(X) // cpu_count())
        X = da.from_array(X, chunks=(n_rows, X.shape[-1]))

    # Evaluate both reductions with a single compute() call.
    bad = (da.isnull(X).any(), da.isinf(X).any())
    (found,) = compute(bad)
    if any(found):
        msg = ("Input contains NaN, infinity or a value too large for "
               "dtype('float64').")
        raise ValueError(msg)
    return X
def transform(self, X):
    """Fill missing entries of ``X`` with ``self.statistics_``.

    pandas / dask Series and DataFrames are filled through ``fillna``;
    dask arrays are filled lazily with ``da.where``; any other input is
    delegated to the parent class implementation.
    """
    frame_types = (pd.Series, pd.DataFrame, dd.Series, dd.DataFrame)
    if isinstance(X, frame_types):
        return X.fillna(self.statistics_)

    if isinstance(X, da.Array):
        missing = da.isnull(X)
        return da.where(missing, self.statistics_, X)

    return super(SimpleImputer, self).transform(X)
def transform(self, X):
    """Fill missing entries of ``X`` with ``self.statistics_``.

    * pandas / dask Series and DataFrames are cast to ``float64`` when the
      strategy is ``"mean"`` or ``"median"`` and then filled via
      ``fillna``.
    * dask arrays are cast to ``float64`` and filled lazily with
      ``da.where``.
    * anything else is delegated to the parent class implementation.

    The cast uses the concrete ``np.float64`` dtype. The abstract
    ``np.number`` scalar type is not a real dtype: NumPy deprecates
    converting it to one and currently resolves it to ``float64``, so the
    result is unchanged while the deprecated conversion is avoided.
    """
    if isinstance(X, (pd.Series, pd.DataFrame, dd.Series, dd.DataFrame)):
        if self.strategy in ("mean", "median"):
            X = X.astype(np.float64)
        return X.fillna(self.statistics_)
    elif isinstance(X, da.Array):
        # Cast once and reuse it for both the null mask and the values.
        # NOTE(review): unlike the DataFrame branch this cast is applied
        # for every strategy, as in the original code - confirm whether
        # non-numeric arrays are expected here.
        X = X.astype(np.float64)
        return da.where(da.isnull(X), self.statistics_, X)
    else:
        return super(SimpleImputer, self).transform(X)
def test_dtype_complex():
    """Check that ``_dtype`` of dask expressions matches the reference dtype.

    Covers arithmetic, transposition, slicing, dot products, stacking,
    concatenation, reductions, element-wise functions and field access
    on a structured array.
    """
    x = np.arange(24).reshape((4, 6)).astype('f4')
    y = np.arange(24).reshape((4, 6)).astype('i8')
    z = np.arange(24).reshape((4, 6)).astype('i2')

    a, b, c = (da.from_array(arr, chunks=(2, 3)) for arr in (x, y, z))

    def same_dtype(left, right):
        # Both sides must be real np.dtype instances with the same name.
        if not isinstance(left, np.dtype):
            return False
        if not isinstance(right, np.dtype):
            return False
        return str(left) == str(right)

    # Plain arrays
    assert same_dtype(a._dtype, x.dtype)
    assert same_dtype(b._dtype, y.dtype)

    # Arithmetic and structural operations
    assert same_dtype((a + 1)._dtype, (x + 1).dtype)
    assert same_dtype((a + b)._dtype, (x + y).dtype)
    assert same_dtype(a.T._dtype, x.T.dtype)
    assert same_dtype(a[:3]._dtype, x[:3].dtype)
    assert same_dtype((a.dot(b.T))._dtype, (x.dot(y.T)).dtype)

    assert same_dtype(stack([a, b])._dtype, np.vstack([x, y]).dtype)
    assert same_dtype(concatenate([a, b])._dtype,
                      np.concatenate([x, y]).dtype)

    # Reductions
    assert same_dtype(b.std()._dtype, y.std().dtype)
    assert same_dtype(c.sum()._dtype, z.sum().dtype)
    assert same_dtype(a.min()._dtype, a.min().dtype)
    assert same_dtype(b.std()._dtype, b.std().dtype)
    assert same_dtype(a.argmin(axis=0)._dtype, a.argmin(axis=0).dtype)

    # Element-wise functions
    assert same_dtype(da.sin(c)._dtype, np.sin(z).dtype)
    assert same_dtype(da.exp(b)._dtype, np.exp(y).dtype)
    assert same_dtype(da.floor(a)._dtype, np.floor(x).dtype)
    assert same_dtype(da.isnan(b)._dtype, np.isnan(y).dtype)

    with ignoring(ImportError):
        assert da.isnull(b)._dtype == 'bool'
        assert da.notnull(b)._dtype == 'bool'

    # Structured array: single field and multi-field selection
    rec = np.array([('a', 1)], dtype=[('text', 'S1'), ('numbers', 'i4')])
    d = da.from_array(rec, chunks=(1,))

    assert same_dtype(d['text']._dtype, rec['text'].dtype)
    assert same_dtype(d[['numbers', 'text']]._dtype,
                      rec[['numbers', 'text']].dtype)
def test_dtype_complex():
    """Verify ``_dtype`` tracking across a range of dask array operations.

    Each assertion compares the ``_dtype`` recorded on a dask expression
    with the dtype of a reference expression.
    """
    grid = (4, 6)
    x = np.arange(24).reshape(grid).astype('f4')
    y = np.arange(24).reshape(grid).astype('i8')
    z = np.arange(24).reshape(grid).astype('i2')

    block_shape = (2, 3)
    a = da.from_array(x, chunks=block_shape)
    b = da.from_array(y, chunks=block_shape)
    c = da.from_array(z, chunks=block_shape)

    def dtypes_match(first, second):
        # Require genuine np.dtype objects that print identically.
        both_dtypes = (isinstance(first, np.dtype)
                       and isinstance(second, np.dtype))
        return both_dtypes and str(first) == str(second)

    assert dtypes_match(a._dtype, x.dtype)
    assert dtypes_match(b._dtype, y.dtype)

    assert dtypes_match((a + 1)._dtype, (x + 1).dtype)
    assert dtypes_match((a + b)._dtype, (x + y).dtype)
    assert dtypes_match(a.T._dtype, x.T.dtype)
    assert dtypes_match(a[:3]._dtype, x[:3].dtype)
    assert dtypes_match((a.dot(b.T))._dtype, (x.dot(y.T)).dtype)

    assert dtypes_match(stack([a, b])._dtype, np.vstack([x, y]).dtype)
    assert dtypes_match(concatenate([a, b])._dtype,
                        np.concatenate([x, y]).dtype)

    assert dtypes_match(b.std()._dtype, y.std().dtype)
    assert dtypes_match(c.sum()._dtype, z.sum().dtype)
    assert dtypes_match(a.min()._dtype, a.min().dtype)
    assert dtypes_match(b.std()._dtype, b.std().dtype)
    assert dtypes_match(a.argmin(axis=0)._dtype, a.argmin(axis=0).dtype)

    assert dtypes_match(da.sin(c)._dtype, np.sin(z).dtype)
    assert dtypes_match(da.exp(b)._dtype, np.exp(y).dtype)
    assert dtypes_match(da.floor(a)._dtype, np.floor(x).dtype)
    assert dtypes_match(da.isnan(b)._dtype, np.isnan(y).dtype)

    # isnull / notnull are skipped when their import fails.
    with ignoring(ImportError):
        assert da.isnull(b)._dtype == 'bool'
        assert da.notnull(b)._dtype == 'bool'

    record_dtype = [('text', 'S1'), ('numbers', 'i4')]
    records = np.array([('a', 1)], dtype=record_dtype)
    d = da.from_array(records, chunks=(1, ))

    assert dtypes_match(d['text']._dtype, records['text'].dtype)
    assert dtypes_match(d[['numbers', 'text']]._dtype,
                        records[['numbers', 'text']].dtype)
def test_isnull():
    """``da.isnull`` / ``da.notnull`` agree with ``np.isnan`` on a float array."""
    values = np.array([1, np.nan])
    lazy = da.from_array(values, chunks=(2, ))

    nan_mask = np.isnan(values)
    # The checks are skipped when isnull/notnull raise ImportError.
    with ignoring(ImportError):
        assert_eq(da.isnull(lazy), nan_mask)
        assert_eq(da.notnull(lazy), ~nan_mask)
def __call__(self, datasets, optional_datasets=None, **info):
    """Build the RGB composite, ratio-sharpened when a high-res band is given.

    :param datasets: exactly three same-shaped datasets, used as the red,
                     green and blue inputs in that order
    :param optional_datasets: optional sequence whose first element is the
                              high resolution band; it must have the same
                              shape as ``datasets[0]``
    :param info: accepted for interface compatibility; the metadata passed
                 on is rebuilt from the datasets, ``self.attrs`` and the
                 attributes collected here
    :raises ValueError: if ``datasets`` does not hold exactly 3 items
    :raises IncompatibleAreas: if the dataset shapes differ

    When ``optional_datasets`` is supplied, the two bands other than
    ``self.high_resolution_band`` are multiplied by the ratio of the high
    resolution band to its low resolution counterpart. Any other value of
    ``self.high_resolution_band`` leaves the bands unsharpened.
    """
    if len(datasets) != 3:
        raise ValueError("Expected 3 datasets, got %d" % (len(datasets), ))
    if not all(x.shape == datasets[0].shape for x in datasets[1:]) or \
            (optional_datasets and
             optional_datasets[0].shape != datasets[0].shape):
        raise IncompatibleAreas('RatioSharpening requires datasets of '
                                'the same size. Must resample first.')

    def _sharpening_ratio(high, low):
        """Return high / low with unusable values replaced by 1."""
        ratio = high / low
        # Make the ratio a no-op (multiply by 1) where it is NaN, infinite
        # or negative. Both conditions must hold to keep a value: joining
        # them with `|` would let +inf and negative ratios through.
        return ratio.where(xu.isfinite(ratio) & (ratio >= 0), 1.)

    new_attrs = {}
    if optional_datasets:
        datasets = self.check_areas(datasets + optional_datasets)
        high_res = datasets[-1]
        p1, p2, p3 = datasets[:3]
        if 'rows_per_scan' in high_res.attrs:
            new_attrs.setdefault('rows_per_scan',
                                 high_res.attrs['rows_per_scan'])
        new_attrs.setdefault('resolution', high_res.attrs['resolution'])

        if self.high_resolution_band == "red":
            LOG.debug("Sharpening image with high resolution red band")
            ratio = _sharpening_ratio(high_res, p1)
            r = high_res
            g = p2 * ratio
            b = p3 * ratio
            # Restore the attributes of the bands that were multiplied.
            g.attrs = p2.attrs.copy()
            b.attrs = p3.attrs.copy()
        elif self.high_resolution_band == "green":
            LOG.debug("Sharpening image with high resolution green band")
            ratio = _sharpening_ratio(high_res, p2)
            r = p1 * ratio
            g = high_res
            b = p3 * ratio
            r.attrs = p1.attrs.copy()
            b.attrs = p3.attrs.copy()
        elif self.high_resolution_band == "blue":
            LOG.debug("Sharpening image with high resolution blue band")
            ratio = _sharpening_ratio(high_res, p3)
            r = p1 * ratio
            g = p2 * ratio
            b = high_res
            r.attrs = p1.attrs.copy()
            g.attrs = p2.attrs.copy()
        else:
            # no sharpening
            r = p1
            g = p2
            b = p3
    else:
        datasets = self.check_areas(datasets)
        r, g, b = datasets[:3]

    # combine the masks: a pixel is kept only if it is valid in all bands
    mask = ~(da.isnull(r.data) | da.isnull(g.data) | da.isnull(b.data))
    r = r.where(mask)
    g = g.where(mask)
    b = b.where(mask)

    # Collect information that is the same between the projectables
    # we want to use the metadata from the original datasets since the
    # new r, g, b arrays may have lost their metadata during calculations
    info = combine_metadata(*datasets)
    info.update(new_attrs)
    # Update that information with configured information (including name)
    info.update(self.attrs)
    # Force certain pieces of metadata that we *know* to be true
    info.setdefault("standard_name", "true_color")
    return super(RatioSharpenedRGB, self).__call__((r, g, b), **info)
def test_isnull_result_is_an_array():
    """Computing ``da.isnull`` on a single element yields an ``np.ndarray``."""
    # regression test for https://github.com/dask/dask/issues/3822
    source = np.arange(3, dtype=np.int64)
    arr = da.from_array(source, chunks=-1)
    with ignoring(ImportError):
        first_is_null = da.isnull(arr[0])
        result = first_is_null.compute()
        # Exact type check on purpose: the result must be an ndarray itself.
        assert type(result) is np.ndarray
def test_isnull():
    """Compare ``da.isnull`` and ``da.notnull`` against ``np.isnan``."""
    sample = np.array([1, np.nan])
    wrapped = da.from_array(sample, chunks=(2,))

    # The checks are skipped when isnull/notnull raise ImportError.
    with ignoring(ImportError):
        null_flags = da.isnull(wrapped)
        assert_eq(null_flags, np.isnan(sample))

        valid_flags = da.notnull(wrapped)
        assert_eq(valid_flags, ~np.isnan(sample))
def run_crefl(refl, coeffs, lon, lat,
              sensor_azimuth, sensor_zenith,
              solar_azimuth, solar_zenith,
              avg_elevation=None, percent=False, use_abi=False):
    """Run the main crefl (corrected reflectance) algorithm.

    The original documentation states that all inputs are per-pixel values
    with the same size and shape as the reflectance data unless noted
    otherwise.

    :param refl: input reflectance data
    :param coeffs: per-band coefficients, unpacked into the atmospheric
                   variable helper (see `get_coefficients`)
    :param lon: input swath longitude array (modified in place when
                ``avg_elevation`` is given: out-of-range values become NaN)
    :param lat: input swath latitude array (modified in place when
                ``avg_elevation`` is given: out-of-range values become NaN)
    :param sensor_azimuth: input swath sensor azimuth angle array
    :param sensor_zenith: input swath sensor zenith angle array
    :param solar_azimuth: input swath solar azimuth angle array
    :param solar_zenith: input swath solar zenith angle array
    :param avg_elevation: average elevation (usually pre-calculated and
                          stored in CMGDEM.hdf); a height of 0 is used
                          when omitted
    :param percent: True if input reflectances are on a 0-100 scale
                    instead of 0-1 scale (default: False)
    :param use_abi: True to use the ABI variant of the algorithm instead
                    of the original VIIRS one (default: False)
    :return: corrected reflectance clipped to ``[REFLMIN, REFLMAX]``
    """
    # FUTURE: Find a way to compute the average elevation before hand
    # Get digital elevation map data for our granule, set ocean fill value to 0
    if avg_elevation is not None:
        LOG.debug("Using average elevation information provided to CREFL")
        # Invalidate coordinates on or outside the valid lat/lon range.
        lat[(lat <= -90) | (lat >= 90)] = np.nan
        lon[(lon <= -180) | (lon >= 180)] = np.nan

        # Map each pixel onto a (row, col) cell of the elevation grid.
        n_rows, n_cols = avg_elevation.shape[0], avg_elevation.shape[1]
        row = ((90.0 - lat) * n_rows / 180.0).astype(np.int32)
        col = ((lon + 180.0) * n_cols / 360.0).astype(np.int32)

        # Pixels without a valid location are pointed at cell (0, 0) and
        # forced to height 0 below.
        space_mask = da.isnull(lon) | da.isnull(lat)
        row[space_mask] = 0
        col[space_mask] = 0

        def _avg_elevation_index(elevation, rows, cols):
            return elevation[rows, cols]

        height = da.map_blocks(_avg_elevation_index, avg_elevation,
                               row, col, dtype=avg_elevation.dtype)
        height = xr.DataArray(height, dims=['y', 'x'])
        # negative heights aren't allowed, clip to 0
        valid_height = (height >= 0.) & ~space_mask
        height = height.where(valid_height, 0.0)
        del lat, lon, row, col
    else:
        LOG.debug("No average elevation information provided in CREFL")
        height = 0.

    # Cosines of the solar and sensor zenith angles; negative solar
    # cosines are masked out.
    mus = da.cos(da.deg2rad(solar_zenith))
    mus = mus.where(mus >= 0)
    muv = da.cos(da.deg2rad(sensor_zenith))
    phi = solar_azimuth - sensor_azimuth

    if not use_abi:
        LOG.debug("Using original VIIRS CREFL algorithm")
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables(
            mus, muv, phi, height, *coeffs)
    else:
        LOG.debug("Using ABI CREFL algorithm")
        a_O3 = [268.45, 0.5, 115.42, -3.2922]
        a_H2O = [0.0311, 0.1, 92.471, -1.3814]
        a_O2 = [0.4567, 0.007, 96.4884, -1.6970]
        G_O3 = G_calc(solar_zenith, a_O3) + G_calc(sensor_zenith, a_O3)
        G_H2O = G_calc(solar_zenith, a_H2O) + G_calc(sensor_zenith, a_H2O)
        G_O2 = G_calc(solar_zenith, a_O2) + G_calc(sensor_zenith, a_O2)
        # Note: bh2o values are actually ao2 values for abi
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables_abi(
            mus, muv, phi, height, G_O3, G_H2O, G_O2, *coeffs)
    del solar_azimuth, solar_zenith, sensor_zenith, sensor_azimuth

    # Note: Assume that fill/invalid values are either NaN or we are dealing
    # with masked arrays
    unit_refl = (refl / 100.) if percent else refl
    corr_refl = (unit_refl / tOG - rhoray) / TtotraytH2O
    corr_refl /= (1.0 + corr_refl * sphalb)
    return corr_refl.clip(REFLMIN, REFLMAX)
def _calc_idxminmax(
    *,
    array,
    func: Callable,
    dim: Hashable = None,
    skipna: bool = None,
    fill_value: Any = dtypes.NA,
    keep_attrs: bool = None,
):
    """Apply common operations for idxmin and idxmax.

    Parameters
    ----------
    array
        Labelled array to search; must expose ``ndim``, ``dims``,
        ``coords``, ``dtype`` and ``data``.
    func
        Callable running argmin or argmax; called as
        ``func(array, dim=..., axis=None, keep_attrs=..., skipna=...)``.
    dim
        Dimension to reduce over. May be omitted only for 1-D arrays.
    skipna
        If True, or None with a complex/float/object dtype, slices that
        are entirely null are zero-filled before the search and set to
        ``fill_value`` in the result.
    fill_value
        Value written into the result for all-null slices.
    keep_attrs
        Forwarded unchanged to ``func``.

    Returns
    -------
    The coordinate labels along ``dim`` at the positions found by
    ``func``, carrying the attributes of the ``func`` result.

    Raises
    ------
    ValueError
        If ``array`` is a scalar, or is multidimensional and ``dim`` is
        not given.
    KeyError
        If ``dim`` is not a dimension of ``array`` or has no coordinate.
    """
    # This function doesn't make sense for scalars so don't try
    if not array.ndim:
        raise ValueError("This function does not apply for scalars")

    if dim is None:
        if array.ndim != 1:
            # The dim is not specified and ambiguous.  Don't guess.
            raise ValueError(
                "Must supply 'dim' argument for multidimensional arrays")
        # it is okay to guess the dim if there is only 1
        dim = array.dims[0]

    if dim not in array.dims:
        raise KeyError(f'Dimension "{dim}" not in dimension')
    if dim not in array.coords:
        raise KeyError(f'Dimension "{dim}" does not have coordinates')

    # These are dtypes with NaN values argmin and argmax can handle
    na_dtypes = "cfO"

    # Decide once whether all-null slices need special handling, so the
    # masking below and the restoration further down always agree.
    handle_allna = skipna or (skipna is None and array.dtype.kind in na_dtypes)

    if handle_allna:
        # Need to skip NaN values since argmin and argmax can't handle them
        allna = array.isnull().all(dim)
        array = array.where(~allna, 0)

    # This will run argmin or argmax.
    indx = func(array, dim=dim, axis=None, keep_attrs=keep_attrs, skipna=skipna)

    # Handle dask arrays. The check must look at the wrapped data: the
    # labelled wrapper itself is never a dask array, so testing `array`
    # directly left this branch unreachable.
    if isinstance(array.data, dask_array_type):
        # Imported here because dask is only guaranteed to be installed
        # when the data actually is a dask array.
        import dask.array

        chunks = dict(zip(array.dims, array.chunks))
        dask_coord = dask.array.from_array(array[dim].data, chunks=chunks[dim])
        # Index the coordinate lazily with the flattened positions, then
        # restore the shape of the argmin/argmax result.
        res = indx.copy(data=dask_coord[indx.data.ravel()].reshape(indx.shape))
        # we need to attach back the dim name
        res.name = dim
    else:
        res = array[dim][(indx,)]
        # The dim is gone but we need to remove the corresponding coordinate.
        del res.coords[dim]

    if handle_allna:
        # Put the NaN values back in after removing them
        res = res.where(~allna, fill_value)

    # Copy attributes from argmin/argmax, if any
    res.attrs = indx.attrs

    return res