def two_bands_operation(method, X, y=None, sample_weight=None, spec=None, **kwargs):
    '''Add new bands to X, each computed from a pair of existing bands

    Parameters:
        :method: one of "normed_diff", "diff", "sum" or "ratio"
        :X: ElmStore or xarray.Dataset with a "band_order" attribute
            and each band available as an attribute
        :y: passed through
        :sample_weight: passed through
        :spec: dict mapping the new band name to a pair
               (band1 name, band2 name), e.g.
               {"ndvi": ["band_4", "band_3"]}
        :kwargs: ignored
    Returns:
        :(Xnew, y, sample_weight): where Xnew is an ElmStore holding
            the original bands plus the new ones (new band names are
            appended to band_order in sorted key order)
    Raises:
        :ValueError: if spec is empty / missing or method is unknown
    '''
    operations = {
        'normed_diff': lambda b1, b2: (b1 - b2) / (b1 + b2),
        'diff': lambda b1, b2: b1 - b2,
        'sum': lambda b1, b2: b1 + b2,
        'ratio': lambda b1, b2: b1 / b2,
    }
    # Validate the arguments before doing any work, so a bad call fails
    # with a clear message instead of a NameError inside the loop.
    if not spec:
        raise ValueError(
            'Expected "spec" in kwargs, e.g. {"ndvi": ["band_4", "band_3"]}')
    if method not in operations:
        raise ValueError(
            'Expected "method" to be one of {} but got {!r}'.format(
                sorted(operations), method))
    from elm.readers import ElmStore
    operation = operations[method]
    bands = X.band_order.copy()
    es = {}
    for key, (b1, b2) in sorted(spec.items()):
        band1 = getattr(X, b1)
        band2 = getattr(X, b2)
        new = operation(band1, band2)
        # The derived band carries the metadata of its first operand
        new.attrs.update(band1.attrs)
        es[key] = new
        bands.append(key)
    Xnew = ElmStore(xr.merge([ElmStore(es, add_canvas=False), X]),
                    add_canvas=False)
    Xnew.attrs.update(X.attrs.copy())
    Xnew.attrs['band_order'] = bands
    return (Xnew, y, sample_weight)
def inverse_flatten(flat, add_canvas=False, **attrs):
    '''Rebuild per-band rasters from a flattened ElmStore

    The 2-d "flat" DataArray (space, band) is split column by column
    and each column is reshaped to the dims that band had before it
    was flattened.  This requires that the metadata about the original
    dims / coords was preserved when the flat ElmStore was created.

    Parameters:
        :flat: a 2-d ElmStore (space, band)
        :add_canvas: passed to the ElmStore constructor
        :attrs: attribute dict to update the dict
                of the returned ElmStore
    Returns:
        :es: ElmStore with one DataArray per band
    '''
    flat = filled_flattened(flat)
    merged_attrs = copy.deepcopy(flat.attrs)
    merged_attrs.update(copy.deepcopy(attrs))
    band_and_dims = zip(flat.flat.band_order, flat.old_dims)
    # Prefer the coordinates described by the canvas, fall back on the
    # coordinates stored when the data were flattened
    if 'canvas' in merged_attrs:
        coords = canvas_to_coords(merged_attrs['canvas'])
    else:
        coords = merged_attrs['old_coords']
    rebuilt = OrderedDict()
    for col, (band_name, band_dims) in enumerate(band_and_dims):
        if col >= flat.flat.values.shape[1]:
            # more band names than columns: nothing left to unpack
            break
        column = flat.flat.values[:, col]
        target_shape = tuple(coords[d].size for d in band_dims)
        rebuilt[band_name] = xr.DataArray(
            column.reshape(target_shape, order='C'),
            coords=coords,
            dims=band_dims,
            attrs=merged_attrs)
    return ElmStore(rebuilt, attrs=merged_attrs, add_canvas=add_canvas)
def filled_flattened(na_dropped):
    '''Used by inverse_flatten to fill areas that were dropped
    out of X due to NA/NaN

    Parameters:
        :na_dropped: flat ElmStore, typically the output of drop_na_rows
    Returns:
        :filled_es: flat ElmStore with the pre-drop number of rows, where
            the rows that had been dropped are NaN.  If na_dropped has
            no (or an empty) "shape_before_drop_na_rows" attribute it is
            returned unchanged.
    '''
    shp = getattr(na_dropped, 'shape_before_drop_na_rows', None)
    if not shp:
        return na_dropped
    shp = (shp[0], len(na_dropped.band_order))
    # np.full with np.nan instead of "np.empty(shp) * np.NaN": the np.NaN
    # alias no longer exists in NumPy >= 2.0
    filled = np.full(shp, np.nan)
    # na_dropped.space holds the original row index of each kept row
    filled[na_dropped.space, :] = na_dropped.flat.values
    attrs = copy.deepcopy(na_dropped.attrs)
    attrs.update(copy.deepcopy(na_dropped.flat.attrs))
    attrs.pop('shape_before_drop_na_rows', None)
    attrs['notnull_shape'] = na_dropped.flat.values.shape
    band = attrs['band_order']
    filled_es = ElmStore(
        {
            'flat': xr.DataArray(filled,
                                 coords=[('space', np.arange(shp[0])),
                                         ('band', band)],
                                 dims=('space', 'band'),
                                 attrs=attrs)
        },
        attrs=attrs)
    return filled_es
def _fit_trans(self, method, X, y=None, sample_weight=None, **kwargs):
    '''Call the estimator method named by "method" on the flat values of X

    Only the keyword arguments (among y, sample_weight and kwargs) that
    are listed in self._params are forwarded.

    Returns:
        the output of the estimator method, unless "transform" is part
        of the method name, in which case the output is wrapped in a
        new flat ElmStore and (Xnew, y, sample_weight) is returned
    '''
    fitter_func = getattr(self._estimator, method)
    candidates = dict(y=y, sample_weight=sample_weight, **kwargs)
    kw = {}
    for name, value in candidates.items():
        if name in self._params:
            kw[name] = value
    if not isinstance(X, (ElmStore, xr.Dataset)):
        raise ValueError(
            'Expected X to be an xarray.Dataset or elm.readers.ElmStore')
    if not hasattr(X, 'flat'):
        raise ValueError(
            "Call elm.pipeline.steps.Flatten() before Transform in "
            "pipeline or otherwise use X as an (elm.readers.ElmStore "
            "or xarray.Dataset)"
        )
    flat_values = X.flat.values
    space = X.flat.space
    out = fitter_func(flat_values, **kw)
    if 'transform' not in method:
        return out  # a fitted "self"
    # 'transform' or 'fit_transform' was called
    out = np.atleast_2d(out)
    band = ['transform_{}'.format(idx) for idx in range(out.shape[1])]
    attrs = copy.deepcopy(X.attrs)
    attrs.update(X.flat.attrs)
    attrs['band_order'] = band
    transformed = xr.DataArray(out,
                               coords=[('space', space), ('band', band)],
                               dims=X.flat.dims,
                               attrs=attrs)
    Xnew = ElmStore({'flat': transformed}, attrs=attrs)
    return (Xnew, y, sample_weight)
def flatten(es, ravel_order='C'):
    '''Given an ElmStore with different rasters (DataArray) as bands,
    flatten the rasters into a single 2-D DataArray called "flat"
    in a new ElmStore.

    Parameters:
        :es: ElmStore with one DataArray per band, all sharing one canvas
        :ravel_order: order passed to numpy ravel for each band
    Returns:
        :flat: 2-d ElmStore (space, band).  If es is already flat it
               is returned unchanged.
    Raises:
        :ValueError: if the bands do not all have the same Canvas
    '''
    if check_is_flat(es, raise_err=False):
        return es
    shared_canvas = get_shared_canvas(es)
    if not shared_canvas:
        raise ValueError(
            'es.select_canvas should be called before flatten when, as in this case, the bands do not all have the same Canvas'
        )
    store = None
    band_names = list(es.band_order)
    old_canvases = []
    old_dims = []
    for idx, band in enumerate(band_names):
        data_arr = getattr(es, band, None)
        canvas = getattr(data_arr, 'canvas', None)
        old_canvases.append(canvas)
        old_dims.append(data_arr.dims)
        if store is None:
            # TODO consider canvas here instead
            # of assume fixed size, but that
            # makes reverse transform harder (is that important?)
            # np.full with np.nan rather than "np.empty(...) * np.NaN":
            # the np.NaN alias no longer exists in NumPy >= 2.0
            store = np.full(
                (data_arr.values.size, len(es.data_vars)), np.nan)
        if data_arr.values.ndim == 1:
            # its already flat
            new_values = data_arr.values
        else:
            new_values = data_arr.values.ravel(order=ravel_order)
        store[:, idx] = new_values
    # Metadata needed by inverse_flatten to undo this operation; the
    # attrs of es take precedence over the keys set here
    attrs = {}
    attrs['canvas'] = shared_canvas
    attrs['old_canvases'] = old_canvases
    attrs['old_dims'] = old_dims
    attrs['flatten_data_array'] = True
    attrs.update(copy.deepcopy(es.attrs))
    flat = ElmStore(
        {
            'flat': xr.DataArray(store,
                                 coords=[('space', np.arange(store.shape[0])),
                                         ('band', band_names)],
                                 dims=('space', 'band'),
                                 attrs=attrs)
        },
        attrs=attrs)
    return flat
def load_hdf5_array(datafile, meta, band_specs=None):
    '''Return an ElmStore where each subdataset is a DataArray

    Parameters:
        :datafile: filename
        :meta:     meta from elm.readers.load_hdf5_meta
        :band_specs: list of elm.readers.BandSpec objects,
                    defaulting to reading all subdatasets
                    as bands (named band_0, band_1, ...)
    Returns:
        :es: An ElmStore
    Raises:
        :ValueError: if the number of matched bands differs from the
                     number of band_specs, or no subdataset is available
    '''
    logger.debug('load_hdf5_array: {}'.format(datafile))
    # NOTE(review): the handle is not used below; kept because opening
    # the file may be relied on as an early check - confirm.
    f = gdal.Open(datafile, GA_ReadOnly)
    sds = meta['sub_datasets']
    band_metas = meta['band_meta']
    band_order_info = []
    for band_idx, (band_meta, sd) in enumerate(zip(band_metas, sds)):
        if band_specs:
            # keep the first band_spec that matches this subdataset and
            # remember its position to restore the band_specs order
            for idx, bs in enumerate(band_specs):
                if match_meta(band_meta, bs):
                    band_order_info.append((idx, band_meta, sd, bs))
                    break
        else:
            band_order_info.append((band_idx, band_meta, sd,
                                    'band_{}'.format(band_idx)))

    if band_specs and len(band_order_info) != len(band_specs):
        raise ValueError('Number of bands matching band_specs {} was not equal '
                         'to the number of band_specs {}'.format(
                             len(band_order_info), len(band_specs)))
    if not band_order_info:
        # Without this check the code below fails with an
        # UnboundLocalError on "attrs"
        raise ValueError(
            'No sub datasets were found in {}'.format(datafile))

    band_order_info.sort(key=lambda x: x[0])
    elm_store_data = OrderedDict()
    band_order = []
    for _, band_meta, sd, band_spec in band_order_info:
        if isinstance(band_spec, BandSpec):
            name = band_spec.name
            reader_kwargs = {k: getattr(band_spec, k)
                             for k in READ_ARRAY_KWARGS
                             if getattr(band_spec, k)}
        else:
            reader_kwargs = {}
            name = band_spec
        reader_kwargs = window_to_gdal_read_kwargs(**reader_kwargs)
        attrs = copy.deepcopy(meta)
        attrs.update(copy.deepcopy(band_meta))
        elm_store_data[name] = load_subdataset(sd[0], attrs, band_spec,
                                               **reader_kwargs)
        band_order.append(name)
    # The ElmStore attrs are those of the last band plus the band order
    attrs = copy.deepcopy(attrs)
    attrs['band_order'] = band_order
    gc.collect()
    return ElmStore(elm_store_data, attrs=attrs)
def drop_na_rows(flat):
    '''Return a flat ElmStore without the rows of "flat" that hold NA values

    The number of dropped rows ("drop_na_rows") and the shape before
    dropping ("shape_before_drop_na_rows") are recorded in the attrs.
    '''
    check_is_flat(flat)
    kept = flat.flat.dropna(dim='space')
    kept.attrs.update(flat.attrs)
    rows_before = flat.flat.values.shape[0]
    kept.attrs['drop_na_rows'] = rows_before - kept.shape[0]
    merged_attrs = copy.deepcopy(flat.attrs)
    merged_attrs.update(kept.attrs)
    merged_attrs['shape_before_drop_na_rows'] = flat.flat.values.shape
    return ElmStore({'flat': kept}, attrs=merged_attrs)
def _predict_one_sample_one_arg(estimator, serialize, to_raster, predict_tag, elm_predict_path, X_y_sample_weight):
    '''Predict for one sample and optionally rasterize / serialize the result

    Parameters:
        :estimator: object whose predict(X, return_X=True) returns
                    (prediction, X_final)
        :serialize: falsy, or a callable called as
                    serialize(y=..., X=..., tag=..., elm_predict_path=...)
        :to_raster: if truthy, the prediction is passed through
                    inverse_flatten
        :predict_tag: passed to serialize as "tag"
        :elm_predict_path: passed to serialize
        :X_y_sample_weight: tuple of (X, y, sample_weight), where X is
                    an ElmStore or xarray.Dataset
    Returns:
        :out: list with one item: the prediction ElmStore, or the return
              value of serialize when serialize is given
    Raises:
        :ValueError: if X is of the wrong type or the prediction is not
                     1-d or 2-d
    '''
    # local import so the block does not depend on the file-level imports
    import copy
    X, y, sample_weight = X_y_sample_weight
    if not isinstance(X, (ElmStore, xr.Dataset)):
        raise ValueError('Expected an ElmStore or xarray.Dataset')
    out = []
    prediction, X_final = estimator.predict(X, return_X=True)
    if prediction.ndim == 1:
        # make the prediction a single column
        prediction = prediction[:, np.newaxis]
    elif prediction.ndim != 2:
        raise ValueError(
            'Expected 1- or 2-d output of model.predict but found ndim of prediction: {}'
            .format(prediction.ndim))
    bands = ['predict']
    # Work on a copy: updating X_final.attrs in place would overwrite
    # the band_order of X_final, which is still handed to serialize below
    attrs = copy.deepcopy(X_final.attrs)
    attrs.update(copy.deepcopy(X_final.flat.attrs))
    attrs['elm_predict_date'] = datetime.datetime.utcnow().isoformat()
    attrs['band_order'] = [
        'predict',
    ]
    logger.debug('Predict X shape {} X.flat.dims {} '
                 '- y shape {}'.format(X_final.flat.shape, X_final.flat.dims,
                                       prediction.shape))
    prediction = ElmStore(
        {
            'flat': xr.DataArray(prediction,
                                 coords=[('space', X_final.flat.space),
                                         ('band', bands)],
                                 dims=('space', 'band'),
                                 attrs=attrs)
        },
        attrs=attrs)
    if to_raster:
        new_es = inverse_flatten(prediction)
    else:
        new_es = prediction
    if serialize:
        new_es = serialize(y=new_es,
                           X=X_final,
                           tag=predict_tag,
                           elm_predict_path=elm_predict_path)
    out.append(new_es)
    return out
def ts_describe(X, y=None, sample_weight=None, **kwargs):
    '''Summary statistics along the time axis of a 3-D DataArray in X

    For every position of the two non-time axes, scipy's describe plus
    the median, standard deviation and a non-parametric skew are
    computed on the time series at that position.

    Parameters:
        X:  ElmStore or xarray.Dataset
        y:  passed through
        sample_weight: passed through
        kwargs: Keywords:
            axis: Integer like 0, 1, 2 to indicate which is the time axis of cube
            band: The name of the DataArray in ElmStore to describe
    Returns:
        (X_new, y, sample_weight): X_new is an ElmStore with a DataArray
            "flat" that has one row per position and the columns
            var, skew, kurt, min, max, median, std, np_skew
    '''
    band = kwargs['band']
    logger.debug('Start scipy_describe band: {}'.format(band))
    band_arr = getattr(X, band)
    cols = ('var', 'skew', 'kurt', 'min', 'max', 'median', 'std', 'np_skew')
    time_axis = kwargs['axis']
    # _ij_for_axis returns integers for the axes that are iterated over;
    # keep only the sizes of those axes
    probe = _ij_for_axis(time_axis, 0, 0)
    shp = tuple(size
                for pos, size in enumerate(band_arr.values.shape)
                if isinstance(probe[pos], int))
    num_rows = np.prod(shp)
    new_arr = np.empty((num_rows, len(cols)))
    for row, (i, j) in enumerate(product(*map(range, shp))):
        ind1, ind2, ind3 = _ij_for_axis(time_axis, i, j)
        series = band_arr.values[ind1, ind2, ind3]
        stats = describe(series)
        median = np.median(series)
        std = np.std(series)
        new_arr[row, :] = (
            stats.variance,
            stats.skewness,
            stats.kurtosis,
            stats.minmax[0],
            stats.minmax[1],
            median,
            std,
            (stats.mean - median) / std,
        )
    attrs = copy.deepcopy(X.attrs)
    attrs.update(kwargs)
    flat_coords = [('space', np.arange(num_rows)), ('band', np.array(cols))]
    da = xr.DataArray(new_arr,
                      coords=flat_coords,
                      dims=('space', 'band'),
                      attrs=attrs)
    X_new = ElmStore({'flat': da}, attrs=attrs, add_canvas=False)
    return (X_new, y, sample_weight)
def aggregate_simple(es, **kwargs):
    '''Aggregate every band of an ElmStore - elm.pipeline.steps.Agg

    Parameters:
        :es: ElmStore
        :kwargs: Keywords may contain
            - :func: aggregation func name like "mean", "std"
            - :dim: dimension name
            - :axis: dimension integer
          All keywords except "func" are passed to the aggregation
          method of each DataArray.
    Returns:
        :ElmStore: aggregated
    Raises:
        :ValueError: for an unknown func, if not exactly one of
            dim / axis is given, or if the aggregated axis is not the
            same for all bands
    '''
    func = kwargs['func']
    if func not in AGG_METHODS:
        raise ValueError(
            'Expected an agg "func" among: {}'.format(AGG_METHODS))
    agg_kwargs = {}
    for key, value in kwargs.items():
        if key != 'func':
            agg_kwargs[key] = value
    dim = kwargs.get('dim')
    axis = kwargs.get('axis')
    has_axis = isinstance(axis, int)
    # both given or neither given
    if has_axis == bool(dim):
        raise ValueError(
            'kwargs given to aggregate_simple must include *one* of "dim" or "axis"'
        )
    agged = OrderedDict()
    lost_axes = []
    for band in es.data_vars:
        data_arr = getattr(es, band)
        if dim:
            lost_axes.append(data_arr.dims.index(dim))
        else:
            lost_axes.append(axis)
        aggregator = getattr(data_arr, func)
        agged[band] = aggregator(**agg_kwargs)
    if len(set(lost_axes)) != 1:
        raise ValueError(
            'Cannot aggregate when the axis (dim) of aggregation is not the same for all DataArrays in ElmStore'
        )
    return ElmStore(agged,
                    attrs=es.attrs,
                    add_canvas=False,
                    lost_axis=lost_axes[0])
def transpose(es, new_dims):
    '''Transpose every band of an ElmStore - elm.pipeline.steps.Transpose

    Parameters:
        :es: ElmStore
        :new_dims: passed to xarray.DataArray.transpose
    Returns:
        :ElmStore transposed, where the canvas of each band has its
         "dims" replaced by new_dims
    Raises:
        :ValueError: if new_dims are not all existing dims of a band
    '''
    trans = OrderedDict()
    for band in es.data_vars:
        data_arr = getattr(es, band)
        shared_dims = set(new_dims) & set(data_arr.dims)
        if len(shared_dims) != len(new_dims):
            raise ValueError(
                'At least one of new_dims is not an existing dim (new_dims {}, existing {})'
                .format(new_dims, data_arr.dims))
        flipped = data_arr.transpose(*new_dims)
        # rebuild the canvas with the new dimension order
        canvas_fields = attr.asdict(flipped.canvas)
        canvas_fields['dims'] = new_dims
        flipped.attrs['canvas'] = Canvas(**canvas_fields)
        trans[band] = flipped
    return ElmStore(trans, attrs=es.attrs)
def select_canvas(es, new_canvas):
    '''reindex_like new_canvas for every band (DataArray) in ElmStore

    Parameters:
        :es: ElmStore
        :new_canvas: an elm.readers.Canvas object
    Returns:
        :es: ElmStore where every band (DataArray) has the same
            coordinates - those of new_canvas
    Raises:
        :ValueError: if es has a dummy canvas (geo transform was not
            read from the input data) or a dim of new_canvas is not a
            dim of a band's canvas
    '''
    if getattr(es, '_dummy_canvas', False):
        raise ValueError(
            'This ElmStore cannot be run through select_canvas because geo transform was not read correctly from input data'
        )
    es_new_dict = OrderedDict()
    for band in es.data_vars:
        data_arr = getattr(es, band)
        if data_arr.canvas == new_canvas:
            # already on the requested canvas
            es_new_dict[band] = data_arr
            continue
        new_coords = canvas_to_coords(new_canvas)
        old_dims = data_arr.canvas.dims
        missing = [nd for nd in new_canvas.dims if nd not in old_dims]
        if missing:
            raise ValueError(
                'Dims {} of new_canvas are not dims of band {} '
                '(existing dims {})'.format(missing, band, old_dims))
        # nearest-neighbor lookup of the band on the new coordinates
        index_to_make = xr.Dataset(new_coords)
        es_new_dict[band] = data_arr.reindex_like(index_to_make,
                                                  method='nearest')
    attrs = copy.deepcopy(es.attrs)
    attrs['canvas'] = new_canvas
    es_new = ElmStore(es_new_dict, attrs=attrs)
    return es_new
def load_netcdf_array(datafile, meta, band_specs=None):
    '''Load variables of a NetCDF file into an ElmStore

    Parameters:
        :datafile: str: Path on disk to NetCDF file
        :meta: dict: netcdf metadata object; its "band_meta" entries
               and "geo_transform" are updated in place
        :band_specs: dict<str:BandSpec or str> or list / tuple of
               BandSpec or str naming the variables to load,
               defaulting to all of meta['variables']
    Returns:
        :new_es: ElmStore xarray.Dataset
    Raises:
        :ValueError: if band_specs is given but is not a dict,
                     list or tuple
    '''
    logger.debug('load_netcdf_array: {}'.format(datafile))
    if band_specs and not isinstance(band_specs, (dict, list, tuple)):
        # Previously this case failed later with a NameError
        raise ValueError('Expected band_specs to be a dict, list or tuple '
                         'but found {}'.format(type(band_specs)))
    ds = xr.open_dataset(datafile)
    if band_specs:
        if isinstance(band_specs, dict):
            data = OrderedDict((k, ds[getattr(v, 'name', v)])
                               for k, v in band_specs.items())
            band_spec = tuple(band_specs.values())[0]
        else:
            names = [getattr(v, 'name', v) for v in band_specs]
            data = OrderedDict((name, ds[name]) for name in names)
            band_spec = band_specs[0]
    else:
        data = OrderedDict([(v, ds[v]) for v in meta['variables']])
        band_spec = None
    # The first band_spec (if any) is the one consulted for the
    # geo transform
    geo_transform = take_geo_transform_from_meta(band_spec=band_spec,
                                                 required=True,
                                                 **meta['meta'])
    for b, sub_dataset_name in zip(meta['band_meta'], data):
        b['geo_transform'] = meta['geo_transform'] = geo_transform
        b['sub_dataset_name'] = sub_dataset_name
    new_es = ElmStore(data,
                      coords=_normalize_coords(ds),
                      attrs=meta)
    return new_es
def ts_probs(X, y=None, sample_weight=None, **kwargs):
    '''Fixed or unevenly spaced histogram binning for
    the time dimension of a 3-D cube DataArray in X

    Parameters:
        X: ElmStore or xarray.Dataset
        y: passed through
        sample_weight: passed through
        kwargs: Keywords:
            axis: Integer like 0, 1, 2 to indicate which is the time axis of cube
            band: The name of DataArray to time series bin (required)
            bin_size: Size of the fixed bin or None to use np.histogram (irregular bins)
            num_bins: How many bins
            log_probs: Return log10 of the probabilities? True / False
    Returns:
        (X_new, y, sample_weight): X_new is an ElmStore with a DataArray
            called flat that has one row per position of the non-time
            axes and num_bins columns holding, for the time series at
            that position:
            * log10 of the bin probabilities (if kwargs["log_probs"]) or
            * the bin probabilities (counts divided by the total count)
    '''
    band = kwargs['band']
    band_arr = getattr(X, band)
    num_bins = kwargs['num_bins']
    bin_size = kwargs.get('bin_size', None)
    log_probs = kwargs.get('log_probs', None)
    axis = kwargs['axis']
    if bin_size is not None:
        bins = np.linspace(-bin_size * num_bins // 2,
                           bin_size * num_bins // 2,
                           num_bins)
    values = band_arr.values
    # Sizes of the axes that are iterated over (the non-time axes);
    # _ij_for_axis returns integers for exactly those axes
    inds = _ij_for_axis(axis, 0, 0)
    shp = tuple(s for idx, s in enumerate(values.shape)
                if isinstance(inds[idx], int))
    # Derived from shp rather than values.shape[1:], which is only
    # right when the time axis is axis 0
    num_rows = int(np.prod(shp))
    col_count = num_bins
    new_arr = np.empty((num_rows, col_count), dtype=np.float64)
    logger.info("Histogramming...")
    small = 1e-8
    for row, (i, j) in enumerate(product(*(range(s) for s in shp))):
        ind1, ind2, ind3 = _ij_for_axis(axis, i, j)
        values_slc = values[ind1, ind2, ind3]
        if bin_size is not None:
            indices = np.searchsorted(bins, values_slc, side='left')
            # searchsorted returns num_bins for values above the last
            # edge; count those in the last bin so that a row never has
            # more than num_bins entries
            indices = np.minimum(indices, num_bins - 1)
            counts = np.bincount(indices, minlength=num_bins)
        else:
            counts, _ = np.histogram(values_slc, num_bins)
        # float copy: writing "small" into an integer array would
        # truncate it back to zero
        counts = counts.astype(np.float64)
        if log_probs:
            # add small to avoid log zero
            counts[counts == 0] = small
        probs = counts / counts.sum()
        if log_probs:
            probs = np.log10(probs)
        new_arr[row, :] = probs
    gc.collect()
    attrs = copy.deepcopy(X.attrs)
    attrs.update(kwargs)
    da = xr.DataArray(new_arr,
                      coords=[('space', np.arange(num_rows)),
                              ('band', np.arange(col_count))],
                      dims=('space', 'band'),
                      attrs=attrs)
    X_new = ElmStore({'flat': da}, attrs=attrs, add_canvas=False)
    return (X_new, y, sample_weight)
def load_dir_of_tifs_array(dir_of_tiffs, meta, band_specs=None):
    '''Return an ElmStore where each subdataset is a DataArray

    Parameters:
        :dir_of_tiffs: directory of GeoTiff files where each is a
                      single band raster
        :meta:     meta from elm.readers.load_dir_of_tifs_meta; its
                   "band_meta" entries are updated in place
        :band_specs: list of elm.readers.BandSpec objects,
                    defaulting to reading all subdatasets
                    as bands (only used in the error message here, the
                    bands to read come from meta['band_order_info'])
    Returns:
        :X: ElmStore
    Raises:
        :ValueError: if meta['band_order_info'] is empty
    '''
    logger.debug('load_dir_of_tifs_array: {}'.format(dir_of_tiffs))
    band_order_info = meta['band_order_info']
    # NOTE(review): the listing is not used below; kept in case the call
    # is relied on to validate the directory - confirm.
    tifs = ls_tif_files(dir_of_tiffs)
    logger.info('Load tif files from {}'.format(dir_of_tiffs))

    if not len(band_order_info):
        raise ValueError('No matching bands with '
                         'band_specs {}'.format(band_specs))
    native_dims = ('y', 'x')
    elm_store_dict = OrderedDict()
    attrs = {'meta': meta}
    attrs['band_order'] = []
    for (idx, filename, band_spec), band_meta in zip(band_order_info,
                                                     meta['band_meta']):
        band_name = getattr(band_spec, 'name', band_spec)
        if not isinstance(band_spec, str):
            reader_kwargs = {
                k: getattr(band_spec, k)
                for k in READ_ARRAY_KWARGS if getattr(band_spec, k)
            }
        else:
            reader_kwargs = {}
        # translate the BandSpec names to the reader's keyword names
        if 'buf_xsize' in reader_kwargs:
            reader_kwargs['width'] = reader_kwargs.pop('buf_xsize')
        if 'buf_ysize' in reader_kwargs:
            reader_kwargs['height'] = reader_kwargs.pop('buf_ysize')
        if 'window' in reader_kwargs:
            reader_kwargs['window'] = tuple(
                map(tuple, reader_kwargs['window']))
            # TODO multx, multy should be handled here as well?
        if reader_kwargs:
            # ratio of the stored size to the size that is read
            multy = band_meta['height'] / reader_kwargs.get(
                'height', band_meta['height'])
            multx = band_meta['width'] / reader_kwargs.get(
                'width', band_meta['width'])
        else:
            multx = multy = 1.
        band_meta.update(reader_kwargs)
        geo_transform = take_geo_transform_from_meta(band_spec, **attrs)
        handle, raster = open_prefilter(filename, band_meta, **reader_kwargs)
        raster = raster_as_2d(raster)
        if getattr(band_spec, 'stored_coords_order', ['y', 'x'])[0] == 'y':
            rows, cols = raster.shape
        else:
            rows, cols = raster.T.shape
        if geo_transform is None:
            geo_transform = handle.get_transform()
        # Scale a fresh list: scaling the sequence in place would
        # compound on a geo transform that is shared (e.g. when the same
        # BandSpec is loaded again) and fails on a tuple
        geo_transform = list(geo_transform)
        geo_transform[1] *= multx
        geo_transform[-1] *= multy
        band_meta['geo_transform'] = geo_transform

        coords_x, coords_y = geotransform_to_coords(cols, rows,
                                                    band_meta['geo_transform'])
        elm_store_dict[band_name] = xr.DataArray(raster,
                                                 coords=[
                                                     ('y', coords_y),
                                                     ('x', coords_x),
                                                 ],
                                                 dims=native_dims,
                                                 attrs=band_meta)

        attrs['band_order'].append(band_name)
    gc.collect()
    return ElmStore(elm_store_dict, attrs=attrs)
def load_hdf4_array(datafile, meta, band_specs=None):
    '''Return an ElmStore where each subdataset is a DataArray

    Parameters:
        :datafile: filename
        :meta:     meta from elm.readers.load_hdf4_meta
        :band_specs: list of elm.readers.BandSpec objects,
                    defaulting to reading all subdatasets
                    as bands (named band_0, band_1, ...)
    Returns:
        :Elmstore: Elmstore of the hdf4 data
    Raises:
        :ValueError: if there is no band to load
    '''
    from elm.readers import ElmStore
    from elm.sample_util.metadata_selection import match_meta
    logger.debug('load_hdf4_array: {}'.format(datafile))
    # NOTE(review): the handle is not used below; kept because opening
    # the file may be relied on as an early check - confirm.
    f = gdal.Open(datafile, GA_ReadOnly)

    sds = meta['sub_datasets']
    band_metas = meta['band_meta']
    band_order_info = []
    if band_specs:
        for band_meta, s in zip(band_metas, sds):
            # keep the first band_spec that matches this subdataset and
            # remember its position to restore the band_specs order
            for idx, band_spec in enumerate(band_specs):
                if match_meta(band_meta, band_spec):
                    band_order_info.append((idx, band_meta, s, band_spec))
                    break
        band_order_info.sort(key=lambda x: x[0])
    else:
        band_order_info = [(idx, band_meta, s, 'band_{}'.format(idx))
                           for idx, (band_meta, s)
                           in enumerate(zip(band_metas, sds))]
    if not band_order_info:
        # Checked for both branches: with nothing to load the code below
        # would fail with an UnboundLocalError on "attrs"
        raise ValueError('No matching bands with '
                         'band_specs {}'.format(band_specs))
    native_dims = ('y', 'x')
    elm_store_data = OrderedDict()

    band_order = []
    for _, band_meta, s, band_spec in band_order_info:
        attrs = copy.deepcopy(meta)
        attrs.update(copy.deepcopy(band_meta))
        if isinstance(band_spec, BandSpec):
            name = band_spec.name
            reader_kwargs = {
                k: getattr(band_spec, k)
                for k in READ_ARRAY_KWARGS if getattr(band_spec, k)
            }
            geo_transform = take_geo_transform_from_meta(band_spec, **attrs)
        else:
            reader_kwargs = {}
            name = band_spec
            geo_transform = None
        reader_kwargs = window_to_gdal_read_kwargs(**reader_kwargs)
        dat0 = gdal.Open(s[0], GA_ReadOnly)
        band_meta.update(reader_kwargs)
        raster = raster_as_2d(dat0.ReadAsArray(**reader_kwargs))
        if geo_transform is None:
            geo_transform = dat0.GetGeoTransform()
        attrs['geo_transform'] = geo_transform
        # The attribute is "stored_coords_order"; the former check for
        # "store_coords_order" meant this branch was skipped
        stored_order = getattr(band_spec, 'stored_coords_order', None)
        if stored_order and stored_order[0] != 'y':
            rows, cols = raster.T.shape
        else:
            rows, cols = raster.shape
        coord_x, coord_y = geotransform_to_coords(cols, rows, geo_transform)

        canvas = Canvas(geo_transform=geo_transform,
                        buf_xsize=cols,
                        buf_ysize=rows,
                        dims=native_dims,
                        ravel_order='C',
                        bounds=geotransform_to_bounds(cols, rows,
                                                      geo_transform))
        attrs['canvas'] = canvas
        elm_store_data[name] = xr.DataArray(raster,
                                            coords=[('y', coord_y),
                                                    ('x', coord_x)],
                                            dims=native_dims,
                                            attrs=attrs)

        band_order.append(name)
        del dat0
    # The ElmStore attrs are those of the last band plus the band order
    attrs = copy.deepcopy(attrs)
    attrs['band_order'] = band_order
    gc.collect()
    return ElmStore(elm_store_data, attrs=attrs)