def _decode_datetime_cf(data_array, decode_times, decode_timedelta):
    """Decode datetime/timedelta coordinates of ``data_array`` per CF conventions.

    Parameters
    ----------
    data_array : xarray.DataArray
        Array whose coordinates may carry CF time encodings in their attrs.
    decode_times : bool
        If True, coordinates whose ``units`` attribute contains "since"
        (e.g. "hours since 2000-01-01") are decoded to datetimes.
    decode_timedelta : bool or None
        If True, coordinates whose ``units`` attribute is in
        ``times.TIME_UNITS`` are decoded to timedeltas.  If None, it
        defaults to the value of ``decode_times``.

    Returns
    -------
    xarray.DataArray
        ``data_array`` with any matching time coordinates decoded and
        re-assigned (unchanged coordinates are left as-is).
    """
    if decode_timedelta is None:
        # Timedelta decoding follows decode_times unless explicitly set.
        decode_timedelta = decode_times
    for coord in data_array.coords:
        time_var = None
        # "since" in the units string marks a CF datetime encoding.
        if decode_times and "since" in data_array[coord].attrs.get("units", ""):
            time_var = times.CFDatetimeCoder(use_cftime=True).decode(
                as_variable(data_array[coord]), name=coord
            )
        elif (
            decode_timedelta
            and data_array[coord].attrs.get("units") in times.TIME_UNITS
        ):
            time_var = times.CFTimedeltaCoder().decode(
                as_variable(data_array[coord]), name=coord
            )
        if time_var is not None:
            # Unpack the decoded variable and re-wrap it as an IndexVariable
            # so it can be assigned back as a coordinate.
            dimensions, data, attributes, encoding = variables.unpack_for_decoding(
                time_var
            )
            data_array = data_array.assign_coords(
                {
                    coord: IndexVariable(
                        dims=dimensions,
                        data=data,
                        attrs=attributes,
                        encoding=encoding,
                    )
                }
            )
    return data_array
def to_xarray_variable(self, value):
    """Convert the input value to an `xarray.Variable` object.

    Parameters
    ----------
    value : object
        May be a single value, an array-like, a ``(dims, data[, attrs])``
        tuple, another `xarray.Variable` object or a `xarray.DataArray`
        object.  When None, the `default_value` attribute is used instead.

    Returns
    -------
    variable : `xarray.Variable`
        An xarray Variable object with data corresponding to the input
        (or default) value and with attributes ('description' and other
        key:value pairs found in `Variable.attrs`).
    """
    source = self.default_value if value is None else value

    # A 1-d array without a dimension name gets the placeholder dimension
    # name 'this_variable'; a process/model renames it to the variable's
    # real name later.
    variable = as_variable(source, name='this_variable')

    variable.attrs.update(self.attrs)
    if self.description:
        variable.attrs['description'] = self.description
    return variable
def _calc_concat_dims_coords(dims):
    """Infer dimension names and 1d coordinate variables (if appropriate)
    for concatenating along the new dimensions.

    Based on xarray's ``_calc_concat_dim_coord``, extended to support
    multiple dims.
    """
    dim_names = []
    coords = []
    for entry in dims:
        if isinstance(entry, basestring):
            # A plain name: no coordinate to attach.
            name, coord = entry, None
        elif not hasattr(entry, 'dims'):
            # Not a DataArray or IndexVariable: treat as raw labels.
            name = getattr(entry, "name", None)
            if name is None:
                name = "concat_dim"
            coord = IndexVariable(name, entry)
        elif not hasattr(entry, 'name'):
            # Variable-like without a name: promote to an index variable
            # and take its sole dimension as the name.
            coord = as_variable(entry).to_index_variable()
            name, = coord.dims
        else:
            # Already a named coordinate object.
            coord = entry
            name, = coord.dims
        dim_names.append(name)
        coords.append(coord)
    return dim_names, coords
def enforce_cf_variable(var, mask_and_scale=True):
    """Enforce CF-compliant metadata/formatting on a GEOS-Chem output Variable.

    Until a bug with lazily-loaded data and masking/scaling is resolved in
    xarray, you have the option to manually mask and scale the data here.

    Parameters
    ----------
    var : xarray.Variable
        A variable holding information decoded from GEOS-Chem output.
    mask_and_scale : bool
        Flag to scale and mask the data given the unit conversions provided.

    Returns
    -------
    out : xarray.Variable
        The original variable processed to conform to CF standards.

    .. note:: This method borrows heavily from the ideas in
              ``xarray.decode_cf_variable``.
    """
    var = as_variable(var)
    # Access _data (not .data) so lazily-loaded arrays stay unloaded.
    values = var._data
    attributes = var.attrs.copy()
    coding = var.encoding.copy()

    # GEOS-Chem writes a single "scale" value; translate it into the CF
    # "scale_factor" attribute and encoding entry.
    if 'scale' in attributes:
        factor = attributes.pop('scale')
        attributes['scale_factor'] = factor
        coding['scale_factor'] = factor
        # TODO: once the xr.decode_cf bug is fixed, we won't need to
        #       manually handle masking/scaling.
        if mask_and_scale:
            values = factor * values

    # Convert GEOS-Chem's "unit" attribute into a CF-compliant "units".
    # TODO: How do we want to handle parts-per-* units?  These are not part
    #       of the udunits standard, and the CF conventions suggest units
    #       like 1e-6 for parts-per-million; GEOS-Chem output can mix mass
    #       and volume/molar mixing ratios, so that edge case needs a plan.
    if 'unit' in attributes:
        attributes['units'] = get_cfcompliant_units(attributes.pop('unit'))

    return Variable(var.dims, values, attributes, encoding=coding)
def assertVariableNotEqual(self, v1, v2):
    """Assert that ``v1`` (coerced to a Variable) does not equal ``v2``."""
    # Use a bare assert with the operands as the failure message, matching
    # the sibling helpers assertVariableEqual/assertVariableIdentical, so a
    # failure shows the two variables instead of "False is not true".
    assert not as_variable(v1).equals(v2), (v1, v2)
def assertVariableIdentical(self, v1, v2):
    """Assert that ``v1`` (coerced to a Variable) is identical to ``v2``."""
    left = as_variable(v1)
    assert left.identical(v2), (v1, v2)
def assertVariableEqual(self, v1, v2):
    """Assert that ``v1`` (coerced to a Variable) equals ``v2``."""
    left = as_variable(v1)
    assert left.equals(v2), (v1, v2)
def _infer_coords_and_dims(shape, coords, dims):
    """All the logic for creating a new DataArray.

    Validates ``coords``/``dims`` against ``shape`` and returns an
    ``(OrderedDict of coords, tuple of dims)`` pair.

    Note: Copied with minor modifications from xarray.variable.py version
    0.9.6 as it was not part of the xarray public API.
    """
    # A sequence of coords must supply exactly one entry per dimension.
    if (coords is not None and not is_dict_like(coords) and
            len(coords) != len(shape)):
        raise ValueError('coords is not dict-like, but it has %s items, '
                         'which does not match the %s dimensions of the '
                         'data' % (len(coords), len(shape)))

    # A single dimension name is promoted to a 1-tuple.
    if isinstance(dims, six.string_types):
        dims = (dims, )

    if dims is None:
        # Fall back to auto-generated names, then try to replace them with
        # names inferred from the coords sequence.
        dims = ['dim_%s' % n for n in range(len(shape))]
        if coords is not None and len(coords) == len(shape):
            # try to infer dimensions from coords
            if is_dict_like(coords):
                raise TypeError(
                    'inferring DataArray dimensions from dictionary '
                    'like ``coords`` has been deprecated. Use an '
                    'explicit list of ``dims`` instead.')
            else:
                for n, (dim, coord) in enumerate(zip(dims, coords)):
                    coord = as_variable(coord,
                                        name=dims[n]).to_index_variable()
                    dims[n] = coord.name
        dims = tuple(dims)
    else:
        # Explicit dims must all be strings.
        for d in dims:
            if not isinstance(d, six.string_types):
                raise TypeError('dimension %s is not a string' % d)

    # Normalize every coordinate to a Variable keyed by its name.
    new_coords = OrderedDict()
    if is_dict_like(coords):
        for k, v in coords.items():
            new_coords[k] = as_variable(v, name=k)
    elif coords is not None:
        for dim, coord in zip(dims, coords):
            var = as_variable(coord, name=dim)
            var.dims = (dim, )
            new_coords[dim] = var

    # Check each coordinate's dims and sizes against the data shape.
    sizes = dict(zip(dims, shape))
    for k, v in new_coords.items():
        if any(d not in dims for d in v.dims):
            raise ValueError('coordinate %s has dimensions %s, but these '
                             'are not a subset of the DataArray '
                             'dimensions %s' % (k, v.dims, dims))
        for d, s in zip(v.dims, v.shape):
            if s != sizes[d]:
                raise ValueError('conflicting sizes for dimension %r: '
                                 'length %s on the data but length %s on '
                                 'coordinate %r' % (d, sizes[d], s, k))

    assert_unique_multiindex_level_names(new_coords)
    return new_coords, dims
def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
                       decode_times=True, decode_endianness=True):
    """
    Decodes a variable which may hold CF encoded information.

    This includes variables that have been masked and scaled, which
    hold CF style time variables (this is almost always the case if
    the dataset has been serialized) and which have strings encoded
    as character arrays.

    Parameters
    ----------
    var : Variable
        A variable holding potentially CF encoded information.
    concat_characters : bool
        Should character arrays be concatenated to strings, for
        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
    mask_and_scale : bool
        Lazily scale (using scale_factor and add_offset) and mask
        (using _FillValue).
    decode_times : bool
        Decode cf times ('hours since 2000-01-01') to np.datetime64.
    decode_endianness : bool
        Decode arrays from non-native to native endianness.

    Returns
    -------
    out : Variable
        A variable holding the decoded equivalent of var
    """
    # use _data instead of data so as not to trigger loading data
    var = as_variable(var)
    data = var._data
    dimensions = var.dims
    attributes = var.attrs.copy()
    encoding = var.encoding.copy()
    original_dtype = data.dtype

    if concat_characters:
        # Single-byte character arrays collapse their last axis into strings.
        if (data.dtype.kind == 'S' and data.dtype.itemsize == 1 and
                data.shape[-1] != 0):
            dimensions = dimensions[:-1]
            data = CharToStringArray(data)

    if mask_and_scale:
        if 'missing_value' in attributes:
            # missing_value is deprecated, but we still want to support it as
            # an alias for _FillValue.
            if ('_FillValue' in attributes and
                    not utils.equivalent(attributes['_FillValue'],
                                         attributes['missing_value'])):
                # BUG FIX: this message was previously split into two
                # ValueError arguments by a stray comma and read
                # "Considering ... corrected"; build one coherent message.
                raise ValueError("Discovered conflicting _FillValue "
                                 "and missing_value. Consider "
                                 "opening the offending dataset using "
                                 "decode_cf=False, correcting the attributes "
                                 "and decoding explicitly using "
                                 "xarray.conventions.decode_cf(ds)")
            attributes['_FillValue'] = attributes.pop('missing_value')

        fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
        if fill_value.size > 1:
            warnings.warn("variable has multiple fill values {0}, decoding "
                          "all values to NaN.".format(str(fill_value)),
                          RuntimeWarning, stacklevel=3)
        scale_factor = pop_to(attributes, encoding, 'scale_factor')
        add_offset = pop_to(attributes, encoding, 'add_offset')
        if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
                scale_factor is not None or add_offset is not None):
            # String fill values force object dtype; numeric ones float.
            if fill_value.dtype.kind in ['U', 'S']:
                dtype = object
            else:
                dtype = float
            data = MaskedAndScaledArray(data, fill_value, scale_factor,
                                        add_offset, dtype)

    if decode_times and 'units' in attributes:
        if 'since' in attributes['units']:
            # datetime
            units = pop_to(attributes, encoding, 'units')
            calendar = pop_to(attributes, encoding, 'calendar')
            data = DecodedCFDatetimeArray(data, units, calendar)
        elif attributes['units'] in TIME_UNITS:
            # timedelta
            units = pop_to(attributes, encoding, 'units')
            data = DecodedCFTimedeltaArray(data, units)

    if decode_endianness and not data.dtype.isnative:
        # do this last, so it's only done if we didn't already unmask/scale
        data = NativeEndiannessArray(data)
        original_dtype = data.dtype

    if 'dtype' in encoding:
        if original_dtype != encoding['dtype']:
            warnings.warn("CF decoding is overwriting dtype")
    else:
        encoding['dtype'] = original_dtype

    if 'dtype' in attributes and attributes['dtype'] == 'bool':
        del attributes['dtype']
        data = BoolTypeArray(data)

    return Variable(dimensions, indexing.LazilyIndexedArray(data),
                    attributes, encoding=encoding)
def test_as_variable(self):
    """Exercise as_variable against every supported input form."""
    data = np.arange(10)
    expected = Variable("x", data)

    # A Variable passes through unchanged.
    self.assertVariableIdentical(expected, as_variable(expected))

    # A DataArray pulled from a Dataset is converted to a Variable.
    ds = Dataset({"x": expected})
    self.assertVariableIdentical(expected, as_variable(ds["x"]))
    self.assertNotIsInstance(ds["x"], Variable)
    self.assertIsInstance(as_variable(ds["x"]), Variable)

    # Any object exposing .values and .dims is accepted (duck typing).
    FakeVariable = namedtuple("FakeVariable", "values dims")
    fake_xarray = FakeVariable(expected.values, expected.dims)
    self.assertVariableIdentical(expected, as_variable(fake_xarray))

    # A (dims, values) tuple is also accepted.
    xarray_tuple = (expected.dims, expected.values)
    self.assertVariableIdentical(expected, as_variable(xarray_tuple))

    # A flat tuple of scalars cannot be converted.
    with self.assertRaisesRegexp(TypeError, "tuples to convert"):
        as_variable(tuple(data))
    # A bare array needs a name (or explicit dims) to infer dimensions.
    with self.assertRaisesRegexp(TypeError,
                                 "without an explicit list of dimensions"):
        as_variable(data)

    # Supplying a name makes a 1-d array a (dimension) Coordinate.
    actual = as_variable(data, name="x")
    self.assertVariableIdentical(expected, actual)
    self.assertIsInstance(actual, Coordinate)

    # A scalar becomes a 0-d Variable.
    actual = as_variable(0)
    expected = Variable([], 0)
    self.assertVariableIdentical(expected, actual)
def test_as_variable(self):
    """Exercise as_variable against every supported input form."""
    data = np.arange(10)
    expected = Variable('x', data)

    # A Variable passes through unchanged.
    self.assertVariableIdentical(expected, as_variable(expected))

    # A DataArray pulled from a Dataset is converted to a Variable by
    # default; strict=False keeps it as a DataArray.
    ds = Dataset({'x': expected})
    self.assertVariableIdentical(expected, as_variable(ds['x']))
    self.assertNotIsInstance(ds['x'], Variable)
    self.assertIsInstance(as_variable(ds['x']), Variable)
    self.assertIsInstance(as_variable(ds['x'], strict=False), DataArray)

    # Any object exposing .values and .dims is accepted (duck typing).
    FakeVariable = namedtuple('FakeVariable', 'values dims')
    fake_xarray = FakeVariable(expected.values, expected.dims)
    self.assertVariableIdentical(expected, as_variable(fake_xarray))

    # A (dims, values) tuple is also accepted.
    xarray_tuple = (expected.dims, expected.values)
    self.assertVariableIdentical(expected, as_variable(xarray_tuple))

    # A flat tuple of scalars cannot be converted.
    with self.assertRaisesRegexp(TypeError, 'cannot convert arg'):
        as_variable(tuple(data))
    # A bare array needs a key (or explicit dims) to infer dimensions.
    with self.assertRaisesRegexp(TypeError, 'cannot infer .+ dimensions'):
        as_variable(data)

    # Supplying a key names the inferred dimension.
    actual = as_variable(data, key='x')
    self.assertVariableIdentical(expected, actual)

    # A scalar becomes a 0-d Variable.
    actual = as_variable(0)
    expected = Variable([], 0)
    self.assertVariableIdentical(expected, actual)
def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
                       decode_times=True, decode_endianness=True):
    """
    Decodes a variable which may hold CF encoded information.

    This includes variables that have been masked and scaled, which
    hold CF style time variables (this is almost always the case if
    the dataset has been serialized) and which have strings encoded
    as character arrays.

    Parameters
    ----------
    var : Variable
        A variable holding potentially CF encoded information.
    concat_characters : bool
        Should character arrays be concatenated to strings, for
        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
    mask_and_scale : bool
        Lazily scale (using scale_factor and add_offset) and mask
        (using _FillValue).
    decode_times : bool
        Decode cf times ('hours since 2000-01-01') to np.datetime64.
    decode_endianness : bool
        Decode arrays from non-native to native endianness.

    Returns
    -------
    out : Variable
        A variable holding the decoded equivalent of var
    """
    # use _data instead of data so as not to trigger loading data
    var = as_variable(var)
    data = var._data
    dimensions = var.dims
    attributes = var.attrs.copy()
    encoding = var.encoding.copy()
    original_dtype = data.dtype

    if concat_characters:
        # Single-byte character arrays collapse their last axis into strings.
        if (data.dtype.kind == 'S' and data.dtype.itemsize == 1 and
                data.shape[-1] != 0):
            dimensions = dimensions[:-1]
            data = CharToStringArray(data)

    if mask_and_scale:
        if 'missing_value' in attributes:
            # missing_value is deprecated, but we still want to support it as
            # an alias for _FillValue.
            if ('_FillValue' in attributes and
                    not utils.equivalent(attributes['_FillValue'],
                                         attributes['missing_value'])):
                # BUG FIX: this message was previously split into two
                # ValueError arguments by a stray comma and read
                # "Considering ... corrected"; build one coherent message.
                raise ValueError("Discovered conflicting _FillValue "
                                 "and missing_value. Consider "
                                 "opening the offending dataset using "
                                 "decode_cf=False, correcting the attributes "
                                 "and decoding explicitly using "
                                 "xarray.conventions.decode_cf(ds)")
            attributes['_FillValue'] = attributes.pop('missing_value')

        fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
        if fill_value.size > 1:
            warnings.warn("variable has multiple fill values {0}, decoding "
                          "all values to NaN.".format(str(fill_value)),
                          RuntimeWarning, stacklevel=3)
        scale_factor = pop_to(attributes, encoding, 'scale_factor')
        add_offset = pop_to(attributes, encoding, 'add_offset')
        if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
                scale_factor is not None or add_offset is not None):
            # String fill values force object dtype; numeric ones float.
            if fill_value.dtype.kind in ['U', 'S']:
                dtype = object
            else:
                dtype = float
            data = MaskedAndScaledArray(data, fill_value, scale_factor,
                                        add_offset, dtype)

    if decode_times and 'units' in attributes:
        if 'since' in attributes['units']:
            # datetime
            units = pop_to(attributes, encoding, 'units')
            calendar = pop_to(attributes, encoding, 'calendar')
            data = DecodedCFDatetimeArray(data, units, calendar)
        elif attributes['units'] in TIME_UNITS:
            # timedelta
            units = pop_to(attributes, encoding, 'units')
            data = DecodedCFTimedeltaArray(data, units)

    if decode_endianness and not data.dtype.isnative:
        # do this last, so it's only done if we didn't already unmask/scale
        data = NativeEndiannessArray(data)
        original_dtype = data.dtype

    if 'dtype' in encoding:
        if original_dtype != encoding['dtype']:
            warnings.warn("CF decoding is overwriting dtype")
    else:
        encoding['dtype'] = original_dtype

    if 'dtype' in attributes and attributes['dtype'] == 'bool':
        del attributes['dtype']
        data = BoolTypeArray(data)

    return Variable(dimensions, indexing.LazilyIndexedArray(data),
                    attributes, encoding=encoding)
def test_as_variable(self):
    """Exercise as_variable against every supported input form."""
    data = np.arange(10)
    expected = Variable('x', data)

    # A Variable passes through unchanged.
    self.assertVariableIdentical(expected, as_variable(expected))

    # A DataArray pulled from a Dataset is converted to a Variable.
    ds = Dataset({'x': expected})
    self.assertVariableIdentical(expected, as_variable(ds['x']))
    self.assertNotIsInstance(ds['x'], Variable)
    self.assertIsInstance(as_variable(ds['x']), Variable)

    # Any object exposing .values and .dims is accepted (duck typing).
    FakeVariable = namedtuple('FakeVariable', 'values dims')
    fake_xarray = FakeVariable(expected.values, expected.dims)
    self.assertVariableIdentical(expected, as_variable(fake_xarray))

    # A (dims, values) tuple is also accepted.
    xarray_tuple = (expected.dims, expected.values)
    self.assertVariableIdentical(expected, as_variable(xarray_tuple))

    # A flat tuple of scalars cannot be converted.
    with self.assertRaisesRegexp(TypeError, 'tuples to convert'):
        as_variable(tuple(data))
    # A bare array needs a name (or explicit dims) to infer dimensions.
    with self.assertRaisesRegexp(TypeError,
                                 'without an explicit list of dimensions'):
        as_variable(data)

    # Supplying a name makes a 1-d array an IndexVariable.
    actual = as_variable(data, name='x')
    self.assertVariableIdentical(expected, actual)
    self.assertIsInstance(actual, IndexVariable)

    # A scalar becomes a 0-d Variable.
    actual = as_variable(0)
    expected = Variable([], 0)
    self.assertVariableIdentical(expected, actual)