def inplace_isel(dataset, **indexers):
    invalid = [k for k in indexers if k not in dataset.dims]
    if invalid:
        raise ValueError("dimensions %r do not exist" % invalid)

    # all indexers should be int, slice or np.ndarrays
    indexers = [
        (k, (np.asarray(v)
             if not isinstance(v, (int, np.integer, slice))
             else v))
        for k, v in iteritems(indexers)
    ]

    for name, var in iteritems(dataset._variables):
        var_indexers = dict((k, v) for k, v in indexers if k in var.dims)
        dataset._variables[name] = var.isel(**var_indexers)
    return dataset
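# Hedged usage sketch for inplace_isel, not part of the original module: the
# dataset and variable names below are illustrative assumptions. It shows that
# the selection mutates the dataset that was passed in rather than returning a
# new object, and relies on the module-level numpy/iteritems imports that
# inplace_isel itself assumes.
def _example_inplace_isel():
    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'var1': (('dim1', 'dim2'), np.arange(32).reshape(8, 4))})
    out = inplace_isel(ds, dim1=slice(3))   # selects in place along 'dim1'
    assert out is ds                         # the same object comes back
    assert ds['var1'].shape == (3, 4)        # variables were modified in place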
def test_concat(self):
    # TODO: simplify and split this test case

    # drop the third dimension to keep things relatively understandable
    data = create_test_data()
    for k in list(data):
        if 'dim3' in data[k].dims:
            del data[k]

    split_data = [data.isel(dim1=slice(3)),
                  data.isel(dim1=slice(3, None))]
    self.assertDatasetIdentical(data, concat(split_data, 'dim1'))

    def rectify_dim_order(dataset):
        # return a new dataset with all variable dimensions transposed into
        # the order in which they are found in `data`
        return Dataset(dict((k, v.transpose(*data[k].dims))
                            for k, v in iteritems(dataset.data_vars)),
                       dataset.coords, attrs=dataset.attrs)

    for dim in ['dim1', 'dim2']:
        datasets = [g for _, g in data.groupby(dim, squeeze=False)]
        self.assertDatasetIdentical(data, concat(datasets, dim))

    dim = 'dim2'
    self.assertDatasetIdentical(
        data, concat(datasets, data[dim]))
    self.assertDatasetIdentical(
        data, concat(datasets, data[dim], coords='minimal'))

    datasets = [g for _, g in data.groupby(dim, squeeze=True)]
    concat_over = [k for k, v in iteritems(data.coords)
                   if dim in v.dims and k != dim]
    actual = concat(datasets, data[dim], coords=concat_over)
    self.assertDatasetIdentical(data, rectify_dim_order(actual))

    actual = concat(datasets, data[dim], coords='different')
    self.assertDatasetIdentical(data, rectify_dim_order(actual))

    # make sure the coords argument behaves as expected
    data.coords['extra'] = ('dim4', np.arange(3))
    for dim in ['dim1', 'dim2']:
        datasets = [g for _, g in data.groupby(dim, squeeze=True)]
        actual = concat(datasets, data[dim], coords='all')
        expected = np.array([data['extra'].values
                             for _ in range(data.dims[dim])])
        self.assertArrayEqual(actual['extra'].values, expected)

        actual = concat(datasets, data[dim], coords='different')
        self.assertDataArrayEqual(data['extra'], actual['extra'])
        actual = concat(datasets, data[dim], coords='minimal')
        self.assertDataArrayEqual(data['extra'], actual['extra'])

    # verify that the dim argument takes precedence over
    # concatenating dataset variables of the same name
    dim = (2 * data['dim1']).rename('dim1')
    datasets = [g for _, g in data.groupby('dim1', squeeze=False)]
    expected = data.copy()
    expected['dim1'] = dim
    self.assertDatasetIdentical(expected, concat(datasets, dim))
def test_open_encodings(self):
    # Create a netCDF file with explicit time units
    # and make sure it makes it into the encodings
    # and survives a round trip
    with create_tmp_file() as tmp_file:
        with nc4.Dataset(tmp_file, 'w') as ds:
            ds.createDimension('time', size=10)
            ds.createVariable('time', np.int32, dimensions=('time',))
            units = 'days since 1999-01-01'
            ds.variables['time'].setncattr('units', units)
            ds.variables['time'][:] = np.arange(10) + 4

        expected = Dataset()
        time = pd.date_range('1999-01-05', periods=10)
        encoding = {'units': units, 'dtype': np.dtype('int32')}
        expected['time'] = ('time', time, {}, encoding)

        with open_dataset(tmp_file) as actual:
            self.assertVariableEqual(actual['time'], expected['time'])
            actual_encoding = dict(
                (k, v) for k, v in iteritems(actual['time'].encoding)
                if k in expected['time'].encoding)
            self.assertDictEqual(actual_encoding, expected['time'].encoding)
def rectify_dim_order(dataset):
    # return a new dataset with all variable dimensions transposed into
    # the order in which they are found in `data`
    return Dataset(dict((k, v.transpose(*data[k].dims))
                        for k, v in iteritems(dataset.data_vars)),
                   dataset.coords, attrs=dataset.attrs)
def null_wrap(ds):
    """
    Given a data store this wraps each variable in a NullWrapper so that
    it appears to be out of memory.
    """
    variables = dict((k, Variable(v.dims, NullWrapper(v.values), v.attrs))
                     for k, v in iteritems(ds))
    return InMemoryDataStore(variables=variables, attributes=ds.attrs)
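# Hedged usage sketch for null_wrap, not in the original source: it assumes
# create_test_data() and NullWrapper come from this test module, and that a
# store built this way can be loaded back through Dataset.load_store as in
# older xarray versions. The intent is to check that values wrapped to look
# "out of memory" still round-trip unchanged.
def test_null_wrap_roundtrip(self):
    data = create_test_data()
    store = null_wrap(data)
    actual = Dataset.load_store(store)
    self.assertDatasetIdentical(data, actual)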
def test_compression_encoding(self):
    data = create_test_data()
    data['var2'].encoding.update({'zlib': True,
                                  'chunksizes': (5, 5),
                                  'fletcher32': True,
                                  'original_shape': data.var2.shape})
    with self.roundtrip(data) as actual:
        for k, v in iteritems(data['var2'].encoding):
            self.assertEqual(v, actual['var2'].encoding[k])

    # regression test for #156
    expected = data.isel(dim1=0)
    with self.roundtrip(expected) as actual:
        self.assertDatasetEqual(expected, actual)
def drop(self, labels, dim=None, inplace=False):
    """Drop variables or index labels from this dataset.

    Based on xarray.dataset.drop, but adds an inplace option.

    Parameters
    ----------
    labels : scalar or list of scalars
        Name(s) of variables or index labels to drop.
    dim : None or str, optional
        Dimension along which to drop index labels. By default (if
        ``dim is None``), drops variables rather than index labels.
    inplace : bool, optional
        If True, modify the original dataset; otherwise create a new one.

    Returns
    -------
    dropped : Dataset (self if inplace=True)
    """
    if utils.is_scalar(labels):
        labels = [labels]
    if dim is None:
        self._assert_all_in_dataset(labels)
        drop = set(labels)
        variables = OrderedDict(
            (k, v) for k, v in iteritems(self._variables) if k not in drop)
        coord_names = set(k for k in self._coord_names if k in variables)
        result = self._replace_vars_and_dims(variables, coord_names,
                                             inplace=inplace)
    else:
        try:
            index = self.indexes[dim]
        except KeyError:
            raise ValueError('dimension %r does not have coordinate labels'
                             % dim)
        new_index = index.drop(labels)
        result = self.loc[{dim: new_index}]
    return self if inplace else result
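# Hedged usage sketch for the drop method above, not from the original source:
# it assumes `ds` is an instance of the class that defines drop and behaves
# like an xarray Dataset, and the names 'temperature' and 'x' are purely
# illustrative. It contrasts the copying form with the in-place form.
def _example_drop(ds):
    # copying form: ds itself is untouched, a trimmed dataset comes back
    trimmed = ds.drop([0, 1], dim='x')
    # in-place form: the same object is returned with the variable removed
    same = ds.drop('temperature', inplace=True)
    assert same is ds and 'temperature' not in ds
    return trimmed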
def _dataset_multi_concat(
    datasets,
    dim,
    data_vars,
    coords,
    compat,
    positions,
    join="outer",
):
    """
    Concatenate a sequence of datasets along a dimension, trying
    concatenation along alternate dimensions when the chosen dimension is
    not present.

    This function is based on _dataset_concat from xarray.core.concat.py in
    xarray 0.15. It includes a modification to drop mismatched coordinates
    from datasets instead of throwing a ValueError. This drop removes the
    variable from coordinates, but it remains a variable in the dataset.
    """
    # Make sure we're working on a copy (we'll be loading variables)
    datasets = [ds.copy() for ds in datasets]

    # determine what dimensions we will be concatenating over, including the
    # preferred dim and any alternatives when the preferred dim is absent
    dims = _find_concat_dims(datasets, dim)
    dims, coordinates = _calc_concat_dims_coords(dims)

    datasets = align(*datasets, join=join, copy=False, exclude=dims)

    dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets)
    dim_names = set(dim_coords)
    unlabeled_dims = dim_names - coord_names

    both_data_and_coords = coord_names & data_names
    if both_data_and_coords:
        # Instead of throwing a ValueError, make the coordinates match by
        # removing the mismatched coordinate
        for ds in datasets:
            for variable in both_data_and_coords:
                if variable in ds.coords:
                    # This makes the variable no longer a coordinate, but
                    # does not remove it from the dataset entirely
                    ds._coord_names.remove(variable)
                    coord_names.discard(variable)

    # we don't want the concat dimensions in the result dataset yet
    for dim in dims:
        dim_coords.pop(dim, None)
        dims_sizes.pop(dim, None)

    # case where concat dimension is a coordinate or data_var but not a
    # dimension
    if (dim in coord_names or dim in data_names) and dim not in dim_names:
        datasets = [ds.expand_dims(dim) for ds in datasets]

    # determine which variables to concatenate
    concat_over, equals, concat_dim_lengths = _calc_concat_over(
        datasets, dims, dim_names, data_vars, coords, compat)

    # determine which variables to merge, and then merge them according to
    # compat
    variables_to_merge = (coord_names | data_names) - concat_over - dim_names

    result_vars = {}
    if variables_to_merge:
        to_merge = {var: [] for var in variables_to_merge}

        for ds in datasets:
            for var in variables_to_merge:
                if var in ds:
                    to_merge[var].append(ds.variables[var])

        for var in variables_to_merge:
            result_vars[var] = unique_variable(var, to_merge[var],
                                               compat=compat,
                                               equals=equals.get(var, None))
    else:
        result_vars = {}
    result_vars.update(dim_coords)

    # assign attrs and encoding from first dataset
    result_attrs = datasets[0].attrs
    result_encoding = datasets[0].encoding

    # check that global attributes are fixed across all datasets if necessary
    for ds in datasets[1:]:
        if compat == "identical" and not utils.dict_equiv(ds.attrs,
                                                          result_attrs):
            raise ValueError("Dataset global attributes not equal.")

    # we've already verified everything is consistent; now, calculate
    # shared dimension sizes so we can expand the necessary variables
    def ensure_common_dims(vars):
        # ensure each variable with the given name shares the same
        # dimensions and the same shape for all of them except along the
        # concat dimension
        common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
        # find the first concat dimension available in vars
        available_dims = [x for x in dims if x in common_dims]
        if not available_dims:
            # none of the concat dims are present - add the first one
            concat_dim = dims[0]
            common_dims = (concat_dim,) + common_dims
        else:
            concat_dim = available_dims[0]
        for var, dim_len in zip(vars, concat_dim_lengths[concat_dim]):
            if var.dims != common_dims:
                common_shape = tuple(dims_sizes.get(d, dim_len)
                                     for d in common_dims)
                var = var.expand_dims(common_dims, common_shape)
            yield var

    # stack up each variable to fill-out the dataset (in order)
    # n.b. this loop preserves variable order, needed for groupby.
    for k in datasets[0].variables:
        if k in concat_over:
            try:
                vars = ensure_common_dims([ds.variables[k]
                                           for ds in datasets])
            except KeyError:
                raise ValueError("%r is not present in all datasets." % k)
            # get the dimension to concatenate this variable on - choose the
            # first applicable dim from dims
            dim = _get_concat_dim(dims, [ds.variables[k] for ds in datasets])
            combined = concat_vars(vars, dim, positions)
            assert isinstance(combined, Variable)
            result_vars[k] = combined

    result = Dataset(result_vars, attrs=result_attrs)
    absent_coord_names = coord_names - set(result.variables)
    if absent_coord_names:
        raise ValueError(
            "Variables %r are coordinates in some datasets but not others."
            % absent_coord_names)

    # current versions of dataset.set_coords and dataset.drop force a
    # _assert_all_in_dataset check that we don't want; xarray 0.15 has the
    # option to disable this via errors='ignore', but for now just call the
    # underlying logic
    # result = result.set_coords(coord_names, errors='ignore')
    result._coord_names.update(coord_names)
    result.encoding = result_encoding

    # result = result.drop(unlabeled_dims, errors='ignore')
    drop = set(unlabeled_dims)
    variables = OrderedDict(
        (k, v) for k, v in iteritems(result._variables) if k not in drop)
    coord_names = set(k for k in result._coord_names if k in variables)
    result._replace_vars_and_dims(variables, coord_names)

    for coord in coordinates:
        if coord:
            # add concat dimension last to ensure that it's in the final
            # Dataset
            result[coord.name] = coord

    return result
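# Hedged sketch of calling _dataset_multi_concat, not from the original
# source: the datasets and argument values are illustrative and mirror the
# keywords accepted by xarray.concat in 0.15 ('all', 'different', 'equals').
# It assumes the helper functions used above (_find_concat_dims,
# _calc_concat_dims_coords, _get_concat_dim, ...) are importable from the
# same module.
def _example_multi_concat():
    import numpy as np
    import xarray as xr

    ds_a = xr.Dataset({'temp': ('time', np.zeros(3))})
    ds_b = xr.Dataset({'temp': ('time', np.ones(3))})
    return _dataset_multi_concat(
        [ds_a, ds_b], dim='time', data_vars='all', coords='different',
        compat='equals', positions=None)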