def __run(self, vname, indices, _data=None, _value=None):
    """Read (_data) or write (_value) the requested indices across all touched partition files."""
    shape = self.shape
    masterShape = self.master
    partitions = getPartitions(indices, shape, masterShape)
    masterIndices = getMasterIndices(indices, shape, masterShape)
    for part in partitions:
        idata, ipart = getSubIndex(part, shape, masterIndices)

        # GET
        if _data is not None:
            d = _data.flatten()
            filepath = self._getFile(vname, part)
            with NetCDF(filepath, "r") as netcdf:
                d = getItemNetCDF(netcdf[vname], d, ipart, idata)
            if not self.parent.localOnly and self.parent.autoRemove:
                os.remove(filepath)
            _data = d.reshape(_data.shape)

        # SET
        if _value is not None:
            filepath = self._setFile(vname, part)
            with NetCDF(filepath, "r+") as netcdf:
                setItemNetCDF(netcdf[vname], _value, ipart, idata)
            if not self.parent.localOnly:
                self.parent.s3.upload(filepath)
                if self.parent.autoRemove:
                    os.remove(filepath)

    return _data

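# The helpers used above (getPartitions/getMasterIndices/getSubIndex) reduce a
# query to "which child files are touched, and which cells inside each one".
# Below is a minimal, standalone sketch of that idea for a 1-D variable, with
# made-up numbers; it illustrates the mapping, not the library's actual
# implementation:
#
# import numpy as np
#
# child_len = 4                              # records per child (partition) file
# indices = np.array([0, 3, 4, 9, 10])       # master indices requested
#
# partitions = np.unique(indices // child_len)   # child files to open: [0 1 2]
# for p in partitions:
#     local = indices[indices // child_len == p] % child_len
#     print("partition {} -> local indices {}".format(p, local))
# # partition 0 -> local indices [0 3]
# # partition 1 -> local indices [0]
# # partition 2 -> local indices [1 2]
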
def _setFile(self, vname, part):
    """Ensure the partition file for `vname` exists locally: create it or download it from S3."""
    localOnly = self.parent.localOnly
    s3 = self.parent.s3
    filepath = self._getFilepath(vname, part)
    if not os.path.exists(filepath):
        if localOnly or not s3.exists(filepath):
            NetCDF.create(filepath, dimensions=self.childDimensions, variables=copy.deepcopy(self.variables))
        else:
            s3.download(filepath)
    return filepath

def _quickSet(self, vname, partIndex, value):
    """Overwrite a whole partition file along the first dimension in one shot."""
    parts = np.zeros(self.ndata, dtype="int")
    parts[0] = partIndex
    filepath = self._setFile(vname, parts)
    with NetCDF(filepath, "r+") as netcdf:
        netcdf[vname][:] = value
    if not self.parent.localOnly:
        self.parent.s3.upload(filepath)

def _quickGet(self, vname, i):
    """Fast path: read record `i` along the first dimension from its single partition file."""
    parts = np.zeros(self.ndata, dtype="int")
    parts[0] = int(np.floor(i / self.child[0]))
    index = int(i % self.child[0])
    filepath = self._getFile(vname, parts)
    with NetCDF(filepath, "r") as netcdf:
        var = netcdf[vname][index]
    return var

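# The fast path above only touches the first dimension: record i of a variable
# chunked along dimension 0 lives in exactly one child file. A standalone
# illustration of that arithmetic (the child length of 512 is made up):
#
# child0 = 512
# for i in [0, 511, 512, 1000]:
#     print("record {} -> partition {}, local index {}".format(i, i // child0, i % child0))
# # record 0    -> partition 0, local index 0
# # record 511  -> partition 0, local index 511
# # record 512  -> partition 1, local index 0
# # record 1000 -> partition 1, local index 488
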
class S3NetCDF(object):
    """
    A NetCDF-like container for partitioned NetCDF files stored locally and/or on S3.

    Parameters
    ----------
    obj : dict
        name : str
            Name of the NetCDF folder.
        folder : str, path
        metadata : dict, optional
        squeeze : bool, default False
            Squeeze returned data (mainly used for testing).
        bucket : str
            Name of the S3 bucket.
        localOnly : bool, default True
            Include or ignore S3 storage.
        ncSize : float, optional
            Maximum partition size (MB).
        nca : dict
            dimensions : {name: value(int), ...}
            groups : {name: {variables: {name: {type: str, dimensions: [str], ...}}}}
    mode : str, default "r"

    Attributes
    ----------
    name : str
        Name of the NetCDF folder.
    folder : path
    localOnly : bool
    bucket : str
    ncPath : path
        File containing nodes, connectivity table and static variables.
    ncaPath : path
        Master file; contains information about the master and child NetCDF files.
    groups : {str: S3NetCDFGroup}
        Information on the different groups/folders (e.g. "s", "t").
        Each group contains different variables sharing the same dimensions.
    """

    def __init__(self, obj, mode="r"):
        obj = copy.deepcopy(obj)
        self.mode = mode
        self.name = name = obj.pop("name", None)
        self.bucket = bucket = obj.pop("bucket", None)
        self.dynamodb = dynamodb = obj.pop("dynamodb", None)
        self.s3prefix = obj.pop("s3prefix", None)
        self.localOnly = localOnly = obj.pop("localOnly", True)
        self.squeeze = obj.pop("squeeze", False)
        self.cacheLocation = obj.pop("cacheLocation", os.getcwd())
        self.maxPartitions = obj.pop("maxPartitions", 10)
        self.ncSize = ncSize = obj.pop("ncSize", 10)
        self.memorySize = obj.pop("memorySize", 20)
        self.cacheSize = obj.pop("cacheSize", 10) * 1024**2
        self.verbose = verbose = obj.pop("verbose", False)
        self.overwrite = overwrite = obj.pop("overwrite", False)
        self.autoRemove = obj.pop("autoRemove", True)
        self.s3 = s3 = S3Client(self, obj.pop("credentials", {}))
        self.folder = folder = os.path.join(self.cacheLocation, self.name)
        self.cache = Cache(self)
        self.groups = None
        self.nca = None

        if name is None:
            raise Exception("NetCDF needs a name")
        if not localOnly and bucket is None:
            raise Exception("Need a S3 bucket")
        if not os.path.exists(folder):
            os.makedirs(folder, exist_ok=True)

        self.ncaPath = ncaPath = os.path.join(folder, "{}.nca".format(name))
        if not os.path.exists(ncaPath) or overwrite:
            if localOnly or not s3.exists(ncaPath) or overwrite:
                if verbose:
                    print("Creating a new .nca from object (localOnly={},ncaPath={},overwrite={})".format(localOnly, ncaPath, overwrite))
                if not "nca" in obj:
                    raise Exception("NetCDF needs a nca object")
                createNetCDF(ncaPath, ncSize=ncSize, **obj["nca"])
                if not localOnly:
                    s3.upload(ncaPath)
                if not localOnly and dynamodb:
                    s3.insert()
            elif s3.exists(ncaPath):
                if verbose:
                    print("Downloading .nca from S3 - {}".format(ncaPath))
                s3.download(ncaPath)
            else:
                raise Exception("Unknown error")

    def __enter__(self):
        self.nca = NetCDF(self.ncaPath, self.mode)
        self.groups = {}
        for groupname in self.nca.groups:
            self.groups[groupname] = S3NetCDFGroup(self, groupname)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.nca.close()

    def updateMetadata(self, obj):
        self.nca.updateMetadata(obj)

    @property
    def obj(self):
        return self.nca.obj

    @property
    def dimensions(self):
        return self.nca.obj['dimensions']

    @property
    def variables(self):
        return self.nca.allvariables

    def getGroupsByVariable(self, vname):
        groupsByVariable = self.nca.obj['groupsByVariable']
        if not vname in groupsByVariable:
            raise Exception("Variable {} does not exist".format(vname))
        return groupsByVariable[vname]

    def getVariablesByDimension(self, dname):
        variablesByDimension = self.nca.obj['variablesByDimension']
        if not dname in variablesByDimension:
            raise Exception("Dimension {} does not exist".format(dname))
        return variablesByDimension[dname]

    def setlocalOnly(self, value):
        self.localOnly = value

    def _item_(self, idx):
        if not isinstance(idx, tuple) or len(idx) < 2:
            raise TypeError("groupname and variablename are required, e.g. netcdf2d['{groupname}','{variablename}']")
        idx = list(idx)
        groupname = idx.pop(0)
        idx = tuple(idx)
        groups = self.groups
        if not groupname in groups:
            raise Exception("Group '{}' does not exist".format(groupname))
        group = groups[groupname]
        return group, idx

    def __getitem__(self, idx):
        """
        netcdf2d["{groupname}","{variablename}",{...indices...}]
        """
        if isinstance(idx, str):
            return self.groups[idx]
        group, idx = self._item_(idx)
        data = group[idx]
        data = np.squeeze(data) if self.squeeze else data
        return data

    def __setitem__(self, idx, value):
        """
        netcdf2d["{groupname}","{variablename}",{...indices...}] = np.array()
        """
        group, idx = self._item_(idx)
        group[idx] = value

    def query(self, obj, return_dimensions=False, return_indices=False):
        """
        Get data using a query object instead of __getitem__.

        Searches `obj` for keys such as "group", "variable" and dimension
        names (e.g. "x", "time"). If "group" is not given, the group is
        inferred from the variable name, preferring the group that requires
        the fewest partitions (e.g. "s", "t").
        """
        dimensions = self.dimensions
        obj = parseObj(obj, dimensions)
        vname = obj['variable']
        gname = obj['group']
        if gname is None:
            groups = self.getGroupsByVariable(vname)
            if len(groups) > 1:
                dims = obj.pop('dims')
                _groups = list(filter(lambda x: any(dim in dims for dim in self.groups[x].dimensions), groups))
                if len(_groups) > 0:
                    groups = _groups
            gname = min(groups, key=lambda x: len(self.groups[x].getPartitions(vname, obj)))

        partitions, group, idx, indices = self.groups[gname].getPartitions(vname, obj, False)
        if len(partitions) > self.maxPartitions:
            raise Exception("Change group or select a smaller query - {} partitions requested (maxPartitions is {})".format(len(partitions), self.maxPartitions))

        data = group[(vname, *idx)]
        data = np.squeeze(data) if self.squeeze else data

        if return_dimensions and return_indices:
            return data, group.dimensions, indices
        if return_dimensions:
            return data, group.dimensions
        if return_indices:
            return data, indices
        return data

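# End-to-end usage sketch. The spec below follows the constructor and
# createNetCDF conventions in this file; the names ("demo", group "s",
# variable "u"), the cache location and localOnly mode are illustrative,
# not prescriptive:
#
# import numpy as np
#
# demo = {
#     "name": "demo",
#     "cacheLocation": "/tmp",
#     "localOnly": True,             # keep everything on disk; no bucket needed
#     "nca": {
#         "metadata": {"title": "demo"},
#         "dimensions": {"ntime": 1000, "nnode": 100},
#         "groups": {
#             "s": {"dimensions": ["ntime", "nnode"],
#                   "variables": {"u": {"type": "f4", "units": "m/s"}}},
#         },
#     },
# }
#
# with S3NetCDF(demo, "r+") as netcdf:
#     netcdf["s", "u", 0] = np.zeros(100)                  # write time step 0
#     step0 = netcdf["s", "u", 0]                          # read it back via __getitem__
#     same = netcdf.query({"group": "s", "variable": "u"}) # or via query()
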
def createNetCDF(filePath, folder=os.getcwd(), metadata={}, dimensions={}, variables={}, groups={}, ncSize=1.0):
    """
    Create a typical NetCDF file based on a set of variables.

    Parameters
    ----------
    filePath : str, path
    folder : str, optional
    metadata : dict, optional
    dimensions : dict, optional
    variables : dict, optional
    groups : dict, optional
    ncSize : float, optional
        Maximum partition size (MB).

    Notes
    -----
    Only applicable for 1 layer of groups.
    """
    NetCDF.create(filePath, metadata, dimensions, variables)
    with Dataset(filePath, "r+") as netcdf:
        for name in groups:
            group = groups[name]
            if not 'variables' in group:
                raise Exception("Group needs variables")
            if not 'dimensions' in group:
                raise Exception("Group needs dimensions")
            dims = group['dimensions']
            variables = group['variables']

            # Add the group's dimensions to each of its variables
            for vname in variables:
                variables[vname]['dimensions'] = dims

            shape = []
            for d in dims:
                if d == name:
                    raise Exception("A group can't have the same name as a dimension")
                if not d in netcdf.dimensions:
                    raise Exception("Dimension {} does not exist".format(d))
                shape.append(netcdf.dimensions[d].size)

            shape = np.array(shape, dtype="i4")
            master, child = getMasterShape(shape, return_childshape=True, ncSize=ncSize)

            group = netcdf.createGroup(name)
            cdims = {}
            for i, d in enumerate(dims):
                cdims[d] = child[i]
            NetCDF._create(group, {'cdims': cdims, 'shape': shape, 'master': master, 'child': child, 'dims': dims}, {}, variables)

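# How ncSize drives the split: getMasterShape takes the full ("master") shape
# and returns how it is tiled into child files of roughly ncSize MB each. An
# illustrative call only, matching its use above; the exact shapes returned
# depend on the partitioning heuristic:
#
# import numpy as np
#
# shape = np.array([1000, 512], dtype="i4")   # full (master) variable shape
# master, child = getMasterShape(shape, return_childshape=True, ncSize=1.0)
# print(master, child)   # tiling of child files, and the per-file (child) shape
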
def test_createNetCDF():
    folder = "../s3"
    filePath = os.path.join(folder, "test1.nc")
    metadata = dict(title="Mytitle")
    dimensions = dict(npe=3, nnode=100, ntime=1000, nelem=10)
    variables = dict(
        a=dict(type="float32", dimensions=["nnode"], units="m", standard_name="", long_name="", least_significant_digit=3),
        lat=dict(type="float64", dimensions=["nnode"], units="m", standard_name="", long_name=""),
        lng=dict(type="float64", dimensions=["nnode"], units="m", standard_name="", long_name=""),
        elem=dict(type="int32", dimensions=["nelem"], units="m", standard_name="", long_name=""),
        time=dict(type="float64", dimensions=["ntime"], units="hours since 1970-01-01 00:00:00.0", calendar="gregorian", standard_name="", long_name=""),
    )
    variables2 = dict(
        u=dict(type="float32", units="m/s", standard_name="", long_name=""),
    )
    groups = dict(
        s=dict(dimensions=["ntime", "nnode"], variables=variables2),
    )

    createNetCDF(filePath, folder=folder, metadata=metadata, dimensions=dimensions, variables=variables, ncSize=1.0)
    with NetCDF(filePath, "r") as nc:
        np.testing.assert_array_equal(nc.obj['metadata'], metadata)
        np.testing.assert_array_equal(nc.obj['dimensions'], dimensions)
        np.testing.assert_array_equal(nc.obj['variables']['a'], {
            'dimensions': ['nnode'], 'type': 'f', 'least_significant_digit': 3,
            'units': 'm', 'standard_name': '', 'long_name': '', 'ftype': 'f'
        })

    createNetCDF(filePath, folder=folder, metadata=metadata, dimensions=dimensions, groups=groups, ncSize=1.0)
    with NetCDF(filePath, "r") as nc:
        np.testing.assert_array_equal(nc.obj['metadata'], metadata)
        np.testing.assert_array_equal(nc.obj['dimensions'], dimensions)
        np.testing.assert_array_equal(nc.obj['groups']["s"]['variables'], {
            'u': {'dimensions': ['ntime', 'nnode'], 'type': 'f', 'units': 'm/s',
                  'standard_name': '', 'long_name': '', 'ftype': 'f'}
        })

def test_1():
    folder = "test"
    input = {
        "metadata": {"string": "string", "integer": 1, "float": 0.1, "object": {"o1": 1, "o2": "a"}},
        "dimensions": {"d1": 8, "d2": 256, "d3": 32, "d4": 512, "d5": 5, "nchar": 6, "d0": 1},
        "variables": {
            "a": {"type": "b", "dimensions": ["d2"], "units": "m", "standard_name": "A Variable", "long_name": "Long A Variable", "data": np.arange(-128, 128, dtype="byte")},
            "b": {"type": "f4", "stype": "u1", "dimensions": ["d2"], "max": 255, "min": 0, "data": np.arange(0, 256, dtype="f4")},
            "c": {"type": "f4", "stype": "u1", "dimensions": ["d2"], "max": 255, "min": 0, "data": np.arange(0, 256)},
            "d": {"type": "b", "dimensions": ["d1"], "data": np.arange(0, 8)},
            "e": {"type": "f4", "stype": "i4", "dimensions": ["d4"], "data": np.arange(0, 512, dtype="f4")},
            "f": {"type": "f4", "dimensions": ["d1", "d3"], "max": 255, "min": 0, "data": np.arange(0, 256).reshape((8, 32))},
            "g": {"type": "B", "dimensions": ["d1"], "data": np.arange(0, 8)},
            "h": {"type": "B", "dimensions": ["d1"], "data": np.arange(0, 8)},
            "i": {"type": "M", "dimensions": ["d1"], "data": np.datetime64('2017-01-01') + np.arange(8) * np.timedelta64(1, 'h')},
            "j": {"type": "d", "dimensions": ["d2"], "data": np.arange(0, 256, dtype="d")},
            "k": {"type": "S1", "dimensions": ["d5", "nchar"], "data": np.array(["a", "bc", "def", "ghij a", "b"])},
            "l": {"type": "f4", "stype": "i2", "dimensions": ["d1", "d3"], "max": 255, "min": 0, "data": np.arange(0, 256).reshape((8, 32))},
            "m": {"type": "d", "ftype": "M", "dimensions": ["d1"], "data": np.datetime64('2017-01-01') + np.arange(8) * np.timedelta64(1, 'h')},
            "n": {"type": "b", "dimensions": ["d0"], "data": np.arange(0, 1)},
            "o": {"type": "b", "dimensions": ["d0"], "data": 1},
        },
        "groups": {
            "g1": {
                "metadata": {"shape": [250000, 250000], "integer": 1, "float": 0.1, "object": {"o1": 1, "o2": "a"}},
                "dimensions": {"e1": 256},
                "variables": {
                    "a": {"type": "b", "dimensions": ["d2"], "units": "m", "standard_name": "A Variable", "long_name": "Long A Variable", "data": np.arange(-128, 128, dtype="byte")},
                },
            }
        },
    }

    for netcdf3 in [False, True]:
        filePath = os.path.join(folder, "test_1.nc")
        obj = copy.deepcopy(input)
        if netcdf3:
            del obj['groups']
        NetCDF.create(filePath, netcdf3=netcdf3, **obj)
        with NetCDF(filePath, "r") as netcdf:
            np.testing.assert_array_equal(netcdf.metadata, {"string": "string", "integer": 1, "float": 0.1, "object": {"o1": 1, "o2": "a"}})
            np.testing.assert_array_equal(netcdf['a'][:], np.arange(-128, 128, dtype="i1"))
            np.testing.assert_array_equal(netcdf['b'][:], np.arange(0, 256, dtype="f4"))
            np.testing.assert_array_equal(netcdf['c'][:], np.arange(0, 256))
            np.testing.assert_array_equal(netcdf['d'][:], np.arange(0, 8))
            np.testing.assert_almost_equal(netcdf['e'][:], np.arange(0, 512, dtype="f4"), 4)
            np.testing.assert_array_equal(netcdf['f'][:], np.arange(0, 256).reshape((8, 32)))
            np.testing.assert_array_equal(netcdf['g'][:], np.arange(0, 8))
            np.testing.assert_array_equal(netcdf['h'][:], np.arange(0, 8))
            np.testing.assert_array_equal(netcdf['i'][:], np.datetime64('2017-01-01') + np.arange(8) * np.timedelta64(1, 'h'))
            np.testing.assert_array_equal(netcdf['j'][:], np.arange(0, 256, dtype="d"))
            np.testing.assert_array_equal(netcdf['k'][:], np.array(["a", "bc", "def", "ghij a", "b"]))
            np.testing.assert_array_equal(netcdf['l'][:], np.arange(0, 256).reshape((8, 32)))
            np.testing.assert_array_equal(netcdf['m'][:], np.datetime64('2017-01-01') + np.arange(8) * np.timedelta64(1, 'h'))
            np.testing.assert_array_equal(netcdf['n'][:], np.arange(0, 1))
            np.testing.assert_array_equal(netcdf['o'][:], np.arange(1, 2))
            if not netcdf3:
                np.testing.assert_array_equal(netcdf['g1'].metadata['shape'], np.array([250000, 250000]))
                np.testing.assert_array_equal(netcdf['g1']['a'][:], np.arange(-128, 128, dtype="i1"))