def climByAveraging(urls,               # list of granule URLs for a time period
                    variable,           # name of primary variable in file
                    mask,               # name of mask variable
                    coordinates,        # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
                    maskFn=qcMask,      # mask function to compute mask from mask variable
                    averageFn=average   # averaging function to use
                    ):
    '''Compute a climatology over N arrays by applying a mask and averaging function.

    Returns the averaged variable grid, attributes of the primary variable,
    and the coordinate arrays in a dictionary.
    ***Assumption: This routine assumes that the N grids will fit in memory.***
    '''
    n = len(urls)
    varList = [variable, mask]
    for i, url in enumerate(urls):
        fn = retrieveFile(url, '~/cache')
        if VERBOSE: print >> sys.stderr, 'Read variables and mask ...'
        var, fh = getVariables(fn, varList)    # return dict of variable objects by name
        if i == 0:
            # Allocate the (n, ...) accumulation array from the first grid's shape/dtype.
            dtype = var[variable].dtype
            shape = (n, ) + var[variable].shape
            accum = N.ma.empty(shape, dtype)
        v = maskFn(var[variable], var[mask])   # apply quality mask variable to get numpy MA
        accum[i] = v                           # accumulate N arrays for 'averaging'
        if i + 1 != n:
            # Keep var dictionary from last file to grab metadata.
            close(fh)                          # REMEMBER: closing fh loses in-memory data structures
    if VERBOSE: print >> sys.stderr, 'Averaging ...'
    coord, fh = getVariables(fn, coordinates)  # read coordinate arrays and add to dict
    for c in coordinates:
        var[c] = coord[c][:]
    if averageFn == average:
        avg = averageFn(accum)                 # call averaging function
    else:
        var[variable] = accum
        if averageFn == gaussInterp:
            # FIX: was 'variable + coordinates', a str + list TypeError;
            # build the name list the way the newer climByAveraging does.
            varNames = [variable] + coordinates
            # NOTE(review): latGrid, lonGrid, wlat, wlon, slat, slon, stime,
            # vfactor and missingValue are not defined in this function --
            # presumably module-level configuration; confirm before relying
            # on this branch.
            avg, vweight, status = \
                gaussInterp(var, varNames, latGrid, lonGrid, wlat, wlon,
                            slat, slon, stime, vfactor, missingValue)
    var['attributes'] = var[variable].__dict__  # save attributes of primary variable
    var[variable] = avg                         # return primary variable & mask arrays in dict
    var[mask] = N.ma.getmask(avg)
    # close(fh)  # Can't close, lose netCDF4.Variable objects, leaking two fh
    return var
def readAndMask(url, variable, mask=None, cachePath='/tmp/cache', hdfsPath=None):
    """Read a variable from a netCDF or HDF file and return a numpy masked array.

    If the URL is remote or HDFS, first retrieve the file into a cache directory.
    Returns None when the file cannot be retrieved or the variable cannot be read.
    """
    from variables import getVariables, close
    v = None
    if mask:
        variables = [variable, mask]
    else:
        variables = [variable]
    try:
        from cache import retrieveFile
        path = retrieveFile(url, cachePath, hdfsPath)
    except Exception:
        # FIX: narrowed a bare 'except:' so SystemExit/KeyboardInterrupt propagate.
        print >> sys.stderr, 'readAndMask: Error, continuing without file %s' % url
        return v
    if CCMPWind.Variable in variables:
        # Derived variable: wind magnitude from time-averaged u/v components.
        var, fh = getVariables(path, ['uwnd', 'vwnd'], arrayOnly=True,
                               set_auto_mask=True)  # return dict of variable objects by name
        uwnd_avg = np.average(var['uwnd'], axis=0)
        vwnd_avg = np.average(var['vwnd'], axis=0)
        wind_magnitude = np.sqrt(
            np.add(np.multiply(uwnd_avg, uwnd_avg),
                   np.multiply(vwnd_avg, vwnd_avg)))
        v = wind_magnitude
        if v.shape[0] == 1:
            v = v[0]    # throw away trivial time dimension for CF-style files
        close(fh)
    else:
        try:
            print >> sys.stderr, 'Reading variable %s from %s' % (variable, path)
            var, fh = getVariables(path, variables, arrayOnly=True,
                                   set_auto_mask=True)  # return dict of variable objects by name
            v = var[variable]   # could be masked array
            if v.shape[0] == 1:
                v = v[0]        # throw away trivial time dimension for CF-style files
            close(fh)
        except Exception:
            # FIX: narrowed a bare 'except:'; keeps the best-effort contract (returns None).
            print >> sys.stderr, 'readAndMask: Error, cannot read variable %s from file %s' % (
                variable, path)
    return v
def accumulate(urls, variable, accumulators, cachePath=CachePath):
    '''Accumulate data into statistics accumulators like count, sum, sumsq, min, max, M3, M4, etc.

    urls is a (keys, url_list) pair; returns (keys, accum) where accum maps each
    requested accumulator name to a grid the same shape as the variable.
    Files that cannot be retrieved or read are skipped with a warning.
    '''
    keys, urls = urls
    accum = {}
    for url in urls:
        try:
            path = retrieveFile(url, cachePath)
        except Exception:
            # FIX: narrowed a bare 'except:' so SystemExit/KeyboardInterrupt propagate.
            print >> sys.stderr, 'accumulate: Error, continuing without file %s' % url
            continue
        try:
            var, fh = getVariables(path, [variable], arrayOnly=True,
                                   set_auto_mask=True)  # return dict of variable objects by name
            v = var[variable]   # masked array
            close(fh)
        except Exception:
            print >> sys.stderr, 'accumulate: Error, cannot read variable %s from file %s' % (
                variable, path)
            continue
        if not accum:
            # FIX: was 'if i == 0', which never initialized the accumulators
            # when the *first* URL failed, making every later access a KeyError.
            for k in accumulators:
                if k == 'min':
                    accum[k] = default_fillvals['f8'] * N.ones(v.shape, dtype=N.float64)
                elif k == 'max':
                    accum[k] = -default_fillvals['f8'] * N.ones(v.shape, dtype=N.float64)
                elif k == 'count':
                    accum[k] = N.zeros(v.shape, dtype=N.int64)
                else:
                    accum[k] = N.zeros(v.shape, dtype=N.float64)
        if 'count' in accumulators:
            # Assumes v is a masked array whose .mask is boolean -- True where bad.
            accum['count'] += ~v.mask
        if 'min' in accumulators:
            accum['min'] = N.ma.minimum(accum['min'], v)
        if 'max' in accumulators:
            accum['max'] = N.ma.maximum(accum['max'], v)
        v = N.ma.filled(v, 0.)  # zero-fill so masked cells don't contribute to the sums
        if 'sum' in accumulators:
            accum['sum'] += v
        if 'sumsq' in accumulators:
            accum['sumsq'] += v * v
    return (keys, accum)
def climByAveraging(urls,                     # list of granule URLs for a time period
                    variable,                 # name of primary variable in file
                    mask,                     # name of mask variable
                    coordinates,              # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
                    maskFn=qcMask,            # mask function to compute mask from mask variable
                    averageFn=mean,           # averaging function to use
                    averagingConfig={},       # parameters to control averaging function (e.g. gaussInterp)
                    optimization='fortran',   # optimization mode (fortran or cython)
                    cachePath=CachePath):
    '''Compute a climatology over N arrays by applying a mask and averaging function.

    Returns the averaged variable grid, attributes of the primary variable,
    and the coordinate arrays in a dictionary.
    ***Assumption: This routine assumes that the N grids will fit in memory.***
    '''
    n = len(urls)
    varList = [variable, mask]
    var = {}
    vtime = N.zeros((n, ), N.int32)
    accum = None        # allocated on first successful read
    emptyVar = None     # totally masked grid used for missing or bad file reads
    badIndices = []     # FIX: slots to backfill later; originally 'accum[i] = emptyVar'
                        # ran inside the except clauses even before accum/emptyVar
                        # existed, raising NameError whenever an early file failed
    for i, url in enumerate(urls):
        try:
            path = retrieveFile(url, cachePath)
            fn = os.path.split(path)[1]
            vtime[i] = int(fn[5:8])   # KLUDGE: extract DOY from filename
        except Exception:
            print >> sys.stderr, 'climByAveraging: Error, continuing without file %s' % url
            badIndices.append(i)
            continue
        if path is None:
            # FIX: originally this slot of accum was left as uninitialized garbage.
            badIndices.append(i)
            continue
        print >> sys.stderr, 'Read variables and mask ...'
        try:
            var, fh = getVariables(path, varList, arrayOnly=True, order='F',
                                   set_auto_mask=False)  # return dict of variable objects by name
        except Exception:
            print >> sys.stderr, 'climByAveraging: Error, cannot read file %s' % path
            badIndices.append(i)
            continue
        if accum is None:
            # Allocate the (n, ...) accumulation array and the all-masked placeholder.
            dtype = var[variable].dtype
            if 'int' in dtype.name:
                dtype = N.float32   # accumulate integer variables in float
            shape = (n, ) + var[variable].shape
            accum = N.ma.empty(shape, dtype, order='F')
            emptyVar = N.array(N.ma.masked_all(var[variable].shape, dtype),
                               order='F')
        print >> sys.stderr, 'Read coordinates ...'
        var, fh = getVariables(path, coordinates, var, arrayOnly=True,
                               order='F')  # read coordinate arrays and add to dict
        # Apply quality mask variable to get a numpy MA (masking by the netCDF4
        # library itself was turned off via set_auto_mask=False above).
        var[variable] = maskFn(var[variable], var[mask])
        # Echo variable range for sanity check
        vals = var[variable].compressed()
        print >> sys.stderr, 'Variable Range: min, max:', vals.min(), vals.max()
        # Plot input grid
        # figFile = histogram(vals, variable, n, os.path.split(path)[1])
        # figFile = contourMap(var, variable, coordinates[1:], n, os.path.split(path)[1])
        accum[i] = var[variable]    # accumulate N arrays for 'averaging'
        close(fh)                   # REMEMBER: closing fh loses netCDF4 in-memory data structures
    if accum is None:
        raise ValueError('climByAveraging: no usable granules among %d URLs' % n)
    for j in badIndices:
        accum[j] = emptyVar         # backfill fully-masked grids for bad/missing files
    coordinates = ['time'] + coordinates    # add constructed time (days) as first coordinate
    var['time'] = vtime
    if averagingConfig['name'] == 'pixelMean':
        print >> sys.stderr, 'Doing Pixel Average over %d grids ...' % n
        start = time.time()
        avg = averageFn(accum)      # call pixel averaging function
        end = time.time()
        print >> sys.stderr, 'pixelMean execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]
    elif averagingConfig['name'] == 'gaussInterp':
        print >> sys.stderr, 'Doing Gaussian Interpolation over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        latGrid = c['latGrid']
        lonGrid = c['lonGrid']
        if latGrid is not None and lonGrid is not None:
            # FIX: dropped order='F' -- numpy.arange has no 'order' keyword
            # (TypeError); the result is 1-D so memory order is moot anyway.
            outlat = N.arange(latGrid[0], latGrid[1] + latGrid[2], latGrid[2],
                              dtype=N.float32)
            outlon = N.arange(lonGrid[0], lonGrid[1] + lonGrid[2], lonGrid[2],
                              dtype=N.float32)
        else:
            outlat = N.array(var[coordinates[1]], dtype=N.float32, order='F')
            outlon = N.array(var[coordinates[2]], dtype=N.float32, order='F')
        varNames = [variable] + coordinates
        start = time.time()
        avg, weight, status = \
            gaussInterp(var, varNames, outlat, outlon, c['wlat'], c['wlon'],
                        c['slat'], c['slon'], c['stime'], c['vfactor'],
                        c['missingValue'], VERBOSE, optimization)
        end = time.time()
        var['outweight'] = weight.astype(N.float32)
        print >> sys.stderr, 'gaussInterp execution time:', (end - start)
    elif averagingConfig['name'] == 'spatialFilter':
        print >> sys.stderr, 'Applying Spatial 3x3 Filter and then averaging over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        varNames = [variable] + coordinates
        start = time.time()
        avg, count, status = \
            spatialFilter(var, varNames, c['spatialFilter'], c['normalization'],
                          c['missingValue'], VERBOSE, optimization)
        end = time.time()
        print >> sys.stderr, 'spatialFilter execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]
    # NOTE(review): an unrecognized averagingConfig['name'] leaves 'avg'
    # undefined and raises NameError below -- same behavior as the original.
    var['out' + variable] = avg.astype(N.float32)  # return primary variable & mask arrays in dict
    var['out' + mask] = N.ma.getmask(avg)
    var['outlat'] = outlat
    var['outlon'] = outlon
    return var
def accumulateClim(urls,                     # list of granule URLs for a time period
                   variable,                 # name of primary variable in file
                   mask,                     # name of mask variable
                   coordinates,              # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
                   maskFn=qcMask,            # mask function to compute mask from mask variable
                   averageFn=mean,           # averaging function to use
                   averagingConfig={},       # parameters to control averaging function (e.g. gaussInterp)
                   optimization='fortran',   # optimization mode (fortran or cython)
                   cachePath=CachePath):
    '''Compute a climatology over N arrays by applying a mask and averaging function.

    Returns a (vcount, vsum) pair of accumulator grids, suitable for
    combining partial results across workers.
    ***Assumption: This routine assumes that the N grids will fit in memory.***
    '''
    print >> sys.stderr, 'accumulateClim: Doing %s ...' % averagingConfig['name']
    varList = [variable, mask]
    vsum = None
    vcount = None
    for i, url in enumerate(urls):
        try:
            path = retrieveFile(url, cachePath)
            fn = os.path.split(path)[1]
            # KLUDGE: extract DOY from filename; the value is unused here but the
            # int() parse doubles as a filename sanity check (failures skip the file).
            vtime = int(fn[5:8])
        except Exception:
            # FIX: message said 'climByAveraging' (copy/paste); bare except narrowed.
            print >> sys.stderr, 'accumulateClim: Error, continuing without file %s' % url
            continue
        if path is None:
            continue
        try:
            print >> sys.stderr, 'Reading file %s ...' % path
            var, fh = getVariables(path, varList, arrayOnly=True, order='F',
                                   set_auto_mask=False)  # return dict of variable objects by name
        except Exception:
            print >> sys.stderr, 'accumulateClim: Error, cannot read file %s' % path
            continue
        if vsum is None:
            # FIX: was 'if i == 0' (never initialized if the first URL was bad)
            # and used N.ma.empty, which seeded the running sums with
            # uninitialized garbage; zeros is the correct additive identity.
            dtype = var[variable].dtype
            if 'int' in dtype.name:
                dtype = N.float32   # accumulate integer variables in float
            shape = var[variable].shape
            vsum = N.ma.zeros(shape, dtype, order='F')
            vcount = N.ma.zeros(shape, dtype, order='F')
        print >> sys.stderr, 'Read coordinates ...'
        var, fh = getVariables(path, coordinates, var, arrayOnly=True,
                               order='F')  # read coordinate arrays and add to dict
        # Apply quality mask variable to get a numpy MA (netCDF4 auto-masking off).
        var[variable] = maskFn(var[variable], var[mask])
        # Echo variable range for sanity check
        vals = var[variable].compressed()
        print >> sys.stderr, 'Variable Range: min, max:', vals.min(), vals.max()
        if averagingConfig['name'] == 'pixelMean':
            vsum += var[variable]   # update accumulators
            # Assumes the mask variable is boolean-like (True == bad) so that
            # ~mask counts good pixels -- confirm against the mask's convention.
            vcount += ~var[mask]
        elif averagingConfig['name'] == 'gaussInterp':
            # FIXME: 'accum' is not defined anywhere in this function (copied
            # from climByAveraging); this branch raises NameError if exercised.
            var[variable] = accum
            c = averagingConfig
            latGrid = c['latGrid']
            lonGrid = c['lonGrid']
            if latGrid is not None and lonGrid is not None:
                # FIX: dropped order='F' -- numpy.arange has no 'order' keyword.
                outlat = N.arange(latGrid[0], latGrid[1] + latGrid[2], latGrid[2],
                                  dtype=N.float32)
                outlon = N.arange(lonGrid[0], lonGrid[1] + lonGrid[2], lonGrid[2],
                                  dtype=N.float32)
            else:
                outlat = N.array(var[coordinates[1]], dtype=N.float32, order='F')
                outlon = N.array(var[coordinates[2]], dtype=N.float32, order='F')
            varNames = [variable] + coordinates
            start = time.time()
            avg, weight, status = \
                gaussInterp(var, varNames, outlat, outlon, c['wlat'], c['wlon'],
                            c['slat'], c['slon'], c['stime'], c['vfactor'],
                            c['missingValue'], VERBOSE, optimization)
            end = time.time()
            vcount = weight.astype(N.float32)
            vsum = avg
            print >> sys.stderr, 'gaussInterp execution time:', (end - start)
        elif averagingConfig['name'] == 'spatialFilter':
            # FIXME: same undefined 'accum' as the gaussInterp branch above.
            var[variable] = accum
            c = averagingConfig
            varNames = [variable] + coordinates
            start = time.time()
            avg, vcount, status = \
                spatialFilter(var, varNames, c['spatialFilter'], c['normalization'],
                              c['missingValue'], VERBOSE, optimization)
            vsum = avg
            end = time.time()
            print >> sys.stderr, 'spatialFilter execution time:', (end - start)
        close(fh)
    # May be (None, None) if no file was usable (originally a NameError here).
    return (vcount, vsum)