def readAndMask(url,
                variable,
                mask=None,
                cachePath='/tmp/cache',
                hdfsPath=None):
    """Read a variable from a netCDF or HDF file and return a numpy masked array.

    If the URL is remote or HDFS, first retrieve the file into a cache
    directory via cache.retrieveFile.

    Parameters:
        url       -- URL or local path of the granule file
        variable  -- name of the primary variable to read
        mask      -- optional name of a mask variable to read alongside
        cachePath -- local directory used to cache retrieved files
        hdfsPath  -- optional HDFS path passed through to retrieveFile

    Returns the variable as a (possibly masked) numpy array, or None if the
    file could not be retrieved or the variable could not be read.
    """
    from variables import getVariables, close
    v = None
    variables = [variable, mask] if mask else [variable]
    try:
        from cache import retrieveFile
        path = retrieveFile(url, cachePath, hdfsPath)
    except Exception:
        # Best-effort: log and return None so callers can skip this granule.
        sys.stderr.write('readAndMask: Error, continuing without file %s\n' % url)
        return v

    if CCMPWind.Variable in variables:
        # CCMP wind speed is derived, not stored: average the u/v wind
        # components over the time axis, then take the vector magnitude.
        var, fh = getVariables(
            path, ['uwnd', 'vwnd'], arrayOnly=True,
            set_auto_mask=True)  # return dict of variable objects by name
        uwnd_avg = np.average(var['uwnd'], axis=0)
        vwnd_avg = np.average(var['vwnd'], axis=0)
        v = np.sqrt(uwnd_avg * uwnd_avg + vwnd_avg * vwnd_avg)
        if v.shape[0] == 1:
            v = v[0]  # throw away trivial time dimension for CF-style files
        close(fh)
    else:
        try:
            sys.stderr.write('Reading variable %s from %s\n' % (variable, path))
            var, fh = getVariables(
                path, variables, arrayOnly=True,
                set_auto_mask=True)  # return dict of variable objects by name
            v = var[variable]  # could be a masked array
            if v.shape[0] == 1:
                v = v[0]  # throw away trivial time dimension for CF-style files
            close(fh)
        except Exception:
            # Best-effort: leave v as None on a failed read.
            sys.stderr.write(
                'readAndMask: Error, cannot read variable %s from file %s\n'
                % (variable, path))

    return v
# ---- Example 2 (scrape-artifact separator; original text: "示例#2" / "0") ----
def accumulate(urls, variable, accumulators, cachePath=CachePath):
    '''Accumulate data into statistics accumulators like count, sum, sumsq, min, max, M3, M4, etc.

    urls is a (keys, url-list) pair as produced upstream; each granule is
    retrieved into cachePath, the named variable is read as a masked array,
    and the requested accumulator grids are updated cell-by-cell.

    Returns (keys, dict mapping accumulator name -> numpy array).
    '''
    keys, urls = urls
    accum = {}
    for url in urls:
        try:
            path = retrieveFile(url, cachePath)
        except Exception:
            sys.stderr.write('accumulate: Error, continuing without file %s\n' % url)
            continue

        try:
            var, fh = getVariables(
                path, [variable], arrayOnly=True,
                set_auto_mask=True)  # return dict of variable objects by name
            v = var[variable]  # masked array
            close(fh)
        except Exception:
            sys.stderr.write(
                'accumulate: Error, cannot read variable %s from file %s\n'
                % (variable, path))
            continue

        if not accum:
            # Lazily initialize on the first SUCCESSFULLY-read grid.  (The
            # original keyed this on i == 0, which left accum empty — and the
            # updates below crashing — whenever the first URL failed.)
            for k in accumulators:
                if k == 'min':
                    # start at fill value so any real datum replaces it
                    accum[k] = default_fillvals['f8'] * N.ones(v.shape, dtype=N.float64)
                elif k == 'max':
                    accum[k] = -default_fillvals['f8'] * N.ones(v.shape, dtype=N.float64)
                elif k == 'count':
                    accum[k] = N.zeros(v.shape, dtype=N.int64)
                else:
                    accum[k] = N.zeros(v.shape, dtype=N.float64)

        if 'count' in accumulators:
            # getmaskarray (unlike v.mask) also handles the scalar-mask case
            # where nothing in v is masked
            accum['count'] += ~N.ma.getmaskarray(v)
        if 'min' in accumulators:
            accum['min'] = N.ma.minimum(accum['min'], v)
        if 'max' in accumulators:
            accum['max'] = N.ma.maximum(accum['max'], v)

        v = N.ma.filled(v, 0.)  # zero-fill masked cells so they don't affect the sums
        if 'sum' in accumulators:
            accum['sum'] += v
        if 'sumsq' in accumulators:
            accum['sumsq'] += v * v
    return (keys, accum)
# ---- Example 3 (scrape-artifact separator; original text: "示例#3" / "0") ----
def writeStats(urls,
               variable,
               stats,
               outFile,
               copyToHdfsPath=None,
               format='NETCDF4',
               cachePath=CachePath):
    '''Write out stats arrays to netCDF with some attributes.

    stats is a (keys, dict-of-arrays) pair; the coordinate dimensions and
    variables are copied from the first input granule, then each stats
    array is written as its own variable.  Returns outFile.
    '''
    keys, stats = stats
    dout = Dataset(outFile, 'w', format=format)
    try:
        sys.stderr.write('Writing %s ...\n' % outFile)
        dout.setncattr('variable', variable)
        dout.setncattr('urls', str(urls))
        dout.setncattr('level', str(keys))

        # Copy coordinate dimensions and variables from the first granule.
        inFile = retrieveFile(urls[0], cachePath)
        din = Dataset(inFile, 'r')
        try:
            try:
                coordinates = din.variables[variable].getncattr('coordinates').split()
            except (KeyError, AttributeError):
                # variable missing or has no 'coordinates' attribute
                coordinates = ('lat', 'lon')  # kludge: FIX ME

            for coord in coordinates:  # here lat, lon, alt, etc.
                dout.createDimension(coord, din.variables[coord].shape[0])
                var = dout.createVariable(coord, din.variables[coord].dtype, (coord, ))
                var[:] = din.variables[coord][:]
        finally:
            # close the input file even if a copy step fails
            din.close()

        # Add stats variables, one per accumulator grid
        for k, v in stats.items():
            var = dout.createVariable(k, v.dtype, coordinates)
            var[:] = v[:]
    finally:
        # guarantee the output file is closed (and flushed) on any error
        dout.close()
    return outFile
# ---- Example 4 (scrape-artifact separator; original text: "示例#4" / "0") ----
def climByAveraging(
        urls,  # list of granule URLs for a time period
        variable,  # name of primary variable in file
        mask,  # name of mask variable
        coordinates,  # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
        maskFn=qcMask,  # mask function to compute mask from mask variable
        averageFn=mean,  # averaging function to use
        averagingConfig={},  # parameters to control averaging function (e.g. gaussInterp)
        optimization='fortran',  # optimization mode (fortran or cython)
        cachePath=CachePath):
    '''Compute a climatology over N arrays by applying a mask and averaging function.
Returns the averaged variable grid, attributes of the primary variable, and the coordinate arrays in a dictionary.
***Assumption:  This routine assumes that the N grids will fit in memory.***

Returned dict keys include 'out'+variable (averaged grid), 'out'+mask
(mask of the average), 'outlat', 'outlon', 'time', plus the raw arrays read.

NOTE(review): averagingConfig={} is a shared mutable default argument.
NOTE(review): if the FIRST granule fails to retrieve or read, the except
handlers below assign accum[i] = emptyVar before either name exists (both
are created only after a successful i == 0 read) -- NameError.
NOTE(review): if averagingConfig['name'] is not 'pixelMean', 'gaussInterp',
or 'spatialFilter', avg/outlat/outlon are never assigned -- NameError at end.
    '''
    n = len(urls)
    varList = [variable, mask]
    var = {}
    # one time value (day-of-year) per granule, parsed from the filename below
    vtime = N.zeros((n, ), N.int32)

    for i, url in enumerate(urls):
        try:
            path = retrieveFile(url, cachePath)
            fn = os.path.split(path)[1]
            vtime[i] = int(fn[5:8])  # KLUDGE: extract DOY from filename chars 5:8 -- assumes a fixed naming scheme, TODO confirm
        except:
            print >> sys.stderr, 'climByAveraging: Error, continuing without file %s' % url
            accum[i] = emptyVar
            continue
        if path is None: continue
        print >> sys.stderr, 'Read variables and mask ...'
        try:
            var, fh = getVariables(
                path, varList, arrayOnly=True, order='F',
                set_auto_mask=False)  # return dict of variable objects by name
        except:
            print >> sys.stderr, 'climByAveraging: Error, cannot read file %s' % path
            accum[i] = emptyVar
            continue
        if i == 0:
            # First granule: allocate the (n, ...) accumulator and the
            # all-masked placeholder used for missing/bad granules,
            # then read the coordinate arrays once.
            dtype = var[variable].dtype
            # promote integer data so averaging produces floats
            if 'int' in dtype.name: dtype = N.float32
            shape = (n, ) + var[variable].shape
            accum = N.ma.empty(shape, dtype, order='F')
            emptyVar = N.array(
                N.ma.masked_all(var[variable].shape, dtype), order='F'
            )  # totally masked variable array for missing or bad file reads

            print >> sys.stderr, 'Read coordinates ...'
            var, fh = getVariables(
                path, coordinates, var, arrayOnly=True,
                order='F')  # read coordinate arrays and add to dict

        var[variable] = maskFn(
            var[variable], var[mask]
        )  # apply quality mask variable to get numpy MA, turned off masking done by netCDF4 library
        #       var[variable] = var[variable][:]

        # Echo variable range for sanity check
        vals = var[variable].compressed()
        print >> sys.stderr, 'Variable Range: min, max:', vals.min(), vals.max(
        )

        # Plot input grid
        #        figFile = histogram(vals, variable, n, os.path.split(path)[1])
        #        figFile = contourMap(var, variable, coordinates[1:], n, os.path.split(path)[1])

        accum[i] = var[variable]  # accumulate the N grids for averaging below
        #        if i != 0 and i+1 != n: close(fh)              # REMEMBER: closing fh loses netCDF4 in-memory data structures
        close(fh)

    coordinates = [
        'time'
    ] + coordinates  # add constructed time (days) as first coordinate
    var['time'] = vtime

    # Dispatch on the configured averaging algorithm.
    if averagingConfig['name'] == 'pixelMean':
        print >> sys.stderr, 'Doing Pixel Average over %d grids ...' % n
        start = time.time()
        avg = averageFn(accum)  # call pixel averaging function
        end = time.time()
        print >> sys.stderr, 'pixelMean execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]
    elif averagingConfig['name'] == 'gaussInterp':
        print >> sys.stderr, 'Doing Gaussian Interpolation over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        latGrid = c['latGrid']
        lonGrid = c['lonGrid']
        if latGrid is not None and lonGrid is not None:
            # regular output grid specified as (start, stop, step)
            outlat = N.arange(latGrid[0],
                              latGrid[1] + latGrid[2],
                              latGrid[2],
                              dtype=N.float32,
                              order='F')
            outlon = N.arange(lonGrid[0],
                              lonGrid[1] + lonGrid[2],
                              lonGrid[2],
                              dtype=N.float32,
                              order='F')
        else:
            # fall back to the input granules' own coordinate arrays
            outlat = N.array(var[coordinates[1]], dtype=N.float32, order='F')
            outlon = N.array(var[coordinates[2]], dtype=N.float32, order='F')
        varNames = [variable] + coordinates
        start = time.time()
        avg, weight, status = \
            gaussInterp(var, varNames, outlat, outlon, c['wlat'], c['wlon'],
                        c['slat'], c['slon'], c['stime'], c['vfactor'], c['missingValue'],
                        VERBOSE, optimization)
        end = time.time()
        var['outweight'] = weight.astype(N.float32)
        print >> sys.stderr, 'gaussInterp execution time:', (end - start)
    elif averagingConfig['name'] == 'spatialFilter':
        print >> sys.stderr, 'Applying Spatial 3x3 Filter and then averaging over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        varNames = [variable] + coordinates
        start = time.time()
        avg, count, status = \
            spatialFilter(var, varNames, c['spatialFilter'], c['normalization'],
                          c['missingValue'], VERBOSE, optimization)
        end = time.time()
        print >> sys.stderr, 'spatialFilter execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]

    var['out' + variable] = avg.astype(
        N.float32)  # return primary variable & mask arrays in dict
    var['out' + mask] = N.ma.getmask(avg)
    var['outlat'] = outlat
    var['outlon'] = outlon
    return var
def accumulateClim(urls,                    # list of granule URLs for a time period
                   variable,                # name of primary variable in file
                   mask,                    # name of mask variable
                   coordinates,             # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
                   maskFn=qcMask,           # mask function to compute mask from mask variable
                   averageFn=mean,          # averaging function to use
                   averagingConfig={},      # parameters to control averaging function (e.g. gaussInterp)
                   optimization='fortran',  # optimization mode (fortran or cython)
                   cachePath=CachePath
                  ):
    '''Accumulate climatology statistics over N granules by applying a mask and averaging function.
Returns a (vcount, vsum) pair of accumulator arrays for the primary variable.
***Assumption:  This routine assumes that the N grids will fit in memory.***

NOTE(review): averagingConfig={} is a shared mutable default argument.
NOTE(review): for 'pixelMean', vsum/vcount are created with N.ma.empty
(uninitialized memory) and then updated with += in the SAME first
iteration, so the results contain garbage; they should start from zeros.
NOTE(review): the 'gaussInterp' and 'spatialFilter' branches reference
`accum`, which is never defined in this function (apparently copied from
climByAveraging) -- NameError if those paths run.
NOTE(review): the error messages below say 'climByAveraging:' -- copy/paste.
NOTE(review): vtime and emptyVar are computed but never used; if urls is
empty or every granule fails, vcount/vsum are undefined at the return.
    '''
    print >>sys.stderr, 'accumulateClim: Doing %s ...' % averagingConfig['name']
    varList = [variable, mask]
    for i, url in enumerate(urls):
        try:
            path = retrieveFile(url, cachePath)
            fn = os.path.split(path)[1]
            vtime = int(fn[5:8])     # KLUDGE: extract DOY from filename chars 5:8 -- assumes a fixed naming scheme, TODO confirm
        except:
            print >>sys.stderr, 'climByAveraging: Error, continuing without file %s' % url
            continue
        if path is None: continue
        try:
            print >>sys.stderr, 'Reading file %s ...' % path
            var, fh = getVariables(path, varList, arrayOnly=True, order='F', set_auto_mask=False)   # return dict of variable objects by name
        except:
            print >>sys.stderr, 'climByAveraging: Error, cannot read file %s' % path
            continue
        if i == 0:
            # First granule: allocate sum/count accumulators and the
            # all-masked placeholder, then read the coordinate arrays once.
            dtype = var[variable].dtype
            # promote integer data so averaging produces floats
            if 'int' in dtype.name: dtype = N.float32
            shape = var[variable].shape
            vsum = N.ma.empty(shape, dtype, order='F')
            vcount = N.ma.empty(shape, dtype, order='F')
            emptyVar = N.array(N.ma.masked_all(var[variable].shape, dtype), order='F')  # totally masked variable array for missing or bad file reads

            print >>sys.stderr, 'Read coordinates ...'
            var, fh = getVariables(path, coordinates, var, arrayOnly=True, order='F')   # read coordinate arrays and add to dict

        var[variable] = maskFn(var[variable], var[mask])     # apply quality mask variable to get numpy MA, turned off masking done by netCDF4 library

        # Echo variable range for sanity check
        vals = var[variable].compressed()
        print >>sys.stderr, 'Variable Range: min, max:', vals.min(), vals.max()

        # Dispatch on the configured averaging algorithm (per granule).
        if averagingConfig['name'] == 'pixelMean':
            vsum += var[variable]                        # update accumulators
            vcount += ~var[mask]
        elif averagingConfig['name'] == 'gaussInterp':
            var[variable] = accum
            c = averagingConfig
            latGrid = c['latGrid']; lonGrid = c['lonGrid']
            if latGrid is not None and lonGrid is not None:
                # regular output grid specified as (start, stop, step)
                outlat = N.arange(latGrid[0], latGrid[1]+latGrid[2], latGrid[2], dtype=N.float32, order='F')
                outlon = N.arange(lonGrid[0], lonGrid[1]+lonGrid[2], lonGrid[2], dtype=N.float32, order='F')
            else:
                # fall back to the input granules' own coordinate arrays
                outlat = N.array(var[coordinates[1]], dtype=N.float32, order='F')
                outlon = N.array(var[coordinates[2]], dtype=N.float32, order='F')
            varNames = [variable] + coordinates
            start = time.time()
            avg, weight, status = \
                gaussInterp(var, varNames, outlat, outlon, c['wlat'], c['wlon'],
                            c['slat'], c['slon'], c['stime'], c['vfactor'], c['missingValue'],
                            VERBOSE, optimization)
            end = time.time()
            vcount = weight.astype(N.float32)
            vsum = avg
            print >>sys.stderr, 'gaussInterp execution time:', (end - start)
        elif averagingConfig['name'] == 'spatialFilter':
            var[variable] = accum
            c = averagingConfig
            varNames = [variable] + coordinates
            start = time.time()
            avg, vcount, status = \
                spatialFilter(var, varNames, c['spatialFilter'], c['normalization'], 
                              c['missingValue'], VERBOSE, optimization)
            vsum = avg
            end = time.time()
            print >>sys.stderr, 'spatialFilter execution time:', (end - start)
        close(fh)

    return (vcount, vsum)