Пример #1
0
def proc_write(twx_cfg, xval_stnids, start_ymd, end_ymd, nwrkers):
    '''
    Writer process for the infill cross-validation. Receives infilled and
    observed temperature series from worker processes via MPI, prints
    per-station error statistics and writes both series to the output
    cross-validation netCDF file.

    Parameters
    ----------
    twx_cfg : object
        Configuration object. Its fpath_stndata_nc_tair_homog and
        fpath_xval_infill_nc attributes are read.
    xval_stnids : ndarray or None
        Ids of the cross-validation stations. If None, the ids are
        obtained from load_default_xval_stnids.
    start_ymd, end_ymd
        Passed as the (start_ymd, end_ymd) period to StationDataDb.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    nwrkrs_done = 0

    stn_da = StationDataDb(twx_cfg.fpath_stndata_nc_tair_homog,
                           (start_ymd, end_ymd))

    if xval_stnids is None:
        xval_stnids = load_default_xval_stnids(stn_da.stn_ids)

    # Two results are expected per cross-validation station
    # NOTE(review): presumably one for tmin and one for tmax -- confirm
    ttl_infills = xval_stnids.size * 2

    xval_mask = np.in1d(stn_da.stn_ids, xval_stnids, True)
    xval_stns = stn_da.stns[xval_mask]

    create_quick_db(twx_cfg.fpath_xval_infill_nc, xval_stns, stn_da.days,
                    NETCDF_OUT_VARIABLES)
    ds_out = Dataset(twx_cfg.fpath_xval_infill_nc, 'r+')

    # The output database is created from the masked station array, so its
    # station order follows the source database and not the order of
    # xval_stnids. Build the id -> column lookup from that same order so a
    # result can never be written to another station's column.
    stn_idxs = {stn_id: i
                for i, stn_id in enumerate(stn_da.stn_ids[xval_mask])}

    stat_chk = StatusCheck(ttl_infills, 10)

    while 1:

        stn_id, tair_var, infill_tair, obs_tair = MPI.COMM_WORLD.recv(
            source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:
                # Release the output file before the writer exits
                ds_out.close()
                print("WRITER: Finished")
                return 0
        else:

            # NaNs mark missing values; mask them for the error statistics
            infill_tair = np.ma.masked_array(infill_tair,
                                             np.isnan(infill_tair))
            obs_tair = np.ma.masked_array(obs_tair, np.isnan(obs_tair))

            i = stn_idxs[stn_id]

            difs = infill_tair - obs_tair
            bias = np.ma.mean(difs)
            mae = np.ma.mean(np.ma.abs(difs))

            print("|".join(["WRITER", stn_id, tair_var,
                            "MAE: %.2f" % (mae,), "BIAS: %.2f" % (bias,)]))

            # Missing values are stored with the default 'f4' fill value
            obs_tair = np.ma.filled(obs_tair, netCDF4.default_fillvals['f4'])
            ds_out.variables["obs_%s" % (tair_var,)][:, i] = obs_tair

            infill_tair = np.ma.filled(infill_tair,
                                       netCDF4.default_fillvals['f4'])
            ds_out.variables["infilled_%s" % (tair_var,)][:, i] = infill_tair

            ds_out.sync()

            stat_chk.increment()
Пример #2
0
def find_dup_stns(stnda):
    '''
    Find duplicate stations in a netCDF4 infilled station database. Two or
    more stations are considered duplicates if they are at the exact
    same location. For two or more stations with the same
    location, the one with the longest non-infilled period-of-record is
    kept and the others are considered duplicates and will be returned
    by this function. If several co-located stations tie for the longest
    non-infilled period-of-record, only the first of them (in database
    order) is kept.
    
    Parameters
    ----------
    stnda : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the infilled 
        database that should be searched for duplicate stations.
        
    Returns
    ----------
    rm_stnids : ndarray
        An array of duplicate station ids
    '''

    # Ids already handled as part of a co-located group; a set keeps the
    # per-station membership test cheap
    dup_stnids = set()
    rm_stnids = []

    stat_chk = StatusCheck(stnda.stns.size, 1000)

    for stn in stnda.stns:

        if stn[STN_ID] not in dup_stnids:

            ngh_stns = stnda.stns[stnda.stn_ids != stn[STN_ID]]
            dists = grt_circle_dist(stn[LON], stn[LAT], ngh_stns[LON],
                                    ngh_stns[LAT])

            # Only stations at exactly the same location are duplicates
            dup_nghs = ngh_stns[dists == 0]

            if dup_nghs.size > 0:

                dup_stnids.update(dup_nghs[STN_ID])

                grp_stnids = np.concatenate([
                    np.array([stn[STN_ID]]).ravel(),
                    np.array(dup_nghs[STN_ID]).ravel()
                ])
                stn_idxs = np.nonzero(
                    np.in1d(stnda.stn_ids, grp_stnids, True))[0]
                # Take the ids in database order so that they line up with
                # the columns read below regardless of how ids are sorted
                grp_stnids = stnda.stn_ids[stn_idxs]

                imp_flgs = stnda.ds.variables['flag_infilled'][:, stn_idxs]
                imp_flg_sum = np.sum(imp_flgs, axis=0)

                # Keep exactly one station: the first with the fewest
                # infilled values. All others, including ties, are removed.
                keep_idx = np.argmin(imp_flg_sum)
                rm_stnids.extend(np.delete(grp_stnids, keep_idx))

        stat_chk.increment()

    rm_stnids = np.array(rm_stnids)

    return rm_stnids
Пример #3
0
def proc_write(fpath_stndb, elem, fpath_out, nwrkers):
    '''
    Writer process that creates a new station netCDF database and fills it
    with the daily values and the 12 monthly normals received from worker
    processes via MPI.

    Parameters
    ----------
    fpath_stndb : str
        File path of the input station database. Its stations and days are
        used to create the output database.
    elem : str
        The element (variable) name. Used to open the databases, to select
        DB_VARIABLES[elem] and as the name of the daily output variable.
    fpath_out : str
        File path of the output station database.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    nwrkrs_done = 0

    # Read everything needed from the input database, then release it
    stn_da = StationSerialDataDb(fpath_stndb, elem)
    stn_ids = stn_da.stn_ids
    stns = stn_da.stns
    stn_mask = np.logical_and(np.isfinite(stn_da.stns[MASK]),
                              np.isnan(stn_da.stns[BAD]))
    days = stn_da.days
    stn_da.ds.close()
    stn_da = None

    print("WRITER: Creating output station netCDF database...")

    create_quick_db(fpath_out, stns, days, DB_VARIABLES[elem])
    stnda_out = StationSerialDataDb(fpath_out, elem, mode='r+')

    # One station-level normals variable per calendar month
    mth_names = []
    for mth in np.arange(1, 13):

        norm_var_name = get_norm_varname(mth)
        stnda_out.add_stn_variable(norm_var_name,
                                   '',
                                   units='C',
                                   dtype='f8',
                                   fill_value=netCDF4.default_fillvals['f8'])
        mth_names.append(norm_var_name)

    stnda_out.ds.sync()

    print("WRITER: Output station netCDF database created.")

    # Station id -> column index, built once instead of searching the id
    # array for every received message. setdefault keeps the first index
    # should an id occur more than once.
    stn_idxs = {}
    for idx, a_id in enumerate(stn_ids):
        stn_idxs.setdefault(a_id, idx)

    # Only stations passing the mask are counted for progress reporting
    stat_chk = StatusCheck(np.sum(stn_mask), 50)

    while 1:

        stn_id, tair_daily, tair_norms = MPI.COMM_WORLD.recv(
            source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:
                # Release the output file before the writer exits
                stnda_out.ds.close()
                print("WRITER: Finished")
                return 0
        else:

            x = stn_idxs[stn_id]
            stnda_out.ds.variables[elem][:, x] = tair_daily

            for i, mth_name in enumerate(mth_names):
                stnda_out.ds.variables[mth_name][x] = tair_norms[i]

            stnda_out.ds.sync()

            stat_chk.increment()
Пример #4
0
def set_optim_nstns_tair_anom(stnda, path_xval_ds):
    '''
    Set the local optimal number of stations to be used for anomaly
    interpolation for each U.S. climate division based on cross-validation
    mean absolute error.
    
    Parameters
    ----------
    stnda : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the
        database for which the local optimal number of
        neighbors should be set. 
    path_xval_ds : str
        Path where netCDF cross-validation MAE files from
        create_climdiv_optim_nstns_db are located
    '''

    climdiv_stns = stnda.stns[CLIMDIV]

    # One station-level output variable per calendar month
    vars_optim = {}
    for mth in np.arange(1, 13):

        var_name_optim = get_optim_anom_varname(mth)
        long_name = "Optimal number of neighbors to use for daily anomaly interpolation for month %d" % mth
        var_optim = stnda.add_stn_variable(
            var_name_optim,
            long_name,
            "",
            'f8',
            fill_value=netCDF4.default_fillvals['f8'])
        vars_optim[mth] = var_optim

    # Stations without a finite climate division are skipped
    divs = np.unique(climdiv_stns[np.isfinite(climdiv_stns)])

    stchk = StatusCheck(divs.size, 10)

    for clim_div in divs:

        fpath = os.path.join(
            path_xval_ds,
            "optim_nstns_%s_climdiv%d.nc" % (stnda.var_name, clim_div))

        # Read what is needed into memory and close the file right away so
        # that no file handle is left open per climate division
        ds_climdiv = Dataset(fpath)
        try:
            mae_climdiv = ds_climdiv.variables['mae'][:]
            nnghs_climdiv = ds_climdiv.variables['min_nghs'][:]
        finally:
            ds_climdiv.close()

        climdiv_idxs = np.nonzero(climdiv_stns == clim_div)[0]

        for mth in np.arange(1, 13):

            # Average the MAE over the last axis and pick the entry of
            # min_nghs with the lowest mean MAE for this month
            mae_climdiv_mth = mae_climdiv[mth - 1, :, :]
            mmae = np.mean(mae_climdiv_mth, axis=1)
            min_idx = np.argmin(mmae)
            vars_optim[mth][climdiv_idxs] = nnghs_climdiv[min_idx]

        stchk.increment()

    stnda.ds.sync()
Пример #5
0
def add_monthly_normals(stnda, start_norm_yr=1981, end_norm_yr=2010):
    '''
    Calculate and add station monthly normals to a serially-complete 
    netCDF station database.
    
    Parameters
    ----------
    stnda : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the
        database to which station monthly normals should
        be added.
    start_norm_yr : int, optional
        The start year for the normals.
    end_norm_yr : int, optional
        The end year for the normals
    '''

    tagg = TairAggregate(stnda.days)
    stns = stnda.stns

    # One station-level normals variable per calendar month
    norm_vars = {}
    for mth in np.arange(1, 13):

        norm_var_name = 'norm%02d' % mth
        long_name = "%d - %d Monthly Normal" % (start_norm_yr, end_norm_yr)
        norm_var = stnda.add_stn_variable(
            norm_var_name,
            long_name,
            units='C',
            dtype='f8',
            fill_value=netCDF4.default_fillvals['f8'])
        norm_vars[mth] = norm_var

    dly_var = stnda.var

    # Stations are processed in chunks of chk_size columns
    chk_size = 50
    # Number of chunks the loop below actually runs: a trailing partial
    # chunk counts as a full step, hence ceil rather than round
    nchks = int(np.ceil(stns.size / float(chk_size)))
    stchk = StatusCheck(nchks, 10)

    for i in np.arange(0, stns.size, chk_size):

        # The last chunk may hold fewer than chk_size stations
        j = min(i + chk_size, stns.size)

        dly_vals = np.ma.masked_equal(dly_var[:, i:j], dly_var._FillValue)
        norm_vals = tagg.daily_to_mthly_norms(dly_vals, start_norm_yr,
                                              end_norm_yr)

        for mth in np.arange(1, 13):
            norm_vars[mth][i:j] = norm_vals[mth - 1, :]

        stnda.ds.sync()
        stchk.increment()
def proc_write(fpath_stndb, elem, nwrkers):
    '''
    Writer process that stores monthly variogram parameters (nugget,
    partial sill and range) received from worker processes via MPI as
    station variables of a station database.

    Parameters
    ----------
    fpath_stndb : str
        File path of the station database, opened in 'r+' mode.
    elem : str
        The element (variable) name used to open the database.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    nwrkrs_done = 0

    stn_da = StationSerialDataDb(fpath_stndb, elem, mode="r+")
    # Only stations passing the mask are counted for progress reporting
    mask_stns = np.logical_and(np.isfinite(stn_da.stns[MASK]),
                               np.isnan(stn_da.stns[BAD]))
    nstns = np.sum(mask_stns)

    # For each month (index 0 = month 1) the (nugget, psill, range)
    # variables. Created once up front so the names need not be
    # recomputed for every received message.
    mth_vars = []
    for mth in np.arange(1, 13):

        vname_nug = get_krigparam_varname(mth, VARIO_NUG)
        vname_psill = get_krigparam_varname(mth, VARIO_PSILL)
        vname_rng = get_krigparam_varname(mth, VARIO_RNG)

        # Each variable uses its own name as long_name
        var_nug = stn_da.add_stn_variable(vname_nug, vname_nug, "C**2",
                                          'f8')
        var_psill = stn_da.add_stn_variable(vname_psill, vname_psill,
                                            "C**2", 'f8')
        var_rng = stn_da.add_stn_variable(vname_rng, vname_rng, "km", 'f8')

        mth_vars.append((var_nug, var_psill, var_rng))

    stat_chk = StatusCheck(nstns, 250)

    while 1:

        stn_id, nug, psill, rng = MPI.COMM_WORLD.recv(source=MPI.ANY_SOURCE,
                                                      tag=MPI.ANY_TAG,
                                                      status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:
                # Release the database before the writer exits
                stn_da.ds.close()
                print("WRITER: Finished")
                return 0
        else:

            x = stn_da.stn_idxs[stn_id]

            for i, (var_nug, var_psill, var_rng) in enumerate(mth_vars):

                var_nug[x] = nug[i]
                var_psill[x] = psill[i]
                var_rng[x] = rng[i]

            stn_da.ds.sync()

            stat_chk.increment()
Пример #7
0
    def find_xval_outliers(self,
                           stn_ids=None,
                           bw_nngh=100,
                           zscore_threshold=6):
        '''
        Runs a leave-one-out cross validation of a geographically
        weighted regression model of station monthly and annual normals
        (norm~lst+elev+lon+lat) and returns those stations whose error is
        a specified # of standard deviations above/below the mean
        
        Parameters
        ----------
        stn_ids : list_like, optional
            The station ids for which to run the cross validation.
            If None, the cross validation will be run for all stations
            in the database
        bw_nngh : int, optional
            The number of neighbors to use for the
            geographically weighted regression. Default: 100.
        zscore_threshold : float, optional
            The zscore threshold by which a station's error should be
            considered an outlier.
        
        Returns
        ----------
        out_stnids : ndarray
            The ids of the outlier stations, i.e. stations whose absolute
            error z-score exceeds the threshold in at least one of the 13
            rows of cross-validation errors
        '''

        if stn_ids is None:
            stn_ids = self.stn_da.stn_ids

        # Accept any list-like; the code below relies on ndarray attributes
        stn_ids = np.asarray(stn_ids)

        schk = StatusCheck(stn_ids.size, check_cnt=250)

        # One column of 13 error values per station
        xval_errs = np.zeros((13, stn_ids.size))

        for i, a_id in enumerate(stn_ids):

            xval_errs[:, i] = self.run_xval_stn(a_id, bw_nngh)
            schk.increment()

        xval_errs = pd.DataFrame(xval_errs)
        xval_errs.columns = stn_ids

        # Standardize each row across stations and take the absolute value
        row_means = xval_errs.mean(axis=1)
        row_stds = xval_errs.std(axis=1)
        zscores = xval_errs.subtract(row_means, axis=0).divide(row_stds,
                                                               axis=0).abs()

        # A station is an outlier if any of its rows exceeds the threshold
        is_outlier = (zscores > zscore_threshold).any(axis=0)
        out_stnids = zscores.columns[is_outlier].values

        return out_stnids
Пример #8
0
def proc_write(twx_cfg, mask_stns, nwrkers):
    '''
    Writer process for the QA run. Collects flag updates sent by the
    worker processes via MPI and, once every worker has finished, applies
    them to the station database and recalculates the monthly observation
    counts of tmin and tmax.

    Parameters
    ----------
    twx_cfg : object
        Configuration object. fpath_stndata_nc_all, obs_start_date,
        obs_end_date, interp_start_date, interp_end_date and
        stn_agg_chunk are read from it.
    mask_stns : ndarray
        Station mask; the number of non-zero entries is used as the
        total for progress reporting.
    nwrkers : int
        Number of worker processes. The writer finishes after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    n_finished = 0

    flag_updates = IterMultiFlagUpdate()

    stat_chk = StatusCheck(np.nonzero(mask_stns)[0].size, 10)

    while True:

        msg = MPI.COMM_WORLD.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG,
                                  status=status)

        if status.tag != TAG_STOPWORK:
            # Regular result message: remember it for the final update
            flag_updates.add_iter(msg)
            stat_chk.increment()
            continue

        n_finished += 1
        if n_finished != nwrkers:
            continue

        print("Writer: updating QA flags in database...")

        flag_updates.update_flags(twx_cfg.fpath_stndata_nc_all)

        # Recalculate period-of-record for Tmin and Tmax
        # since some stations might now fall below the min
        # por requirement after QA is run
        for elem in ['tmin', 'tmax']:

            # First the observation period, then the interpolation period
            for attr_start, attr_end in (('obs_start_date', 'obs_end_date'),
                                         ('interp_start_date',
                                          'interp_end_date')):

                msg_cnt = ("Updating monthly observation counts for %s from %d to %d... " %
                           (elem, ymdL(getattr(twx_cfg, attr_start)),
                            ymdL(getattr(twx_cfg, attr_end))))
                print(msg_cnt)

                add_obs_cnt(twx_cfg.fpath_stndata_nc_all, elem,
                            getattr(twx_cfg, attr_start),
                            getattr(twx_cfg, attr_end),
                            twx_cfg.stn_agg_chunk)

        print("Writer: Finished")
        return 0
Пример #9
0
    # NOTE(review): this is the interior of a function whose header is not
    # visible here; `stns` and `twx_cfg` are defined outside this fragment.

    # Load location QA HDF file
    locqa = LocQA(twx_cfg.fpath_locqa_hdf,
                  usrname_geonames=twx_cfg.username_geonames)
    # Add location QA data columns to stations
    stns = locqa.add_locqa_cols(stns)

    # Retrieve the DEM-based elevations for any stations that do not currently
    # have one and update the location QA HDF file
    stns_elevdem = stns[stns.elevation_dem.isnull()]

    print "Retrieving DEM elevation data for %d stations..." % len(
        stns_elevdem)

    # Stations are processed, and the HDF file updated, in chunks of
    # write_chk stations
    write_chk = 50
    # NOTE(review): the progress total is len(stns), but increment() is only
    # called for the stations in stns_elevdem -- confirm whether
    # len(stns_elevdem) was intended
    schk = StatusCheck(len(stns), check_cnt=write_chk)

    for i in np.arange(len(stns_elevdem), step=write_chk):

        # Work on a copy of the current chunk so the new elevations can be
        # assigned to it
        stns_chk = stns_elevdem.iloc[i:(i + write_chk)].copy()

        for stnid in stns_chk.station_id:

            # NOTE(review): .loc[stnid, ...] assumes the frame is indexed by
            # station id -- verify against the caller
            lon, lat = stns_chk.loc[stnid, ['longitude', 'latitude']]
            elevdem = locqa.get_elevation_dem(lon, lat)
            stns_chk.loc[stnid, 'elevation_dem'] = elevdem
            schk.increment()

        # Write this chunk to the HDF file; the reload is deferred until
        # all chunks are done
        locqa.update_locqa_hdf(stns_chk, reload_locqa=False)

    locqa.reload_stns_locqa()
def _create_infill_db(fpath_out, stn_da, stnids, days, tair_var, long_name):
    '''
    Create the infill output database for one temperature variable.

    Parameters
    ----------
    fpath_out : str
        File path of the database to create.
    stn_da : StationDataDb
        Source database from which the stations in stnids are selected.
    stnids : ndarray
        Ids of the stations to put in the database.
    days : ndarray
        The days passed to create_quick_db.
    tair_var : str
        Name of the temperature variable ('tmin' or 'tmax').
    long_name : str
        Long name of the temperature variable.

    Returns
    ----------
    StationDataDb
        The created database, opened in 'r+' mode.
    '''
    stns = stn_da.stns[np.in1d(stn_da.stns[STN_ID], stnids,
                               assume_unique=True)]
    variables = [(tair_var, 'f4', netCDF4.default_fillvals['f4'],
                  long_name, 'C'),
                 ('flag_infilled', 'i1', netCDF4.default_fillvals['i1'],
                  'infilled flag', ''),
                 ("".join([tair_var, "_infilled"]), 'f4',
                  netCDF4.default_fillvals['f4'],
                  " ".join(['infilled', long_name]), 'C')]
    create_quick_db(fpath_out, stns, days, variables)

    stnda_out = StationDataDb(fpath_out, mode="r+")
    stnda_out.add_stn_variable('mae', 'mean absolute error', 'C', "f8")
    stnda_out.add_stn_variable('bias', 'bias', 'C', "f8")

    return stnda_out


def proc_write(twx_cfg, ncdf_mode, start_ymd, end_ymd, nwrkers):
    '''
    Writer process for the infill run. Receives infilled station time
    series from worker processes via MPI and writes them to the tmin and
    tmax infill netCDF databases.

    Parameters
    ----------
    twx_cfg : object
        Configuration object. fpath_stndata_nc_tair_homog,
        fpath_stndata_nc_infill_tmin and fpath_stndata_nc_infill_tmax
        are read from it.
    ncdf_mode : str
        If 'r+', the existing infill databases are opened for update.
        Otherwise new infill databases are created.
    start_ymd, end_ymd
        Passed as the (start_ymd, end_ymd) period to StationDataDb.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    stn_da = StationDataDb(twx_cfg.fpath_stndata_nc_tair_homog,
                           (start_ymd, end_ymd))
    days = stn_da.days
    nwrkrs_done = 0

    # The coordinator broadcasts the ids of the stations to be infilled
    bcast_msg = None
    bcast_msg = MPI.COMM_WORLD.bcast(bcast_msg, root=RANK_COORD)
    stnids_tmin, stnids_tmax = bcast_msg
    print("WRITER: Received broadcast msg")

    # The number of expected results always comes from the broadcast ids
    ttl_infills = stnids_tmin.size + stnids_tmax.size

    if ncdf_mode == 'r+':

        ds_tmin = Dataset(twx_cfg.fpath_stndata_nc_infill_tmin, 'r+')
        ds_tmax = Dataset(twx_cfg.fpath_stndata_nc_infill_tmax, 'r+')
        # Column lookups must use the ids stored in the existing files
        stnids_tmin = ds_tmin.variables[STN_ID][:].astype(str)
        stnids_tmax = ds_tmax.variables[STN_ID][:].astype(str)

    else:

        # Keep references to the database objects for as long as their
        # datasets are in use
        stnda_out_tmin = _create_infill_db(
            twx_cfg.fpath_stndata_nc_infill_tmin, stn_da, stnids_tmin, days,
            'tmin', 'minimum air temperature')
        ds_tmin = stnda_out_tmin.ds

        stnda_out_tmax = _create_infill_db(
            twx_cfg.fpath_stndata_nc_infill_tmax, stn_da, stnids_tmax, days,
            'tmax', 'maximum air temperature')
        ds_tmax = stnda_out_tmax.ds

    print("WRITER: Infilling a total of %d station time series " %
          (ttl_infills,))
    print("WRITER: Output NCDF files ready")

    stat_chk = StatusCheck(ttl_infills, 10)

    while 1:

        result = MPI.COMM_WORLD.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG,
                                     status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:

                # Release all files before the writer exits
                ds_tmin.close()
                ds_tmax.close()
                stn_da.ds.close()

                print("Writer: Finished")
                return 0
        else:

            # Only result messages are unpacked, so the payload of a stop
            # message does not need to have the same shape
            stn_id, tair_var, tair, fill_mask, tair_infill, mae, bias = result

            if tair_var == 'tmin':
                stn_idx = np.nonzero(stnids_tmin == stn_id)[0][0]
                ds = ds_tmin
            else:
                stn_idx = np.nonzero(stnids_tmax == stn_id)[0][0]
                ds = ds_tmax

            ds.variables[tair_var][:, stn_idx] = tair
            ds.variables["".join([tair_var, "_infilled"])][:, stn_idx] = tair_infill
            ds.variables['flag_infilled'][:, stn_idx] = fill_mask
            ds.variables['bias'][stn_idx] = bias
            # The variable named by LAST_VAR_WRITTEN is deliberately
            # written last for each station
            ds.variables[LAST_VAR_WRITTEN][stn_idx] = mae

            ds.sync()

            print("|".join(["WRITER", stn_id, tair_var, "%.4f" % (mae,),
                            "%.4f" % (bias,)]))

            stat_chk.increment()
Пример #11
0
def proc_write(fpath_stndb, elem, climdivs, ngh_rng, path_out_optim, nwrkers):
    '''
    Writer process for the cross-validation of the number of neighboring
    stations. Receives per-station cross-validation results from worker
    processes via MPI, writes the MAE to one netCDF file per climate
    division and, once all workers are done, sets the optimal number of
    neighbors in the station database.

    Parameters
    ----------
    fpath_stndb : str
        File path of the station database, opened in 'r+' mode.
    elem : str
        The element (variable) name.
    climdivs : sequence
        The climate divisions for which output files are created.
    ngh_rng : ndarray
        The neighbor counts that are cross-validated. Passed to
        create_climdiv_optim_nstns_db.
    path_out_optim : str
        Directory in which the per climate division files are written.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    nwrkrs_done = 0

    # The coordinator broadcasts the ids of the cross-validation stations
    bcast_msg = None
    bcast_msg = MPI.COMM_WORLD.bcast(bcast_msg, root=RANK_COORD)
    stn_ids = bcast_msg
    print("WRITER: Received broadcast msg")

    stn_da = StationSerialDataDb(fpath_stndb, elem, mode="r+")
    stn_mask = np.in1d(stn_da.stn_ids, stn_ids, True)
    stns = stn_da.stns[stn_mask]

    # climate division -> (output dataset, ids of its stations)
    climdiv_ds = {}
    ttl_xval_stns = 0
    for climdiv in climdivs:

        stnids_climdiv = stns[STN_ID][stns[CLIMDIV] == climdiv]

        a_ds = create_climdiv_optim_nstns_db(path_out_optim, elem,
                                             stnids_climdiv, ngh_rng, climdiv)
        climdiv_ds[climdiv] = a_ds, stnids_climdiv

        ttl_xval_stns += stnids_climdiv.size

    print("WRITER: Output NCDF files created")

    # Station id -> index into stns
    stn_idxs = {}
    for x, a_id in enumerate(stns[STN_ID]):
        stn_idxs[a_id] = x

    stat_chk = StatusCheck(ttl_xval_stns, 250)

    while 1:

        # bias and r2 are part of the message but only mae is stored
        stn_id, mae, bias, r2 = MPI.COMM_WORLD.recv(source=MPI.ANY_SOURCE,
                                                    tag=MPI.ANY_TAG,
                                                    status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:

                # Close the per climate division files first:
                # set_optim_nstns_tair_anom opens files from the same
                # directory for reading, which should not happen while
                # they are still open for writing here
                for a_ds, _ in climdiv_ds.values():
                    a_ds.close()

                #######################################################
                print("WRITER: Setting the optim # of nghs...")

                set_optim_nstns_tair_anom(stn_da, path_out_optim)

                ######################################################

                stn_da.ds.close()

                print("WRITER: Finished")
                return 0

        else:

            stn = stns[stn_idxs[stn_id]]
            ds, stnids_climdiv = climdiv_ds[stn[CLIMDIV]]
            # Position of the station within its climate division file
            dim2 = np.nonzero(stnids_climdiv == stn_id)[0][0]
            ds.variables['mae'][:, :, dim2] = mae
            ds.sync()

            stat_chk.increment()
Пример #12
0
def create_serially_complete_db(fpath_infill_db, tair_var,
                                fpath_out_serial_db):
    '''
    Create a netCDF single variable, serially-complete station database and
    insert serially-complete station observations. Based on a specific threshold
    of total missing data, a station's serially-complete time series will either
    consist of a mix of actual and infilled observations or entirely of infilled
    observations from the infill model.
    
    Parameters
    ----------
    fpath_infill_db : str
        The file path to the infilled station database
    tair_var : str
        The temperature variable ('tmin' or 'tmax') for the database
    fpath_out_serial_db : str
        The file path for the output serially-complete database
    '''

    ds_infill = Dataset(fpath_infill_db)
    ds_out = None

    try:

        var_time = ds_infill.variables['time']
        stns = _build_stn_struct(ds_infill)
        start, end = num2date([var_time[0], var_time[-1]], var_time.units)
        days = get_days_metadata(start, end)

        create_quick_db(fpath_out_serial_db, stns, days,
                        SERIAL_DB_VARIABLES[tair_var])
        ds_out = Dataset(fpath_out_serial_db, 'r+')

        # Flag series used for stations that take all infilled values
        all_infill_flags = np.ones(days.size, dtype=bool)
        # Marks the stations that take all infilled values
        all_infill_stns = np.zeros(stns.size, dtype=bool)

        stat_chk = StatusCheck(stns.size, 100)

        for x in np.arange(stns.size):

            infill_mask = ds_infill.variables['flag_infilled'][:, x].astype(
                bool)
            infill_runs = _runs_of_ones_array(infill_mask)

            # Longest run of consecutive infilled values for this station
            if infill_runs.size > 0:
                max_infill = np.max(infill_runs)
            else:
                max_infill = 0

            if max_infill >= USE_ALL_INFILL_THRESHOLD:

                # This station has a continuous run of missing data that
                # reaches USE_ALL_INFILL_THRESHOLD. Use all infilled values
                # for this station to avoid discontinuities between infilled
                # and observed portions of time series
                tair_stn = ds_infill.variables["".join(
                    [tair_var, "_infilled"])][:, x]
                flag_stn = all_infill_flags

                all_infill_stns[x] = True

            else:

                tair_stn = ds_infill.variables[tair_var][:, x]
                flag_stn = infill_mask

            ds_out.variables[tair_var][:, x] = tair_stn
            ds_out.variables['flag_infilled'][:, x] = flag_stn
            ds_out.sync()

            stat_chk.increment()

    finally:

        # Release both files, also when an error occurs part way through
        if ds_out is not None:
            ds_out.close()
        ds_infill.close()

    print("% of stns with all infilled values: " + str(
        (np.sum(all_infill_stns) / float(all_infill_stns.size)) * 100.))
Пример #13
0
def add_stn_raster_values(stnda,
                          var_name,
                          long_name,
                          units,
                          a_rast,
                          extract_method=1,
                          revert_nn=False,
                          force_data_value=False):
    '''
    Extract raster values for station locations and add them to a 
    serially-complete netCDF station database. Uses mpl_toolkits.basemap.interp
    to extract raster values. 
    
    Parameters
    ----------
    stnda : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the
        database to which station raster values should
        be added.
    var_name : str
        The netCDF variable name to be used for the raster values
    long_name : str
        The long netCDF variable name to be used for the raster values
    units : str
        The units of the raster values
    a_rast : RasterDataset
        The raster dataset from which to extract values
    extract_method : int, optional
        The mpl_toolkits.basemap.interp interpolation method to use for
        extraction specified as an integer (0 = nearest neighbor, 1 = bilinear,
        3 = cubic spline). Default = 1.
    revert_nn : boolean, optional
        Set to True if an extract_method > 0 should revert to nearest neighbor
        if a value cannot be extracted with the specified extract method. Default = False.
    force_data_value : boolean, optional
        If True, for station locations that have no data raster values or are outside
        the extent of the raster, the nearest grid cell with a value will be used.
        Default = False.
        
    Returns
    ----------
    stnids_ndata : ndarray
        An array of station ids for which raster values could not be extracted

    Raises
    ----------
    Exception
        If force_data_value is True but some stations still have no
        data value.
    '''

    lon = stnda.stns[LON]
    lat = stnda.stns[LAT]

    newvar = stnda.add_stn_variable(var_name,
                                    long_name,
                                    units,
                                    'f8',
                                    fill_value=a_rast.ndata)

    # Setup data and coordinates for mpl_toolkits.basemap.interp.
    # The raster rows are flipped and the y coordinates sorted so that both
    # run in ascending order.
    a = a_rast.read_as_array()
    aflip = np.flipud(a)
    aflip = aflip.astype(float)
    a = a.data

    yGrid, xGrid = a_rast.get_coord_grid_1d()
    yGrid = np.sort(yGrid)

    # Initialize output array
    rvals = np.zeros(len(newvar[:]))

    # Loop through stations
    schk = StatusCheck(lon.size, 5000)
    for x in np.arange(lon.size):

        try:
            rval = bm.interp(aflip,
                             xGrid,
                             yGrid,
                             np.array(lon[x]),
                             np.array(lat[x]),
                             checkbounds=True,
                             masked=False,
                             order=extract_method)
        except ValueError:
            # ValueError means that station point is outside the bounds of the raster
            rval = np.ma.masked

        # Re-run nearest neighbor extraction with no checkbounds constraint if rval is
        # masked (i.e.--no data) and at least one of the following conditions is met:
        # 1.) The station point is in bounds and extract method is nearest neighbor or
        # revert_nn is True. Since mpl_toolkits.basemap.interp uses coordinates based on cell centers,
        # stations near very edges of raster will produce ValueErrors with a checkbounds restriction
        # when they aren't actually out-of-bounds.
        # 2.) force_data_value = True. If point is outside bounds of raster, the returned
        # value will be clipped to the edge of raster.
        if (np.ma.is_masked(rval) and
            ((a_rast.is_inbounds(lon[x], lat[x]) and
              (extract_method == 0 or revert_nn)) or force_data_value)):

            rval = bm.interp(aflip,
                             xGrid,
                             yGrid,
                             np.array(lon[x]),
                             np.array(lat[x]),
                             checkbounds=False,
                             masked=False,
                             order=0)

        # If rval is still masked (i.e.--no data) and force_data_value is True,
        # find the nearest grid cell with data to the point
        if np.ma.is_masked(rval) and force_data_value:

            row, col = a_rast.get_row_col(lon[x],
                                          lat[x],
                                          check_bounds=False)
            rval, dist = _find_nn_data(a, a_rast, col, row)

        # Anything still without a value is stored as the raster's
        # no data value
        if np.ma.is_masked(rval):
            rval = a_rast.ndata

        rvals[x] = rval
        schk.increment()

    newvar[:] = rvals
    stnda.ds.sync()

    stnids_ndata = stnda.stn_ids[rvals == a_rast.ndata]

    if force_data_value and stnids_ndata.size > 0:
        raise Exception(
            'force_data_value turned on, but station points still had no data values.'
        )

    return stnids_ndata
Пример #14
0
def proc_write(twx_cfg, start_ymd, end_ymd, nwrkers):
    '''
    Writer process that stores monthly station means and variances of
    tmin and tmax, received from worker processes via MPI, as station
    variables of the station database.

    Parameters
    ----------
    twx_cfg : object
        Configuration object. fpath_stndata_nc_tair_homog is read from it.
    start_ymd, end_ymd
        Passed as the (start_ymd, end_ymd) period to StationDataDb.
    nwrkers : int
        Number of worker processes. The writer returns after receiving
        this many TAG_STOPWORK messages.

    Returns
    ----------
    int
        0 once all workers have signalled completion.
    '''

    status = MPI.Status()
    nwrkrs_done = 0
    stn_da = StationDataDb(twx_cfg.fpath_stndata_nc_tair_homog,
                           (start_ymd, end_ymd),
                           mode="r+")

    mths = np.arange(1, 13)

    # Create a mean and a variance variable per month and element
    for mth in mths:

        for varname in ['tmin', 'tmax']:

            varname_mean = get_mean_varname(varname, mth)
            varname_vari = get_variance_varname(varname, mth)

            stn_da.add_stn_variable(varname_mean, varname_mean, "C", 'f8')
            stn_da.add_stn_variable(varname_vari, varname_vari, "C**2", 'f8')

    stn_da.ds.sync()

    # The coordinator broadcasts which stations are processed per element
    bcast_msg = None
    bcast_msg = MPI.COMM_WORLD.bcast(bcast_msg, root=RANK_COORD)
    mask_por_tmin, mask_por_tmax = bcast_msg
    stn_ids_tmin, stn_ids_tmax = (stn_da.stn_ids[mask_por_tmin],
                                  stn_da.stn_ids[mask_por_tmax])
    print("WRITER: Received broadcast msg")
    stn_ids_uniq = np.unique(np.concatenate([stn_ids_tmin, stn_ids_tmax]))

    # Station id -> index in the database, for the processed stations only.
    # A single vectorized membership test replaces testing every id
    # against the whole array one by one.
    idxs = np.nonzero(np.in1d(stn_da.stn_ids, stn_ids_uniq))[0]
    stn_idxs = dict(zip(stn_da.stn_ids[idxs], idxs))

    ttl_infills = stn_ids_tmin.size + stn_ids_tmax.size

    stat_chk = StatusCheck(ttl_infills, 30)

    while 1:

        stn_id, tair_var, stn_mean, stn_vari = MPI.COMM_WORLD.recv(
            source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)

        if status.tag == TAG_STOPWORK:

            nwrkrs_done += 1
            if nwrkrs_done == nwrkers:
                # Release the database before the writer exits
                stn_da.ds.close()
                print("WRITER: Finished")
                return 0
        else:

            stnid_dim = stn_idxs[stn_id]

            for mth in mths:

                vname_mean = get_mean_varname(tair_var, mth)
                stn_da.ds.variables[vname_mean][stnid_dim] = stn_mean[mth - 1]

                vname_vary = get_variance_varname(tair_var, mth)
                stn_da.ds.variables[vname_vary][stnid_dim] = stn_vari[mth - 1]

            stn_da.ds.sync()

            stat_chk.increment()
Пример #15
0
def create_tobs_db(fpath_tobs_file, fpath_db, stnids, min_date, max_date):
    '''
    Create a time-of-observation (tobs) netCDF4 database from a
    tobs file generated from create_tobs_file.
    
    Parameters
    ----------
    fpath_tobs_file : str
        The file path to the tobs file from create_tobs_file
    fpath_db : str
        The file path to which to write the tobs database.
    stnids : sequence of str
        The sorted station ids of stns whose tobs should be written
        to the database
    min_date : datetime
        The earliest observation date
    max_date : datetime
        The latest observation date
    '''

    # NOTE(review): this function may continue past the end of the visible
    # source; within the visible part neither ds nor fileTobs is closed.

    ds = Dataset(fpath_db, 'w')

    # Set global attributes
    ds.title = "Time-of-Observation Database"
    ds.institution = "University of Montana Numerical Terradynamics Simulation Group"
    ds.history = "".join([
        "Created on: ",
        datetime.datetime.strftime(datetime.date.today(), "%Y-%m-%d")
    ])

    days = get_days_metadata(min_date, max_date)

    print "Creating netCDF4 Time-of-Observation Database for " + str(min_date) + \
    " to " + str(max_date) + " for " + str(stnids.size) + " stations."

    ds.createDimension('time', days.size)
    ds.createDimension('stn_id', stnids.size)

    # Time coordinate expressed as days since min_date
    times = ds.createVariable('time', 'f8', ('time', ), fill_value=False)
    times.long_name = "time"
    times.units = "".join([
        "days since ",
        str(min_date.year), "-",
        str(min_date.month), "-",
        str(min_date.day), " 0:0:0"
    ])
    times.standard_name = "time"
    times.calendar = "standard"
    times[:] = date2num(days[DATE], times.units)

    stations = ds.createVariable('stn_id', 'str', ('stn_id', ))
    stations.long_name = "station id"

    # String variables are filled one element at a time
    for x in np.arange(stnids.size):

        ds.variables['stn_id'][x] = str(stnids[x])

    # -1 is used both as fill value and as missing value
    tobs = ds.createVariable('tobs',
                             np.int16, ('time', 'stn_id'),
                             fill_value=-1,
                             chunksizes=(days[DATE].size, NCDF_CHK_COLS))
    tobs.long_name = "time-of-observation"
    tobs.missing_value = -1

    # Station ids with the first "GHCN_" occurrence removed; these are the
    # keys looked up for each line of the tobs file
    stnidsOrig = np.char.replace(stnids, "GHCN_", "", 1)

    fileTobs = open(fpath_tobs_file)
    aline = fileTobs.readline()

    # In-memory tobs array, initialized to the missing value -1
    atobs = np.ones((days.size, stnids.size)) * -1

    curYmd = days[YMD][0]
    time_idx = 0
    stn_idx = 0

    # The line count of the tobs file is only used as the progress total
    n_obs = int(
        subprocess.check_output(["wc", "-l", fpath_tobs_file]).split()[0])

    stchk = StatusCheck(n_obs, 1000000)

    # Station id (without "GHCN_") -> column index
    stn_idxs = {}
    for x in np.arange(stnidsOrig.size):
        stn_idxs[stnidsOrig[x]] = x

    while aline != "":

        try:

            # Characters 0-10 of a line hold the station id, characters
            # 12-19 the date as YYYYMMDD and the last 5 characters the
            # time-of-observation value
            stn_idx = stn_idxs[aline[0:11]]

            aYmd = np.int(aline[12:20])
            # Only search for the time index when the date changes
            if aYmd != curYmd:
                time_idx = np.nonzero(days[YMD] == aYmd)[0][0]
                curYmd = aYmd

            atobs[time_idx, stn_idx] = np.int(aline[-5:])


#
        except KeyError:
            # Lines of stations that are not in stnids are skipped
            pass
        stchk.increment()
        aline = fileTobs.readline()
    # Write the complete array to the netCDF variable in one step
    tobs[:] = atobs