示例#1
0
def _bufr_retrieve_one(bufrgruven, bufarg, model, bufr_default_dir,
                       bufr_new_name, verbose):
    """
    Attempt to retrieve a single BUFR file for one dataset name ('dset') and
    move it into place at bufr_new_name.

    :param bufrgruven: str: path to the bufr_gruven retrieval script
    :param bufarg: dict: argument dictionary for bufr_gruven; 'dset' is set here
    :param model: str: dataset name to request
    :param bufr_default_dir: str: directory where bufr_gruven writes its output
    :param bufr_new_name: str: destination path for the retrieved file
    :param verbose: bool: if True, print progress messages
    :return: bool: True if the file was retrieved and moved, False otherwise
    """
    bufr_format = '%s/%s%s.%s_%s.buf'
    try:
        bufarg['dset'] = model
        bufr_retrieve(bufrgruven, bufarg)
        bufr_name = bufr_format % (bufr_default_dir, bufarg['date'],
                                   '%02d' % int(bufarg['cycle']), model,
                                   bufarg['stations'])
        # Verify the file was actually produced before moving it into place;
        # open() raises if it is missing or unreadable.
        bufr_file = open(bufr_name)
        bufr_file.close()
        os.rename(bufr_name, bufr_new_name)
        if verbose:
            print('bufr: BUFR file found for %s at date %s.' %
                  (model, bufarg['date']))
            print('bufr: writing BUFR file: %s' % bufr_new_name)
        return True
    except Exception:
        # Retrieval is best-effort: a missing file for one dataset is expected
        # and the caller will try alternatives or record the miss. (Narrowed
        # from a bare 'except:' so KeyboardInterrupt/SystemExit propagate.)
        if verbose:
            print('bufr: BUFR file for %s at date %s not retrieved.' %
                  (model, bufarg['date']))
        return False


def bufr(config, output_file=None, cycle='18'):
    """
    Generates model data from BUFKIT profiles and saves to a file, which can later be retrieved for either training
    data or model run data.

    :param config: dict: project configuration; reads the 'BUFR' section,
        'data_start_date'/'data_end_date', 'forecast_hour_start', and 'verbose'
    :param output_file: str: output file path (default:
        SITE_ROOT/<station_id>_bufr.pkl)
    :param cycle: str: model cycle (init hour)
    :return: None; writes a pickled dict of 'PROF'/'SFC'/'DAY' data per model
    """
    bufr_station_id = config['BUFR']['bufr_station_id']
    # Base arguments dictionary. dset and date will be modified iteratively.
    bufarg = {
        'dset': '',
        'date': '',
        'cycle': cycle,
        'stations': bufr_station_id.lower(),
        'noascii': '',
        'nozipit': '',
        'noverbose': '',
        'prepend': ''
    }
    if config['verbose']:
        print('\n')
    bufr_default_dir = '%s/metdat/bufkit' % config['BUFR_ROOT']
    bufr_data_dir = config['BUFR']['bufr_data_dir']
    if not os.path.isdir(bufr_data_dir):
        os.makedirs(bufr_data_dir)
    bufrgruven = config['BUFR']['bufrgruven']
    if config['verbose']:
        print('bufr: using BUFKIT files in %s' % bufr_data_dir)
    bufr_format = '%s/%s%s.%s_%s.buf'
    missing_dates = []
    models = config['BUFR']['bufr_models']
    model_names = config['BUFR']['models']
    # Shift the configured range back one day: the model run initialized the
    # previous day verifies on the configured date.
    start_date = datetime.strptime(config['data_start_date'],
                                   '%Y%m%d') - timedelta(days=1)
    end_date = datetime.strptime(config['data_end_date'],
                                 '%Y%m%d') - timedelta(days=1)
    dates = generate_dates(config, start_date=start_date, end_date=end_date)
    for date in dates:
        bufarg['date'] = datetime.strftime(date, '%Y%m%d')
        if date.year < 2010:
            if config['verbose']:
                print('bufr: skipping BUFR data for %s; data starts in 2010.' %
                      bufarg['date'])
            continue
        if config['verbose']:
            print('bufr: date: %s' % bufarg['date'])

        for model_name, model_spec in zip(model_names, models):
            if config['verbose']:
                print('bufr: trying to retrieve BUFR data for %s...' %
                      model_name)
            bufr_new_name = bufr_format % (bufr_data_dir, bufarg['date'],
                                           '%02d' % int(bufarg['cycle']),
                                           model_name, bufarg['stations'])
            if os.path.isfile(bufr_new_name):
                if config['verbose']:
                    print('bufr: file %s already exists; skipping!' %
                          bufr_new_name)
                # BUG FIX: this was 'break', which abandoned ALL remaining
                # models for the date as soon as one model's file existed.
                # 'continue' skips only the model whose file is present.
                continue

            # A model spec may be a list of alternative dataset names; try
            # each in turn until one retrieval succeeds.
            if isinstance(model_spec, list):
                dsets = model_spec
            else:
                dsets = [model_spec]
            for model in dsets:
                if _bufr_retrieve_one(bufrgruven, bufarg, model,
                                      bufr_default_dir, bufr_new_name,
                                      config['verbose']):
                    break
            if not os.path.isfile(bufr_new_name):
                print(
                    'bufr: warning: no BUFR file found for model %s at date %s'
                    % (model_name, bufarg['date']))
                missing_dates.append((date, model_name))

    # Process data
    print('\n')
    bufr_dict = OrderedDict({
        'PROF': OrderedDict(),
        'SFC': OrderedDict(),
        'DAY': OrderedDict()
    })
    for model in model_names:
        bufr_dict['PROF'][model] = OrderedDict()
        bufr_dict['SFC'][model] = OrderedDict()
        bufr_dict['DAY'][model] = OrderedDict()

    for date in dates:
        date_str = datetime.strftime(date, '%Y%m%d')
        # The run initialized on 'date' verifies over the following day's
        # forecast window.
        verif_date = date + timedelta(days=1)
        start_dt = verif_date + timedelta(hours=config['forecast_hour_start'])
        end_dt = verif_date + timedelta(hours=config['forecast_hour_start'] +
                                        24)
        for model in model_names:
            if (date, model) in missing_dates:
                if config['verbose']:
                    print('bufr: skipping %s data for %s; file missing.' %
                          (model, date_str))
                continue
            if config['verbose']:
                print('bufr: processing %s data for %s' % (model, date_str))
            bufr_name = bufr_format % (bufr_data_dir, date_str,
                                       '%02d' % int(bufarg['cycle']), model,
                                       bufarg['stations'])
            if not os.path.isfile(bufr_name):
                if config['verbose']:
                    print('bufr: skipping %s data for %s; file missing.' %
                          (model, date_str))
                continue
            profile = bufkit_parser_time_height(config, bufr_name, 6, start_dt,
                                                end_dt)
            sfc, daily = bufkit_parser_surface(bufr_name, 3, start_dt, end_dt)
            # Drop 'PRES' variable which is useless
            for key, values in profile.items():
                values.pop('PRES', None)
                profile[key] = values
            bufr_dict['PROF'][model][verif_date] = profile
            bufr_dict['SFC'][model][verif_date] = sfc
            bufr_dict['DAY'][model][verif_date] = daily

    # Export data
    if output_file is None:
        output_file = '%s/%s_bufr.pkl' % (config['SITE_ROOT'],
                                          config['station_id'])
    if config['verbose']:
        print('bufr: -> exporting to %s' % output_file)
    with open(output_file, 'wb') as handle:
        pickle.dump(bufr_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return
示例#2
0
def verification(config,
                 output_files=None,
                 csv_files=None,
                 use_cf6=True,
                 use_climo=True,
                 force_rain_quantity=False):
    """
    Generates verification data from MesoWest and saves to a file, which is used to train the model and check test
    results.

    :param config: dict: project configuration
    :param output_files: str: output file path if just one station, or list of output file paths if multiple stations
    :param csv_files: str: path to csv file containing observations if just one station, or list of paths to csv files if multiple stations
    :param use_cf6: bool: if True, uses wind values from CF6 files
    :param use_climo: bool: if True, uses wind values from NCDC climatology
    :param force_rain_quantity: if True, returns the actual quantity of rain (rather than POP); useful for validation
    files
    :return: None; writes one pickled dict of daily verification values per station
    """
    if config['multi_stations']:  # Train on multiple stations
        station_ids = config['station_id']
        # There must be exactly one output file and one csv file per station ID.
        if len(station_ids) != len(output_files):
            raise ValueError(
                "There must be the same number of output files as station IDs")
        if len(station_ids) != len(csv_files):
            raise ValueError(
                "There must be the same number of csv files as station IDs")
    else:
        station_ids = [config['station_id']]
        if output_files is not None:
            output_files = [output_files]
        if csv_files is not None:
            csv_files = [csv_files]

    for i in range(len(station_ids)):
        station_id = station_ids[i]
        if output_files is None:
            output_file = '%s/%s_verif.pkl' % (config['SITE_ROOT'], station_id)
        else:
            output_file = output_files[i]

        if csv_files is None:
            csv_file = '%s/%s_verif.csv' % (config['SITE_ROOT'], station_id)
        else:
            csv_file = csv_files[i]

        dates = generate_dates(config)
        # NOTE(review): api_dates appears unused below; retained in case
        # generate_dates has side effects -- confirm before removing.
        api_dates = generate_dates(config,
                                   api=True,
                                   api_add_hour=config['forecast_hour_start'] +
                                   24)
        datename = 'date_time_minus_%d' % config['forecast_hour_start']

        if config['verbose']:
            print('verification: obtaining observations from csv file')
        # vars_request is a pickled list of the observation variables that
        # were requested when the csv file was generated.
        with open(
                '%s/%s_obs_vars_request.txt' %
            (config['SITE_ROOT'], station_id), 'rb') as fp:
            vars_request = pickle.load(fp)

        all_obspd = pd.read_csv(csv_file)
        obspd = all_obspd[[
            'date_time', 'air_temp', 'precip_accum_one_hour', 'wind_speed',
            'air_temp_low_6_hour', 'air_temp_high_6_hour',
            'precip_accum_six_hour'
        ]]  # subset of data used as verification
        obspd['date_time'] = np.array([
            datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            for date in obspd['date_time'].values
        ],
                                      dtype='datetime64[s]')
        if config['verbose']:
            print(
                'verification: setting time back %d hours for daily statistics'
                % config['forecast_hour_start'])
        # Shift times back so that a "day" covers the forecast verification
        # window rather than the calendar day.
        dateobj = pd.to_datetime(obspd['date_time']) - timedelta(
            hours=config['forecast_hour_start'])
        obspd['date_time'] = dateobj
        obspd = obspd.rename(columns={'date_time': datename})

        # Reformat data into hourly and daily
        # Hourly
        def hour(dates):
            """Truncate the first date of a group to the top of the hour."""
            date = dates.iloc[0]
            # If data came from a csv file, the date is a string rather than a
            # datetime object; which format parses depends on the NumPy/pandas
            # version (with or without a '+00:00' offset suffix).
            if isinstance(date, str):
                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                except ValueError:  # strptime raises ValueError on mismatch
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S+00:00')
            return datetime(date.year, date.month, date.day, date.hour)

        def last(values):
            """Return the last value in a group (kept for agg use)."""
            return values.iloc[-1]

        aggregate = {datename: hour}
        if 'air_temp_high_6_hour' in vars_request and 'air_temp_low_6_hour' in vars_request:
            aggregate['air_temp_high_6_hour'] = np.max
            aggregate['air_temp_low_6_hour'] = np.min
        # NOTE(review): the dict-of-name->func renaming form of .agg on a
        # single column was removed in pandas >= 1.0 -- confirm the pinned
        # pandas version supports it.
        aggregate['air_temp'] = {
            'air_temp_max': np.max,
            'air_temp_min': np.min
        }
        if 'precip_accum_six_hour' in vars_request:
            aggregate['precip_accum_six_hour'] = np.max
        aggregate['wind_speed'] = np.max
        aggregate['precip_accum_one_hour'] = np.max

        if config['verbose']:
            print(
                'verification: grouping data by hour for hourly observations')
        # Note that obs in hour H are reported at hour H, not H+1
        obs_hourly = obspd.groupby([
            pd.DatetimeIndex(obspd[datename]).year,
            pd.DatetimeIndex(obspd[datename]).month,
            pd.DatetimeIndex(obspd[datename]).day,
            pd.DatetimeIndex(obspd[datename]).hour
        ]).agg(aggregate)
        # Rename columns: flatten the ('air_temp', 'air_temp_max'/'_min')
        # MultiIndex created by the dict aggregation above.
        col_names = obs_hourly.columns.values
        col_names_new = []
        for c in range(len(col_names)):
            if col_names[c][0] == 'air_temp':
                col_names_new.append(col_names[c][1])
            else:
                col_names_new.append(col_names[c][0])

        obs_hourly.columns = col_names_new

        # Daily
        def day(dates):
            """Truncate the first date of a group to midnight."""
            date = dates.iloc[0]
            # Same string-vs-datetime handling as hour() above.
            if isinstance(date, str):
                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S+00:00')
            return datetime(date.year, date.month, date.day)

        def min_or_nan(a):
            '''
            Returns the minimum of a 1D array if there are at least 4 non-NaN values, and returns NaN otherwise. This is to ensure 
            having NaNs on days with incomplete data when grouping into daily data rather than incorrect data.
            '''
            if np.count_nonzero(~np.isnan(a)) < 4:  # incomplete data
                return np.nan
            else:
                return np.min(a)

        def max_or_nan(a):
            '''
            Returns the maximum of a 1D array if there are at least 4 non-NaN values, and returns NaN otherwise. This is to ensure 
            having NaNs on days with incomplete data when grouping into daily data rather than incorrect data.
            '''
            if np.count_nonzero(~np.isnan(a)) < 4:  # incomplete data
                return np.nan
            else:
                return np.max(a)

        # Re-use the aggregation dict for daily grouping of the hourly frame.
        aggregate[datename] = day
        aggregate['air_temp_min'] = np.min
        aggregate['air_temp_max'] = np.max
        aggregate['air_temp_low_6_hour'] = min_or_nan
        aggregate['air_temp_high_6_hour'] = max_or_nan
        aggregate['precip_accum_one_hour'] = np.sum
        aggregate['precip_accum_six_hour'] = np.sum

        # 'air_temp' was flattened into air_temp_max/air_temp_min above, so it
        # no longer exists as a column (pop with default replaces the previous
        # bare try/except).
        aggregate.pop('air_temp', None)

        if config['verbose']:
            print('verification: grouping data by day for daily verifications')

        obs_daily = obs_hourly.groupby([
            pd.DatetimeIndex(obs_hourly[datename]).year,
            pd.DatetimeIndex(obs_hourly[datename]).month,
            pd.DatetimeIndex(obs_hourly[datename]).day
        ]).agg(aggregate)
        obs_hourly_copy = obs_hourly.copy()
        obs_hourly_copy.set_index(datename, inplace=True)

        if config['verbose']:
            print(
                'verification: checking matching dates for daily obs and CF6')
        if use_climo:
            try:
                climo_values = _climo(config, station_id, dates)
            except BaseException as e:
                if config['verbose']:
                    print(
                        "verification: warning: '%s' while reading climo data"
                        % str(e))
                climo_values = {}
        else:
            if config['verbose']:
                print('verification: not using climo.')
            climo_values = {}
        if use_cf6:
            # BUG FIX: '/' is float division in Python 3; num_months is a
            # count of monthly CF6 files, so use integer division.
            num_months = min((datetime.utcnow() - dates[0]).days // 30, 24)
            try:
                get_cf6_files(config, station_id, num_months)
            except BaseException as e:
                if config['verbose']:
                    print(
                        "verification: warning: '%s' while getting CF6 files" %
                        str(e))
            try:
                cf6_values = _cf6(config, station_id)
            except BaseException as e:
                if config['verbose']:
                    print(
                        "verification: warning: '%s' while reading CF6 files" %
                        str(e))
                cf6_values = {}
        else:
            if config['verbose']:
                print('verification: not using CF6.')
            cf6_values = {}
        climo_values.update(cf6_values)  # CF6 has precedence
        count_rows = 0
        for index, row in obs_daily.iterrows():
            date = row[datename]
            use_cf6_precip = False
            # Prefer the 6-hour high/low/precip variables when they were
            # requested; otherwise fall back to the hourly-derived ones.
            if 'air_temp_high_6_hour' in vars_request:
                max_temp_var = 'air_temp_high_6_hour'
            else:
                max_temp_var = 'air_temp_max'

            if 'air_temp_low_6_hour' in vars_request:
                min_temp_var = 'air_temp_low_6_hour'
            else:
                min_temp_var = 'air_temp_min'

            if 'precip_accum_six_hour' in vars_request:
                precip_var = 'precip_accum_six_hour'
            else:
                precip_var = 'precip_accum_one_hour'

            obs_max_temp = row[max_temp_var]
            obs_min_temp = row[min_temp_var]
            obs_wind = row['wind_speed']
            obs_precip = round(row[precip_var], 2)
            # If high or low temperature is missing, chances are some
            # precipitation data is missing too.
            if np.isnan(obs_max_temp) and np.isnan(obs_min_temp):
                use_cf6_precip = True

            # Check for missing or incorrect 6-hour precipitation amounts. If there are any, use sum of 1-hour precipitation amounts if none are missing.
            skip_date = False
            if 'precip_accum_six_hour' in vars_request:  # 6-hour precipitation amounts were used
                daily_precip = 0.0
                # Check the 4 times which should have 6-hour precipitation
                # amounts. (Loop variable renamed from 'hour' so it no longer
                # shadows the hour() helper above.)
                for hr in [5, 11, 17, 23]:
                    try:
                        obs_6hr_precip = round(
                            obs_hourly_copy['precip_accum_six_hour'][
                                pd.Timestamp(date.year, date.month, date.day,
                                             hr)], 2)
                    except KeyError:  # incomplete data for date
                        skip_date = True
                        use_cf6_precip = True
                        break
                    if np.isnan(obs_6hr_precip):
                        obs_6hr_precip = 0.0
                    sum_hourly_precip = 0.0
                    # Check and sum the 6 one-hour precipitation amounts that
                    # make up this 6-hour window.
                    for hr2 in range(hr - 5, hr + 1):
                        obs_hourly_precip = obs_hourly_copy[
                            'precip_accum_one_hour'][pd.Timestamp(
                                date.year, date.month, date.day, hr2)]
                        # Missing 1-hour amount: fall back to cf6/climo value.
                        if np.isnan(obs_hourly_precip):
                            use_cf6_precip = True
                        else:
                            sum_hourly_precip += round(obs_hourly_precip, 2)
                    # Missing or incorrect 6-hour amount but 1-hour amounts OK
                    if sum_hourly_precip > obs_6hr_precip and not use_cf6_precip:
                        obs_6hr_precip = round(sum_hourly_precip, 2)
                    daily_precip += round(obs_6hr_precip, 2)
                if (round(daily_precip, 2) > round(obs_precip, 2)
                        and not use_cf6_precip):
                    print(
                        'verification: warning: incorrect obs precip of %0.2f for %s, using summed one hour accumulation value of %0.2f'
                        % (obs_precip, date, daily_precip))
                    obs_daily.loc[index,
                                  'precip_accum_six_hour'] = daily_precip
            else:  # 1-hour precipitation amounts were used
                for hr in range(24):
                    try:
                        obs_hourly_precip = obs_hourly_copy[
                            'precip_accum_one_hour'][pd.Timestamp(
                                date.year, date.month, date.day, hr)]
                    except KeyError:  # incomplete data for date
                        skip_date = True
                        break
                    if np.isnan(obs_hourly_precip):
                        use_cf6_precip = True
            if skip_date:
                # Incomplete day: blank out all verification values so the
                # day is omitted downstream.
                obs_daily.loc[index, max_temp_var] = np.nan
                obs_daily.loc[index, min_temp_var] = np.nan
                obs_daily.loc[index, 'wind_speed'] = np.nan
                obs_daily.loc[index, precip_var] = np.nan
            if date in climo_values.keys() and not skip_date:
                count_rows += 1
                cf6_max_temp = climo_values[date]['max_temp']
                cf6_min_temp = climo_values[date]['min_temp']
                cf6_wind = climo_values[date]['wind']
                cf6_precip = climo_values[date]['precip']
                # -900/900 are the CF6 missing-value sentinels for temps.
                if not (np.isnan(cf6_max_temp)
                        ) and cf6_max_temp > -900.0 and np.isnan(obs_max_temp):
                    print(
                        'verification: warning: missing obs max temp for %s, using cf6/climo value of %d'
                        % (date, round(cf6_max_temp, 0)))
                    obs_daily.loc[index, max_temp_var] = cf6_max_temp
                if not (np.isnan(cf6_min_temp)
                        ) and cf6_min_temp < 900.0 and np.isnan(obs_min_temp):
                    print(
                        'verification: warning: missing obs min temp for %s, using cf6/climo value of %d'
                        % (date, round(cf6_min_temp, 0)))
                    obs_daily.loc[index, min_temp_var] = cf6_min_temp
                if not (np.isnan(cf6_wind)):
                    if obs_wind > cf6_wind and obs_wind < cf6_wind + 10:
                        print(
                            'verification: warning: obs wind for %s larger than wind from cf6/climo; using obs'
                            % date)
                    else:
                        obs_daily.loc[index, 'wind_speed'] = cf6_wind
                else:
                    count_rows -= 1
                if not (np.isnan(cf6_precip)
                        ) and cf6_precip > -900.0 and use_cf6_precip and round(
                            cf6_precip, 2) > round(obs_precip, 2):
                    print(
                        'verification: warning: incorrect obs precip of %0.2f for %s, using cf6/climo value of %0.2f'
                        % (obs_precip, date, cf6_precip))
                    obs_daily.loc[index, precip_var] = cf6_precip
        if config['verbose']:
            print('verification: found %d matching rows.' % count_rows)

        # Round temperatures/wind to whole numbers and precip to 2 decimals.
        round_dict = {'wind_speed': 0}
        if 'air_temp_high_6_hour' in vars_request:
            round_dict['air_temp_high_6_hour'] = 0
        if 'air_temp_low_6_hour' in vars_request:
            round_dict['air_temp_low_6_hour'] = 0
        round_dict['air_temp_max'] = 0
        round_dict['air_temp_min'] = 0
        if 'precip_accum_six_hour' in vars_request:
            round_dict['precip_accum_six_hour'] = 2
        round_dict['precip_accum_one_hour'] = 2
        obs_daily = obs_daily.round(round_dict)

        # Generation of final output data
        if config['verbose']:
            print('verification: generating final verification dictionary...')
        if 'air_temp_high_6_hour' in vars_request:
            obs_daily.rename(columns={'air_temp_high_6_hour': 'Tmax'},
                             inplace=True)
        else:
            obs_daily.rename(columns={'air_temp_max': 'Tmax'}, inplace=True)
        if 'air_temp_low_6_hour' in vars_request:
            obs_daily.rename(columns={'air_temp_low_6_hour': 'Tmin'},
                             inplace=True)
        else:
            obs_daily.rename(columns={'air_temp_min': 'Tmin'}, inplace=True)
        if 'precip_accum_six_hour' in vars_request:
            obs_daily.rename(columns={'precip_accum_six_hour': 'Rain'},
                             inplace=True)
        else:
            obs_daily.rename(columns={'precip_accum_one_hour': 'Rain'},
                             inplace=True)
        obs_daily.rename(columns={'wind_speed': 'Wind'}, inplace=True)

        # Deal with the rain depending on the type of forecast requested
        obs_daily['Rain'].fillna(0.0, inplace=True)
        if config['Model'][
                'rain_forecast_type'] == 'pop' and not force_rain_quantity:
            obs_daily.loc[:, 'Rain'] = pop_rain(obs_daily['Rain'])
        elif config['Model'][
                'rain_forecast_type'] == 'categorical' and not force_rain_quantity:
            obs_daily.loc[:, 'Rain'] = categorical_rain(obs_daily['Rain'])

        # Set the date time index and retain only desired columns
        obs_daily = obs_daily.rename(columns={datename: 'date_time'})
        obs_daily = obs_daily.set_index('date_time')
        if config['verbose']:
            print('verification: -> exporting to %s' % output_file)
        export_cols = ['Tmax', 'Tmin', 'Wind', 'Rain']
        for col in obs_daily.columns:
            if col not in export_cols:
                # Explicit axis keyword (positional axis arg is deprecated).
                obs_daily.drop(col, axis=1, inplace=True)

        # If a time series is desired, then get hourly data from csv file
        if config['Model']['predict_timeseries']:
            obs_hourly_verify = all_obspd[[
                'date_time', 'air_temp', 'relative_humidity', 'wind_speed',
                'precip_accum_one_hour'
            ]]  # subset of data used as verification

            # Fix rainfall for categorical and time accumulation
            rain_column = 'precip_last_%d_hour' % config['time_series_interval']
            obs_hourly_verify.rename(
                columns={'precip_accum_one_hour': rain_column}, inplace=True)
            if config['Model'][
                    'rain_forecast_type'] == 'pop' and not force_rain_quantity:
                if config['verbose']:
                    print("verification: using 'pop' rain")
                obs_hourly_verify.loc[:, rain_column] = pop_rain(
                    obs_hourly_verify[rain_column])
                use_rain_max = True
            elif config['Model'][
                    'rain_forecast_type'] == 'categorical' and not force_rain_quantity:
                if config['verbose']:
                    print("verification: using 'categorical' rain")
                obs_hourly_verify.loc[:, rain_column] = categorical_rain(
                    obs_hourly_verify[rain_column])
                use_rain_max = True
            else:
                use_rain_max = False

        # Export final data
        export_dict = OrderedDict()
        for date in dates:
            try:
                day_dict = obs_daily.loc[date].to_dict(into=OrderedDict)
            except KeyError:
                continue
            if np.any(np.isnan(list(day_dict.values()))):
                if config['verbose']:
                    print(
                        'verification: warning: omitting day %s; missing data'
                        % date)
                continue  # No verification can have missing values
            if config['Model']['predict_timeseries']:
                start = pd.Timestamp(date + timedelta(
                    hours=(config['forecast_hour_start'] -
                           config['time_series_interval'])))
                end = pd.Timestamp(date + timedelta(
                    hours=config['forecast_hour_start'] + 24))
                try:
                    series = reindex_hourly(obs_hourly_verify,
                                            start,
                                            end,
                                            config['time_series_interval'],
                                            use_rain_max=use_rain_max)
                except KeyError:
                    # No values for the day
                    if config['verbose']:
                        print(
                            'verification: warning: omitting day %s; missing data'
                            % date)
                    continue
                if series.isnull().values.any():
                    if config['verbose']:
                        print(
                            'verification: warning: omitting day %s; missing data'
                            % date)
                    continue
                series_dict = OrderedDict(series.to_dict(into=OrderedDict))
                day_dict.update(series_dict)
            export_dict[date] = day_dict
        with open(output_file, 'wb') as handle:
            # Protocol 2 keeps the pickle readable by Python 2 consumers.
            pickle.dump(export_dict, handle, protocol=2)

    return
示例#3
0
def obs(config,
        output_files=None,
        csv_files=None,
        num_hours=24,
        interval=3,
        use_nan_sounding=False,
        use_existing_sounding=True):
    """
    Generates observation data from MesoWest and UCAR soundings and saves to a file, which can later be retrieved for
    either training data or model run data.

    :param config:
    :param output_files: str: output file path if just one station, or list of output file paths if multiple stations
    :param csv_files: str: path to csv file containing observations if just one station, or list of paths to csv files if multiple stations
    :param num_hours: int: number of hours to retrieve obs
    :param interval: int: retrieve obs every 'interval' hours
    :param use_nan_sounding: bool: if True, uses a sounding of NaNs rather than omitting a day if sounding is missing
    :param use_existing_sounding: bool: if True, preferentially uses saved soundings in sounding_data_dir
    :return:
    """
    if config['multi_stations']:  # Train on multiple stations
        station_ids = config['station_id']
        # File lists must pair 1:1 with the station list. Only check when a
        # list was actually given: None falls through to the per-station
        # default paths built inside the loop below (the original code called
        # len(None) here and raised TypeError instead of using the defaults).
        if output_files is not None and len(station_ids) != len(output_files):
            raise ValueError(
                "There must be the same number of output files as station IDs")
        if csv_files is not None and len(station_ids) != len(csv_files):
            raise ValueError(
                "There must be the same number of csv files as station IDs")
    else:
        # Normalize the single-station case to one-element lists so the loop
        # below handles both modes uniformly.
        station_ids = [config['station_id']]
        if output_files is not None:
            output_files = [output_files]
        if csv_files is not None:
            csv_files = [csv_files]

    for i, station_id in enumerate(station_ids):
        if output_files is None:
            output_file = '%s/%s_obs.pkl' % (config['SITE_ROOT'], station_id)
        else:
            output_file = output_files[i]

        if csv_files is None:
            csv_file = '%s/%s_obs.csv' % (config['SITE_ROOT'], station_id)
        else:
            csv_file = csv_files[i]

        # Back up the start date so the first day still has num_hours of
        # history available.
        start_date = datetime.strptime(config['data_start_date'],
                                       '%Y%m%d') - timedelta(hours=num_hours)
        dates = generate_dates(config)
        api_dates = generate_dates(config, api=True, start_date=start_date)

        # Retrieve station data
        if not os.path.exists(csv_file):  # no observations saved yet
            # Variables always requested from the API
            vars_request = [
                'air_temp', 'altimeter', 'precip_accum_one_hour',
                'relative_humidity', 'wind_speed', 'wind_direction'
            ]
            # Variables requested only if the station actually reports them
            vars_option = [
                'air_temp_low_6_hour', 'air_temp_high_6_hour',
                'precip_accum_six_hour'
            ]
            m = Meso(token=config['meso_token'])
            if config['verbose']:
                # Report the station handled in this iteration (the original
                # printed config['station_id'], which is a list in
                # multi-station mode).
                print('obs: MesoPy initialized for station %s' % station_id)
                print('obs: retrieving latest obs and metadata')
            latest = m.latest(stid=station_id)
            obs_list = list(latest['STATION'][0]['SENSOR_VARIABLES'].keys())

            # Add optional variables to the api request if they exist
            if config['verbose']:
                print('obs: searching for 6-hourly variables...')
            for var in vars_option:
                if var in obs_list:
                    if config['verbose']:
                        print('obs: found variable %s, adding to data' % var)
                    vars_request += [var]

            # Comma-separated variable list for the API request
            vars_api = ','.join(vars_request)

            # Units
            units = 'temp|f,precip|in,speed|kts'
            all_obs_hourly = get_obs_hourly(config, station_id, api_dates,
                                            vars_api, units)
            try:
                all_obs_hourly.to_csv(csv_file)
                if config['verbose']:
                    print('obs: saving observations to csv file succeeded')
                # Persist the variable list so the csv branch below can
                # reconstruct exactly what was requested.
                with open(
                        '%s/%s_obs_vars_request.txt' %
                    (config['SITE_ROOT'], station_id), 'wb') as fp:
                    pickle.dump(vars_request, fp, protocol=2)
                if config['verbose']:
                    print(
                        'obs: saving vars request list to txt file succeeded')
            except BaseException as e:
                # Best-effort caching: a failed save is reported but does not
                # abort the run.
                if config['verbose']:
                    print("obs: warning: '%s' while saving observations" %
                          str(e))
            obs_hourly = all_obs_hourly[[
                'air_temp', 'altimeter', 'precip_accum_one_hour',
                'relative_humidity', 'wind_speed', 'wind_direction'
            ]]  # subset of data used as predictors
        else:
            if config['verbose']:
                print('obs: obtaining observations from csv file')
            all_obs_hourly = pd.read_csv(csv_file)
            with open(
                    '%s/%s_obs_vars_request.txt' %
                (config['SITE_ROOT'], station_id), 'rb') as fp:
                vars_request = pickle.load(fp)
            obs_hourly = all_obs_hourly[[
                'date_time', 'air_temp', 'altimeter', 'precip_accum_one_hour',
                'relative_humidity', 'wind_speed', 'wind_direction'
            ]]  # subset of data used as predictors

        # Retrieve upper-air sounding data
        soundings = OrderedDict()
        if config['Obs']['use_soundings']:
            if config['verbose']:
                print('obs: retrieving upper-air sounding data')
            for date in dates:
                soundings[date] = OrderedDict()
                start_date = date - timedelta(
                    days=1)  # get the previous day's soundings
                for hour in [0, 12]:
                    sounding_date = start_date + timedelta(hours=hour)
                    try:
                        # NOTE(review): sounding_station_id is not defined in
                        # this scope -- presumably a module-level name or a
                        # config value; confirm upstream or this raises
                        # NameError when use_soundings is enabled.
                        sounding = upper_air(
                            config,
                            station_id,
                            sounding_station_id,
                            sounding_date,
                            use_nan_sounding,
                            use_existing=use_existing_sounding)
                        soundings[date][sounding_date] = sounding
                    except Exception:
                        # A day with any missing sounding is dropped entirely.
                        print(
                            'obs: warning: problem retrieving soundings for %s'
                            % datetime.strftime(date, '%Y%m%d'))
                        soundings.pop(date)
                        break

        # Create dictionary of days
        if config['verbose']:
            print('obs: converting to output dictionary')
        obs_export = OrderedDict({'SFC': OrderedDict(), 'SNDG': OrderedDict()})
        for date in dates:
            if config['Obs']['use_soundings'] and date not in soundings.keys():
                continue
            # Need to ensure we use the right intervals to have 22:5? Z obs
            start = pd.Timestamp((date - timedelta(hours=num_hours)))
            end = pd.Timestamp(date)
            obs_export['SFC'][date] = reindex_hourly(
                obs_hourly, start, end, interval,
                end_23z=True).to_dict(into=OrderedDict)
            if config['Obs']['use_soundings']:
                obs_export['SNDG'][date] = soundings[date]

        # Export final data
        if config['verbose']:
            print('obs: -> exporting to %s' % output_file)
        with open(output_file, 'wb') as handle:
            pickle.dump(obs_export, handle, protocol=2)

    return
示例#4
0
def verification(config,
                 output_file=None,
                 use_cf6=True,
                 use_climo=True,
                 force_rain_quantity=False):
    """
    Generates verification data from MesoWest and saves to a file, which is used to train the model and check test
    results.

    :param config:
    :param output_file: str: path to output file
    :param use_cf6: bool: if True, uses wind values from CF6 files
    :param use_climo: bool: if True, uses wind values from NCDC climatology
    :param force_rain_quantity: if True, returns the actual quantity of rain (rather than POP); useful for validation
    files
    :return:
    """
    if output_file is None:
        output_file = '%s/%s_verif.pkl' % (config['SITE_ROOT'],
                                           config['station_id'])

    dates = generate_dates(config)
    api_dates = generate_dates(config,
                               api=True,
                               api_add_hour=config['forecast_hour_start'] + 24)

    # Read new data for daily values
    m = Meso(token=config['meso_token'])

    if config['verbose']:
        print('verification: MesoPy initialized for station %s' %
              config['station_id'])
        print('verification: retrieving latest obs and metadata')
    latest = m.latest(stid=config['station_id'])
    obs_list = list(latest['STATION'][0]['SENSOR_VARIABLES'].keys())

    # Variables always requested, plus 6-hourly ones added only if the
    # station actually reports them
    vars_request = ['air_temp', 'wind_speed', 'precip_accum_one_hour']
    vars_option = [
        'air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour'
    ]

    # Add variables to the api request if they exist
    if config['verbose']:
        print('verification: searching for 6-hourly variables...')
    for var in vars_option:
        if var in obs_list:
            if config['verbose']:
                print('verification: found variable %s, adding to data' % var)
            vars_request += [var]
    # Comma-separated variable list for the API request
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve data
    obspd = pd.DataFrame()
    for api_date in api_dates:
        if config['verbose']:
            # api_date is a (start, end) pair, hence the tuple formatting
            print('verification: retrieving data from %s to %s' % api_date)
        obs = m.timeseries(stid=config['station_id'],
                           start=api_date[0],
                           end=api_date[1],
                           vars=vars_api,
                           units=units)
        obspd = pd.concat(
            (obspd, pd.DataFrame.from_dict(obs['STATION'][0]['OBSERVATIONS'])),
            ignore_index=True)

    # Rename columns to requested vars: the API reports sensor-specific
    # column names; map each back to the variable name we asked for.
    obs_var_names = obs['STATION'][0]['SENSOR_VARIABLES']
    obs_var_keys = list(obs_var_names.keys())
    col_names = list(map(''.join, obspd.columns.values))
    for c in range(len(col_names)):
        col = col_names[c]
        for k in range(len(obs_var_keys)):
            key = obs_var_keys[k]
            if col == list(obs_var_names[key].keys())[0]:
                col_names[c] = key
    obspd.columns = col_names

    # Make sure we have columns for all requested variables
    for var in vars_request:
        if var not in col_names:
            obspd = obspd.assign(**{var: np.nan})

    # Change datetime column to datetime object, subtract 6 hours to use 6Z days
    if config['verbose']:
        print('verification: setting time back %d hours for daily statistics' %
              config['forecast_hour_start'])
    dateobj = pd.to_datetime(
        obspd['date_time']) - timedelta(hours=config['forecast_hour_start'])
    obspd['date_time'] = dateobj
    datename = 'date_time_minus_%d' % config['forecast_hour_start']
    obspd = obspd.rename(columns={'date_time': datename})

    # Reformat data into hourly and daily
    # Hourly
    def hour(dates):
        # Truncate a group's timestamps to the hour of its first entry.
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day, date.hour)

    aggregate = {datename: hour}
    if 'air_temp_high_6_hour' in vars_request and 'air_temp_low_6_hour' in vars_request:
        aggregate['air_temp_high_6_hour'] = np.max
        aggregate['air_temp_low_6_hour'] = np.min
    # NOTE(review): nested-dict aggregation (renaming via {'col': {'new': fn}})
    # was removed in pandas >= 1.0; this only works on older pandas -- confirm
    # the pinned version.
    aggregate['air_temp'] = {'air_temp_max': np.max, 'air_temp_min': np.min}
    if 'precip_accum_six_hour' in vars_request:
        aggregate['precip_accum_six_hour'] = np.max
    aggregate['wind_speed'] = np.max
    aggregate['precip_accum_one_hour'] = np.max

    if config['verbose']:
        print('verification: grouping data by hour for hourly observations')
    # Note that obs in hour H are reported at hour H, not H+1
    obs_hourly = obspd.groupby([
        pd.DatetimeIndex(obspd[datename]).year,
        pd.DatetimeIndex(obspd[datename]).month,
        pd.DatetimeIndex(obspd[datename]).day,
        pd.DatetimeIndex(obspd[datename]).hour
    ]).agg(aggregate)
    # Rename columns: flatten the ('air_temp', 'air_temp_max'/'air_temp_min')
    # MultiIndex produced by the nested aggregation above.
    col_names = obs_hourly.columns.values
    col_names_new = []
    for c in range(len(col_names)):
        if col_names[c][0] == 'air_temp':
            col_names_new.append(col_names[c][1])
        else:
            col_names_new.append(col_names[c][0])

    obs_hourly.columns = col_names_new

    # Daily
    def day(dates):
        # Truncate a group's timestamps to the day of its first entry.
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day)

    aggregate[datename] = day
    aggregate['air_temp_min'] = np.min
    aggregate['air_temp_max'] = np.max
    aggregate['precip_accum_six_hour'] = np.sum
    # 'air_temp' was flattened into min/max columns above; drop its agg entry
    # if present (no-op when absent).
    aggregate.pop('air_temp', None)

    if config['verbose']:
        print('verification: grouping data by day for daily verifications')
    obs_daily = obs_hourly.groupby([
        pd.DatetimeIndex(obs_hourly[datename]).year,
        pd.DatetimeIndex(obs_hourly[datename]).month,
        pd.DatetimeIndex(obs_hourly[datename]).day
    ]).agg(aggregate)

    if config['verbose']:
        print('verification: checking matching dates for daily obs and CF6')
    if use_climo:
        try:
            climo_values = _climo_wind(config, dates)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while reading climo data" %
                      str(e))
            climo_values = {}
    else:
        if config['verbose']:
            print('verification: not using climo.')
        climo_values = {}
    if use_cf6:
        # Whole months of CF6 files to fetch, capped at 24. Floor division:
        # Python 3 true division produced a float month count here.
        num_months = min((datetime.utcnow() - dates[0]).days // 30, 24)
        try:
            get_cf6_files(config, num_months)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while getting CF6 files" %
                      str(e))
        try:
            cf6_values = _cf6_wind(config)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while reading CF6 files" %
                      str(e))
            cf6_values = {}
    else:
        if config['verbose']:
            print('verification: not using CF6.')
        cf6_values = {}
    climo_values.update(cf6_values)  # CF6 has precedence
    count_rows = 0
    for index, row in obs_daily.iterrows():
        date = row[datename]
        if date in climo_values.keys():
            count_rows += 1
            obs_wind = row['wind_speed']
            cf6_wind = climo_values[date]['wind']
            if not (np.isnan(cf6_wind)):
                # Trust obs when it exceeds cf6/climo by 5+ kt; otherwise
                # prefer the cf6/climo wind.
                if obs_wind - cf6_wind >= 5:
                    print(
                        'verification: warning: obs wind for %s much larger than wind from cf6/climo; using obs'
                        % date)
                else:
                    obs_daily.loc[index, 'wind_speed'] = cf6_wind
            else:
                count_rows -= 1
    if config['verbose']:
        print('verification: found %d matching rows.' % count_rows)

    # Round: temperatures/wind to whole units, precip to hundredths
    round_dict = {'wind_speed': 0}
    if 'air_temp_high_6_hour' in vars_request:
        round_dict['air_temp_high_6_hour'] = 0
    if 'air_temp_low_6_hour' in vars_request:
        round_dict['air_temp_low_6_hour'] = 0
    round_dict['air_temp_max'] = 0
    round_dict['air_temp_min'] = 0
    if 'precip_accum_six_hour' in vars_request:
        round_dict['precip_accum_six_hour'] = 2
    round_dict['precip_accum_one_hour'] = 2
    obs_daily = obs_daily.round(round_dict)

    # Generation of final output data: prefer 6-hourly stats when available
    if config['verbose']:
        print('verification: generating final verification dictionary...')
    if 'air_temp_high_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_high_6_hour': 'Tmax'},
                         inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_max': 'Tmax'}, inplace=True)
    if 'air_temp_low_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_low_6_hour': 'Tmin'}, inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_min': 'Tmin'}, inplace=True)
    if 'precip_accum_six_hour' in vars_request:
        obs_daily.rename(columns={'precip_accum_six_hour': 'Rain'},
                         inplace=True)
    else:
        obs_daily.rename(columns={'precip_accum_one_hour': 'Rain'},
                         inplace=True)
    obs_daily.rename(columns={'wind_speed': 'Wind'}, inplace=True)

    # Deal with the rain depending on the type of forecast requested.
    # Missing rain is treated as zero accumulation.
    obs_daily['Rain'] = obs_daily['Rain'].fillna(0.0)
    if config['Model'][
            'rain_forecast_type'] == 'pop' and not force_rain_quantity:
        obs_daily.loc[:, 'Rain'] = pop_rain(obs_daily['Rain'])
    elif config['Model'][
            'rain_forecast_type'] == 'categorical' and not force_rain_quantity:
        obs_daily.loc[:, 'Rain'] = categorical_rain(obs_daily['Rain'])

    # Set the date time index and retain only desired columns
    obs_daily = obs_daily.rename(columns={datename: 'date_time'})
    obs_daily = obs_daily.set_index('date_time')
    if config['verbose']:
        print('verification: -> exporting to %s' % output_file)
    export_cols = ['Tmax', 'Tmin', 'Wind', 'Rain']
    for col in obs_daily.columns:
        if col not in export_cols:
            # Keyword axis: the positional form drop(col, 1) was removed in
            # modern pandas.
            obs_daily.drop(col, axis=1, inplace=True)

    # If a time series is desired, then get hourly data
    if config['Model']['predict_timeseries']:

        # Look for desired variables
        vars_request = [
            'air_temp', 'relative_humidity', 'wind_speed',
            'precip_accum_one_hour'
        ]

        # Add variables to the api request
        vars_api = ','.join(vars_request)

        # Units
        units = 'temp|f,precip|in,speed|kts'

        # Retrieve data
        obs_hourly_verify = get_obs_hourly(config, api_dates, vars_api, units)

        # Fix rainfall for categorical and time accumulation
        rain_column = 'precip_last_%d_hour' % config['time_series_interval']
        obs_hourly_verify.rename(
            columns={'precip_accum_one_hour': rain_column}, inplace=True)
        if config['Model'][
                'rain_forecast_type'] == 'pop' and not force_rain_quantity:
            if config['verbose']:
                print("verification: using 'pop' rain")
            obs_hourly_verify.loc[:, rain_column] = pop_rain(
                obs_hourly_verify[rain_column])
            use_rain_max = True
        elif config['Model'][
                'rain_forecast_type'] == 'categorical' and not force_rain_quantity:
            if config['verbose']:
                print("verification: using 'categorical' rain")
            obs_hourly_verify.loc[:, rain_column] = categorical_rain(
                obs_hourly_verify[rain_column])
            use_rain_max = True
        else:
            use_rain_max = False

    # Export final data
    export_dict = OrderedDict()
    for date in dates:
        try:
            day_dict = obs_daily.loc[date].to_dict(into=OrderedDict)
        except KeyError:
            continue
        # list() is required: np.isnan cannot consume a dict view on
        # Python 3 (raises TypeError).
        if np.any(np.isnan(list(day_dict.values()))):
            if config['verbose']:
                print('verification: warning: omitting day %s; missing data' %
                      date)
            continue  # No verification can have missing values
        if config['Model']['predict_timeseries']:
            start = pd.Timestamp(date + timedelta(
                hours=(config['forecast_hour_start'] -
                       config['time_series_interval'])))
            end = pd.Timestamp(date + timedelta(
                hours=config['forecast_hour_start'] + 24))
            try:
                series = reindex_hourly(obs_hourly_verify,
                                        start,
                                        end,
                                        config['time_series_interval'],
                                        use_rain_max=use_rain_max)
            except KeyError:
                # No values for the day
                if config['verbose']:
                    print(
                        'verification: warning: omitting day %s; missing data'
                        % date)
                continue
            if series.isnull().values.any():
                if config['verbose']:
                    print(
                        'verification: warning: omitting day %s; missing data'
                        % date)
                continue
            series_dict = OrderedDict(series.to_dict(into=OrderedDict))
            day_dict.update(series_dict)
        export_dict[date] = day_dict
    with open(output_file, 'wb') as handle:
        pickle.dump(export_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return
示例#5
0
def obs(config,
        output_file=None,
        num_hours=24,
        interval=3,
        use_nan_sounding=False,
        use_existing_sounding=True):
    """
    Generates observation data from MesoWest and UCAR soundings and saves to a file, which can later be retrieved for
    either training data or model run data.

    :param config:
    :param output_file: str: output file path
    :param num_hours: int: number of hours to retrieve obs
    :param interval: int: retrieve obs every 'interval' hours
    :param use_nan_sounding: bool: if True, uses a sounding of NaNs rather than omitting a day if sounding is missing
    :param use_existing_sounding: bool: if True, preferentially uses saved soundings in sounding_data_dir
    :return:
    """
    if output_file is None:
        output_file = '%s/%s_obs.pkl' % (config['SITE_ROOT'],
                                         config['station_id'])

    # Back up the start date so the first day still has num_hours of history
    # available.
    start_date = datetime.strptime(config['data_start_date'],
                                   '%Y%m%d') - timedelta(hours=num_hours)
    dates = generate_dates(config)
    api_dates = generate_dates(config, api=True, start_date=start_date)

    # Look for desired variables
    vars_request = [
        'air_temp', 'altimeter', 'precip_accum_one_hour', 'relative_humidity',
        'wind_speed', 'wind_direction'
    ]

    # Comma-separated variable list for the API request
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve station data
    obs_hourly = get_obs_hourly(config, api_dates, vars_api, units)

    # Retrieve upper-air sounding data
    if config['verbose']:
        print('obs: retrieving upper-air sounding data')
    soundings = OrderedDict()
    if config['Obs']['use_soundings']:
        for date in dates:
            soundings[date] = OrderedDict()
            start_date = date - timedelta(
                days=1)  # get the previous day's soundings
            for hour in [0, 12]:
                sounding_date = start_date + timedelta(hours=hour)
                try:
                    sounding = upper_air(sounding_date,
                                         use_nan_sounding,
                                         use_existing=use_existing_sounding)
                    soundings[date][sounding_date] = sounding
                except Exception:
                    # A day with any missing sounding is dropped entirely.
                    print('obs: warning: problem retrieving soundings for %s' %
                          datetime.strftime(date, '%Y%m%d'))
                    soundings.pop(date)
                    break

    # Create dictionary of days
    if config['verbose']:
        print('obs: converting to output dictionary')
    obs_export = OrderedDict({'SFC': OrderedDict(), 'SNDG': OrderedDict()})
    for date in dates:
        if config['Obs']['use_soundings'] and date not in soundings.keys():
            continue
        # Need to ensure we use the right intervals to have 22:5? Z obs
        start = pd.Timestamp((date - timedelta(hours=num_hours)))
        end = pd.Timestamp(date)
        obs_export['SFC'][date] = reindex_hourly(
            obs_hourly, start, end, interval,
            end_23z=True).to_dict(into=OrderedDict)
        if config['Obs']['use_soundings']:
            obs_export['SNDG'][date] = soundings[date]

    # Export final data
    if config['verbose']:
        print('obs: -> exporting to %s' % output_file)
    with open(output_file, 'wb') as handle:
        pickle.dump(obs_export, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return