def test_ensemble_prediction_is_stable_after_saving(model_name, fitted_model):
    """A save/load round trip must leave the predictions unchanged."""
    params_file = 'model_params.json'

    # Predictions straight from the fitted model (uses its fitting data).
    before = fitted_model.predict()

    fitted_model.save_params(params_file, overwrite=True)
    restored = utils.load_saved_model(params_file)

    # The restored model is handed the data explicitly.
    after = restored.predict(to_predict=obs, predictors=predictors)

    assert np.all(before == after)
def test_invalid_saved_model_type():
    """Loading from something that is not a valid model source is a TypeError."""
    not_a_model = 123
    with pytest.raises(TypeError):
        utils.load_saved_model(not_a_model)
def test_ensemble_save_load(model_name, fitted_model):
    """A saved and re-loaded model yields one prediction per observation."""
    params_file = 'model_params.json'
    fitted_model.save_params(params_file, overwrite=True)

    restored = utils.load_saved_model(params_file)
    predicted = restored.predict(obs, predictors)

    assert len(predicted) == len(obs)
def test_ensemble_do_not_predict_without_data(model_name, fitted_model):
    """A re-loaded model must refuse to predict when given no data.

    The re-loaded model is not fitted in this test, so calling
    ``predict()`` with no arguments is expected to raise ``TypeError``.
    """
    params_file = 'model_params.json'
    fitted_model.save_params(params_file, overwrite=True)
    restored = utils.load_saved_model(params_file)

    with pytest.raises(TypeError):
        restored.predict()
def test_save_and_load_model_universal_loader(model_name, fitted_model):
    """``utils.load_saved_model`` restores a model with identical parameters."""
    params_file = 'model_params.json'
    fitted_model.save_params(params_file, overwrite=True)

    restored = utils.load_saved_model(params_file)

    original_params = fitted_model.get_params()
    restored_params = restored.get_params()
    assert original_params == restored_params
def test_ensemble_parameters(model_name, fitted_model):
    """Parameters read back from disk equal those of the fitted model."""
    params_file = 'model_params.json'
    fitted_model.save_params(params_file, overwrite=True)

    restored = utils.load_saved_model(params_file)

    restored_params = restored.get_params()
    fitted_params = fitted_model.get_params()
    assert restored_params == fitted_params
forecast_metadata.to_dict('records')): time_elapsed = np.round((time.time() - start_time) / 60, 0) print( 'Hindcast date {d}/{D}, species {s}/{S}, elapsed time {t} minutes' .format(d=date_i, D=total_dates, s=species_i, S=total_species, t=time_elapsed)) species = forecast_info['species'] Phenophase_ID = forecast_info['Phenophase_ID'] model_file = config['phenology_model_folder'] + forecast_info[ 'model_file'] model = utils.load_saved_model(model_file) # only make predictions where this species is located. sites_for_this_species = species_sites.query( 'species == @species & Phenophase_ID == @Phenophase_ID') site_info_for_this_species = site_info_for_prediction[ site_info_for_prediction.site_id.isin( sites_for_this_species.site_id)] site_temp_for_this_species = site_temp[site_temp.site_id.isin( sites_for_this_species.site_id)] #print('{s} - {p}'.format(s=species, p=Phenophase_ID)) #print('################## site info ###################') #print(site_info_for_this_species) #print('################## site temp ###################') #print(site_temp_for_this_species) hindcasts_to_compute.append(
def run(climate_forecast_folder=None,
        phenology_forecast_folder=None,
        species_list=None):
    """Apply the saved phenology models to the current climate forecasts.

    For every species/phenophase that is currently in season, the matching
    saved model is loaded, applied to all climate forecast files, masked to
    the species range, and the per-species results are merged into a single
    dataset that is written to a netCDF file.

    Parameters
    ----------
    climate_forecast_folder : str, optional
        Folder prefix that is globbed with ``'*.nc'`` for climate forecast
        files. Defaults to ``config['current_forecast_folder']``.
    phenology_forecast_folder : str, optional
        Folder prefix the final forecast file is written to. Defaults to
        ``config['phenology_forecast_folder']``.
    species_list : pandas.DataFrame, optional
        Species to forecast. Must provide the columns ``species``,
        ``Phenophase_ID``, ``current_forecast_version``,
        ``season_start_doy`` and ``season_end_doy``. Defaults to the file
        named by ``config['species_list_file']``.

    Returns
    -------
    str
        Filename of the final forecast file.

    Raises
    ------
    RuntimeError
        If no species is currently in season, or if none of the in-season
        species could be forecast (e.g. none has a range mask).
    """
    divider = '#' * 90

    config = tools.load_config()

    current_season = tools.current_growing_season(config)
    current_season_doy_0 = str(int(current_season)) + '0101'
    current_season_doy_0 = tools.string_to_date(current_season_doy_0,
                                                h=False).date()

    today = datetime.datetime.today().date()
    current_doy = today.timetuple().tm_yday

    season_first_date = (str(int(current_season) - 1)
                         + config['season_month_begin']
                         + config['season_day_begin'])
    season_first_date = tools.string_to_date(season_first_date,
                                             h=False).date()

    # Once the season for spring forecasts has started (Nov 1) but the new
    # calendar year has not, shift the current doy so it is expressed
    # relative to Jan 1 of the following calendar year (possibly negative).
    if season_first_date <= today < current_season_doy_0:
        current_doy -= 365

    print(divider)
    print('Applying phenology models - ' + str(today))

    range_masks = xr.open_dataset(config['species_range_file'])

    doy_0 = np.datetime64(current_season_doy_0)

    # Default location of climate forecasts
    if not climate_forecast_folder:
        climate_forecast_folder = config['current_forecast_folder']

    current_climate_forecast_files = glob.glob(climate_forecast_folder
                                               + '*.nc')

    print(
        str(len(current_climate_forecast_files))
        + ' current climate forecast files: \n'
        + str(current_climate_forecast_files))

    # Load default species list if no special one was passed. A DataFrame
    # cannot be truth-tested (pandas raises ValueError), so it is checked
    # explicitly; other falsy values keep selecting the default as before.
    use_default_species = species_list is None or (
        not isinstance(species_list, pd.DataFrame) and not species_list)
    if use_default_species:
        species_list = pd.read_csv(config['species_list_file'])
        species_list = species_list[[
            'species', 'Phenophase_ID', 'current_forecast_version',
            'season_start_doy', 'season_end_doy'
        ]]

    # Only forecast species and phenophases in the current season
    in_season = ((current_doy >= species_list.season_start_doy)
                 & (current_doy <= species_list.season_end_doy))
    species_list = species_list[in_season]

    if len(species_list) == 0:
        raise RuntimeError(
            'No species currenly in season, which is roughly Dec. 1 - Nov. 1')

    phenology_model_metadata = pd.read_csv(
        config['phenology_model_metadata_file'])

    forecast_metadata = species_list.merge(
        phenology_model_metadata,
        left_on=['species', 'Phenophase_ID', 'current_forecast_version'],
        right_on=['species', 'Phenophase_ID', 'forecast_version'],
        how='left')

    # Default location to write phenology forecasts
    if not phenology_forecast_folder:
        phenology_forecast_folder = config['phenology_forecast_folder']

    print(divider)

    # The accumulator is initialised by the first species that is actually
    # processed, not by loop index 0: the first entry may be skipped when it
    # has no range mask.
    all_species_forecasts = None
    num_species_processed = 0
    tmp_forecast_file = config['tmp_folder'] + 'forecast_tmp.nc'

    for i, forecast_info in enumerate(forecast_metadata.to_dict('records')):
        species = forecast_info['species']
        phenophase = forecast_info['Phenophase_ID']
        model_file = (config['phenology_model_folder']
                      + forecast_info['model_file'])
        model = utils.load_saved_model(model_file)

        print(divider)
        if species not in range_masks.species.values:
            print('Skipping {s} {p}, no range mask'.format(s=species,
                                                           p=phenophase))
            continue

        print('Apply model for {s} {p}'.format(s=species, p=phenophase))
        print(
            'forecast attempt {i} of {n} potential species. {n2} processed succesfully so far.'
            .format(i=i, n=len(forecast_metadata), n2=num_species_processed))

        species_range = range_masks.sel(species=species)

        prediction, prediction_sd = predict_phenology_from_climate(
            model,
            current_climate_forecast_files,
            post_process='automated',
            doy_0=doy_0,
            species_range=species_range,
            n_jobs=config['n_jobs'])

        species_forecast = xr.Dataset(
            data_vars={
                'doy_prediction': (('species', 'phenophase', 'lat', 'lon'),
                                   prediction),
                'doy_sd': (('species', 'phenophase', 'lat', 'lon'),
                           prediction_sd)
            },
            coords={
                'species': [species],
                'phenophase': [phenophase],
                'lat': species_range.lat,
                'lon': species_range.lon
            })

        if all_species_forecasts is None:
            all_species_forecasts = species_forecast
        else:
            merge_start_time = time.time()
            all_species_forecasts = xr.merge(
                [all_species_forecasts, species_forecast])
            print('merge time {s} sec'.format(
                s=round(time.time() - merge_start_time, 0)))
        num_species_processed += 1

        # Merging this files over and over slows things down more and more
        # Saving it every few iterations seems to speed things up.
        if num_species_processed % 5 == 0:
            all_species_forecasts.to_netcdf(tmp_forecast_file)
            all_species_forecasts = xr.open_dataset(tmp_forecast_file)
            all_species_forecasts.load()
            all_species_forecasts.close()

    if all_species_forecasts is None:
        raise RuntimeError(
            'No phenology forecasts were produced: every in-season species '
            'was skipped')

    print(divider)
    print('phenology forecast final processing')

    provenance_note = (
        'Forecasts for plant phenology of select species flowering and/or '
        'leaf out times for the {s} season. '
        'Made on {t} from NOAA CFSv2 forecasts downscaled using PRISM '
        'climate data. '
        'Plant phenology models made using National Phenology Network '
        'data. '
    ).format(s=current_season, t=today)

    all_species_forecasts.attrs['note'] = provenance_note
    all_species_forecasts.attrs['issue_date'] = str(today)
    all_species_forecasts.attrs['crs'] = '+init=epsg:4269'
    # TODO: add some more metadata
    # common names?

    # Write to the resolved output folder so that an explicitly passed
    # ``phenology_forecast_folder`` is honoured.
    forecast_filename = (phenology_forecast_folder + 'phenology_forecast_'
                         + str(today) + '.nc')

    all_species_forecasts = all_species_forecasts.chunk({'lat': 50,
                                                         'lon': 50})

    # Both variables are stored as compressed, scaled integers.
    doy_encoding = {
        'zlib': True,
        'complevel': 4,
        'dtype': 'int32',
        'scale_factor': 0.001,
        '_FillValue': -9999
    }
    all_species_forecasts.to_netcdf(forecast_filename,
                                    encoding={
                                        'doy_prediction': dict(doy_encoding),
                                        'doy_sd': dict(doy_encoding)
                                    })

    # Return filename of final forecast file for use by primary script
    return forecast_filename
def predict_phenology_from_climate(model,
                                   climate_forecast_files,
                                   post_process,
                                   doy_0,
                                   species_range=None,
                                   n_jobs=1):
    """Predict a phenology model over a climate ensemble.

    Parameters
    ----------
    model
        A saved model file, or a pyPhenology object. Anything without a
        working ``get_params()`` is passed to ``utils.load_saved_model``.
    climate_forecast_files
        A list of files, each one a climate forecast that can be opened
        with ``xr.open_dataset`` and provides ``time`` and ``tmean``.
    post_process
        How to deal with multiple forecasts and/or bootstraps.

        'automated': for the automated website forecasts. Returns a tuple
        (prediction_doy, prediction_sd), the nan-mean and nan-std over the
        climate ensemble, each with two leading length-1 axes added.

        'hindcast': for hindcasting where all the bootstrap + climate
        ensembles are wanted. Returns a single array with the climate
        ensemble as the first axis (and, for a ``BootstrapModel``, the
        un-aggregated bootstraps kept).
    doy_0
        A timestamp for doy_0, usually Jan 1 of the year in question.
    species_range
        Optional xarray object from the species range code representing
        1 species. Cells where ``species_range.range`` is False are set to
        nan. NOTE(review): the mask is assumed to match the trailing
        (lat, lon) axes of the predictions -- confirm against the range
        file.
    n_jobs
        Passed through to ``model.predict``.

    Returns
    -------
    tuple of numpy arrays for 'automated', a single numpy array for
    'hindcast' (see ``post_process``).

    Raises
    ------
    ValueError
        If ``post_process`` is not 'automated' or 'hindcast'.
    """
    if post_process not in ['automated', 'hindcast']:
        raise ValueError('Uknown post-processing routine: ' +
                         str(post_process))

    # If not a pre-fitted model then assume it's a saved file to load.
    # Only AttributeError is caught so unrelated failures are not hidden.
    try:
        model.get_params()
    except AttributeError:
        model = utils.load_saved_model(model)

    # When using a bootstrap model in hindcasting we want *all*
    # the predictions. Otherwise just the mean will do
    predict_options = {'n_jobs': n_jobs}
    if type(model).__name__ == 'BootstrapModel' and post_process == 'hindcast':
        predict_options['aggregation'] = 'none'

    species_ensemble = []
    for climate_file in climate_forecast_files:
        # ``.values`` materialises the data, so the file can be closed
        # before the (potentially slow) prediction runs.
        with xr.open_dataset(climate_file) as climate:
            doy_series = pd.TimedeltaIndex(climate.time.values - doy_0,
                                           freq='D').days.values
            temperature = climate.tmean.values

        species_ensemble.append(
            model.predict(predictors={
                'temperature': temperature,
                'doy_series': doy_series
            }, **predict_options))

    # ``np.float`` was removed from NumPy; the builtin is the same dtype.
    species_ensemble = np.array(species_ensemble).astype(float)

    # apply nan to non predictions
    species_ensemble[species_ensemble == 999] = np.nan

    # Keep only values in the range. The ellipsis aligns the mask with the
    # trailing axes, which works with or without a bootstrap axis.
    if species_range is not None:
        species_ensemble[..., ~species_range.range.values] = np.nan

    if post_process == 'hindcast':
        return species_ensemble

    # post_process == 'automated': summarise over the climate ensemble
    prediction_doy = np.nanmean(species_ensemble, axis=0)
    prediction_sd = np.nanstd(species_ensemble, axis=0)

    # extend the axis by 2 to match the xarray creation
    for _ in range(2):
        prediction_doy = np.expand_dims(prediction_doy, axis=0)
        prediction_sd = np.expand_dims(prediction_sd, axis=0)

    return prediction_doy, prediction_sd
model.fit(species_obs, predictor_data) ###################################################### # Save the model parameters time.sleep(1) model_hash = str(uuid.uuid1()) model_filename = '{s}_{p}_{h}.json'.format( s=species_info['species'].replace(' ', '_'), p=species_info['Phenophase_ID'], h=model_hash) model.save_params(config['phenology_model_folder'] + model_filename) # reload the model to clear the fitting data # Otherwise the prediction below balks. model = utils.load_saved_model(config['phenology_model_folder'] + model_filename) ####################################################### # make entry for this specifc model in model metadata file # forecast version is -1 cause the models themselves will # never be used in the automated stuff. model_note = """Naive model using doy ~ latitude, Variation from bootstrapping""" all_model_metadata.append({ 'species': species, 'Phenophase_ID': phenophase, 'base_model': 'Naive', 'forecast_version': -1, 'model_file': model_filename, 'build_date': str(today), 'n_observations': len(species_obs), 'percent_test': 0,
'species == @this_spp & \ Phenophase_ID == @this_phenophase & \ forecast_version == @forecast_version_to_use' ).to_dict('records') n_model_entries = len(species_model_info) if n_model_entries == 0: print('No model found: {s} - {p} - forecast version {f}'.format( s=this_spp, p=this_phenophase, f=forecast_version_to_use)) continue if n_model_entries > 1: raise RuntimeError('{n} models found for {s}-{p}-v{f}'.format( s=this_spp, p=this_phenophase, f=forecast_version_to_use)) try: pheno_model = utils.load_saved_model( config['phenology_model_folder'] + species_model_info[0]['model_file']) except: continue # Make the weighted ensemble unweighted pheno_model.weights[:] = 1 / len(pheno_model.weights) ################ # Make the prediction prediction = pheno_model.predict(predictors={ 'temperature': this_year_temperature.tmean.values, 'doy_series': doy_series },