dset.createVariable('ens', 'i4', ('ens', )) dset.variables['time'].units = tunit dset.variables['lat'].units = 'degrees_north' dset.variables['lon'].units = 'degrees_east' dset.variables['ens'].units = 'member_number' dset.variables['time'][:] = date2num(state.ensemble_times(), tunit) dset.variables['lat'][:] = state['lat'].values[:, 0] dset.variables['lon'][:] = state['lon'].values[0, :] dset.variables['ens'][:] = state['mem'].values for var in state.vars(): print('Writing variable {}'.format(var)) dset.createVariable(var, 'f8', ( 'time', 'lat', 'lon', 'ens', )) dset.variables[var].units = ut.get_units(var) dset.variables[var][:] = state[var].values #Get the required packages #from netCDF4 import Dataset #import numpy as np #import matplotlib.pyplot as plt #from mpl_toolkits.basemap import Basemap # ##Import the ncfile, assign a file handle, indicate read-only #my_example_nc_file = '/home/disk/hot/stangen/Documents/GEFS/ensembles/2017081400_21mem_1days.nc' #fh = Dataset(my_example_nc_file, mode='r') ##Print the variables to see what we have available #print(fh.variables)
def create_full_ensemble(year, month, day, hour, vrbls=None, writenc=True):
    """
    Merges the per-center ensemble netCDF files for one initialization time
    into a single multi-center "superensemble" array.

    Input files are searched for under
    /home/disk/hot/stangen/Documents/ensembles/<center>/<mon><year>/ for the
    centers ecmwf, jma, ncep and eccc (in that order). A center may supply
    more than one file (e.g. separate surface and pressure-level files); every
    file of a center fills the same block of member indices.

    Requires:
    year    -> string of year we want to retrieve.
    month   -> two-digit string of month we want to retrieve. Only '10', '11',
               '12', '01', '02' and '03' are present in the lookup table.
    day     -> string of day we want to retrieve.
    hour    -> string (00 or 12) we want to retrieve.
    vrbls   -> A list of variable names exactly as they appear in the input
               files. Defaults to ['Z500']. The first entry is used to count
               the members of each center. When writing, each name is
               translated through the internal lookup table to get the
               output variable name.
    writenc -> A boolean object.  If True, the ensemble data will be written
               out to a netcdf.  If False, the ensemble data will simply be
               returned.

    Returns:
    statecls -> An ensemble state object (see EnsembleState class)
                *only if writenc==False; otherwise None.

    Raises:
    KeyError -> month (or, when writing, a variable name) is not in the
                lookup table.
    IOError  -> none of the input files could be found or read.
    """
    # Local import: only this function needs it.
    import glob

    if vrbls is None:
        vrbls = ['Z500']

    # Translates input variable names to output names, generic coordinate
    # names to the names used in the files, and month numbers to the
    # abbreviations used in the directory names.
    vardict = {
        'Z500': 'HGT_500mb',
        'gh': 'Z500',
        't2m': 'T2M',
        'tcw': 'TCW',
        'time': 'time',
        'lat': 'latitude',
        'lon': 'longitude',
        '10': 'oct',
        '11': 'nov',
        '12': 'dec',
        '01': 'jan',
        '02': 'feb',
        '03': 'mar'
    }

    outfile = '/home/disk/hot/stangen/Documents/ensembles/all/%s%s/%s-%s-%s_%s.nc' % (
        vardict[month], year, year, month, day, hour)

    ensembles = ['ecmwf', 'jma', 'ncep', 'eccc']

    # List the files of every center as (center, path) pairs. Keeping the
    # center name next to each path avoids telling centers apart by comparing
    # a fixed-length path prefix. Files are taken in reverse alphabetical
    # order within a center; a center without files contributes nothing.
    memfiles = []
    for ens in ensembles:
        pattern = '/home/disk/hot/stangen/Documents/ensembles/%s/%s%s/%s-%s-%s_%s_%s*' % (
            ens, vardict[month], year, year, month, day, hour, ens)
        memfiles.extend(
            (ens, path) for path in sorted(glob.glob(pattern), reverse=True))

    print('\nCreating superensemble for {}/{}/{} {}Z\n'.format(
        month, day, year, hour))

    # ---- Pass 1: count members per center -------------------------------
    # Only the first file listed for a center is inspected, so members are
    # not double-counted when a center has several files. The member count
    # is the length of the second axis of vrbls[0]. Each center is assigned
    # a half-open index range [lower, upper) on the member axis.
    print('Counting ensemble members for allocation')
    nmems = 0
    member_ranges = {}
    for center, path in memfiles:
        if center in member_ranges:
            continue
        try:
            with Dataset(path, 'r') as ncdata:
                count = ncdata.variables[vrbls[0]][:, :, :, :].shape[1]
        except Exception:
            # Unreadable file or missing variable: the center gets no slots.
            print('No data from {} ensembles'.format(center))
            count = 0
        else:
            print('ensembles from {}: {}'.format(center, count))
            print('total members: {}'.format(nmems + count))
        member_ranges[center] = (nmems, nmems + count)
        nmems += count

    # ---- Pass 2: load the data ------------------------------------------
    state = None
    bad_members = set()   # member indices to drop after loading
    announced = set()     # centers already reported as being added
    for center, path in memfiles:
        lower, upper = member_ranges[center]
        try:
            with Dataset(path, 'r') as ncdata:
                file_tunit = ncdata.variables[vardict['time']].units
                file_times = num2date(ncdata.variables[vardict['time']][:],
                                      file_tunit)

                # Allocate on the first file that can actually be read (not
                # strictly the first file listed) and take the metadata
                # from that same file so it matches the array dimensions.
                if state is None:
                    tunit = file_tunit
                    ftimes = file_times
                    ntimes = len(ftimes)
                    nvars = len(vrbls)
                    nlats = len(ncdata.dimensions[vardict['lat']])
                    nlons = len(ncdata.dimensions[vardict['lon']])
                    print('\nAllocating the state vector array...')
                    state = np.zeros((nvars, ntimes, nlats, nlons, nmems))
                    print(
                        'state contains {} variables, {} times, {} lats, {} lons, {} ensembles'
                        .format(nvars, ntimes, nlats, nlons, nmems))
                    # Column / row vectors of the coordinates, plus 2-d
                    # meshes of them for the EnsembleState metadata.
                    lats = ncdata.variables[vardict['lat']][:][:, None]
                    lons = ncdata.variables[vardict['lon']][:][None, :]
                    lonarr, latarr = np.meshgrid(lons, lats)

                if center not in announced:
                    announced.add(center)
                    print('Adding {} to state'.format(center))

                for v, var in enumerate(vrbls):
                    # Not every variable is in every file of a center.
                    try:
                        field = ncdata.variables[var][:, :, :, :]
                    except (KeyError, IndexError):
                        continue
                    try:
                        # Move the second axis (members) to the end so the
                        # field matches the layout of the state array.
                        field = np.swapaxes(field, 1, 3)
                        field = np.swapaxes(field, 1, 2)
                        state[v, :, :, :, lower:upper] = field
                        print('Adding {} to {}'.format(var, center))
                    except ValueError:
                        # Shape mismatch (missing times etc.): blank the
                        # block; members are dropped only when the first
                        # variable is the one that failed.
                        state[v, :, :, :, lower:upper] = np.nan
                        if v == 0:
                            print(
                                '{}: bad forecast array shape- not adding to superensemble'
                                .format(center))
                            bad_members.update(range(lower, upper))
        except Exception:
            # The file could not be opened or read; carry on with the rest.
            print('Bad {} ensembles- not adding to superensemble'.format(
                center))

    if state is None:
        raise IOError(
            'No readable ensemble files for {}-{}-{} {}Z'.format(
                year, month, day, hour))

    # Remove the incomplete members. A flat, sorted list of unique indices
    # is passed so centers with different member counts can be dropped
    # together.
    if bad_members:
        state = np.delete(state, sorted(bad_members), axis=-1)
        nmems = state.shape[-1]
    memarr = np.arange(1, nmems + 1)

    # If we are writing this out...
    if writenc:
        print('\nWriting to netcdf...')
        # Convert times back to integers
        valid_times = date2num(ftimes, tunit)
        with Dataset(outfile, 'w') as dset:
            dset.createDimension('time', None)
            dset.createDimension('lat', nlats)
            dset.createDimension('lon', nlons)
            dset.createDimension('ens', nmems)
            dset.createVariable('time', 'i4', ('time', ))
            dset.createVariable('lat', np.float64, ('lat', ))
            dset.createVariable('lon', np.float64, ('lon', ))
            dset.createVariable('ens', 'i4', ('ens', ))
            dset.variables['time'].units = tunit
            dset.variables['lat'].units = 'degrees_north'
            dset.variables['lon'].units = 'degrees_east'
            dset.variables['ens'].units = 'member_number'
            dset.variables['time'][:] = np.array(valid_times)
            dset.variables['lat'][:] = lats
            dset.variables['lon'][:] = lons
            dset.variables['ens'][:] = memarr
            for v, var in enumerate(vrbls):
                outname = vardict[var]
                print('Writing variable {}'.format(outname))
                dset.createVariable(outname, np.float32, (
                    'time',
                    'lat',
                    'lon',
                    'ens',
                ))
                dset.variables[outname].units = ut.get_units(outname)
                dset.variables[outname][:] = state[v, :, :, :, :]
        return None

    # If we are NOT writing this out...
    # Split the 5D state into a dictionary of 4D arrays, one per variable.
    allvars = {}
    for v, var in enumerate(vrbls):
        allvars[var] = (['validtime', 'y', 'x', 'mem'], state[v, :, :, :, :])
    # Package into an EnsembleState object knowing the state and metadata
    statecls = EnsembleState.from_vardict(
        allvars, {
            'validtime': ftimes,
            'lat': (['y', 'x'], latarr),
            'lon': (['y', 'x'], lonarr),
            'mem': memarr,
        })
    return statecls
def get_cfsv2_ensemble(ndays, start, end, vrbls=None, only00z=True, writenc=True):
    """
    Loads the fields of a single forecast netCDF file between two times and
    either writes them to a new netCDF file or returns them as an
    EnsembleState.

    Only ONE input file is read: the last one, in alphabetical order, among
    the *.nc files of the hard-coded input directory. The result therefore
    has no ensemble-member dimension.

    Requires:
    ndays   -> Currently unused. It is kept for interface compatibility (it
               was meant to control a time-lagged ensemble).
    start   -> A datetime object of the first valid time to keep.
    end     -> A datetime object of the last valid time to keep.
    vrbls   -> A list of generic variable names to retrieve (keys of the
               internal lookup table). Defaults to ['Z500'].
    only00z -> A boolean object.  If True, only every 4th time is kept,
               starting with the first one.
               NOTE(review): this yields the 00z times only if the file is
               6-hourly and `start` is a 00z time -- confirm.
    writenc -> A boolean object.  If True, the data will be written out to
               a netcdf.  If False, the data will simply be returned.

    Returns:
    statecls -> An ensemble state object (see EnsembleState class)
                *only if writenc==False; otherwise None.

    Raises:
    IOError  -> no *.nc file exists in the input directory.
    KeyError -> a requested variable is not in the lookup table / the file.
    """
    # Local import: only this function needs it.
    import glob

    if vrbls is None:
        vrbls = ['Z500']

    # Translates generic variable names to the names used in the files
    vardict = {'Z500' : 'HGT_500mb',
               'T2M' : 'TMP_2maboveground',
               'PWAT' : 'PWAT_entireatmosphere_consideredasasinglelayer_',
               'MSLP' : 'PRMSL_meansealevel',
               'P6HR' : 'APCP_surface',
               'time' : 'time',
               'lat' : 'latitude',
               'lon' : 'longitude',
               }

    # This is the input directory for the forecast netcdfs
    indir = '/home/disk/hot/stangen/Documents/GEFS/analysis/2017090600_2017091600/netcdf/precip'
    # This is the output directory for the ncfile if writenc==True
    outdir = '/home/disk/hot/stangen/Documents/GEFS/analysis/2017090600_2017091600/ensembles/precip'

    # Pick the input file: last in alphabetical order
    pattern = '{}/*.nc'.format(indir)
    print(pattern)
    candidates = sorted(glob.glob(pattern), reverse=True)
    if not candidates:
        raise IOError('No netCDF files found matching {}'.format(pattern))
    infile = candidates[0]

    print('Loading {} ensemble...'.format(start.strftime('%Y-%m-%d %H:00')))
    print(infile)

    # Read the netcdf
    with Dataset(infile, 'r') as ncdata:
        # Find the indices corresponding to the start and end times
        tunit = ncdata.variables[vardict['time']].units
        ftimes = num2date(ncdata.variables[vardict['time']][:], tunit)
        tbeg = ut.nearest_ind(ftimes, start)
        tend = ut.nearest_ind(ftimes, end)
        ftimes = ftimes[tbeg:tend + 1]

        ntimes = len(ftimes)
        nvars = len(vrbls)
        nlats = len(ncdata.dimensions[vardict['lat']])
        nlons = len(ncdata.dimensions[vardict['lon']])

        # Allocate the state array
        print('Allocating the state vector array...')
        state = np.zeros((nvars, ntimes, nlats, nlons))

        # Column / row vectors of the coordinates, plus 2-d meshes of them
        # for the EnsembleState metadata.
        lats = ncdata.variables[vardict['lat']][:][:, None]
        lons = ncdata.variables[vardict['lon']][:][None, :]
        lonarr, latarr = np.meshgrid(lons, lats)

        # Now to populate the state array
        for v, var in enumerate(vrbls):
            # Read the same time window that ftimes / state were sized for
            field = ncdata.variables[vardict[var]][tbeg:tend + 1, :, :]
            print('Adding variable {}'.format(var))
            try:
                state[v, :, :, :] = field
            except ValueError:
                # Shape still does not fit: blank the variable and say so
                state[v, :, :, :] = np.nan
                print(' {}: bad forecast array shape for {}'.format(
                    infile, var))
    # END of ncdata load

    # Keep every 4th time only, if requested. state is
    # (variable, time, lat, lon), so the time axis is axis 1.
    if only00z:
        ftimes = ftimes[::4]
        state = state[:, ::4, :, :]

    # If we are writing this out...
    if writenc:
        print('Writing to netcdf...')
        # Convert times back to integers
        valid_times = date2num(ftimes, tunit)
        outfile = '{}/{:%Y%m%d%H}_{}days.nc'.format(outdir, start,
                                                    (end - start).days)
        with Dataset(outfile, 'w') as dset:
            dset.createDimension('time', None)
            dset.createDimension('lat', nlats)
            dset.createDimension('lon', nlons)
            dset.createVariable('time', 'i4', ('time',))
            dset.createVariable('lat', 'f8', ('lat',))
            dset.createVariable('lon', 'f8', ('lon',))
            dset.variables['time'].units = tunit
            dset.variables['lat'].units = 'degrees_north'
            dset.variables['lon'].units = 'degrees_east'
            dset.variables['time'][:] = np.array(valid_times)
            dset.variables['lat'][:] = lats
            dset.variables['lon'][:] = lons
            for v, var in enumerate(vrbls):
                print('Writing variable {}'.format(var))
                dset.createVariable(var, 'f8', ('time', 'lat', 'lon'))
                dset.variables[var].units = ut.get_units(var)
                dset.variables[var][:] = state[v, :, :, :]
        return None

    # If we are NOT writing this out...
    # Split the 4D state into a dictionary of 3D arrays, one per variable
    allvars = {}
    for v, var in enumerate(vrbls):
        allvars[var] = (['validtime', 'y', 'x'], state[v, :, :, :])
    # Package into an EnsembleState object knowing the state and metadata
    statecls = EnsembleState.from_vardict(allvars,
                                          {'validtime' : ftimes,
                                           'lat' : (['y', 'x'], latarr),
                                           'lon' : (['y', 'x'], lonarr),
                                           })
    return statecls
def create_full_analysis(vrbls=None, writenc=True):
    """
    Concatenates the monthly analysis netCDF files (oct2016 through mar2017)
    along the time axis into one combined analysis.

    Input files are searched for under
    /home/disk/hot/stangen/Documents/ensembles/analysis/rawmonths/<month>/.
    A month may supply more than one file (e.g. separate surface and
    pressure-level files); every file of a month fills the same block of
    time indices. Months without any file are skipped.

    Requires:
    vrbls   -> A list of variable names exactly as they appear in the input
               files. Defaults to ['Z500']. When writing, each name is
               translated through the internal lookup table to get the
               output variable name.
    writenc -> A boolean object.  If True, the combined analysis will be
               written out to a netcdf.  If False, it will simply be
               returned.

    Returns:
    statecls -> An ensemble state object (see EnsembleState class)
                *only if writenc==False; otherwise None.

    Raises:
    IOError  -> none of the input files could be found or read.
    KeyError -> when writing, a variable name is not in the lookup table.
    """
    # Local import: only this function needs it.
    import glob

    if vrbls is None:
        vrbls = ['Z500']

    # Translates input variable names to output names and generic coordinate
    # names to the names used in the files.
    vardict = {
        'Z500': 'HGT_500mb',
        'gh': 'Z500',
        't2m': 'T2M',
        'tcw': 'TCW',
        'time': 'time',
        'lat': 'latitude',
        'lon': 'longitude',
        '10': 'oct',
        '11': 'nov',
        '12': 'dec',
        '01': 'jan',
        '02': 'feb',
        '03': 'mar'
    }

    outfile = '/home/disk/hot/stangen/Documents/ensembles/analysis/combined/oct-mar.nc'

    months = ['oct2016', 'nov2016', 'dec2016', 'jan2017', 'feb2017', 'mar2017']

    # List the files of every month as (month, path) pairs. Keeping the
    # month next to each path avoids telling months apart by comparing a
    # fixed-length path prefix. Not all months may be available yet; a
    # missing month simply contributes no files.
    monthfiles = []
    for mon in months:
        pattern = '/home/disk/hot/stangen/Documents/ensembles/analysis/rawmonths/%s/*' % (
            mon)
        monthfiles.extend(
            (mon, path) for path in sorted(glob.glob(pattern), reverse=True))

    # ---- Pass 1: count the times per month ------------------------------
    # Only the first file listed for a month is inspected, so times are not
    # double-counted when a month has several files. Each month is assigned
    # a half-open index range [lower, upper) on the time axis.
    print('Counting times for allocation')
    ntimes = 0
    time_ranges = {}
    for mon, path in monthfiles:
        if mon in time_ranges:
            continue
        try:
            with Dataset(path, 'r') as ncdata:
                count = len(ncdata.variables['time'])
        except Exception:
            # Unreadable file: the month gets no slots, but is still
            # recorded so the ranges of later months stay aligned.
            print('Bad {} ensembles- not counting ensemble members'.format(
                mon))
            count = 0
        else:
            print('times from {}: {}'.format(mon, count))
            print('total times: {}'.format(ntimes + count))
        time_ranges[mon] = (ntimes, ntimes + count)
        ntimes += count

    # ---- Pass 2: load the data ------------------------------------------
    # NOTE: the block of a month that was counted but cannot be loaded
    # afterwards keeps its initial zeros in both state and valid_times.
    valid_times = np.zeros(ntimes)
    state = None
    out_tunit = None   # single reference time unit for the combined axis
    started = set()    # months whose times have already been stored
    for mon, path in monthfiles:
        lower, upper = time_ranges[mon]
        try:
            with Dataset(path, 'r') as ncdata:
                tunit = ncdata.variables[vardict['time']].units
                ftimes = num2date(ncdata.variables[vardict['time']][:], tunit)

                # Allocate on the first file that can actually be read (not
                # strictly the first file listed).
                if state is None:
                    out_tunit = tunit
                    nvars = len(vrbls)
                    nlats = len(ncdata.dimensions[vardict['lat']])
                    nlons = len(ncdata.dimensions[vardict['lon']])
                    print('\nAllocating the state vector array...')
                    state = np.zeros((nvars, ntimes, nlats, nlons))
                    print(
                        'state contains {} variables, {} times, {} lats, {} lons'
                        .format(nvars, ntimes, nlats, nlons))
                    # Column / row vectors of the coordinates, plus 2-d
                    # meshes of them for the EnsembleState metadata.
                    lats = ncdata.variables[vardict['lat']][:][:, None]
                    lons = ncdata.variables[vardict['lon']][:][None, :]
                    lonarr, latarr = np.meshgrid(lons, lats)

                if mon not in started:
                    started.add(mon)
                    print('time index range: {}-{}'.format(lower, upper))
                    print('Adding {} to state'.format(mon))
                    # Encode every month with the SAME reference unit, so
                    # the combined axis is consistent even if the files use
                    # different time units.
                    valid_times[lower:upper] = date2num(ftimes, out_tunit)

                for v, var in enumerate(vrbls):
                    # Not every variable is in every file of a month.
                    try:
                        field = ncdata.variables[var][:, :, :]
                    except (KeyError, IndexError):
                        continue
                    try:
                        state[v, lower:upper, :, :] = field
                        print('Adding {} to {}'.format(var, mon))
                    except ValueError:
                        # Shape mismatch (missing times etc.): blank it.
                        state[v, lower:upper, :, :] = np.nan
                        if v == 0:
                            print(
                                '{}: bad forecast array shape- not adding to combined analysis'
                                .format(mon))
        except Exception:
            # The file could not be opened or read; carry on with the rest.
            print('Bad {} month- not adding to combined analysis :('.format(
                mon))

    if state is None:
        raise IOError('No readable analysis files found for {}'.format(
            ', '.join(months)))

    # If we are writing this out...
    if writenc:
        print('\nWriting to netcdf...')
        with Dataset(outfile, 'w') as dset:
            dset.createDimension('time', None)
            dset.createDimension('lat', nlats)
            dset.createDimension('lon', nlons)
            dset.createVariable('time', 'i4', ('time', ))
            dset.createVariable('lat', np.float64, ('lat', ))
            dset.createVariable('lon', np.float64, ('lon', ))
            dset.variables['time'].units = out_tunit
            dset.variables['lat'].units = 'degrees_north'
            dset.variables['lon'].units = 'degrees_east'
            dset.variables['time'][:] = np.array(valid_times)
            dset.variables['lat'][:] = lats
            dset.variables['lon'][:] = lons
            for v, var in enumerate(vrbls):
                outname = vardict[var]
                print('Writing variable {}'.format(outname))
                dset.createVariable(outname, np.float32,
                                    ('time', 'lat', 'lon'))
                dset.variables[outname].units = ut.get_units(outname)
                dset.variables[outname][:] = state[v, :, :, :]
        return None

    # If we are NOT writing this out...
    # Split the 4D state into a dictionary of 3D arrays, one per variable.
    # There is no member dimension in the combined analysis.
    # NOTE(review): assumes EnsembleState.from_vardict accepts a state
    # without a 'mem' coordinate -- confirm.
    allvars = {}
    for v, var in enumerate(vrbls):
        allvars[var] = (['validtime', 'y', 'x'], state[v, :, :, :])
    # Package into an EnsembleState object knowing the state and metadata;
    # the valid times cover the whole combined time axis.
    statecls = EnsembleState.from_vardict(
        allvars, {
            'validtime': num2date(valid_times, out_tunit),
            'lat': (['y', 'x'], latarr),
            'lon': (['y', 'x'], lonarr),
        })
    return statecls