def testEquallength(self):
    """Run EqualLength on every entry of ``self.inputs`` and check the result.

    For each case the comparison message built by ``test_info_msg`` is
    printed and then passed as the third argument of the array assertion.
    """
    expected_by_case = [
        arange(3), array([1]), array([]),
        arange(5), array([2, 3]), array([]),
        arange(7), array([3, 4]), array([5]),
    ]
    for case_no, args in enumerate(self.inputs):
        partitioner = partition.EqualLength()
        got = partitioner(*args)
        want = expected_by_case[case_no]
        report = test_info_msg(
            'EqualLength', args[0], args[1], args[2], got, want)
        print(report)
        testing.assert_array_equal(got, want, report)
def testEquallength(self):
    """Run EqualLength on every entry of ``self.inputs`` and check the result.

    Expected values are plain lists, one per input case and in the same
    order as ``self.inputs``. The message built by ``test_info_msg`` is
    printed and also used as the assertion failure message.
    """
    expected_by_case = [
        [0, 1, 2], [1], [],
        [0, 1, 2, 3, 4], [2, 3], [],
        [0, 1, 2, 3, 4, 5, 6], [3, 4], [5],
    ]
    for case_no, args in enumerate(self.inputs):
        partitioner = partition.EqualLength()
        got = partitioner(*args)
        want = expected_by_case[case_no]
        report = test_info_msg(
            'EqualLength', args[0], args[1], args[2], got, want)
        print(report)
        self.assertEqual(got, want, report)
def testEquallength(self):
    """Check EqualLength against the expected list for each ``self.inputs`` case."""
    # One expected partition per input case, in the same order as self.inputs.
    expected_by_case = [
        [0, 1, 2], [1], [],
        [0, 1, 2, 3, 4], [2, 3], [],
        [0, 1, 2, 3, 4, 5, 6], [3, 4], [5],
    ]
    for case_no, args in enumerate(self.inputs):
        partitioner = partition.EqualLength()
        got = partitioner(*args)
        self.assertEqual(got, expected_by_case[case_no])
def testEquallength(self):
    """Check EqualLength against the expected array for each ``self.inputs`` case."""
    # One expected partition per input case, in the same order as self.inputs.
    expected_by_case = [
        arange(3), array([1]), array([]),
        arange(5), array([2, 3]), array([]),
        arange(7), array([3, 4]), array([5]),
    ]
    for case_no, args in enumerate(self.inputs):
        partitioner = partition.EqualLength()
        got = partitioner(*args)
        testing.assert_array_equal(got, expected_by_case[case_no])
def create_pre_proc(self, spec):
    """
    Creates the CICE pre_proc file.

    Builds the list of output variables (one per region/variable pair, plus
    'time'), partitions that list across the ranks of ``spec.main_comm``,
    and for every year/month computes a region-masked sum that is written
    through ``climFileIO.write_averages`` into the 'ice_vol_*.nc' file
    defined by ``climFileIO.define_ave_file``.

    @param spec          An instance of the Specification class which holds the user settings
                         that define which averages to compute, directories, file prefixes, etc
    """
    # Per-variable metadata handed to define_ave_file and get_sum.
    # NOTE(review): 'ai' and 'hs' carry unit strings that do not obviously
    # match their factors -- confirm against the intended output units.
    variables = {
        'hi': {
            'factor': 1.0e-13,
            'units': '1.E+13 m3'
        },
        'ai': {
            'factor': 1.0e-14,
            'units': '1.E+13 m3'
        },
        'ext': {
            'factor': 1.0e-12,
            'units': '1.E+12 m2'
        },
        'hs': {
            'factor': 1.0e-13,
            'units': '1.E+12 m2'
        },
    }

    # All of the region names, with 0=Northern Hem and 1=Southern Hem
    regions = {
        'nh': 0,
        'sh': 1,
        'Lab': 0,
        'GIN': 0,
        'Bar': 0,
        'ArcOc': 0,
        'Sib': 0,
        'Beau': 0,
        'CArc': 0,
        'Bering': 0,
        'Okhotsk': 0,
        'Hudson': 0,
        'CAArch': 0,
        'Wed': 1,
        'Ross': 1,
        'Ind': 1,
        'Pac': 1,
        'BAm': 1,
    }
    # Indexed below with the 0/1 hemisphere code stored in `regions`.
    split_hem = spec.split_files.split(',')
    attributes = {
        'missing_value': 1.0e30,
        'coordinates': 'time',
        'cell_methods': 'time:mean',
        '_FillValue': 1.0e30,
    }
    ave_descr = ['preproc', str(spec.year0), str(spec.year1)]
    # Message tag shared by the worker send and the manager receive below.
    AVE_TAG = 40
    years = list(range(int(spec.year0), int(spec.year1) + 1))
    months = ave_t.average_types[ave_descr[0]]['months_to_average']

    # Initialize simplecomm (MPI wrappers)
    main_comm = spec.main_comm

    # If the region mask file doesn't exist, have root call ncl to create it
    if not os.path.isfile(spec.reg_file) and (main_comm.is_manager() or spec.serial):
        import subprocess
        # The NCL script receives its input/output paths through the environment.
        os.environ['GRIDFILE'] = spec.ice_obs_file
        os.environ['REGIONFILE'] = spec.reg_file
        ncl_command = 'ncl < ' + spec.ncl_location + '/ice_pre_proc_mask.ncl'
        # NOTE(review): the return code of the ncl call is not checked.
        subprocess.call(ncl_command, shell=True)
    # make sure to have all ranks sync to prevent ranks other than root from
    # continuing on without a region mask file
    main_comm.sync()

    # Get the history dictionary that lists where files are located for each
    # time slice, a variable list, meta list, and a key lookup variable.
    # file_var_list is not used again in this method, and meta_list is
    # replaced by an empty list further down.
    if spec.hist_type == 'series':
        (
            hist_dict,
            file_var_list,
            meta_list,
            key,
        ) = rover.set_slices_and_vars_time_series(
            spec.in_directory,
            spec.file_pattern,
            spec.date_pattern,
            spec.prefix,
            spec.suffix,
            int(spec.year0),
            int(spec.year1),
            spec.split,
            spec.split_files,
        )
    else:
        (
            hist_dict,
            file_var_list,
            meta_list,
            key,
        ) = rover.set_slices_and_vars_time_slice(
            spec.in_directory,
            spec.file_pattern,
            spec.prefix,
            spec.suffix,
            int(spec.year0),
            int(spec.year1),
        )

    # Loop over the regions and variable names to get full list of variables.
    # Names look like 'ext_mo_<region>' or 'v<var>_mo_<region>'.
    global_var_list = []
    for reg in regions:
        for var in variables:
            if 'ext' in var:
                global_var_list.append(var + '_mo_' + reg)
            else:
                global_var_list.append('v' + var + '_mo_' + reg)
    global_var_list.append('time')

    # Partition the global variable list between the MPI ranks
    local_var_list = main_comm.partition(global_var_list, func=partition.EqualLength(), involved=False)
    # If master/root, give it the full variable list
    if main_comm.is_manager() or spec.serial:
        local_var_list = global_var_list

    # The meta list obtained above is discarded; an empty one is passed on.
    meta_list = []

    # Define the netcdf file
    outfile = ('ice_vol_' + spec.prefix[:-7] + '_' + str(spec.year0) + '-' + str(spec.year1) + '.nc')
    ave_date = str(spec.year0) + '-' + str(spec.year1)
    all_files_vars, new_file = climFileIO.define_ave_file(
        main_comm.is_manager(),
        spec.serial,
        global_var_list,
        local_var_list,
        meta_list,
        hist_dict,
        spec.hist_type,
        ave_descr,
        spec.prefix,
        outfile,
        spec.split,
        split_hem[regions['GIN']],
        spec.out_directory,
        main_comm,
        spec.ncformat,
        ave_t.average_types[ave_descr[0]]['months_to_average'][0],
        key,
        spec.clobber,
        int(spec.year0),
        int(spec.year1),
        ave_date,
        attributes,
        variables,
    )

    # If using time slice files, open all files now
    if len(local_var_list) > 0:
        if spec.hist_type == 'slice' and (spec.serial or not main_comm.is_manager()):
            file_dict, open_list = climFileIO.open_all_files(
                hist_dict,
                ave_t.average_types[ave_descr[0]]['months_to_average'],
                years,
                local_var_list[0],
                'null',
                ave_descr[0],
                False,
                int(spec.year0),
            )

    # Loop over each variable in the local list and read/operate on/write
    for nc_var in local_var_list:
        if not main_comm.is_manager() or spec.serial:  # Worker (non-manager) rank, or serial run
            print(('Computing ice_pre_proc for', nc_var))
            # Get variable/region names
            if 'time' in nc_var:
                get_var_name = 'aice'
                var_name = 'time'
            else:
                var_name, reg = nc_var.split('_mo_')
                if 'ext' in var_name:
                    var_name = var_name
                else:
                    # Drop the leading 'v' that was added when building the list.
                    var_name = var_name[1:]
                # 'ext' and 'ai' are both computed from the 'aice' field.
                if 'ext' in var_name or 'ai' in var_name:
                    get_var_name = 'aice'
                else:
                    get_var_name = var_name
            # Get observation lat,lon,area
            obs_file = spec.ice_obs_file
            tarea = 'TAREA'
            tlat = 'TLAT'
            # Read in the ice observation file to get area, lat, and lon values.
            # NOTE(review): this dataset is reopened for every variable and
            # no close() call for it is visible in this method.
            obs_file_hndl = nc.Dataset(obs_file, 'r')
            o_lat = obs_file_hndl.variables[tlat]
            o_area = obs_file_hndl.variables[tarea]
            o_area = o_area[:] * 1.0e-4
            # If using time series files, open the variable's file now
            if spec.hist_type == 'series':
                if spec.split:
                    # NOTE(review): for the 'time' entry `reg` is not assigned
                    # in this iteration; it keeps the value from an earlier
                    # variable (or from the list-building loop above).
                    split_name = split_hem[regions[reg]]
                else:
                    split_name = ''
                file_dict, open_list = climFileIO.open_all_files(
                    hist_dict,
                    ave_t.average_types[ave_descr[0]]['months_to_average'],
                    years,
                    get_var_name,
                    split_name,
                    ave_descr[0],
                    False,
                    int(spec.year0),
                )
        # Index of the current (year, month) step; sent with each result and
        # used as the write index in the serial case.
        time_slice = 0
        for year in years:
            for m in months:
                if not main_comm.is_manager() or spec.serial:  # Worker (non-manager) rank, or serial run
                    if 'time' in nc_var:
                        var_sum = rover.fetch_slice(
                            hist_dict, year, m, var_name, file_dict)
                    else:
                        # Get month slice
                        var_slice = rover.fetch_slice(
                            hist_dict, year, m, get_var_name, file_dict)
                        lat = var_slice.shape[-2]
                        lon = var_slice.shape[-1]
                        full_lat = o_lat.shape[-2]
                        if spec.split:
                            # Pad the slice with zero rows up to the row count
                            # of the observation grid: appended when the region
                            # code is 1, prepended otherwise.
                            fill = full_lat - lat
                            missing_vals = np.zeros((fill, lon))
                            var_slice = np.array(var_slice)
                            var_slice[var_slice >= 1e20] = 0
                            if regions[reg] == 1:
                                var_slice = np.concatenate(
                                    (var_slice, missing_vals), axis=0)
                            else:
                                var_slice = np.concatenate(
                                    (missing_vals, var_slice), axis=0)
                        # Get ai factor: scale by 100 when the slice maximum
                        # is below 2, otherwise leave the values unchanged.
                        if 'ext' in var_name or 'ai' in var_name:
                            aimax = np.amax(var_slice)
                            if aimax < 2:
                                aifac = 100
                            else:
                                aifac = 1
                            var_slice = var_slice * aifac
                        # The ext variable is true/false based on the ai variable. Set accordingly
                        if 'ext' in var_name:
                            var_slice = np.array(var_slice)
                            var_slice[var_slice >= 1e20] = 0
                            var_slice[var_slice < 15] = 0
                            var_slice[var_slice >= 15] = 1
                        # Mult by weight
                        var_slice = var_slice * o_area
                        # Mask the variable to get just this region
                        mask_to_apply = self.read_reg_mask(
                            spec.reg_file, reg)
                        masked_var = MA.masked_where(
                            mask_to_apply == 0, var_slice)
                        # Sum the variable
                        var_sum = self.get_sum(masked_var, variables[var_name], var_name)
                    # Pass the average results to master rank for writing
                    var_shape = var_sum.shape
                    var_dtype = var_sum.dtype
                    md_message = {
                        'name': nc_var,
                        'shape': var_shape,
                        'dtype': var_dtype,
                        'average': var_sum,
                        'index': time_slice,
                    }
                    if not spec.serial:
                        main_comm.collect(data=md_message, tag=AVE_TAG)
                if main_comm.is_manager() or spec.serial:  # Manager rank, or serial run
                    # Recv the variable to write
                    if not spec.serial:
                        r_rank, results = main_comm.collect(tag=AVE_TAG)
                        var_sum_results = results['average']
                        v_name = results['name']
                        index = results['index']
                    else:
                        v_name = nc_var
                        var_sum_results = var_sum
                        index = time_slice
                    # Write Var
                    climFileIO.write_averages(all_files_vars, var_sum_results, v_name, index)
                time_slice = time_slice + 1
        # Close timeseries files that are open
        if spec.hist_type == 'series' and (not main_comm.is_manager() or spec.serial):
            climFileIO.close_all_files(open_list)
    # Close timeslice files that are open
    if len(local_var_list) > 0:
        if spec.hist_type == 'slice' and (spec.serial or not main_comm.is_manager()):
            climFileIO.close_all_files(open_list)
    # Make sure everyone gets sync'ed up
    main_comm.sync()
    # Close the file that was just created
    if spec.serial or main_comm.is_manager():
        new_file.close()
if not run_parallel: logger.debug("Exception while obtaining rank. Will run serial") else: logger.debug("My rank == {}".format(rank)) # determine subset of dates to process by this rank comm = simplecomm.create_comm(serial=False) frequency = int(INPUT_FREQUENCY.total_seconds()) duration = int(DURATION.total_seconds()) dateRange = range(0, duration + 1, frequency) all_dates = [START_TIME + tdelta(seconds=curr) for curr in dateRange] if rank == 0: logger.debug( "Global list of dates to be processed: {}".format(all_dates)) local_date_range = comm.partition(all_dates, func=partition.EqualLength(), involved=True) logger.info("List of dates to be processed by this process: {}".format( local_date_range)) # # get it done # # currDate = copy.copy(START_TIME) # while currDate <= START_TIME + DURATION: for currDate in local_date_range: # create output file ; e.g. "aug29.geosgcm_surfh.20060909_2330z.nc4" outFileName = _get_file_name(currDate) outdir = confbasic("lsm_merged_files_outdir") outfile_path = os.path.join(outdir, outFileName) if os.path.exists(outfile_path): logger.info("Skipping existing file '{}'".format(outfile_path))
def testOutOfBounds(self):
    """EqualLength and EqualStride must raise IndexError for these out-of-bounds calls."""
    # Each partition object is built outside the assertion context so that
    # only the call itself is expected to raise.
    equal_length = partition.EqualLength()
    with self.assertRaises(IndexError):
        equal_length([1, 2, 3], 3, 3)

    equal_stride = partition.EqualStride()
    with self.assertRaises(IndexError):
        equal_stride([1, 2, 3], 7, 3)
def fill_list(nc_files, root_dir, extra_dir, comm, rank, size):
    """Catalogue the variables found in ``nc_files`` and share the result with every rank.

    The file list is split between ranks with ``partition.EqualLength``.
    Each rank scans its own files, rank 0 merges the per-rank catalogues and
    sends the merged catalogue back to all ranks.

    The returned dictionary is nested as::

        {model_type: {variable: {time_period_freq: {date: {
            'files': [path, gridfile, ...],
            'lat': ..., 'lon': ..., 'lev': ..., 'time': ...}}}}}

    Files without a ``time`` variable, or without ``"tseries"`` in their
    path, only add an empty ``"skip"`` entry. Files without a
    ``time_period_freq`` global attribute are stored under the keys
    ``"unknown"``/``"unknown"``. The original code dereferenced the missing
    attribute in that case and raised.

    @param nc_files   list of netCDF file paths to scan
    @param root_dir   prefix removed from each path before the model type is
                      taken from the fifth-from-last path component
    @param extra_dir  directory used to build the grid file name
    @param comm       communicator providing partition/collect/sync
    @param rank       rank of this process; rank 0 performs the merge
    @param size       number of ranks; with ``size == 1`` the locally built
                      catalogue is returned directly
    @return           the merged catalogue described above
    """
    variablelist = {}
    nc_files_l = comm.partition(nc_files, func=partition.EqualLength(), involved=True)
    for fn in nc_files_l:
        f = nc.Dataset(fn, "r")
        try:
            mt = fn.replace(root_dir, "").split("/")[-5]
            model_type = mt
            if "lnd" in model_type or "rof" in model_type:
                model_type = 'lnd,rof'
            if "glc" in model_type:
                model_type = 'glc,lnd'

            if "time" not in f.variables or "tseries" not in fn:
                variablelist["skip"] = {}
            else:
                lt = "none"
                ln = "none"
                lat_name = None
                lon_name = None
                lev_name = None
                time_name = None

                # The variable named by the third-from-last dotted component
                # of the path decides which dimension names are in use.
                main_var = f.variables[fn.split('.')[-3]]
                v_dims = main_var.dimensions
                for i in grids[mt]['lat']:
                    if i in v_dims:
                        if 'nlat' in i:
                            # Index dimension: take the coordinate variable
                            # name from the 'coordinates' attribute instead.
                            lat_name = str(main_var.coordinates.split()[1])
                        else:
                            lat_name = i
                        lt = len(f.dimensions[i])
                for i in grids[mt]['lon']:
                    if i in v_dims:
                        if 'nlon' in i:
                            lon_name = str(main_var.coordinates.split()[0])
                            if 'ULONG' in lon_name:
                                ln = str(len(f.dimensions[i])) + "_UGRID"
                            else:
                                ln = str(len(f.dimensions[i])) + "_TGRID"
                        else:
                            lon_name = i
                            ln = len(f.dimensions[i])
                for i in grids[mt]['lev']:
                    if i in v_dims:
                        lev_name = i
                for i in grids[mt]['time']:
                    if i in v_dims:
                        time_name = i
                gridfile = '{0}/{1}x{2}x{3}.nc'.format(extra_dir, mt, lt, ln)

                has_freq = hasattr(f, "time_period_freq")
                for vn in f.variables:
                    by_freq = variablelist.setdefault(model_type, {}).setdefault(vn, {})
                    if has_freq:
                        date = fn.split('.')[-2]
                        slot = by_freq.setdefault(f.time_period_freq, {}).setdefault(date, {})
                        if 'files' in slot:
                            # The first file seen for this key is kept.
                            continue
                    else:
                        # No frequency attribute: file is recorded under
                        # "unknown"/"unknown", later files overwrite earlier.
                        slot = by_freq.setdefault("unknown", {}).setdefault("unknown", {})
                    # A fresh list per variable: rank 0 appends to it later.
                    slot['files'] = [fn, gridfile]
                    slot['lat'] = lat_name
                    slot['lon'] = lon_name
                    slot['lev'] = lev_name
                    slot['time'] = time_name
        finally:
            f.close()

    VL_TAG = 30
    if size > 1:
        if rank == 0:
            variable_list = variablelist
            # One message is expected from every other rank.
            for _ in range(size - 1):
                _sender, lvarList = comm.collect(data=None, tag=VL_TAG)
                for r_model, r_vars in lvarList.items():
                    m_vars = variable_list.setdefault(r_model, {})
                    for r_vn, r_freqs in r_vars.items():
                        m_freqs = m_vars.setdefault(r_vn, {})
                        for r_tp, r_dates in r_freqs.items():
                            m_dates = m_freqs.setdefault(r_tp, {})
                            for r_date, incoming in r_dates.items():
                                known = m_dates.setdefault(r_date, {})
                                if 'files' in known:
                                    if len(incoming['files']) > 0:
                                        known['files'].append(incoming['files'][0])
                                else:
                                    m_dates[r_date] = incoming
            comm.partition(variable_list, func=partition.Duplicate(), involved=True)
        else:
            comm.collect(data=variablelist, tag=VL_TAG)
            variable_list = comm.partition(func=partition.Duplicate(), involved=True)
    else:
        # Single rank: nothing to gather, the local catalogue is the result.
        variable_list = variablelist
    comm.sync()
    return variable_list
def get_input_dates(glob_str, comm, rank, size):
    '''
    Open up all of the files that match the search string and get
    the dates within the files.  Also get the number of slices
    within each file, what calendar it uses and the time unit.

    The matching files are split between ranks with
    partition.EqualLength. Rank 0 gathers the per-rank results, merges them
    and sends the merged dictionaries back to every rank. With size == 1
    the locally collected dictionaries are used directly.

    Input:
    glob_str(string) - the search path to get files
    comm - communicator providing partition/collect/sync
    rank(int) - rank of this process; rank 0 performs the merge
    size(int) - number of ranks

    Output:
    stream_dates(dictionary) - keys->date, values->the file where this slice is located
    file_slices(dictionary) - keys->filename, values->the number of slices found in the file
    calendar(string) - the name of the calendar type (ie, noleap, ...), lower-cased;
                       "noleap" when the time variable has no calendar attribute
    units(string) - the calendar unit (possibly in the form 'days since....');
                    "days since 0000-01-01 00:00:00" when not present
    time_period_freq(string) - time_period_freq global attribute from the first file
                               read by this rank, or None when it is not present

    When no file matches glob_str, ({}, {}, None, None, None) is returned.
    '''
    stream_files = glob.glob(glob_str)
    stream_dates = {}
    file_slices = {}
    att = {}

    if not stream_files:
        return stream_dates, file_slices, None, None, None

    time_period_freq = None
    first = True
    stream_files_l = comm.partition(stream_files, func=partition.EqualLength(), involved=True)
    for fn in sorted(stream_files_l):
        print('{0} / {1}  opening  {2}'.format(rank, size, fn))
        # open file and get time dimension
        f = nc.Dataset(fn, "r")
        try:
            all_t = f.variables['time']

            # add the file name and how many slices it contains
            file_slices[fn] = len(all_t)

            # add all dates and which file they are located in
            for t in all_t[:]:
                stream_dates[t] = fn

            # get all attributes of time in order to get cal and units
            for a in all_t.ncattrs():
                att[a] = getattr(all_t, a)

            # get the time_period_freq global attribute from the first file
            if first:
                first = False
                try:
                    time_period_freq = f.getncattr('time_period_freq')
                except Exception:
                    # Best effort: a missing attribute is reported, not fatal.
                    print('Global attribute time_period_freq not found - set to XML tseries_tper element')
                else:
                    print('time_period_freq =  {0}'.format(time_period_freq))
        finally:
            f.close()

    if size > 1:
        T1 = 31
        T2 = 32
        T3 = 33
        if rank == 0:
            g_stream_dates = stream_dates
            g_file_slices = file_slices
            g_att = att
            # Three messages (dates, slices, attributes) from every other rank.
            for _ in range(size - 1):
                _sender, l_stream_dates = comm.collect(data=None, tag=T1)
                g_stream_dates.update(l_stream_dates)
                _sender, l_file_slices = comm.collect(data=None, tag=T2)
                g_file_slices.update(l_file_slices)
                _sender, l_att = comm.collect(data=None, tag=T3)
                g_att.update(l_att)
            comm.partition(g_stream_dates, func=partition.Duplicate(), involved=True)
            comm.partition(g_file_slices, func=partition.Duplicate(), involved=True)
            comm.partition(g_att, func=partition.Duplicate(), involved=True)
        else:
            comm.collect(data=stream_dates, tag=T1)
            comm.collect(data=file_slices, tag=T2)
            comm.collect(data=att, tag=T3)
            g_stream_dates = comm.partition(func=partition.Duplicate(), involved=True)
            g_file_slices = comm.partition(func=partition.Duplicate(), involved=True)
            g_att = comm.partition(func=partition.Duplicate(), involved=True)
    else:
        # Single rank: nothing to gather. Previously g_att was never bound
        # here and the collected dates/slices were dropped.
        g_stream_dates = stream_dates
        g_file_slices = file_slices
        g_att = att

    calendar = g_att['calendar'] if 'calendar' in g_att else "noleap"
    units = g_att['units'] if 'units' in g_att else "days since 0000-01-01 00:00:00"

    comm.sync()
    return g_stream_dates, g_file_slices, calendar.lower(), units, time_period_freq
def fill_list(nc_files, root_dir, extra_dir, comm, rank, size):
    """Catalogue the variables found in ``nc_files`` and share the result with every rank.

    ``<extra_dir>/ocn_constants.nc`` is scanned in addition to ``nc_files``.
    It is added to a private copy of the list, so the caller's list is no
    longer modified. The files are split between ranks with
    ``partition.EqualLength``. Each rank scans its own files, rank 0 merges
    the per-rank catalogues and sends the merged catalogue back to all ranks.

    The returned dictionary is nested as::

        {model_type: {variable: {time_period_freq: {date: {
            'files': [path, gridfile, ...],
            'lat': ..., 'lon': ..., 'lev': ..., 'time': ...}}}}}

    Files without a ``time`` variable, or without ``"tseries"`` in their
    path, only add an empty ``"skip"`` entry. A ``time_period_freq``
    containing ``'day_365'`` is stored as ``'year_1'``. Files without a
    ``time_period_freq`` global attribute are stored under the keys
    ``"unknown"``/``"unknown"``. The original code used unbound or stale
    keys in that case.

    @param nc_files   list of netCDF file paths to scan (not modified)
    @param root_dir   prefix removed from each path before the model type is
                      taken from the fifth-from-last path component
    @param extra_dir  directory holding ocn_constants.nc and the grid files
    @param comm       communicator providing partition/collect/sync
    @param rank       rank of this process; rank 0 performs the merge
    @param size       number of ranks; with ``size == 1`` the locally built
                      catalogue is returned directly
    @return           the merged catalogue described above
    """
    # Fallback grid names, used when no lat/lon dimension size was found.
    grds = {
        'atm': '192x288',
        'lnd': '192x288',
        'glc': '192x288',
        'rof': '192x288',
        'ice': '384x320',
        'ocn': '384x320'
    }
    variablelist = {}
    all_files = list(nc_files)
    all_files.append(extra_dir + "/ocn_constants.nc")
    nc_files_l = comm.partition(all_files, func=partition.EqualLength(), involved=True)
    for fn in nc_files_l:
        f = nc.Dataset(fn, "r")
        try:
            if "ocn_constants" in fn:
                # The constants file does not follow the directory layout the
                # [-5] lookup relies on, so its model type is fixed here.
                mt = "ocn"
            else:
                mt = fn.replace(root_dir, "").split("/")[-5]
            model_type = mt
            if "lnd" in model_type or "rof" in model_type:
                model_type = 'lnd,rof'
            if "glc" in model_type:
                model_type = 'glc,lnd'

            if "time" not in f.variables or "tseries" not in fn:
                variablelist["skip"] = {}
            else:
                lt = "none"
                ln = "none"
                lv = "none"
                lat_name = None
                lon_name = None
                lev_name = None
                time_name = None

                # The variable named by the third-from-last dotted component
                # of the path decides which dimension names are in use.
                main_var = f.variables[fn.split('.')[-3]]
                v_dims = main_var.dimensions
                for i in grids[mt]['lat']:
                    if i in v_dims:
                        if 'nlat' in i or 'nj' in i:
                            # Index dimension: take the coordinate variable
                            # name from the 'coordinates' attribute instead.
                            lat_name = str(main_var.coordinates.split()[1])
                        else:
                            lat_name = i
                        lt = len(f.dimensions[i])
                for i in grids[mt]['lon']:
                    if i in v_dims:
                        if 'nlon' in i or 'ni' in i:
                            lon_name = str(main_var.coordinates.split()[0])
                            if 'ULON' in lon_name:
                                ln = str(len(f.dimensions[i])) + "_UGRID"
                            else:
                                ln = str(len(f.dimensions[i])) + "_TGRID"
                        else:
                            lon_name = i
                            ln = len(f.dimensions[i])
                for i in grids[mt]['lev']:
                    if i in v_dims:
                        lev_name = i
                        lv = len(f.dimensions[i])

                if 'none' == lt or 'none' == ln:
                    gridfile = '{0}/{1}x{2}.nc'.format(extra_dir, mt, grds[mt])
                elif 'atm' in mt:
                    gridfile = '{0}/{1}x{2}x{3}x{4}.nc'.format(extra_dir, mt, lt, ln, lv)
                else:
                    gridfile = '{0}/{1}x{2}x{3}.nc'.format(extra_dir, mt, lt, ln)
                # A grid file that does not exist on disk is recorded as None.
                if not os.path.isfile(gridfile):
                    gridfile = None

                has_freq = hasattr(f, "time_period_freq")
                if has_freq:
                    if 'day_365' in f.time_period_freq:
                        time_period_freq = 'year_1'
                    else:
                        time_period_freq = f.time_period_freq
                    if 'ocn_constants' in fn:
                        date = "0000"
                    else:
                        date = fn.split('.')[-2]
                else:
                    time_period_freq = "unknown"
                    date = "unknown"

                for vn in f.variables:
                    by_freq = variablelist.setdefault(model_type, {}).setdefault(vn, {})
                    slot = by_freq.setdefault(time_period_freq, {}).setdefault(date, {})
                    if has_freq and 'files' in slot:
                        # The first file seen for this key is kept. Entries
                        # under "unknown" are overwritten by later files.
                        continue
                    # A fresh list per variable: rank 0 appends to it later.
                    slot['files'] = [fn, gridfile]
                    slot['lat'] = lat_name
                    slot['lon'] = lon_name
                    slot['lev'] = lev_name
                    slot['time'] = time_name
        finally:
            f.close()

    VL_TAG = 30
    if size > 1:
        if rank == 0:
            variable_list = variablelist
            # One message is expected from every other rank.
            for _ in range(size - 1):
                _sender, lvarList = comm.collect(data=None, tag=VL_TAG)
                for r_model, r_vars in lvarList.items():
                    m_vars = variable_list.setdefault(r_model, {})
                    for r_vn, r_freqs in r_vars.items():
                        m_freqs = m_vars.setdefault(r_vn, {})
                        for r_tp, r_dates in r_freqs.items():
                            m_dates = m_freqs.setdefault(r_tp, {})
                            for r_date, incoming in r_dates.items():
                                known = m_dates.setdefault(r_date, {})
                                if 'files' in known:
                                    if len(incoming['files']) > 0:
                                        if incoming['files'][0] is not None:
                                            known['files'].append(incoming['files'][0])
                                else:
                                    m_dates[r_date] = incoming
            comm.partition(variable_list, func=partition.Duplicate(), involved=True)
        else:
            comm.collect(data=variablelist, tag=VL_TAG)
            variable_list = comm.partition(func=partition.Duplicate(), involved=True)
    else:
        # Single rank: nothing to gather, the local catalogue is the result.
        variable_list = variablelist
    comm.sync()
    return variable_list