def preprocess_data(fname, input_files, grid_def_file):
    """
    Create a file for the preprocessed data.

    It will contain:
        1) timeseries of the max AMOC psi (1d),
        2) timeseries of the AMOC index (1d),
        3) timeseries of AMOC psi (3d),
        4) timeseries of SST diff to NH mean (3d)

    The time axis of every input file is read first (in a process pool) so
    that the output datasets can be allocated at their final size. A set of
    worker processes running ``collect_data`` then processes the input
    files; each result is written into the rows of the output datasets that
    belong to its input file, so results may arrive in any order.

    Args:
        fname: path of the HDF5 output file.
        input_files: sequence of input netCDF file paths. The first one is
            also used to read the grid dimensions and to build the masks.
        grid_def_file: grid definition file, passed through to the workers.

    Raises:
        AssertionError: if the input files contain no time steps at all.
        ValueError: if a worker returns an array that is not 1, 2 or 3
            dimensional.
    """

    print('master with pid {}'.format(os.getpid()))
    print('Preprocessing data ...', end='')
    start_time = time.time()

    # Get time dim for all input files
    pool = mp.Pool()
    try:
        tv = pool.map(get_time_var, input_files)
    finally:
        pool.close()
        pool.join()

    tv_sizes = [a.shape[0] for a in tv]
    tv_total = sum(tv_sizes)
    assert tv_total > 0

    # Now collect data. This will be done by a pool of workers.
    # Floor division: the worker count is used as a range() bound and as a
    # queue size, so it has to be an int.
    num_workers = max(1, mp.cpu_count() // 2)
    job_queue = mp.Queue(len(input_files))
    results_queue = mp.Queue(num_workers)

    workers = []
    out = None
    try:
        # Start workers
        for _ in range(num_workers):
            worker = mp.Process(target=collect_data,
                                args=(job_queue, results_queue,
                                      grid_def_file))
            workers.append(worker)
            worker.start()

        # Put all jobs in the queue
        for input_file in input_files:
            job_queue.put(input_file)

        # Setup output file
        with nc.Dataset(input_files[0]) as inf:
            z = len(inf.dimensions['st_ocean'])
            y = len(inf.dimensions['yt_ocean'])
            x = len(inf.dimensions['xt_ocean'])

            ty_trans = inf.variables['ty_trans'][0, :]
            lats = inf.variables['geolat_t'][:]
            lons = inf.variables['geolon_t'][:]

        # Open explicitly in append mode: that was the implicit default of
        # older h5py releases, newer ones default to read-only.
        out = h5py.File(fname, 'a')
        out.create_dataset('time', (tv_total,), dtype='f', chunks=True)
        out.create_dataset('nh_sst_average', (tv_total,), dtype='f',
                           chunks=True)
        out.create_dataset('nh_sst', (tv_total, y, x), dtype='f',
                           chunks=True)
        out.create_dataset('amoc_index_sst', (tv_total,), dtype='f',
                           chunks=True)
        out.create_dataset('amoc_psi', (tv_total, z, y), dtype='f',
                           chunks=True)
        out.create_dataset('amoc_psi_max', (tv_total,), dtype='f',
                           chunks=True)
        out.create_dataset('amoc_psi_max_at_26n', (tv_total,), dtype='f',
                           chunks=True)
        out.create_dataset('atlantic_mask', (z, y, x), dtype='i',
                           chunks=True)
        out.create_dataset('amoc_psi_mask', (z, y), dtype='i', chunks=True)

        out['time'][:] = np.concatenate(tv)
        out['atlantic_mask'][:] = get_atlantic_mask(ty_trans.mask, lons,
                                                    lats)
        out['amoc_psi_mask'][:] = calc_atlantic_moc(ty_trans, lons,
                                                    lats).mask
        out.flush()

        # A dictionary containing the global [start, stop) time range for
        # each input file. Using offsets rather than explicit index arrays
        # keeps this well defined for a file without any time steps.
        offsets = [0]
        for size in tv_sizes:
            offsets.append(offsets[-1] + size)
        bounds = dict(zip(input_files, zip(offsets[:-1], offsets[1:])))

        # Get data from workers load into file at the correct index.
        for _ in input_files:
            input_file, result_arrays = results_queue.get()
            start, stop = bounds[input_file]

            # Nothing to store for a file without time steps.
            if stop > start:
                for name, data in result_arrays:
                    if data.ndim not in (1, 2, 3):
                        raise ValueError(
                            'Unexpected rank {} for result {!r} from {}'
                            .format(data.ndim, name, input_file))
                    # Only the leading (time) axis is sliced, the
                    # remaining axes are written in full.
                    out[name][start:stop, ...] = data

            print('.', end='')
            sys.stdout.flush()
    finally:
        # Terminate workers. This also runs on failure, since the workers
        # are not daemonic and would otherwise keep the master from exiting.
        job_queue.close()
        results_queue.close()
        for worker in workers:
            worker.terminate()
        for worker in workers:
            worker.join()

        if out is not None:
            out.close()

    print(' finished in {} seconds'.format(time.time() - start_time))
def visit_data_file(args):
    """
    Visit a data file and collect/calculate a set of AMOC diagnostics.

    Args:
        args: a 4-tuple ``(file, grid_def_file, do_depth_correlation_plot,
            do_surface_correlation_plot)``. It is a single argument so the
            function can be used with ``map``-style interfaces.

    Returns:
        A 5-tuple, in this order:
            0. AMOC index timeseries: area weighted SST of the AMOC index
               region minus the NH area weighted mean SST (pandas Series),
            1. AMOC maximum timeseries (pandas Series),
            2. AMOC mean timeseries (pandas Series),
            3. AMOC psi timeseries as a masked array, shape
               (t, depth, lat), or None if do_depth_correlation_plot is
               false,
            4. difference between SST and the NH spatial mean SST as a
               masked array, shape (t, lat, lon), or None if
               do_surface_correlation_plot is false.
    """

    data_file, grid_def_file, do_depth_correlation_plot, \
        do_surface_correlation_plot = args

    # Try with annual data.
    use_annual = False

    tmp_file = None
    try:
        if use_annual:
            _, tmp_file = run_ncra(data_file, ['geolon_t', 'geolat_t',
                                               'time', 'temp', 'ty_trans'])
            path = tmp_file
        else:
            path = data_file

        with nc.Dataset(path) as f:
            lons = f.variables['geolon_t']
            lats = f.variables['geolat_t']
            time_var = f.variables['time']
            temp_var = f.variables['temp']
            ty_trans = f.variables['ty_trans']
            depths = np.cumsum(f.variables['st_ocean'][:])

            t_dim = len(f.dimensions['time'])
            z_dim = len(f.dimensions['st_ocean'])
            x_dim = len(f.dimensions['xt_ocean'])
            y_dim = len(f.dimensions['yt_ocean'])

            with nc.Dataset(grid_def_file) as gf:
                areas = gf.variables['area_t'][:]

            nh_mask = get_nh_mask(temp_var[0, 0, :, :].mask, lats)
            atlantic_mask = get_atlantic_mask(ty_trans[0, :, :].mask,
                                              lons, lats)
            lat_start, lat_end, lon_start, \
                lon_end = get_indices_for_amoc_idx_region(lons, lats)
            region_areas = areas[lat_start:lat_end, lon_start:lon_end]

            amoc_max_ts = []
            amoc_mean_ts = []
            amoc_idx_ts = []
            # The big arrays are only needed when the matching plot was
            # requested.
            amoc_psi_ts = None
            if do_depth_correlation_plot:
                amoc_psi_ts = np.ma.zeros((t_dim, z_dim, y_dim))
            sst_nh_diff_ts = None
            if do_surface_correlation_plot:
                sst_nh_diff_ts = np.ma.zeros((t_dim, y_dim, x_dim))

            for t in range(time_var.shape[0]):
                # Read the surface level once per time step, it is used
                # for every SST based diagnostic below.
                sst = temp_var[t, 0, :, :]

                # Get surface temp spatial mean in the NH
                nh_sst_mean = np.ma.average(
                    np.ma.masked_array(sst, mask=nh_mask), weights=areas)

                # Get AMOC max and mean.
                amoc_psi = calc_atlantic_moc(ty_trans[t, :, :, :], lons,
                                             lats)

                # Mask aware average: np.average would keep the weights of
                # masked cells in the denominator and bias the mean
                # whenever the region contains masked points.
                amoc_idx_sst = np.ma.average(
                    sst[lat_start:lat_end, lon_start:lon_end],
                    weights=region_areas)

                # Calculate the AMOC index
                amoc_idx_ts.append(amoc_idx_sst - nh_sst_mean)
                # NOTE(review): 500.0 and 35.0 are presumably a depth and a
                # latitude bound - confirm against max_within_region.
                amoc_max_ts.append(max_within_region(amoc_psi, 500.0, 35.0,
                                                     depths, lats))
                amoc_mean_ts.append(np.mean(amoc_psi))

                # Get AMOC psi timeseries
                if do_depth_correlation_plot:
                    amoc_psi_ts[t, :, :] = amoc_psi[:, :]

                # Get the surface difference between temp and NH SST mean
                if do_surface_correlation_plot:
                    sst_nh_diff_ts[t, :, :] = \
                        np.ma.masked_array(sst - nh_sst_mean,
                                           mask=atlantic_mask[0, :, :])

            # Add time dim to pandas timeseries
            periods = time_dim_to_pandas_periods(time_var)
            if use_annual:
                # Floor division, the result is used as an index.
                periods = [periods[len(periods) // 2]]

            amoc_idx_ts = pd.Series(amoc_idx_ts, periods)
            amoc_max_ts = pd.Series(amoc_max_ts, periods)
            amoc_mean_ts = pd.Series(amoc_mean_ts, periods)
    finally:
        # The annual mean is written to a temporary file that must not
        # outlive this call, whether or not processing succeeded.
        if tmp_file is not None:
            os.remove(tmp_file)

    print('^', end='')

    return (amoc_idx_ts, amoc_max_ts, amoc_mean_ts,
            amoc_psi_ts, sst_nh_diff_ts)