示例#1
0
def preprocess_data(fname, input_files, grid_def_file):
    """
    Create an HDF5 file holding the preprocessed data.

    The following datasets are created, where t is the total number of time
    points over all input files and z, y, x are the lengths of the
    'st_ocean', 'yt_ocean' and 'xt_ocean' dimensions of the first input file:

    - time (t): time variables of all input files, concatenated in the
      order of input_files,
    - nh_sst_average (t), amoc_index_sst (t), amoc_psi_max (t),
      amoc_psi_max_at_26n (t),
    - nh_sst (t, y, x),
    - amoc_psi (t, z, y),
    - atlantic_mask (z, y, x),
    - amoc_psi_mask (z, y).

    The time-dependent datasets are filled from the results of a pool of
    collect_data worker processes. Each result is a pair
    (input_file, [(dataset_name, array), ...]) and is written at the global
    time offset that belongs to input_file.

    Args:
        fname: path of the HDF5 file to write (opened in append mode).
        input_files: sequence of input netCDF file paths.
        grid_def_file: grid definition file, passed through to the workers.

    Raises:
        ValueError: if the input files contain no time points, or if a
            worker returns an array that is not 1, 2 or 3 dimensional.
    """

    print('master with pid {}'.format(os.getpid()))
    print('Preprocessing data ...', end='')
    start_time = time.time()

    # Get time dim for all input files
    pool = mp.Pool()
    try:
        tv = pool.map(get_time_var, input_files)
    finally:
        pool.close()
        pool.join()
    tv_sizes = [a.shape[0] for a in tv]
    tv_total = sum(tv_sizes)
    if tv_total <= 0:
        raise ValueError('No time points found in the input files')

    # Half-open [start, stop) range of global time indices for each input
    # file. Offsets (rather than index lists) also work for a file that
    # contributes zero time points.
    stops = np.cumsum(tv_sizes)
    spans = {}
    for path, size, stop in zip(input_files, tv_sizes, stops):
        spans[path] = (int(stop) - size, int(stop))

    # Now collect data. This will be done by a pool of workers.
    # Floor division keeps the worker count an int on Python 3 as well.
    num_workers = max(1, mp.cpu_count() // 2)
    job_queue = mp.Queue(len(input_files))
    results_queue = mp.Queue(num_workers)

    workers = []
    try:
        # Start workers
        for _ in range(num_workers):
            worker_args = (job_queue, results_queue, grid_def_file)
            proc = mp.Process(target=collect_data, args=worker_args)
            proc.start()
            workers.append(proc)

        # Put all jobs in the queue
        for path in input_files:
            job_queue.put(path)

        # Read what is needed to set up the output file
        with nc.Dataset(input_files[0]) as inf:
            z = len(inf.dimensions['st_ocean'])
            y = len(inf.dimensions['yt_ocean'])
            x = len(inf.dimensions['xt_ocean'])
            ty_trans = inf.variables['ty_trans'][0, :]
            lats = inf.variables['geolat_t'][:]
            lons = inf.variables['geolon_t'][:]

        # Explicit append mode: newer h5py versions open read-only when no
        # mode is given, which would make create_dataset fail.
        with h5py.File(fname, 'a') as f:
            for name in ('time', 'nh_sst_average'):
                f.create_dataset(name, (tv_total,), dtype='f', chunks=True)
            f.create_dataset('nh_sst', (tv_total, y, x), dtype='f',
                             chunks=True)
            f.create_dataset('amoc_index_sst', (tv_total,), dtype='f',
                             chunks=True)
            f.create_dataset('amoc_psi', (tv_total, z, y), dtype='f',
                             chunks=True)
            for name in ('amoc_psi_max', 'amoc_psi_max_at_26n'):
                f.create_dataset(name, (tv_total,), dtype='f', chunks=True)
            f.create_dataset('atlantic_mask', (z, y, x), dtype='i',
                             chunks=True)
            f.create_dataset('amoc_psi_mask', (z, y), dtype='i', chunks=True)

            f['time'][:] = np.concatenate(tv)
            f['atlantic_mask'][:] = get_atlantic_mask(ty_trans.mask, lons,
                                                      lats)
            f['amoc_psi_mask'][:] = calc_atlantic_moc(ty_trans, lons,
                                                      lats).mask
            f.flush()

            # Get data from workers, load into file at the correct index.
            # Results arrive in arbitrary order, one per input file.
            for _ in input_files:
                input_file, result_arrays = results_queue.get()
                start, stop = spans[input_file]

                for name, data in result_arrays:
                    if len(data.shape) not in (1, 2, 3):
                        raise ValueError(
                            'Unsupported rank {} for dataset {!r} from '
                            '{}'.format(len(data.shape), name, input_file))
                    # Slicing the leading (time) axis selects all of the
                    # trailing axes.
                    f[name][start:stop] = data
                print('.', end='')
                sys.stdout.flush()
    finally:
        # Terminate workers, also on failure, so that no worker process is
        # left blocking on the job queue.
        job_queue.close()
        results_queue.close()
        for w in workers:
            w.terminate()

        for w in workers:
            w.join()

    print(' finished in {} seconds'.format(time.time() - start_time))
示例#2
0
def visit_data_file(args):
    """
    Visit a data file and collect/calculate AMOC and SST diagnostics.

    Args:
        args: a 4-tuple (file, grid_def_file, do_depth_correlation_plot,
            do_surface_correlation_plot). file is the netCDF data file to
            read, grid_def_file provides the 'area_t' cell areas used as
            averaging weights, and the two flags select the optional array
            outputs. NOTE(review): presumably packed into one argument so
            the function can be mapped over a pool - confirm with callers.

    Returns:
        A 5-tuple, in this order:

        0. AMOC index timeseries (pandas Series): area-weighted mean
           surface temperature over the AMOC index region minus the
           area-weighted NH mean surface temperature,
        1. AMOC maximum timeseries (pandas Series),
        2. AMOC mean timeseries (pandas Series),
        3. AMOC psi timeseries as a masked array, shape (t, depth, lat), or
           None if do_depth_correlation_plot is false,
        4. difference between surface temperature and the NH mean surface
           temperature, masked with the surface level of the Atlantic
           mask, shape (t, lat, lon), or None if
           do_surface_correlation_plot is false.
    """

    data_file, grid_def_file, do_depth_correlation_plot, \
        do_surface_correlation_plot = args

    # Try with annual data.
    use_annual = False
    tmp_file = None
    if use_annual:
        _, tmp_file = run_ncra(data_file, ['geolon_t', 'geolat_t', 'time',
                                           'temp', 'ty_trans'])
        f = nc.Dataset(tmp_file)
    else:
        f = nc.Dataset(data_file)

    # try/finally so the dataset (and the temporary file, if any) is
    # released even when a calculation below fails.
    try:
        lons = f.variables['geolon_t']
        lats = f.variables['geolat_t']
        time_var = f.variables['time']
        temp_var = f.variables['temp']
        ty_trans = f.variables['ty_trans']
        # NOTE(review): cumulative sum of the 'st_ocean' values - confirm
        # that this variable holds layer thicknesses rather than depths.
        depths = np.cumsum(f.variables['st_ocean'][:])
        t_dim = len(f.dimensions['time'])
        z_dim = len(f.dimensions['st_ocean'])
        x_dim = len(f.dimensions['xt_ocean'])
        y_dim = len(f.dimensions['yt_ocean'])

        with nc.Dataset(grid_def_file) as gf:
            areas = gf.variables['area_t'][:]

        nh_mask = get_nh_mask(temp_var[0, 0, :, :].mask, lats)
        atlantic_mask = get_atlantic_mask(ty_trans[0, :, :].mask, lons, lats)

        lat_start, lat_end, lon_start, \
            lon_end = get_indices_for_amoc_idx_region(lons, lats)
        region_areas = areas[lat_start:lat_end, lon_start:lon_end]

        amoc_max_ts = []
        amoc_mean_ts = []
        amoc_idx_ts = []
        # Only allocate the (potentially large) optional outputs on request.
        amoc_psi_ts = None
        if do_depth_correlation_plot:
            amoc_psi_ts = np.ma.zeros((t_dim, z_dim, y_dim))
        sst_nh_diff_ts = None
        if do_surface_correlation_plot:
            sst_nh_diff_ts = np.ma.zeros((t_dim, y_dim, x_dim))

        for t in range(time_var.shape[0]):
            # Read the surface temperature once per time step.
            sst = temp_var[t, 0, :, :]

            # Get surface temp spatial mean in the NH
            nh_sst_mean = np.ma.average(np.ma.masked_array(sst, mask=nh_mask),
                                        weights=areas)

            # Get AMOC max and mean.
            amoc_psi = calc_atlantic_moc(ty_trans[t, :, :, :], lons, lats)
            # Mask-aware average: plain np.average would keep the weights of
            # masked cells in the normalisation.
            amoc_idx_sst = np.ma.average(
                sst[lat_start:lat_end, lon_start:lon_end],
                weights=region_areas)
            # Calculate the AMOC index
            amoc_idx_ts.append(amoc_idx_sst - nh_sst_mean)
            amoc_max_ts.append(max_within_region(amoc_psi, 500.0, 35.0,
                                                 depths, lats))
            amoc_mean_ts.append(np.mean(amoc_psi))

            # Get AMOC psi timeseries
            if do_depth_correlation_plot:
                amoc_psi_ts[t, :, :] = amoc_psi[:, :]

            # Get the surface difference between temp and NH SST mean
            if do_surface_correlation_plot:
                sst_nh_diff_ts[t, :, :] = \
                    np.ma.masked_array(sst - nh_sst_mean,
                                       mask=atlantic_mask[0, :, :])

        # Add time dim to pandas timeseries
        periods = time_dim_to_pandas_periods(time_var)
        if use_annual:
            # Floor division: a float is not a valid index on Python 3.
            periods = [periods[len(periods) // 2]]
        amoc_idx_ts = pd.Series(amoc_idx_ts, periods)
        amoc_max_ts = pd.Series(amoc_max_ts, periods)
        amoc_mean_ts = pd.Series(amoc_mean_ts, periods)
    finally:
        f.close()
        if tmp_file is not None:
            os.remove(tmp_file)

    print('^', end='')

    return (amoc_idx_ts, amoc_max_ts, amoc_mean_ts, amoc_psi_ts,
            sst_nh_diff_ts)