def plot_cats_best_prms_1d(dbs_dir, n_cpus):

    '''Plot every best kfold parameter set for all catchments.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_best_prms_1d, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_best_prms_1d(cat_paths)

    return
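# The plot_cats_* helpers in this section all repeat the same pathos fan-out
# pattern: cap the worker count at the number of catchments, restart the
# pool, drain an unordered map, then tear the pool down. A minimal,
# self-contained sketch of that pattern; the worker `_square` is a stand-in
# and not part of the original code.
from pathos.multiprocessing import ProcessPool


def _square(x):
    return x * x


def run_parallel(items, n_cpus=2):

    n_cpus = min(len(items), n_cpus)

    if (n_cpus > 1) and (len(items) > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        # uimap is unordered, so results may come back in any order.
        res = list(mp_pool.uimap(_square, items))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()
        return res

    return [_square(item) for item in items]

# e.g. run_parallel([1, 2, 3, 4]) -> [1, 4, 9, 16] (possibly reordered)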
def plot_cats_prm_vecs(dbs_dir, n_cpus):

    '''Plot the final parameter set from kfold for every catchment along
    with the objective function value distribution.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs, opt_res_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs(opt_res)

    return
def _prep_anomaly_bjs_mp(anoms_arr, bjs_arr, n_cpus, fig_out_dir):

    assert anoms_arr.shape == bjs_arr.shape

    _idxs = ret_mp_idxs(anoms_arr.shape[1], n_cpus)
    _idxs_list = [_idxs[i:i + 2] for i in range(n_cpus)]

    _anoms_gen = (
        (anoms_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
        for i in range(n_cpus))

    _bjs_gen = (
        (bjs_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
        for i in range(n_cpus))

    mp_pool = ProcessPool(n_cpus)
    mp_pool.restart(True)

    try:
        print(list(mp_pool.uimap(
            Anomaly._plot_anomaly_bjs_cdf,
            _idxs_list,
            _anoms_gen,
            _bjs_gen,
            [fig_out_dir] * n_cpus)))

        mp_pool.clear()

    except Exception as msg:
        mp_pool.close()
        mp_pool.join()
        print('Error in _plot_anomaly_bjs_cdf:', msg)

    return
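# `ret_mp_idxs` is referenced above (and mentioned again in a comment further
# down) but is not defined in this section. A plausible sketch, assuming it
# returns n_cpus + 1 chunk boundaries that split n_vals columns into roughly
# equal slices (which requires n_vals >= n_cpus):
import numpy as np


def ret_mp_idxs(n_vals, n_cpus):
    # e.g. ret_mp_idxs(10, 3) -> array([ 0,  3,  6, 10])
    return np.linspace(0, n_vals, n_cpus + 1, endpoint=True, dtype=np.int64)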
def plot_cats_kfold_effs(dbs_dir, hgs_db_path, compare_ann_cyc_flag, n_cpus):

    '''Plot the k-fold efficiency results.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (compare_ann_cyc_flag, hgs_db_path)
    cats_paths_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_kfold_effs, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_kfold_effs(cat_paths)

    return
def plot_cats_vars_errors(dbs_dir, err_var_labs, n_cpus):

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = ((cat_db, err_var_labs) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_vars_errors, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_vars_errors(cat_paths)

    return
def plot_cats_qsims(dbs_dir, n_cpus=1):

    '''Plot discharge simulations for every catchment for every kfold
    using its prm_vecs.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    plot_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_qsims, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_qsims(plot_args)

    return
def pairs_construction(seqs: List[List[Union[str, int]]],
                       window_size: int = 2,
                       drop_duplicates: bool = True,
                       n_jobs: int = 4,
                       **kwargs):
    """
    Helper function to make pairs from sequences in parallel

    Parameters
    ----------
    seqs : input sequences of nodes
    window_size : int, default is 2
    drop_duplicates : bool, default is True
        Delete pairs where both elements are the same
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool

    Returns
    -------
    List of pairs of nodes as <cur_vertex, context_vertex>
    """
    set_new_config(window_size=window_size, **kwargs)
    local_logger = logging.getLogger(f"{__name__}")

    max_processes = max(n_jobs, os.cpu_count())
    pairs_pool = ProcessPool(nodes=max_processes)
    pairs_pool.terminate()
    pairs_pool.restart()

    local_logger.info("Started making pairs from the sequences.")
    pairs = pairs_pool.map(_make_pairs, seqs)
    local_logger.info(f"Total number of raw sampled pairs is {len(pairs)}")

    if drop_duplicates:
        pairs = [item for sublist in pairs for item in sublist if item[0] != item[1]]
    else:
        pairs = [item for sublist in pairs for item in sublist]

    pairs = [item for item in pairs if (item[0] != -3) & (item[1] != -3)]

    pairs_pool.terminate()
    pairs_pool.restart()
    return pairs
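# `_make_pairs` and `set_new_config` are defined elsewhere in the module.
# A plausible, self-contained sketch of the pairing step only, assuming a
# symmetric window of `window_size` positions around each node (the real
# implementation may instead read the window from the shared CONFIG):
from typing import List, Tuple, Union


def _make_pairs(seq: List[Union[str, int]],
                window_size: int = 2) -> List[Tuple]:
    pairs = []
    for i, cur_vertex in enumerate(seq):
        lo = max(0, i - window_size)
        hi = min(len(seq), i + window_size + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((cur_vertex, seq[j]))  # <cur_vertex, context_vertex>
    return pairs

# e.g. _make_pairs(['a', 'b', 'c'], window_size=1)
# -> [('a', 'b'), ('b', 'a'), ('b', 'c'), ('c', 'b')]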
def plot_cats_hbv_sim(dbs_dir,
                      water_bal_step_size,
                      full_flag=False,
                      wat_bal_flag=False,
                      show_warm_up_steps_flag=False,
                      n_cpus=1):

    '''Plot hbv simulations for every catchment for every kfold.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (
        water_bal_step_size, full_flag, wat_bal_flag, show_warm_up_steps_flag)

    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_hbv_sim, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_hbv_sim(plot_args)

    return
def parallelize_simulations(simulation_execs: List[Callable],
                            var_dict_list: List[VarDictType],
                            states_lists: List[StatesListsType],
                            configs_structs: List[ConfigsType],
                            env_processes_list: List[EnvProcessesType],
                            Ts: List[range],
                            SimIDs,
                            Ns: List[int],
                            ExpIDs: List[int],
                            SubsetIDs,
                            SubsetWindows,
                            configured_n):

    print(f'Execution Mode: parallelized')

    params = list(
        zip(simulation_execs, var_dict_list, states_lists, configs_structs,
            env_processes_list, Ts, SimIDs, Ns, SubsetIDs, SubsetWindows))

    len_configs_structs = len(configs_structs)

    unique_runs = Counter(SimIDs)
    sim_count = max(unique_runs.values())
    highest_divisor = int(len_configs_structs / sim_count)

    new_configs_structs, new_params = [], []
    for count in range(len(params)):
        if count == 0:
            new_params.append(params[count:highest_divisor])
            new_configs_structs.append(configs_structs[count:highest_divisor])
        elif count > 0:
            new_params.append(
                params[count * highest_divisor:(count + 1) * highest_divisor])
            new_configs_structs.append(
                configs_structs[count * highest_divisor:(count + 1) * highest_divisor])

    def threaded_executor(params):
        if len_configs_structs > 1:
            tp = TPool()
            results = tp.map(
                lambda t: t[0](t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8],
                               t[9], configured_n),
                params)
            tp.close()
        else:
            t = params[0]
            results = t[0](t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8],
                           t[9], configured_n)
        return results

    pp = PPool()
    results = flatten(
        list(pp.map(lambda params: threaded_executor(params), new_params)))
    # results = flatten(list(map(lambda params: threaded_executor(params), new_params)))
    pp.close()
    pp.join()
    pp.clear()
    pp.restart()

    return results
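# `flatten` is used above but not defined in this section. A minimal sketch
# under the assumption that it merges one level of nesting from the
# per-process result lists:
from typing import Any, List


def flatten(nested: List[List[Any]]) -> List[Any]:
    return [item for sublist in nested for item in sublist]

# e.g. flatten([[1, 2], [3]]) -> [1, 2, 3]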
def threaded_contents_to_text(
        content_series,
        processes=None,
        none_content='raise',
):
    """Threaded version of content_to_text method

    It takes as input a series whose index is the uid of the products,
    and the values are the content (in the form of bytes) of the documents.
    processes argument is the number of processes to launch. If omitted,
    it defaults to the number of cpu cores on the machine.
    none_content arg can be 'raise' (default) or to_empty
    """
    processer = partial(
        PDFDecoder.content_to_text,
        none_content=none_content,
    )
    processes = processes if processes else cpu_count()
    print(f'Launching {processes} processes.')

    in_ds = content_series.apply(BytesIO)

    # Pool with context manager does not seem to work due to issue 38501 of
    # standard python library. It hangs when running tests through pytest
    # see: https://bugs.python.org/issue38501
    # Below content should be tested again whenever this issue is closed
    #
    # with Pool(nodes=processes) as pool:
    #     tuples = (list(in_ds.index),
    #               pool.map(processer, in_ds))
    #
    # End of block

    # This temporary solution should be removed when tests mentioned above
    # are successful.
    # This just closes each pool after execution or exception.
    try:
        pool = Pool(nodes=processes)
        pool.restart(force=True)
        tuples = (list(in_ds.index),
                  pool.map(processer, in_ds))
    except Exception:
        pool.close()
        raise
    pool.close()
    # End of block

    ds = pd.Series(tuples[1], index=tuples[0])

    return (ds)
def threaded_texts_to_blocks(text_series,
                             processes=None,
                             split_func=lambda x: x.split('\n\n'),
                             return_type='along_index'):
    """Threaded version of text_to_blocks_series method

    It takes as input a series whose index is the uid of the products,
    and the values are the content (in the form of bytes) of the documents.
    processes argument is the number of processes to launch. If omitted,
    it defaults to the number of cpu cores on the machine.
    As for text_to_blocks_series function, return_type can be 'along_axis'
    or 'list_like'.
    """
    processer = partial(PDFDecoder.text_to_blocks_series,
                        split_func=split_func,
                        return_type=return_type)
    processes = processes if processes else cpu_count()
    print(f'Launching {processes} processes.')

    # Pool with context manager does not seem to work due to issue 38501 of
    # standard python library. It hangs when running tests through pytest
    # see: https://bugs.python.org/issue38501
    # Below content should be tested again whenever this issue is closed
    #
    # with Pool(nodes=processes) as pool:
    #     ds_list = pool.map(processer, text_series, text_series.index)
    #
    # End of block

    # This temporary solution should be removed when tests mentioned above
    # are successful.
    # This just closes each pool after execution or exception.
    try:
        pool = Pool(nodes=processes)
        pool.restart(force=True)
        ds_list = pool.map(processer, text_series, text_series.index)
    except Exception:
        pool.close()
        raise
    pool.close()
    # End of block

    ds = pd.concat(ds_list, axis=0)

    return (ds)
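# The try/close blocks above work around https://bugs.python.org/issue38501,
# where a pathos Pool used as a context manager hangs under pytest. A hedged
# sketch of the same idea wrapped as a reusable helper; the ProcessingPool
# import is an assumption about how `Pool` is brought in elsewhere.
from contextlib import contextmanager

from pathos.multiprocessing import ProcessingPool


@contextmanager
def fresh_pool(processes):
    pool = ProcessingPool(nodes=processes)
    pool.restart(force=True)  # make sure the pool is not in a stale state
    try:
        yield pool
    finally:
        # close explicitly instead of relying on the pool's __exit__
        pool.close()

# Hypothetical usage:
# with fresh_pool(4) as pool:
#     results = pool.map(str.upper, ['a', 'b'])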
def plot_cats_prm_vecs_evo(
        dbs_dir,
        save_obj_flag,
        save_png_flag,
        save_gif_flag,
        anim_secs,
        n_cpus=1):

    '''Plot the evolution of parameter vectors and the convex hull for every
    catchment for every kfold.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = (
        (cat_db, save_obj_flag, save_png_flag, save_gif_flag, anim_secs)
        for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs_evo, opt_res_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs_evo(opt_res)

    return
def _cmpt_lim_phsrand_obj_vals(self, phs_red_rate, idxs_sclr):

    beg_tm = default_timer()

    _ = phs_red_rate
    _ = idxs_sclr

    self._sett_lim_phsrand_dir.mkdir(exist_ok=True)

    ptrb_ratios = np.linspace(
        self._sett_lim_phsrand_ptrb_lbd,
        self._sett_lim_phsrand_ptrb_ubd,
        self._sett_lim_phsrand_n_ptrb_vals,
        endpoint=True)

    ptrb_obj_vals = np.empty(
        (self._sett_lim_phsrand_n_ptrb_vals,
         self._sett_lim_phsrand_iters_per_atpt))

    n_cpus = min(self._sett_lim_phsrand_n_ptrb_vals, self._sett_misc_n_cpus)

    ubd_sclr = 1.2
    search_attempts = 0
    ress = []

    sel_stat_ftn = getattr(np, self._alg_lim_phsrand_sel_stat)

    if self._vb:
        print('Attempt,', 'Perturb ratio,', ' Minimum,', ' Mean,', ' Maximum')

    if n_cpus > 1:
        self._lock = Manager().Lock()

        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        for i in range(0, self._sett_lim_phsrand_n_ptrb_vals, n_cpus):
            end_idx = min(self._sett_lim_phsrand_n_ptrb_vals, n_cpus + i)

            assert i < end_idx, 'This was not supposed to happen!'

            search_attempts += end_idx - i

            # Don't use ret_mp_idxs, it will be inefficient.
            args_gen = ((j, ptrb_ratios[j]) for j in range(i, end_idx))

            ptrb_obj_vals_iter = (list(
                mp_pool.imap(self._cmpt_lim_phsrand_obj_vals_single, args_gen)))

            ress.extend(ptrb_obj_vals_iter)

            if np.any([
                    sel_stat_ftn(ptrb_obj_vals_iter[k][1]) >=
                    (self._sett_lim_phsrand_obj_ubd * ubd_sclr)
                    for k in range(len(ptrb_obj_vals_iter))]):
                break

        mp_pool.close()
        mp_pool.join()

        self._lock = None
        mp_pool = None

    else:
        self._lock = Lock()

        for j in range(self._sett_lim_phsrand_n_ptrb_vals):
            search_attempts += 1

            ress.append(
                self._cmpt_lim_phsrand_obj_vals_single((j, ptrb_ratios[j])))

            if (sel_stat_ftn(ress[-1][1]) >=
                    (self._sett_lim_phsrand_obj_ubd * ubd_sclr)):
                break

        self._lock = None

    take_idxs = []
    for res in ress:
        take_idxs.append(res[0])
        ptrb_obj_vals[take_idxs[-1], :] = res[1]

    take_idxs.sort()
    take_idxs = np.array(take_idxs)

    ptrb_ratios = ptrb_ratios[take_idxs]
    ptrb_obj_vals = ptrb_obj_vals[take_idxs]

    res = ress = None

    assert np.all(np.isfinite(ptrb_ratios)), (
        'Invalid values in ptrb_ratios!')

    assert np.all(ptrb_ratios >= 0), (
        'Values less than zero in ptrb_ratios!')

    assert np.all(np.isfinite(ptrb_obj_vals)), (
        'Invalid values in ptrb_obj_vals!')

    assert np.all(ptrb_obj_vals >= 0), (
        'Values less than zero in ptrb_obj_vals!')

    self._alg_lim_phsrand_ptrb_ratios = ptrb_ratios
    self._alg_lim_phsrand_ptrb_obj_vals = ptrb_obj_vals

    self._set_lim_phsrand_ptrb_ratio()
    self._plot_lim_phsrand_obj_vals()

    end_tm = default_timer()

    if self._vb:
        print(
            f'Found perturbation ratio of '
            f'{self._alg_lim_phsrand_ptrb_ratio:5.3E} in '
            f'{end_tm - beg_tm:0.1f} '
            f'seconds using {search_attempts} attempts.')

    return
def main(): main_dir = Path( r'P:\Synchronize\IWS\Testings\fourtrans_practice\multisite_phs_spec_corr' ) os.chdir(main_dir) interp_var = 'temp' ft_type = 'mag' #========================================================================== if interp_var == 'temp': # MEAN TEMPERATURE in_data_file = os.path.join(f'temperature_{ft_type}_spec_df.csv') in_vgs_file = os.path.join(r'temperature_cftns.csv') in_stns_coords_file = os.path.join(os.path.dirname(in_data_file), r'temperature_avg_coords.csv') out_dir = r'temperature_kriging' var_units = u'\u2103' # 'centigrade' var_name = 'temperature' out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc' # interpolated values # can be int, float, 'min_in'/'max_in' or None # min_var_val = 'min_in' # max_var_val = 'max_in' # min_var_val = None # max_var_val = None #========================================================================== #========================================================================== elif interp_var == 'ppt': # PRECIPITATION in_data_file = os.path.join(f'precipitation_{ft_type}_spec_df.csv') in_vgs_file = os.path.join(r'precipitation_cftns.csv') in_stns_coords_file = os.path.join(os.path.dirname(in_data_file), r'precipitation_coords.csv') out_dir = r'precipitation_kriging' var_units = 'mm' var_name = 'precipitation' out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc' # interpolated values # can be int, float, 'min_in'/'max_in' or None # min_var_val = 'min_in' # max_var_val = 'max_in' # min_var_val = None # max_var_val = None #========================================================================== else: raise ValueError(f'Invalid value for interp_var: {interp_var}!') out_krig_net_cdf_file = out_krig_net_cdf_file # assuming in_drift_raster and in_stns_coords_file and in_bounds_shp_file # have the same coordinates system # assuming in_drift_rasters_list have the same cell sizes, bounds and NDVs # basically they are copies of each other except for the drift values in_drift_rasters_list = ([ r'P:\Synchronize\IWS\QGIS_Neckar\raster\lower_de_gauss_z3_1km.tif' ]) # in_bounds_shp_file = ( # os.path.join(r'P:\Synchronize\IWS\QGIS_Neckar\raster', # r'taudem_out_spate_rockenau\watersheds.shp')) in_bounds_shp_file = (os.path.join( r'P:\Synchronize\IWS\QGIS_Neckar\raster\taudem_out_spate_rockenau\watersheds.shp' )) align_ras_file = in_drift_rasters_list[0] out_figs_dir = os.path.join(out_dir, 'krige_figs') x_coords_lab = 'X' y_coords_lab = 'Y' time_dim_lab = 'freq' nc_mode = 'w' # min_ppt_thresh = 1.0 idw_exp = 5 n_cpus = 1 buffer_dist = 20e3 sec_buffer_dist = 2e3 in_sep = str(';') ord_krige_flag = True sim_krige_flag = True edk_krige_flag = True idw_flag = True plot_figs_flag = True # ord_krige_flag = False sim_krige_flag = False edk_krige_flag = False idw_flag = False plot_figs_flag = False os.chdir(main_dir) if not os.path.exists(out_dir): os.mkdir(out_dir) if (not os.path.exists(out_figs_dir)) and plot_figs_flag: os.mkdir(out_figs_dir) # print('min_var_val:', min_var_val) # print('max_var_val:', max_var_val) print('idw_exp:', idw_exp) print('n_cpus:', n_cpus) print('nc_mode:', nc_mode) print('var_name:', var_name) print('out_dir:', out_dir) print('in_bounds_shp_file:', in_bounds_shp_file) print('out_krig_net_cdf_file:', out_krig_net_cdf_file) assert any([ord_krige_flag, sim_krige_flag, edk_krige_flag, idw_flag]) #========================================================================== # read the data frames #========================================================================== in_data_df = pd.read_csv(in_data_file, sep=in_sep, 
index_col=0, encoding='utf-8') in_vgs_df = pd.read_csv(in_vgs_file, sep=in_sep, index_col=0, encoding='utf-8') in_stns_coords_df = pd.read_csv(in_stns_coords_file, sep=in_sep, index_col=0, encoding='utf-8') all_stns = in_data_df.columns.intersection(in_stns_coords_df.index) assert all_stns.shape[0] in_data_df = in_data_df.loc[:, all_stns] in_stns_coords_df = in_stns_coords_df.loc[all_stns, :] #========================================================================== # Get stations that are around/in the bounds_shp only #========================================================================== bds_vec = ogr.Open(in_bounds_shp_file) assert bds_vec bds_lyr = bds_vec.GetLayer(0) feat_buffs_list = [] feat_sec_buffs_list = [] for feat in bds_lyr: # just to get the names of the catchments geom = feat.GetGeometryRef().Clone() assert geom feat_buffs_list.append(geom.Buffer(buffer_dist)) feat_sec_buffs_list.append(geom.Buffer(sec_buffer_dist)) bds_vec.Destroy() assert feat_buffs_list and feat_sec_buffs_list print(len(feat_buffs_list), 'polygons in the in_bounds_shp_file...') fin_stns = [] for poly in feat_buffs_list: for stn in all_stns: if stn in fin_stns: continue curr_pt = cnvt_to_pt(*in_stns_coords_df.loc[stn, ['X', 'Y']].values) if chk_cntmt(curr_pt, poly): fin_stns.append(stn) assert fin_stns print('%d stations out of %d within buffer zone of in_bounds_shp_file' % (len(fin_stns), in_stns_coords_df.shape[0])) fin_stns = np.unique(fin_stns) in_data_df = in_data_df.loc[:, fin_stns] in_stns_coords_df = in_stns_coords_df.loc[fin_stns, :] #========================================================================== # Read the DEM #========================================================================== # if edk_krige_flag: # in_drift_arr_list = [] # _rows_list = [] # _cols_list = [] # # for in_drift_raster in in_drift_rasters_list: # in_drift_ds = gdal.Open(in_drift_raster) # # assert in_drift_ds, 'GDAL cannot open %s' % in_drift_raster # # drift_rows = in_drift_ds.RasterYSize # drift_cols = in_drift_ds.RasterXSize # # drift_geotransform = in_drift_ds.GetGeoTransform() # # _drift_x_min = drift_geotransform[0] # _drift_y_max = drift_geotransform[3] # # drift_band = in_drift_ds.GetRasterBand(1) # drift_ndv = drift_band.GetNoDataValue() # # cell_width = drift_geotransform[1] # cell_height = abs(drift_geotransform[5]) # # _drift_x_max = _drift_x_min + (drift_cols * cell_width) # _drift_y_min = _drift_y_max - (drift_rows * cell_height) # # _arr = in_drift_ds.ReadAsArray() # # in_drift_arr_list.append(_arr) # _rows_list.append(_arr.shape[0]) # _cols_list.append(_arr.shape[1]) # # assert all(_ == _rows_list[0] for _ in _rows_list), ( # 'Drift raster have unequal number of rows!') # # assert all(_ == _cols_list[0] for _ in _cols_list), ( # 'Drift raster have unequal number of columns!') #========================================================================== # Read the bounding shapefile #========================================================================== # sf = shp.Reader(in_bounds_shp_file) # polys_list = [i.__geo_interface__ for i in sf.iterShapes()] ((fin_x_min, fin_x_max, fin_y_min, fin_y_max), cell_width) = get_aligned_shp_bds_and_cell_size(in_bounds_shp_file, align_ras_file) cell_height = cell_width fin_x_min -= 2 * cell_width fin_x_max += 2 * cell_width fin_y_min -= 2 * cell_height fin_y_max += 2 * cell_height # if edk_krige_flag: # assert fin_x_min > _drift_x_min # assert fin_x_max < _drift_x_max # assert fin_y_min > _drift_y_min # assert fin_y_max < _drift_y_max # # min_col = 
int(max(0, (fin_x_min - _drift_x_min) / cell_width)) # max_col = int(ceil((fin_x_max - _drift_x_min) / cell_width)) # # min_row = int(max(0, (_drift_y_max - fin_y_max) / cell_height)) # max_row = int(ceil((_drift_y_max - fin_y_min) / cell_height)) # # else: min_col = 0 max_col = int(ceil((fin_x_max - fin_x_min) / cell_width)) min_row = 0 max_row = int(ceil((fin_y_max - fin_y_min) / cell_height)) #========================================================================== # Calculate coordinates at which to krige #========================================================================== assert 0 <= min_col <= max_col, (min_col, max_col) assert 0 <= min_row <= max_row, (min_row, max_row) strt_x_coord = fin_x_min + (0.5 * cell_width) end_x_coord = strt_x_coord + ((max_col - min_col) * cell_width) strt_y_coord = fin_y_max - (0.5 * cell_height) end_y_coord = strt_y_coord - ((max_row - min_row) * cell_height) krige_x_coords = np.linspace(strt_x_coord, end_x_coord, (max_col - min_col + 1)) krige_y_coords = np.linspace(strt_y_coord, end_y_coord, (max_row - min_row + 1)) krige_x_coords_mesh, krige_y_coords_mesh = np.meshgrid( krige_x_coords, krige_y_coords) krige_coords_orig_shape = krige_x_coords_mesh.shape # if plot_figs_flag: # # xy coords for pcolormesh # pcolmesh_x_coords = np.linspace( # fin_x_min, fin_x_max, (max_col - min_col + 1)) # # pcolmesh_y_coords = np.linspace( # fin_y_max, fin_y_min, (max_row - min_row + 1)) # # krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = ( # np.meshgrid(pcolmesh_x_coords, pcolmesh_y_coords)) # # else: # krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = None, None krige_x_coords_mesh = krige_x_coords_mesh.ravel() krige_y_coords_mesh = krige_y_coords_mesh.ravel() # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print(krige_x_coords_mesh.shape[0], # 'cells to interpolate per step before intersection!') # fin_cntn_idxs = np.ones(krige_x_coords_mesh.shape[0], dtype=bool) # fin_cntn_idxs = np.zeros(krige_x_coords_mesh.shape[0], dtype=bool) # ogr_pts = np.vectorize(cnvt_to_pt)(krige_x_coords_mesh, krige_y_coords_mesh) # # for poly in feat_sec_buffs_list: # curr_cntn_idxs = np.vectorize(chk_cntmt)(ogr_pts, poly) # fin_cntn_idxs = fin_cntn_idxs | curr_cntn_idxs # # print(fin_cntn_idxs.sum(), # 'cells to interpolate per step after intersection!') # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # krige_x_coords_mesh = krige_x_coords_mesh[fin_cntn_idxs] # krige_y_coords_mesh = krige_y_coords_mesh[fin_cntn_idxs] # if edk_krige_flag: # drift_vals_list = [] # # krige_cols = np.arange(min_col, max_col + 1, dtype=int) # krige_rows = np.arange(min_row, max_row + 1, dtype=int) # # assert krige_x_coords.shape[0] == krige_cols.shape[0] # assert krige_y_coords.shape[0] == krige_rows.shape[0] # # (krige_drift_cols_mesh, # krige_drift_rows_mesh) = np.meshgrid(krige_cols, krige_rows) # # krige_drift_cols_mesh = krige_drift_cols_mesh.ravel() # krige_drift_rows_mesh = krige_drift_rows_mesh.ravel() # # krige_drift_cols_mesh = krige_drift_cols_mesh[fin_cntn_idxs] # krige_drift_rows_mesh = krige_drift_rows_mesh[fin_cntn_idxs] # # for _drift_arr in in_drift_arr_list: # _drift_vals = _drift_arr[ # krige_drift_rows_mesh, krige_drift_cols_mesh] # # drift_vals_list.append(_drift_vals) # # # drift_vals_arr = np.array(drift_vals_list, dtype=float) # # drift_df_cols = list(range(len(in_drift_rasters_list))) # in_stns_drift_df = pd.DataFrame( # index=in_stns_coords_df.index, # 
columns=drift_df_cols, # dtype=float) # # for stn in in_stns_drift_df.index: # stn_x = in_stns_coords_df.loc[stn, x_coords_lab] # stn_y = in_stns_coords_df.loc[stn, y_coords_lab] # # stn_col = int((stn_x - _drift_x_min) / cell_width) # stn_row = int((_drift_y_max - stn_y) / cell_height) # # for col, _arr in zip(drift_df_cols, in_drift_arr_list): # try: # _ = _arr[stn_row, stn_col] # if not np.isclose(drift_ndv, _): # in_stns_drift_df.loc[stn, col] = _ # # except IndexError: # pass # # in_stns_drift_df.dropna(inplace=True) #========================================================================== # Open NC #========================================================================== out_nc = nc.Dataset(os.path.join(out_dir, out_krig_net_cdf_file), mode=str(nc_mode)) if nc_mode == 'w': out_nc.set_auto_mask(False) out_nc.createDimension(x_coords_lab, krige_x_coords.shape[0]) out_nc.createDimension(y_coords_lab, krige_y_coords.shape[0]) out_nc.createDimension(time_dim_lab, in_data_df.shape[0]) x_coords_nc = out_nc.createVariable(x_coords_lab, 'd', dimensions=x_coords_lab) x_coords_nc[:] = krige_x_coords y_coords_nc = out_nc.createVariable(y_coords_lab, 'd', dimensions=y_coords_lab) y_coords_nc[:] = krige_y_coords time_nc = out_nc.createVariable(time_dim_lab, 'i8', dimensions=time_dim_lab) time_nc[:] = np.arange(in_data_df.shape[0]) else: raise RuntimeError('Not configured for this option!') time_nc = out_nc.variables[time_dim_lab] krige_y_coords = y_coords_nc[:] krige_x_coords = x_coords_nc[:] #========================================================================== # MP stuff #========================================================================== mp_cond = False if ((n_cpus > 1) and (in_data_df.shape[0] > (n_cpus + 1))): idxs = pd.np.linspace(0, in_data_df.shape[0], (n_cpus) + 1, endpoint=True, dtype=int) idxs = np.unique(idxs) print('MP idxs:', idxs) if idxs.shape[0] == 1: idxs = np.concatenate((np.array([0]), idxs)) mp_cond = True else: idxs = [0, in_data_df.shape[0]] #========================================================================== # Krige #========================================================================== if ord_krige_flag: print('\n\n') print('#' * 10) _beg_t = timeit.default_timer() print('Ordinary Kriging...') if 'OK' not in out_nc.variables: ok_nc = out_nc.createVariable('OK', 'd', dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), fill_value=False) else: ok_nc = out_nc.variables['OK'] ok_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], in_stns_coords_df, in_vgs_df.loc[ft_type][0], krige_x_coords_mesh, krige_y_coords_mesh, krige_coords_orig_shape, (idxs[i], idxs[i + 1]), fin_cntn_idxs) for i in range(n_cpus)) if mp_cond: ok_krige_flds = np.full( (in_data_df.shape[0], krige_coords_orig_shape[0], krige_coords_orig_shape[1]), np.nan, dtype=np.float32) mp_ress = [] try: mp_pool = ProcessPool(n_cpus) mp_pool.restart(True) mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen)) mp_pool.clear() except Exception as msg: mp_pool.close() mp_pool.join() print('Error in ordinary_kriging:', msg) for mp_res in mp_ress: if (len(mp_res) != 3) and (not isinstance(list)): print('\n', mp_res, '\n') continue [strt_index, end_index, sub_ok_krige_flds] = mp_res ok_krige_flds[strt_index:end_index] = sub_ok_krige_flds # free memory mp_res[2], sub_ok_krige_flds = None, None ok_nc[:] = ok_krige_flds else: [strt_index, end_index, ok_krige_flds] = ordinary_kriging(next(ok_vars_gen)) ok_nc[:] = ok_krige_flds ok_nc.units = var_units ok_nc.standard_name = var_name + ' (ordinary 
kriging)' ok_krige_flds = None _end_t = timeit.default_timer() _tot_t = _end_t - _beg_t print(f'Took {_tot_t:0.4f} seconds!') print('#' * 10) # if sim_krige_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('Simple Kriging...') # if 'SK' not in out_nc.variables: # sk_nc = out_nc.createVariable( # 'SK', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # sk_nc = out_nc.variables['SK'] # # sk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_coords_df, # in_vgs_df.iloc[idxs[i]:idxs[i + 1]], # min_ppt_thresh, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # krige_coords_orig_shape, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # sk_krige_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap(simple_kriging, sk_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in simple_kriging:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_sk_krige_flds] = mp_res # sk_krige_flds[strt_index:end_index] = sub_sk_krige_flds # # # free memory # mp_res[2], sub_sk_krige_flds = None, None # # sk_nc[:] = sk_krige_flds # # else: # [strt_index, # end_index, # sk_krige_flds] = simple_kriging(next(sk_vars_gen)) # # sk_nc[:] = sk_krige_flds # # sk_nc.units = var_units # sk_nc.standard_name = var_name + ' (simple kriging)' # # sk_krige_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # if edk_krige_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('External Drift Kriging...') # if 'EDK' not in out_nc.variables: # edk_nc = out_nc.createVariable( # 'EDK', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # edk_nc = out_nc.variables['EDK'] # # edk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_drift_df, # in_stns_coords_df, # in_vgs_df.iloc[idxs[i]:idxs[i + 1]], # min_ppt_thresh, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # drift_vals_arr, # krige_coords_orig_shape, # drift_ndv, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # edk_krige_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap( # external_drift_kriging, edk_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in external_drift_kriging:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_edk_krige_flds] = mp_res # edk_krige_flds[strt_index:end_index] = sub_edk_krige_flds # # print('sub_min:', 
np.nanmin(sub_edk_krige_flds)) # print('sub_max:', np.nanmax(sub_edk_krige_flds)) # # # free memory # mp_res[2], sub_edk_krige_flds = None, None # # else: # [strt_index, # end_index, # edk_krige_flds] = external_drift_kriging(next(edk_vars_gen)) # # edk_nc[:] = edk_krige_flds # # edk_nc.units = var_units # edk_nc.standard_name = var_name + ' (external drift kriging)' # # edk_krige_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # #========================================================================== # # IDW # #========================================================================== # if idw_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('Inverse Distance Weighting...') # if 'IDW' not in out_nc.variables: # idw_nc = out_nc.createVariable( # 'IDW', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # idw_nc = out_nc.variables['IDW'] # # idw_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_coords_df, # min_ppt_thresh, # idw_exp, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # krige_coords_orig_shape, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # idw_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap( # inverse_distance_wtng, idw_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in inverse_distance_wtng:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_idw_flds] = mp_res # idw_flds[strt_index:end_index] = sub_idw_flds # # # free memory # mp_res[2], sub_idw_flds = None, None # # else: # [strt_index, # end_index, # idw_flds] = inverse_distance_wtng(next(idw_vars_gen)) # # idw_nc[:] = idw_flds # # idw_nc.units = var_units # idw_nc.standard_name = ( # var_name + ' (IDW (exp=%0.3f))' % float(idw_exp)) # # idw_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) out_nc.Author = 'Faizan IWS Uni-Stuttgart' out_nc.Source = out_nc.filepath() out_nc.close() return
dat = MyDataset(cg.s, cg.ss, cg.r, cg.a, total,
                [(*NUM_GRID, NUM_CHANNEL), (1, ), (1, ), (1, )]).new()
dat = dat.prefetch(tf.data.AUTOTUNE)
print('Processing Data Complete.')

print("Training...")
with tqdm.tqdm(total=10) as pbar:
    prog_callback = ProgCallback(pbar)
    hist = critic.fit(dat, epochs=10, verbose=0, callbacks=[prog_callback])
print(hist.history['loss'])
print("Training Complete.")

with open(f'ddrive/{gen}.txt', 'wb') as f:
    f.write(
        base64.b85encode(
            lzma.compress(
                pickle.dumps([
                    arr.astype(np.float16)
                    for arr in critic.get_weights()
                ]),
                preset=9)))

pool.close()
pool.join()
pool.restart()
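# The block above stores the critic's weights as float16 arrays that are
# pickled, lzma-compressed (preset=9) and base85-encoded. A hedged sketch of
# the matching load path, assuming the same ddrive/{gen}.txt layout; the
# helper name `load_weights` is not part of the original code.
import base64
import lzma
import pickle

import numpy as np


def load_weights(path):
    with open(path, 'rb') as f:
        blob = f.read()
    weights = pickle.loads(lzma.decompress(base64.b85decode(blob)))
    # cast back to float32 before handing the arrays to the model
    return [arr.astype(np.float32) for arr in weights]

# critic.set_weights(load_weights(f'ddrive/{gen}.txt'))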
def graph_sampling(graph: FSN,
                   strategy: Optional[str] = "MetaDiff",
                   n_jobs: Optional[int] = 4,
                   use_cache: Optional[bool] = True,
                   **kwargs) -> List[List[Union[str, int]]]:
    """
    Sampling the sequences of nodes from FSN w.r.t. chosen strategy

    Parameters
    ----------
    graph : FSN object
        Graph to be processed
    strategy : str, default is 'MetaDiff'
        Walking strategy to be used
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool
    use_cache : bool, default is True
        To use the previously cached files

    Returns
    -------
    Sampled sequences of BP nodes
    """
    set_new_config(**kwargs)
    local_logger = logging.getLogger(f"{__name__}")

    if use_cache and os.path.isfile(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl"):
        local_logger.info("Loading sequences from cache... wait...")
        try:
            with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "rb") as file:
                res = pickle.load(file)
            local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
            local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
            return res
        except FileNotFoundError:
            local_logger.info("File not found... Recalculate \n")
            pass
        except Exception as e:
            local_logger.error(f"Unexpected error: {e}")

    local_logger.info("Sampling sequences... wait...")
    max_processes = max(n_jobs, os.cpu_count())

    global walk
    if strategy in strategy_to_class.keys():
        walk = strategy_to_class[strategy](G=graph,
                                           walk_length=CONFIG.WALKS_LENGTH,
                                           direction=CONFIG.DIRECTION,
                                           pressure=CONFIG.PRESSURE,
                                           allow_back=CONFIG.ALLOW_BACK)
    else:
        raise KeyError(
            f"The given strategy {strategy} is unknown. The following ones are implemented: {strategy_to_class.keys()}")

    sampling_pool = ProcessPool(nodes=max_processes)
    local_logger.info("Created a Pool with " + str(max_processes) + " processes ")
    # required to restart pool to update CONFIG inside the parallel part
    sampling_pool.terminate()
    sampling_pool.restart()

    BPs = graph.get_BPs()
    n_BPs = len(BPs)
    sampled = list()

    try:
        with tqdm(total=n_BPs) as pbar:
            for i, res in enumerate(sampling_pool.uimap(wrappedWalk, BPs)):
                sampled.append(res)
                pbar.update()
    except KeyboardInterrupt:
        print('Got ^C while pool mapping, terminating the pool')
        sampling_pool.terminate()

    res = list(itertools.chain(*sampled))
    sampling_pool.terminate()
    sampling_pool.restart()

    local_logger.info("Caching sampled sequences!")
    if use_cache:
        with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "wb") as file:
            pickle.dump(res, file)

    local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
    local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
    return res
def test_repeatability(self):
    import matplotlib.pyplot as mpl
    import numpy as np
    from pathos.multiprocessing import ProcessPool
    from itertools import product

    start, end = (10, 10), (350, 250)
    repeats = 2
    equal_paths = []
    rdrs = np.linspace(-100, 100, 10)
    jgs = [0]  # np.linspace(0, 5000, 2)
    jls = np.linspace(0, 50, 2)

    def make_path(start, end, rdr, jg, jl):
        algo = RiskJumpPointSearchAStar(
            ManhattanRiskHeuristic(self.large_diag_environment,
                                   risk_to_dist_ratio=rdr),
            jump_gap=jg,
            jump_limit=jl)
        return algo.find_path(self.large_diag_environment, start, end)

    def run_params(rdr, jg, jl):
        paths = [make_path(start, end, rdr, jg, jl) for _ in range(repeats)]
        equal_paths.append(all([p == paths[0] for p in paths]))
        if not paths[0]:
            return [rdr, np.inf, jl, jg]
        risk_sum = sum([
            self.large_diag_environment.grid[n[0], n[1]] for n in paths[0]
        ])
        return [rdr, risk_sum, jl, jg]

    pool = ProcessPool(nodes=8)
    pool.restart(force=True)
    params = np.array(list(product(rdrs, jgs, jls)))
    risk_sums = pool.map(run_params, params[:, 0], params[:, 1], params[:, 2])
    pool.close()

    # risk_sums = []
    # for rdr, jg, jl in product(rdrs, jgs, jls):
    #     paths = [make_path(start, end, rdr, jg, jl) for _ in range(repeats)]
    #     equal_paths.append(all([p == paths[0] for p in paths]))
    #     if not paths[0]:
    #         risk_sums.append([rdr, np.inf, jl, jg])
    #         continue
    #     risk_sum = sum([n.n for n in paths[0]])
    #     risk_sums.append([rdr, risk_sum, jl, jg])
    #
    #     fig = mpl.figure()
    #     ax = fig.add_subplot(111)
    #     for path in paths:
    #         ax.plot([n.x for n in path], [n.y for n in path], color='red')
    #     im = ax.imshow(self.large_diag_environment.grid)
    #     fig.colorbar(im, ax=ax, label='Population')
    #     ax.set_title(f'Risk JPS A* with RDR={rdr:.4g}, JL={jl} \n Risk sum={risk_sum:.4g}')
    #     fig.show()

    risk_sums = np.array(risk_sums)

    jl_fig = mpl.figure()
    ax = jl_fig.add_subplot(111)
    sc = ax.scatter(risk_sums[:, 0], risk_sums[:, 1], c=risk_sums[:, 2])
    ax.set_yscale('symlog')
    ax.set_xlabel('Risk-Distance Ratio')
    ax.set_ylabel('Path Risk sum')
    ax.set_title('R JPS+ A* Jump Limits')
    jl_fig.colorbar(sc, ax=ax, label='Jump Limit')
    jl_fig.show()

    jg_fig = mpl.figure()
    ax = jg_fig.add_subplot(111)
    sc = ax.scatter(risk_sums[:, 0], risk_sums[:, 1], c=risk_sums[:, 3])
    ax.set_yscale('symlog')
    ax.set_xlabel('Risk-Distance Ratio')
    ax.set_ylabel('Path Risk sum')
    ax.set_title('R JPS+ A* Jump Gaps')
    jg_fig.colorbar(sc, ax=ax, label='Jump Gap')
    jg_fig.show()

    self.assertTrue(all(equal_paths), 'Paths are not generated repeatably')
def build_pt(sampler_class, pe_method, force_method, numdim = 5, masses = 1.0, \
    nT = 10, nproc = 1, Tmin = 1.0, Tmax = 100.0, max_iteration = 500, iters_to_swap = 1, \
    iters_to_waypoint = 5, iters_to_setdt = 10, iters_to_writestate = 1, run_token = 1, \
    dt = 1.0e-4, traj_len = 100, num_traj = 10, absxmax = 1.0e2, initial_rand_bounds = 1.0e2, \
    dt_max = None, min_rate = 0.6, max_rate = 0.7, gaussianprior_std = None):
    """Builds an instance of ParallelTempering. Reads restart file if it exists, or initialises a
    fresh run.

    Args:
        sampler_class : Sampler class from module sampling. Eg. sampling.Hmc .
        pe_method : A method for evaluating the potential energy.
        force_method : A method for evaluating the forces.
        numdim (int) : The number of dimensions of the configuration space ('parameter space').
            (Default: 5)
        masses (single float or numpy array of floats, with length 1 or length numdim) : specifies
            the masses associated with each dimension of the configuration space
            ('parameter space'). (Default: 1.0)
        nT (int) : Number of temperatures to use. (Default: 10)
        nproc (int) : Number of processors to use. (Default: 1)
        Tmin (float) : Lowest temperature in ladder of temperatures. (Default 1.0)
        Tmax (float) : Maximum temperature in ladder of temperatures. (Default 100.0).
        max_iteration (int) : Max number of iterations to run. (Default 500).
        iters_to_swap (int) : Configuration swaps between neighbouring temperatures are attempted
            every iters_to_swap iterations. (Default 1).
        iters_to_waypoint (int) : Restart information is written after every iters_to_waypoint
            iterations. (Default 5).
        iters_to_setdt (int) : The step sizes (or equivalently time steps) are updated after every
            iters_to_setdt iterations. (Default 10).
        iters_to_writestate (int) : The latest potential energy values and coordinates are written
            out after every iters_to_writestate iterations. (Default 1).
        run_token (int) : An integer for labelling the restart and output files for this
            calculation. (Default 1).
        dt (float) : Initial time step (or step size). This will be updated algorithmically, but a
            good starting point saves time. (Default 1.0e-4).
        traj_len (int) : The number of time steps in a single trajectory. (Default 100).
        num_traj (int) : The number of trajectories run per iteration, per sampler. (Default 10).
        absxmax (single float or numpy array of floats, with length 1 or length numdim) : During
            the main calculation, the sampler is restricted to a region x in [-absxmax, absxmax].
            (Default: 1.0e2).
        initial_rand_bounds : The same as absxmax, but applied only during random initialisation
            of the sampler's coordinate (parameters). This enables initialisation into a
            particular region, which might, for example, be most likely to contain the global
            minimum. (Default: 1.0e2).
        dt_max (float) : maximum step size (time step). (Default: median(absxmax), which is set in
            module sampling.)
        min_rate (float) : minimum acceptance rate of trajectories. Used for setting step size
            (time step). (Default: 0.6. The optimal acceptance rate for HMC on a multivariate
            Gaussian is 0.65 http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        max_rate (float) : maximum acceptance rate of trajectories. Used for setting step size
            (time step). (Default 0.7. The optimal acceptance rate for HMC on a multivariate
            Gaussian is 0.65 http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        gaussianprior_std (single float or numpy array of floats, with length 1 or length numdim) :
            If this is set to a real value then an additional term is applied to (H)MC
            acceptance/rejection such that the target distribution is proportional to a
            multivariate Gaussian with this standard deviation for each dimension.
            (Default: None.)

    Return:
        ParallelTempering class object
    """

    # CHECK FOR RESTART FILE AND DO RESTART IF PRESENT
    restrtfl = "restart_pt_" + str(run_token) + ".txt"
    if os.path.isfile("./" + restrtfl):
        # read restart data from restart file
        didrestart = True
        print "Restarting from file ", restrtfl, time.ctime()
        nT, Tmin, Tmax, iteration, num_traj, samplers, walkers = \
            read_waypoint(restrtfl, sampler_class, pe_method, force_method)
    else:
        didrestart = False
        iteration = 0
        # a list of new walkers (which are class objects)
        samplers = build_samplers(sampler_class, pe_method, force_method, nT, Tmin, Tmax, dt, \
            traj_len, absxmax, dt_max, min_rate, max_rate, gaussianprior_std)

        print "Start initialise walkers ", time.ctime()
        walkers = np.asarray([])
        sampling.NewWalker.masses = masses
        sampling.NewWalker.numdim = numdim
        temp_pool = ProcessPool(nodes=nproc)

        # temporarily pass initial_random_bounds through samplers, since pathos multiprocessing is
        # restrictive with arguments
        for sampler in samplers:
            sampler.random_init_bounds = initial_rand_bounds

        outs = sampling.apply_pool(temp_pool, initialise_walker, samplers)
        for i in xrange(len(outs)):
            walkers = np.append(walkers, outs[i][0])
            samplers[i] = outs[i][1]

        temp_pool.terminate()  # close pool
        temp_pool.restart()
        print "Done initialise walkers ", time.ctime()

    coutfl = "ptconfsout_" + str(run_token) + ".txt"
    ptoutfl = "ptout_" + str(run_token) + ".txt"

    thispt = ParallelTempering(samplers, walkers, num_traj, nT, nproc, Tmin, Tmax, iteration, \
        max_iteration, iters_to_swap, iters_to_waypoint, iters_to_setdt, iters_to_writestate, \
        run_token, coutfl, ptoutfl, restrtfl)

    if (not didrestart):
        thispt.set_dt_all(thispt.pt_pool, step_fac=0.1)

    return thispt
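# A hypothetical invocation sketch for build_pt, assuming the project's
# `sampling` module provides the Hmc sampler named in the docstring; the
# quadratic potential and its force below are illustrative stand-ins only.
import numpy as np

import sampling


def pe_method(x):
    # simple harmonic potential energy, V(x) = 0.5 * |x|^2
    return 0.5 * np.sum(x ** 2)


def force_method(x):
    # force = -dV/dx for the potential above
    return -x

# pt = build_pt(sampling.Hmc, pe_method, force_method, numdim=5,
#               nT=8, nproc=4, Tmin=1.0, Tmax=50.0, max_iteration=100)
# The returned ParallelTempering object is then driven by the caller
# (its main-loop method is not shown in this section).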
                    plot_figs_flag,
                    krige_x_coords_plot_mesh,
                    krige_y_coords_plot_mesh,
                    var_units,
                    polys_list,
                    out_figs_dir,
                    fin_cntn_idxs) for i in range(n_cpus))

    if mp_cond:
        ok_krige_flds = np.full(
            (fin_date_range.shape[0],
             krige_coords_orig_shape[0],
             krige_coords_orig_shape[1]),
            np.nan,
            dtype=np.float32)

        mp_ress = []
        try:
            mp_pool = ProcessPool(n_cpus)
            mp_pool.restart(True)

            mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen))

            mp_pool.clear()

        except Exception as msg:
            mp_pool.close()
            mp_pool.join()
            print('Error in ordinary_kriging:', msg)

        for mp_res in mp_ress:
            if (len(mp_res) != 3) and (not isinstance(mp_res, list)):
                print('\n', mp_res, '\n')
                continue
def plot_cats_prms_transfer_perfs(dbs_dir, n_cpus=1):

    '''Plot catchments' performances obtained by using parameters from other
    catchments.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    kf_prms_dict = {}
    cats_vars_dict = {}
    for cat_db in cats_dbs:
        with h5py.File(cat_db, 'r') as db:
            kfolds = db['data'].attrs['kfolds']
            cat = db.attrs['cat']

            cv_flag = db['data'].attrs['cv_flag']

            if cv_flag:
                print('plot_prm_trans_perfs not possible with cv_flag!')
                return

            f_var_infos = db['cdata/aux_var_infos'][...]
            prms_idxs = db['cdata/use_prms_idxs'][...]
            f_vars = db['cdata/aux_vars'][...]
            prms_flags = db['cdata/all_prms_flags'][...]
            bds_arr = db['cdata/bds_arr'][...]

            cat_vars_dict = {}
            cat_vars_dict['f_var_infos'] = f_var_infos
            cat_vars_dict['prms_idxs'] = prms_idxs
            cat_vars_dict['f_vars'] = f_vars
            cat_vars_dict['prms_flags'] = prms_flags
            cat_vars_dict['bds_arr'] = bds_arr

            cats_vars_dict[cat] = cat_vars_dict

            for i in range(1, kfolds + 1):
                kf_str = f'kf_{i:02d}'
                cd_db = db[f'calib/{kf_str}']
                opt_prms = cd_db['opt_prms'][...]

                if i not in kf_prms_dict:
                    kf_prms_dict[i] = {}

                kf_prms_dict[i][cat] = {}
                kf_prms_dict[i][cat]['opt_prms'] = opt_prms

    const_args = (kf_prms_dict, cats_vars_dict)
    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prms_transfer_perfs, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_prms_transfer_perfs(plot_args)

    return
def regex_sentencize(docs,
                     max_sentence_length=None,
                     min_sentence_length=None,
                     n_threads=1,
                     reg_split=r"((?:\s*\n)+\s*)",
                     reg_token=r"[\w*]+|[^\w\s\n*]",
                     text_col="text",
                     doc_id_col="doc_id",
                     with_tqdm=False,
                     verbose=0):
    """
    Simple split of MIMIC docs into sentences:
    - sentence bounds are found where multiple newlines occur
    - sentences that are too long are cut into `max_sentence_length`-token
      sentences by splitting each sentence into tokens using a dumb regexp

    Parameters
    ----------
    docs: pd.DataFrame
    max_sentence_length: int
    with_tqdm: bool
    verbose: int
    doc_id_col: str
    text_col: str

    Returns
    -------
    pd.DataFrame
    """
    n_threads = min(n_threads, len(docs))
    if n_threads > 1:
        text_chunks = np.array_split(np.arange(len(docs)), n_threads)
        pool = ProcessPool(nodes=n_threads)
        pool.restart(force=True)
        results = [
            pool.apipe(regex_sentencize,
                       docs.iloc[chunk],
                       max_sentence_length,
                       1,
                       with_tqdm=False)
            for chunk in text_chunks
        ]
        results = [r.get() for r in results]
        pool.close()
        return pd.concat(results, ignore_index=True)

    reg_split = re.compile(reg_split)
    reg_token = re.compile(reg_token)

    doc_ids = []
    sentence_idx_list = []
    begins = []
    ends = []
    sentences = []
    max_size = 0
    min_size = 10000000
    for doc_id, txt in zip(
            docs[doc_id_col],
            (tqdm(docs[text_col], desc="Splitting docs into sentences")
             if with_tqdm else docs[text_col])):
        idx = 0
        queued_spans = []
        sentence_idx = 0
        for i, part in enumerate(reg_split.split(txt)):
            if i % 2 == 0:  # we're in a sentence
                queued_spans.extend([(m.start() + idx, m.end() + idx)
                                     for m in reg_token.finditer(part)])
                if max_sentence_length is None:
                    max_sentence_length_ = len(queued_spans)
                else:
                    max_sentence_length_ = max_sentence_length
                while len(queued_spans) > max_sentence_length_:
                    b = queued_spans[0][0]
                    e = queued_spans[max_sentence_length_ - 1][1]
                    doc_ids.append(doc_id)
                    sentence_idx_list.append(sentence_idx)
                    begins.append(b)
                    ends.append(e)
                    max_size, min_size = max(max_size, max_sentence_length_), min(
                        min_size, max_sentence_length_)
                    queued_spans = queued_spans[max_sentence_length_:]
                    sentences.append(txt[b:e])
                    sentence_idx += 1
                if min_sentence_length is not None and len(
                        queued_spans) < min_sentence_length:
                    idx += len(part)
                    continue
                if len(queued_spans):
                    b = queued_spans[0][0]
                    e = queued_spans[-1][1]
                    doc_ids.append(doc_id)
                    sentence_idx_list.append(sentence_idx)
                    begins.append(b)
                    ends.append(e)
                    max_size, min_size = max(max_size, len(queued_spans)), min(
                        min_size, len(queued_spans))
                    queued_spans = []
                    sentences.append(txt[b:e])
                    sentence_idx += 1
            if part is not None:
                idx += len(part)

    if verbose:
        print("Sentence size: max = {}, min = {}".format(max_size, min_size))

    df = pd.DataFrame({
        doc_id_col: doc_ids,
        "sentence_idx": sentence_idx_list,
        "begin": begins,
        "end": ends,
        "text": sentences,
    }).astype({doc_id_col: docs[doc_id_col].dtype})
    df = df.merge(docs[[doc_id_col] + [
        col for col in docs.columns if col not in df.columns and col != "text"
    ]])
    df["sentence_id"] = join_cols(df[[doc_id_col, "sentence_idx"]], "/")
    return df
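# A hypothetical usage sketch for regex_sentencize, assuming pandas is
# available and the default column names above; the call itself is left
# commented out because join_cols is defined elsewhere in the module.
import pandas as pd

docs = pd.DataFrame({
    "doc_id": [1],
    "text": ["First block of text.\n\nSecond block, split on blank lines."],
})

# sentences = regex_sentencize(docs, max_sentence_length=50, n_threads=1)
# Expected: one row per sentence with begin/end offsets, plus a
# "sentence_id" such as "1/0" (assuming join_cols joins doc_id and
# sentence_idx with "/").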
def set(self, walkers, message_prefix, adjust_step_factor=0.9):
    """Updates the stepsize to achieve a trajectory acceptance rate in or as close as possible to
    the range [self.sampler.min_rate, self.sampler.max_rate], with stepsize in the range
    [10^-50, self.sampler.dt_max].

    Args:
        walkers : This MUST be an array or list of NewWalker class objects. These are NOT updated
            by this method.
        message_prefix (str/None) : if message_prefix is not None, then a message is printed
            describing the change in dt. If message_prefix is None, then no message is printed.
        adjust_step_factor (float) : self.sampler.dt is updated by * or / by this value.

    Return:
        duration (float) : duration of call to set in seconds. Can be useful for checking the
            fraction of time spent updating step lengths.
    """
    start_time = time.time()

    if (self.nproc > 1):
        set_pool = ProcessPool(nodes=self.nproc)
    else:
        set_pool = None

    steplength_store = self.sampler.dt
    steplength_in = self.sampler.dt
    # protects against possible future bugs that would be hard to detect

    walk_n_walkers = int(self.nproc * np.ceil(float(self.min_num_data_point) / self.nproc))
    # rounds up to next multiple of self.nproc for maximum usage of compute

    walkers_clone = copy.deepcopy(walkers)  # expensive, but prevents this routine overwriting
    # walkers

    first_time = True  # we will make at least two tries. Logical flag ensures this.

    # Step size calibration loop:
    while True:

        # collect statistics on trajectory acceptance rate
        run_outputs = apply_pool(set_pool, self.run, np.random.choice(walkers_clone, \
            size=walk_n_walkers))
        results = map(itemgetter(1), run_outputs)
        del run_outputs

        # The total number of accepted/rejected moves for this step size
        rate = float(np.sum(results)) / walk_n_walkers

        if (rate >= self.sampler.min_rate and rate <= self.sampler.max_rate):
            # If the total acceptance rate is within the desired range, return this stepsize
            self.print_dt_change(steplength_in, self.sampler.dt, message_prefix)
            break
        else:  # update the stepsize to get closer to the desired range
            if (not first_time):  # dodge this the first time round - no rate_store saved yet
                # Check whether rate and rate_store are on different sides
                # of interval
                if ((min(rate, rate_store) < self.sampler.min_rate) and
                        (max(rate, rate_store) > self.sampler.max_rate)):
                    # We previously obtained an acceptance rate on one side of the desired range
                    # and now find an acceptance rate on the other side. We return the step size
                    # that gave an acceptance rate closest to the middle of the desired range.
                    target = 0.5 * (self.sampler.min_rate + self.sampler.max_rate)  # middle of range
                    if (abs(rate - target) < abs(rate_store - target)):
                        # take current step length
                        self.print_dt_change(steplength_in, self.sampler.dt, \
                            message_prefix)
                        break
                    else:
                        # take saved step length
                        self.sampler.dt = steplength_store
                        rate = rate_store
                        self.print_dt_change(steplength_in, self.sampler.dt, \
                            message_prefix)
                        break
            else:  # this is the first time - no rate_store saved yet
                first_time = False

            # save current step length and acceptance rate
            steplength_store = self.sampler.dt
            rate_store = rate

            # update step length
            if rate < self.sampler.min_rate:
                exp = 1.0
            elif rate >= self.sampler.max_rate:
                exp = -1.0

            # try to adjust
            self.sampler.dt *= adjust_step_factor ** exp

            # Check that step size is neither larger than max allowed value nor smaller than
            # 10^-50 (useful for detecting errors).
            # Error check:
            if (self.sampler.dt < 1.0e-50):
                if (message_prefix is not None):
                    prfx = message_prefix + \
                        " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)
                else:
                    prfx = " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)
                exit_error(prfx, 25)

            # sampling demands a step size larger than dt_max. Set to dt_max then break
            if (self.sampler.dt > self.sampler.dt_max):
                self.sampler.dt = self.sampler.dt_max
                self.print_dt_change(steplength_in, self.sampler.dt, \
                    message_prefix)
                break

    # close pool
    if (set_pool is not None):
        set_pool.terminate()
        set_pool.restart()

    end_time = time.time()
    duration = end_time - start_time

    return duration