def pairs_construction(seqs: List[List[Union[str, int]]],
                       window_size: int = 2,
                       drop_duplicates: bool = True,
                       n_jobs: int = 4,
                       **kwargs):
    """
    Helper function to make pairs from sequences in parallel

    Parameters
    ----------
    seqs : List[List[Union[str, int]]]
        Input sequences of nodes
    window_size : int, default is 2
        Context window size around the current vertex
    drop_duplicates : bool, default is True
        Delete pairs where both elements are the same
    n_jobs : int, default is 4
        Number of workers to be created in the parallel pool

    Returns
    -------
    List of pairs of nodes as <cur_vertex, context_vertex>
    """
    set_new_config(window_size=window_size, **kwargs)
    local_logger = logging.getLogger(f"{__name__}")
    # Cap the number of workers at the number of available CPUs
    max_processes = min(n_jobs, os.cpu_count())
    pairs_pool = ProcessPool(nodes=max_processes)
    # Restart the pool so that workers pick up the updated CONFIG
    pairs_pool.terminate()
    pairs_pool.restart()
    local_logger.info("Started making pairs from the sequences.")
    pairs = pairs_pool.map(_make_pairs, seqs)
    local_logger.info(f"Total number of processed sequences is {len(pairs)}")
    # Flatten the per-sequence lists of pairs, optionally dropping self-pairs
    if drop_duplicates:
        pairs = [item for sublist in pairs for item in sublist if item[0] != item[1]]
    else:
        pairs = [item for sublist in pairs for item in sublist]
    # Drop pairs containing the sentinel value -3
    pairs = [item for item in pairs if (item[0] != -3) and (item[1] != -3)]
    pairs_pool.terminate()
    pairs_pool.restart()
    return pairs
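# A minimal, self-contained sketch of the sliding-window pairing that a worker
# such as _make_pairs presumably performs. Its body is not shown above, so the
# exact sentinel/padding handling here is an assumption:
def _make_pairs_sketch(seq, window_size=2):
    """Return <cur_vertex, context_vertex> pairs within window_size of each node."""
    pairs = []
    for i, cur in enumerate(seq):
        lo = max(0, i - window_size)
        hi = min(len(seq), i + window_size + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((cur, seq[j]))
    return pairs

# e.g. _make_pairs_sketch([1, 2, 3], window_size=1) -> [(1, 2), (2, 1), (2, 3), (3, 2)]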
def update_hash_dict(self):
    if self.num_proc is None:
        self.num_proc = cpu_count() - 1
    # Compare the current file list against the cached hash dictionary
    current_files = set(self.image_filenames)
    cache_files = set(self.hash_dict.keys())
    lost_set = cache_files - current_files            # cached entries whose files are gone
    target_files = list(current_files - cache_files)  # new files that still need hashing
    if len(lost_set) + len(target_files) > 0:
        try:
            if len(self.hash_dict) == 0:
                spinner = Spinner(
                    prefix="Calculating image hashes (hash-bits={} num-proc={})...".format(
                        self.hash_bits, self.num_proc))
            else:
                spinner = Spinner(
                    prefix="Updating image hashes (hash-bits={} num-proc={})...".format(
                        self.hash_bits, self.num_proc))
            spinner.start()
            # Drop entries for files that no longer exist
            for f in lost_set:
                del self.hash_dict[f]
            if six.PY2:
                from pathos.multiprocessing import ProcessPool as Pool
            elif six.PY3:
                from multiprocessing import Pool
            pool = Pool(self.num_proc)
            hashes = pool.map(self.gen_hash, target_files)
            for filename, hash_value in zip(target_files, hashes):
                self.hash_dict[filename] = hash_value
            pool.close()
            pool.join()
            spinner.stop()
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            spinner.stop()
            sys.exit(1)
        return True
    else:
        return False
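# The incremental update above boils down to two set differences. A tiny
# self-contained illustration of that bookkeeping (file names are made up):
def cache_diff_sketch(current_files, cached_files):
    """Return (stale cache entries to drop, new files to hash)."""
    current, cached = set(current_files), set(cached_files)
    return cached - current, current - cached

# e.g. cache_diff_sketch(["a.png", "b.png"], ["a.png", "d.png"])
#      -> ({"d.png"}, {"b.png"})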
def make_hash_list(self):
    if self.num_proc is None:
        self.num_proc = cpu_count() - 1
    try:
        spinner = Spinner(
            prefix="Calculating image hashes (hash-bits={} num-proc={})...".format(
                self.hash_bits, self.num_proc))
        spinner.start()
        if six.PY2:
            from pathos.multiprocessing import ProcessPool as Pool
        elif six.PY3:
            from multiprocessing import Pool
        pool = Pool(self.num_proc)
        self.cache = pool.map(self.gen_hash, self.image_filenames)
        pool.close()
        pool.join()
        spinner.stop()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        spinner.stop()
        sys.exit(1)
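# Note: on Python 3 the pool can also be managed with a context manager, which
# guarantees the workers are cleaned up (Pool.__exit__ calls terminate()) even
# if gen_hash raises. A sketch of the same hashing step in that style:
#
#     with Pool(self.num_proc) as pool:
#         self.cache = pool.map(self.gen_hash, self.image_filenames)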
def main(args):
    if len(args.input) < 2:
        print("Please name at least one STAR file and an output directory")
        return 1

    if args.apix is None:
        print("Using pixel size computed from STAR files")

    def do_job(star):
        try:
            mrc = os.path.join(args.output,
                               os.path.basename(star).replace(".star", ".mrc"))
            print("Starting reconstruction of %s" % star)
            do_reconstruct(star, mrc, args.apix, args.sym, args.ctf)
            print("Wrote %s reconstruction to %s" % (star, mrc))
            if args.mask is not None:
                masked_mrc = mrc.replace(".mrc", "_masked.mrc")
                do_mask(mrc, masked_mrc, args.mask)
                print("Wrote masked map %s" % masked_mrc)
            if args.mask is not None and args.delete_unmasked:
                delete_unmasked(mrc, masked_mrc)
                print("Overwrote %s with %s" % (mrc, masked_mrc))
        except Exception as e:
            print("Failed on %s: %s" % (star, e))
        return 0

    pool = Pool(nodes=args.nproc)
    # pool.apipe(do_job, args.input)
    results = pool.imap(do_job, args.input)
    codes = list(results)  # block until all jobs have finished
    if pool is not None:
        pool.close()
        pool.join()
        pool.terminate()
    return 0
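# A sketch of a matching argparse entry point; the option names mirror the
# attributes consumed by main() above (input, output, apix, sym, ctf, mask,
# delete_unmasked, nproc), though the script's real parser may differ:
def _parse_args_sketch(argv=None):
    import argparse
    parser = argparse.ArgumentParser(description="Reconstruct maps from STAR files")
    parser.add_argument("input", nargs="+", help="STAR file(s)")
    parser.add_argument("--output", default=".", help="Output directory")
    parser.add_argument("--apix", type=float, default=None, help="Pixel size in Angstroms")
    parser.add_argument("--sym", default="C1", help="Symmetry group")
    parser.add_argument("--ctf", action="store_true", help="Apply CTF correction")
    parser.add_argument("--mask", default=None, help="Mask to apply to the final map")
    parser.add_argument("--delete-unmasked", dest="delete_unmasked", action="store_true")
    parser.add_argument("--nproc", type=int, default=1, help="Number of worker processes")
    return parser.parse_args(argv)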
        # ... inside the double loop over the (alpha, scatter) grid ...
        catalogs = pool.map(catfunc, np.arange(Niter))
        os.system("rm " + filename)  # remove the temporary file
        print("Generated catalogs")
        # catalogs = model.abundance_match(alpha, scatter, Niter)
        cov_matrix, mean = model.stoch_covmat_mean(catalogs, nthreads=ncore)
        means[i, j, :] = mean
        covmats[i, j, :, :] = cov_matrix

        # Estimate the remaining runtime from the mean time per completed step
        t = time() - start
        extime.append(t)
        remtime = sum(extime) / len(extime) * (Ntot - k) / 60**2
        print("Done with step {}/{} in time {:.1f}. "
              "Estimated remaining time is {:.2f} hours".format(k, Ntot, t, remtime))
        sys.stdout.flush()
        k += 1

pool.close()
pool.join()
pool.terminate()

res = {'alpha': XX, 'scatter': YY, 'covmat': covmats, 'wp': means}
p.dump_pickle(res, "../../Data/NSAmatching/Train_stoch_covmats_{}_.p".format(logSMlim))
print("Finished")
sys.stdout.flush()
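# The remaining-time estimate above is mean-seconds-per-completed-step times
# the number of steps left, converted to hours. A standalone check of that
# arithmetic (numbers here are made up):
def eta_hours_sketch(extime, Ntot, k):
    """Estimated hours remaining after k of Ntot steps, given per-step times."""
    return sum(extime) / len(extime) * (Ntot - k) / 60**2

# e.g. eta_hours_sketch([120.0, 100.0, 110.0], Ntot=100, k=3) ~= 2.96 hours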
def graph_sampling(graph: FSN, strategy: Optional[str] = "MetaDiff",
                   n_jobs: Optional[int] = 4,
                   use_cache: Optional[bool] = True, **kwargs) \
        -> List[List[Union[str, int]]]:
    """
    Sampling the sequences of nodes from FSN w.r.t. chosen strategy

    Parameters
    ----------
    graph : FSN object
        Graph to be processed
    strategy : str, default is 'MetaDiff'
        Walking strategy to be used
    n_jobs : int, default is 4
        Number of workers to be created in the parallel pool
    use_cache : bool, default is True
        To use the previously cached files

    Returns
    -------
    Sampled sequences of BP nodes
    """
    set_new_config(**kwargs)
    local_logger = logging.getLogger(f"{__name__}")
    if use_cache and os.path.isfile(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl"):
        local_logger.info("Loading sequences from cache... wait...")
        try:
            with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "rb") as file:
                res = pickle.load(file)
            local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
            local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
            return res
        except FileNotFoundError:
            local_logger.info("File not found... Recalculating \n")
        except Exception as e:
            local_logger.error(f"Unexpected error: {e}")
    local_logger.info("Sampling sequences... wait...")
    # Cap the number of workers at the number of available CPUs
    max_processes = min(n_jobs, os.cpu_count())
    global walk
    if strategy in strategy_to_class:
        walk = strategy_to_class[strategy](G=graph, walk_length=CONFIG.WALKS_LENGTH,
                                           direction=CONFIG.DIRECTION,
                                           pressure=CONFIG.PRESSURE,
                                           allow_back=CONFIG.ALLOW_BACK)
    else:
        raise KeyError(
            f"The given strategy {strategy} is unknown. "
            f"The following ones are implemented: {list(strategy_to_class.keys())}")
    sampling_pool = ProcessPool(nodes=max_processes)
    local_logger.info(f"Created a Pool with {max_processes} processes")
    # Required to restart the pool so workers pick up the updated CONFIG
    sampling_pool.terminate()
    sampling_pool.restart()
    BPs = graph.get_BPs()
    n_BPs = len(BPs)
    sampled = list()
    try:
        with tqdm(total=n_BPs) as pbar:
            for i, res in enumerate(sampling_pool.uimap(wrappedWalk, BPs)):
                sampled.append(res)
                pbar.update()
    except KeyboardInterrupt:
        print('Got ^C while pool mapping, terminating the pool')
        sampling_pool.terminate()
    res = list(itertools.chain(*sampled))
    sampling_pool.terminate()
    sampling_pool.restart()
    if use_cache:
        local_logger.info("Caching sampled sequences!")
        with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "wb") as file:
            pickle.dump(res, file)
    local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
    local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
    return res
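# A sketch of the intended pipeline: sample walks from an FSN, then turn them
# into skip-gram style pairs. How the FSN itself is constructed is not shown
# here, so the first line is purely illustrative:
#
#     fsn = FSN(...)  # hypothetical construction
#     walks = graph_sampling(fsn, strategy="MetaDiff", n_jobs=4)
#     pairs = pairs_construction(walks, window_size=2, n_jobs=4)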
def build_pt(sampler_class, pe_method, force_method, numdim = 5, masses = 1.0, \
    nT = 10, nproc = 1, Tmin = 1.0, Tmax = 100.0, max_iteration = 500, iters_to_swap = 1, \
    iters_to_waypoint = 5, iters_to_setdt = 10, iters_to_writestate = 1, run_token = 1, \
    dt = 1.0e-4, traj_len = 100, num_traj = 10, absxmax = 1.0e2, initial_rand_bounds = 1.0e2, \
    dt_max = None, min_rate = 0.6, max_rate = 0.7, gaussianprior_std = None):
    """Builds an instance of ParallelTempering. Reads restart file if it exists, or
    initialises a fresh run.

    Args:
        sampler_class : Sampler class from module sampling. Eg. sampling.Hmc .
        pe_method : A method for evaluating the potential energy.
        force_method : A method for evaluating the forces.
        numdim (int) : The number of dimensions of the configuration space
            ('parameter space'). (Default: 5)
        masses (single float or numpy array of floats, with length 1 or length numdim) :
            Specifies the masses associated with each dimension of the configuration
            space ('parameter space'). (Default: 1.0)
        nT (int) : Number of temperatures to use. (Default: 10)
        nproc (int) : Number of processors to use. (Default: 1)
        Tmin (float) : Lowest temperature in ladder of temperatures. (Default: 1.0)
        Tmax (float) : Maximum temperature in ladder of temperatures. (Default: 100.0)
        max_iteration (int) : Max number of iterations to run. (Default: 500)
        iters_to_swap (int) : Configuration swaps between neighbouring temperatures are
            attempted every iters_to_swap iterations. (Default: 1)
        iters_to_waypoint (int) : Restart information is written after every
            iters_to_waypoint iterations. (Default: 5)
        iters_to_setdt (int) : The step sizes (or equivalently time steps) are updated
            after every iters_to_setdt iterations. (Default: 10)
        iters_to_writestate (int) : The latest potential energy values and coordinates
            are written out after every iters_to_writestate iterations. (Default: 1)
        run_token (int) : An integer for labelling the restart and output files for
            this calculation. (Default: 1)
        dt (float) : Initial time step (or step size). This will be updated
            algorithmically, but a good starting point saves time. (Default: 1.0e-4)
        traj_len (int) : The number of time steps in a single trajectory. (Default: 100)
        num_traj (int) : The number of trajectories run per iteration, per sampler.
            (Default: 10)
        absxmax (single float or numpy array of floats, with length 1 or length numdim) :
            During the main calculation, the sampler is restricted to a region
            x in [-absxmax, absxmax]. (Default: 1.0e2)
        initial_rand_bounds : The same as absxmax, but applied only during random
            initialisation of the sampler's coordinate (parameters). This enables
            initialisation into a particular region, which might, for example, be most
            likely to contain the global minimum. (Default: 1.0e2)
        dt_max (float) : Maximum step size (time step). (Default: median(absxmax),
            which is set in module sampling.)
        min_rate (float) : Minimum acceptance rate of trajectories. Used for setting
            step size (time step). (Default: 0.6. The optimal acceptance rate for HMC
            on a multivariate Gaussian is 0.65,
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3.)
        max_rate (float) : Maximum acceptance rate of trajectories. Used for setting
            step size (time step). (Default: 0.7. The optimal acceptance rate for HMC
            on a multivariate Gaussian is 0.65,
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3.)
        gaussianprior_std (single float or numpy array of floats, with length 1 or
            length numdim) : If this is set to a real value then an additional term is
            applied to (H)MC acceptance/rejection such that the target distribution is
            proportional to a multivariate Gaussian with this standard deviation for
            each dimension. (Default: None)

    Return:
        ParallelTempering class object
    """

    # CHECK FOR RESTART FILE AND DO RESTART IF PRESENT
    restrtfl = "restart_pt_" + str(run_token) + ".txt"
    if os.path.isfile("./" + restrtfl):  # read restart data from restart file
        didrestart = True
        print "Restarting from file ", restrtfl, time.ctime()
        nT, Tmin, Tmax, iteration, num_traj, samplers, walkers = \
            read_waypoint(restrtfl, sampler_class, pe_method, force_method)
    else:
        didrestart = False
        iteration = 0
        # build the list of samplers (which are class objects)
        samplers = build_samplers( sampler_class, pe_method, force_method, nT, Tmin, Tmax, dt, \
            traj_len, absxmax, dt_max, min_rate, max_rate, gaussianprior_std )
        print "Start initialise walkers ", time.ctime()
        walkers = np.asarray([])
        sampling.NewWalker.masses = masses
        sampling.NewWalker.numdim = numdim
        temp_pool = ProcessPool(nodes=nproc)
        # temporarily pass initial_rand_bounds through samplers, since pathos
        # multiprocessing is restrictive with arguments
        for sampler in samplers:
            sampler.random_init_bounds = initial_rand_bounds
        outs = sampling.apply_pool(temp_pool, initialise_walker, samplers)
        for i in xrange(len(outs)):
            walkers = np.append(walkers, outs[i][0])
            samplers[i] = outs[i][1]
        temp_pool.terminate()  # close pool
        temp_pool.restart()
        print "Done initialise walkers ", time.ctime()

    coutfl = "ptconfsout_" + str(run_token) + ".txt"
    ptoutfl = "ptout_" + str(run_token) + ".txt"

    thispt = ParallelTempering(samplers, walkers, num_traj, nT, nproc, Tmin, Tmax, iteration, \
        max_iteration, iters_to_swap, iters_to_waypoint, iters_to_setdt, iters_to_writestate, \
        run_token, coutfl, ptoutfl, restrtfl)

    if (not didrestart):
        thispt.set_dt_all(thispt.pt_pool, step_fac=0.1)

    return thispt
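# A sketch of invoking build_pt; pe/force callables and sampling.Hmc follow the
# docstring above, and the quadratic toy potential here is purely illustrative:
#
#     def pe(x): return 0.5 * np.sum(x**2)
#     def force(x): return -x
#     pt = build_pt(sampling.Hmc, pe, force, numdim=5, nT=10, nproc=4,
#                   Tmin=1.0, Tmax=100.0, max_iteration=500)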
def set(self, walkers, message_prefix, adjust_step_factor = 0.9):
    """Updates the stepsize to achieve a trajectory acceptance rate in, or as close as
    possible to, the range [self.sampler.min_rate, self.sampler.max_rate], with
    stepsize in the range [10^-50, self.sampler.dt_max].

    Args:
        walkers : This MUST be an array or list of NewWalker class objects. These are
            NOT updated by this method.
        message_prefix (str/None) : if message_prefix is not None, then a message is
            printed describing the change in dt. If message_prefix is None, then no
            message is printed.
        adjust_step_factor (float) : self.sampler.dt is updated by * or / by this value.

    Return:
        duration (float) : duration of call to set in seconds. Can be useful for
            checking the fraction of time spent updating step lengths.
    """
    start_time = time.time()

    if (self.nproc > 1):
        set_pool = ProcessPool(nodes=self.nproc)
    else:
        set_pool = None

    steplength_store = self.sampler.dt
    steplength_in = self.sampler.dt
    # protects against possible future bugs that would be hard to detect

    # rounds up to the next multiple of self.nproc for maximum usage of compute
    walk_n_walkers = int(self.nproc * np.ceil(float(self.min_num_data_point)/self.nproc))
    # expensive, but prevents this routine overwriting walkers
    walkers_clone = copy.deepcopy(walkers)

    first_time = True  # we will make at least two tries. Logical flag ensures this.

    # Step size calibration loop:
    while True:
        # collect statistics on the trajectory acceptance rate
        run_outputs = apply_pool(set_pool, self.run, np.random.choice(walkers_clone, \
            size=walk_n_walkers))
        results = list(map(itemgetter(1), run_outputs))
        del run_outputs

        # the fraction of accepted moves for this step size
        rate = float(np.sum(results))/walk_n_walkers

        if (rate >= self.sampler.min_rate and rate <= self.sampler.max_rate):
            # the acceptance rate is within the desired range: keep this step size
            self.print_dt_change(steplength_in, self.sampler.dt, message_prefix)
            break
        else:  # update the step size to get closer to the desired range
            if (not first_time):  # dodge this the first time round - no rate_store saved yet
                # check whether rate and rate_store are on different sides of the interval
                if ((min(rate, rate_store) < self.sampler.min_rate) and
                        (max(rate, rate_store) > self.sampler.max_rate)):
                    # We previously obtained an acceptance rate on one side of the
                    # desired range and now find an acceptance rate on the other side.
                    # We return the step size that gave an acceptance rate closest to
                    # the middle of the desired range.
                    target = 0.5*(self.sampler.min_rate + self.sampler.max_rate)
                    if (abs(rate - target) < abs(rate_store - target)):
                        # take current step length
                        self.print_dt_change(steplength_in, self.sampler.dt, \
                            message_prefix)
                        break
                    else:
                        # take saved step length
                        self.sampler.dt = steplength_store
                        rate = rate_store
                        self.print_dt_change(steplength_in, self.sampler.dt, \
                            message_prefix)
                        break
            else:  # this is the first time - no rate_store saved yet
                first_time = False

            # save current step length and acceptance rate
            steplength_store = self.sampler.dt
            rate_store = rate

            # update step length: shrink dt if acceptance is too low, grow it if too high
            if rate < self.sampler.min_rate:
                exp = 1.0
            elif rate >= self.sampler.max_rate:
                exp = -1.0
            self.sampler.dt *= adjust_step_factor**exp

            # error check: a step size below 10^-50 almost certainly signals a problem
            if (self.sampler.dt < 1.0e-50):
                if (message_prefix is not None):
                    prfx = message_prefix + (" stepsizes got stepsize= '%e': too small. "
                        "Is everything correct?\n" % (self.sampler.dt))
                else:
                    prfx = " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)
                exit_error(prfx, 25)

            # sampling demands a step size no larger than dt_max. Set to dt_max then break
            if (self.sampler.dt > self.sampler.dt_max):
                self.sampler.dt = self.sampler.dt_max
                self.print_dt_change(steplength_in, self.sampler.dt, message_prefix)
                break

    # close pool
    if (set_pool is not None):
        set_pool.terminate()
        set_pool.restart()

    end_time = time.time()
    duration = end_time - start_time
    return duration
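# The calibration loop above implements a simple bracketing search: shrink dt
# while acceptance is too low, grow it while too high, and stop once the rate
# enters [min_rate, max_rate] (or, in the full version, once two successive
# rates straddle the range). A self-contained sketch of that core update rule,
# with a made-up acceptance model standing in for the real trajectory runs:
def calibrate_dt_sketch(dt, rate_of, min_rate=0.6, max_rate=0.7, factor=0.9):
    """Return a dt whose (modelled) acceptance rate lies in [min_rate, max_rate]."""
    while True:
        rate = rate_of(dt)
        if min_rate <= rate <= max_rate:
            return dt
        dt *= factor if rate < min_rate else 1.0 / factor

# e.g. with rate_of = lambda dt: max(0.0, 1.0 - dt) and dt = 1.0,
# calibrate_dt_sketch converges to a dt with acceptance between 0.6 and 0.7.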