def extract(self) -> None:
    """Run ``_batch_process`` once per CPU core, then combine the results.

    The number of workers is ``cpu_count()``. ``__jump_unit`` is set to
    ``ceil(n_frames / n_cores)`` (presumably the number of frames each
    worker handles -- confirm against ``_batch_process``).

    When ``self.verbose`` is truthy, a process-safe lock is registered with
    tqdm and handed to every worker through the pool initializer so that
    progress bars written from different processes do not interleave.
    """
    self.__n_cores = cpu_count()
    self.__get_video_details()
    self.__jump_unit = ceil(self.n_frames / self.__n_cores)

    # Build the pool arguments first so exactly one pool is ever created;
    # previously a plain pool was created and then abandoned when verbose.
    pool_kwargs = {"processes": self.__n_cores}
    if self.verbose:
        tqdm.set_lock(RLock())
        pool_kwargs.update(
            initializer=tqdm.set_lock,
            initargs=(tqdm.get_lock(), ),
        )

    pool = Pool(**pool_kwargs)
    try:
        pool.map(self._batch_process, range(self.__n_cores))
    finally:
        # Release worker processes even if a batch raised.
        pool.close()
        pool.join()

    self.__combine()
freeze_support()  # for Windows support

# Work items, in descending order.
L = list(reversed(range(NUM_SUBITERS)))

# --- convenience wrappers around concurrent.futures -----------------------
print("Simple thread mapping")
thread_map(partial(progresser, write_safe=not PY2), L, max_workers=4)

print("Simple process mapping")
process_map(partial(progresser), L, max_workers=4)

# --- nested bars driven from a single process ------------------------------
print("Manual nesting")
for i in trange(16, desc="1"):
    inner_bar = trange(16, desc="2 @ %d" % i, leave=i % 2)
    for _ in inner_bar:
        sleep(0.01)

# --- process pool: every worker receives the shared tqdm lock --------------
print("Multi-processing")
tqdm.set_lock(RLock())
p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(), ))
p.map(partial(progresser, progress=True), L)

# --- thread pool: same idea, with a threading lock --------------------------
print("Multi-threading")
tqdm.set_lock(TRLock())
if PY2:
    # No initializer arguments are passed to the executor on Python 2.
    pool_args = {}
else:
    pool_args = {
        "initializer": tqdm.set_lock,
        "initargs": (tqdm.get_lock(), ),
    }
with ThreadPoolExecutor(**pool_args) as p:
    threaded_progresser = partial(
        progresser, progress=True, write_safe=not PY2, blocking=False)
    p.map(threaded_progresser, L)
DASHBOARD_STARTED_EVENT = None def get_function_details(_): pass def get_manager_client_dicts(): raise NotImplementedError logger = logging.getLogger(__name__) DATETIME_FORMAT = "%Y-%m-%d, %H:%M:%S" # Set lock for TQDM such that racing conditions are avoided when using multiple progress bars TQDM_LOCK = Lock() tqdm.set_lock(TQDM_LOCK) class ProgressBarHandler: def __init__(self, func: Callable, n_jobs: int, show_progress_bar: bool, progress_bar_total: int, progress_bar_position: int, worker_comms: WorkerComms, worker_insights: WorkerInsights) -> None: """ :param func: Function passed on to a WorkerPool map function :param n_jobs: Number of workers that are used :param show_progress_bar: When ``True`` will display a progress bar :param progress_bar_total: Total number of tasks that will be processed :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful wel using multiple progress bars at the same time :param worker_comms: Worker communication objects (queues, locks, events, ...)
def sample(
    self,
    n_samples,
    epsilon=None,
    prop_scale=0.5,
    burn=100,
    tune=True,
    tune_iter=500,
    tune_interval=100,
    stat_weight=1.,
    stat_scale=1.,
    use_pilot=False,
    chains=2,
    seed=None,
    return_journal=False,
):
    """Run the MCMC sampler, one worker process per chain.

    Parameters
    ----------
    n_samples : int
        Requested number of samples; forwarded to ``self.batches`` with
        ``force_equal=True``.
    epsilon : float, optional
        Acceptance threshold. Required unless ``use_pilot`` is ``True``.
    prop_scale : float, optional
        Proposal scale, stored on the instance as ``_prop_scale``.
    burn : int, optional
        Stored on the instance as ``_burn`` (presumably the number of
        burn-in iterations -- confirm against ``_sample``).
    tune, tune_iter, tune_interval : optional
        Accepted for interface compatibility; not read by this method.
    stat_weight : optional
        Stored on the instance as ``_stat_weight``.
    stat_scale : optional
        Stored on the instance as ``_stat_scale`` when ``use_pilot`` is
        ``False``; otherwise the existing value is left untouched.
    use_pilot : bool, optional
        If ``True``, reuse the settings from a previous pilot study
        instead of ``epsilon`` / ``stat_scale``.
    chains : int, optional
        Number of chains, forwarded to ``self.batches``; the value it
        returns is used as the number of worker processes.
    seed : optional
        Forwarded to ``self.batches`` to derive the per-chain seeds.
    return_journal : bool, optional
        If ``True``, return ``self.journal()``; otherwise return ``None``.

    Raises
    ------
    PilotStudyMissing
        If ``use_pilot`` is ``True`` but no pilot study has been run.
    ValueError
        If ``use_pilot`` is ``False`` and ``epsilon`` is ``None``.
    """
    if self._log:
        self.logger.info("Run MCMC sampler.")

    if use_pilot:
        if not self._done_pilot_study:
            msg = ("In order to use tuning from pilot study, the "
                   "pilot_study method must be run in advance.")
            raise PilotStudyMissing(msg)
    else:
        if epsilon is None:
            msg = ("epsilon must be passed.")
            raise ValueError(msg)
        self._epsilon = epsilon
        self._quantile = None
        self._stat_scale = stat_scale

    self._n_samples = n_samples
    self._burn = burn
    self._stat_weight = stat_weight

    # mcmc knobs
    self._prop_scale = prop_scale

    n_samples, chains, tasks, seeds = self.batches(
        n_samples, chains, seed, force_equal=True)

    if self._log:
        # For managing output contention: register a process-safe lock with
        # tqdm and pass the *function* plus that lock on to the workers.
        # (Assigning the result of ``tqdm.set_lock(...)`` would store None.)
        tqdm.set_lock(RLock())
        initializer = tqdm.set_lock
        initargs = (tqdm.get_lock(), )
    else:
        initializer = None
        initargs = None

    with ProcessPool(chains) as pool:
        r0, r1, r2, r3 = zip(*pool.map(self._sample,
                                       tasks,
                                       range(chains),
                                       seeds,
                                       initializer=initializer,
                                       initargs=initargs))

    # Each worker returns (samples, distances, summary stats, n accepted).
    self._original_samples = np.concatenate(r0, axis=0)
    self._samples = copy.deepcopy(self._original_samples)
    self._distances = np.concatenate(r1, axis=0)
    self._sum_stats = np.concatenate(r2, axis=0)
    self._n_accepted = np.sum(r3)

    self._done_sampling = True

    if return_journal:
        return self.journal()
def pilot_study(
    self,
    n_sim=500,
    quantile=None,
    stat_scale=None,
    stat_weight=1.,
    n_jobs=-1,
    seed=None,
):
    r"""Perform pilot study.

    The pilot study runs the simulator ``n_sim`` times and sets the
    threshold parameter ``epsilon`` automatically as the q-quantile of
    simulated distances from the prior predictive distribution. For
    instance, the 0.5-quantile (the median) will give a threshold that
    accepts 50% of the simulations.

    The pilot study can also be used to provide an estimate of the
    ``stat_scale`` parameter, used in the weighted Euclidean distance,
    from the prior predictive distribution by passing the ``stat_scale``
    keyword as ``sd`` or ``mad``. The ``stat_scale`` parameter is used to
    avoid dominance of particular summary statistics. ``stat_scale=sd``
    scales the summary statistics according to their standard deviation
    (SD) estimated from the prior predictive samples, and
    ``stat_scale=mad`` according to their median absolute deviation (MAD).

    It is important to note that if more than 50% of the prior predictive
    samples for a particular summary statistic have identical values, MAD
    will equal zero. In this case, the logger will raise a warning and the
    scale for the particular summary statistic will be set to SD instead.
    If there is no variability at all, the scale will be set to 1. It is
    recommended to check that there are not too many identical samples
    before setting the scale, to avoid surprises.

    Infinite distances are treated as missing when the quantile is
    computed.

    Parameters
    ----------
    n_sim : `int`, optional
        Number of simulator runs.
    quantile : `float`
        Quantile of the Euclidean distances; must lie in ``(0, 1]``.
    stat_scale : `str`, optional
        Summary statistics scale to estimate; can be set as either ``sd``
        (standard deviation) or ``mad`` (median absolute deviation).
        If ``None``, scale is set to ``1.0``. Default: ``None``.
    stat_weight : {`int`, `float`}, `numpy.ndarray`, optional
        Importance weights of summary statistics. Default: ``1.0``.
    n_jobs : `int`, optional
        Number of processes (workers). If ``n_jobs=-1``, then ``n_jobs`` is
        set to half of the CPUs found by
        `Pathos <https://pathos.readthedocs.io/en/latest/pathos.html>`_
        (we assume half of the CPUs are hardware threads only and ignore
        those). Default: ``-1``.
    seed : `int`
        User-provided seed. Will be used to generate seed for each worker.
        Default: ``None``.

    Raises
    ------
    ValueError
        If ``quantile`` is missing or outside ``(0, 1]``, or if
        ``stat_scale`` is a string that is not a valid scale name.
    """
    if quantile is None:
        msg = ("quantile must be passed. The pilot study sets the "
               "accept/reject threshold as the provided q-quantile of the "
               "distances.")
        raise ValueError(msg)

    if not 0 < quantile <= 1.0:
        msg = ("quantile must be a value in (0, 1].")
        raise ValueError(msg)

    if isinstance(stat_scale, str):
        if stat_scale not in VALID_STAT_SCALES:
            msg = ("scale can be set as either sd (standard deviation) or "
                   "mad (median absolute deviation). If None, it defaults "
                   "to 1.")
            raise ValueError(msg)

    if self._log:
        msg = "Run pilot study to estimate:\n"
        msg += f"* epsilon as the {quantile}-quantile of the distances"
        if stat_scale is not None:
            msg += f"\n* summary statistics scale ({stat_scale.upper()}) "
            msg += "from the prior predictive distribution"
        self.logger.info(msg)

    self._quantile = quantile

    _, n_jobs, tasks, seeds = self.batches(n_sim, n_jobs, seed)

    if self._log:
        # For managing output contention: register a process-safe lock with
        # tqdm, then hand workers the setter function together with that
        # very same lock (not the setter's return value, and not a fresh,
        # unrelated lock).
        tqdm.set_lock(RLock())
        initializer = tqdm.set_lock
        initargs = (tqdm.get_lock(), )
    else:
        initializer = None
        initargs = None

    if n_jobs == 1:
        # Single worker: run in-process and skip the pool entirely.
        sum_stats = self._pilot_study(tasks[0], 0, seeds[0])
    else:
        with ProcessPool(n_jobs) as pool:
            results = pool.map(self._pilot_study,
                               tasks,
                               range(n_jobs),
                               seeds,
                               initializer=initializer,
                               initargs=initargs)
        sum_stats = np.concatenate(results, axis=0)

    if stat_scale is None:
        self._stat_scale = 1.
    elif stat_scale == "sd":
        self._stat_scale = self._sd(sum_stats)
        if 0 in self._stat_scale:
            idx = np.where(self._stat_scale == 0)
            if self._log:
                msg = (f"Encounterd SD = 0 for summary statistic at index:"
                       f" {idx[0]}. Setting this to 1.")
                self.logger.warning(msg)
            self._stat_scale[idx] = 1.
    elif stat_scale == "mad":
        self._stat_scale = self._mad(sum_stats)
        if 0 in self._stat_scale:
            # MAD = 0 for some summary statistics: fall back to SD there.
            idx = np.where(self._stat_scale == 0)
            if self._log:
                msg = (f"Encounterd MAD = 0 for summary statistic at index:"
                       f" {idx[0]}. Setting this to SD "
                       "(or 1 if also SD = 0).")
                self.logger.warning(msg)
            backup_stat_scale = sum_stats.std(axis=0)
            self._stat_scale[idx] = backup_stat_scale[idx]
            if 0 in self._stat_scale:
                # The SD fallback can itself be 0; use 1 in that case.
                idx = np.where(self._stat_scale == 0)
                self._stat_scale[idx] = 1.

    distances = np.array(
        [
            self.distance(sum_stat,
                          self._obs_sumstat,
                          weight=stat_weight,
                          scale=self._stat_scale)
            for sum_stat in sum_stats
        ],
        dtype=np.float64,
    )
    # Mask infinite distances so they do not influence the quantile.
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    # NumPy 2.
    distances[distances == np.inf] = np.nan
    self._epsilon = np.nanquantile(distances, self._quantile)

    self._done_pilot_study = True

    if self._log:
        self.logger.info(f"epsilon = {self._epsilon}")
        self.logger.info(f"stat_scale = {self._stat_scale}")
def sample(
    self,
    n_samples,
    epsilon=None,
    stat_weight=1.,
    stat_scale=1.,
    use_pilot=False,
    n_jobs=-1,
    seed=None,
    return_journal=False,
):
    """Run the rejection sampler across a pool of worker processes.

    The workload is split by ``self._batches`` and each batch is handled by
    ``self._sample`` in its own process. The per-worker results are
    concatenated and stored on the instance.

    With ``use_pilot=True`` the settings of a previous pilot study are
    reused; otherwise ``epsilon`` is mandatory and, together with
    ``stat_scale``, is stored on the instance. ``self.journal()`` is
    returned only when ``return_journal`` is true.

    Raises ``PilotStudyMissing`` when ``use_pilot`` is set but no pilot
    study has been run, and ``ValueError`` when ``epsilon`` is required
    but missing.
    """
    if self._log:
        self.logger.info("Run rejection sampler.")

    # Guard clauses for the two mutually exclusive configuration modes.
    if use_pilot and not self._done_pilot_study:
        raise PilotStudyMissing(
            "In order to use tuning from pilot study, the "
            "pilot_study method must be run in advance."
        )
    if not use_pilot:
        if epsilon is None:
            raise ValueError("epsilon must be passed.")
        self._epsilon = epsilon
        self._quantile = None
        self._stat_scale = stat_scale

    self._n_samples = n_samples
    self._stat_weight = stat_weight

    _, n_jobs, tasks, seeds = self._batches(n_samples, n_jobs, seed)

    worker_init = None
    if self._log:
        # Shared lock for managing output contention between workers.
        tqdm.set_lock(RLock())
        # NOTE(review): no ``initargs`` accompany this initializer --
        # confirm how ``ProcessPool.map`` consumes the keyword.
        worker_init = tqdm.set_lock

    with ProcessPool(n_jobs) as pool:
        per_worker = pool.map(self._sample,
                              tasks,
                              range(n_jobs),
                              seeds,
                              initializer=worker_init)
        accepted, dists, stats, sim_counts = zip(*per_worker)

    self._original_samples = np.concatenate(accepted, axis=0)
    self._samples = copy.deepcopy(self._original_samples)
    self._distances = np.concatenate(dists, axis=0)
    self._sum_stats = np.concatenate(stats, axis=0)
    self._n_sims = np.sum(sim_counts)

    self._done_sampling = True

    if return_journal:
        return self.journal()
def pilot_study(
    self,
    n_sim=500,
    quantile=None,
    stat_scale=None,  # accept sd, mad
    stat_weight=1.,
    n_jobs=-1,
    seed=None,
):
    """Estimate the threshold and, optionally, the summary statistic scale.

    ``self._pilot_study`` is run in a pool of worker processes; the
    collected summary statistics are used to set ``self._stat_scale``
    (``1.0``, the per-column standard deviation, or the per-column median
    absolute deviation) and ``self._epsilon`` (the ``quantile``-quantile of
    the distances to the observed summary statistics).

    Raises ``ValueError`` if ``quantile`` is not given or ``stat_scale`` is
    not a recognised scale name.
    """
    if self._log:
        parts = [
            "Run pilot study to estimate:\n",
            "* epsilon as the p-quantile of the distances",
        ]
        if stat_scale is not None:
            parts.append("\n* summary statistics scale from the prior ")
            parts.append("predictive distribution")
        self.logger.info("".join(parts))

    if quantile is None:
        raise ValueError(
            "quantile must be passed. The pilot study sets the "
            "accept/reject threshold as the provided p-quantile of the "
            "distances."
        )

    if stat_scale is not None and stat_scale not in VALID_STAT_SCALES:
        raise ValueError(
            "scale can be set as either sd (standard deviation) or "
            "mad (median absolute deviation). If None, it defaults "
            "to 1."
        )

    self._quantile = quantile

    _, n_jobs, tasks, seeds = self._batches(n_sim, n_jobs, seed)

    worker_init = None
    if self._log:
        # Shared lock for managing output contention between workers.
        tqdm.set_lock(RLock())
        worker_init = tqdm.set_lock

    with ProcessPool(n_jobs) as pool:
        per_worker = pool.map(self._pilot_study,
                              tasks,
                              range(n_jobs),
                              seeds,
                              initializer=worker_init)

    sum_stats = np.concatenate(per_worker, axis=0)

    if stat_scale is None:
        self._stat_scale = 1.
    elif stat_scale == "sd":
        self._stat_scale = sum_stats.std(axis=0)
    elif stat_scale == "mad":
        # Median absolute deviation from the column-wise median.
        centre = np.median(sum_stats, axis=0)
        abs_dev = np.absolute(sum_stats - centre)
        self._stat_scale = np.median(abs_dev, axis=0)
    else:
        raise ValueError(
            "scale can be set as either sd (standard deviation) or "
            "mad (median absolute deviation). If None, defaults to 1."
        )

    distances = [
        self._distance(row,
                       self._obs_sumstat,
                       weight=stat_weight,
                       scale=self._stat_scale)
        for row in sum_stats
    ]

    self._epsilon = np.quantile(np.array(distances), self._quantile)
    self._done_pilot_study = True
def sample(self, n_samples, epsilon=None, quantile=None, n_tune=500,
           n_jobs=-1, log=False):
    """Run the Rejection ABC sampler in parallel and return a journal.

    Parameters
    ----------
    n_samples : int
        Number of samples, split over the workers by
        ``distribute_workload``.
    epsilon : float, optional
        Acceptance threshold, stored as ``self._epsilon``. Overridden when
        ``quantile`` is given.
    quantile : float, optional
        If given, ``self._pilot_study`` is run with ``n_tune`` and the
        first worker seed, and ``self._epsilon`` is set to this quantile
        of the distances it returns.
    n_tune : int, optional
        Passed to ``self._pilot_study`` when ``quantile`` is given.
    n_jobs : int, optional
        Requested number of worker processes; the effective value is
        whatever ``check_and_set_jobs`` returns.
    log : bool, optional
        If ``True``, a logger is set up and the workers run
        ``self._sample_with_log`` under a shared tqdm lock; otherwise they
        run ``self._sample``.

    Returns
    -------
    Journal
        Journal populated with the concatenated worker results.

    Notes
    -----
    With multiple processes the timing figures shown by the progress bars
    (iterations per second, total time, ...) may be unstable.
    """
    _inference_scheme = "Rejection ABC"
    self._epsilon = epsilon

    if log:
        self.logger = setup_logger(self.__class__.__name__)
        self.logger.info(f"Run {_inference_scheme} sampler.")
        n_jobs = check_and_set_jobs(n_jobs, self.logger)
    else:
        n_jobs = check_and_set_jobs(n_jobs)

    seeds = generate_seed_sequence(self._seed, n_jobs)
    # Computed once; the workload does not depend on the quantile branch.
    tasks = distribute_workload(n_samples, n_jobs)

    if quantile is not None:
        distances_tune = self._pilot_study(n_tune, seeds[0])
        self._epsilon = np.quantile(np.array(distances_tune), quantile)

    if log:
        # The lock is set before the pool is created, as in the original
        # ordering; it manages output contention between workers.
        tqdm.set_lock(RLock())
        with ProcessPool(n_jobs) as pool:
            samples, distances, sum_stats, epsilons, n_sims = zip(
                *pool.map(self._sample_with_log,
                          tasks,
                          range(n_jobs),
                          seeds,
                          initializer=tqdm.set_lock)
            )
    else:
        with ProcessPool(n_jobs) as pool:
            samples, distances, sum_stats, epsilons, n_sims = zip(
                *pool.map(self._sample, tasks, seeds)
            )

    # Merge the per-worker results.
    samples = np.concatenate(samples, axis=0)
    distances = np.concatenate(distances, axis=0)
    sum_stats = np.concatenate(sum_stats, axis=0)
    epsilons = np.concatenate(epsilons, axis=0)
    n_sims = np.sum(n_sims)

    journal = Journal()
    journal._write_to_journal(
        observation=self._obs_data,
        simulator=self._simulator,
        stat_calc=self._stat_calc,
        priors=self._priors,
        distance_metric=self._distance_metric,
        inference_scheme=_inference_scheme,
        n_samples=n_samples,
        n_simulations=n_sims,
        posterior_samples=samples,
        summary_stats=sum_stats,
        distances=distances,
        epsilons=epsilons,
        log=log)

    return journal