Example #1
    def _locate_sample_offsets(cls, data_path: str, n_jobs: int) -> np.ndarray:
        stat_result = os.stat(data_path)
        chunk_size, _ = divmod(stat_result.st_size, n_jobs)

        # Split the file into n_jobs roughly equal chunks, advancing each
        # boundary to the next newline so every chunk starts on a sample.
        chunk_starts = [0]
        with open(data_path, mode='rb') as infile:
            while infile.tell() < stat_result.st_size:
                infile.seek(chunk_size, os.SEEK_CUR)
                infile.readline()
                chunk_starts.append(min(infile.tell(), stat_result.st_size))

        with mp.Pool(processes=n_jobs,
                     initializer=tqdm.set_lock,
                     initargs=(tqdm.get_lock(), ),
                     maxtasksperchild=1) as pool:
            try:
                return np.asarray(
                    list(
                        itertools.chain.from_iterable(
                            pool.imap(functools.partial(
                                Criteo._locate_sample_offsets_job, data_path),
                                      iterable=enumerate(
                                          zip(chunk_starts[:-1],
                                              chunk_starts[1:]))))))
            except KeyboardInterrupt:
                raise
            finally:
                # should be redundant: the context manager already terminates
                # the pool, but keep this as an explicit safeguard
                pool.terminate()
                pool.join()
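
The worker that the pool maps over, `_locate_sample_offsets_job`, is not part of this snippet. A minimal sketch of what such a job might look like, sketched here as a module-level function and assuming it simply records the byte offset of every line in its `[start, end)` slice while showing a per-chunk progress bar (the body and the bar layout are assumptions, not the original implementation):

import os
from typing import List, Tuple

from tqdm import tqdm


def _locate_sample_offsets_job(data_path: str,
                               task: Tuple[int, Tuple[int, int]]) -> List[int]:
    chunk_idx, (start, end) = task
    offsets = []
    with open(data_path, mode='rb') as infile:
        infile.seek(start)
        # position=chunk_idx gives every worker its own bar row; the rows do
        # not garble each other because the parent shared its lock through
        # the pool initializer.
        with tqdm(total=end - start, position=chunk_idx, unit='B',
                  unit_scale=True, desc=f'chunk {chunk_idx}') as pbar:
            while infile.tell() < end:
                offsets.append(infile.tell())
                pbar.update(len(infile.readline()))
    return offsets
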
Example #2
def run_downloader(args: Arguments, dataset):
    """
    Inputs:
        args: (Arguments) parsed arguments; args.num_proc is the number of
            worker processes to spawn
        dataset: iterable of items (e.g. image URLs) consumed by
            image_downloader
    """
    print(f"Running {args.num_proc} processes")
    with Pool(args.num_proc,
              initializer=tqdm.set_lock,
              initargs=(tqdm.get_lock(), )) as pool:
        list(pool.imap_unordered(
            image_downloader,
            dataset,
            chunksize=1,
        ))
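
`image_downloader` itself is not shown. A minimal sketch of a compatible worker, assuming each dataset item is a `(url, out_path)` pair; that layout and the use of `urllib` are assumptions:

import os
import urllib.request

from tqdm import tqdm


def image_downloader(item):
    # Hypothetical worker: the real one is not part of the snippet above.
    url, out_path = item
    urllib.request.urlretrieve(url, out_path)
    # tqdm.write uses the lock installed by the pool initializer, so status
    # lines from different worker processes do not interleave mid-line.
    tqdm.write(f"downloaded {os.path.basename(out_path)}")
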
Example #3
    def extract(self) -> None:
        self.__n_cores = cpu_count()
        self.__get_video_details()

        self.__jump_unit = ceil(self.n_frames / self.__n_cores)

        if self.verbose:
            # Share the tqdm write lock with the workers so their progress
            # bars do not garble each other's output.
            tqdm.set_lock(RLock())
            pool = Pool(
                processes=self.__n_cores,
                initializer=tqdm.set_lock,
                initargs=(tqdm.get_lock(), ),
            )
        else:
            pool = Pool(processes=self.__n_cores)

        pool.map(self._batch_process, range(self.__n_cores))
        pool.close()
        pool.join()

        self.__combine()
Example #4
    def __init__(self, prefer="threads", max_workers=None):
        if prefer == "threads":
            pool_cls = ThreadPool
        elif prefer == "processes":
            pool_cls = Pool
        else:
            raise ValueError(
                "Expected argument to be `threads` or `processes` "
                "but got {}".format(prefer))

        if max_workers is None:
            max_workers = cpu_count()

        self.prefer = prefer
        self.max_workers = max_workers

        self._pool = pool_cls(max_workers,
                              initializer=tqdm.set_lock,
                              initargs=[tqdm.get_lock()])
Example #5
    def _count_field_features(cls, data_path: str, sample_offsets: List[int],
                              n_jobs: int) -> List[typing.Counter[bytes]]:
        with mp.Pool(processes=n_jobs,
                     initializer=tqdm.set_lock,
                     initargs=(tqdm.get_lock(), ),
                     maxtasksperchild=1) as pool:
            try:
                # Every job counts feature values over an interleaved slice
                # of the samples; the per-field Counters are then summed
                # across jobs.
                return list(
                    map(
                        functools.partial(functools.reduce,
                                          lambda x, y: x + y),
                        zip(*pool.imap_unordered(
                            functools.partial(Criteo._count_field_features_job,
                                              data_path),
                            iterable=((i, sample_offsets[i::n_jobs])
                                      for i in range(n_jobs))))))
            except KeyboardInterrupt:
                raise
            finally:
                # should be redundant: the context manager already terminates
                # the pool, but keep this as an explicit safeguard
                pool.terminate()
                pool.join()
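
As in the other Criteo helpers, the worker `_count_field_features_job` is referenced but not shown. It must return one Counter per field so the `zip(*...)`/`reduce` above can sum them across jobs; a minimal sketch, written as a plain function and assuming samples are tab-separated byte records (the record layout is an assumption):

import collections
from typing import List, Sequence, Tuple

from tqdm import tqdm


def _count_field_features_job(
        data_path: str,
        task: Tuple[int, Sequence[int]]) -> List[collections.Counter]:
    job_idx, offsets = task
    counters: List[collections.Counter] = []
    with open(data_path, mode='rb') as infile:
        # One bar row per job; the shared lock from the pool initializer
        # keeps the rows readable.
        for offset in tqdm(offsets, desc=f'job {job_idx}', position=job_idx):
            infile.seek(offset)
            fields = infile.readline().rstrip(b'\n').split(b'\t')
            while len(counters) < len(fields):
                counters.append(collections.Counter())
            for counter, field in zip(counters, fields):
                counter[field] += 1
    return counters
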
Example #6
    def _locate_sample_offsets(cls, data_uri: str, n_jobs: int) -> np.ndarray:
        finfo = fs.file_info(data_uri)
        chunk_size, _ = divmod(finfo.size, n_jobs)

        chunk_starts = [0]
        with fs.open_buffered_file_reader(data_uri) as infile:
            while infile.tell() < finfo.size:
                infile.seek(chunk_size, os.SEEK_CUR)
                infile.readline()
                chunk_starts.append(min(infile.tell(), finfo.size))

        with mp.Pool(processes=n_jobs,
                     initializer=tqdm.set_lock,
                     initargs=(tqdm.get_lock(), ),
                     maxtasksperchild=1) as pool:
            return np.asarray(
                list(
                    itertools.chain.from_iterable(
                        pool.imap(functools.partial(
                            LIBSVMDataset._locate_sample_offsets_job,
                            data_uri),
                                  iterable=enumerate(
                                      zip(chunk_starts[:-1],
                                          chunk_starts[1:]))))))
Example #7
    L = list(range(NUM_SUBITERS))[::-1]

    print("Simple thread mapping")
    thread_map(partial(progresser, write_safe=not PY2), L, max_workers=4)

    print("Simple process mapping")
    process_map(progresser, L, max_workers=4)

    print("Manual nesting")
    for i in trange(16, desc="1"):
        for _ in trange(16, desc="2 @ %d" % i, leave=i % 2):
            sleep(0.01)

    print("Multi-processing")
    tqdm.set_lock(RLock())
    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(), ))
    p.map(partial(progresser, progress=True), L)

    print("Multi-threading")
    tqdm.set_lock(TRLock())
    pool_args = {}
    if not PY2:
        pool_args.update(initializer=tqdm.set_lock,
                         initargs=(tqdm.get_lock(), ))
    with ThreadPoolExecutor(**pool_args) as p:
        p.map(
            partial(progresser,
                    progress=True,
                    write_safe=not PY2,
                    blocking=False), L)
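
`progresser` is defined elsewhere in this script; a minimal sketch of a worker compatible with the calls above, assuming each task draws its own bar on row `n` (only the keyword names come from the calls, the body is an assumption):

def progresser(n, progress=True, write_safe=True, blocking=True):
    # write_safe is accepted only for signature compatibility here; a real
    # worker might use it to guard tqdm.write calls.
    # One bar per task: row n, with the lock shared by the pool/executor
    # initializer keeping the rows from clobbering each other.
    for _ in trange(100, desc=f"task {n}", position=n,
                    disable=not progress,
                    lock_args=None if blocking else (False,)):
        sleep(0.001)
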
Example #8
    def sample(self,
               n_samples,
               epsilon=None,
               prop_scale=0.5,
               burn=100,
               tune=True,
               tune_iter=500,
               tune_interval=100,
               stat_weight=1.,
               stat_scale=1.,
               use_pilot=False,
               chains=2,
               seed=None,
               return_journal=False):
        """
        tune: bool
            Flag for tuning. Defaults to True.
        tune_interval: int
            The frequency of tuning. Defaults to 100 iterations.

        Due to multiprocessing, estimation time (iteration per loop, total
        time, etc.) could be unstable, but the progress bar works perfectly.

        A good choice for the number of jobs is the number of cores or processors on your computer.
        If your processor supports hyperthreading, you can select an even higher number of jobs.
        The number of jobs is set to the number of cores found in the system by default.

        There are some papers that suggest Metropolis-Hastings is most efficient
        when you accept 23.4% of proposed samples, and it turns out that lowering
        step size increases the probability of accepting a proposal. PyMC3 will
        spend the first 500 steps increasing and decreasing the step size to try
        to find the best value of sd that will give you an acceptance rate of
        23.4% (you can even set different acceptance rates).

        burn : either burn away or add to n_samples
        """

        if self._log:
            self.logger.info("Run MCMC sampler.")

        if use_pilot:
            if not self._done_pilot_study:
                msg = ("In order to use tuning from pilot study, the "
                       "pilot_study method must be run in advance.")
                raise PilotStudyMissing(msg)
        else:
            if epsilon is None:
                msg = ("epsilon must be passed.")
                raise ValueError(msg)
            self._epsilon = epsilon
            self._quantile = None
            self._stat_scale = stat_scale

        self._n_samples = n_samples
        self._burn = burn
        self._stat_weight = stat_weight

        # These are set in base instead
        #self._prior_logpdfs = [prior.logpdf for prior in self._priors]
        #self._rng = np.random.default_rng
        #self._uniform_distr = stats.uniform(loc=0, scale=1)

        # mcmc knobs
        self._prop_scale = prop_scale

        # force equal, n_samples
        n_samples, chains, tasks, seeds = self.batches(n_samples,
                                                       chains,
                                                       seed,
                                                       force_equal=True)
        # n_samples + burn

        if self._log:
            # For managing output contention: create the lock in the parent
            # and pass the initializer function (not its return value) plus
            # the lock on to the worker processes.
            tqdm.set_lock(RLock())
            initializer = tqdm.set_lock
            initargs = (tqdm.get_lock(), )
        else:
            initializer = None
            initargs = None

        with ProcessPool(chains) as pool:
            r0, r1, r2, r3 = zip(*pool.map(self._sample,
                                           tasks,
                                           range(chains),
                                           seeds,
                                           initializer=initializer,
                                           initargs=initargs))

        #self._original_samples = np.stack(r0)
        self._original_samples = np.concatenate(r0, axis=0)
        self._samples = copy.deepcopy(self._original_samples)
        self._distances = np.concatenate(r1, axis=0)
        self._sum_stats = np.concatenate(r2, axis=0)
        self._n_accepted = np.sum(r3)

        self._done_sampling = True

        if return_journal:
            return self.journal()
Example #9
    def sample(
        self,
        n_samples,
        epsilon=None,
        stat_weight=1.,
        stat_scale=1.,
        use_pilot=False,
        n_jobs=-1,
        seed=None,
        return_journal=False
    ):
        """
        Due to multiprocessing, estimation time (iteration per loop, total
        time, etc.) could be unstable, but the progress bar works perfectly.

        A good choice for the number of jobs is the number of cores or processors on your computer.
        If your processor supports hyperthreading, you can select an even higher number of jobs.
        The number of jobs is set to the number of cores found in the system by default.
        """

        if self._log:
            self.logger.info("Run rejection sampler.")

        if use_pilot:
            if not self._done_pilot_study:
                msg = ("In order to use tuning from pilot study, the "
                       "pilot_study method must be run in advance.")
                raise PilotStudyMissing(msg)
        else:
            if epsilon is None:
                msg = ("epsilon must be passed.")
                raise ValueError(msg)
            self._epsilon = epsilon
            self._quantile = None
            self._stat_scale = stat_scale

        self._n_samples = n_samples
        self._stat_weight = stat_weight

        _, n_jobs, tasks, seeds = self.batches(n_samples, n_jobs, seed)

        if self._log:
            # For managing output contention: create the lock in the parent
            # and pass the initializer function (not its return value) plus
            # the lock on to the worker processes.
            tqdm.set_lock(RLock())
            initializer = tqdm.set_lock
            initargs = (tqdm.get_lock(),)
        else:
            initializer = None
            initargs = None

        if n_jobs == 1:
            r0, r1, r2, r3 = self._sample(tasks[0], 0, seeds[0])
            self._original_samples = np.stack(r0)
            self._samples = copy.deepcopy(self._original_samples)
            self._distances = np.stack(r1)
            self._sum_stats = np.stack(r2)
            self._n_sims = r3
        else:
            with ProcessPool(n_jobs) as pool:
                r0, r1, r2, r3 = zip(
                    *pool.map(
                        self._sample,
                        tasks,
                        range(n_jobs),
                        seeds,
                        initializer=initializer,
                        initargs=initargs
                    )
                )

            self._original_samples = np.concatenate(r0, axis=0)
            self._samples = copy.deepcopy(self._original_samples)
            self._distances = np.concatenate(r1, axis=0)
            self._sum_stats = np.concatenate(r2, axis=0)
            self._n_sims = np.sum(r3)

        self._done_sampling = True

        if return_journal:
            return self.journal()
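
The docstrings above state that the number of jobs defaults to the number of cores found on the system, but neither snippet shows how `n_jobs=-1` is resolved into a concrete worker count. A minimal sketch of that convention (a hypothetical helper, not the class's actual `batches` logic):

from multiprocessing import cpu_count


def resolve_n_jobs(n_jobs=-1):
    # -1 (or None) means "use every core the system reports"; any positive
    # value is taken at face value.
    if n_jobs is None or n_jobs < 1:
        return cpu_count()
    return n_jobs
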