def test_n_splits(self):
    """
    Verify how ``apply_numpy_chunking`` splits the numpy test data when only ``n_splits`` is supplied.

    Every case pairs a requested ``n_splits`` with the number of chunks expected back. The chunks, laid end to end,
    must reproduce the input rows in their original order (100 rows in total). A final case checks that a
    ``ValueError`` is raised when ``n_splits`` is ``None`` and no other chunking argument is given.
    """
    cases = [(1, 1), (3, 3), (150, 100)]
    for requested_splits, n_expected in cases:
        with self.subTest(n_splits=requested_splits):
            chunk_generator, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                self.test_data_numpy, n_splits=requested_splits)

            # Exhaust the generator, then walk the chunks while tracking where each one should start in the input
            chunks = list(chunk_generator)
            self.assertEqual(len(chunks), n_expected)
            start = 0
            for chunk in chunks:
                rows = chunk[0]
                self.assertIsInstance(rows, np.ndarray)
                stop = start + len(rows)
                np.testing.assert_array_equal(rows, self.test_data_numpy[start:stop])
                start = stop
            self.assertEqual(start, 100)

            # The remaining return values
            self.assertEqual(returned_len, n_expected)
            self.assertEqual(returned_chunk_size, 1)
            self.assertIsNone(returned_n_splits)

    # Leaving both chunk_size and n_splits unset must be rejected once the generator is consumed
    with self.subTest(n_splits=None), self.assertRaises(ValueError):
        chunk_generator, *_ = apply_numpy_chunking(self.test_data_numpy, n_splits=None)
        list(chunk_generator)
def test_n_jobs(self):
    """
    Verify how ``apply_numpy_chunking`` splits the numpy test data when only ``n_jobs`` is supplied.

    With ``chunk_size`` and ``n_splits`` both left at ``None``, ``n_jobs * 4`` is used as the number of splits. The
    cases below expect that number to be capped at 100 chunks.
    """
    cases = [(1, 4), (3, 12), (40, 100), (150, 100)]
    for requested_jobs, n_expected in cases:
        with self.subTest(n_jobs=requested_jobs):
            chunk_generator, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                self.test_data_numpy, n_jobs=requested_jobs)

            # Exhaust the generator, then walk the chunks while tracking where each one should start in the input
            chunks = list(chunk_generator)
            self.assertEqual(len(chunks), n_expected)
            start = 0
            for chunk in chunks:
                rows = chunk[0]
                self.assertIsInstance(rows, np.ndarray)
                stop = start + len(rows)
                np.testing.assert_array_equal(rows, self.test_data_numpy[start:stop])
                start = stop
            self.assertEqual(start, 100)

            # The remaining return values
            self.assertEqual(returned_len, n_expected)
            self.assertEqual(returned_chunk_size, 1)
            self.assertIsNone(returned_n_splits)
def test_chunk_size(self):
    """
    Verify how ``apply_numpy_chunking`` splits the numpy test data when ``chunk_size`` is supplied.

    ``n_splits=1`` is always passed as well, so the cases with an explicit ``chunk_size`` also show that
    ``chunk_size`` takes precedence over ``n_splits``.
    """
    cases = [(1, 100), (3, 34), (200, 1), (None, 1)]
    for requested_chunk_size, n_expected in cases:
        with self.subTest(chunk_size=requested_chunk_size):
            chunk_generator, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                self.test_data_numpy, chunk_size=requested_chunk_size, n_splits=1)

            # Exhaust the generator. Every chunk should hold ``step`` rows, except for the last one, which may be
            # smaller. Without an explicit chunk size a single chunk of 100 rows is expected
            chunks = list(chunk_generator)
            self.assertEqual(len(chunks), n_expected)
            step = requested_chunk_size or 100
            for position, chunk in enumerate(chunks):
                rows = chunk[0]
                self.assertIsInstance(rows, np.ndarray)
                begin = position * step
                np.testing.assert_array_equal(rows, self.test_data_numpy[begin:begin + step])

            # The remaining return values
            self.assertEqual(returned_len, n_expected)
            self.assertEqual(returned_chunk_size, 1)
            self.assertIsNone(returned_n_splits)
def test_iterable_len(self):
    """
    Verify that ``apply_numpy_chunking`` honours ``iterable_len``.

    A value smaller than the input length should truncate the input to that many rows. A larger value or ``None``
    should leave the input untouched. ``n_splits=1`` is used so that everything ends up in a single chunk.
    """
    cases = [(5, 5), (150, 100), (None, 100)]
    for requested_len, n_rows_expected in cases:
        with self.subTest(iterable_len=requested_len):
            chunk_generator, returned_len, returned_chunk_size, returned_n_splits = apply_numpy_chunking(
                self.test_data_numpy, iterable_len=requested_len, n_splits=1)

            # Exhaust the generator; the single chunk must equal the leading rows of the input
            chunks = list(chunk_generator)
            self.assertEqual(len(chunks), 1)
            only_rows = chunks[0][0]
            self.assertIsInstance(only_rows, np.ndarray)
            np.testing.assert_array_equal(only_rows, self.test_data_numpy[:n_rows_expected])

            # The remaining return values
            self.assertEqual(returned_len, 1)
            self.assertEqual(returned_chunk_size, 1)
            self.assertIsNone(returned_n_splits)
def imap_unordered(
        self, func_pointer: Callable, iterable_of_args: Union[Iterable, np.ndarray],
        iterable_len: Optional[int] = None, max_tasks_active: Optional[int] = None,
        chunk_size: Optional[int] = None, n_splits: Optional[int] = None, worker_lifespan: Optional[int] = None,
        progress_bar: bool = False, progress_bar_position: int = 0) -> Generator[Any, None, None]:
    """
    Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
    the queue.

    This is a generator function: nothing below runs until the returned generator is iterated.

    :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
        worker ID the function should receive the worker ID as its first argument. If shared objects are provided
        the function should receive those as the next argument. If the worker state has been enabled it should
        receive a state variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function pointer
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
        to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
        iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue. The
        string ``'n_jobs*2'`` is accepted as well and is converted to ``self.n_jobs * 2``
    :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
        number of chunks
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
    :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :return: Generator yielding unordered results
    """
    # If we're dealing with numpy arrays, we have to chunk them here already
    if isinstance(iterable_of_args, np.ndarray):
        iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
            iterable_of_args, iterable_len, chunk_size, n_splits, self.n_jobs)

    # Check parameters and thereby obtain the number of tasks. The chunk_size and progress bar parameters could be
    # modified as well
    n_tasks, chunk_size, progress_bar = self._check_map_parameters(
        iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar,
        progress_bar_position)

    # Chunk the function arguments. Make single arguments when we're dealing with numpy arrays. Together with the
    # numpy branch above this guarantees ``iterator_of_chunked_args`` is always bound from here on
    if not isinstance(iterable_of_args, np.ndarray):
        iterator_of_chunked_args = chunk_tasks(iterable_of_args, n_tasks, chunk_size, n_splits or self.n_jobs * 4)

    # Start workers
    self.start_workers(func_pointer, worker_lifespan)

    # Create exception and progress bar handlers. The exception handler will receive any exceptions thrown by the
    # workers, terminates everything and re-raise an exception in the main process. The progress bar handler will
    # receive updates from the workers and updates the progress bar accordingly
    with ExceptionHandler(self.terminate, self._exception_queue, self.exception_caught, self._keep_order,
                          progress_bar is not None) as exception_handler, \
            ProgressBarHandler(func_pointer, progress_bar, n_tasks, progress_bar_position,
                               self._task_completed_queue, self._exception_queue, self.exception_caught):

        # Process all args in the iterable. If maximum number of active tasks is None, we avoid all the if and
        # try-except clauses to speed up the process. ``n_active`` counts the chunks that were handed out but whose
        # results have not been collected yet
        n_active = 0
        if max_tasks_active == 'n_jobs*2':
            max_tasks_active = self.n_jobs * 2

        if max_tasks_active is None:
            # Unlimited queue: hand out every chunk first. No results are yielded in this branch; they are all
            # collected by the blocking loop further down
            for chunked_args in iterator_of_chunked_args:
                # Stop given tasks when an exception was caught
                if self.exception_caught.is_set():
                    break

                # Add task
                self.add_task(chunked_args)
                n_active += 1

                # Restart workers if necessary
                self._restart_workers()

        elif isinstance(max_tasks_active, int):
            # Limited queue: alternate between handing out chunks and collecting results
            while not self.exception_caught.is_set():
                # Add task, only if allowed and if there are any
                if n_active < max_tasks_active:
                    try:
                        self.add_task(next(iterator_of_chunked_args))
                        n_active += 1
                    except StopIteration:
                        break

                # Check if new results are available, but don't wait for it
                try:
                    yield from self.get_result(block=False)
                    n_active -= 1
                except queue.Empty:
                    pass

                # Restart workers if necessary
                self._restart_workers()

        # Obtain the results not yet obtained. The get times out after 0.1 seconds, so the loop condition (and
        # thereby the exception flag) is re-evaluated regularly
        while not self.exception_caught.is_set() and n_active != 0:
            try:
                yield from self.get_result(block=True, timeout=0.1)
                n_active -= 1
            except queue.Empty:
                pass

            # Restart workers if necessary
            self._restart_workers()

        # Clean up time
        exception_handler.raise_on_exception()
        self.stop_and_join()
def imap(self, func_pointer: Callable, iterable_of_args: Union[Iterable, np.ndarray],
         iterable_len: Optional[int] = None, max_tasks_active: Optional[int] = None,
         chunk_size: Optional[int] = None, n_splits: Optional[int] = None, worker_lifespan: Optional[int] = None,
         progress_bar: bool = False, progress_bar_position: int = 0) -> Generator[Any, None, None]:
    """
    Same as ``multiprocessing.imap_unordered()``, but ordered. Also allows a user to set the maximum number of tasks
    available in the queue.

    Every task is tagged with its index before being handed to ``imap_unordered``. Results that arrive ahead of
    their turn are buffered and yielded as soon as all earlier results have been yielded. The keep-order flag is
    cleared again when the generator finishes, also when it finishes because of an exception.

    :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
        worker ID the function should receive the worker ID as its first argument. If shared objects are provided
        the function should receive those as the next argument. If the worker state has been enabled it should
        receive a state variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function pointer
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
        to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
        iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue
    :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
        number of chunks
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
    :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :return: Generator yielding ordered results
    """
    # Notify workers to keep order in mind
    self._keep_order.set()

    # The flag only applies to this call. Without the try/finally an exception raised while chunking or while
    # processing the tasks would leave it set, which would affect later calls on the same pool
    try:
        # If we're dealing with numpy arrays, we have to chunk them here already
        if isinstance(iterable_of_args, np.ndarray):
            iterable_of_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits, self.n_jobs)

        # The indexed generator passed on below has no ``__len__``, so determine the length up front
        if iterable_len is None and hasattr(iterable_of_args, '__len__'):
            iterable_len = len(iterable_of_args)

        # Yield results in order
        next_result_idx = 0
        tmp_results = {}
        for result_idx, result in self.imap_unordered(
                func_pointer, ((args_idx, args) for args_idx, args in enumerate(iterable_of_args)), iterable_len,
                max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar, progress_bar_position):

            # Check if the next one(s) to return is/are temporarily stored. We pop them from the dict to keep the
            # temporary store as small as possible
            while next_result_idx in tmp_results:
                yield tmp_results.pop(next_result_idx)
                next_result_idx += 1

            # Check if the current result is the next one to return. If so, return it
            if result_idx == next_result_idx:
                yield result
                next_result_idx += 1

            # Otherwise, temporarily store the current result
            else:
                tmp_results[result_idx] = result

        # Yield all remaining results. ``sorted`` builds a list first, so popping while iterating is safe
        for result_idx in sorted(tmp_results):
            yield tmp_results.pop(result_idx)
    finally:
        # Notify workers to forget about order
        self._keep_order.clear()
def map(self, func_pointer: Callable, iterable_of_args: Union[Iterable, np.ndarray],
        iterable_len: Optional[int] = None, max_tasks_active: Optional[int] = None,
        chunk_size: Optional[int] = None, n_splits: Optional[int] = None, worker_lifespan: Optional[int] = None,
        progress_bar: bool = False, progress_bar_position: int = 0,
        concatenate_numpy_output: bool = True) -> Union[List[Any], np.ndarray]:
    """
    Same as ``multiprocessing.map()``. Also allows a user to set the maximum number of tasks available in the queue.
    Note that this function can be slower than the unordered version.

    Every task is tagged with its index before being handed to ``map_unordered``; the results are sorted on that
    index afterwards. The keep-order flag is always cleared before returning, also when processing raised.

    :param func_pointer: Function pointer to call each time new task arguments become available. When passing on the
        worker ID the function should receive the worker ID as its first argument. If shared objects are provided
        the function should receive those as the next argument. If the worker state has been enabled it should
        receive a state variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function pointer
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
        to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
        iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. Use ``None`` to not limit the queue
    :param chunk_size: Number of simultaneous tasks to give to a worker. If ``None`` it will generate ``n_jobs * 4``
        number of chunks
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``
    :param worker_lifespan: Number of chunks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :param concatenate_numpy_output: When ``True`` it will concatenate numpy output to a single numpy array
    :return: List with ordered results, or a single numpy array when the results are numpy arrays and
        ``concatenate_numpy_output`` is ``True``
    """
    # Notify workers to keep order in mind
    self._keep_order.set()

    # The flag only applies to this call. Without the try/finally an exception raised while chunking or while
    # processing the tasks would leave it set, which would affect later calls on the same pool
    try:
        # If we're dealing with numpy arrays, we have to chunk them here already
        if isinstance(iterable_of_args, np.ndarray):
            iterable_of_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
                iterable_of_args, iterable_len, chunk_size, n_splits, self.n_jobs)

        # The indexed generator passed on below has no ``__len__``, so determine the length up front
        if iterable_len is None and hasattr(iterable_of_args, '__len__'):
            iterable_len = len(iterable_of_args)

        # Process all args
        results = self.map_unordered(
            func_pointer, ((args_idx, args) for args_idx, args in enumerate(iterable_of_args)), iterable_len,
            max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar, progress_bar_position)
    finally:
        # Notify workers to forget about order
        self._keep_order.clear()

    # Rearrange on task index and drop that index again
    sorted_results = [indexed_result[1] for indexed_result in sorted(results, key=lambda indexed: indexed[0])]

    # Convert back to numpy if necessary
    if sorted_results and concatenate_numpy_output and isinstance(sorted_results[0], np.ndarray):
        return np.concatenate(sorted_results)
    return sorted_results
def imap_unordered(
        self, func: Callable, iterable_of_args: Union[Sized, Iterable, np.ndarray],
        iterable_len: Optional[int] = None, max_tasks_active: Optional[int] = None,
        chunk_size: Optional[int] = None, n_splits: Optional[int] = None, worker_lifespan: Optional[int] = None,
        progress_bar: bool = False, progress_bar_position: int = 0, enable_insights: bool = False,
        worker_init: Optional[Callable] = None, worker_exit: Optional[Callable] = None
) -> Generator[Any, None, None]:
    """
    Same as ``multiprocessing.imap_unordered()``. Also allows a user to set the maximum number of tasks available in
    the queue.

    This is a generator function: nothing below runs until the returned generator is iterated.

    :param func: Function to call each time new task arguments become available. When passing on the worker ID the
        function should receive the worker ID as its first argument. If shared objects are provided the function
        should receive those as the next argument. If the worker state has been enabled it should receive a state
        variable as the next argument
    :param iterable_of_args: A numpy array or an iterable containing tuples of arguments to pass to a worker, which
        passes it to the function ``func``
    :param iterable_len: Number of elements in the ``iterable_of_args``. When chunk_size is set to ``None`` it needs
        to know the number of tasks. This can either be provided by implementing the ``__len__`` function on the
        iterable object, or by specifying the number of tasks
    :param max_tasks_active: Maximum number of active tasks in the queue. If ``None`` it will be converted to
        ``n_jobs * 2``
    :param chunk_size: Number of simultaneous tasks to give to a worker. When ``None`` it will use ``n_splits``.
    :param n_splits: Number of splits to use when ``chunk_size`` is ``None``. When both ``chunk_size`` and
        ``n_splits`` are ``None``, it will use ``n_splits = n_jobs * 64``.
    :param worker_lifespan: Number of tasks a worker can handle before it is restarted. If ``None``, workers will
        stay alive the entire time. Use this when workers use up too much memory over the course of time
    :param progress_bar: When ``True`` it will display a progress bar
    :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
        multiple progress bars at the same time
    :param enable_insights: Whether to enable worker insights. Might come at a small performance penalty (often
        negligible)
    :param worker_init: Function to call each time a new worker starts. When passing on the worker ID the function
        should receive the worker ID as its first argument. If shared objects are provided the function should
        receive those as the next argument. If the worker state has been enabled it should receive a state variable
        as the next argument
    :param worker_exit: Function to call each time a worker exits. Return values will be fetched and made available
        through :obj:`mpire.WorkerPool.get_exit_results`. When passing on the worker ID the function should receive
        the worker ID as its first argument. If shared objects are provided the function should receive those as the
        next argument. If the worker state has been enabled it should receive a state variable as the next argument
    :return: Generator yielding unordered results
    """
    # If we're dealing with numpy arrays, we have to chunk them here already. The empty list is only a placeholder:
    # the name is rebound either here (numpy input) or in the non-numpy branch further down
    iterator_of_chunked_args = []
    if isinstance(iterable_of_args, np.ndarray):
        iterator_of_chunked_args, iterable_len, chunk_size, n_splits = apply_numpy_chunking(
            iterable_of_args, iterable_len, chunk_size, n_splits, self.params.n_jobs)

    # Check parameters and thereby obtain the number of tasks. The chunk_size and progress bar parameters could be
    # modified as well
    n_tasks, max_tasks_active, chunk_size, progress_bar = self.params.check_map_parameters(
        iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar,
        progress_bar_position)

    # Chunk the function arguments. Make single arguments when we're dealing with numpy arrays
    if not isinstance(iterable_of_args, np.ndarray):
        iterator_of_chunked_args = chunk_tasks(
            iterable_of_args, n_tasks, chunk_size, n_splits or self.params.n_jobs * 64)

    # Reset profiling stats
    self._worker_insights.reset_insights(enable_insights)

    # Start workers if there aren't any. If they already exist they must be restarted when either the function to
    # execute or the worker lifespan changes
    if self._workers and self.params.workers_need_restart(func, worker_init, worker_exit, worker_lifespan):
        self.stop_and_join()
    if not self._workers:
        logger.debug("Spinning up workers")
        self._start_workers(func, worker_init, worker_exit, worker_lifespan, bool(progress_bar))

    # Create exception, exit results, and progress bar handlers. The exception handler receives any exceptions
    # thrown by the workers, terminates everything and re-raise an exception in the main process. The exit results
    # handler fetches results from the exit function, if provided. The progress bar handler receives progress
    # updates from the workers and updates the progress bar accordingly
    with ExceptionHandler(self.terminate, self._worker_comms, bool(progress_bar)) as exception_handler, \
            ProgressBarHandler(func, self.params.n_jobs, progress_bar, n_tasks, progress_bar_position,
                               self._worker_comms, self._worker_insights):

        # Process all args in the iterable. ``n_active`` counts the chunks that were handed out but whose results
        # have not been collected yet
        n_active = 0
        while not self._worker_comms.exception_caught():
            # Add task, only if allowed and if there are any
            if n_active < max_tasks_active:
                try:
                    self._worker_comms.add_task(next(iterator_of_chunked_args))
                    n_active += 1
                except StopIteration:
                    break

            # Check if new results are available, but don't wait for it
            try:
                yield from self._worker_comms.get_results(block=False)
                n_active -= 1
            except queue.Empty:
                pass

            # Restart workers if necessary
            self._restart_workers()

        # Obtain the results not yet obtained. The get times out after 0.1 seconds, so the loop condition (and
        # thereby the exception check) is re-evaluated regularly
        while not self._worker_comms.exception_caught() and n_active != 0:
            try:
                yield from self._worker_comms.get_results(block=True, timeout=0.1)
                n_active -= 1
            except queue.Empty:
                pass

            # Restart workers if necessary
            self._restart_workers()

        # Clean up time. When keep_alive is set to True we won't join the workers. During the stop_and_join call an
        # error can occur as well, so we have to check once again whether an exception occurred and raise if it did
        exception_handler.raise_on_exception()
        if not self.params.keep_alive:
            self.stop_and_join()
            exception_handler.raise_on_exception()

    # Log insights
    if enable_insights:
        logger.debug(self._worker_insights.get_insights_string())