class ProcessSource(object):
    """A class for iterating another source in a separate process.

    The constructor starts a worker process that iterates the wrapped source
    and passes the rows back in batches through a queue. Iterating the
    ProcessSource instance yields those rows one at a time. If the wrapped
    source raises an exception, the rows read so far are yielded first and
    the exception is then raised in the iterating process.
    """

    def __init__(self, source, batchsize=500, queuesize=20):
        """Arguments:

        - source: the source to iterate
        - batchsize: the number of rows passed from the worker process each
          time it passes on a batch of rows. Must be positive. Default: 500
        - queuesize: the maximum number of batches that can wait in a queue
          between the processes. 0 means unlimited. Default: 20

        Raises ValueError if batchsize is not a positive integer.
        """
        if not isinstance(batchsize, int) or batchsize < 1:
            raise ValueError('batchsize must be a positive integer')
        self.__source = source
        self.__batchsize = batchsize
        self.__queue = Queue(queuesize)
        p = Process(target=self.__worker)
        p.name = "Process for ProcessSource"
        p.start()

    @staticmethod
    def __transportable(exc):
        """Return an exception that can be sent through the queue.

        The given exception is returned unchanged if it survives a pickle
        round trip. Otherwise a RuntimeError holding the name of the original
        exception type and its message is returned instead.
        """
        # Imported locally so this class does not depend on a module-level
        # import of pickle.
        import pickle
        try:
            # Both directions are tried: some exceptions can be pickled but
            # fail to be re-created when unpickled.
            pickle.loads(pickle.dumps(exc))
            return exc
        except Exception:
            try:
                detail = str(exc)
            except Exception:
                detail = '<unprintable exception>'
            return RuntimeError('%s raised in worker process: %s'
                                % (type(exc).__name__, detail))

    def __worker(self):
        """Iterate the source and put batches of rows on the queue.

        Runs in the worker process. After the last batch, the string 'STOP'
        is put on the queue. If an exception is raised, any pending rows are
        sent, followed by the string 'EXCEPTION' and then the exception.
        """
        batch = []
        try:
            for row in self.__source:
                batch.append(row)
                if len(batch) == self.__batchsize:
                    self.__queue.put(batch)
                    batch = []
            # We're done. Send the batch if it has any data and a signal
            if batch:
                self.__queue.put(batch)
            self.__queue.put('STOP')
        except Exception:
            # Jython 2.5.X does not support the as syntax required by Python 3
            e = sys.exc_info()[1]
            if batch:
                self.__queue.put(batch)
            self.__queue.put('EXCEPTION')
            # An exception that cannot be pickled would never arrive and the
            # consumer would wait for it forever, so make sure that what is
            # sent can actually be transported.
            self.__queue.put(self.__transportable(e))

    def __iter__(self):
        """Yield the rows received from the worker process.

        Re-raises the exception sent by the worker process, if any, after
        the rows received before it have been yielded.
        """
        while True:
            data = self.__queue.get()
            if data == 'STOP':
                break
            elif data == 'EXCEPTION':
                exc = self.__queue.get()
                raise exc
            # else we got a list of rows from the other process
            for row in data:
                yield row
class DynamicForEachSource(object):
    """A source that for each given argument creates a new source that
    will be iterated by this source.

    For example, useful for directories where a CSVSource should be created
    for each file.

    The user must provide a function that when called with a single
    argument, returns a new source to iterate. A DynamicForEachSource
    instance can be given to several ProcessSource instances.
    """

    def __init__(self, seq, callee):
        """Arguments:

        - seq: a sequence with the elements for each of which a unique source
          must be created. the elements are given (one by one) to callee.
        - callee: a function f(e) that must accept elements as those in the
          seq argument. the function should return a source which then will
          be iterated by this source. the function is called once for every
          element in seq.

        Raises TypeError if callee is not callable.
        """
        # Validate before allocating the queue so nothing is created when the
        # arguments are rejected.
        if not callable(callee):
            raise TypeError('callee must be callable')
        self.__callee = callee
        self.__queue = Queue()  # a multiprocessing.Queue
        for e in seq:
            # put them in a safe queue such that this object can be used from
            # different fork'ed processes
            self.__queue.put(e)

    def __iter__(self):
        """Yield the rows of the sources created for the queued elements.

        Elements are taken from the queue without blocking; iteration ends
        the first time the queue reports that it is empty.
        """
        while True:
            # Only the queue access is guarded: an Empty raised by callee or
            # by the created source is an error of theirs and must not be
            # mistaken for the end of the elements.
            try:
                arg = self.__queue.get(False)
            except Empty:
                # A plain return ends the generator. Raising StopIteration
                # inside a generator is turned into a RuntimeError on
                # Python 3.7+ (PEP 479).
                return
            for row in self.__callee(arg):
                yield row