def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is safe
    to execute in a fork()d process. Also note that we f**k up the result
    ordering, but relying on result ordering breaks the mapreduce contract
    anyway. Note also that like many of the mr_tools functions, we break
    on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    # Single-worker case: no point paying the fork/IPC cost.
    if workers == 1:
        return mr_map(processor, fd=fd, out=out)

    pool = multiprocessing.Pool(workers)
    try:
        # imap_unordered yields each worker's result list as soon as it
        # is ready, which is where the ordering guarantee is lost.
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres, out=out)
    finally:
        # The pool was previously never shut down, leaking worker
        # processes. terminate() is safe here: on the success path every
        # result has already been consumed, and on error we want the
        # workers killed rather than left running.
        pool.terminate()
        pool.join()
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is safe
    to execute in a fork()d process. Also note that we f**k up the result
    ordering, but relying on result ordering breaks the mapreduce contract
    anyway. Note also that like many of the mr_tools functions, we break
    on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: Input data stream (default is stdin)
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    """
    # Single-worker case: skip the fork/IPC machinery entirely.
    if workers == 1:
        # BUG FIX: this previously called mr_map(process, fd=fd) — the
        # name `process` does not exist (parameter is `processor`), so
        # the workers == 1 path always raised NameError.
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)
    try:
        # imap_unordered hands back each chunk's results as they finish,
        # which is where result ordering is lost.
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres)
    finally:
        # Reap the worker processes; previously the pool was never shut
        # down. terminate() is safe: results are fully consumed on the
        # success path, and on error we want the workers killed.
        pool.terminate()
        pool.join()
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is safe
    to execute in a fork()d process. Also note that we f**k up the result
    ordering, but relying on result ordering breaks the mapreduce contract
    anyway. Note also that like many of the mr_tools functions, we break
    on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: Input data stream (default is stdin)
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    """
    # Single-worker case: run in-process, no pool needed.
    if workers == 1:
        # BUG FIX: previously mr_map(process, fd=fd) — `process` is
        # undefined (the parameter is `processor`), so this branch
        # always raised NameError.
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)
    try:
        # Results come back in completion order, not input order.
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres)
    finally:
        # Shut the pool down (was previously leaked). terminate() is
        # safe once the result iterator is exhausted, and kills workers
        # promptly if an error escaped the loop.
        pool.terminate()
        pool.join()