Exemplo n.º 1
0
    def test_mr_map(self):
        """mr_map with a first-field projector keeps column one of each record."""
        # Two tab-separated records, newline-delimited, as fake stdin.
        fake_in = StringIO("foo\tbar\tbar1\nbaz\tbad\tbad1")
        fake_out = StringIO()

        mr_map(lambda record: [record[:1]], fd=fake_in, out=fake_out)

        # One output line per input record, first column only.
        self.assertEqual("foo\nbaz\n", fake_out.getvalue())
Exemplo n.º 2
0
    def test_mr_map(self):
        """mr_map emits only the first field of each tab-separated record."""
        records = ("foo\tbar\tbar1", "baz\tbad\tbad1")
        fake_in = StringIO("\n".join(records))
        fake_out = StringIO()

        mr_map(lambda record: [record[:1]], fd=fake_in, out=fake_out)

        # Expect one line per record containing just the first column.
        self.assertEqual("foo\nbaz\n", fake_out.getvalue())
Exemplo n.º 3
0
def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output

    :param processor: an multiprocessing-safe instance of :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    if workers == 1:
        # Single worker: skip the fork/IPC overhead entirely.
        return mr_map(processor, fd=fd, out=out)

    pool = multiprocessing.Pool(workers)
    try:
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres, out=out)
    finally:
        # FIX: the original never shut the pool down, leaking worker
        # processes (notably when a mapper raises mid-stream).
        # terminate()+join() matches Pool.__exit__ semantics.
        pool.terminate()
        pool.join()
Exemplo n.º 4
0
def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output

    :param processor: an multiprocessing-safe instance of :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    if workers == 1:
        # Single worker: skip the fork/IPC overhead entirely.
        return mr_map(processor, fd=fd, out=out)

    pool = multiprocessing.Pool(workers)
    try:
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres, out=out)
    finally:
        # FIX: the original never shut the pool down, leaking worker
        # processes (notably when a mapper raises mid-stream).
        # terminate()+join() matches Pool.__exit__ semantics.
        pool.terminate()
        pool.join()
Exemplo n.º 5
0
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map `processor` over the lines of `fd` using a process pool.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe Mapper instance
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent workers
    :param int chunk_size: job size per worker
    """
    if workers == 1:
        # FIX: original passed `process`, an undefined name — the
        # parameter is `processor`; this path raised NameError.
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)
    try:
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres)
    finally:
        # FIX: original leaked the pool's worker processes; shut it
        # down even when a mapper raises mid-stream.
        pool.terminate()
        pool.join()
Exemplo n.º 6
0
def mr_map_parallel(processor, fd=stdin,
                    workers=multiprocessing.cpu_count(),
                    chunk_size=1000):
    """Map `processor` over the lines of `fd` using a process pool.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe Mapper instance
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent workers
    :param int chunk_size: job size per worker
    """
    if workers == 1:
        # FIX: original passed `process`, an undefined name — the
        # parameter is `processor`; this path raised NameError.
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)
    try:
        for res in pool.imap_unordered(processor, fd, chunk_size):
            for subres in res:
                emit(subres)
    finally:
        # FIX: original leaked the pool's worker processes; shut it
        # down even when a mapper raises mid-stream.
        pool.terminate()
        pool.join()