Example #1
def test_multiple_environments_free_assignment():
    # Dummy environments with a DummyComponent for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0)],
                        components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(1)], components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 3)
            assert set(task_results) == {"foo", "bar"}
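
Example #1 (and several of the later environment tests) reads thread_locals.value, which is set by the DummyComponent attached to whichever environment the task ran in. Neither helper is reproduced in this listing; the sketch below only illustrates the pattern with assumed names and methods, not Parla's actual component interface:

import threading

# Hypothetical stand-ins for the test-suite helpers used above.
thread_locals = threading.local()

class DummyComponent:
    """Stamps the current thread with this environment's label while a task runs."""
    def __init__(self, value):
        self.value = value

    def __enter__(self):
        thread_locals.value = self.value  # e.g. "foo" or "bar"
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False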
Example #2
def test_multiple_environments_tagged():
    # Dummy environments with a DummyComponent for testing.
    # The threading and logging modules serve only as opaque tag objects.
    environments = [
        TaskEnvironment(placement=[cpu(0)],
                        components=[DummyComponent("foo")],
                        tags=(threading, )),
        TaskEnvironment(placement=[cpu(1)],
                        components=[DummyComponent("bar")],
                        tags=(logging, ))
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []

            @spawn(tags=(threading, ))
            def task():
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 1)
            assert task_results == ["foo"]

            task_results = []

            @spawn(tags=(logging, ))
            def task():
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 1)
            assert task_results == ["bar"]
Example #3
def test_placement(runtime_sched):
    devices = [cpu(0), cpu(1), cpu(2)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            @spawn(placement=dev)
            def task():
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == i+1)

        assert task_results == devices
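
Nearly every test in this listing loops with repetitions() and synchronizes with sleep_until(...). Both helpers come from the test suite and are not shown; a minimal sketch of what such helpers typically look like (an assumption, not the suite's actual code) is:

import time

def repetitions(n=10):
    # Run timing-sensitive checks several times to reduce flakiness.
    return range(n)

def sleep_until(predicate, timeout=5.0, period=0.01):
    # Poll until predicate() is true, failing if timeout seconds pass first.
    deadline = time.monotonic() + timeout
    while not predicate():
        assert time.monotonic() < deadline, "timed out waiting for condition"
        time.sleep(period)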
Example #4
def test_placement_multi():
    # Dummy environments with no components for testing.
    environments = [TaskEnvironment(placement=d, components=[]) for d in combinations(cpu.devices, 2)]
    with Parla(environments):
        devices = [frozenset((cpu(0), cpu(1))), frozenset((cpu(1), cpu(2))), frozenset((cpu(4), cpu(3)))]
        for rep in repetitions():
            task_results = []
            for (i, dev) in enumerate(devices):
                @spawn(placement=dev, ndevices=2)
                def task():
                    task_results.append(frozenset(get_current_devices()))
                sleep_until(lambda: len(task_results) == i+1)

            assert task_results == devices
Example #5
def test_placement_await(runtime_sched):
    devices = [cpu(0), cpu(1), cpu(2)]

    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            @spawn(placement=dev)
            async def task():
                task_results.append(get_current_devices()[0])
                await tasks() # Await nothing to force a new task.
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == (i+1)*2)

        assert task_results == [cpu(0), cpu(0), cpu(1), cpu(1), cpu(2), cpu(2)]
Example #6
def test_memory_aware_scheduling(runtime_sched):
    # Test that per-task memory requirements force tasks onto distinct devices.
    for rep in repetitions():
        task_results = []
        for i in range(8):
            @spawn(placement=cpu, memory=cpu(0).available_memory)
            def task():
                task_results.append(get_current_devices()[0])
                sleep(0.1)
        sleep_until(lambda: len(task_results) == 8)
        assert 8 >= len(set(task_results)) >= 4
Example #7
def test_multiple_environments_fixed_assignment():
    # Dummy environments with a DummyComponent for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0)],
                        components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(1)], components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        task_results = []

        @spawn(placement=cpu(0))
        def task():
            task_results.append(thread_locals.value)

        @spawn(placement=cpu(1))
        def task():
            task_results.append(thread_locals.value)

        sleep_until(lambda: len(task_results) == 2)
        assert set(task_results) == {"foo", "bar"}
Example #8
def test_placement_await():
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("CUDA required for this test.")

    devices = [cpu(0), gpu(0)]

    for rep in repetitions():
        task_results = []
        for i in range(2):

            @spawn(placement=devices[i])
            async def task():
                task_results.append(get_current_device())
                await tasks()  # Await nothing to force a new task.
                task_results.append(get_current_device())

            sleep_until(lambda: len(task_results) == (i + 1) * 2)

        assert task_results == [cpu(0), cpu(0), gpu(0), gpu(0)]
Example #9
def main():
    comm = MPI.COMM_WORLD
    print(comm.Get_rank(), comm.Get_size())

    a = np.random.rand(10000000).astype(dtype='d')
    b = np.random.rand(10000000).astype(dtype='d')

    divisions = 100

    comm.Barrier()
    start = time.perf_counter()
    # Map the divisions onto actual hardware locations
    mapper = LDeviceSequenceBlocked(divisions)
    # print(mapper.devices)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1, dtype='d')

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):
                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i+1], a_part[i] @ b_part[i])
        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    overall_result = np.array(0.0, dtype='d') if comm.Get_rank() == 0 else None
    comm.Reduce([inner_result, MPI.DOUBLE],
                [overall_result, MPI.DOUBLE],
                op=MPI.SUM,
                root=0)
    if overall_result is not None:
        result = float(overall_result)
        print(result)
        end = time.perf_counter()
        print(end - start)

    assert np.allclose(np.inner(a, b), inner_result[0])

    other_results = np.empty(comm.Get_size(), dtype='d') if comm.Get_rank() == 0 else None
    comm.Gather([inner_result, MPI.DOUBLE],
                [other_results, MPI.DOUBLE],
                root=0)
    if overall_result is not None:
        assert np.isclose(result, np.sum(other_results))
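
Example #9 (and several later examples) moves data between partitions with a copy(destination, source) helper. Parla's own implementation also handles device (GPU) arrays; purely as a rough NumPy-only stand-in for reading these examples, assume something like:

import numpy as np

def copy(destination, source):
    # Overwrite the preallocated destination view with the source values.
    destination[...] = np.asarray(source)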
Example #10
def test_placement_options_memory(runtime_sched):
    # Each task lists multiple placement options but is ultimately assigned a single device.
    for rep in repetitions():
        task_results = []
        for i in range(4):
            @spawn(placement=[cpu(0), cpu(1)], memory=cpu(0).available_memory)
            def task():
                sleep(0.1)
                task_results.append(get_current_devices()[0])
        sleep_until(lambda: len(task_results) == 4)
        assert set(task_results) == {cpu(0), cpu(1)}
        assert task_results.count(cpu(0)) == 2
        assert task_results.count(cpu(1)) == 2
Example #11
def test_dummy_environment_component():
    environments = [
        TaskEnvironment(placement=[cpu(0)],
                        components=[DummyComponent("test")])
    ]
    with Parla(environments):
        task_results = []

        @spawn()
        def task():
            assert get_current_devices() == [cpu(0)]
            task_results.append(thread_locals.value)

        sleep_until(lambda: len(task_results) == 1)
        assert task_results == ["test"]
Example #12
def main():
    @spawn(placement=cpu(0))
    async def test_fox():
        comm = MPI.COMM_WORLD
        print(comm.Get_rank(), comm.Get_size())

        # Create test data at each rank
        comm.Barrier()
        size_factor = 1024*8
        A = np.random.rand(size_factor // comm.Get_size(), size_factor).astype(dtype='d')
        x = np.random.rand(size_factor // comm.Get_size()).astype(dtype='d')
        comm.Barrier()

        print("----", A.shape)
        # Perform multiplication
        y = await matvec_mpi(comm, A, x)
        print("++++", A.shape)
Example #13
def test_placement_data(runtime_sched):
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("Test needs cuda.")
        return
    devices = [cpu(0), gpu(0)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            d = dev.memory()(np.array([1, 2, 3]))
            @spawn(placement=d)
            def task():
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == i+1)

        assert task_results == devices
Example #14
def main():
    @spawn(placement=cpu(0))
    async def test_fox():
        size_factor = 1024
        A = np.random.rand(size_factor, size_factor)
        x = np.random.rand(size_factor)

        ## Perform single multiplication

        # Compute "golden" result
        res = A @ x
        print("----", A.shape)

        # Compute with Parla
        out = np.empty_like(x)
        out1 = await matvec_fox(out, A, x)
        assert out is out1

        # Compare parla result to golden result
        print("++++", A.shape)
        print(np.linalg.norm(res - out, ord=np.inf))
        assert np.allclose(res, out), "Parallel fox failed"

        ## Perform double multiplication

        # Compute "golden" result
        res = A @ (A @ x)
        print("----", A.shape)

        # Compute with Parla
        out = np.empty_like(x)
        # Partition the data
        yp, Ap, xp = partition_fox(out, A, x)
        # Multiply twice without copying back to system memory.
        await matvec_fox_partitioned(yp, Ap, xp)
        await matvec_fox_partitioned(xp, Ap, yp)
        # Collect the final result to system memory.
        out1 = await collect_fox(out, xp)
        assert out is out1

        # Compare parla result to golden result
        print("++++", A.shape)
        print(np.linalg.norm(res - out, ord=np.inf))
        assert np.allclose(res, out), "Parallel fox failed"
        print("Done")
Example #15
def test_placement_options_vcus(runtime_sched):
    # Each task lists multiple placement options but is ultimately assigned a single device.
    for rep in repetitions():
        N = 4
        task_results = []
        for i in range(N):
            @spawn(placement=[cpu(0), cpu(1)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(get_current_devices()[0])
        sleep_until(lambda: len(task_results) == N)
        assert set(task_results) == {cpu(0), cpu(1)}
        assert task_results.count(cpu(0)) == N // 2
        assert task_results.count(cpu(1)) == N // 2
Example #16
async def collect_fox(y, yp):
    """
    Collect the partitions in `yp` into `y`.

    :param y: The output array.
    :param yp: A 2d list of partitions.
    :return: `y`
    """
    C = TaskSpace()

    # Collect from diagonal in parallel
    for i in range(0, partitions_y):  # rows
        @spawn(C[i], placement=cpu(0))
        def c():
            copy(y[mapper.slice_x(i, y.shape[0])], yp[i][i])

    # wait for the collect tasks to complete.
    await C

    return y
Example #17
def main():
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)

    divisions = 100

    start = time.perf_counter()
    # Map the divisions onto actual hardware locations
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):

                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    end = time.perf_counter()
    print(end - start)

    assert np.allclose(np.inner(a, b), inner_result[0])
Example #18
def main():
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    divisions = len(devs)*2

    # Set up an "n" x "n" grid of values and run
    # "steps" number of iterations of the 4 point stencil on it.
    n = 25000
    steps = 200

    # Set up two arrays containing the input data.
    # This demo uses the standard technique of computing
    # from one array into another then swapping the
    # input and output arrays for the next iteration.
    # These are the two arrays that will be swapped back
    # and forth as input and output.
    a0 = np.random.rand(n, n)
    a1 = a0.copy()

    # An object that distributes arrays across all the given devices.
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    # Partition a0 and a1.
    # Here we just partition the rows across the different devices.
    # Other partitioning schemes are possible.
    a0_row_groups = mapper.partition_tensor(a0, overlap=1)
    a1_row_groups = mapper.partition_tensor(a1, overlap=1)

    # Trigger JIT
    @spawn(placement=cpu(0))
    async def warmups():
        warmup = TaskSpace()
        for i in range(divisions):
            @spawn(warmup[i], placement=mapper.device(i))
            async def w():
                jacobi(a1_row_groups[i], a0_row_groups[i])
                cupy.cuda.get_current_stream().synchronize()
                cupy.cuda.Stream.null.synchronize()
        await warmup

    time.sleep(5)

    start = time.perf_counter()
    # Main parla task.
    @spawn(placement=cpu(0))
    async def run_jacobi():
        assert steps > 0
        # Specify which set of blocks is used as input or output
        # (they will be swapped for each iteration).
        in_blocks = a0_row_groups
        out_blocks = a1_row_groups
        # Create a set of labels for the tasks that perform the first
        # Jacobi iteration step.
        previous_block_tasks = CompletedTaskSpace()
        # Now create the tasks for subsequent iteration steps.
        for i in range(steps):
            # Swap input and output blocks for the next step.
            in_blocks, out_blocks = out_blocks, in_blocks
            # Create a new set of labels for the tasks that do this iteration step.
            current_block_tasks = TaskSpace("block_tasks[{}]".format(i))
            # Create the tasks to do the i'th iteration.
            # As before, each task needs the following info:
            #  a block index "j"
            #  a "device" where it should execute (supplied by mapper used for partitioning)
            #  the "in_block" of data used as input
            #  the "out_block" to write the output to
            for j in range(divisions):
                device = mapper.device(j)
                in_block = in_blocks[j]
                out_block = out_blocks[j]
                # Make each task operating on each block depend on the tasks for
                # that block and its immediate neighbors from the previous iteration.
                @spawn(current_block_tasks[j],
                       dependencies=[previous_block_tasks[max(0, j-1):min(divisions, j+2)]],
                       placement=device)
                def device_local_jacobi_task():
                    # Read boundary values from adjacent blocks in the partition.
                    # This may communicate across device boundaries.
                    if j > 0:
                        copy(in_block[0], in_blocks[j - 1][-2])
                    if j < divisions - 1:
                        copy(in_block[-1], in_blocks[j + 1][1])
                    # Run the computation, dispatching to device specific code.
                    jacobi(in_block, out_block)
            # For the next iteration, use the newly created tasks as
            # the tasks from the previous step.
            previous_block_tasks = current_block_tasks
        await previous_block_tasks
        cupy.cuda.get_current_stream().synchronize()
        cupy.cuda.Stream.null.synchronize()
        end = time.perf_counter()
        print(end - start)

        # This depends on all the tasks from the last iteration step.
        for j in range(divisions):
            start_index = 1 if j > 0 else 0
            end_index = -1 if j < divisions - 1 else None  # None indicates the last element of the dimension
            copy(a1[mapper.slice(j, len(a1))], out_blocks[j][start_index:end_index])
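
Example #18 calls a jacobi(in_block, out_block) kernel defined elsewhere in the original program, which dispatches to device-specific code. Purely as an illustration of the 4-point stencil it performs, a NumPy version might look like:

import numpy as np

def jacobi(in_block, out_block):
    # Each interior point becomes the average of its four neighbors.
    out_block[1:-1, 1:-1] = 0.25 * (in_block[0:-2, 1:-1] + in_block[2:, 1:-1] +
                                    in_block[1:-1, 0:-2] + in_block[1:-1, 2:])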
Example #19
    WARMUP = args.warmup
    NTHREADS = args.threads
    NGPUS = args.ngpus
    PLACEMENT_STRING = args.placement
    CHECK_RESULT = args.check_result
    CSV = args.csv

    # Set up PLACEMENT variable
    if PLACEMENT_STRING == 'cpu':
        PLACEMENT = cpu
        ACUS = None
    elif PLACEMENT_STRING == 'gpu':
        PLACEMENT = [gpu(i) for i in range(NGPUS)]
        ACUS = None
    elif PLACEMENT_STRING == 'both':
        PLACEMENT = [cpu(0)] + [gpu(i) for i in range(NGPUS)]
        ACUS = 1
    elif PLACEMENT_STRING == 'puregpu':
        PLACEMENT = [gpu(i) for i in range(NGPUS)]
        ACUS = None
        BLOCK_SIZE = int(NROWS / NGPUS)
    else:
        print(
            "Invalid value for placement. Must be 'cpu' or 'gpu' or 'both' or 'puregpu'"
        )

    perf_stats = perfStats(ITERS, NROWS, BLOCK_SIZE)

    print(
        '%**********************************************************************************************%\n'
    )
Example #20
def test_multiple_environments_less_good_fit():
    # Dummy environments with a DummyComponent for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0), cpu(1)],
                        components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(2), cpu(3), cpu(4)],
                        components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []
            # The first two will fit in the first environment, using 0.5 of it.
            # The next two will spill into the less good (0.33) fit of the second environment.
            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 4)
            task_results.sort()
            assert task_results == ["bar", "bar", "foo", "foo"]
Example #21
def task():
    assert get_current_devices() == [cpu(0)]
    task_results.append(thread_locals.value)