Example No. 1
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = dict(s.ncores)

    yield c._restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
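
A minimal, self-contained sketch of the wait() call these tests exercise, assuming only that dask.distributed is installed; the in-process cluster and the inc helper below are illustrative, not part of the original test suite:

from distributed import Client, wait

def inc(x):
    return x + 1

if __name__ == "__main__":
    client = Client(processes=False)               # small in-process cluster, for illustration only
    futures = client.map(inc, range(10))
    result = wait(futures, timeout=10)             # blocks until all futures finish; raises if the timeout expires
    assert len(result.done) == 10 and not result.not_done
    print(client.gather(futures))                  # [1, 2, ..., 10]
    client.close()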
Example No. 2
def test_gather_after_failed_worker(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
Example No. 3
def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Example No. 4
def test_steal_cheap_data_slow_computation(c, s, a, b):
    x = c.submit(slowinc, 100, delay=0.1)  # learn that slowinc is slow
    yield wait(x)

    futures = c.map(slowinc, range(10), delay=0.1, workers=a.address,
                    allow_other_workers=True)
    yield wait(futures)
    assert abs(len(a.data) - len(b.data)) <= 5
Example No. 5
def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
Example No. 6
def test_submit_after_failed_worker_sync(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Example No. 7
    def _scatter_data(self, job_cluster, context, workers):
        logger.debug("Scattering data to workers started")
        shared_data = {'context': context, 'settings_config': settings.config}
        scattered = job_cluster.scatter(shared_data, broadcast=True,
                                        workers=workers)
        # Wait until the data has been scattered to the workers
        distributed.wait(scattered.values())
        logger.debug("Scattering data to workers finished")

        return scattered
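
The scatter/broadcast/wait pattern above can be reduced to a hedged, stand-alone sketch; the payload and the in-process cluster are stand-ins for the project's actual context and settings:

from distributed import Client, wait

if __name__ == "__main__":
    client = Client(processes=False)
    shared_data = {"context": {"run_id": 1}, "settings_config": {"debug": False}}  # hypothetical payload
    scattered = client.scatter(shared_data, broadcast=True)  # one future per key, replicated to every worker
    wait(list(scattered.values()))                            # block until the data is on the workers
    print(sorted(scattered))                                  # ['context', 'settings_config']
    client.close()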
Example No. 8
def test_share_communication(c, s, w1, w2, w3):
    x = c.submit(mul, b'1', int(w3.target_message_size + 1), workers=w1.address)
    y = c.submit(mul, b'2', int(w3.target_message_size + 1), workers=w2.address)
    yield wait([x, y])
    yield c._replicate([x, y], workers=[w1.address, w2.address])
    z = c.submit(add, x, y, workers=w3.address)
    yield wait(z)
    assert len(w3.incoming_transfer_log) == 2
    assert w1.outgoing_transfer_log
    assert w2.outgoing_transfer_log
Example No. 9
def test_dont_overlap_communications_to_same_worker(c, s, a, b):
    x = c.submit(mul, b'1', int(b.target_message_size + 1), workers=a.address)
    y = c.submit(mul, b'2', int(b.target_message_size + 1), workers=a.address)
    yield wait([x, y])
    z = c.submit(add, x, y, workers=b.address)
    yield wait(z)
    assert len(b.incoming_transfer_log) == 2
    l1, l2 = b.incoming_transfer_log

    assert l1['stop'] < l2['start']
Example No. 10
def test_cancel_stress_sync(loop):
    da = pytest.importorskip('dask.array')
    x = da.random.random((50, 50), chunks=(2, 2))
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            x = c.persist(x)
            y = (x.sum(axis=0) + x.sum(axis=1) + 1).std()
            wait(x)
            for i in range(5):
                f = c.compute(y)
                sleep(random.random())
                c.cancel(f)
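
A hedged sketch of the compute-then-cancel idiom stress-tested above; the slow helper and in-process cluster are illustrative assumptions:

from time import sleep
from distributed import Client, wait

def slow(x):
    sleep(0.5)
    return x

if __name__ == "__main__":
    client = Client(processes=False)
    x = client.submit(slow, 1)
    wait(x)                        # block until the input is in cluster memory
    f = client.submit(slow, x)     # dependent task we then abandon
    client.cancel(f)               # ask the scheduler to drop the key
    print(f.cancelled())           # normally True once cancel() has returned
    client.close()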
Example No. 11
def test_load_balance_map(c, s, *workers):
    class Foo(object):
        def __init__(self, x, y=None):
            pass

    b = c.submit(operator.mul, 'b', 1000000)
    yield wait(b)

    actors = c.map(Foo, range(10), y=b, actor=True)
    yield wait(actors)

    assert all(len(w.actors) == 2 for w in workers)
Example No. 12
def test_cancel_stress_sync(loop):
    da = pytest.importorskip('dask.array')
    x = da.random.random((40, 40), chunks=(1, 1))
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.persist(x)
            y = (x.sum(axis=0) + x.sum(axis=1) + 1).std()
            wait(x)
            for i in range(5):
                f = c.compute(y)
                sleep(1)
                c.cancel(f)
Example No. 13
def test_correct_bad_time_estimate(c, s, *workers):
    future = c.submit(slowinc, 1, delay=0)
    yield wait(future)

    futures = [c.submit(slowinc, future, delay=0.1, pure=False)
               for i in range(20)]

    yield gen.sleep(0.5)

    yield wait(futures)

    assert all(w.data for w in workers), [sorted(w.data) for w in workers]
Example No. 14
def test_dont_steal_few_saturated_tasks_many_workers(c, s, a, *rest):
    s.extensions['stealing']._pc.callback_time = 20
    x = c.submit(mul, b'0', 100000000, workers=a.address)  # 100 MB
    yield wait(x)
    s.task_duration['slowidentity'] = 0.2

    futures = [c.submit(slowidentity, x, pure=False, delay=0.2) for i in range(2)]

    yield wait(futures)

    assert len(a.data) == 3
    assert not any(w.task_state for w in rest)
Example No. 15
def test_dont_steal_expensive_data_fast_computation(c, s, a, b):
    np = pytest.importorskip('numpy')
    x = c.submit(np.arange, 1000000, workers=a.address)
    yield wait([x])
    future = c.submit(np.sum, [1], workers=a.address)  # learn that sum is fast
    yield wait([future])

    cheap = [c.submit(np.sum, x, pure=False, workers=a.address,
                      allow_other_workers=True) for i in range(10)]
    yield wait(cheap)
    assert len(s.who_has[x.key]) == 1
    assert len(b.data) == 0
    assert len(a.data) == 12
Example No. 16
def test_mising_data_errant_worker(c, s, w1, w2, w3):
    with dask.config.set({'distributed.comm.timeouts.connect': '1s'}):
        np = pytest.importorskip('numpy')

        x = c.submit(np.random.random, 10000000, workers=w1.address)
        yield wait(x)
        yield c.replicate(x, workers=[w1.address, w2.address])

        y = c.submit(len, x, workers=w3.address)
        while not w3.tasks:
            yield gen.sleep(0.001)
        w1._close()
        yield wait(y)
Example No. 17
def test_steal_expensive_data_slow_computation(c, s, a, b):
    np = pytest.importorskip('numpy')

    x = c.submit(slowinc, 100, delay=0.2, workers=a.address)
    yield wait(x)  # learn that slowinc is slow

    x = c.submit(np.arange, 1000000, workers=a.address)  # put expensive data
    yield wait(x)

    slow = [c.submit(slowinc, x, delay=0.1, pure=False) for i in range(20)]
    yield wait(slow)
    assert len(s.who_has[x.key]) > 1

    assert b.data  # not empty
Example No. 18
def test_dont_recompute_if_erred(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(div)(x, 0, dask_key_name='y')

    yy = y.persist()
    yield wait(yy)

    old = list(s.transition_log)

    yyy = y.persist()
    yield wait(yyy)

    yield gen.sleep(0.100)
    assert list(s.transition_log) == old
Example No. 19
def test_file_descriptors(c, s):
    yield gen.sleep(0.1)
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 <= num_fds_2 + N  # add some heartbeats

    x = da.random.random(size=(1000, 1000), chunks=(25, 25))
    x = c.persist(x)
    yield wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 <= num_fds_2 + 2 * N

    y = c.persist(x + x.T)
    yield wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]

    assert not s.rpc.open
    assert not c.rpc.open
    assert not s.stream_comms

    start = time()
    while proc.num_fds() > num_fds_1 + N:
        yield gen.sleep(0.01)
        assert time() < start + 3
Example No. 20
def test_dont_recompute_if_persisted_3(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(2, dask_key_name='y')
    z = delayed(inc)(y, dask_key_name='z')
    w = delayed(add)(x, z, dask_key_name='w')

    ww = w.persist()
    yield wait(ww)

    old = list(s.transition_log)

    www = w.persist()
    yield wait(www)
    yield gen.sleep(0.100)
    assert list(s.transition_log) == old
Example No. 21
def test_avoid_oversubscription(c, s, *workers):
    np = pytest.importorskip('numpy')
    x = c.submit(np.random.random, 1000000, workers=[workers[0].address])
    yield wait(x)

    futures = [c.submit(len, x, pure=False, workers=[w.address])
               for w in workers[1:]]

    yield wait(futures)

    # Original worker not responsible for all transfers
    assert len(workers[0].outgoing_transfer_log) < len(workers) - 2

    # Some other workers did some work
    assert len([w for w in workers if len(w.outgoing_transfer_log) > 0]) >= 3
Example No. 22
def test_dont_steal_fast_tasks(c, s, *workers):
    np = pytest.importorskip('numpy')
    x = c.submit(np.random.random, 10000000, workers=workers[0].address)

    def do_nothing(x, y=None):
        pass

    yield wait(c.submit(do_nothing, 1))

    futures = c.map(do_nothing, range(1000), y=x)

    yield wait(futures)

    assert len(s.who_has[x.key]) == 1
    assert len(s.has_what[workers[0].address]) == 1001
Example No. 23
def test_work_steal_no_kwargs(c, s, a, b):
    yield wait(c.submit(slowinc, 1, delay=0.05))

    futures = c.map(slowinc, range(100), workers=a.address,
                    allow_other_workers=True, delay=0.05)

    yield wait(futures)

    assert 20 < len(a.data) < 80
    assert 20 < len(b.data) < 80

    total = c.submit(sum, futures)
    result = yield total

    assert result == sum(map(inc, range(100)))
Example No. 24
def test_dont_recompute_if_persisted_2(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(x, dask_key_name='y')
    z = delayed(inc)(y, dask_key_name='z')

    yy = y.persist()
    yield wait(yy)

    old = s.story('x', 'y')

    zz = z.persist()
    yield wait(zz)

    yield gen.sleep(0.100)
    assert s.story('x', 'y') == old
Example No. 25
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
Example No. 26
def test_fail_write_many_to_disk(c, s, a):
    a.validate = False
    yield gen.sleep(0.1)
    assert not a.paused

    class Bad(object):
        def __init__(self, x):
            pass

        def __getstate__(self):
            raise TypeError()

        def __sizeof__(self):
            return int(2e9)

    futures = c.map(Bad, range(11))
    future = c.submit(lambda *args: 123, *futures)

    yield wait(future)

    with pytest.raises(Exception) as info:
        yield future

    # workers still operational
    result = yield c.submit(inc, 1, workers=a.address)
    assert result == 2
Example No. 27
def test_spill_by_default(c, s, w):
    da = pytest.importorskip('dask.array')
    x = da.ones(int(TOTAL_MEMORY * 0.7), chunks=10000000, dtype='u1')
    y = c.persist(x)
    yield wait(y)
    assert len(w.data.slow)  # something is on disk
    del x, y
Example No. 28
def test_dont_steal_long_running_tasks(c, s, a, b):
    def long(delay):
        with worker_client() as c:
            sleep(delay)

    yield c.submit(long, 0.1)  # learn duration
    yield c.submit(inc, 1)  # learn duration

    long_tasks = c.map(long, [0.5, 0.6], workers=a.address,
                       allow_other_workers=True)
    while sum(map(len, s.processing.values())) < 2:  # let them start
        yield gen.sleep(0.01)

    start = time()
    while any(t.key in s.extensions['stealing'].key_stealable for t in long_tasks):
        yield gen.sleep(0.01)
        assert time() < start + 1

    na = len(a.executing)
    nb = len(b.executing)

    incs = c.map(inc, range(100), workers=a.address, allow_other_workers=True)

    yield gen.sleep(0.2)

    yield wait(long_tasks)

    for t in long_tasks:
        assert (sum(log[1] == 'executing' for log in a.story(t)) +
                sum(log[1] == 'executing' for log in b.story(t))) <= 1
Example No. 29
def test_cleanup_repeated_tasks(c, s, a, b):
    class Foo(object):
        pass

    s.extensions['stealing']._pc.callback_time = 20
    yield c.submit(slowidentity, -1, delay=0.1)
    objects = [c.submit(Foo, pure=False, workers=a.address) for _ in range(50)]

    x = c.map(slowidentity, objects, workers=a.address, allow_other_workers=True,
              delay=0.05)
    del objects
    yield wait(x)
    assert a.data and b.data
    assert len(a.data) + len(b.data) > 10
    ws = weakref.WeakSet()
    ws.update(a.data.values())
    ws.update(b.data.values())
    del x

    start = time()
    while a.data or b.data:
        yield gen.sleep(0.01)
        assert time() < start + 1

    assert not s.who_has
    assert not any(s.has_what.values())

    assert not list(ws)
Example No. 30
def test_gather_then_submit_after_failed_workers(loop):
    with cluster(nworkers=4) as (s, [w, x, y, z]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(20))
            wait(L)
            w['proc'].terminate()
            total = c.submit(sum, L)
            wait([total])

            (_, port) = first(c.scheduler.who_has[total.key])
            for d in [x, y, z]:
                if d['port'] == port:
                    d['proc'].terminate()

            result = c.gather([total])
            assert result == [sum(map(inc, range(20)))]
Example No. 31
def test_new_worker_steals(c, s, a):
    yield wait(c.submit(slowinc, 1, delay=0.01))

    futures = c.map(slowinc, range(100), delay=0.05)
    total = c.submit(sum, futures)
    while len(a.task_state) < 10:
        yield gen.sleep(0.01)

    b = yield Worker(s.address,
                     loop=s.loop,
                     nthreads=1,
                     memory_limit=MEMORY_LIMIT)

    result = yield total
    assert result == sum(map(inc, range(100)))

    for w in [a, b]:
        assert all(isinstance(v, int) for v in w.data.values())

    assert b.data

    yield b.close()
Example No. 32
def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.8 + 200e6
    np = pytest.importorskip('numpy')

    def f():
        x = np.ones(int(300e6), dtype='u1')
        sleep(1)

    with captured_logger(logging.getLogger('distributed.worker')) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(10), delay=0.1)

        yield gen.sleep(0.3)
        assert a.paused
        out = logger.getvalue()
        assert 'memory' in out.lower()
        assert 'stop' in out.lower()

    assert sum(f.status == 'finished' for f in futures) < 4

    yield wait(futures)
Example No. 33
def test_steal_more_attractive_tasks(c, s, a, *rest):
    def slow2(x):
        sleep(1)
        return x

    s.extensions['stealing']._pc.callback_time = 20
    x = c.submit(mul, b'0', 100000000, workers=a.address)  # 100 MB
    yield wait(x)

    s.task_duration['slowidentity'] = 0.2
    s.task_duration['slow2'] = 1

    futures = [
        c.submit(slowidentity, x, pure=False, delay=0.2) for i in range(10)
    ]
    future = c.submit(slow2, x, priority=-1)

    while not any(w.task_state for w in rest):
        yield gen.sleep(0.01)

    # the more attractive (long-running) future should be moved first
    assert any(future.key in w.task_state for w in rest)
Example No. 34
    def test_device_spill(client, scheduler, worker):

        # There's a known issue with datetime64:
        # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940
        # The same error above happens when spilling datetime64 to disk
        cdf = (dask.datasets.timeseries(
            dtypes={
                "x": int,
                "y": float
            }, freq="20ms").reset_index(drop=True).map_partitions(
                cudf.from_pandas))

        sizes = yield client.compute(
            cdf.map_partitions(lambda df: df.__sizeof__()))
        sizes = sizes.tolist()
        nbytes = sum(sizes)
        part_index_nbytes = (yield client.compute(
            cdf.partitions[0].index)).__sizeof__()

        cdf2 = cdf.persist()
        yield wait(cdf2)

        del cdf

        host_chunks = yield client.run(lambda: len(get_worker().data.host))
        disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
        for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
            if params["spills_to_disk"]:
                assert dc > 0
            else:
                assert hc > 0
                assert dc == 0

        yield client.run(worker_assert, nbytes, 32, 2048 + part_index_nbytes)

        del cdf2

        yield client.run(delayed_worker_assert, 0, 0, 0)
Example No. 35
def test_context_specific_serialization_class(c, s, a, b):
    register_serialization(MyObject, my_dumps, my_loads)

    # Create the object on A, force communication to B
    x = c.submit(MyObject, x=1, y=2, workers=a.address)
    y = c.submit(lambda x: x, x, workers=b.address)

    yield wait(y)

    key = y.key

    def check(dask_worker):
        # Get the context from the object stored on B
        my_obj = dask_worker.data[key]
        return my_obj.context

    result = yield c.run(check, workers=[b.address])
    expected = {"sender": a.address, "recipient": b.address}
    assert result[b.address]["sender"] == a.address  # see origin worker

    z = yield y  # bring object to local process

    assert z.x == 1 and z.y == 2
    assert z.context["sender"] == b.address
Example No. 36
    def test_device_spill(client, scheduler, worker):
        rs = da.random.RandomState(RandomState=cupy.random.RandomState)
        x = rs.random(int(250e6), chunks=10e6)

        xx = x.persist()
        yield wait(xx)

        # Allow up to 1024 bytes overhead per chunk serialized
        yield client.run(worker_assert, x.nbytes, 1024, 1024)

        y = client.compute(x.sum())
        res = yield y

        assert (abs(res / x.size) - 0.5) < 1e-3

        yield client.run(worker_assert, x.nbytes, 1024, 1024)
        host_chunks = yield client.run(lambda: len(get_worker().data.host))
        disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
        for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
            if params["spills_to_disk"]:
                assert dc > 0
            else:
                assert hc > 0
                assert dc == 0
Example No. 37
def test_get_client(c, s, a, b):
    def f(x):
        cc = get_client()
        future = cc.submit(inc, x)
        return future.result()

    assert default_client() is c

    future = c.submit(f, 10, workers=a.address)
    result = yield future
    assert result == 11

    assert a._client
    assert not b._client

    assert a._client is c
    assert default_client() is c

    a_client = a._client

    for i in range(10):
        yield wait(c.submit(f, i))

    assert a._client is a_client
Example No. 38
    def handle(self, *args, **options):
        dataset = self.DATASETS[options['dataset']]()
        example_f, labels_f = _create_dataset(dataset)
        try:
            ds = DataSet.objects.get(name=options['dataset'])
        except DataSet.DoesNotExist:
            ds = DataSet.objects.create(
                name=options['dataset'],
                examples=SimpleUploadedFile(example_f.name, example_f.read()),
                labels=SimpleUploadedFile(labels_f.name, labels_f.read()))

        if options['classifier'] == 'Tree':
            gs_tree = ATGridSearchCV(
                sklearn.tree.DecisionTreeClassifier(), {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': range(1, 6),
                    'max_features': range(1, len(dataset.data[0]))
                },
                dataset=ds.name,
                webserver_url=options['url'])
            futures = gs_tree.fit(dataset.data, dataset.target)
            distributed.wait(futures)
        elif options['classifier'] == 'Forest':
            gs_forest = ATGridSearchCV(
                sklearn.ensemble.RandomForestClassifier(), {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': range(1, 6),
                    'max_features': range(1, len(dataset.data[0]))
                },
                dataset=ds.name,
                webserver_url=options['url'])
            distributed.wait(gs_forest.fit(dataset.data, dataset.target))
        else:
            gs_network = ATGridSearchCV(
                sklearn.neural_network.MLPClassifier(), {
                    'solver': ['lbfgs', 'sgd', 'adam'],
                    'learning_rate': ['constant', 'invscaling', 'adaptive'],
                    'max_iter': range(200, 2000, 200)
                },
                dataset=ds.name,
                webserver_url=options['url'])
            distributed.wait(gs_network.fit(dataset.data, dataset.target))
Example No. 39
def test_balance_many_workers_2(c, s, *workers):
    s.extensions['stealing']._pc.callback_time = 100000000
    futures = c.map(slowinc, range(90), delay=0.2)
    yield wait(futures)
    assert set(map(len, s.has_what.values())) == {3}
Example No. 40
def test_balance_many_workers(c, s, *workers):
    futures = c.map(slowinc, range(20), delay=0.2)
    yield wait(futures)
    assert set(map(len, s.has_what.values())) == {0, 1}
Example No. 41
def test_balance_many_workers(c, s, *workers):
    futures = c.map(slowinc, range(20), delay=0.2)
    yield wait(futures)
    assert {len(w.has_what) for w in s.workers.values()} == {0, 1}
Example No. 42
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas()
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))

    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")

    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(web_sales_df,
                                               left_on=["d_date_sk"],
                                               right_on=["ws_sold_date_sk"],
                                               how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands

    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [
        no_q08_review_sales_amount
    ]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
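
The persist-then-wait idiom used throughout this query can be shown in isolation; this is only a sketch with a toy dask array, not the benchmark's cuDF pipeline:

import dask.array as da
from distributed import Client, wait

if __name__ == "__main__":
    client = Client(processes=False)
    x = da.random.random((1000, 1000), chunks=(250, 250))
    x = x.persist()                # start computing the chunks in the background
    wait(x)                        # block until every chunk is in worker memory
    print(float(x.sum().compute()))
    client.close()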
Example No. 43
def test_dont_steal_unknown_functions(c, s, a, b):
    futures = c.map(inc, [1, 2], workers=a.address, allow_other_workers=True)
    yield wait(futures)
    assert len(a.data) == 2
    assert len(b.data) == 0
Example No. 44
def test_Actors_create_dependencies(c, s, a, b):
    counter = yield c.submit(Counter, actor=True)
    future = c.submit(lambda x: None, counter)
    yield wait(future)
    assert s.tasks[future.key].dependencies == {s.tasks[counter.key]}
Example No. 45
def test_worker_task_data(c, s, w):
    x = delayed(2)
    xx = c.persist(x)
    yield wait(xx)
    assert w.data[x.key] == 2
Example No. 46
File: run.py Project: keshava/daks
def fwi_gradient(vp_in, nshots, client, solver, shots_container, auth, scale_gradient=None, mute_water=True,
                 exclude_boundaries=True, water_depth=20, checkpointing=False, checkpoint_params=None):
    start_time = time.time()

    reset_cluster(client)

    if not hasattr(fwi_gradient, "obj_fn_cache"):
        fwi_gradient.obj_fn_cache = {}

    if exclude_boundaries:
        vp = np.array(vec2mat(vp_in, solver.model.shape), dtype=solver.model.dtype)
    else:
        vp = np.array(vec2mat(vp_in, solver.model.vp.shape), dtype=solver.model.dtype)

    solver.model.update("vp", vp)

    # Dask enforces this for large objects
    f_solver = client.scatter(solver, broadcast=True)

    futures = []

    for i in range(nshots):
        if checkpointing:
            futures.append(client.submit(process_shot_checkpointed, i, f_solver, shots_container, auth, exclude_boundaries,
                                         checkpoint_params, resources={'tasks': 1}))
        else:
            futures.append(client.submit(process_shot, i, f_solver, shots_container, auth, exclude_boundaries,
                                         resources={'tasks': 1}))  # Ensure one task per worker (to run two, tasks=0.5)

    if exclude_boundaries:
        gradient_shape = solver.model.shape
    else:
        gradient_shape = solver.model.vp.shape

    def reduction(*args):
        grad = np.zeros(gradient_shape)  # Closured from above
        objective = 0.

        for a in args:
            o, g = a
            objective += o
            grad += g
        return objective, grad

    reduce_future = client.submit(reduction, *futures)

    wait(reduce_future)

    objective, grad = reduce_future.result()

    if mute_water:
        if exclude_boundaries:
            muted_depth = water_depth
        else:
            muted_depth = water_depth + solver.model.nbl
        grad[:, 0:muted_depth] = 0

    # Scipy LBFGS misbehaves if type is not float64
    grad = mat2vec(grad).astype(np.float64)

    if scale_gradient is not None:
        if scale_gradient == "W":
            if not hasattr(fwi_gradient, "gradient_scaling_factor"):
                fwi_gradient.gradient_scaling_factor = np.max(np.abs(grad))

            grad /= fwi_gradient.gradient_scaling_factor
        elif scale_gradient == "L":
            grad /= np.max(np.abs(grad))
        else:
            raise ValueError("Invalid value %s for gradient scaling. Allowed: None, L, W" % scale_gradient)

    fwi_gradient.obj_fn_cache[vp_in.tobytes()] = objective

    elapsed_time = time.time() - start_time
    eprint("Objective function evaluation completed in %f seconds. F=%f" % (elapsed_time, objective))

    return objective, -grad
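
The fan-out, reduce, and wait structure of fwi_gradient can be sketched with toy functions; the real process_shot and solver objects come from the surrounding project and are not reproduced here:

from distributed import Client, wait

def process(i):
    return i * i                   # stand-in for processing one shot

def reduction(*parts):
    return sum(parts)              # stand-in for summing objectives/gradients

if __name__ == "__main__":
    client = Client(processes=False)
    futures = [client.submit(process, i) for i in range(8)]
    reduce_future = client.submit(reduction, *futures)
    wait(reduce_future)            # block until the reduction has finished
    print(reduce_future.result())  # 140
    client.close()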
Example No. 47
def test_balance_many_workers_2(c, s, *workers):
    s.extensions["stealing"]._pc.callback_time = 100000000
    futures = c.map(slowinc, range(90), delay=0.2)
    yield wait(futures)
    assert {len(w.has_what) for w in s.workers.values()} == {3}
Example No. 48
def test_eventually_steal_unknown_functions(c, s, a, b):
    futures = c.map(slowinc, range(10), delay=0.1,  workers=a.address,
                    allow_other_workers=True)
    yield wait(futures)
    assert len(a.data) >= 3
    assert len(b.data) >= 3
Example No. 49
def main(client):
    item_df, store_sales_df, web_clickstreams_df = read_tables()

    ### Query 0. Filtering item table
    filtered_item_df = string_filter(item_df, "i_category", q12_i_category_IN)
    filtered_item_df = filtered_item_df.persist()

    ### filtered_item_df is a single partition to allow an nx1 merge using map_partitions
    filtered_item_df = filtered_item_df.repartition(npartitions=1)
    ###  Query 1

    # The main idea is that we don't fuse the filtration task with the reading
    # task yet; doing so creates more memory pressure, because we would read the
    # whole thing (and spill it) at once and only then filter.

    ### The PR below has a dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    meta_d = {
        "wcs_user_sk": np.ones(1,
                               dtype=web_clickstreams_df["wcs_user_sk"].dtype),
        "wcs_click_date_sk": np.ones(1, dtype=np.int64),
    }
    meta_df = cudf.DataFrame(meta_d)

    filter_wcs_df = web_clickstreams_df.map_partitions(
        filter_wcs_table, filtered_item_df.to_delayed()[0], meta=meta_df)

    ###  Query 2

    # The main idea is that we don't fuse the filtration task with the reading
    # task yet; doing so creates more memory pressure, because we would read the
    # whole thing (and spill it) at once and only then filter.

    ### The PR below has a dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    meta_d = {
        "ss_customer_sk":
        np.ones(1, dtype=store_sales_df["ss_customer_sk"].dtype),
        "ss_sold_date_sk": np.ones(1, dtype=np.int64),
    }
    meta_df = cudf.DataFrame(meta_d)

    filtered_ss_df = store_sales_df.map_partitions(
        filter_ss_table, filtered_item_df.to_delayed()[0], meta=meta_df)

    ### Result Query
    ### SELECT DISTINCT wcs_user_sk
    ### ....
    ### webInRange
    ### storeInRange
    ### WHERE wcs_user_sk = ss_customer_sk
    ### AND wcs_click_date_sk < ss_sold_date_sk -- buy AFTER viewed on website
    ### ORDER BY wcs_user_sk

    ### Note: Below brings it down to a single partition
    filter_wcs_df_d = filter_wcs_df.drop_duplicates()
    filtered_ss_df_d = filtered_ss_df.drop_duplicates()

    ss_wcs_join = filter_wcs_df_d.merge(filtered_ss_df_d,
                                        left_on="wcs_user_sk",
                                        right_on="ss_customer_sk",
                                        how="inner")

    ss_wcs_join = ss_wcs_join[
        ss_wcs_join["wcs_click_date_sk"] < ss_wcs_join["ss_sold_date_sk"]]
    ss_wcs_join = ss_wcs_join["wcs_user_sk"]

    ### todo: check performance by replacing with a single drop_duplicates call

    ### below decreases memory usage on the single GPU to help with subsequent compute
    ss_wcs_join = ss_wcs_join.map_partitions(lambda sr: sr.drop_duplicates())
    ss_wcs_join = ss_wcs_join.repartition(npartitions=1).persist()
    ss_wcs_join = ss_wcs_join.drop_duplicates().reset_index(drop=True)
    ss_wcs_join = ss_wcs_join.map_partitions(lambda ser: ser.sort_values())

    # todo: check if repartition helps for writing efficiency
    # context: 0.1 seconds on sf-1k
    wait(ss_wcs_join)
    return ss_wcs_join.to_frame()
Example No. 50
def test_decide_worker_with_restrictions(client, s, a, b, c):
    x = client.submit(inc, 1, workers=[a.address, b.address])
    yield wait(x)
    assert x.key in a.data or x.key in b.data
Example No. 51
    
    tfrecords = att.generate(
        csv_file=record,
        HSI_sensor_path=hyperspec_path,
        RGB_sensor_path=rgb_path,
        chunk_size=500,
        train=True,
        domain=numeric_domain,
        site=numeric_site,
        heights=heights,
        elevation=elevation,
        label_column="filtered_taxonID",
        species_label_dict=species_label_dict
    )
    
    return tfrecords
    
train_tfrecords = []
for record in records_to_run:
    future = client.submit(run, record=record)
    train_tfrecords.append(future)
    
wait(train_tfrecords)
for x in train_tfrecords:
    try:
        print(x.result())
    except Exception as e:
        print("{} failed with {}".format(x, e))
        pass
        
Example No. 52
def test_worker_bad_args(c, s, a, b):
    class NoReprObj(object):
        """ This object cannot be properly represented as a string. """

        def __str__(self):
            raise ValueError("I have no str representation.")

        def __repr__(self):
            raise ValueError("I have no repr representation.")

    x = c.submit(NoReprObj, workers=a.address)
    yield wait(x)
    assert not a.executing
    assert a.data

    def bad_func(*args, **kwargs):
        1 / 0

    class MockLoggingHandler(logging.Handler):
        """Mock logging handler to check for expected logs."""

        def __init__(self, *args, **kwargs):
            self.reset()
            logging.Handler.__init__(self, *args, **kwargs)

        def emit(self, record):
            self.messages[record.levelname.lower()].append(record.getMessage())

        def reset(self):
            self.messages = {
                'debug': [],
                'info': [],
                'warning': [],
                'error': [],
                'critical': [],
            }

    hdlr = MockLoggingHandler()
    old_level = logger.level
    logger.setLevel(logging.DEBUG)
    logger.addHandler(hdlr)
    y = c.submit(bad_func, x, k=x, workers=b.address)
    yield wait(y)

    assert not b.executing
    assert y.status == 'error'
    # Make sure job died because of bad func and not because of bad
    # argument.
    with pytest.raises(ZeroDivisionError):
        yield y

    if sys.version_info[0] >= 3:
        tb = yield y._traceback()
        assert any('1 / 0' in line
                   for line in pluck(3, traceback.extract_tb(tb))
                   if line)
    assert "Compute Failed" in hdlr.messages['warning'][0]
    logger.setLevel(old_level)

    # Now we check that both workers are still alive.

    xx = c.submit(add, 1, 2, workers=a.address)
    yy = c.submit(add, 3, 4, workers=b.address)

    results = yield c._gather([xx, yy])

    assert tuple(results) == (3, 7)
Example No. 53
File: base.py Project: z7ye/dask-1
def f():
    if futures_of(self):
        yield wait(self)
    raise gen.Return(self)
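
A hedged sketch of futures_of, which the helper above uses to check whether a collection still carries distributed futures worth waiting on; the array and in-process cluster are illustrative:

import dask.array as da
from distributed import Client, wait
from distributed.client import futures_of

if __name__ == "__main__":
    client = Client(processes=False)
    x = da.ones((100, 100), chunks=(50, 50)).persist()
    futures = futures_of(x)        # the concrete futures backing the collection
    wait(futures)
    print(len(futures))            # one future per chunk: 4 here
    client.close()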
Example No. 54
def test_statistical_profiling(c, s, a, b):
    futures = c.map(slowinc, range(10), delay=0.1)
    yield wait(futures)

    profile = a.profile_keys['slowinc']
    assert profile['count']
Example No. 55
def test_work_stealing(c, s, a, b):
    [x] = yield c._scatter([1], workers=a.address)
    futures = c.map(slowadd, range(50), [x] * 50)
    yield wait(futures)
    assert len(a.data) > 10
    assert len(b.data) > 10
Example No. 56
def test_move_data_over_break_restrictions(client, s, a, b, c):
    [x] = yield client._scatter([1], workers=b.address)
    y = client.submit(inc, x, workers=[a.address, b.address])
    yield wait(y)
    assert y.key in a.data or y.key in b.data
Example No. 57
def main(client, config):
    import cudf
    import dask_cudf

    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now
    ### Make TEMP_TABLE1

    # filter date table
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >=
         q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <=
           q18_endDate_int)].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (ss_date_dim_join.groupby(["ss_store_sk", "ss_sold_date_sk"], ).agg(
        {
            "ss_net_paid": "sum"
        }).reset_index())

    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid
    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (temp.groupby(["ss_store_sk"]).agg({
        "x": ["count", "sum"],
        "xy":
        "sum",
        "y":
        "sum",
        "xx":
        "sum"
    }).reset_index(drop=False))

    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[
            ("xy", "sum")] -
        regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (regression_analysis[("x", "count")] * regression_analysis[
        ("xx", "sum")] -
         regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")])
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis[
        "ss_store_sk"].astype("int32")
    store["s_store_sk"] = store["s_store_sk"].astype("int32")
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"
                             ]].query("slope <= 0").reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to be one partition, since it's only 192 rows at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()
    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small. very few relevant stores (169) at SF1000
    targets = (stores_with_regression.s_store_name.str.lower().unique().
               compute().tolist())
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame({
        "word": ["a"],
        "pr_review_sk":
        np.ones(1, dtype=np.int64),
        "pr_review_date": ["a"],
    }).head(0)

    ### get relevant reviews
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression[
        "store_ID"] = stores_with_regression.s_store_sk.astype("str").str.cat(
            stores_with_regression.s_store_name, sep="_")
    stores_with_regression[
        "s_store_name"] = stores_with_regression.s_store_name.str.lower()

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(stores_with_regression,
                                 how="inner",
                                 left_on=["word"],
                                 right_on=["s_store_name"])

    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False)
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] +
                             ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment[
        "sentence_idx_global_pos"] = word_sentence_sentiment[
            "sentence_idx_global_pos"].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = [
        "s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"
    ]
    final = final.persist()
    wait(final)
    final = final.sort_values(
        ["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final
Example No. 58
def test_log_tasks_during_restart(c, s, a, b):
    future = c.submit(sys.exit, 0)
    yield wait(future)
    assert 'exit' in str(s.events)
Example No. 59
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # The main idea is that we don't fuse the filtration task with the reading
    # task yet; doing so creates more memory pressure, because we would read the
    # whole thing (and spill it) at once and only then filter.

    ### The PR below has a dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0],
                                      wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(reduction_function,
                                          item_df_filtered.to_delayed()[0],
                                          meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
Example No. 60
def test_get_task_status(c, s, a, b):
    future = c.submit(inc, 1)
    yield wait(future)

    result = yield a.scheduler.get_task_status(keys=[future.key])
    assert result == {future.key: 'memory'}