Example #1
# ``request.param`` implies a parametrized pytest fixture; the decorator is
# reconstructed here from the branches handled below.
@pytest.fixture(
    params=[
        "inline", "dask_executor", "delayed_default", "delayed_dist",
        "dask_make_default", "dask_integration", "concurrent", "delayed",
    ]
)
def ctx(request, dask_executor):
    if request.param == 'inline':
        yield Context.make_with('inline')
    elif request.param == "dask_executor":
        yield Context(executor=dask_executor)
    elif request.param == "delayed_default":
        yield Context(executor=DelayedJobExecutor())
    elif request.param == "delayed_dist":
        with distributed.Client(n_workers=2,
                                threads_per_worker=4,
                                processes=True) as _:
            yield Context(executor=DelayedJobExecutor())
    elif request.param == "dask_make_default":
        try:
            ctx = Context.make_with('dask-make-default')
            yield ctx
        finally:
            # cleanup: Close cluster and client
            # This is also tested below, here just to make
            # sure things behave as expected.
            assert isinstance(ctx.executor, DaskJobExecutor)
            ctx.executor.is_local = True
            ctx.close()
    elif request.param == "dask_integration":
        with distributed.Client(n_workers=2,
                                threads_per_worker=4,
                                processes=False) as _:
            yield Context.make_with("dask-integration")
    elif request.param == "concurrent":
        yield Context.make_with("threads")
    elif request.param == "delayed":
        yield Context(executor=DelayedJobExecutor())
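
With the parametrization above, any test that requests ``ctx`` runs once per executor flavour. Below is a minimal sketch of such a test, assuming the usual LiberTEM import path for SumUDF and a ``default_raw`` dataset fixture like the one used in the other examples:

import numpy as np
from libertem.udf.sum import SumUDF


def test_runs_on_every_executor(ctx, default_raw):
    # Executed once per value in the fixture's params list.
    res = ctx.run_udf(dataset=default_raw, udf=SumUDF())
    assert np.all(np.isfinite(res['intensity'].data))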
Example #2
def test_threads_per_worker(default_raw, dask_executor):
    ctx = Context(executor=dask_executor)
    inline_ctx = Context(executor=InlineJobExecutor())
    res = ctx.run_udf(dataset=default_raw,
                      udf=ThreadsPerWorkerUDF())['num_threads']
    res_inline = inline_ctx.run_udf(dataset=default_raw,
                                    udf=ThreadsPerWorkerUDF())['num_threads']
    assert np.allclose(res, 1)
    assert np.allclose(res_inline, psutil.cpu_count(logical=False))
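
ThreadsPerWorkerUDF itself is not shown in these snippets. The following is a hedged sketch of a UDF in the same spirit, assuming the standard UDF base class import and that ``self.meta.threads_per_worker`` is available (this attribute may differ between LiberTEM versions):

from libertem.udf import UDF


class ThreadCountSketchUDF(UDF):
    def get_result_buffers(self):
        # One integer per navigation position.
        return {'num_threads': self.buffer(kind='nav', dtype='int64')}

    def process_frame(self, frame):
        # Record the thread budget of the worker that processed this frame.
        self.results.num_threads[:] = self.meta.threads_per_worker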
Example #3
    def __init__(self, path: str, continuous=False, rois=None, max_runs=-1):
        """
        Parameters
        ----------

        path
            Path to the HDR file

        continuous
            If set to True, will continuously output data

        rois: List[np.ndarray]
            If a list of ROIs is given, cycle through these ROIs from the
            source data while in continuous mode

        max_runs: int
            Maximum number of continuous runs
        """
        if rois is None:
            rois = []
        if not path.lower().endswith(".hdr"):
            raise ValueError("please pass the path to the HDR file!")
        self._path = path
        self._continuous = continuous
        self._rois = rois
        self._ctx = Context(executor=InlineJobExecutor())
        self._ds = None
        self._max_runs = max_runs
        self._mmaps = {}
Example #4
@pytest.fixture  # decorator reconstructed; the original fixture scope may differ
def chunked_emd(tmpdir_factory):
    lt_ctx = Context(executor=InlineJobExecutor())
    datadir = tmpdir_factory.mktemp('hdf5_chunked_data')
    filename = os.path.join(datadir, 'chunked.emd')

    chunks = (32, 32, 128, 128)

    with h5py.File(filename, mode="w") as f:
        f.attrs.create('version_major', 0)
        f.attrs.create('version_minor', 2)

        f.create_group('experimental/science_data')
        group = f['experimental/science_data']
        group.attrs.create('emd_group_type', 1)

        data = np.ones((256, 256, 128, 128), dtype=np.float32)

        group.create_dataset(name='data', data=data, chunks=chunks)
        group.create_dataset(name='dim1', data=range(256))
        group['dim1'].attrs.create('name', b'dim1')
        group['dim1'].attrs.create('units', b'units1')
        group.create_dataset(name='dim2', data=range(256))
        group['dim2'].attrs.create('name', b'dim2')
        group['dim2'].attrs.create('units', b'units2')
        group.create_dataset(name='dim3', data=range(128))
        group['dim3'].attrs.create('name', b'dim3')
        group['dim3'].attrs.create('units', b'units3')
        group.create_dataset(name='dim4', data=range(128))
        group['dim4'].attrs.create('name', b'dim4')
        group['dim4'].attrs.create('units', b'units4')

    yield lt_ctx.load("auto",
                      path=filename,
                      ds_path="/experimental/science_data/data")
Example #5
def test_threads_per_worker_vanilla(default_raw, monkeypatch):
    old_threads = os.environ.get('NUMBA_NUM_THREADS')
    # Triggers #1053
    monkeypatch.delenv('NUMBA_NUM_THREADS', raising=False)
    ctx = Context()
    assert 'NUMBA_NUM_THREADS' not in os.environ
    # We have to reset it properly since it is set in pytest.ini
    # and Numba will complain if it is changed
    if old_threads:
        os.environ['NUMBA_NUM_THREADS'] = old_threads
    inline_ctx = Context(executor=InlineJobExecutor())
    res = ctx.run_udf(dataset=default_raw, udf=ThreadsPerWorkerUDF())
    res_inline = inline_ctx.run_udf(dataset=default_raw,
                                    udf=ThreadsPerWorkerUDF())
    print(res['num_threads'].data)
    assert np.all(res['num_threads'].data == 1)
    print(res_inline['num_threads'].data)
    assert np.all(res_inline['num_threads'].data == psutil.cpu_count(
        logical=False))
Example #6
    def test_large_delayed_merge(self, shared_dist_ctx_globaldask, my_ds, benchmark):
        ctx = Context(executor=DelayedJobExecutor())
        udf = EchoMergeUDF()
        resources = DelayedJobExecutor.get_resources_from_udfs(udf)

        def doit():
            result = ctx.run_udf(dataset=my_ds, udf=udf)
            return result['intensity'].delayed_raw_data.sum(axis=0).compute(resources=resources)

        benchmark(doit)
Example #7
    def test_sumsig_delayed(self, shared_dist_ctx_globaldask, my_ds, benchmark):
        ctx = Context(executor=DelayedJobExecutor())
        udf = MySumSigUDF()
        resources = DelayedJobExecutor.get_resources_from_udfs(udf)

        def doit():
            result = ctx.run_udf(dataset=my_ds, udf=udf)
            return result['intensity'].delayed_raw_data.compute(resources=resources)

        benchmark(doit)
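
MySumSigUDF is not defined in these snippets; a minimal sum-over-signal-axes UDF could look roughly like this (illustrative only, not the benchmark's actual implementation):

import numpy as np
from libertem.udf import UDF


class SumSigSketchUDF(UDF):
    def get_result_buffers(self):
        # One scalar per navigation position.
        return {'intensity': self.buffer(kind='nav', dtype='float32')}

    def process_frame(self, frame):
        # Sum over the signal axes of the current frame.
        self.results.intensity[:] = np.sum(frame)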
Example #8
def test_dask_array_2(dask_executor):
    # NOTE: keep in sync with the example in docs/source/api.rst!
    # Construct a Dask array from the dataset
    # The second return value contains information
    # on workers that hold parts of a dataset in local
    # storage to ensure optimal data locality
    ctx = Context(executor=dask_executor)
    dataset = ctx.load("memory", datashape=(16, 16, 16), sig_dims=2)
    dask_array, workers = make_dask_array(dataset)

    # Use the Dask.distributed client of LiberTEM, since it may not be
    # the default client:
    ctx.executor.client.compute(dask_array.sum(axis=(-1, -2))).result()
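
``make_dask_array`` is not imported in the snippet above; in LiberTEM it is typically provided by the Dask adapter in the contrib package (the exact path may vary between versions):

from libertem.contrib.daskadapter import make_dask_array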
Example #9
def test_connect_default(local_cluster_url):
    try:
        executor = DaskJobExecutor.connect(
            local_cluster_url, client_kwargs={'set_as_default': True})
        ctx = Context(executor=executor)
        # This queries Dask for the scheduler it is currently using
        ctx2 = Context.make_with("dask-integration")
        # make sure the second uses the Client of the first
        assert ctx2.executor.client is ctx.executor.client
    finally:
        # Only close the Client, keep the cluster running
        # since that is test infrastructure
        executor.client.close()
        ctx.close()
Example #10
def test_multiple_clients(local_cluster_url, default_raw):
    ex1 = DaskJobExecutor.connect(local_cluster_url)

    # This creates a second Client. Even though we are setting
    # `set_as_default=False`, this Client is then used by functions like
    # `dd.as_completed`. That is because `set_as_default` only sets the dask
    # scheduler config to "dask.distributed"; it does not make the _client_
    # the global default `Client`. So any time `as_completed` is called, the
    # `loop` needs to be set correctly, otherwise this may result in strange
    # hangs and crashes.
    DaskJobExecutor.connect(local_cluster_url)

    udf = SumUDF()

    cx1 = Context(executor=ex1)
    cx1.run_udf(dataset=default_raw, udf=udf)
Example #11
def test_context_arguments():
    with pytest.raises(ValueError):
        # refs https://github.com/LiberTEM/LiberTEM/issues/918
        Context(executor=InlineJobExecutor)
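
The test above guards against passing the executor class itself. The accepted form, used throughout the other examples, passes an instance:

ctx = Context(executor=InlineJobExecutor())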
Example #12
def reference(load_kwargs):
    ctx = Context(executor=InlineJobExecutor())
    return _calculate(ctx, load_kwargs)
Example #13
def test_threads_per_worker(dask_executor, default_raw):
    ctx = Context(executor=dask_executor)
    res = ctx.run_udf(dataset=default_raw,
                      udf=ThreadsPerWorkerUDF())['num_threads']
    assert np.allclose(res, 1)
Example #14
def main(path, scheduler_uri, stackheight, scan_size, method, num_masks,
         num_workers, num_nodes, warmup_rounds):
    scan_size = tuple(int(x) for x in scan_size.split(","))
    if num_nodes is not None and scheduler_uri is None:
        raise Exception("num_nodes limit only works for non-local cluster")
    if scheduler_uri is None:
        dask_executor = BenchmarkDaskExecutor.make_local(
            cluster_kwargs={
                'threads_per_worker': 1,
                'n_workers': num_workers,
            })
    else:
        dask_executor = BenchmarkDaskExecutor.connect(scheduler_uri,
                                                      node_limit=num_nodes)
    ctx = Context(executor=dask_executor)

    workers = ctx.executor.get_available_workers()
    for worker in workers:
        ctx.executor.client.run(_preload, workers=[worker['name']])

    def _load():
        if method == "direct":
            ds = DirectRawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size=(128, 128),
                stackheight=stackheight,
            )
        elif method == "read":
            ds = DirectRawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size=(128, 128),
                stackheight=stackheight,
                enable_direct=False,
            )
        elif method == "mmap":
            ds = RawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size_raw=(128, 128),
                crop_detector_to=(128, 128),
                tileshape=(1, stackheight, 128, 128),
            )
        ds = ds.initialize()
        return ds

    def _getsize():
        return os.stat(path).st_size

    ds = dask_executor.run_function(_load)
    dask_executor.run_function(ds.check_valid)

    total_size = dask_executor.run_function(_getsize)
    assert total_size == np.dtype(ds.dtype).itemsize * ds.shape.size

    def _make_random_mask():
        return np.random.randn(128, 128).astype("float32")

    apply_mask = ctx.create_mask_analysis(
        dataset=ds,
        factories=num_masks * [_make_random_mask],
    )

    # warmup rounds:
    for i in range(warmup_rounds):
        ctx.run(apply_mask)

    # timed run:
    t0 = time.time()
    ctx.run(apply_mask)
    t1 = time.time()
    delta = t1 - t0

    tilesize_bytes = stackheight * 128 * 128 * 4

    results = {
        "path": path,
        "num_masks": num_masks,
        "bytes": total_size,
        "time": delta,
        "throughput_mib": total_size / delta / 1024 / 1024,
        "tilesize_bytes": tilesize_bytes,
        "method": method,
        "num_nodes": num_nodes,
        "workers": workers,
    }
    print(json.dumps(results, indent=4))
Example #15
@pytest.fixture  # reconstructed pytest fixture decorator
def lt_ctx():
    return Context(executor=InlineJobExecutor())
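
A sketch of a test that uses ``lt_ctx`` as a fixture (as the reconstructed decorator above suggests) with an in-memory dataset, mirroring the "memory" load call from Example #8:

from libertem.udf.sum import SumUDF


def test_sum_memory(lt_ctx):
    ds = lt_ctx.load("memory", datashape=(16, 16, 16), sig_dims=2)
    res = lt_ctx.run_udf(dataset=ds, udf=SumUDF())
    # Signal shape is (16, 16), so the summed intensity has that shape.
    assert res['intensity'].data.shape == (16, 16)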