Example #1
    def profile(self, *dfs, **kwargs):
        """Profiles timing given input dataframes `dfs` which are passed to
        `fit_transform`.

        """

        if self.cache_input:
            dfs = self._cache_input(dfs)

        counter = 0
        baselines = []
        max_usages = []

        while counter < self.repetitions:
            gc.collect()

            with ResourceProfiler(dt=self.interval) as rprof:
                self.func(*dfs, **kwargs)

            mem_usages = [x.mem for x in rprof.results]
            baselines.append(np.min(mem_usages))
            max_usages.append(np.max(mem_usages))

            counter += 1

        self._max_usages = max_usages
        self._baselines = baselines
        self._measurements = np.subtract(max_usages, baselines).tolist()

        if self.cache_input:
            self._clear_cached_input(dfs)

        return self
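For orientation, here is a minimal, self-contained sketch of the same baseline-vs-peak idea around a single call (the `work` function is illustrative; `ResourceProfiler` needs `psutil` installed to sample the process):

import numpy as np
from dask.diagnostics import ResourceProfiler

def work():
    # Placeholder workload; substitute any callable whose memory footprint
    # you want to gauge.
    return [np.ones((1000, 1000)) for _ in range(20)]

with ResourceProfiler(dt=0.01) as rprof:
    work()

mem = [r.mem for r in rprof.results]   # sampled process memory in MB
if mem:
    print(max(mem) - min(mem))         # peak usage above the baseline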
Example #2
def test_no_delay_during_large_transfer(c, s, w):
    pytest.importorskip('crick')
    np = pytest.importorskip('numpy')
    x = np.random.random(100000000)

    # Reset digests
    from distributed.counter import Digest
    from collections import defaultdict
    from functools import partial
    from dask.diagnostics import ResourceProfiler

    for server in [s, w]:
        server.digests = defaultdict(partial(Digest, loop=server.io_loop))
        server._last_tick = time()

    with ResourceProfiler(dt=0.01) as rprof:
        future = yield c.scatter(x, direct=True, hash=False)
        yield gen.sleep(0.5)

    rprof.close()

    for server in [s, w]:
        assert server.digests['tick-duration'].components[0].max() < 0.5

    nbytes = np.array([t.mem for t in rprof.results])
    nbytes -= nbytes[0]
    assert nbytes.max() < (x.nbytes * 2) / 1e6
    assert nbytes[-1] < (x.nbytes * 1.2) / 1e6
Example #3
def test_plot_multiple():
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")
    p = visualize([prof, rprof],
                  label_size=50,
                  title="Not the default",
                  show=False,
                  save=False)
    # Grid plot layouts changed in Bokeh 3.
    # See https://github.com/dask/dask/issues/9257 for more details
    if BOKEH_VERSION().major < 3:
        figures = [r[0] for r in p.children[1].children]
    else:
        figures = [r[0] for r in p.children]
    assert len(figures) == 2
    assert figures[0].title.text == "Not the default"
    assert figures[0].xaxis[0].axis_label is None
    assert figures[1].title is None
    assert figures[1].xaxis[0].axis_label == "Time (s)"
    # Test empty, checking for errors
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
Example #4
def test_resource_profiler_plot():
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    p = rprof.visualize(
        plot_width=500,
        plot_height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    assert p.plot_width == 500
    assert p.plot_height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert check_title(p, "Not the default")

    # Test with empty and one point, checking for errors
    rprof.clear()
    for results in [[], [(1.0, 0, 0)]]:
        rprof.results = results
        with pytest.warns(None) as record:
            p = rprof.visualize(show=False, save=False)
        assert len(record) == 0
        # Check bounds are valid
        assert p.x_range.start == 0
        assert p.x_range.end == 1
        assert p.y_range.start == 0
        assert p.y_range.end == 100
        assert p.extra_y_ranges["memory"].start == 0
        assert p.extra_y_ranges["memory"].end == 100
Example #5
def test_plot_multiple():
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")
    p = visualize(
        [prof, rprof], label_size=50, title="Not the default", show=False, save=False
    )
    bokeh_version = LooseVersion(bokeh.__version__)
    if bokeh_version >= "1.1.0":
        figures = [r[0] for r in p.children[1].children]
    elif bokeh_version >= "0.12.0":
        figures = [r.children[0] for r in p.children[1].children]
    else:
        figures = [r[0] for r in p.children]
    assert len(figures) == 2
    assert check_title(figures[0], "Not the default")
    assert figures[0].xaxis[0].axis_label is None
    assert figures[1].title is None
    assert figures[1].xaxis[0].axis_label == "Time (s)"
    # Test empty, checking for errors
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
Example #6
def test_resource_profiler_plot():
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    p = rprof.visualize(
        width=500,
        height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    if BOKEH_VERSION().major < 3:
        assert p.plot_width == 500
        assert p.plot_height == 300
    else:
        assert p.width == 500
        assert p.height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert p.title.text == "Not the default"

    # Test with empty and one point, checking for errors
    rprof.clear()
    for results in [[], [(1.0, 0, 0)]]:
        rprof.results = results
        with warnings.catch_warnings(record=True) as record:
            p = rprof.visualize(show=False, save=False)
        assert not record
        # Check bounds are valid
        assert p.x_range.start == 0
        assert p.x_range.end == 1
        assert p.y_range.start == 0
        assert p.y_range.end == 100
        assert p.extra_y_ranges["memory"].start == 0
        assert p.extra_y_ranges["memory"].end == 100
Example #7
def main():

    global sky
    global dirty
    global psf
     
    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []
   
    start_time1 = time.time()
    sky_npy, sky = load_data(os.path.split(os.getcwd())[0] + '/sky.npy')
    dirty_npy, dirty = load_data(os.path.split(os.getcwd())[0] + '/dirty.npy')
    psf_npy, psf = load_data(os.path.split(os.getcwd())[0] + '/psf.npy')
    end_time1 = time.time()
        
    start_time2 = time.time()
    scheduling()
    end_time2 = time.time()

    pbar = ProgressBar()

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        start_time3 = time.time()
        hub.compute()
        end_time3 = time.time()

    #pbar.register()
    #quad.compute()
    #pbar.unregister()

    with PrintKeys():
        hub.compute()

    print("\n" + "Profiling results:")
    print(prof.results[0])
    print("\n" + "Memory usage values are in MB and CPU information is % CPU usage")
    print(rprof.results)
    print("\n" + "Cache profiling results:")
    print(cprof.results[0])

    visualize([prof, rprof, cprof])

    list_load.append(end_time1 - start_time1)
    list_schedule.append(end_time2 - start_time2)
    list_compute.append(end_time3 - start_time3)
    list_total.append(end_time3 - start_time1)    

    print("\n" + "Temps du code pous analyse")
    print('load time: {}'.format(round(sum(list_load)/len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule)/len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute)/len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total)/len(list_total), 4)))
Example #8
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')

    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to hdf5 file (uncompressed)
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath, 'data', arr, chunks=None)

        print(
            f'time to save the array to hdf5 without compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)

    # write to hdf5 file (gzip-compressed)
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)

    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")

        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)
Example #9
def test_resource_profiler():
    with ResourceProfiler(dt=0.01) as rprof:
        out = get(dsk2, 'c')
    results = rprof.results
    assert all(isinstance(i, tuple) and len(i) == 3 for i in results)

    rprof.clear()
    assert rprof.results == []

    rprof.close()
    assert not rprof._tracker.is_alive()

    with pytest.raises(AssertionError):
        with rprof:
            get(dsk, 'e')
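Note: this snippet appears to come from an older dask test suite, where re-entering a ResourceProfiler after close() raised an AssertionError; compare Example #18 below, where the tracker is simply restarted.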
Example #10
def onthefly_to_nps():
    print('Writing to npy stack file without loading raw data in RAM.')

    out_dir = 'data/out_3_numpy'
    out_file_path = "outputs/write_npy_stack.html"

    # write to numpy stack
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        write_to_npy_stack(out_dir, arr)

        print(f'time to save the array to numpy stack: {time.time() - t}')
        visualize([prof, rprof, cprof], out_file_path)
Example #11
def test_resource_profiler_plot():
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')
    p = rprof.visualize(plot_width=500,
                        plot_height=300,
                        tools="hover",
                        title="Not the default",
                        show=False, save=False)
    assert p.plot_width == 500
    assert p.plot_height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert check_title(p, "Not the default")
    # Test empty, checking for errors
    rprof.clear()
    rprof.visualize(show=False, save=False)
Example #12
def test_plot_multiple():
    from dask.diagnostics.profile_visualize import visualize
    from bokeh.plotting import GridPlot
    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')
    p = visualize([prof, rprof], label_size=50,
                  title="Not the default", show=False, save=False)
    assert isinstance(p, GridPlot)
    assert len(p.children) == 2
    assert p.children[0][0].title == "Not the default"
    assert p.children[0][0].xaxis[0].axis_label is None
    assert p.children[1][0].title is None
    assert p.children[1][0].xaxis[0].axis_label == 'Time (s)'
    # Test empty, checking for errors
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
Example #13
    def execute(self, wf, client):
        if not wf.processes:
            return {}

        dsk = wf.convertGraph()

        with Profiler() as prof, ResourceProfiler(
                dt=0.25) as rprof, CacheProfiler() as cprof:
            result = client.get(dsk[0], dsk[1])

        msg.logMessage('result:', result, level=msg.DEBUG)
        path = user_config_dir('xicam/profile.html')
        visualize([prof, rprof, cprof], show=False, file_path=path)
        msg.logMessage(f'Profile saved: {path}')

        wf.lastresult = result

        return result
Example #14
def test_plot_multiple():
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")
    p = visualize(
        [prof, rprof], label_size=50, title="Not the default", show=False, save=False
    )
    figures = [r[0] for r in p.children[1].children]
    assert len(figures) == 2
    assert figures[0].title.text == "Not the default"
    assert figures[0].xaxis[0].axis_label is None
    assert figures[1].title is None
    assert figures[1].xaxis[0].axis_label == "Time (s)"
    # Test empty, checking for errors
    prof.clear()
    rprof.clear()
    visualize([prof, rprof], show=False, save=False)
Example #15
def uncompress_to_npy():
    print('Writing to numpy file after loading raw data in RAM.')
    out_filepath = 'data/out_1.npy'
    diagnostics_filepath = "outputs/load_raw_write_npy_file.html"

    raw_arr = uncompress()

    # write to numpy file
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        np.save(out_filepath, raw_arr)

        print(f'time to save the array to numpy file: {time.time() - t}')
        visualize([prof, rprof, cprof], diagnostics_filepath)
Example #16
def test_resource_profiler_multiple_gets():
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
        assert len(rprof.results) == 0
        get(dsk2, "c")
    results = rprof.results
    assert all(isinstance(i, tuple) and len(i) == 3 for i in results)

    rprof.clear()
    rprof.register()
    get(dsk2, "c")
    assert len(rprof.results) > 0
    get(dsk2, "c")
    rprof.unregister()

    results = rprof.results
    assert all(isinstance(i, tuple) and len(i) == 3 for i in results)

    rprof.close()
    assert not rprof._is_running()
Example #17
    def _execute_graph(self, *writes):
        # Set up Profilers and Progress Bars
        with ExitStack() as stack:
            profilers = []

            if can_profile:
                from dask.diagnostics import (Profiler, CacheProfiler,
                                              ResourceProfiler, visualize)

                profilers.append(stack.enter_context(Profiler()))
                profilers.append(stack.enter_context(CacheProfiler()))
                profilers.append(stack.enter_context(ResourceProfiler()))

            if sys.stdout.isatty() and not self.args.boring:
                from dask.diagnostics import ProgressBar
                stack.enter_context(ProgressBar())
            dask.compute(*writes, scheduler='single-threaded')
            logger.info("Averaging Complete")

        if can_profile:
            visualize(profilers)
Example #18
def test_resource_profiler():
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    results = rprof.results
    assert len(results) > 0
    assert all(isinstance(i, tuple) and len(i) == 3 for i in results)

    # Tracker stopped on exit
    assert not rprof._is_running()

    rprof.clear()
    assert rprof.results == []

    # Close is idempotent
    rprof.close()
    assert not rprof._is_running()

    # Restarts tracker if already closed
    with rprof:
        get(dsk2, "c")
    assert len(rprof.results) > 0
Example #19
def uncompress_to_nps():
    print('Writing to numpy stack after loading raw data in RAM.')

    # load data in RAM
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to numpy stack
    out_dir = 'data/out_numpy'

    out_file_path = "outputs/load_raw_write_npy_stack.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        write_to_npy_stack(out_dir, arr)

        print(f'time to save the array to numpy stack: {time.time() - t}')
        visualize([prof, rprof, cprof], out_file_path)
Example #20
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5 file
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    out_file_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath,
                   'data',
                   arr,
                   chunks=(1400, 1400, 350),
                   compression="gzip")

        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)
Example #21
    with CacheProfiler(nbytes) as cprof:
        get(dsk2, "c")

    results = cprof.results
    assert tics[-1] == len(results)
    assert tics[-1] == results[-1].metric
    assert cprof._metric_name == "nbytes"
    assert CacheProfiler(metric=nbytes, metric_name="foo")._metric_name == "foo"


@pytest.mark.parametrize(
    "profiler",
    [
        Profiler,
        pytest.param(
            lambda: ResourceProfiler(dt=0.01), marks=pytest.mark.skipif("not psutil")
        ),
        CacheProfiler,
    ],
)
def test_register(profiler):
    prof = profiler()
    try:
        prof.register()
        get(dsk2, "c")
        n = len(prof.results)
        assert n > 0
        get(dsk2, "c")
        assert len(prof.results) > n
    finally:
        prof.unregister()
Example #22
def initialize_dir(dirpath,
                   idpath,
                   meta_dict,
                   recurse=False,
                   IDR_data=False,
                   IDR_IDs=None,
                   ignore_old=True,
                   fname_prefix=None,
                   fname_suffix=None,
                   processes=None,
                   profiling=False,
                   verbose=False):
    """Intialize the data structure for a directory of new image stacks.
    
    This is a dask pipeline that applies the function `initialize_stack` from
    `katachi.tools.initialize` to an entire directory.
    
    See `katachi.tools.initialize.initialize_stack` for more information.
    
    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    idpath : string or None
        Path of the text file containing previously generated IDs.
        Necessary to ensure that newly generated IDs are unique.
    meta_dict : dict 
        A dictionary containing the initial (user-defined) metadata for the
        stack. See Notes below for the keys that must be included.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
        This is ignored if `IDR_data` is True, as recursing through subfolders
        is not supported on IDR data.
    IDR_data : bool, optional, default False
        If True, the data is expected to already be grouped into subdirectories
        named according to already assigned IDs, as this is how the data was
        deposited on the IDR database.
    IDR_IDs : list of IDs or None, optional, default None
        If IDR_data is True, a list of IDs can be passed to specify a subset of 
        samples for which this pipeline is to be run. 
    ignore_old : bool, optional, default True
        If True, files that already have a known ID listed in the ID file will
        be ignored. This is not supported for IDR data, so if IDR_data is True
        and ignore_old is True, an error is raised.
    fname_prefix : str or None, optional
        If not None, only file names that start with the given string are used.
    fname_suffix : str or None, optional
        If not None, only file names that end with the given string (or with
        the given string + .tif) are used.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (dask is not used).
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask]. 
    verbose : bool, optional, default False
        If True, more information is printed.
    
    Notes
    -----
    The meta_dict dictionary must contain the following entries:
    - 'channels'   : A list of strings naming the channels in order. Must not 
                     contain characters that cannot be used in file names.
    - 'resolution' : A list of floats denoting the voxel size of the input
                     stack in order ZYX.
    It may optionally contain other entries as well.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    if verbose: print "Detecting target files..."

    # Function to select file names and create paths
    def get_fnames_ready(fnames, fpath, known_ids=None):

        fnames = fnames[:]

        fnames = [fname for fname in fnames if fname.endswith(".tif")]

        if ignore_old:
            fnames = [
                fname for fname in fnames
                if not any([fname.startswith(ID) for ID in known_ids])
            ]

        if fname_prefix:
            fnames = [
                fname for fname in fnames if fname.startswith(fname_prefix)
            ]
        if fname_suffix:
            fnames = [
                fname for fname in fnames
                if fname.endswith(fname_suffix +
                                  ".tif") or fname.endswith(fname_suffix)
            ]

        fpaths = [os.path.join(fpath, fname) for fname in fnames]

        return fpaths

    # If this is run on IDR data, most of the work is already done!
    if IDR_data:

        # Handle inputs
        if ignore_old:
            raise IOError(
                "`ignore_old` is not supported for IDR data. Be " +
                "careful when running this so as to avoid over" +
                "writing important metadata. Aborting for now; set " +
                "`ignore_old` to False to prevent this error.")
        if IDR_IDs is None:
            IDR_IDs = [
                ID for ID in os.listdir(dirpath)
                if os.path.isdir(ID) and len(ID) == 10
            ]

        # Write the metadata files; all else is already done
        if verbose: print "Creating metadata files for IDR data..."
        for ID in IDR_IDs:
            meta_path = os.path.join(dirpath, ID, ID + '_stack_metadata.pkl')
            with open(meta_path, 'wb') as outfile:
                pickle.dump(meta_dict, outfile, pickle.HIGHEST_PROTOCOL)
        if verbose: print "Processing complete!"
        return

    # If needed, load previously generated IDs (to exclude those files)
    if ignore_old:
        try:
            with open(idpath, "r") as infile:
                known_ids = [line.strip() for line in infile.readlines()]
        except:
            print("Attempting to load existing IDs from id_file failed " +
                  "with this error:")
            raise
    else:
        known_ids = None

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = get_fnames_ready(fnames, dirpath, known_ids=known_ids)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += get_fnames_ready(fnames, dpath, known_ids)

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Report
    if verbose:
        print "-- Detected", len(fpaths), "target files."

    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask)

    if processes == 1:
        if verbose: print "Processing target files sequentially..."
        for fpath in fpaths:
            initialize_stack(fpath, idpath, meta_dict, verbose=False)
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    if verbose: print "Processing target files in parallel..."

    dask_graph = dict()
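    # Each entry maps a key to a (callable, *args) tuple, the plain dict
    # task-graph format consumed by dask.threaded.get below.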
    for i, fpath in enumerate(fpaths):
        dask_graph["initialize_%i" % i] = (initialize_stack, fpath, idpath,
                                           meta_dict, False)
    dask_graph['done'] = (lambda x: "done",
                          ["initialize_%i" % i for i in range(len(fpaths))])

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
            visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
Example #23
sys.path.append('/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan')

from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching

from dask import multiprocessing, threaded
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle
filename='./profres_exp_mt_dblp_300k_extractfeatvecs.html'

pbar = ProgressBar()
pbar.register()

#print("Mem. usage before reading:{0}".format( psutil.virtual_memory().used/1e9))
A = pd.read_csv('./datasets/sample_citeseer_300k.csv')
B = pd.read_csv('./datasets/sample_dblp_300k.csv')
#print("Mem. usage after reading:{0}".format(psutil.virtual_memory().used/1e9))

C = pd.read_csv('./datasets/candset.csv')

feature_table = get_features_for_matching(A, B)

feature_vecs = extract_feature_vecs(C, A, B, '_id', 'l_id',  'r_id', 'id', 'id', feature_table=feature_table,
        nchunks=4, compute=False)

with Profiler() as prof, CacheProfiler() as cprof, ResourceProfiler(dt=0.25) as rprof:
    D = feature_vecs.compute(get=threaded.get, num_workers=4)


visualize([prof, cprof, rprof], file_path=filename, show=False)
Example #24
def _main(args):
    tic = time.time()

    log.info(banner())

    if args.disable_post_mortem:
        log.warn("Disabling crash debugging with the "
                 "Interactive Python Debugger, as per user request")
        post_mortem_handler.disable_pdb_on_error()

    log.info("Flagging on the {0:s} column".format(args.data_column))
    data_column = args.data_column
    masked_channels = [
        load_mask(fn, dilate=args.dilate_masks) for fn in collect_masks()
    ]
    GD = args.config

    log_configuration(args)

    # Group datasets by these columns
    group_cols = ["FIELD_ID", "DATA_DESC_ID", "SCAN_NUMBER"]
    # Index datasets by these columns
    index_cols = ['TIME']

    # Reopen the datasets using the aggregated row ordering
    columns = [data_column, "FLAG", "TIME", "ANTENNA1", "ANTENNA2"]

    if args.subtract_model_column is not None:
        columns.append(args.subtract_model_column)

    xds = list(
        xds_from_ms(args.ms,
                    columns=tuple(columns),
                    group_cols=group_cols,
                    index_cols=index_cols,
                    chunks={"row": args.row_chunks}))

    # Get support tables
    st = support_tables(args.ms)
    ddid_ds = st["DATA_DESCRIPTION"]
    field_ds = st["FIELD"]
    pol_ds = st["POLARIZATION"]
    spw_ds = st["SPECTRAL_WINDOW"]
    ant_ds = st["ANTENNA"]

    assert len(ant_ds) == 1
    assert len(ddid_ds) == 1

    antspos = ant_ds[0].POSITION.data
    antsnames = ant_ds[0].NAME.data
    fieldnames = [fds.NAME.data[0] for fds in field_ds]

    avail_scans = [ds.SCAN_NUMBER for ds in xds]
    args.scan_numbers = list(
        set(avail_scans).intersection(args.scan_numbers if args.scan_numbers
                                      is not None else avail_scans))

    if args.scan_numbers != []:
        log.info("Only considering scans '{0:s}' as "
                 "per user selection criterion".format(", ".join(
                     map(str, map(int, args.scan_numbers)))))

    if args.field_names != []:
        flatten_field_names = []
        for f in args.field_names:
            # accept comma lists per specification
            flatten_field_names += [x.strip() for x in f.split(",")]
        for f in flatten_field_names:
            if re.match(r"^\d+$", f) and int(f) < len(fieldnames):
                flatten_field_names.append(fieldnames[int(f)])
        flatten_field_names = list(
            set(
                filter(lambda x: not re.match(r"^\d+$", x),
                       flatten_field_names)))
        log.info("Only considering fields '{0:s}' for flagging per "
                 "user "
                 "selection criterion.".format(", ".join(flatten_field_names)))
        if not set(flatten_field_names) <= set(fieldnames):
            raise ValueError("One or more fields cannot be "
                             "found in dataset '{0:s}' "
                             "You specified {1:s}, but "
                             "only {2:s} are available".format(
                                 args.ms, ",".join(flatten_field_names),
                                 ",".join(fieldnames)))

        field_dict = {fieldnames.index(fn): fn for fn in flatten_field_names}
    else:
        field_dict = {i: fn for i, fn in enumerate(fieldnames)}

    # List which hold our dask compute graphs for each dataset
    write_computes = []
    original_stats = []
    final_stats = []

    # Iterate through each dataset
    for ds in xds:
        if ds.FIELD_ID not in field_dict:
            continue

        if (args.scan_numbers is not None
                and ds.SCAN_NUMBER not in args.scan_numbers):
            continue

        log.info("Adding field '{0:s}' scan {1:d} to "
                 "compute graph for processing".format(field_dict[ds.FIELD_ID],
                                                       ds.SCAN_NUMBER))

        ddid = ddid_ds[ds.attrs['DATA_DESC_ID']]
        spw_info = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol_info = pol_ds[ddid.POLARIZATION_ID.data[0]]

        nrow, nchan, ncorr = getattr(ds, data_column).data.shape

        # Visibilities from the dataset
        vis = getattr(ds, data_column).data
        if args.subtract_model_column is not None:
            log.info("Forming residual data between '{0:s}' and "
                     "'{1:s}' for flagging.".format(
                         data_column, args.subtract_model_column))
            vismod = getattr(ds, args.subtract_model_column).data
            vis = vis - vismod

        antenna1 = ds.ANTENNA1.data
        antenna2 = ds.ANTENNA2.data
        chan_freq = spw_info.CHAN_FREQ.data[0]
        chan_width = spw_info.CHAN_WIDTH.data[0]

        # Generate unflagged defaults if we should ignore existing flags
        # otherwise take flags from the dataset
        if args.ignore_flags is True:
            flags = da.full_like(vis, False, dtype=np.bool)
            log.critical("Completely ignoring measurement set "
                         "flags as per '-if' request. "
                         "Strategy WILL NOT or with original flags, even if "
                         "specified!")
        else:
            flags = ds.FLAG.data

        # If we're flagging on polarised intensity,
        # we convert visibilities to polarised intensity
        # and any flagged correlation will flag the entire visibility
        if args.flagging_strategy == "polarisation":
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items() if k != "I")
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "total_power":
            if args.subtract_model_column is None:
                log.critical("You requested to flag total quadrature "
                             "power, but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of "
                             "off-axis sources for broadband RFI.")
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items())
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "standard":
            if args.subtract_model_column is None:
                log.critical("You requested to flag per correlation, "
                             "but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of off-axis sources "
                             "for broadband RFI.")
        else:
            raise ValueError("Invalid flagging strategy '%s'" %
                             args.flagging_strategy)

        ubl = unique_baselines(antenna1, antenna2)
        utime, time_inv = da.unique(ds.TIME.data, return_inverse=True)
        utime, ubl = dask.compute(utime, ubl)
        ubl = ubl.view(np.int32).reshape(-1, 2)
        # Stack the baseline index with the unique baselines
        bl_range = np.arange(ubl.shape[0], dtype=ubl.dtype)[:, None]
        ubl = np.concatenate([bl_range, ubl], axis=1)
        ubl = da.from_array(ubl, chunks=(args.baseline_chunks, 3))

        vis_windows, flag_windows = pack_data(time_inv,
                                              ubl,
                                              antenna1,
                                              antenna2,
                                              vis,
                                              flags,
                                              utime.shape[0],
                                              backend=args.window_backend,
                                              path=args.temporary_directory)

        original_stats.append(
            window_stats(flag_windows, ubl, chan_freq, antsnames,
                         ds.SCAN_NUMBER, field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        with StrategyExecutor(antspos, ubl, chan_freq, chan_width,
                              masked_channels, GD['strategies']) as se:

            flag_windows = se.apply_strategies(flag_windows, vis_windows)

        final_stats.append(
            window_stats(flag_windows, ubl, chan_freq, antsnames,
                         ds.SCAN_NUMBER, field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        # Unpack window data for writing back to the MS
        unpacked_flags = unpack_data(antenna1, antenna2, time_inv, ubl,
                                     flag_windows)

        # Flag entire visibility if any correlations are flagged
        equalized_flags = da.sum(unpacked_flags, axis=2, keepdims=True) > 0
        corr_flags = da.broadcast_to(equalized_flags, (nrow, nchan, ncorr))

        if corr_flags.chunks != ds.FLAG.data.chunks:
            raise ValueError("Output flag chunking does not "
                             "match input flag chunking")

        # Create new dataset containing new flags
        new_ds = ds.assign(FLAG=(("row", "chan", "corr"), corr_flags))

        # Write back to original dataset
        writes = xds_to_table(new_ds, args.ms, "FLAG")
        # original should also have .compute called because we need stats
        write_computes.append(writes)

    if len(write_computes) > 0:
        # Combine stats from all datasets
        original_stats = combine_window_stats(original_stats)
        final_stats = combine_window_stats(final_stats)

        with contextlib.ExitStack() as stack:
            # Create dask profiling contexts
            profilers = []

            if can_profile:
                profilers.append(stack.enter_context(Profiler()))
                profilers.append(stack.enter_context(CacheProfiler()))
                profilers.append(stack.enter_context(ResourceProfiler()))

            if sys.stdout.isatty():
                # Interactive terminal, default ProgressBar
                stack.enter_context(ProgressBar())
            else:
                # Non-interactive, emit a bar every 5 minutes so
                # as not to spam the log
                stack.enter_context(ProgressBar(minimum=1, dt=5 * 60))

            _, original_stats, final_stats = dask.compute(
                write_computes, original_stats, final_stats)
        if can_profile:
            visualize(profilers)

        toc = time.time()

        # Log each summary line
        for line in summarise_stats(final_stats, original_stats):
            log.info(line)

        elapsed = toc - tic
        log.info("Data flagged successfully in "
                 "{0:02.0f}h{1:02.0f}m{2:02.0f}s".format((elapsed // 60) // 60,
                                                         (elapsed // 60) % 60,
                                                         elapsed % 60))
    else:
        log.info("User data selection criteria resulted in empty dataset. "
                 "Nothing to be done. Bye!")
Example #25
    def compute(self, **kwargs):
        with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
            self._computed_result = dask.compute(self._result, **kwargs)[0]
            self._prof = prof
            self._rprof = rprof
            self._cprof = cprof
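A hedged sketch of how the profilers stashed on `self` above could later be turned into a single report (the method name `visualize_profile` is illustrative; `dask.diagnostics.visualize` accepts a list of profilers, as the other examples here show):

    def visualize_profile(self, file_path="profile.html"):
        # Render the profilers captured by compute() into one Bokeh document.
        # Assumes compute() has already been called on this object.
        from dask.diagnostics import visualize
        return visualize([self._prof, self._rprof, self._cprof],
                         file_path=file_path, show=False)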
Example #26
def main(cfgfile, starttime=None, endtime=None, trajfile="", trajtype='plane',
         flashnr=0, infostr="", MULTIPROCESSING_DSET=False,
         MULTIPROCESSING_PROD=False, PROFILE_MULTIPROCESSING=False):
    """
    Main flow control. Processes radar data off-line over a period of time
    given either by the user, a trajectory file, or determined by the last
    volume processed and the current time. Multiple radars can be processed
    simultaneously

    Parameters
    ----------
    cfgfile : str
        path of the main config file
    starttime, endtime : datetime object
        start and end time of the data to be processed
    trajfile : str
        path to file describing the trajectory
    trajtype : str
        type of trajectory file. Can be either 'plane' or 'lightning'
    flashnr : int
        If larger than 0 will select a flash in a lightning trajectory file.
        If 0 the data corresponding to the trajectory of all flashes will be
        plotted
    infostr : str
        Information string about the actual data processing
        (e.g. 'RUN57'). This string is added to product files.
    MULTIPROCESSING_DSET : Bool
        If true the generation of datasets at the same processing level will
        be parallelized
    MULTIPROCESSING_PROD : Bool
        If true the generation of products from each dataset will be
        parallelized
    PROFILE_MULTIPROCESSING : Bool
        If true and code parallelized the multiprocessing is profiled

    """
    print("- PYRAD version: %s (compiled %s by %s)" %
          (pyrad_version.version, pyrad_version.compile_date_time,
           pyrad_version.username))
    print("- PYART version: " + pyart_version.version)

    # Define behaviour of warnings
    warnings.simplefilter('always')  # always print matching warnings
    # warnings.simplefilter('error')  # turn matching warnings into exceptions
    warnings.formatwarning = _warning_format  # define format

    if ALLOW_USER_BREAK:
        input_queue = _initialize_listener()

    if not _DASK_AVAILABLE:
        MULTIPROCESSING_DSET = False
        MULTIPROCESSING_PROD = False
        PROFILE_MULTIPROCESSING = False

    # check if multiprocessing profiling is necessary
    if not MULTIPROCESSING_DSET and not MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False
    elif MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False

    if MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        # necessary to launch tasks from tasks
        Client()

    if PROFILE_MULTIPROCESSING:
        prof = Profiler()
        rprof = ResourceProfiler()
        cprof = CacheProfiler()

        prof.register()
        rprof.register()
        cprof.register()

    cfg = _create_cfg_dict(cfgfile)
    datacfg = _create_datacfg_dict(cfg)

    starttime, endtime, traj = _get_times_and_traj(
        trajfile, starttime, endtime, cfg['ScanPeriod'],
        last_state_file=cfg['lastStateFile'], trajtype=trajtype,
        flashnr=flashnr)

    if infostr:
        print('- Info string : ' + infostr)

    # get data types and levels
    datatypesdescr_list = list()
    for i in range(1, cfg['NumRadars']+1):
        datatypesdescr_list.append(
            _get_datatype_list(cfg, radarnr='RADAR'+'{:03d}'.format(i)))

    dataset_levels = _get_datasets_list(cfg)

    masterfilelist, masterdatatypedescr, masterscan = _get_masterfile_list(
        datatypesdescr_list[0], starttime, endtime, datacfg,
        scan_list=datacfg['ScanList'])

    nvolumes = len(masterfilelist)
    if nvolumes == 0:
        raise ValueError(
            "ERROR: Could not find any valid volumes between " +
            starttime.strftime('%Y-%m-%d %H:%M:%S') + " and " +
            endtime.strftime('%Y-%m-%d %H:%M:%S') + " for " +
            "master scan '" + str(masterscan) +
            "' and master data type '" + masterdatatypedescr +
            "'")
    print('- Number of volumes to process: ' + str(nvolumes))
    print('- Start time: ' + starttime.strftime("%Y-%m-%d %H:%M:%S"))
    print('- end time: ' + endtime.strftime("%Y-%m-%d %H:%M:%S"))

    # initial processing of the datasets
    print('\n\n- Initializing datasets:')
    dscfg, traj = _initialize_datasets(
        dataset_levels, cfg, traj=traj, infostr=infostr)

    # process all data files in file list or until user interrupts processing
    for masterfile in masterfilelist:
        if ALLOW_USER_BREAK:
            # check if user has requested exit
            try:
                input_queue.get_nowait()
                warn('Program terminated by user')
                break
            except queue.Empty:
                pass

        print('\n- master file: ' + os.path.basename(masterfile))

        master_voltime = get_datetime(masterfile, masterdatatypedescr)

        radar_list = _get_radars_data(
            master_voltime, datatypesdescr_list, datacfg,
            num_radars=datacfg['NumRadars'])

        # process all data sets
        dscfg, traj = _process_datasets(
            dataset_levels, cfg, dscfg, radar_list, master_voltime, traj=traj,
            infostr=infostr, MULTIPROCESSING_DSET=MULTIPROCESSING_DSET,
            MULTIPROCESSING_PROD=MULTIPROCESSING_PROD)

        # delete variables
        del radar_list

        gc.collect()

    # post-processing of the datasets
    print('\n\n- Post-processing datasets:')
    dscfg, traj = _postprocess_datasets(
        dataset_levels, cfg, dscfg, traj=traj, infostr=infostr)

    if PROFILE_MULTIPROCESSING:
        prof.unregister()
        rprof.unregister()
        cprof.unregister()

        bokeh_plot = visualize([prof, rprof, cprof], show=False, save=False)

        profile_path = os.path.expanduser('~')+'/profiling/'
        if not os.path.isdir(profile_path):
            os.makedirs(profile_path)

        export_png(bokeh_plot, filename=(
            profile_path+datetime.utcnow().strftime('%Y%m%d%H%M%S') +
            '_profile.png'))

    print('- This is the end my friend! See you soon!')
Example #27
File: emd_sum.py, Project: sk1p/LiberTEM
import sys
import time
import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool
import hyperspy.api as hs

emd_filename_list = sys.argv[1:]
emd_filename_list.sort()

with dask.set_options(
        pool=ThreadPool(8)), Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0
        print(delta)
        print(f"{s.data.nbytes / delta / 1024 / 1024} MB/s")

visualize([prof, rprof, cprof])
Example #28
def feature_extraction(dirpath,
                       suffix_seg,
                       suffix_int,
                       num_LMs,
                       downsample,
                       clustering,
                       features,
                       recurse=False,
                       select_IDs='all',
                       assign_landmarks_kwargs='default',
                       compute_TFOR=True,
                       transform_to_TFOR_kwargs='default',
                       perform_CBE_TFOR_kwargs='default',
                       compute_CFOR=True,
                       perform_CBE_CFOR_kwargs='default',
                       processes=None,
                       dask_graph_path=None,
                       profiling=False,
                       verbose=False):
    """Extract latent features from fluorescence distributions of single-cell
    segmentations by point cloud sampling and cluster-based embedding.

    This is a dask pipeline that applies point-cloud sampling from
    `katachi.tools.assign_landmarks`, transformation to the TFOR (optional)
    from `katachi.tools.find_TFOR` and cluster-based embedding (either on TFOR
    data or by constructing a CFOR, or both) from `katachi.tools.perform_CBE`
    to a dataset of single-cell segmentations that has been generated by
    `katachi.pipelines.segmentation` or an equivalent approach.

    WARNING: Not all options provided by this pipeline have been extensively
    tested. Use with prudence!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    suffix_seg : string
        File suffix that identifies target segmentation files as produced by
        `katachi.pipelines.segmentation`. This will usually be "seg.tif" but
        could contain more information to distinguish different segmentations.
    suffix_int : string
        File suffix that identifies target intensity files matching the shape
        of the target segmentation files. Each retrieved segmentation file must
        have a matching intensity file.
    num_LMs : int
        The number of landmarks to extract for each cell.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction.
        See `katachi.tools.perform_CBE` for more information.
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction.
        See `katachi.tools.perform_CBE` for more information.
        Special case: both elements of clustering (i.e. `algorithm` and
        `n_clusters`) may themselves be tuples. In this case, their first and
        second elements will be used in CBE on TFOR and CFOR, respectively.
    features : list of strings
        List containing any number of cluster features to be extracted.
        See `katachi.tools.perform_CBE` for more information.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
    select_IDs : 'all' or list of strings, optional, default 'all'
        If 'all' (default), all detected input files (i.e. all samples) are
        used. Instead, a list of strings containing IDs (as assigned by
        `katachi.tools.initialize`) can be passed, in which case only samples
        whose IDs are in the list are used. If there are IDs in the list for
        which no matching files were found, a warning is shown.
    assign_landmarks_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for assign_landmarks function.
        See `katachi.tools.assign_landmarks.assign_landmarks` for information
        about available options.
        See section "Prepare kwargs for landmark assignment" in this function
        for information on default settings.
    compute_TFOR : bool, optional, default True
        If True, the prim frame of reference is computed and CBE is performed
        on the TFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    transform_to_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for transform_to_TFOR function.
        See `katachi.tools.find_TFOR.transform_to_TFOR` for information
        about available options.
        See section "Prepare kwargs for transformation to TFOR" in this
        function for information on default settings.
    perform_CBE_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to TFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on TFOR" in this function for
        information on default settings.
    compute_CFOR : bool, optional, default True
        If True, the cell frame of reference is computed and CBE is performed
        on the CFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    perform_CBE_CFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to CFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on CFOR" in this function for
        information on default settings.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (but dask is still required for CBE!).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # Function to select pairs of files (seg, dir) and create paths
    def prepare_fpaths(dirpath, fnames):

        # Find segmentation files
        seg_names = [
            fname for fname in fnames if fname.endswith(suffix_seg + ".tif")
        ]

        # Exclude files not in select_IDs
        if not select_IDs == 'all':
            seg_names = [
                fname for fname in seg_names
                if any([fname.startswith(ID) for ID in select_IDs])
            ]

        # Get IDs
        seg_IDs = [fname[:10] for fname in seg_names]

        # Get matching intensity files
        int_names = []
        for ID in seg_IDs:
            int_name = [
                fname for fname in fnames
                if fname.startswith(ID) and fname.endswith(suffix_int + ".tif")
            ]
            try:
                int_names.append(int_name[0])
            except IndexError:
                raise IOError("Could not find matching intensity file for " +
                              "segmentation file with ID " + ID)

        # Create path
        seg_paths = [os.path.join(dirpath, name) for name in seg_names]
        int_paths = [os.path.join(dirpath, name) for name in int_names]

        # Return results
        return [(seg_paths[i], int_paths[i]) for i in range(len(seg_paths))]

    # Remove .tif if it was specified with the suffix
    if suffix_seg.endswith(".tif"): suffix_seg = suffix_seg[:-4]
    if suffix_int.endswith(".tif"): suffix_int = suffix_int[:-4]

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = prepare_fpaths(dirpath, fnames)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += prepare_fpaths(dpath, fnames)

    # Test if all samples in select_IDs are present
    if not select_IDs == 'all':
        fpaths_IDs = [os.path.split(fp[0])[1][:10] for fp in fpaths]
        orphan_IDs = [ID for ID in select_IDs if ID not in fpaths_IDs]
        if any(orphan_IDs):
            warn(
                "No matching files found for some of the IDs in select_IDs: " +
                ", ".join(orphan_IDs))

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Handle processes
    if processes is None:
        processes = cpu_count() // 2

    # More checks
    if not compute_TFOR and not compute_CFOR:
        raise IOError("At least one of compute_TFOR or compute_CFOR must be " +
                      "set to True.")

    # Report
    if verbose:
        print "Detected", len(fpaths), "target file pairs."

    #--------------------------------------------------------------------------

    ### Prepare kwargs for landmark assignment

    # Default kwargs for landmark assignment
    la_kwargs = dict()
    la_kwargs['save_centroids'] = True
    la_kwargs['fpath_out'] = None
    la_kwargs['show_cells'] = None
    la_kwargs['verbose'] = False
    la_kwargs['global_prep_func'] = None
    la_kwargs['global_prep_params'] = None
    la_kwargs['local_prep_func'] = None
    la_kwargs['local_prep_params'] = None
    la_kwargs['landmark_func'] = 'default'
    la_kwargs['landmark_func_params'] = None

    # User-specified kwargs for landmark assignment
    if assign_landmarks_kwargs != 'default':
        for kw in assign_landmarks_kwargs.keys():
            la_kwargs[kw] = assign_landmarks_kwargs[kw]

    # Safety check
    if la_kwargs['fpath_out'] is not None:
        raise IOError(
            "`assign_landmarks_kwargs['fpath_out']` must be set to " +
            "`None`, otherwise files will overwrite each other.")

    #--------------------------------------------------------------------------

    ### Prepare kwargs for TFOR transformation

    # Default kwargs for transformation to TFOR
    TFOR_kwargs = dict()
    TFOR_kwargs['n_points'] = 3000
    TFOR_kwargs['verbose'] = False
    TFOR_kwargs['show'] = False

    # User-specified kwargs for TFOR
    if transform_to_TFOR_kwargs != 'default':
        for kw in transform_to_TFOR_kwargs.keys():
            TFOR_kwargs[kw] = transform_to_TFOR_kwargs[kw]

    # Safety check
    if not compute_TFOR and transform_to_TFOR_kwargs != 'default':
        warn("Non-default kwargs were passed for transformation to TFOR but " +
             "compute_TFOR is set to False!")

    #--------------------------------------------------------------------------

    ### Prepare args for CBE

    # Handle differing clustering inputs for TFOR and CFOR
    if type(clustering[0]) == tuple:
        clustering_TFOR = (clustering[0][0], clustering[1][0])
        clustering_cfor = (clustering[0][1], clustering[1][1])
    else:
        clustering_TFOR = clustering_cfor = clustering

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on TFOR

    # Default kwargs for CBE
    cbe_TFOR_kwargs = dict()
    cbe_TFOR_kwargs['normalize_vol'] = None
    cbe_TFOR_kwargs['presample'] = None
    cbe_TFOR_kwargs['cfor'] = None
    cbe_TFOR_kwargs['standardize'] = False
    cbe_TFOR_kwargs['custom_feature_funcs'] = None
    cbe_TFOR_kwargs['dask_graph_path'] = None
    cbe_TFOR_kwargs['processes'] = processes
    cbe_TFOR_kwargs['profiling'] = False
    cbe_TFOR_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_TFOR_kwargs['save_metadata'] = True
    cbe_TFOR_kwargs['save_presampled'] = False
    cbe_TFOR_kwargs['save_cfor'] = False
    cbe_TFOR_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_TFOR_kwargs != 'default':
        cbe_TFOR_kwargs.update(perform_CBE_TFOR_kwargs)

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on CFOR

    # Default kwargs for CBE
    cbe_cfor_kwargs = dict()
    cbe_cfor_kwargs['normalize_vol'] = True
    cbe_cfor_kwargs['presample'] = None
    cbe_cfor_kwargs['cfor'] = ('PD', 3)
    cbe_cfor_kwargs['standardize'] = True
    cbe_cfor_kwargs['custom_feature_funcs'] = None
    cbe_cfor_kwargs['dask_graph_path'] = None
    cbe_cfor_kwargs['processes'] = processes
    cbe_cfor_kwargs['profiling'] = False
    cbe_cfor_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_cfor_kwargs['save_metadata'] = True
    cbe_cfor_kwargs['save_presampled'] = False
    cbe_cfor_kwargs['save_cfor'] = True
    cbe_cfor_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_CFOR_kwargs != 'default':
        cbe_cfor_kwargs.update(perform_CBE_CFOR_kwargs)

    #--------------------------------------------------------------------------

    ### If desired: run sequentially

    if processes == 1:

        if verbose: print "Processing target file pairs sequentially..."

        # Landmark extraction
        if verbose: print "--Assigning landmarks..."
        fpaths_lm = []
        for seg_path, int_path in fpaths:
            assign_landmarks(seg_path, int_path, num_LMs, **la_kwargs)
            fpaths_lm.append((seg_path, int_path[:-4] + "_LMs.npy"))

        # Computing the TFOR and performing CBE on TFOR
        if compute_TFOR:

            # Run the transformation to TFOR
            if verbose: print "--Transforming to TFOR..."
            fpaths_TFOR = []
            for seg_path, lm_path in fpaths_lm:
                transform_to_TFOR(seg_path, lm_path, **TFOR_kwargs)
                fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

            # Performing CBE on TFOR
            if verbose: print "--Performing CBE on TFOR..."
            cbe(fpaths_TFOR, downsample, clustering_TFOR, features,
                **cbe_TFOR_kwargs)

        # Performing CBE on CFOR
        if compute_CFOR:
            if verbose: print "--Performing CBE on CFOR..."
            lm_paths = [fpath[1] for fpath in fpaths_lm]
            cbe(lm_paths, downsample, clustering_cfor, features,
                **cbe_cfor_kwargs)

        # Done
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    dask_graph = dict()

    # For each input...
    fpaths_lm = []
    fpaths_TFOR = []
    for idx, fpath in enumerate(fpaths):

        # Landmark extraction nodes
        seg_path, int_path = fpath
        asgn_lms = partial(assign_landmarks, **la_kwargs)
        dask_graph["asgn_lms_%i" % idx] = (asgn_lms, seg_path, int_path,
                                           num_LMs)
        lm_path = int_path[:-4] + "_LMs.npy"
        fpaths_lm.append(lm_path)

        # Transform to TFOR
        if compute_TFOR:

            # Transform to TFOR
            tf2TFOR = partial(transform_to_TFOR, **TFOR_kwargs)
            tf2TFOR_await = lambda _, s, lmp: tf2TFOR(s, lmp)
            dask_graph["tf2TFOR_%i" % idx] = (tf2TFOR_await,
                                              "asgn_lms_%i" % idx, seg_path,
                                              lm_path)
            fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

    # Perform CBE on TFOR
    if compute_TFOR:
        cbe_TFOR = partial(cbe, **cbe_TFOR_kwargs)
        cbe_TFOR_await = lambda _, lmp, ds, cl, fe: cbe_TFOR(lmp, ds, cl, fe)
        dask_graph["CBE_TFOR"] = (cbe_TFOR_await, [
            "tf2TFOR_%i" % idx for idx in range(len(fpaths))
        ], fpaths_TFOR, downsample, clustering_TFOR, features)

    # Perform CBE on CFOR
    if compute_CFOR:

        cbe_cfor = partial(cbe, **cbe_cfor_kwargs)
        cbe_cfor_await = lambda _, lmp, ds, cl, fe: cbe_cfor(lmp, ds, cl, fe)

        # Don't parallelize CBEs; wait for TFOR-CBE to finish
        if compute_TFOR:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, "CBE_TFOR", fpaths_lm,
                                      downsample, clustering_cfor, features)
        else:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, [
                "asgn_lms_%i" % idx for idx in range(len(fpaths))
            ], fpaths_lm, downsample, clustering_cfor, features)

    # Render the dask graph to a file (optional)
    if dask_graph_path is not None:
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)
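    # For reference, a minimal sketch of the dependency ("await") pattern used
    # in the graph above, with hypothetical toy tasks: each task is a tuple of
    # (callable, *args), and naming another key among the args makes dask
    # compute that key first; the wrapper lambdas simply discard its value.
    #
    #   toy_graph = {
    #       'first'  : (lambda x: x + 1, 41),
    #       'second' : (lambda _, y: y * 2, 'first', 10),  # waits for 'first'
    #   }
    #   dask.threaded.get(toy_graph, 'second')  # -> 20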

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # Report
    if verbose: print "Processing target file pairs in parallel..."

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))
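    # NOTE: `dask.set_options` was removed in later dask releases; on newer
    # versions the rough equivalent is expected to be
    # `dask.config.set(pool=ThreadPool(processes))`; this is an assumption,
    # so check against the installed dask version.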

    # Run the pipeline (no profiling)
    if not profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_CFOR')
        else:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_TFOR')

    # Run the pipeline (with resource profiling)
    if profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_CFOR')
                visualize([prof, rprof], save=False)
        else:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_TFOR')
                visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
Example #29
0
import h5py
import numpy as np
import dask.array as da
from dask.diagnostics import Profiler, ResourceProfiler, ProgressBar, visualize


def corrcoef(b1, b2, block_shape):
    # NOTE: the imports and this signature are reconstructed (assumed) from
    # the calls in the __main__ block below; ``block_shape`` is passed through
    # by map_blocks but unused in the code shown here.
    # subtract mean
    axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:])
    b1 -= b1.mean(axis=axes, keepdims=True)
    b2 -= b2.mean(axis=axes, keepdims=True)
    # numerator of corrcoef
    numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False)
    # denominator of corrcoef
    dof = np.prod(b1.shape[axes[0]:axes[-1] + 1])
    b1_std = np.sqrt((b1**2).mean(axis=axes, keepdims=False) / dof)
    b2_std = np.sqrt((b2**2).mean(axis=axes, keepdims=False) / dof)
    denominator = np.multiply(b1_std, b2_std)
    # divide
    out = np.divide(numerator, denominator)
    return out
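
# Note on the driver code below: da.map_blocks applies ``corrcoef`` to
# corresponding blocks of ``arr1`` and ``arr2`` and passes ``block_shape``
# through as an extra positional argument; ``chunks=(400, 400)`` declares the
# expected output chunking, since the reduction inside ``corrcoef`` changes
# the block shape (a reading of the original intent, not stated in the source).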


if __name__ == '__main__':
    f1 = h5py.File("test.h5", "r")
    f2 = h5py.File("test2.h5", "r")
    arr1 = da.from_array(f1["arr"])
    arr2 = da.from_array(f2["arr"])

    block_shape = (10, 10)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof,\
            ProgressBar():
        out = da.map_blocks(corrcoef, arr1, arr2, block_shape,
                chunks=(400, 400))
        da.to_hdf5("out.h5", "/arr", out)
    visualize([prof, rprof])

    tics = [0]

    def nbytes(res):
        tics[0] += 1
        return tics[0]

    with CacheProfiler(nbytes) as cprof:
        get(dsk2, 'c')

    results = cprof.results
    assert tics[-1] == len(results)
    assert tics[-1] == results[-1].metric
    assert cprof._metric_name == 'nbytes'
    assert CacheProfiler(metric=nbytes, metric_name='foo')._metric_name == 'foo'


@pytest.mark.parametrize(
    'profiler',
    [Profiler,
     pytest.param(lambda: ResourceProfiler(dt=0.01),
                  marks=pytest.mark.skipif("not psutil")),
     CacheProfiler])
def test_register(profiler):
    prof = profiler()
    try:
        prof.register()
        get(dsk2, 'c')
        n = len(prof.results)
        assert n > 0
        get(dsk2, 'c')
        assert len(prof.results) > n
    finally:
        prof.unregister()
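

# A minimal sketch (not part of the test above) of the global registration
# pattern that ``test_register`` exercises: once registered, a profiler keeps
# recording across scheduler calls until it is unregistered.
#
#   prof = Profiler()
#   prof.register()
#   get(dsk2, 'c')       # recorded
#   get(dsk2, 'c')       # recorded as well
#   prof.unregister()
#   prof.visualize(show=False, save=False)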