def profile(self, *dfs, **kwargs):
    """Profile the memory growth of ``self.func`` over repeated runs.

    ``self.func(*dfs, **kwargs)`` is executed ``self.repetitions`` times
    inside a ``ResourceProfiler`` that samples every ``self.interval``
    seconds. For each run the smallest and the largest sampled ``mem``
    value are recorded; their difference is that run's measurement.

    Parameters
    ----------
    *dfs
        Input dataframes forwarded positionally to ``self.func``. When
        ``self.cache_input`` is truthy they are first passed through
        ``self._cache_input`` and handed to ``self._clear_cached_input``
        once profiling is over.
    **kwargs
        Keyword arguments forwarded to ``self.func``.

    Returns
    -------
    self
        With ``_max_usages``, ``_baselines`` and ``_measurements`` set.
    """
    if self.cache_input:
        dfs = self._cache_input(dfs)

    try:
        baselines = []
        max_usages = []
        for _ in range(self.repetitions):
            # Collect garbage first so leftovers of the previous run do not
            # inflate this run's samples.
            gc.collect()
            with ResourceProfiler(dt=self.interval) as rprof:
                self.func(*dfs, **kwargs)
            mem_usages = [x.mem for x in rprof.results]
            baselines.append(np.min(mem_usages))
            max_usages.append(np.max(mem_usages))

        self._max_usages = max_usages
        self._baselines = baselines
        self._measurements = np.subtract(max_usages, baselines).tolist()
    finally:
        # Release the cached input even when the profiled function raises.
        if self.cache_input:
            self._clear_cached_input(dfs)
    return self
def test_no_delay_during_large_transfer(c, s, w):
    """Scatter a large array; ticks must stay short and memory bounded."""
    pytest.importorskip('crick')
    np = pytest.importorskip('numpy')
    payload = np.random.random(100000000)

    from distributed.counter import Digest
    from collections import defaultdict
    from functools import partial
    from dask.diagnostics import ResourceProfiler

    servers = [s, w]

    # Reset digests
    for node in servers:
        node.digests = defaultdict(partial(Digest, loop=node.io_loop))
        node._last_tick = time()

    with ResourceProfiler(dt=0.01) as rprof:
        # Keep the returned future bound so the scattered data stays alive.
        future = yield c.scatter(payload, direct=True, hash=False)
        yield gen.sleep(0.5)

    rprof.close()

    for node in servers:
        tick_digest = node.digests['tick-duration']
        assert tick_digest.components[0].max() < 0.5

    # Memory samples relative to the first sample.
    mem = np.array([sample.mem for sample in rprof.results])
    mem -= mem[0]
    assert mem.max() < (payload.nbytes * 2) / 1e6
    assert mem[-1] < (payload.nbytes * 1.2) / 1e6
def test_plot_multiple():
    """Visualizing ``prof`` and ``rprof`` together gives two figures."""
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")

    layout = visualize(
        [prof, rprof],
        label_size=50,
        title="Not the default",
        show=False,
        save=False,
    )

    # Grid plot layouts changed in Bokeh 3.
    # See https://github.com/dask/dask/issues/9257 for more details
    if BOKEH_VERSION().major < 3:
        rows = layout.children[1].children
    else:
        rows = layout.children
    figures = [row[0] for row in rows]
    assert len(figures) == 2

    first_fig, second_fig = figures
    assert first_fig.title.text == "Not the default"
    assert first_fig.xaxis[0].axis_label is None
    assert second_fig.title is None
    assert second_fig.xaxis[0].axis_label == "Time (s)"

    # Test empty, checking for errors
    for profiler in (prof, rprof):
        profiler.clear()
    visualize([prof, rprof], show=False, save=False)
def test_resource_profiler_plot():
    """Check ``ResourceProfiler.visualize`` options and degenerate inputs.

    The first part verifies that size, tools and title are applied to the
    returned figure. The second part plots with zero and with one result
    and checks that no warning is emitted and the axis ranges are valid.
    """
    # Local import: ``pytest.warns(None)`` is deprecated and rejected by
    # pytest >= 7, so warnings are recorded with the standard library.
    import warnings

    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
    p = rprof.visualize(
        plot_width=500,
        plot_height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    assert p.plot_width == 500
    assert p.plot_height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert check_title(p, "Not the default")

    # Test with empty and one point, checking for errors
    rprof.clear()
    for results in [[], [(1.0, 0, 0)]]:
        rprof.results = results
        with warnings.catch_warnings(record=True) as record:
            # Record every warning, including ones already seen elsewhere.
            warnings.simplefilter("always")
            p = rprof.visualize(show=False, save=False)
        assert not record
        # Check bounds are valid
        assert p.x_range.start == 0
        assert p.x_range.end == 1
        assert p.y_range.start == 0
        assert p.y_range.end == 100
        assert p.extra_y_ranges["memory"].start == 0
        assert p.extra_y_ranges["memory"].end == 100
def test_plot_multiple():
    """Visualizing ``prof`` and ``rprof`` together gives two figures."""
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")

    layout = visualize(
        [prof, rprof],
        label_size=50,
        title="Not the default",
        show=False,
        save=False,
    )

    # The nesting of the returned layout depends on the installed bokeh.
    installed = LooseVersion(bokeh.__version__)
    if installed >= "1.1.0":
        figures = [row[0] for row in layout.children[1].children]
    elif installed >= "0.12.0":
        figures = [row.children[0] for row in layout.children[1].children]
    else:
        figures = [row[0] for row in layout.children]
    assert len(figures) == 2

    first_fig, second_fig = figures
    assert check_title(first_fig, "Not the default")
    assert first_fig.xaxis[0].axis_label is None
    assert second_fig.title is None
    assert second_fig.xaxis[0].axis_label == "Time (s)"

    # Test empty, checking for errors
    for profiler in (prof, rprof):
        profiler.clear()
    visualize([prof, rprof], show=False, save=False)
def test_resource_profiler_plot():
    """Check ``ResourceProfiler.visualize`` options and degenerate inputs."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")

    fig = rprof.visualize(
        width=500,
        height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )

    # The size attributes are read under different names before Bokeh 3.
    if BOKEH_VERSION().major < 3:
        assert fig.plot_width == 500
        assert fig.plot_height == 300
    else:
        assert fig.width == 500
        assert fig.height == 300

    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert fig.title.text == "Not the default"

    # Test with empty and one point, checking for errors
    rprof.clear()
    for fake_results in ([], [(1.0, 0, 0)]):
        rprof.results = fake_results
        with warnings.catch_warnings(record=True) as caught:
            fig = rprof.visualize(show=False, save=False)
        assert not caught

        # Check bounds are valid
        memory_range = fig.extra_y_ranges["memory"]
        assert fig.x_range.start == 0
        assert fig.x_range.end == 1
        assert fig.y_range.start == 0
        assert fig.y_range.end == 100
        assert memory_range.start == 0
        assert memory_range.end == 100
def main():
    """Load the inputs, build the graph and profile one ``hub.compute()``.

    Steps, each timed with ``time.time()``:

    1. load ``sky.npy``, ``dirty.npy`` and ``psf.npy`` from the parent of
       the current working directory into the module globals ``sky``,
       ``dirty`` and ``psf``;
    2. call ``scheduling()``;
    3. run ``hub.compute()`` under the dask task, resource and cache
       profilers.

    ``hub.compute()`` is then run a second time inside ``PrintKeys()``, the
    profiler results are printed and visualized, and the rounded timings
    are printed.
    """
    global sky
    global dirty
    global psf

    # All three inputs live in the parent of the working directory.
    parent_dir = os.path.split(os.getcwd())[0]

    load_start = time.time()
    _, sky = load_data(parent_dir + '/sky.npy')
    _, dirty = load_data(parent_dir + '/dirty.npy')
    _, psf = load_data(parent_dir + '/psf.npy')
    load_end = time.time()

    schedule_start = time.time()
    scheduling()
    schedule_end = time.time()

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        compute_start = time.time()
        hub.compute()
        compute_end = time.time()

    with PrintKeys():
        hub.compute()

    print("\n" + "Resultats du profilling:")
    print(prof.results[0])
    print("\n" + "La valeur d'usage de la memoire est en MB et l'information du CPU est %d'usage de la CPU")
    print(rprof.results)
    print("\n" + "Resultats du profilling de la cache:")
    print(cprof.results[0])
    visualize([prof, rprof, cprof])

    print("\n" + "Temps du code pous analyse")
    print('load time: {}'.format(round(load_end - load_start, 4)))
    print('scheduling time: {}'.format(round(schedule_end - schedule_start, 4)))
    print('compute time: {}'.format(round(compute_end - compute_start, 4)))
    print('total time: {}'.format(round(compute_end - load_start, 4)))
def uncompress_to_hdf5():
    """Write the in-RAM array to HDF5 twice, plain then gzip, with profiling.

    Each write is timed, run under the dask profilers, and followed by a
    ``visualize`` call that is given its own report path.
    """
    print('Writing to hdf5 file after loading raw data in RAM.')
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    hdf5_path = 'data/out.hdf5'

    # First pass: no compression. Start from a clean target file.
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)
    report_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        da.to_hdf5(hdf5_path, 'data', arr, chunks=None)
        print(f'time to save the array to hdf5 without compression: {time.time() - started}')
    visualize([prof, rprof, cprof], report_path)

    # Second pass: gzip compression, after removing the file just written.
    os.remove(hdf5_path)
    report_path = "outputs/load_raw_write_hdf5_commpressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        da.to_hdf5(hdf5_path, 'data', arr, chunks=None, compression="gzip")
        print(f'time to save the array to hdf5 with compression: {time.time() - started}')
    visualize([prof, rprof, cprof], report_path)
def test_resource_profiler():
    """Results are 3-tuples, ``clear`` empties them, a closed profiler asserts."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')

    samples = rprof.results
    assert all(isinstance(sample, tuple) and len(sample) == 3 for sample in samples)

    rprof.clear()
    assert rprof.results == []

    rprof.close()
    assert not rprof._tracker.is_alive()

    # Re-entering the profiler after ``close`` is expected to assert.
    with pytest.raises(AssertionError):
        with rprof:
            get(dsk, 'e')
def onthefly_to_nps():
    """Write the module-level ``arr`` to a numpy stack under the dask profilers."""
    print('Writing to npy stack file without loading raw data in RAM.')
    stack_dir = 'data/out_3_numpy'
    report_path = "outputs/write_npy_stack.html"

    # write to numpy stack
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        write_to_npy_stack(stack_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - started}')

    visualize([prof, rprof, cprof], report_path)
def test_resource_profiler_plot():
    """``visualize`` honours size, tools and title, and tolerates no results."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, 'c')

    fig = rprof.visualize(
        plot_width=500,
        plot_height=300,
        tools="hover",
        title="Not the default",
        show=False,
        save=False,
    )
    assert fig.plot_width == 500
    assert fig.plot_height == 300
    assert len(fig.tools) == 1
    assert isinstance(fig.tools[0], bokeh.models.HoverTool)
    assert check_title(fig, "Not the default")

    # Test empty, checking for errors
    rprof.clear()
    rprof.visualize(show=False, save=False)
def test_plot_multiple():
    """Visualizing ``prof`` and ``rprof`` together gives a two-row ``GridPlot``."""
    from dask.diagnostics.profile_visualize import visualize
    from bokeh.plotting import GridPlot

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, 'c')

    grid = visualize(
        [prof, rprof],
        label_size=50,
        title="Not the default",
        show=False,
        save=False,
    )
    assert isinstance(grid, GridPlot)
    assert len(grid.children) == 2

    first_fig = grid.children[0][0]
    assert first_fig.title == "Not the default"
    assert first_fig.xaxis[0].axis_label is None

    second_fig = grid.children[1][0]
    assert second_fig.title is None
    assert second_fig.xaxis[0].axis_label == 'Time (s)'

    # Test empty, checking for errors
    for profiler in (prof, rprof):
        profiler.clear()
    visualize([prof, rprof], show=False, save=False)
def execute(self, wf, client):
    """Run workflow ``wf`` through ``client`` and save a dask profile.

    Returns an empty dict when the workflow has no processes. Otherwise the
    graph from ``wf.convertGraph()`` is executed with ``client.get`` under
    the dask profilers, the profile is written to ``xicam/profile.html`` in
    the user config directory, and the result is both stored on
    ``wf.lastresult`` and returned.
    """
    if not wf.processes:
        return {}

    graph = wf.convertGraph()

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        result = client.get(graph[0], graph[1])

    msg.logMessage('result:', result, level=msg.DEBUG)

    profile_path = user_config_dir('xicam/profile.html')
    visualize([prof, rprof, cprof], show=False, file_path=profile_path)
    msg.logMessage(f'Profile saved: {profile_path}')

    wf.lastresult = result
    return result
def test_plot_multiple():
    """Visualizing ``prof`` and ``rprof`` together gives two figures."""
    from dask.diagnostics.profile_visualize import visualize

    with ResourceProfiler(dt=0.01) as rprof:
        with prof:
            get(dsk2, "c")

    layout = visualize(
        [prof, rprof],
        label_size=50,
        title="Not the default",
        show=False,
        save=False,
    )

    rows = layout.children[1].children
    figures = [row[0] for row in rows]
    assert len(figures) == 2

    first_fig, second_fig = figures
    assert first_fig.title.text == "Not the default"
    assert first_fig.xaxis[0].axis_label is None
    assert second_fig.title is None
    assert second_fig.xaxis[0].axis_label == "Time (s)"

    # Test empty, checking for errors
    for profiler in (prof, rprof):
        profiler.clear()
    visualize([prof, rprof], show=False, save=False)
def uncompress_to_npy():
    """Save the in-RAM array with ``np.save`` under the dask profilers."""
    print('Writing to numpy file after loading raw data in RAM.')
    npy_path = 'data/out_1.npy'
    report_path = "outputs/load_raw_write_npy_file.html"
    raw_arr = uncompress()

    # write to numpy file, replacing any previous output
    if os.path.isfile(npy_path):
        os.remove(npy_path)

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        np.save(npy_path, raw_arr)
        print(f'time to save the array to numpy file: {time.time() - started}')

    visualize([prof, rprof, cprof], report_path)
def test_resource_profiler_multiple_gets():
    """Exercise several ``get`` calls via the context manager and ``register``."""

    def _all_triples(samples):
        # Every recorded sample must be a 3-tuple.
        return all(isinstance(s, tuple) and len(s) == 3 for s in samples)

    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")
        assert len(rprof.results) == 0
        get(dsk2, "c")
    assert _all_triples(rprof.results)

    rprof.clear()
    rprof.register()
    get(dsk2, "c")
    assert len(rprof.results) > 0
    get(dsk2, "c")
    rprof.unregister()
    assert _all_triples(rprof.results)

    rprof.close()
    assert not rprof._is_running()
def _execute_graph(self, *writes):
    """Compute ``writes`` single-threaded, optionally profiled.

    When ``can_profile`` is truthy the dask task, cache and resource
    profilers are active during the computation and visualized afterwards.
    A progress bar is shown only on an interactive terminal and only when
    ``self.args.boring`` is falsy.
    """
    # Set up Profilers and Progress Bars
    with ExitStack() as stack:
        profilers = []

        if can_profile:
            from dask.diagnostics import (Profiler, CacheProfiler,
                                          ResourceProfiler, visualize)

            for profiler_cls in (Profiler, CacheProfiler, ResourceProfiler):
                profilers.append(stack.enter_context(profiler_cls()))

        if sys.stdout.isatty() and not self.args.boring:
            from dask.diagnostics import ProgressBar

            stack.enter_context(ProgressBar())

        dask.compute(*writes, scheduler='single-threaded')
        logger.info("Averaging Complete")

    if can_profile:
        visualize(profilers)
def test_resource_profiler():
    """Cover results shape, ``clear``, repeated ``close`` and re-entry."""
    with ResourceProfiler(dt=0.01) as rprof:
        get(dsk2, "c")

    samples = rprof.results
    assert len(samples) > 0
    assert all(isinstance(sample, tuple) and len(sample) == 3 for sample in samples)

    # Tracker stopped on exit
    assert not rprof._is_running()

    rprof.clear()
    assert rprof.results == []

    # Close is idempotent
    rprof.close()
    assert not rprof._is_running()

    # Restarts tracker if already closed
    with rprof:
        get(dsk2, "c")
    assert len(rprof.results) > 0
def uncompress_to_nps():
    """Load the raw data, wrap it in a dask array and write a numpy stack."""
    print('Writing to numpy stack after loading raw data in RAM.')

    # load data in RAM
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to numpy stack
    stack_dir = 'data/out_numpy'
    report_path = "outputs/load_raw_write_npy_stack.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        write_to_npy_stack(stack_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - started}')

    visualize([prof, rprof, cprof], report_path)
def onthefly_to_hdf5():
    """Write the module-level ``arr`` to a gzip-compressed HDF5 file, profiled."""
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5, starting from a clean target file
    hdf5_path = 'data/out.hdf5'
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)

    report_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        started = time.time()
        da.to_hdf5(
            hdf5_path,
            'data',
            arr,
            chunks=(1400, 1400, 350),
            compression="gzip",
        )
        print(f'time to save the array to hdf5 with compression: {time.time() - started}')

    visualize([prof, rprof, cprof], report_path)
# NOTE(review): the statements down to the decorator are the tail of a test
# whose ``def`` line lies outside this chunk; ``tics`` and ``nbytes`` are
# defined there.
with CacheProfiler(nbytes) as cprof:
    get(dsk2, "c")

results = cprof.results
# The last value in ``tics`` must equal both the number of results and the
# metric stored on the last result.
assert tics[-1] == len(results)
assert tics[-1] == results[-1].metric
assert cprof._metric_name == "nbytes"
# An explicit ``metric_name`` is stored as given.
assert CacheProfiler(metric=nbytes, metric_name="foo")._metric_name == "foo"


@pytest.mark.parametrize(
    "profiler",
    [
        Profiler,
        pytest.param(
            lambda: ResourceProfiler(dt=0.01), marks=pytest.mark.skipif("not psutil")
        ),
        CacheProfiler,
    ],
)
def test_register(profiler):
    """A registered profiler keeps accumulating results across ``get`` calls.

    ``profiler`` is a zero-argument callable that builds the profiler under
    test.
    """
    prof = profiler()
    try:
        prof.register()
        get(dsk2, "c")
        n = len(prof.results)
        assert n > 0
        get(dsk2, "c")
        # The second run must add to the results of the first.
        assert len(prof.results) > n
    finally:
        # Always unregister, even when an assertion above fails.
        prof.unregister()
def initialize_dir(dirpath, idpath, meta_dict, recurse=False, IDR_data=False,
                   IDR_IDs=None, ignore_old=True, fname_prefix=None,
                   fname_suffix=None, processes=None, profiling=False,
                   verbose=False):
    """Initialize the data structure for a directory of new image stacks.

    This is a dask pipeline that applies the function `initialize_stack` from
    `katachi.tools.initialize` to an entire directory.

    See `katachi.tools.initialize.initialize_stack` for more information.

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    idpath : string or None
        Path of the text file containing previously generated IDs.
        Necessary to ensure that newly generated IDs are unique.
    meta_dict : dict
        A dictionary containing the initial (user-defined) metadata for the
        stack. See Notes below for the keys that must be included.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of dirpath.
        This is ignored if `IDR_data` is True, as recursing through
        subfolders is not supported on IDR data.
    IDR_data : bool, optional, default False
        If True, the data is expected to already be grouped into
        subdirectories named according to already assigned IDs, as this is
        how the data was deposited on the IDR database.
    IDR_IDs : list of IDs or None, optional, default None
        If IDR_data is True, a list of IDs can be passed to specify a subset
        of samples for which this pipeline is to be run. If None, every
        subdirectory of `dirpath` with a 10-character name is used.
    ignore_old : bool, optional, default True
        If True, files that already have a known ID listed in the ID file
        will be ignored. This is not supported for IDR data, so if IDR_data
        is True and ignore_old is True, an error is raised.
    fname_prefix : str or None, optional
        If not None, only file names that start with the given string are
        used.
    fname_suffix : str or None, optional
        If not None, only file names that end with the given string (or with
        the given string + .tif) are used.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None,
        half of the available CPUs are used. If set to 1, the entire code is
        run sequentially (dask is not used).
    profiling : bool, optional, default False
        If True, dask resource profiling is performed and visualized after
        the pipeline run is finished. This may generate a `profile.html`
        file in the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.

    Raises
    ------
    IOError
        If `IDR_data` and `ignore_old` are both True, or if no matching
        files are found in the target directory.

    Notes
    -----
    The meta_dict dictionary must contain the following entries:

    - 'channels'   : A list of strings naming the channels in order. Must
                     not contain characters that cannot be used in file
                     names.
    - 'resolution' : A list of floats denoting the voxel size of the input
                     stack in order ZYX.

    It may optionally contain other entries as well.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # NOTE: every message is printed as one pre-formatted string so the
    # output is the same under Python 2 and Python 3.
    if verbose:
        print("Detecting target files...")

    # Function to select file names and create paths
    def get_fnames_ready(fnames, fpath, known_ids=None):
        fnames = [fname for fname in fnames if fname.endswith(".tif")]
        if ignore_old:
            fnames = [fname for fname in fnames
                      if not any(fname.startswith(ID) for ID in known_ids)]
        if fname_prefix:
            fnames = [fname for fname in fnames
                      if fname.startswith(fname_prefix)]
        if fname_suffix:
            fnames = [fname for fname in fnames
                      if fname.endswith(fname_suffix + ".tif")
                      or fname.endswith(fname_suffix)]
        return [os.path.join(fpath, fname) for fname in fnames]

    # If this is run on IDR data, most of the work is already done!
    if IDR_data:

        # Handle inputs
        if ignore_old:
            raise IOError("`ignore_old` is not supported for IDR data. Be " +
                          "careful when running this so as to avoid over" +
                          "writing important metadata. Aborting for now; set " +
                          "`ignore_old` to False to prevent this error.")
        if IDR_IDs is None:
            # The directory test must be made relative to `dirpath`; testing
            # the bare name would look in the current working directory.
            IDR_IDs = [ID for ID in os.listdir(dirpath)
                       if os.path.isdir(os.path.join(dirpath, ID))
                       and len(ID) == 10]

        # Write the metadata files; all else is already done
        if verbose:
            print("Creating metadata files for IDR data...")
        for ID in IDR_IDs:
            meta_path = os.path.join(dirpath, ID, ID + '_stack_metadata.pkl')
            with open(meta_path, 'wb') as outfile:
                pickle.dump(meta_dict, outfile, pickle.HIGHEST_PROTOCOL)
        if verbose:
            print("Processing complete!")
        return

    # If needed, load previously generated IDs (to exclude those files)
    if ignore_old:
        try:
            with open(idpath, "r") as infile:
                known_ids = [line.strip() for line in infile.readlines()]
        except Exception:
            print("Attempting to load existing IDs from id_file failed " +
                  "with this error:")
            raise
    else:
        known_ids = None

    if recurse:
        # Run for multiple subdirs
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += get_fnames_ready(fnames, dpath, known_ids)
    else:
        # Run for single dir
        fnames = os.listdir(dirpath)
        fpaths = get_fnames_ready(fnames, dirpath, known_ids=known_ids)

    # Check
    if not fpaths:
        raise IOError("No matching files found in target directory.")

    # Report
    if verbose:
        print("-- Detected %d target files." % len(fpaths))

    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask)

    if processes == 1:
        if verbose:
            print("Processing target files sequentially...")
        for fpath in fpaths:
            initialize_stack(fpath, idpath, meta_dict, verbose=False)
        if verbose:
            print("Processing complete!")
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    if verbose:
        print("Processing target files in parallel...")

    dask_graph = dict()
    for i, fpath in enumerate(fpaths):
        dask_graph["initialize_%i" % i] = (initialize_stack, fpath, idpath,
                                           meta_dict, False)
    # Final task that depends on every per-file task.
    dask_graph['done'] = (lambda x: "done",
                          ["initialize_%i" % i for i in range(len(fpaths))])

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    if profiling:
        # Run the pipeline (with resource profiling)
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
            visualize([prof, rprof], save=False)
    else:
        # Run the pipeline (no profiling)
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Report and return
    if verbose:
        print("Processing complete!")
    return
# Make a local dmagellan checkout importable. The path is hard-coded and
# specific to one machine.
sys.path.append('/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan')

from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching
from dask import multiprocessing, threaded
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle

# File path handed to ``visualize`` at the end of the script.
filename='./profres_exp_mt_dblp_300k_extractfeatvecs.html'

# Register a progress bar for the rest of the script.
pbar = ProgressBar()
pbar.register()

#print("Mem. usage before reading:{0}".format( psutil.virtual_memory().used/1e9))
# Input tables A and B, and the candidate set C.
A = pd.read_csv('./datasets/sample_citeseer_300k.csv')
B = pd.read_csv('./datasets/sample_dblp_300k.csv')
#print("Mem. usage after reading:{0}".format(psutil.virtual_memory().used/1e9))
C = pd.read_csv('./datasets/candset.csv')

feature_table = get_features_for_matching(A, B)
# ``compute=False`` is passed here and ``.compute`` is called explicitly
# inside the profilers below.
feature_vecs = extract_feature_vecs(C, A, B, '_id', 'l_id', 'r_id', 'id', 'id', feature_table=feature_table, nchunks=4, compute=False)

# Run the computation on the threaded scheduler with 4 workers while the
# task, cache and resource profilers are active.
with Profiler() as prof, CacheProfiler() as cprof, ResourceProfiler(dt=0.25) as rprof:
    D = feature_vecs.compute(get=threaded.get, num_workers=4)

visualize([prof, cprof, rprof], file_path=filename, show=False)
def _main(args):
    """Flag the measurement set ``args.ms`` according to ``args``.

    The datasets of the measurement set are grouped by field, data
    description and scan. For every dataset that passes the field and scan
    selection, a dask graph is built that packs visibilities and flags into
    windows, applies the strategies from ``args.config['strategies']`` and
    writes the resulting flags back to the ``FLAG`` column. All graphs are
    then computed together, optionally under the dask profilers, and a
    summary of the flagging statistics is logged.

    Parameters
    ----------
    args
        Parsed options. Attributes read here: ``disable_post_mortem``,
        ``data_column``, ``dilate_masks``, ``config``,
        ``subtract_model_column``, ``ms``, ``row_chunks``, ``scan_numbers``
        (overwritten with the selected scans), ``field_names``,
        ``ignore_flags``, ``flagging_strategy``, ``baseline_chunks``,
        ``window_backend`` and ``temporary_directory``.

    Raises
    ------
    ValueError
        If a requested field is not in the dataset, the flagging strategy is
        unknown, or the output flag chunking differs from the input chunking.
    """
    tic = time.time()

    log.info(banner())

    if args.disable_post_mortem:
        log.warn("Disabling crash debugging with the "
                 "Interactive Python Debugger, as per user request")
        post_mortem_handler.disable_pdb_on_error()

    log.info("Flagging on the {0:s} column".format(args.data_column))
    data_column = args.data_column
    masked_channels = [load_mask(fn, dilate=args.dilate_masks)
                       for fn in collect_masks()]
    GD = args.config

    log_configuration(args)

    # Group datasets by these columns
    group_cols = ["FIELD_ID", "DATA_DESC_ID", "SCAN_NUMBER"]
    # Index datasets by these columns
    index_cols = ['TIME']

    # Reopen the datasets using the aggregated row ordering
    columns = [data_column, "FLAG", "TIME", "ANTENNA1", "ANTENNA2"]

    if args.subtract_model_column is not None:
        columns.append(args.subtract_model_column)

    xds = list(xds_from_ms(args.ms,
                           columns=tuple(columns),
                           group_cols=group_cols,
                           index_cols=index_cols,
                           chunks={"row": args.row_chunks}))

    # Get support tables
    st = support_tables(args.ms)
    ddid_ds = st["DATA_DESCRIPTION"]
    field_ds = st["FIELD"]
    pol_ds = st["POLARIZATION"]
    spw_ds = st["SPECTRAL_WINDOW"]
    ant_ds = st["ANTENNA"]

    assert len(ant_ds) == 1
    assert len(ddid_ds) == 1

    antspos = ant_ds[0].POSITION.data
    antsnames = ant_ds[0].NAME.data
    fieldnames = [fds.NAME.data[0] for fds in field_ds]

    # Restrict the user's scan selection to scans present in the data; with
    # no selection, every available scan is used.
    avail_scans = [ds.SCAN_NUMBER for ds in xds]
    args.scan_numbers = list(
        set(avail_scans).intersection(args.scan_numbers
                                      if args.scan_numbers is not None
                                      else avail_scans))

    if args.scan_numbers != []:
        log.info("Only considering scans '{0:s}' as "
                 "per user selection criterion".format(
                     ", ".join(map(str, map(int, args.scan_numbers)))))

    if args.field_names != []:
        flatten_field_names = []
        for f in args.field_names:
            # accept comma lists per specification
            flatten_field_names += [x.strip() for x in f.split(",")]

        # Translate numeric entries into field names. Iterate over a
        # snapshot so the list is not appended to while it is traversed.
        for f in list(flatten_field_names):
            if re.match(r"^\d+$", f) and int(f) < len(fieldnames):
                flatten_field_names.append(fieldnames[int(f)])

        # Drop the numeric entries and de-duplicate.
        flatten_field_names = list(
            set(filter(lambda x: not re.match(r"^\d+$", x),
                       flatten_field_names)))

        log.info("Only considering fields '{0:s}' for flagging per "
                 "user "
                 "selection criterion.".format(
                     ", ".join(flatten_field_names)))

        if not set(flatten_field_names) <= set(fieldnames):
            raise ValueError("One or more fields cannot be "
                             "found in dataset '{0:s}' "
                             "You specified {1:s}, but "
                             "only {2:s} are available".format(
                                 args.ms,
                                 ",".join(flatten_field_names),
                                 ",".join(fieldnames)))

        field_dict = {fieldnames.index(fn): fn
                      for fn in flatten_field_names}
    else:
        field_dict = {i: fn for i, fn in enumerate(fieldnames)}

    # List which hold our dask compute graphs for each dataset
    write_computes = []
    original_stats = []
    final_stats = []

    # Iterate through each dataset
    for ds in xds:
        if ds.FIELD_ID not in field_dict:
            continue

        if (args.scan_numbers is not None and
                ds.SCAN_NUMBER not in args.scan_numbers):
            continue

        log.info("Adding field '{0:s}' scan {1:d} to "
                 "compute graph for processing".format(
                     field_dict[ds.FIELD_ID], ds.SCAN_NUMBER))

        ddid = ddid_ds[ds.attrs['DATA_DESC_ID']]
        spw_info = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol_info = pol_ds[ddid.POLARIZATION_ID.data[0]]

        nrow, nchan, ncorr = getattr(ds, data_column).data.shape

        # Visibilities from the dataset
        vis = getattr(ds, data_column).data
        if args.subtract_model_column is not None:
            log.info("Forming residual data between '{0:s}' and "
                     "'{1:s}' for flagging.".format(
                         data_column, args.subtract_model_column))
            vismod = getattr(ds, args.subtract_model_column).data
            vis = vis - vismod

        antenna1 = ds.ANTENNA1.data
        antenna2 = ds.ANTENNA2.data
        chan_freq = spw_info.CHAN_FREQ.data[0]
        chan_width = spw_info.CHAN_WIDTH.data[0]

        # Generate unflagged defaults if we should ignore existing flags
        # otherwise take flags from the dataset
        if args.ignore_flags is True:
            # Builtin ``bool``: the ``np.bool`` alias was removed from NumPy.
            flags = da.full_like(vis, False, dtype=bool)
            log.critical("Completely ignoring measurement set "
                         "flags as per '-if' request. "
                         "Strategy WILL NOT or with original flags, even if "
                         "specified!")
        else:
            flags = ds.FLAG.data

        # If we're flagging on polarised intensity,
        # we convert visibilities to polarised intensity
        # and any flagged correlation will flag the entire visibility
        if args.flagging_strategy == "polarisation":
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items() if k != "I")
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "total_power":
            if args.subtract_model_column is None:
                log.critical("You requested to flag total quadrature "
                             "power, but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of "
                             "off-axis sources for broadband RFI.")
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items())
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "standard":
            if args.subtract_model_column is None:
                log.critical("You requested to flag per correlation, "
                             "but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of off-axis sources "
                             "for broadband RFI.")
        else:
            raise ValueError("Invalid flagging strategy '%s'" %
                             args.flagging_strategy)

        ubl = unique_baselines(antenna1, antenna2)
        utime, time_inv = da.unique(ds.TIME.data, return_inverse=True)
        # Unique times and baselines are needed as concrete arrays below.
        utime, ubl = dask.compute(utime, ubl)
        ubl = ubl.view(np.int32).reshape(-1, 2)
        # Stack the baseline index with the unique baselines
        bl_range = np.arange(ubl.shape[0], dtype=ubl.dtype)[:, None]
        ubl = np.concatenate([bl_range, ubl], axis=1)
        ubl = da.from_array(ubl, chunks=(args.baseline_chunks, 3))

        vis_windows, flag_windows = pack_data(time_inv, ubl,
                                              antenna1, antenna2,
                                              vis, flags,
                                              utime.shape[0],
                                              backend=args.window_backend,
                                              path=args.temporary_directory)

        # Statistics of the flags before any strategy is applied.
        original_stats.append(window_stats(flag_windows, ubl, chan_freq,
                                           antsnames, ds.SCAN_NUMBER,
                                           field_dict[ds.FIELD_ID],
                                           ds.attrs['DATA_DESC_ID']))

        with StrategyExecutor(antspos, ubl, chan_freq, chan_width,
                              masked_channels, GD['strategies']) as se:
            flag_windows = se.apply_strategies(flag_windows, vis_windows)

        # Statistics of the flags after the strategies.
        final_stats.append(window_stats(flag_windows, ubl, chan_freq,
                                        antsnames, ds.SCAN_NUMBER,
                                        field_dict[ds.FIELD_ID],
                                        ds.attrs['DATA_DESC_ID']))

        # Unpack window data for writing back to the MS
        unpacked_flags = unpack_data(antenna1, antenna2, time_inv,
                                     ubl, flag_windows)

        # Flag entire visibility if any correlations are flagged
        equalized_flags = da.sum(unpacked_flags, axis=2, keepdims=True) > 0
        corr_flags = da.broadcast_to(equalized_flags, (nrow, nchan, ncorr))

        if corr_flags.chunks != ds.FLAG.data.chunks:
            raise ValueError("Output flag chunking does not "
                             "match input flag chunking")

        # Create new dataset containing new flags
        new_ds = ds.assign(FLAG=(("row", "chan", "corr"), corr_flags))

        # Write back to original dataset
        writes = xds_to_table(new_ds, args.ms, "FLAG")
        # original should also have .compute called because we need stats
        write_computes.append(writes)

    if len(write_computes) > 0:
        # Combine stats from all datasets
        original_stats = combine_window_stats(original_stats)
        final_stats = combine_window_stats(final_stats)

        with contextlib.ExitStack() as stack:
            # Create dask profiling contexts
            profilers = []

            if can_profile:
                profilers.append(stack.enter_context(Profiler()))
                profilers.append(stack.enter_context(CacheProfiler()))
                profilers.append(stack.enter_context(ResourceProfiler()))

            if sys.stdout.isatty():
                # Interactive terminal, default ProgressBar
                stack.enter_context(ProgressBar())
            else:
                # Non-interactive, emit a bar every 5 minutes so
                # as not to spam the log
                stack.enter_context(ProgressBar(minimum=1, dt=5 * 60))

            _, original_stats, final_stats = dask.compute(
                write_computes, original_stats, final_stats)

        if can_profile:
            visualize(profilers)

        toc = time.time()

        # Log each summary line
        for line in summarise_stats(final_stats, original_stats):
            log.info(line)

        elapsed = toc - tic
        log.info("Data flagged successfully in "
                 "{0:02.0f}h{1:02.0f}m{2:02.0f}s".format(
                     (elapsed // 60) // 60,
                     (elapsed // 60) % 60,
                     elapsed % 60))
    else:
        log.info("User data selection criteria resulted in empty dataset. "
                 "Nothing to be done. Bye!")
def compute(self, **kwargs):
    """Compute ``self._result`` under the dask profilers.

    ``kwargs`` are forwarded to ``dask.compute``. The computed value is
    stored on ``self._computed_result`` and the three profilers on
    ``self._prof``, ``self._rprof`` and ``self._cprof``.
    """
    with Profiler() as task_prof, ResourceProfiler(dt=0.25) as resource_prof, CacheProfiler() as cache_prof:
        computed = dask.compute(self._result, **kwargs)
        self._computed_result = computed[0]

    self._prof = task_prof
    self._rprof = resource_prof
    self._cprof = cache_prof
def main(cfgfile, starttime=None, endtime=None, trajfile="", trajtype='plane',
         flashnr=0, infostr="", MULTIPROCESSING_DSET=False,
         MULTIPROCESSING_PROD=False, PROFILE_MULTIPROCESSING=False):
    """
    Main flow control. Processes radar data off-line over a period of time
    given either by the user, a trajectory file, or determined by the last
    volume processed and the current time. Multiple radars can be processed
    simultaneously.

    Parameters
    ----------
    cfgfile : str
        path of the main config file
    starttime, endtime : datetime object
        start and end time of the data to be processed
    trajfile : str
        path to file describing the trajectory
    trajtype : str
        type of trajectory file. Can be either 'plane' or 'lightning'
    flashnr : int
        If larger than 0 will select a flash in a lightning trajectory file.
        If 0 the data corresponding to the trajectory of all flashes will be
        plotted
    infostr : str
        Information string about the actual data processing
        (e.g. 'RUN57'). This string is added to product files.
    MULTIPROCESSING_DSET : Bool
        If true the generation of datasets at the same processing level will
        be parallelized
    MULTIPROCESSING_PROD : Bool
        If true the generation of products from each dataset will be
        parallelized
    PROFILE_MULTIPROCESSING : Bool
        If true and code parallelized the multiprocessing is profiled.
        Profiling is only kept enabled when exactly one of the two
        multiprocessing flags is set.

    Raises
    ------
    ValueError
        If no valid volume is found between starttime and endtime for the
        master scan and master data type.

    """
    print("- PYRAD version: %s (compiled %s by %s)" %
          (pyrad_version.version, pyrad_version.compile_date_time,
           pyrad_version.username))
    print("- PYART version: " + pyart_version.version)

    # Define behaviour of warnings
    warnings.simplefilter('always')  # always print matching warnings
    # warnings.simplefilter('error')  # turn matching warnings into exceptions
    warnings.formatwarning = _warning_format  # define format

    if ALLOW_USER_BREAK:
        input_queue = _initialize_listener()

    if not _DASK_AVAILABLE:
        MULTIPROCESSING_DSET = False
        MULTIPROCESSING_PROD = False
        PROFILE_MULTIPROCESSING = False

    # check if multiprocessing profiling is necessary
    if not MULTIPROCESSING_DSET and not MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False
    elif MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False

    if MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        # necessary to launch tasks from tasks
        Client()

    if PROFILE_MULTIPROCESSING:
        prof = Profiler()
        rprof = ResourceProfiler()
        cprof = CacheProfiler()

        prof.register()
        rprof.register()
        cprof.register()

    # Registered profilers are global callbacks; the try/finally guarantees
    # they are unregistered even when the processing below raises.
    try:
        cfg = _create_cfg_dict(cfgfile)
        datacfg = _create_datacfg_dict(cfg)

        starttime, endtime, traj = _get_times_and_traj(
            trajfile, starttime, endtime, cfg['ScanPeriod'],
            last_state_file=cfg['lastStateFile'], trajtype=trajtype,
            flashnr=flashnr)

        if infostr:
            print('- Info string : ' + infostr)

        # get data types and levels
        datatypesdescr_list = [
            _get_datatype_list(cfg, radarnr='RADAR'+'{:03d}'.format(i))
            for i in range(1, cfg['NumRadars']+1)]

        dataset_levels = _get_datasets_list(cfg)

        masterfilelist, masterdatatypedescr, masterscan = (
            _get_masterfile_list(
                datatypesdescr_list[0], starttime, endtime, datacfg,
                scan_list=datacfg['ScanList']))

        nvolumes = len(masterfilelist)
        if nvolumes == 0:
            raise ValueError(
                "ERROR: Could not find any valid volumes between " +
                starttime.strftime('%Y-%m-%d %H:%M:%S') + " and " +
                endtime.strftime('%Y-%m-%d %H:%M:%S') + " for " +
                "master scan '" + str(masterscan) +
                "' and master data type '" + masterdatatypedescr +
                "'")
        print('- Number of volumes to process: ' + str(nvolumes))
        print('- Start time: ' + starttime.strftime("%Y-%m-%d %H:%M:%S"))
        print('- end time: ' + endtime.strftime("%Y-%m-%d %H:%M:%S"))

        # initial processing of the datasets
        print('\n\n- Initializing datasets:')
        dscfg, traj = _initialize_datasets(
            dataset_levels, cfg, traj=traj, infostr=infostr)

        # process all data files in file list or until user interrupts
        # processing
        for masterfile in masterfilelist:
            if ALLOW_USER_BREAK:
                # check if user has requested exit
                try:
                    input_queue.get_nowait()
                    warn('Program terminated by user')
                    break
                except queue.Empty:
                    pass

            print('\n- master file: ' + os.path.basename(masterfile))

            master_voltime = get_datetime(masterfile, masterdatatypedescr)

            radar_list = _get_radars_data(
                master_voltime, datatypesdescr_list, datacfg,
                num_radars=datacfg['NumRadars'])

            # process all data sets
            dscfg, traj = _process_datasets(
                dataset_levels, cfg, dscfg, radar_list, master_voltime,
                traj=traj, infostr=infostr,
                MULTIPROCESSING_DSET=MULTIPROCESSING_DSET,
                MULTIPROCESSING_PROD=MULTIPROCESSING_PROD)

            # delete variables
            del radar_list

            gc.collect()

        # post-processing of the datasets
        print('\n\n- Post-processing datasets:')
        dscfg, traj = _postprocess_datasets(
            dataset_levels, cfg, dscfg, traj=traj, infostr=infostr)
    finally:
        if PROFILE_MULTIPROCESSING:
            prof.unregister()
            rprof.unregister()
            cprof.unregister()

    if PROFILE_MULTIPROCESSING:
        bokeh_plot = visualize([prof, rprof, cprof], show=False, save=False)

        # exist_ok avoids the check-then-create race of isdir + makedirs
        profile_path = os.path.join(os.path.expanduser('~'), 'profiling')
        os.makedirs(profile_path, exist_ok=True)

        export_png(bokeh_plot, filename=os.path.join(
            profile_path,
            datetime.utcnow().strftime('%Y%m%d%H%M%S') + '_profile.png'))

    print('- This is the end my friend! See you soon!')
import sys
import time
import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool
import hyperspy.api as hs

# Benchmark script: lazily load every file named on the command line, sum it
# and print the file name, the elapsed time and the resulting throughput,
# with dask's task, resource and cache profilers active throughout.

# Process the files in sorted order.
emd_filename_list = sorted(sys.argv[1:])

# dask is given an 8-thread pool; resource usage is sampled every 0.25 s.
with dask.set_options(pool=ThreadPool(8)), \
        Profiler() as prof, \
        ResourceProfiler(dt=0.25) as rprof, \
        CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))

        # Only the reduction is timed, not the (lazy) load above.
        # NOTE(review): with lazy=True, `s.sum()` may only build the task
        # graph without computing it -- confirm this really measures I/O.
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0

        print(delta)
        print(f"{s.data.nbytes / delta / 1024 / 1024} MB/s")

# Render the combined profiler output once all files are processed.
visualize([prof, rprof, cprof])
def feature_extraction(dirpath, suffix_seg, suffix_int,
                       num_LMs, downsample, clustering, features,
                       recurse=False, select_IDs='all',
                       assign_landmarks_kwargs='default',
                       compute_TFOR=True, transform_to_TFOR_kwargs='default',
                       perform_CBE_TFOR_kwargs='default',
                       compute_CFOR=True, perform_CBE_CFOR_kwargs='default',
                       processes=None, dask_graph_path=None,
                       profiling=False, verbose=False):
    """Extract latent features from fluorescence distributions of single-cell
    segmentations by point cloud sampling and cluster-based embedding.

    This is a dask pipeline that applies point-cloud sampling from
    `katachi.tools.assign_landmarks`, transformation to the TFOR (optional)
    from `katachi.tools.find_TFOR` and cluster-based embedding (either on TFOR
    data or by constructing a CFOR, or both) from `katachi.tools.perform_CBE`
    to a dataset of single-cell segmentations that has been generated by
    `katachi.pipelines.segmentation` or an equivalent approach.

    WARNING: Not all options provided by this pipeline have been extensively
    tested. Use with prudence!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    suffix_seg : string
        File suffix that identifies target segmentation files as produced by
        `katachi.pipelines.segmentation`. This will usually be "seg.tif" but
        could contain more information to distinguish different segmentations.
        A trailing ".tif" is optional.
    suffix_int : string
        File suffix that identifies target intensity files matching the shape
        of the target segmentation files. Each retrieved segmentation file must
        have a matching intensity file. A trailing ".tif" is optional.
    num_LMs : int
        The number of landmarks to extract for each cell.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction.
        See `katachi.tools.perform_CBE` for more information.
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction.
        See `katachi.tools.perform_CBE` for more information.
        Special case: both elements of clustering (i.e. `algorithm` and
        `n_clusters`) may themselves be tuples. In this case, their first and
        second elements will be used in CBE on TFOR and CFOR, respectively.
    features : list of strings
        List containing any number of cluster features to be extracted.
        See `katachi.tools.perform_CBE` for more information.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of dirpath.
    select_IDs : 'all' or list of strings, optional, default 'all'
        If 'all' (default), all detected input files (i.e. all samples) are
        used. Instead, a list of strings containing IDs (as assigned by
        `katachi.tools.initialize`) can be passed, in which case only samples
        whose IDs are in the list are used. If there are IDs in the list for
        which no matching files were found, a warning is shown.
    assign_landmarks_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for assign_landmarks function.
        See `katachi.tools.assign_landmarks.assign_landmarks` for information
        about available options.
        See section "Prepare kwargs for landmark assignment" in this function
        for information on default settings.
    compute_TFOR : bool, optional, default True
        If True, the prim frame of reference is computed and CBE is performed
        on the TFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    transform_to_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for transform_to_TFOR function.
        See `katachi.tools.find_TFOR.transform_to_TFOR` for information
        about available options.
        See section "Prepare kwargs for transformation to TFOR" in this
        function for information on default settings.
    perform_CBE_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to TFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on TFOR" in this function for
        information on default settings.
    compute_CFOR : bool, optional, default True
        If True, the cell frame of reference is computed and CBE is performed
        on the CFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    perform_CBE_CFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to CFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on CFOR" in this function for
        information on default settings.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None,
        half of the available CPUs are used (but never fewer than 1). If set
        to 1, the entire code is run sequentially (but dask is still required
        for CBE!).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.

    Raises
    ------
    IOError
        If a segmentation file has no matching intensity file, if no matching
        files are found at all, if both compute_TFOR and compute_CFOR are
        False, or if `assign_landmarks_kwargs['fpath_out']` is not None.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # Function to select pairs of files (seg, int) and create paths
    def prepare_fpaths(dirpath, fnames):

        # Find segmentation files
        seg_names = [fname for fname in fnames
                     if fname.endswith(suffix_seg + ".tif")]

        # Exclude files not in select_IDs
        if select_IDs != 'all':
            seg_names = [fname for fname in seg_names
                         if any(fname.startswith(ID) for ID in select_IDs)]

        # Get IDs (the first 10 characters of the file name)
        seg_IDs = [fname[:10] for fname in seg_names]

        # Get matching intensity files
        int_names = []
        for ID in seg_IDs:
            matches = [fname for fname in fnames
                       if fname.startswith(ID)
                       and fname.endswith(suffix_int + ".tif")]
            if not matches:
                raise IOError("Could not find matching intensity file for " +
                              "segmentation file with ID " + ID)
            int_names.append(matches[0])

        # Create path
        seg_paths = [os.path.join(dirpath, name) for name in seg_names]
        int_paths = [os.path.join(dirpath, name) for name in int_names]

        # Return results
        return list(zip(seg_paths, int_paths))

    # Remove .tif if it was specified with the suffix
    if suffix_seg.endswith(".tif"):
        suffix_seg = suffix_seg[:-4]
    if suffix_int.endswith(".tif"):
        suffix_int = suffix_int[:-4]

    if recurse:
        # Run for multiple subdirs
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += prepare_fpaths(dpath, fnames)
    else:
        # Run for single dir
        fpaths = prepare_fpaths(dirpath, os.listdir(dirpath))

    # Test if all samples in select_IDs are present
    if select_IDs != 'all':
        fpaths_IDs = [os.path.split(fp[0])[1][:10] for fp in fpaths]
        orphan_IDs = [ID for ID in select_IDs if ID not in fpaths_IDs]
        if any(orphan_IDs):
            warn("No matching files found for some of the IDs in " +
                 "select_IDs: " + ", ".join(orphan_IDs))

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Handle processes
    if processes is None:
        # Never go below 1: a pool of 0 workers cannot be created.
        processes = max(1, cpu_count() // 2)

    # More checks
    if not compute_TFOR and not compute_CFOR:
        raise IOError("At least one of compute_TFOR or compute_CFOR must be " +
                      "set to True.")

    # Report
    # (prints use a single pre-formatted string so that the output is the same
    #  whether `print` is a statement or a function)
    if verbose:
        print("Detected %s target file pairs." % len(fpaths))

    #--------------------------------------------------------------------------

    ### Prepare kwargs for landmark assignment

    # Default kwargs for landmark assignment
    la_kwargs = dict()
    la_kwargs['save_centroids'] = True
    la_kwargs['fpath_out'] = None
    la_kwargs['show_cells'] = None
    la_kwargs['verbose'] = False
    la_kwargs['global_prep_func'] = None
    la_kwargs['global_prep_params'] = None
    la_kwargs['local_prep_func'] = None
    la_kwargs['local_prep_params'] = None
    la_kwargs['landmark_func'] = 'default'
    la_kwargs['landmark_func_params'] = None

    # User-specified kwargs for landmark assignment
    if assign_landmarks_kwargs != 'default':
        la_kwargs.update(assign_landmarks_kwargs)

    # Safety check
    if la_kwargs['fpath_out'] is not None:
        raise IOError("`assign_landmarks_kwargs['fpath_out']` must be set to " +
                      "`None`, otherwise files will overwrite each other.")

    #--------------------------------------------------------------------------

    ### Prepare kwargs for TFOR transformation

    # Default kwargs for transformation to TFOR
    TFOR_kwargs = dict()
    TFOR_kwargs['n_points'] = 3000
    TFOR_kwargs['verbose'] = False
    TFOR_kwargs['show'] = False

    # User-specified kwargs for TFOR
    if transform_to_TFOR_kwargs != 'default':
        TFOR_kwargs.update(transform_to_TFOR_kwargs)

    # Safety check
    # (compared by value; identity comparison against a string literal is
    #  implementation-dependent)
    if not compute_TFOR and transform_to_TFOR_kwargs != 'default':
        warn("Non-default kwargs were passed for transformation to TFOR but " +
             "compute_TFOR is set to False!")

    #--------------------------------------------------------------------------

    ### Prepare args for CBE

    # Handle differing clustering inputs for TFOR and CFOR
    if isinstance(clustering[0], tuple):
        clustering_TFOR = (clustering[0][0], clustering[1][0])
        clustering_cfor = (clustering[0][1], clustering[1][1])
    else:
        clustering_TFOR = clustering_cfor = clustering

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on TFOR

    # Default kwargs for CBE
    cbe_TFOR_kwargs = dict()
    cbe_TFOR_kwargs['normalize_vol'] = None
    cbe_TFOR_kwargs['presample'] = None
    cbe_TFOR_kwargs['cfor'] = None
    cbe_TFOR_kwargs['standardize'] = False
    cbe_TFOR_kwargs['custom_feature_funcs'] = None
    cbe_TFOR_kwargs['dask_graph_path'] = None
    cbe_TFOR_kwargs['processes'] = processes
    cbe_TFOR_kwargs['profiling'] = False
    cbe_TFOR_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_TFOR_kwargs['save_metadata'] = True
    cbe_TFOR_kwargs['save_presampled'] = False
    cbe_TFOR_kwargs['save_cfor'] = False
    cbe_TFOR_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_TFOR_kwargs != 'default':
        cbe_TFOR_kwargs.update(perform_CBE_TFOR_kwargs)

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on CFOR

    # Default kwargs for CBE
    cbe_cfor_kwargs = dict()
    cbe_cfor_kwargs['normalize_vol'] = True
    cbe_cfor_kwargs['presample'] = None
    cbe_cfor_kwargs['cfor'] = ('PD', 3)
    cbe_cfor_kwargs['standardize'] = True
    cbe_cfor_kwargs['custom_feature_funcs'] = None
    cbe_cfor_kwargs['dask_graph_path'] = None
    cbe_cfor_kwargs['processes'] = processes
    cbe_cfor_kwargs['profiling'] = False
    cbe_cfor_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_cfor_kwargs['save_metadata'] = True
    cbe_cfor_kwargs['save_presampled'] = False
    cbe_cfor_kwargs['save_cfor'] = True
    cbe_cfor_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_CFOR_kwargs != 'default':
        cbe_cfor_kwargs.update(perform_CBE_CFOR_kwargs)

    #--------------------------------------------------------------------------

    ### If desired: run sequentially

    if processes == 1:

        if verbose:
            print("Processing target file pairs sequentially...")

        # Landmark extraction
        if verbose:
            print("--Assigning landmarks...")
        fpaths_lm = []
        for seg_path, int_path in fpaths:
            assign_landmarks(seg_path, int_path, num_LMs, **la_kwargs)
            fpaths_lm.append((seg_path, int_path[:-4] + "_LMs.npy"))

        # Computing the TFOR and performing CBE on TFOR
        if compute_TFOR:

            # Run the transformation to TFOR
            if verbose:
                print("--Transforming to TFOR...")
            fpaths_TFOR = []
            for seg_path, lm_path in fpaths_lm:
                transform_to_TFOR(seg_path, lm_path, **TFOR_kwargs)
                fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

            # Performing CBE on TFOR
            if verbose:
                print("--Performing CBE on TFOR...")
            cbe(fpaths_TFOR, downsample, clustering_TFOR, features,
                **cbe_TFOR_kwargs)

        # Performing CBE on CFOR
        if compute_CFOR:
            if verbose:
                print("--Performing CBE on CFOR...")
            lm_paths = [fpath[1] for fpath in fpaths_lm]
            cbe(lm_paths, downsample, clustering_cfor, features,
                **cbe_cfor_kwargs)

        # Done
        if verbose:
            print("Processing complete!")
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    dask_graph = dict()

    # The callables are identical for every sample, so build them once.
    # The `*_await` wrappers take a leading dummy argument: dask fills it with
    # the result of the upstream node(s), which enforces execution order.
    asgn_lms = partial(assign_landmarks, **la_kwargs)
    tf2TFOR = partial(transform_to_TFOR, **TFOR_kwargs)
    cbe_TFOR = partial(cbe, **cbe_TFOR_kwargs)
    cbe_cfor = partial(cbe, **cbe_cfor_kwargs)

    def tf2TFOR_await(_, s, lmp):
        return tf2TFOR(s, lmp)

    def cbe_TFOR_await(_, lmp, ds, cl, fe):
        return cbe_TFOR(lmp, ds, cl, fe)

    def cbe_cfor_await(_, lmp, ds, cl, fe):
        return cbe_cfor(lmp, ds, cl, fe)

    # For each input...
    fpaths_lm = []
    fpaths_TFOR = []
    for idx, (seg_path, int_path) in enumerate(fpaths):

        # Landmark extraction nodes
        dask_graph["asgn_lms_%i" % idx] = (asgn_lms, seg_path, int_path,
                                           num_LMs)
        lm_path = int_path[:-4] + "_LMs.npy"
        fpaths_lm.append(lm_path)

        # Transform to TFOR
        if compute_TFOR:
            dask_graph["tf2TFOR_%i" % idx] = (tf2TFOR_await,
                                              "asgn_lms_%i" % idx,
                                              seg_path, lm_path)
            fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

    # Perform CBE on TFOR
    if compute_TFOR:
        dask_graph["CBE_TFOR"] = (cbe_TFOR_await,
                                  ["tf2TFOR_%i" % i
                                   for i in range(len(fpaths))],
                                  fpaths_TFOR, downsample,
                                  clustering_TFOR, features)

    # Perform CBE on CFOR
    if compute_CFOR:

        # Don't parallelize CBEs; wait for TFOR-CBE to finish
        if compute_TFOR:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, "CBE_TFOR",
                                      fpaths_lm, downsample,
                                      clustering_cfor, features)
        else:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await,
                                      ["asgn_lms_%i" % i
                                       for i in range(len(fpaths))],
                                      fpaths_lm, downsample,
                                      clustering_cfor, features)

    # Create dask graph
    if dask_graph_path is not None:
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # Report
    if verbose:
        print("Processing target file pairs in parallel...")

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # CBE_CFOR (if present) is downstream of everything else in the graph
    final_node = 'CBE_CFOR' if compute_CFOR else 'CBE_TFOR'

    with ProgressBar(dt=1):
        if profiling:
            # Run the pipeline (with resource profiling)
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, final_node)
        else:
            # Run the pipeline (no profiling)
            dask.threaded.get(dask_graph, final_node)

    if profiling:
        visualize([prof, rprof], save=False)

    # Report and return
    if verbose:
        print("Processing complete!")
    return
# subtract mean axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:]) b1 -= b1.mean(axis=axes, keepdims=True) b2 -= b2.mean(axis=axes, keepdims=True) # numerator of corrcoef numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False) # denomenator of corrcoef dof = np.prod( b1.shape[slice(axes[0], axes[-1]+1)] ) b1_std = np.sqrt( (b1**2).mean(axis=axes, keepdims=False) / dof ) b2_std = np.sqrt( (b2**2).mean(axis=axes, keepdims=False) / dof ) denominator = np.multiply(b1_std, b2_std) # divide out = np.divide(numerator, denominator) return out if __name__ == '__main__': f1 = h5py.File("test.h5", "r") f2 = h5py.File("test2.h5", "r") arr1 = da.from_array(f1["arr"]) arr2 = da.from_array(f2["arr"]) block_shape = (10, 10) with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof,\ ProgressBar(): out = da.map_blocks(corrcoef, arr1, arr2, block_shape, chunks=(400, 400)) da.to_hdf5("out.h5", "/arr", out) visualize([prof, rprof])
return tics[0] with CacheProfiler(nbytes) as cprof: get(dsk2, 'c') results = cprof.results assert tics[-1] == len(results) assert tics[-1] == results[-1].metric assert cprof._metric_name == 'nbytes' assert CacheProfiler(metric=nbytes, metric_name='foo')._metric_name == 'foo' @pytest.mark.parametrize( 'profiler', [Profiler, pytest.param(lambda: ResourceProfiler(dt=0.01), marks=pytest.mark.skipif("not psutil")), CacheProfiler]) def test_register(profiler): prof = profiler() try: prof.register() get(dsk2, 'c') n = len(prof.results) assert n > 0 get(dsk2, 'c') assert len(prof.results) > n finally: prof.unregister()