def _visualize(self, optimize_graph=False):
    from dask.dot import dot_graph
    from .optimize import optimize
    if optimize_graph:
        dot_graph(optimize(self.dask, self._keys()))
    else:
        dot_graph(self.dask)
def visualize(self, optimize_graph=False, **kwargs):
    """Visualize the dask as a graph"""
    from dask.dot import dot_graph
    if optimize_graph:
        # `optimize` is assumed to be available at module level here
        # (cf. the explicit `from .optimize import optimize` in the
        # `_visualize` variant above).
        return dot_graph(optimize(self.dask, self.key), **kwargs)
    else:
        return dot_graph(self.dask, **kwargs)
def printGraph(dsk, outfile):
    '''Render a dask graph to a PDF.

    `outfile` is the output file name without extension; a PDF will be made.
    Make sure you have two packages installed: python-graphviz (the Python
    bindings to graphviz) and graphviz (the system library). Try
    `pip install graphviz` for the bindings and `conda install graphviz` for
    the library. If you don't use conda, or the conda library install fails,
    install it from `http://www.graphviz.org/`.
    '''
    dot_graph(dsk, filename=outfile, format='pdf')
    print("output image in {}.pdf".format(outfile))
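# A hypothetical usage sketch for `printGraph` above; the toy graph and the
# `inc` helper are illustrative only, and `dot_graph` is assumed to be
# imported at module level (`from dask.dot import dot_graph`), as `printGraph`
# relies on it.
from operator import add

from dask.dot import dot_graph


def inc(x):
    return x + 1


# A tiny hand-written dask graph: keys map to values or (callable, *args) tasks.
dsk = {'a': 1,
       'b': (inc, 'a'),
       'c': (add, 'a', 'b')}

printGraph(dsk, 'toy_graph')  # writes toy_graph.pdf via graphviz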
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
    except (ImportError, AttributeError):
        pass
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)
def test_dot_graph_no_filename(tmpdir, format, typ):
    before = tmpdir.listdir()
    result = dot_graph(dsk, filename=None, format=format)
    # We shouldn't write any files if filename is None.
    after = tmpdir.listdir()
    assert before == after
    assert isinstance(result, typ)
def run_all(values, base, get=get_proc, num_workers=4):
    full_dask = toolz.merge(val.dask for val in values)
    full_keys = [val._key for val in values]

    cache = {}
    if exists("{}.cache".format(base["prefix"])):
        with open("{}.cache".format(base["prefix"]), "r") as f:
            cache = json.load(f)
    full_dask.update(cache)

    dot_graph(full_dask)

    with ProgressBar(), NekCallback(base) as rprof:
        res = get(full_dask, full_keys, cache=cache, num_workers=num_workers)

    return res
def visualize(self, filename=None, optimize_graph=False):
    from dask.dot import dot_graph
    if optimize_graph:
        dsk = self._optimize(self.dask, self._keys())
    else:
        dsk = self.dask
    return dot_graph(dsk, filename=filename)
def test_dot_graph_no_filename(tmpdir):
    # Map from format extension to expected return type.
    result_types = {"png": Image, "jpeg": Image, "dot": type(None),
                    "pdf": type(None), "svg": SVG}
    for format in result_types:
        before = tmpdir.listdir()
        result = dot_graph(dsk, filename=None, format=format)
        # We shouldn't write any files if filename is None.
        after = tmpdir.listdir()
        assert before == after
        assert isinstance(result, result_types[format])
def visualize(*args, **kwargs): """ Visualize several dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- dsk : dict(s) or collection(s) The dask graph(s) to visualize. filename : str or None, optional The name (without an extension) of the file to write to disk. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. **kwargs Additional keyword arguments to forward to ``to_graphviz``. Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. See also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: http://dask.pydata.org/en/latest/optimize.html """ dsks = [arg for arg in args if isinstance(arg, dict)] args = [arg for arg in args if isinstance(arg, Base)] filename = kwargs.pop('filename', 'mydask') optimize_graph = kwargs.pop('optimize_graph', False) from dask.dot import dot_graph if optimize_graph: dsks.extend([ optimization_function(arg)(ensure_dict(arg.dask), arg._keys()) for arg in args ]) else: dsks.extend([arg.dask for arg in args]) dsk = merge(dsks) return dot_graph(dsk, filename=filename, **kwargs)
def visualize(*args, **kwargs):
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)

    from dask.dot import dot_graph
    if optimize_graph:
        dsks = [arg._optimize(arg.dask, arg._keys()) for arg in args]
    else:
        dsks = [arg.dask for arg in args]
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
def visualize(*args, **kwargs):
    filename = kwargs.get('filename', 'mydask')
    optimize_graph = kwargs.get('optimize_graph', False)

    from dask.dot import dot_graph
    if optimize_graph:
        dsks = [arg._optimize(arg.dask, arg._keys()) for arg in args]
    else:
        dsks = [arg.dask for arg in args]
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename)
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        i = dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
        assert isinstance(i, Image)
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)

    fn = 'mydask'  # default, remove existing files
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    for f in fns:
        if os.path.exists(f):
            os.remove(f)

    i = dot_graph(dsk, filename=None)
    assert all(not os.path.exists(f) for f in fns)
    assert isinstance(i, Image)
def visualize(*args, **kwargs): """ Visualize several dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- dsk : dict(s) or collection(s) The dask graph(s) to visualize. filename : str or None, optional The name (without an extension) of the file to write to disk. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. **kwargs Additional keyword arguments to forward to ``to_graphviz``. Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. See also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: http://dask.pydata.org/en/latest/optimize.html """ dsks = [arg for arg in args if isinstance(arg, dict)] args = [arg for arg in args if isinstance(arg, Base)] filename = kwargs.pop('filename', 'mydask') optimize_graph = kwargs.pop('optimize_graph', False) from dask.dot import dot_graph if optimize_graph: dsks.extend([arg._optimize(dict(arg.dask), arg._keys()) for arg in args]) else: dsks.extend([arg.dask for arg in args]) dsk = merge(dsks) return dot_graph(dsk, filename=filename, **kwargs)
def visualize(*args, **kwargs):
    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if isinstance(arg, Base)]
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)

    from dask.dot import dot_graph
    if optimize_graph:
        dsks.extend([arg._optimize(arg.dask, arg._keys()) for arg in args])
    else:
        dsks.extend([arg.dask for arg in args])
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
def test_dot_graph_defaults():
    # Test with default args.
    default_name = 'mydask'
    default_format = 'png'
    target = '.'.join([default_name, default_format])

    ensure_not_exists(target)
    try:
        result = dot_graph(dsk)
        assert os.path.isfile(target)
        assert isinstance(result, Image)
    finally:
        ensure_not_exists(target)
def test_tsqr(create_func):
    mat, data = create_func()
    n = mat.shape[1]

    q, r = csnmf.tsqr.qr(data)

    dot_graph(q.dask, filename='q')
    dot_graph(r.dask, filename='r')

    print q.shape
    q = np.array(q)
    r = np.array(r)
    print r.shape

    print np.linalg.norm(mat - np.dot(q, r))

    assert np.allclose(mat, np.dot(q, r))
    assert np.allclose(np.eye(n, n), np.dot(q.T, q))
    assert np.all(r == np.triu(r))

    plt.figure()
    plt.subplot(2, 4, 1)
    plt.imshow(mat, interpolation='nearest')
    plt.title('Original matrix')
    plt.subplot(2, 4, 2)
    plt.imshow(q, interpolation='nearest')
    plt.title('$\mathbf{Q}$')
    plt.subplot(2, 4, 3)
    plt.imshow(np.dot(q.T, q), interpolation='nearest')
    plt.title('$\mathbf{Q}^T \mathbf{Q}$')
    plt.subplot(2, 4, 4)
    plt.imshow(r, interpolation='nearest')
    plt.title('$\mathbf{R}$')
    plt.subplot(2, 4, 8)
    plt.spy(r)
    plt.title('Nonzeros in $\mathbf{R}$')
def test_filenames_and_formats():
    # Test with a variety of user provided args
    filenames = ["mydaskpdf", "mydask.pdf", "mydask.pdf", "mydaskpdf"]
    formats = ["svg", None, "svg", None]
    targets = ["mydaskpdf.svg", "mydask.pdf", "mydask.pdf.svg", "mydaskpdf.png"]

    result_types = {"png": Image, "jpeg": Image, "dot": type(None),
                    "pdf": type(None), "svg": SVG}

    for filename, format, target in zip(filenames, formats, targets):
        expected_result_type = result_types[target.split(".")[-1]]
        result = dot_graph(dsk, filename=filename, format=format)
        assert os.path.isfile(target)
        assert isinstance(result, expected_result_type)
        ensure_not_exists(target)
def test_dot_graph(tmpdir, format, typ):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    target = '.'.join([filename, format])
    ensure_not_exists(target)
    try:
        result = dot_graph(dsk, filename=filename, format=format)
        assert not os.path.exists('should_not_get_created.txt')
        assert os.path.isfile(target)
        assert isinstance(result, typ)
    finally:
        ensure_not_exists(target)
def test_dot_graph_no_filename(tmpdir):
    # Map from format extension to expected return type.
    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }
    for format in result_types:
        before = tmpdir.listdir()
        result = dot_graph(dsk, filename=None, format=format)
        # We shouldn't write any files if filename is None.
        after = tmpdir.listdir()
        assert before == after
        assert isinstance(result, result_types[format])
def test_dot_graph(tmpdir):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join("$(touch should_not_get_created.txt)"))

    # Map from format extension to expected return type.
    result_types = {"png": Image, "jpeg": Image, "dot": type(None),
                    "pdf": type(None), "svg": SVG}

    for format in result_types:
        target = ".".join([filename, format])
        ensure_not_exists(target)
        try:
            result = dot_graph(dsk, filename=filename, format=format)
            assert not os.path.exists("should_not_get_created.txt")
            assert os.path.isfile(target)
            assert isinstance(result, result_types[format])
        finally:
            ensure_not_exists(target)
def test_filenames_and_formats():
    # Test with a variety of user provided args
    filenames = ['mydaskpdf', 'mydask.pdf', 'mydask.pdf', 'mydaskpdf',
                 'mydask.pdf.svg']
    formats = ['svg', None, 'svg', None, None]
    targets = ['mydaskpdf.svg', 'mydask.pdf', 'mydask.pdf.svg', 'mydaskpdf.png',
               'mydask.pdf.svg']

    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }

    for filename, format, target in zip(filenames, formats, targets):
        expected_result_type = result_types[target.split('.')[-1]]
        result = dot_graph(dsk, filename=filename, format=format)
        assert os.path.isfile(target)
        assert isinstance(result, expected_result_type)
        ensure_not_exists(target)
def test_dot_graph(tmpdir):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    # Map from format extension to expected return type.
    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }
    for format in result_types:
        target = '.'.join([filename, format])
        ensure_not_exists(target)
        try:
            result = dot_graph(dsk, filename=filename, format=format)
            assert not os.path.exists('should_not_get_created.txt')
            assert os.path.isfile(target)
            assert isinstance(result, result_types[format])
        finally:
            ensure_not_exists(target)
def visualize(dsk, state, filename='dask'):
    """Visualize state of computation as dot graph"""
    from dask.dot import dot_graph
    data, func = color_nodes(dsk, state)
    dot_graph(dsk, filename=filename, data_attributes=data,
              func_attributes=func)
# %% [markdown]
# ### The following calculation uses numpy, so it releases the GIL

# %%
result = (da_input**2. + da_input**3.).mean(axis=0)
result

# %% [markdown]
# ### Note that result hasn't been computed yet
#
# Here is a graph of how the calculation will be split among 4 threads

# %%
from dask.dot import dot_graph
dot_graph(result.dask)

# %% [markdown]
# ### Now do the calculation

# %%
with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof,\
        CacheProfiler() as cprof:
    answer = result.compute()

# %% [markdown]
# Visualize the cpu, memory and cache for the 4 threads

# %%
visualize([prof, rprof, cprof], min_border_top=15, min_border_bottom=15)
def test_filenames_and_formats(tmpdir, filename, format, target, expected_result_type):
    result = dot_graph(dsk, filename=str(tmpdir.join(filename)), format=format)
    assert tmpdir.join(target).exists()
    assert isinstance(result, expected_result_type)
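# The @pytest.mark.parametrize decorator for the test above is not part of the
# snippet; a plausible reconstruction, based on the filename/format/target
# combinations exercised by the older loop-based versions of this test, might
# look like the following (the exact order and cases are an assumption):
import pytest

@pytest.mark.parametrize(
    'filename,format,target,expected_result_type',
    [
        ('mydaskpdf', 'svg', 'mydaskpdf.svg', SVG),
        ('mydask.pdf', None, 'mydask.pdf', type(None)),
        ('mydask.pdf', 'svg', 'mydask.pdf.svg', SVG),
        ('mydaskpdf', None, 'mydaskpdf.png', Image),
        ('mydask.pdf.svg', None, 'mydask.pdf.svg', SVG),
    ],
)
def test_filenames_and_formats(tmpdir, filename, format, target, expected_result_type):
    ...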
def vis(blocker_list):
    _b = convert_ldicts_to_sdict(blocker_list)
    dot_graph(_b)
def feature_extraction(dirpath, suffix_seg, suffix_int, num_LMs, downsample, clustering, features, recurse=False, select_IDs='all', assign_landmarks_kwargs='default', compute_TFOR=True, transform_to_TFOR_kwargs='default', perform_CBE_TFOR_kwargs='default', compute_CFOR=True, perform_CBE_CFOR_kwargs='default', processes=None, dask_graph_path=None, profiling=False, verbose=False): """Extract latent features from fluorescence distributions of single-cell segmentations by point cloud sampling and cluster-based embedding. This is a dask pipeline that applies point-cloud sampling from `katachi.tools.assign_landmars`, transformation to the TFOR (optional) from `katachi.tools.find_TFOR` and cluster-based embedding (either on TFOR data or by constructing a CFOR, or both) from `katachi.tools.perform_CBE` to a dataset of single-cell segmentations that has been generated by `katachi.pipelines.segmentation` or an equivalent approach. WARNING: Not all options provided by this pipeline have been extensively tested. Use with prudence! Parameters ---------- dirpath : string The path (either local from cwd or global) to the directory with the input data to be processed. suffix_seg : string File suffix that identifies target segmentation files as produced by `katachi.pipelines.segmentation`. This will usually be "seg.tif" but could contain more information to distinguish different segmentations. suffix_int : string File suffix that identifies target intensity files matching the shape of the target segmentation files. Each retrieved segmentation file must have a matching intensity file. num_LMs : int The number of landmarks to extract for each cell. downsample : tuple (algorithm, output_size) or None A tuple specifying the algorithm to use for downsampling of the merged point cloud prior to cluster extraction. See `katachi.tools.perform_CBE` for more information. clustering : tuple (algorithm, n_clusters) A tuple specifying the algorithm to use for computing the clusters to use in cluster-based feature extraction. See `katachi.tools.perform_CBE` for more information. Special case: both elements of clustering (i.e. `algorithm` and `n_clusters`) may themselves be tuples. In this case, their first and second elements will be used in CBE on TFOR and CFOR, respectively. features : list of strings List containing any number of cluster features to be extracted. See `katachi.tools.perform_CBE` for more information. recurse : bool, optional, default False If True, files are searched recursively in the subdirs of fpath. select_IDs : 'all' or list of strings, optional, default 'all' If 'all' (default), all detected input files (i.e. all samples) are used. Instead, a list of strings containing IDs (as assigned by `katachi.tools.initialize`) can be passed, in which case only samples whose IDs are in the list are used. If there are IDs in the list for which no matching files were found, a warning is shown. assign_landmarks_kwargs : dict or 'default', optional, default 'default' Dictionary specifying kwargs for assign_landmarks function. See `katachi.tools.assign_landmarks.assign_landmarks` for information about available options. See section "Prepare kwargs for landmark assignment" in this function for information on default settings. compute_TFOR : bool, optional, default True If True, the prim frame of reference is computed and CBE is performed on the TFOR landmark data. At least one of compute_TFOR or compute_CFOR must be set to True. 
transform_to_TFOR_kwargs : dict or 'default', optional, default 'default' Dictionary specifying kwargs for transform_to_TFOR function. See `katachi.tools.find_TFOR.transform_to_TFOR` for information about available options. See section "Prepare kwargs for transformation to TFOR" in this function for information on default settings. perform_CBE_TFOR_kwargs : dict or 'default', optional, default 'default' Dictionary specifying kwargs for cbe function applied to TFOR. See `katachi.tools.perform_CBE.cbe` for information about available options. See section "Prepare kwargs for CBE on TFOR" in this function for information on default settings. compute_CFOR : bool, optional, default True If True, the cell frame of reference is computed and CBE is performed on the CFOR landmark data. At least one of compute_TFOR or compute_CFOR must be set to True. perform_CBE_CFOR_kwargs : dict or 'default', optional, default 'default' Dictionary specifying kwargs for cbe function applied to CFOR. See `katachi.tools.perform_CBE.cbe` for information about available options. See section "Prepare kwargs for CBE on CFOR" in this function for information on default settings. processes : int or None, optional Number of processes dask may use for parallel processing. If None, half of the available CPUs are used. If set to 1, the entire code is run sequentially (but dask is still required for CBE!). dask_graph_path : string or None, optional, default None If a path (including a file ending matching a known image format, such as '.png') is specified as a string, a dask graph image is created that shows the constructed dask pipeline. Note: The resulting graph may get very large if many samples are used at the same time. profiling: bool, optional, default False If True, dask resource profiling is performed and visualized after the pipeline run is finished. This may generate a `profile.html` file in the working directory [bug in dask]. verbose : bool, optional, default False If True, more information is printed. 
""" #-------------------------------------------------------------------------- ### Get a list of files to run # Function to select pairs of files (seg, dir) and create paths def prepare_fpaths(dirpath, fnames): # Find segmentation files seg_names = [ fname for fname in fnames if fname.endswith(suffix_seg + ".tif") ] # Exclude files not in select_IDs if not select_IDs == 'all': seg_names = [ fname for fname in seg_names if any([fname.startswith(ID) for ID in select_IDs]) ] # Get IDs seg_IDs = [fname[:10] for fname in seg_names] # Get matching intensity files int_names = [] for ID in seg_IDs: int_name = [ fname for fname in fnames if fname.startswith(ID) and fname.endswith(suffix_int + ".tif") ] try: int_names.append(int_name[0]) except IndexError: raise IOError("Could not find matching intensity file for " + "segmentation file with ID " + ID) # Create path seg_paths = [os.path.join(dirpath, name) for name in seg_names] int_paths = [os.path.join(dirpath, name) for name in int_names] # Return results return [(seg_paths[i], int_paths[i]) for i in range(len(seg_paths))] # Remove .tif if it was specified with the suffix if suffix_seg.endswith(".tif"): suffix_seg = suffix_seg[:-4] if suffix_int.endswith(".tif"): suffix_int = suffix_int[:-4] # Run for single dir if not recurse: fnames = os.listdir(dirpath) fpaths = prepare_fpaths(dirpath, fnames) # Run for multiple subdirs if recurse: fpaths = [] for dpath, _, fnames in os.walk(dirpath): fpaths += prepare_fpaths(dpath, fnames) # Test if all samples in select_IDs are present if not select_IDs == 'all': fpaths_IDs = [os.path.split(fp[0])[1][:10] for fp in fpaths] orphan_IDs = [ID for ID in select_IDs if ID not in fpaths_IDs] if any(orphan_IDs): warn( "No matching files found for some of the IDs in select_IDs: " + ", ".join(orphan_IDs)) # Check if len(fpaths) == 0: raise IOError("No matching files found in target directory.") # Handle processes if processes is None: processes = cpu_count() // 2 # More checks if not compute_TFOR and not compute_CFOR: raise IOError("At least one of compute_TFOR or compute_CFOR must be " + "set to True.") # Report if verbose: print "Detected", len(fpaths), "target file pairs." 
#-------------------------------------------------------------------------- ### Prepare kwargs for landmark assignment # Default kwargs for landmark assignment la_kwargs = dict() la_kwargs['save_centroids'] = True la_kwargs['fpath_out'] = None la_kwargs['show_cells'] = None la_kwargs['verbose'] = False la_kwargs['global_prep_func'] = None la_kwargs['global_prep_params'] = None la_kwargs['local_prep_func'] = None la_kwargs['local_prep_params'] = None la_kwargs['landmark_func'] = 'default' la_kwargs['landmark_func_params'] = None # User-specified kwargs for landmark assignment if assign_landmarks_kwargs != 'default': for kw in assign_landmarks_kwargs.keys(): la_kwargs[kw] = assign_landmarks_kwargs[kw] # Safety check if la_kwargs['fpath_out'] is not None: raise IOError( "`assign_landmarks_kwargs['fpath_out']` must be set to " + "`None`, otherwise files will overwrite each other.") #-------------------------------------------------------------------------- ### Prepare kwargs for TFOR transformation # Default kwargs for transformation to TFOR TFOR_kwargs = dict() TFOR_kwargs['n_points'] = 3000 TFOR_kwargs['verbose'] = False TFOR_kwargs['show'] = False # User-specified kwargs for TFOR if transform_to_TFOR_kwargs != 'default': for kw in transform_to_TFOR_kwargs.keys(): TFOR_kwargs[kw] = transform_to_TFOR_kwargs[kw] # Safety check if not compute_TFOR and transform_to_TFOR_kwargs is not 'default': warn("Non-default kwargs were passed for transformation to TFOR but " + "compute_TFOR is set to False!") #-------------------------------------------------------------------------- ### Prepare args for CBE # Handle differing clustering inputs for TFOR and CFOR if type(clustering[0]) == tuple: clustering_TFOR = (clustering[0][0], clustering[1][0]) clustering_cfor = (clustering[0][1], clustering[1][1]) else: clustering_TFOR = clustering_cfor = clustering #-------------------------------------------------------------------------- ### Prepare kwargs for CBE on TFOR # Default kwargs for CBE cbe_TFOR_kwargs = dict() cbe_TFOR_kwargs['normalize_vol'] = None cbe_TFOR_kwargs['presample'] = None cbe_TFOR_kwargs['cfor'] = None cbe_TFOR_kwargs['standardize'] = False cbe_TFOR_kwargs['custom_feature_funcs'] = None cbe_TFOR_kwargs['dask_graph_path'] = None cbe_TFOR_kwargs['processes'] = processes cbe_TFOR_kwargs['profiling'] = False cbe_TFOR_kwargs['suffix_out'] = {'META': suffix_int} cbe_TFOR_kwargs['save_metadata'] = True cbe_TFOR_kwargs['save_presampled'] = False cbe_TFOR_kwargs['save_cfor'] = False cbe_TFOR_kwargs['verbose'] = False # User-specified kwargs for CBE if perform_CBE_TFOR_kwargs != 'default': for kw in perform_CBE_TFOR_kwargs.keys(): cbe_TFOR_kwargs[kw] = perform_CBE_TFOR_kwargs[kw] #-------------------------------------------------------------------------- ### Prepare kwargs for CBE on CFOR # Default kwargs for CBE cbe_cfor_kwargs = dict() cbe_cfor_kwargs['normalize_vol'] = True cbe_cfor_kwargs['presample'] = None cbe_cfor_kwargs['cfor'] = ('PD', 3) cbe_cfor_kwargs['standardize'] = True cbe_cfor_kwargs['custom_feature_funcs'] = None cbe_cfor_kwargs['dask_graph_path'] = None cbe_cfor_kwargs['processes'] = processes cbe_cfor_kwargs['profiling'] = False cbe_cfor_kwargs['suffix_out'] = {'META': suffix_int} cbe_cfor_kwargs['save_metadata'] = True cbe_cfor_kwargs['save_presampled'] = False cbe_cfor_kwargs['save_cfor'] = True cbe_cfor_kwargs['verbose'] = False # User-specified kwargs for CBE if perform_CBE_CFOR_kwargs != 'default': for kw in perform_CBE_CFOR_kwargs.keys(): cbe_cfor_kwargs[kw] = 
perform_CBE_CFOR_kwargs[kw] #-------------------------------------------------------------------------- ### If desired: run sequentially if processes == 1: if verbose: print "Processing target file pairs sequentially..." # Landmark extraction if verbose: print "--Assigning landmarks..." fpaths_lm = [] for seg_path, int_path in fpaths: assign_landmarks(seg_path, int_path, num_LMs, **la_kwargs) fpaths_lm.append((seg_path, int_path[:-4] + "_LMs.npy")) # Computing the TFOR and performing CBE on TFOR if compute_TFOR: # Run the transformation to TFOR if verbose: print "--Transforming to TFOR..." fpaths_TFOR = [] for seg_path, lm_path in fpaths_lm: transform_to_TFOR(seg_path, lm_path, **TFOR_kwargs) fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy") # Performing CBE on TFOR if verbose: print "--Performing CBE on TFOR..." cbe(fpaths_TFOR, downsample, clustering_TFOR, features, **cbe_TFOR_kwargs) # Performing CBE on CFOR if compute_CFOR: if verbose: print "--Performing CBE on CFOR..." lm_paths = [fpath[1] for fpath in fpaths_lm] cbe(lm_paths, downsample, clustering_cfor, features, **cbe_cfor_kwargs) # Done if verbose: print "Processing complete!" return #-------------------------------------------------------------------------- ### Prepare dask dict dask_graph = dict() # For each input... fpaths_lm = [] fpaths_TFOR = [] for idx, fpath in enumerate(fpaths): # Landmark extraction nodes seg_path, int_path = fpath asgn_lms = partial(assign_landmarks, **la_kwargs) dask_graph["asgn_lms_%i" % idx] = (asgn_lms, seg_path, int_path, num_LMs) lm_path = int_path[:-4] + "_LMs.npy" fpaths_lm.append(lm_path) # Transform to TFOR if compute_TFOR: # Transform to TFOR tf2TFOR = partial(transform_to_TFOR, **TFOR_kwargs) tf2TFOR_await = lambda _, s, lmp: tf2TFOR(s, lmp) dask_graph["tf2TFOR_%i" % idx] = (tf2TFOR_await, "asgn_lms_%i" % idx, seg_path, lm_path) fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy") # Perform CBE on TFOR if compute_TFOR: cbe_TFOR = partial(cbe, **cbe_TFOR_kwargs) cbe_TFOR_await = lambda _, lmp, ds, cl, fe: cbe_TFOR(lmp, ds, cl, fe) dask_graph["CBE_TFOR"] = (cbe_TFOR_await, [ "tf2TFOR_%i" % idx for idx in range(len(fpaths)) ], fpaths_TFOR, downsample, clustering_TFOR, features) # Perform CBE on CFOR if compute_CFOR: cbe_cfor = partial(cbe, **cbe_cfor_kwargs) cbe_cfor_await = lambda _, lmp, ds, cl, fe: cbe_cfor(lmp, ds, cl, fe) # Don't parallelize CBEs; wait for TFOR-CBE to finish if compute_TFOR: dask_graph["CBE_CFOR"] = (cbe_cfor_await, "CBE_TFOR", fpaths_lm, downsample, clustering_cfor, features) else: dask_graph["CBE_CFOR"] = (cbe_cfor_await, [ "asgn_lms_%i" % idx for idx in range(len(fpaths)) ], fpaths_lm, downsample, clustering_cfor, features) # Create dask graph if dask_graph_path is not None: from dask.dot import dot_graph dot_graph(dask_graph, filename=dask_graph_path) #-------------------------------------------------------------------------- ### Run in parallel (with dask) # Report if verbose: print "Processing target file pairs in parallel..." 
# Set number of threads dask.set_options(pool=ThreadPool(processes)) # Run the pipeline (no profiling) if not profiling: if compute_CFOR: with ProgressBar(dt=1): dask.threaded.get(dask_graph, 'CBE_CFOR') else: with ProgressBar(dt=1): dask.threaded.get(dask_graph, 'CBE_TFOR') # Run the pipeline (with resource profiling) if profiling: if compute_CFOR: with ProgressBar(dt=1): with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof: dask.threaded.get(dask_graph, 'CBE_CFOR') visualize([prof, rprof], save=False) else: with ProgressBar(dt=1): with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof: dask.threaded.get(dask_graph, 'CBE_TFOR') visualize([prof, rprof], save=False) # Report and return if verbose: print "Processing complete!" return
        if np.array_equal(da_new_centroids.compute(), da_centroids.compute()):
            break
        da_centroids = da_new_centroids

    return da_clusters, da_centroids


if __name__ == '__main__':
    # Generate sample data
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=50, centers=centers,
                                cluster_std=0.5, random_state=0)

    result = kmeans(X, k=10)

    dot_graph(result[0].dask, filename='clusters')
    dot_graph(result[1].dask, filename='centroids')

    print("Result:\nClusters")
    print(result[0].compute())
    print("Centroids")
    print(result[1].compute())
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        X.tolist(), result[0].compute().tolist(), metric='euclidean'))
# - Airflow - https://airflow.apache.org/
# - KNIME - https://www.knime.com/
# - Google Tensorflow - https://www.tensorflow.org/
# - Pytorch / Torch - http://pytorch.org/

# # Concrete example
# What is a DAG good for?

# In[32]:

import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((5, 5), chunks=(5, 5))
image_2 = da.ones((5, 5), chunks=(5, 5))
dot_graph(image_1.dask)


# In[33]:

image_3 = image_1 + image_2
dot_graph(image_3.dask)


# In[34]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)


# # Let's go big
# Now let's see where this can be really useful

# In[35]:
def _visualize(self, optimize_graph=False):
    from dask.dot import dot_graph
    if optimize_graph:
        return dot_graph(optimize(self.dask, self._keys()))
    else:
        return dot_graph(self.dask)
def cbe(fpaths_lm, downsample, clustering, features, normalize_vol=False, presample=None, cfor=None, standardize='default', custom_feature_funcs=None, bw_method=None, dask_graph_path=None, processes=None, profiling=False, suffix_out='default', save_metadata=True, save_presampled=False, save_cfor=False, verbose=False, legacy=False): """Create a feature space from a set of point clouds by cluster-based embedding (CBE). This includes the following steps: 1. Loading a set of point clouds 2. Normalizing point clouds by volume (optional) 3. Down-sampling of each point cloud individually (optional) - Available options are random, kmeans or custom downsampling 4. Making point clouds invariant to spatial transformation (optional) - Also called the "Cell Frame Of Reference" (CFOR) - There are currently 2 ways of accomplishing this - Transform to pairwise distance space (PD) - Transform to PCA space (PCA) [DEPRECATED] - It is also possible to pass a custom transform function. 5. Merging point clouds 6. Downsampling of merged point clouds (optional but recommended!) - Reduces computational cost/scaling of subsequent step - Options are density-dep., kmeans, random or custom downsampling 7. Extracting cluster centers as common reference points - Options are kmeans, dbscan and custom clustering 8. Extracting "cluster features" relative to the reference points - Done with dask for effecient chaining of operations - Multiple feature options available, see below 9. Saving the resulting feature space as well as intermediate results Cluster features that can be extracted: - "kNN-distsManh" : Manhatten distance in all dimensions of each cluster to the mean point of its k nearest neighbor landmarks. - "kNN-distEuclid" : Euclidean distance of each cluster to the mean point of its k nearest neighbor landmarks. - "NN-distsManh" : Manhatten distance in all dimensions of each cluster to the nearest neighboring landmark. - "NN-distEuclid" : Euclidean distance of each cluster to the nearest neighboring landmark. - "count-near" : Number of landmarks near to the cluster, where 'near' is the mean distance of the k nearest neighbor landmarks of the cluster. - "count-assigned" : Number of landmarks assigned to the cluster during the clustering itself. - "kde" : KDE estimated from cell landmarks sampled for each cluster center. - custom features : See custom_feature_funcs in parameters. Feature computations are in part dependent on each other. To make this both efficient and readable/elegant, dask is used for chaining the feature extraction steps appropriately. At the end, features are concatenated into a single array of shape (cells, features) and then saved for each input stack separately. Parameters ---------- fpaths_lm : single string or list of strings A path or list of paths (either local from cwd or global) to npy files containing cellular landmarks as generated by `katachi.tools.assign_landmarks` or `...find_TFOR`. downsample : tuple (algorithm, output_size) or None A tuple specifying the algorithm to use for downsampling of the merged point cloud prior to cluster extraction. Available algorithms are "ddds" (density-dependent downsampling), "kmeans" (perform kmeans and use cluster centers as new points) or "random". If "default" is passed, "ddds" is used. Example: ("ddds", 200000). Alternatively, if instead of a string denoting the algorithm a callable is passed, that callable is used for downsampling. 
The call signature is `all_lms_ds = downsample[0](all_lms, downsample)` where all_lms is an array of shape (all_landmarks, dimensions) holding all input landmarks merged into one point cloud. Since the `downsample` tuple itself is passed, additional arguments can be specified in additional elements of that tuple. all_lms_ds must be an array of shape (output_size, dimensions). If None, no downsampling is performed. This is not recommended for inputs of relevant sizes (total landmarks > 20000). WARNING: downsampling (especially by ddds) can be very expensive for large numbers of cells. In those cases, it is recommended to first run a representative subsets of the cells and then use the resulting CBE clusters to extract features for the entire dataset (using the `previous` setting in the `clustering` argument). clustering : tuple (algorithm, n_clusters) A tuple specifying the algorithm to use for computing the clusters to use in cluster-based feature extraction. Available algorithms are "kmeans" or "dbscan". If "default" is passed, "kmeans" is used. Example: ('kmeans', 10) Alternatively, one may pass a tuple `('previous', clustering_object)`, where `clustering_object` is a previously fitted clustering instance similar to an instantiated and fitted sklearn.cluster.KMeans object. It must have the attribute `cluster_centers_`, which is an array of shape (clusters, dimensions) and the method `predict`, which given an array of shape `(all_landmarks, dimensions)` will return cluster labels for each landmark. Clustering objects from previous runs are stored in the metadata under the key `"clustobj-"+identifier`. Alternatively, if instead of a string denoting the algorithm a callable is passed, that callable is used for clustering. The call signature is `clust_labels, clust_centers = clustering[0](all_lms, clustering)` where all_lms is an array of shape (all_landmarks, dimensions) holding all input landmarks merged into one point cloud (and downsampled in the previous step). Since the `clustering` tuple itself is passed, additional arguments can be specified in additional elements of that tuple. `clust_labels` must be a 1D integer array assigning each input landmark to a corresponding cluster center. `clust_centers` must be an array of shape (clusters, dimensions) and contain the coordinates of the cluster centers. The first axis must be ordered such that the integers in `clust_labels` index it correctly. The number of clusters must match n_clusters. features : list of strings List containing any number of cluster features to be extracted. The strings noted in the explanation above are allowed. If custom feature extraction functions are passed (see below), their names must also be included in this list. Example: ["kNN-distEuclid", "count-near"] normalize_vol : bool, optional, default False If True, the volume of each input point cloud is normalized by dividing each landmark vector magnitude by the sum of all magnitudes. presample : tuple (algorithm, output_size) or None, optional, default None If not None, the algorithm specified is used to downsample each input cloud individually to output_size points. Available algorithms are "kmeans" (perform kmeans and use cluster centers as new points) or "random". Example: ('random', 50) Alternatively, if instead of a string denoting the algorithm a callable is passed, that callable is used for downsampling. 
The call signature is ```for cell in range(lms.shape[0]): lms_ds[cell,:,:] = presample[0](lms[cell,:,:], presample)``` where lms is an array of shape (cells, landmarks, dimensions) holding the set of input point clouds. Since the `presample` tuple itself is passed, additional arguments can be specified in additional elements of that tuple. lms_ds must be an array of shape (cells, output_size, dimensions). If None, no presampling is performed. cfor : tuple (algorithm, dimensions) or None, optional, default None A tuple specifying the algorithm to use for recasting the landmarks in a space that is invariant to spatial transformations. There are two options available: "PD" (pairwise distance transform) and "PCA" (per-cell PCA and transform). For "PD", the total complement of pairwise distances between all points is computed and then subsampled to `dimensions` by selecting a corresponding number of distance percentiles in a linear range between the 10th to the 90th percentile (inclusive). For "PCA", the number of dimensions in the resulting space is equal to the number of dimensions of the input (should be 3). The `dimensions` part of the argument is ignored (but it must still be suplied!). If "default" is passed, "PD" is used. Example 1: ('PD', 6) Example 2: ('default', 6) # defaults to 'PD' Example 3: ('PCA', 3) Alternatively, if a callable is passed instead of a stringm that callable is used for downsampling. The call signature is ```for cell in range(lms.shape[0]): lms_cfor[cell,:,:] = cfor[0](lms[cell,:,:], cfor)``` where lms is an array of shape (cells, landmarks, dimensions) holding the set of input point clouds. Since the `cfor` tuple itself is passed, additional arguments can be specified in additional elements of that tuple. lms_ds must be an array of shape (cells, output_size, dimensions). If None, no transformation is performed; cells are left in the original 3D space. standardize : bool or 'default', optional, default 'default' If True, the point cloud dimensions of the merged CFOR point cloud are standardised to zero mean and unit variance. This is also propagated to the individual clouds used for feature extraction and for saving in case the CFOR is being saved. If 'default', standardization is performed only if cfor is set to "PD". If False, no standardization is performed. custom_feature_funcs : list of tuples or None, optional, default None List used to specify one or more custom feature extraction functions. Each custom function is specified through a tuple in the list that is structured as such: `(feature_name, extraction_func, parent_names, other_params)` where `feature_name` is the name of the feature as it appears in the `features` argument, `extraction_func` is a callable, `parent_names` is a lsit of parent feature names (as they appear in the `features` argument) used as input to `extraction_func`, and `other_params` is a list of other parameters for `extraction_func`. The call signature is ```dask_graph[custom_func[0]+"_%i" % c] = (feature_name, [parent+"_%i" % c for parent in parent_names], other_params, lms[c,:,:], clust_centers, clust_labels[c]) ``` within the dask graph, where `c` is the index of a cell. The callable must therefore accept a list of parent features (can be an empty list), a list of other parameters (can alos be empty), the (preprocessed) landmarks of the given cell, the cluster centers and the cluster labels of the given cell. It must return a 1D array of float values; the feature vector for the current cell `c`. 
bw_method : str, scalar, callable or None, optional, default None The method used to calculate the estimator bandwidth for the gaussian kde when computing the "kde" feature. This can be ‘scott’, ‘silverman’, a scalar constant or a callable. If a scalar, this will be used directly as `kde.factor`. If a callable, it should take a gaussian_kde instance as only parameter and return a scalar. If None (default), ‘scott’ is used. This is ignored if "kde" is not in `features`. < Modified from `scipy.stats.gaussian_kde` doc string. > dask_graph_path : string or None, optional, default None If a path (including a file ending matching a known image format, such as '.png') is specified as a string, a dask graph image is created that summarizes the feature extraction pipeline for the first 3 cells. Note: If the resulting graph contains multiple separate graphs, the only relevant graph is the one leading into `fspace` as an end result. processes : int or None, optional, default None Number of processes to use in multiprocessed and dask-controlled operations. If None, a number equal to half the available PCUs is used. If `1` (one), no multiprocessing is performed and `dask.get` is used instead of `dask.threaded.get`. profiling : bool, optional, default False If True, dask resource profiling is performed and visualized after the pipeline run is finished. This may generate a `profile.html` file in the working directory [bug in dask]. suffix_out : 'default' or dict, optional, default 'default' If 'default', the ouput is saved using '_PRES', '_CFOR', '_DS', and '_CBE' as suffices for the presampled landmarks (if `presample` is not None), for the CFOR-transformed landmarks (if `cfor` is not None), for overlayed downsampling (if `downsample` is not None)(note that this is not saved explicitly but is part of the suffix for the CBE-embedded feature space), and for the CBE-embedded feature space, respectively. The suffices are chained as appropriate. If a dict is passed, each of these suffices can be specified manually using the keys 'PRES', 'CFOR', 'DS', 'CBE' and 'META'. The suffix specified in 'META' is added to all relevant metadata dictionary keys. For any suffices not specified in the suffix_out dict, the 'default' suffix is used. save_metadata : bool, optional, default True If True, cluster samples, cluster labels and a feature header are saved to the metadata of each input stack as appropriate. save_presampled : bool, optional, default False If True, the result of the presampling step is saved with the suffix "PRES" for later use. save_cfor : bool, optional, default False If True, the result of the cfor step is saved with the suffix "CFOR" for later use. verbose : bool, optional, default False If True, more information is printed. legacy : bool, optional, default False If True (and standardize is also set to True), the feature extraction is not performed in standardized space. Instead, the cluster centroids are transformed back to the un-standardized space. Triggers a deprecation warning. """ #-------------------------------------------------------------------------- ### Load data if verbose: print "Loading data..." # Handle cases of single paths if type(fpaths_lm) == str: fpaths_lm = [fpaths_lm] if len(fpaths_lm) == 1: warn( "fpaths_lm specifies only a single path. Usually, multiple paths" + " are specified so that many samples can be overlayed for" + " feature extraction!") # Import the landmark data # Note: The order of fpaths_lm is maintained and an index array is created! 
lms = [] lms_idx = [] for idx, fpath_lm in enumerate(fpaths_lm): try: lms_in = np.load(fpath_lm) lms.append(lms_in) lms_idx += [idx for i in range(lms_in.shape[0])] except: print "Attempting to load landmark data from " + str(fpath_lm), print "failed with this error:" raise lms_idx = np.array(lms_idx, dtype=np.int) lms = np.concatenate(lms) if verbose: print "Total input data shape:", lms.shape # Check if downsampling is specified if downsample is None: warn("It is highly recommended to use downsampling (unless the data " + "set is very small)!") # Handle processes being None if processes is None: processes = cpu_count() // 2 # Handle standardize being default if standardize == 'default': standardize = False if cfor[0] == 'PD': standardize = True # Handle legacy mode if legacy: warn("Running in LEGACY mode! This is DEPRECATED!", DeprecationWarning) #-------------------------------------------------------------------------- ### Normalize volume [per cell] if normalize_vol: if verbose: print "Normalizing volumes..." lms = vol_normalize(lms, verbose=verbose) #-------------------------------------------------------------------------- ### Individual downsampling (presampling) [per cell] if presample is not None: if verbose: print "Presampling..." # Prep lms_ps = np.zeros((lms.shape[0], presample[1], lms.shape[2])) # Random subsampling if presample[0] == 'random': for cell in range(lms.shape[0]): lms_ps[cell, :, :] = ds.random_subsample( lms[cell, :, :], presample[1]) # Kmeans-based downsampling elif presample[0] == 'kmeans': for cell in range(lms.shape[0]): lms_ps[cell, :, :] = ds.kmeans_subsample( lms[cell, :, :], presample[1]) # Custom downsampling function elif callable(presample[0]): for cell in range(lms.shape[0]): lms_ps[cell, :, :] = presample[0](lms[cell, :, :], presample) # Handle other cases else: raise ValueError("Invalid presampling method: " + str(presample[0])) # Assign the downsampled data back lms = lms_ps #-------------------------------------------------------------------------- ### Transform to "Cell Frame Of Reference" (CFOR) [per cell] if cfor is not None: if verbose: print "Transforming to CFOR..." # Prep lms_cfor = np.zeros((lms.shape[0], lms.shape[1], cfor[1])) # Pairwise distance transform if cfor[0] == 'PD' or cfor[0] == 'default': for cell in range(lms.shape[0]): lms_cfor[cell, :, :] = pd_transform(lms[cell, :, :], percentiles=cfor[1]) # PCA transform elif cfor[0] == 'PCA': for cell in range(lms.shape[0]): lms_cfor[cell, :, :] = PCA().fit_transform(lms[cell, :, :]) ## RBF transform by Nystroem embedding ## REMOVED: This does not create matched dimensions and thus cannot be ## used for this purpose. #if cfor[0] == 'RBF': # for cell in range(lms.shape[0]): # Ny = kernel_approximation.Nystroem(kernel='rbf', # gamma=1/lms.shape[1], # n_components=cfor[1], # random_state=42) # lms_cfor[cell,:,:] = Ny.fit_transform(lms[cell,:,:]) # Custom CFOR transform elif callable(cfor[0]): for cell in range(lms.shape[0]): lms_cfor[cell, :, :] = cfor[0](lms[cell, :, :], cfor) # Handle other cases else: raise ValueError("Invalid CFOR method: " + str(cfor[0])) # Assign the CFOR data back lms = lms_cfor #-------------------------------------------------------------------------- ### Collective downsampling (all cells overlayed) [altogether] # Note: This is done to improve cluster retrieval and to make it more # efficient. It does not affect the feature extraction afterwards. 
# Flatten cells of all samples together all_lms = lms.reshape((lms.shape[0] * lms.shape[1], lms.shape[2])) # For CFOR-PD: standardize the dimensions if standardize and not legacy: # Standardize pooled landmarks cloud_means = all_lms.mean(axis=0) cloud_stds = all_lms.std(axis=0) all_lms = (all_lms - cloud_means) / cloud_stds # Overwrite unpooled landmarks for feature extraction in standard space lms = all_lms.reshape((lms.shape[0], lms.shape[1], lms.shape[2])) # Downsampling if downsample is not None and clustering[0] != 'previous': if verbose: print "Downsampling merged cloud..." # Default is density dependent downsampling if downsample[0] == 'default' or downsample[0] == 'ddds': all_lms_ds = ds.ddds(all_lms, downsample[1], presample=downsample[1], processes=processes) # Alternative: kmeans downsampling elif downsample[0] == 'kmeans': all_lms_ds = ds.kmeans_subsample(all_lms, downsample[1]) # Alternative: random downsampling elif downsample[0] == 'random': all_lms_ds = ds.random_subsample(all_lms, downsample[1]) # Custom downsampling elif callable(downsample[0]): all_lms_ds = downsample[0](all_lms, downsample) # Handle other cases else: raise ValueError("Invalid downsampling method: " + str(downsample[0])) # No downsampling else: all_lms_ds = all_lms # LEGACY: Standardization after downsampling and without overwriting the # unpooled landmarks! if legacy and standardize: cloud_means = all_lms_ds.mean(axis=0) cloud_stds = all_lms_ds.std(axis=0) all_lms_ds = (all_lms_ds - cloud_means) / cloud_stds #-------------------------------------------------------------------------- ### Find reference points by clustering [altogether] if verbose: print "Clustering to find reference points..." # Default: kmeans clustering if clustering[0] == 'default' or clustering[0] == 'kmeans': # Perform clustering my_clust = MiniBatchKMeans(n_clusters=clustering[1], random_state=42) my_clust.fit(all_lms_ds) # Get labels and centroids clust_labels = my_clust.labels_ clust_centers = my_clust.cluster_centers_ # Predict labels for whole data set (if downsampled) if downsample is not None: clust_labels = my_clust.predict(all_lms) # To be added: DBSCAN elif clustering[0] == 'dbscan': raise NotImplementedError("And likely never will be...") # Using a given (already fitted) clustering object elif clustering[0] == 'previous': my_clust = clustering[1] clust_centers = my_clust.cluster_centers_ clust_labels = my_clust.predict(all_lms) # Custom alternative elif callable(clustering[0]): clust_labels, clust_centers = clustering[0](all_lms, clustering) # Handle other cases else: raise ValueError("Invalid clustering method: " + str(clustering[0])) # LEGACY: Back-transform of centroids to un-standardized space # In legacy, feature extraction was done on the un-standardized # space, using the back-transformed centroids if legacy and standardize: clust_centers = clust_centers * cloud_stds + cloud_means # Unpool cluster labels clust_labels = clust_labels.reshape((lms.shape[0], lms.shape[1])) #-------------------------------------------------------------------------- ### Extract features relative to reference points [per cell] if verbose: print "Extracting cluster features..." # Init dask graph dask_graph = dict() # For each cell... 
for c in range(lms.shape[0]): # Node to compute kdtree dask_graph["kdtree_%i" % c] = (fe.build_kdtree, lms[c, :, :]) # Nodes for the features dask_graph["kNN-distsManh_%i" % c] = (fe.feature_distsManhatten_kNN, "kdtree_%i" % c, lms[c, :, :], clust_centers) dask_graph["kNN-distEuclid_%i" % c] = (fe.feature_distEuclidean_kNN, "kNN-distsManh_%i" % c, lms.shape[2]) dask_graph["NN-distsManh_%i" % c] = (fe.feature_distsManhatten_NN, "kdtree_%i" % c, lms[c, :, :], clust_centers) dask_graph["NN-distEuclid_%i" % c] = (fe.feature_distEuclidean_NN, "NN-distsManh_%i" % c, lms.shape[2]) dask_graph["count-near_%i" % c] = (fe.feature_count_near, [ "kdtree_%i" % c, "kNN-distEuclid_%i" % c ], lms[c, :, :], clust_centers) dask_graph["count-assigned_%i" % c] = (fe.feature_count_assigned, clust_centers, clust_labels[c]) dask_graph["kde_%i" % c] = (fe.feature_kde, lms[c, :, :], clust_centers, bw_method) # Nodes for custom feature extraction functions if custom_feature_funcs is not None: for custom_func in custom_feature_funcs: custom_parents = [ parent + "_%i" % c for parent in custom_func[2] ] dask_graph[custom_func[0] + "_%i" % c] = (custom_func[1], custom_parents, custom_func[3], lms[c, :, :], clust_centers, clust_labels[c]) # Node to collect requested features for a cell dask_graph["fvector_%i" % c] = (fe.assemble_cell, [f + "_%i" % c for f in features], features) # Render example graph for first 3 cells if c == 2 and dask_graph_path is not None: from dask.dot import dot_graph dask_graph["fspace"] = (fe.assemble_fspace, ["fvector_%i" % c for c in range(3)]) dot_graph(dask_graph, filename=dask_graph_path) # Final node to put per-cell features into a feature space dask_graph["fspace"] = (fe.assemble_fspace, ["fvector_%i" % c for c in range(lms.shape[0])]) # Run without multiprocessing if processes == 1: with ProgressBar(dt=1): fspace, fheader = dask.get(dask_graph, 'fspace') # Run with multiprocessing else: # Set number of threads dask.set_options(pool=ThreadPool(processes)) # Run the pipeline (no profiling) if not profiling: with ProgressBar(dt=1): fspace, fheader = dask.threaded.get(dask_graph, 'fspace') # Run the pipeline (with resource profiling) if profiling: with ProgressBar(dt=1): with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof: fspace, fheader = dask.threaded.get(dask_graph, 'fspace') visualize([prof, rprof], save=False) #-------------------------------------------------------------------------- ### Save [per stack], report and return if verbose: print "Saving result..." # For each stack... 
for sample_idx, sample_fpath in enumerate(fpaths_lm): # Prepare suffix suffix = "" # Save individually downsampled landmark distributions if desired if presample is not None and save_presampled: if suffix_out == 'default' or 'PRES' not in suffix_out.keys(): suffix = suffix + "_PRES" else: suffix = suffix + suffix_out['PRES'] np.save(sample_fpath[:-4] + suffix, lms_ps[lms_idx == sample_idx, :, :]) # Save CFOR if desired if cfor is not None and save_cfor: if suffix_out == 'default' or 'CFOR' not in suffix_out.keys(): suffix = suffix + "_CFOR" else: suffix = suffix + suffix_out['CFOR'] np.save(sample_fpath[:-4] + suffix, lms[lms_idx == sample_idx, :, :]) # Include downsampling in suffix if downsample is not None: if suffix_out == 'default' or 'DS' not in suffix_out.keys(): suffix = suffix + '_DS' else: suffix = suffix + suffix_out['DS'] # Save shape space if suffix_out == 'default' or 'CBE' not in suffix_out.keys(): suffix = suffix + "_CBE" else: suffix = suffix + suffix_out['CBE'] np.save(sample_fpath[:-4] + suffix, fspace[lms_idx == sample_idx, :]) # Save new metadata if save_metadata: # Construct metadata path dirpath, fname = os.path.split(sample_fpath) fpath_meta = os.path.join(dirpath, fname[:10] + "_stack_metadata.pkl") # Open metadata with open(fpath_meta, 'rb') as metafile: meta_dict = pickle.load(metafile) # Prepare metadata suffix if suffix_out == 'default' or 'META' not in suffix_out.keys(): if suffix[0] == '_': m_suffix = suffix[1:] else: m_suffix = suffix else: if suffix[0] == '_': m_suffix = suffix[1:] + suffix_out['META'] else: m_suffix = suffix + suffix_out['META'] # Slightly awkward addition of TFOR tag if 'TFOR' in fpaths_lm[0]: m_suffix = 'TFOR_' + m_suffix # Add new metadata meta_dict["clustobj-" + m_suffix] = my_clust meta_dict["clusters-" + m_suffix] = clust_centers meta_dict["labels-" + m_suffix] = clust_labels[lms_idx == sample_idx] meta_dict["features-" + m_suffix] = fheader # Write metadata with open(fpath_meta, 'wb') as metafile: pickle.dump(meta_dict, metafile, pickle.HIGHEST_PROTOCOL) # Report and return if verbose: print "Processing complete!" return
def visualize( *args, filename="mydask", traverse=True, optimize_graph=False, maxval=None, **kwargs ): """ Visualize several dask graphs simultaneously. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- args : object Any number of objects. If it is a dask collection (for example, a dask DataFrame, Array, Bag, or Delayed), its associated graph will be included in the output of visualize. By default, python builtin collections are also traversed to look for dask objects (for more information see the ``traverse`` keyword). Arguments lacking an associated graph will be ignored. filename : str or None, optional The name of the file to write to disk. If the provided `filename` doesn't include an extension, '.png' will be used by default. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. traverse : bool, optional By default, dask traverses builtin python collections looking for dask objects passed to ``visualize``. For large collections this can be expensive. If none of the arguments contain any dask objects, set ``traverse=False`` to avoid doing this traversal. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. color : {None, 'order', 'ages', 'freed', 'memoryincreases', 'memorydecreases', 'memorypressure'}, optional Options to color nodes. colormap: - None, the default, no colors. - 'order', colors the nodes' border based on the order they appear in the graph. - 'ages', how long the data of a node is held. - 'freed', the number of dependencies released after running a node. - 'memoryincreases', how many more outputs are held after the lifetime of a node. Large values may indicate nodes that should have run later. - 'memorydecreases', how many fewer outputs are held after the lifetime of a node. Large values may indicate nodes that should have run sooner. - 'memorypressure', the number of data held when the node is run (circle), or the data is released (rectangle). maxval : {int, float}, optional Maximum value for colormap to normalize form 0 to 1.0. Default is ``None`` will make it the max number of values collapse_outputs : bool, optional Whether to collapse output boxes, which often have empty labels. Default is False. verbose : bool, optional Whether to label output and input boxes even if the data aren't chunked. Beware: these labels can get very long. Default is False. **kwargs Additional keyword arguments to forward to ``to_graphviz``. Examples -------- >>> x.visualize(filename='dask.pdf') # doctest: +SKIP >>> x.visualize(filename='dask.pdf', color='order') # doctest: +SKIP Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. 
See Also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: https://docs.dask.org/en/latest/optimize.html """ from dask.dot import dot_graph args, _ = unpack_collections(*args, traverse=traverse) dsk = dict(collections_to_dsk(args, optimize_graph=optimize_graph)) color = kwargs.get("color") if color in { "order", "order-age", "order-freed", "order-memoryincreases", "order-memorydecreases", "order-memorypressure", "age", "freed", "memoryincreases", "memorydecreases", "memorypressure", }: import matplotlib.pyplot as plt from dask.order import diagnostics, order o = order(dsk) try: cmap = kwargs.pop("cmap") except KeyError: cmap = plt.cm.RdBu if isinstance(cmap, str): import matplotlib.pyplot as plt cmap = getattr(plt.cm, cmap) def label(x): return str(values[x]) data_values = None if color != "order": info = diagnostics(dsk, o)[0] if color.endswith("age"): values = {key: val.age for key, val in info.items()} elif color.endswith("freed"): values = {key: val.num_dependencies_freed for key, val in info.items()} elif color.endswith("memorypressure"): values = {key: val.num_data_when_run for key, val in info.items()} data_values = { key: val.num_data_when_released for key, val in info.items() } elif color.endswith("memoryincreases"): values = { key: max(0, val.num_data_when_released - val.num_data_when_run) for key, val in info.items() } else: # memorydecreases values = { key: max(0, val.num_data_when_run - val.num_data_when_released) for key, val in info.items() } if color.startswith("order-"): def label(x): return str(o[x]) + "-" + str(values[x]) else: values = o if maxval is None: maxval = max(1, max(values.values())) colors = {k: _colorize(cmap(v / maxval, bytes=True)) for k, v in values.items()} if data_values is None: data_values = values data_colors = colors else: data_colors = { k: _colorize(cmap(v / maxval, bytes=True)) for k, v in data_values.items() } kwargs["function_attributes"] = { k: {"color": v, "label": label(k)} for k, v in colors.items() } kwargs["data_attributes"] = {k: {"color": v} for k, v in data_colors.items()} elif color: raise NotImplementedError("Unknown value color=%s" % color) return dot_graph(dsk, filename=filename, **kwargs)
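For reference, a small usage sketch of the public entry point documented above (the output filenames are illustrative, and the exact set of supported ``color`` values depends on the installed dask version):

import dask
import dask.array as da

x = da.random.normal(10, 0.1, size=(1000, 1000), chunks=(250, 250))
y = (x + x.T).mean(axis=0)

# Plain rendering of the (optimized) graph to a file.
dask.visualize(y, filename='mean_graph.png', optimize_graph=True)

# Color node borders by execution order; a matplotlib colormap name can be
# passed via ``cmap`` (requires matplotlib).
dask.visualize(y, filename='mean_graph_order.svg', color='order', cmap='viridis')

# With ``filename=None`` nothing is written to disk; in a notebook an IPython
# display object (Image/SVG) is returned and rendered inline.
img = dask.visualize(y, filename=None, format='svg')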
B, '_id', 'ltable_id', 'rtable_id', 'id', 'id', nchunks=4, feature_table=F, attrs_after='label', show_progress=False, compute=False) # print(len(L)) # print(L.head(1)) predictions = dt.predict( table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], append=True, target_attr='predicted', inplace=False, nchunks=2, compute=False) from dmagellan.optimization.exfeatvecs_predict_sequence_opt import delay_concat, fuse_dag opt1 = delay_concat(dict(predictions.dask)) opt2 = fuse_dag(opt1) from dask.dot import dot_graph dot_graph(opt2) # print(predictions.head()) # predictions.visualize()
def test_filenames_and_formats(filename, format, target, expected_result_type): result = dot_graph(dsk, filename=filename, format=format) assert os.path.isfile(target) assert isinstance(result, expected_result_type) ensure_not_exists(target)
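The test above evidently relies on parametrization; a hypothetical ``pytest.mark.parametrize`` decorator driving it might look like the sketch below (the filename/format/target combinations are assumptions, not the real test matrix):

import pytest
from IPython.display import Image, SVG

# Hypothetical parameter matrix; the actual test may use different cases.
@pytest.mark.parametrize(
    "filename, format, target, expected_result_type",
    [
        ("mydask", None, "mydask.png", Image),           # no format given: defaults to png
        ("mydask.pdf", None, "mydask.pdf", type(None)),  # extension implies the format
        ("mydask", "svg", "mydask.svg", SVG),
    ],
)
def test_filenames_and_formats(filename, format, target, expected_result_type):
    ...  # body as above: render, check the file exists, check the return type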
def fit(): pass def vis_struct(): pass t = {'raw': (get_raw,), # 'vis': (vis,), 'dark': (get_dark, 'raw'), 'dark_corrected': (sub, 'raw', 'dark'), 'calibration': (get_calibration, 'raw'), 'polarization_corrected': (pol_correct, 'calibration', 'dark_corrected'), # '2dvis': (vis2d, 'dark_corrected', 'vis'), 'mask': (make_mask, 'calibration', 'polarization_corrected'), 'iq': ( integrate, 'polarization_corrected', 'calibration', 'mask'), # 'vis1d_iq': (vis1d, 'iq', 'vis'), # 'bg_iq': (get_background, 'raw'), # 'bg_corrected_iq': (sub, 'iq', 'muxed_bg'), # 'gr': (get_gr, 'bg_corrected_iq', 'raw'), # 'vis1d_gr': (vis1d, 'gr', 'vis'), # 'candidate_structures': (get_candidates, 'raw'), # 'fit_structures': (fit, 'candidate_structures', 'gr'), # 'vis_struc': (vis_struct, 'fit_structures', 'vis') } dot_graph(t, 'xpd_pipeline2.pdf')
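A raw graph dict like ``t`` above can be executed directly with one of dask's schedulers; only the requested key and its dependencies are computed. A minimal sketch, assuming the task functions above are defined:

from dask.threaded import get

# Compute a single target; dask walks the dependencies ('polarization_corrected',
# 'calibration', 'mask', ...) and runs only what 'iq' needs.
iq = get(t, 'iq')

# Several targets can be requested at once and are returned in the same order.
iq, mask = get(t, ['iq', 'mask'])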
dsk = {} files = sorted(glob.glob("{0}/*.tif".format(data_path))) final_saves = [] for filename in files: filename_cleaned = filename.split("/")[-1].split(".")[0] dsk['threshold-{0}'.format(filename_cleaned)] = (threshold, filename) dsk['min_size-{0}'.format(filename_cleaned)] = ( min_size, 'threshold-{0}'.format(filename_cleaned)) dsk['clean-{0}'.format(filename_cleaned)] = ( clean, 'min_size-{0}'.format(filename_cleaned)) dsk['reveal-{0}'.format(filename_cleaned)] = ( reveal, 'clean-{0}'.format(filename_cleaned)) dsk['pearlite-{0}'.format(filename_cleaned)] = ( pearlite, 'reveal-{0}'.format(filename_cleaned)) dsk['ferrite-{0}'.format(filename_cleaned)] = ( ferrite, 'pearlite-{0}'.format(filename_cleaned)) dsk['cemmentite-{0}'.format(filename_cleaned)] = ( cemmentite, 'ferrite-{0}'.format(filename_cleaned)) dsk['save-{0}'.format(filename_cleaned)] = ( save, 'cemmentite-{0}'.format(filename_cleaned)) final_saves.append('save-{0}'.format(filename_cleaned)) dsk['finalize'] = (finalize, final_saves) dot_graph(dsk) with ResourceProfiler(0.25) as rprof, Profiler() as prof, CacheProfiler( ) as cprof, ProgressBar(): dak_get(dsk, 'finalize') visualize([prof, rprof, cprof])
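The same per-file chain could be expressed with ``dask.delayed``, which builds an equivalent graph without hand-writing keys. A minimal sketch, assuming the same ``files`` list and processing functions as above:

from dask import delayed

results = []
for filename in files:
    t = delayed(threshold)(filename)
    m = delayed(min_size)(t)
    c = delayed(clean)(m)
    r = delayed(reveal)(c)
    p = delayed(pearlite)(r)
    f = delayed(ferrite)(p)
    cem = delayed(cemmentite)(f)
    results.append(delayed(save)(cem))

final = delayed(finalize)(results)
final.visualize()   # same kind of graph picture as dot_graph(dsk) above
final.compute()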
def test_dask(): import dask.array as da x = da.ones((5, 15), chunks=(5, 5)) d = (x + 1).dask from dask.dot import dot_graph dot_graph(d, format='svg')
def visualize(*args, **kwargs): """ Visualize several low level dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- args : dict(s) or collection(s) The low level dask graph(s) to visualize. filename : str or None, optional The name of the file to write to disk. If the provided `filename` doesn't include an extension, '.png' will be used by default. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. color : {None, 'order', 'ages', 'freed', 'memoryincreases', 'memorydecreases', 'memorypressure'}, optional Options to color nodes. colormap - None, the default, no colors. - 'order', colors the nodes' border based on the order they appear in the graph. - 'ages', how long the data of a node is held. - 'freed', the number of dependencies released after running a node. - 'memoryincreases', how many more outputs are held after the lifetime of a node. Large values may indicate nodes that should have run later. - 'memorydecreases', how many fewer outputs are held after the lifetime of a node. Large values may indicate nodes that should have run sooner. - 'memorypressure', the number of data held when: - the node is run (circle) - the data is released (rectangle) collapse_outputs : bool, optional Whether to collapse output boxes, which often have empty labels. Default is False. verbose : bool, optional Whether to label output and input boxes even if the data aren't chunked. Beware: these labels can get very long. Default is False. **kwargs Additional keyword arguments to forward to ``to_graphviz``. Examples -------- >>> x.visualize(filename='dask.pdf') # doctest: +SKIP >>> x.visualize(filename='dask.pdf', color='order') # doctest: +SKIP Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. 
See Also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: https://docs.dask.org/en/latest/optimize.html """ from dask.dot import dot_graph filename = kwargs.pop("filename", "mydask") optimize_graph = kwargs.pop("optimize_graph", False) dsks = [] args3 = [] for arg in args: if isinstance(arg, (list, tuple, set)): for a in arg: if isinstance(a, Mapping): dsks.append(a) if is_dask_collection(a): args3.append(a) else: if isinstance(arg, Mapping): dsks.append(arg) if is_dask_collection(arg): args3.append(arg) dsk = dict(collections_to_dsk(args3, optimize_graph=optimize_graph)) for d in dsks: dsk.update(d) color = kwargs.get("color") if color in { "order", "order-age", "order-freed", "order-memoryincreases", "order-memorydecreases", "order-memorypressure", "age", "freed", "memoryincreases", "memorydecreases", "memorypressure", }: import matplotlib.pyplot as plt from .order import diagnostics, order o = order(dsk) try: cmap = kwargs.pop("cmap") except KeyError: cmap = plt.cm.RdBu if isinstance(cmap, str): import matplotlib.pyplot as plt cmap = getattr(plt.cm, cmap) def label(x): return str(values[x]) data_values = None if color != "order": info = diagnostics(dsk, o)[0] if color.endswith("age"): values = {key: val.age for key, val in info.items()} elif color.endswith("freed"): values = { key: val.num_dependencies_freed for key, val in info.items() } elif color.endswith("memorypressure"): values = { key: val.num_data_when_run for key, val in info.items() } data_values = { key: val.num_data_when_released for key, val in info.items() } elif color.endswith("memoryincreases"): values = { key: max(0, val.num_data_when_released - val.num_data_when_run) for key, val in info.items() } else: # memorydecreases values = { key: max(0, val.num_data_when_run - val.num_data_when_released) for key, val in info.items() } if color.startswith("order-"): def label(x): return str(o[x]) + "-" + str(values[x]) else: values = o maxval = kwargs.pop("maxval", None) if maxval is None: maxval = max(1, max(values.values())) colors = { k: _colorize(cmap(v / maxval, bytes=True)) for k, v in values.items() } if data_values is None: data_values = values data_colors = colors else: data_colors = { k: _colorize(cmap(v / maxval, bytes=True)) for k, v in data_values.items() } kwargs["function_attributes"] = { k: { "color": v, "label": label(k) } for k, v in colors.items() } kwargs["data_attributes"] = { k: { "color": v } for k, v in data_colors.items() } elif color: raise NotImplementedError("Unknown value color=%s" % color) return dot_graph(dsk, filename=filename, **kwargs)
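The scheduling statistics that drive the color options above come from ``dask.order``. A small sketch using the same calls as the implementation (the toy graph is illustrative, and ``diagnostics`` is only available in dask versions that provide it):

import operator
from dask.order import order, diagnostics

dsk = {
    'a': (lambda: 1,),
    'b': (lambda: 2,),
    'c': (operator.add, 'a', 'b'),
    'd': (operator.mul, 'a', 'c'),
}

o = order(dsk)                 # key -> execution priority (lower runs earlier)
info = diagnostics(dsk, o)[0]  # key -> per-task statistics, as used above

for key, stats in info.items():
    print(key, stats.age, stats.num_dependencies_freed,
          stats.num_data_when_run, stats.num_data_when_released)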
def test_dask_workflow_and_paramenter_sweeping(self): """ We test a workflow with dask """ import dask # runner = GlobalFakeRunner() runner = FakeRunner() # decorate functions... generate_pricedata = dfp.job_delayed(runner)(self.generate_pricedata) generate_fundata = dfp.job_delayed(runner)(self.generate_fundata) generate_riskdata = dfp.job_delayed(runner)(self.generate_riskdata) generate_predictors = dfp.job_delayed(runner)(self.generate_predictors) generate_positions = dfp.job_delayed(runner)(self.generate_positions) # declare the dataflow dsk = dict() pools = ['pool1', 'pool2', 'pool3'] for pool in pools: dsk[(pool, 'pricedata')] = generate_pricedata(pool), dsk[(pool, 'fundata')] = generate_fundata(pool), dsk[(pool, 'riskdata')] = generate_riskdata(pool, 'risk'), (pool, 'pricedata') dsk[(pool, 'pred')] = generate_predictors(pool, 'risk'), [ (pool, t) for t in ['pricedata', 'fundata', 'riskdata'] ] for max_risk in range(3): dsk[(pool, 'positions', ('max_risk', max_risk))] = generate_positions( pool, 'risk', 'momentum', 'markowitz_aversion', max_risk=max_risk), (pool, 'pred') # from dask.multiprocessing import get # from dask.threaded import get from dask. async import get_sync as get # get(dsk, [(pool,'pred') for pool in pools]) # executes in parallel # results = get(dsk, dsk.keys()) # Execute (to convert in other formats): dot mydask.dot -Teps > mydask.eps import pandas as pd jobids = dict(zip(dsk.keys(), get(dsk, dsk.keys()))) jobids_s = pd.DataFrame(jobids).ix[0] assert len(jobids) == 21 status = runner.get_status(jobids) assert status == { ('pool3', 'pred'): 'valid', ('pool2', 'positions', ('max_risk', 2)): 'invalid', ('pool2', 'riskdata'): 'valid', ('pool3', 'riskdata'): 'valid', ('pool3', 'pricedata'): 'valid', ('pool3', 'fundata'): 'valid', ('pool2', 'positions', ('max_risk', 0)): 'invalid', ('pool1', 'pred'): 'pending', ('pool2', 'pred'): 'invalid', ('pool1', 'positions', ('max_risk', 1)): 'pending', ('pool3', 'positions', ('max_risk', 0)): 'valid', ('pool3', 'positions', ('max_risk', 2)): 'valid', ('pool2', 'fundata'): 'valid', ('pool1', 'positions', ('max_risk', 2)): 'pending', ('pool1', 'positions', ('max_risk', 0)): 'pending', ('pool1', 'riskdata'): 'pending', ('pool2', 'pricedata'): 'valid', ('pool1', 'pricedata'): 'valid', ('pool1', 'fundata'): 'valid', ('pool3', 'positions', ('max_risk', 1)): 'valid', ('pool2', 'positions', ('max_risk', 1)): 'invalid' } # Plot the graph with color corresponding to the status of jobs from dask.dot import dot_graph # dot_graph(dsk) # sdfsdf def get_status_dot_attributes(v): if v == 'valid': return dict(style='filled', color='lightgreen') if v == 'invalid': return dict(style='filled', color='red') if v == 'pending': return dict(style='filled', color='lightgrey') dot_status = { k: get_status_dot_attributes(v) for k, v in status.iteritems() } dot_graph(dsk, filename='dask_graph', format='dot', data_attributes=dot_status, function_attributes=dot_status) # dot_graph(dsk, filename='dask_graph', format='png', data_attributes=dot_status, function_attributes=dot_status) # dot_graph(dsk, filename='dask_graph', format='pdf', data_attributes=dot_status, function_attributes=dot_status) dot_graph(dsk, filename='dask_graph', format='svg', data_attributes=dot_status, function_attributes=dot_status)
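The status-coloring idea in the test above can be isolated into a few lines: ``dot_graph`` forwards ``data_attributes`` and ``function_attributes`` to graphviz, so per-key style dicts are all that is needed. A minimal sketch with a toy graph and made-up statuses:

from dask.dot import dot_graph

dsk = {
    'load': (lambda: 42,),
    'clean': (lambda x: x + 1, 'load'),
    'report': (str, 'clean'),
}

status = {'load': 'valid', 'clean': 'pending', 'report': 'invalid'}
palette = {'valid': 'lightgreen', 'pending': 'lightgrey', 'invalid': 'red'}

attrs = {k: {'style': 'filled', 'color': palette[v]} for k, v in status.items()}

dot_graph(dsk, filename='status_graph', format='svg',
          data_attributes=attrs, function_attributes=attrs)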
import dask import dask.array as da x = da.random.normal(10, 0.1, size=(2000, 2000), chunks=(100, 100)) result = x.mean() from dask.dot import dot_graph dot_graph(result.dask)
@profile  # decorator injected by line_profiler/memory_profiler when run under the profiler
def dask_compute(dask_array):
    res = dask_array.compute()
    return res


import dask.array as da
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from dask.dot import dot_graph

# Random 3D array (drop-in NumPy replacement)
x = da.random.normal(10, 0.1, size=(100, 100, 100), chunks=(50, 50, 50))
# print(x.dask)

# Squash to 2D (do not compute yet!)
mean = x.mean(axis=0)
# print(mean.dask)

# See the plot!
dot_graph(mean.dask)

res = dask_compute(mean)
# print(res.shape)

plt.figure()
image = mpimg.imread("mydask.png")
plt.imshow(image)

plt.figure()
plt.imshow(res)
plt.show()
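Instead of calling ``dot_graph`` on ``mean.dask`` and re-reading ``mydask.png`` from disk, the collection's own ``visualize`` method can be used; with ``filename=None`` it hands back an image object directly (a sketch of the same idea):

import dask.array as da

x = da.random.normal(10, 0.1, size=(100, 100, 100), chunks=(50, 50, 50))
mean = x.mean(axis=0)

# Writes mydask.png (the default name) and, in a notebook, also returns an IPython Image.
mean.visualize()

# No file written; useful inside notebooks.
img = mean.visualize(filename=None, format='png')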
def visualize(*args, **kwargs): """ Visualize several dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- dsk : dict(s) or collection(s) The dask graph(s) to visualize. filename : str or None, optional The name of the file to write to disk. If the provided `filename` doesn't include an extension, '.png' will be used by default. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. color : {None, 'order'}, optional Options to color nodes. Provide ``cmap=`` keyword for additional colormap collapse_outputs : bool, optional Whether to collapse output boxes, which often have empty labels. Default is False. verbose : bool, optional Whether to label output and input boxes even if the data aren't chunked. Beware: these labels can get very long. Default is False. **kwargs Additional keyword arguments to forward to ``to_graphviz``. Examples -------- >>> x.visualize(filename='dask.pdf') # doctest: +SKIP >>> x.visualize(filename='dask.pdf', color='order') # doctest: +SKIP Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. See Also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: https://docs.dask.org/en/latest/optimize.html """ from dask.dot import dot_graph filename = kwargs.pop("filename", "mydask") optimize_graph = kwargs.pop("optimize_graph", False) dsks = [] args3 = [] for arg in args: if isinstance(arg, (list, tuple, set)): for a in arg: if isinstance(a, Mapping): dsks.append(a) if is_dask_collection(a): args3.append(a) else: if isinstance(arg, Mapping): dsks.append(arg) if is_dask_collection(arg): args3.append(arg) dsk = dict(collections_to_dsk(args3, optimize_graph=optimize_graph)) for d in dsks: dsk.update(d) color = kwargs.get("color") if color == "order": from .order import order import matplotlib.pyplot as plt o = order(dsk) try: cmap = kwargs.pop("cmap") except KeyError: cmap = plt.cm.RdBu if isinstance(cmap, str): import matplotlib.pyplot as plt cmap = getattr(plt.cm, cmap) mx = max(o.values()) + 1 colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()} kwargs["function_attributes"] = { k: { "color": v, "label": str(o[k]) } for k, v in colors.items() } kwargs["data_attributes"] = { k: { "color": v } for k, v in colors.items() } elif color: raise NotImplementedError("Unknown value color=%s" % color) return dot_graph(dsk, filename=filename, **kwargs)
def visualize(*args, **kwargs): """ Visualize several dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- dsk : dict(s) or collection(s) The dask graph(s) to visualize. filename : str or None, optional The name (without an extension) of the file to write to disk. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. color: {None, 'order'}, optional Options to color nodes. Provide ``cmap=`` keyword for additional colormap **kwargs Additional keyword arguments to forward to ``to_graphviz``. Examples -------- >>> x.visualize(filename='dask.pdf') # doctest: +SKIP >>> x.visualize(filename='dask.pdf', color='order') # doctest: +SKIP Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. See Also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: http://dask.pydata.org/en/latest/optimize.html """ from dask.dot import dot_graph filename = kwargs.pop('filename', 'mydask') optimize_graph = kwargs.pop('optimize_graph', False) dsks = [arg for arg in args if isinstance(arg, dict)] args = [arg for arg in args if is_dask_collection(arg)] dsk = collections_to_dsk(args, optimize_graph=optimize_graph) for d in dsks: dsk.update(d) color = kwargs.get('color') if color == 'order': from .order import order import matplotlib.pyplot as plt o = order(dsk) try: cmap = kwargs.pop('cmap') except KeyError: cmap = plt.cm.RdBu if isinstance(cmap, str): import matplotlib.pyplot as plt cmap = getattr(plt.cm, cmap) mx = max(o.values()) + 1 colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()} kwargs['function_attributes'] = {k: {'color': v, 'label': str(o[k])} for k, v in colors.items()} kwargs['data_attributes'] = {k: {'color': v} for k, v in colors.items()} elif color: raise NotImplementedError("Unknown value color=%s" % color) return dot_graph(dsk, filename=filename, **kwargs)
def plot(self, *args, **kwargs): self.graph[0] = 'data' dot_graph(self.graph) del self.graph[0]
# - Google TensorFlow - https://www.tensorflow.org/
# - PyTorch / Torch - http://pytorch.org/

# ## Tensor Comprehensions

# Facebook gives an example of why such representations are useful: because the
# operations are described abstractly, they can be optimized after the fact,
# yielding large performance improvements even for _fairly_ basic operations.
#
# ![Comprehensions](https://research.fb.com/wp-content/uploads/2018/02/tc_evol_slower.gif)

# In[14]:

import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((5, 5), chunks=(5, 5))
image_2 = da.ones((5, 5), chunks=(5, 5))
dot_graph(image_1.dask)

# In[15]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)

# In[16]:

image_5 = da.matmul(image_1, image_4)
dot_graph(image_5.dask)

# ## Image Processing

# The initial examples used very simple image problems. Here we can see how the
# same approach looks on real imaging problems.

# In[17]:
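It can also be instructive to compare the raw graph with the optimized one that dask actually runs; collections accept ``optimize_graph=True`` in their ``visualize`` method. A small sketch continuing the arrays defined above:

# Raw graph: every intermediate (subtract, multiply, add) is a separate node.
image_4.visualize(filename='image4_raw.svg')

# Optimized graph: dask fuses the elementwise chain into fewer tasks.
image_4.visualize(filename='image4_optimized.svg', optimize_graph=True)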
def visualize(*args, **kwargs): """ Visualize several dask graphs at once. Requires ``graphviz`` to be installed. All options that are not the dask graph(s) should be passed as keyword arguments. Parameters ---------- dsk : dict(s) or collection(s) The dask graph(s) to visualize. filename : str or None, optional The name (without an extension) of the file to write to disk. If `filename` is None, no file will be written, and we communicate with dot using only pipes. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. optimize_graph : bool, optional If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False. color: {None, 'order'}, optional Options to color nodes. Provide ``cmap=`` keyword for additional colormap **kwargs Additional keyword arguments to forward to ``to_graphviz``. Examples -------- >>> x.visualize(filename='dask.pdf') # doctest: +SKIP >>> x.visualize(filename='dask.pdf', color='order') # doctest: +SKIP Returns ------- result : IPython.diplay.Image, IPython.display.SVG, or None See dask.dot.dot_graph for more information. See Also -------- dask.dot.dot_graph Notes ----- For more information on optimization see here: https://docs.dask.org/en/latest/optimize.html """ from dask.dot import dot_graph filename = kwargs.pop('filename', 'mydask') optimize_graph = kwargs.pop('optimize_graph', False) dsks = [arg for arg in args if isinstance(arg, dict)] args = [arg for arg in args if is_dask_collection(arg)] dsk = collections_to_dsk(args, optimize_graph=optimize_graph) for d in dsks: dsk.update(d) color = kwargs.get('color') if color == 'order': from .order import order import matplotlib.pyplot as plt o = order(dsk) try: cmap = kwargs.pop('cmap') except KeyError: cmap = plt.cm.RdBu if isinstance(cmap, str): import matplotlib.pyplot as plt cmap = getattr(plt.cm, cmap) mx = max(o.values()) + 1 colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()} kwargs['function_attributes'] = { k: { 'color': v, 'label': str(o[k]) } for k, v in colors.items() } kwargs['data_attributes'] = { k: { 'color': v } for k, v in colors.items() } elif color: raise NotImplementedError("Unknown value color=%s" % color) return dot_graph(dsk, filename=filename, **kwargs)
def feature_engineering(dirpath, channels, IDs=None, recurse=False, overwrite_previous=False, seg_channel="", no_lms=False, no_tfor=False, no_cfor=False, mem_d=3, M=8, save_baselines=True, processes=None, dask_graph_path=None, profiling=False, verbose=False): """Extract a series of measurements from segmented images and point clouds. This is a dask pipeline that runs the covariate extraction functions in `katachi.tools.get_image_covariates` & `katachi.tools.get_cloud_covariates` on datasets that have been initialized, segmented and feature-extracted using other katachi pipelines. WARNING: The approach used here has been developed for the Zebrafish posterior lateral line primordium. It is likely not readily applicable to other tissues! Parameters ---------- dirpath : string The path (either local from cwd or global) to the directory with the input data to be processed. channels : list A list of channels from which to extract channel-specific covariates. For each channel, a tif file must be present that ends on `channel+".tif"` and a .npy file must be present that ends either on `channel+"_LMs_TFOR.npy"` (recommended) or on `channel+"_LMs.npy"`. The channels will be used as class attributes in the output object and therefore must not contain characters incompatible with this use. IDs : list of strings or None, optional, default None If a list of strings (IDs) is given, only samples within dirpath that match this ID will be processed. recurse : bool, optional, default False If True, files are searched recursively in the subdirs of dirpath. overwrite_previous : bool, optional, default False If True and a covariate file already exists for a given sample, that file will be deleted and a completely new file will be written in its place. If False and a covariate file already exists for a given sample, the new covariates will be added to it if they have a different name. For covariates with identical names, the new will overwrite the old. seg_channel : str or "", optional, default "" If for some reason the target directories are expected to contain more than one file that ends on "_seg.tif", seg_channel can be specified to identify the correct target file, which will have the form `<basename> + seg_channel + "_seg.tif"`. Note that having multiple segmentation files in one target directory is deprecated in general. no_lms : bool, optional, default False If True, it is expected that no landmark data is available. In this case, only image covariates are computed. no_tfor : bool, optional, default False If True, it is expected that no TFOR landmark data is available. In this case, untransformed landmarks are loaded and covariates depending on TFOR covariates are not computed (specifically pcl_covars_sample and pcl_covars_tissue). no_cfor : bool, optional, default False If True, the CFOR-based moments and baseline will not be computed and no CFOR data is required at any point. mem_d : int, optional, default 3 Estimated diameter (in pixels) of the membrane region in the shell of a single cell. Used for extraction of intensity-based covariates. M : int, optional, default 8 Highest-level moments to extract from point cloud. The moments array constructed will have shape (M+1,M+1,M+1). save_baselines : bool, optional, default True Whether to save the flattened moments arrays as feature space baselines in the form (N_cells, N_features), where N_features is length (M+1)**3. 
If True, two files are created for each channel, one for the base moments (usually TFOR, unless no_tfor is set to True or no TFOR data is available) and one for the PD-transformed (rotationally invariant) and volume-normalized cells, suffixed "_baseline.npy" and "_volnormPDbaseline.npy", respectively. processes : int or None, optional Number of processes dask may use for parallel processing. If None, half of the available CPUs are used. If set to 1, the entire code is run sequentially (dask is not used). dask_graph_path : string or None, optional, default None If a path (including a file ending matching a known image format, such as '.png') is specified as a string, a dask graph image is created that shows the constructed dask pipeline. Note: The resulting graph may get very large if many samples are used at the same time. profiling: bool, optional, default False If True, dask resource profiling is performed and visualized after the pipeline run is finished. This may generate a `profile.html` file in the working directory [bug in dask]. verbose : bool, optional, default False If True, more information is printed. """ #-------------------------------------------------------------------------- ### Get a list of files to run if verbose: print "Retrieving matching datasets..." # Function to select suitable datasets and create paths def prepare_fpaths(fpath, fnames): # Keep only those in specified IDs if IDs is not None: fnames = [ fname for fname in fnames if any([fname.startswith(ID) for ID in IDs]) ] # Find the metadata file meta_file = None for fname in fnames: if fname.endswith("_stack_metadata.pkl"): meta_file = fname meta_path = os.path.join(fpath, meta_file) # Quit if no metadata file is found if meta_file is None: return None # Find segmentation file seg_file = [ fname for fname in fnames if fname.endswith(seg_channel + "_seg.tif") ] # Handle failure cases if len(seg_file) == 0: return None if len(seg_file) > 1: raise IOError( "More than one segmentation file (*_seg.tif) found " + "in directory " + fpath + ". Use seg_channel kwarg to " + "specify which file to use.") else: seg_file = seg_file[0] seg_path = os.path.join(fpath, seg_file) # Find TFOR segmentation landmarks tfor_path = [] if not no_tfor and not no_lms: # Search for the file tfor_file = [ fname for fname in fnames if fname.endswith(seg_channel + "_seg_LMs_TFOR.npy") ] # Give up if nothing is found if len(tfor_file) == 0: return None # Else keep the result tfor_file = tfor_file[0] tfor_path = os.path.join(fpath, tfor_file) # Find channel landmark files lm_paths = [] if not no_lms: for channel in channels: # Search for TFOR landmarks if not no_tfor: lm_file = [ fname for fname in fnames if fname.endswith(channel + "_LMs_TFOR.npy") ] else: lm_file = [] # Search for non-TFOR landmarks if len(lm_file) == 0: lm_file = [ fname for fname in fnames if fname.endswith(channel + "_LMs.npy") ] if not no_tfor: warn("No TFOR landmarks found for channel " + channel + ". 
" + "Using standard landmarks.") # Give up if nothing is found if not lm_file: return None # Else keep the result lm_file = lm_file[0] lm_path = os.path.join(fpath, lm_file) lm_paths.append(lm_path) # Find CFOR-transformed channel landmark files cfor_paths = [] if not no_cfor and not no_lms: for channel in channels: # Get CFOR landmark paths cfor_file = [ fname for fname in fnames if channel in fname and fname.endswith('CFOR.npy') ][0] cfor_path = os.path.join(fpath, cfor_file) cfor_paths.append(cfor_path) # Find image files img_paths = [] for channel in channels: # Search for image files img_file = [ fname for fname in fnames if fname.endswith(channel + ".tif") ] # Give up if nothing is found if not img_file: return None # Else keep the result img_file = img_file[0] img_path = os.path.join(fpath, img_file) img_paths.append(img_path) # Return the paths return { "meta_path": meta_path, "seg_path": seg_path, "tfor_path": tfor_path, "lm_paths": lm_paths, "img_paths": img_paths, "cfor_paths": cfor_paths } # Run for single dir if not recurse: fnames = os.listdir(dirpath) all_paths = [prepare_fpaths(dirpath, fnames)] if all_paths is None: raise IOError("The specified path does not contain the required " + "files (and recurse=False).") # Run for multiple subdirs if recurse: all_paths = [] for dpath, _, fnames in os.walk(dirpath): fpaths = prepare_fpaths(dpath, fnames) if fpaths is not None: all_paths.append(fpaths) if not all_paths: raise IOError("Could not find any data directories containing " + "all required files.") # Report if verbose: print "-- Retrieved", len(all_paths), "matching data sets." #-------------------------------------------------------------------------- ### If desired: run sequentially (does not use dask/multiprocessing) if processes == 1: if verbose: print "Processing target files sequentially..." # For each dataset... 
for paths in all_paths: # Load previously generated covariates file (if available) has_previous = False if not overwrite_previous: mroot, mfile = os.path.split(paths["meta_path"]) prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl") if os.path.isfile(prevfpath): with open(prevfpath, 'rb') as prevfile: covars = pickle.load(prevfile) has_previous = True # Load data img_seg = imread(paths["seg_path"]) if not no_lms and not no_tfor: tfor_lms = np.load(paths["tfor_path"]) with open(paths["meta_path"], 'rb') as metafile: meta_dict = pickle.load(metafile) # Extract image covariates covars = gic.get_img_covars_sample( "_", img_seg=img_seg, covars=covars if has_previous else None) covars = gic.get_img_covars_tissue("_", img_seg=img_seg, covars=covars) covars = gic.get_img_covars_cell_seg("_", '_', img_seg=img_seg, metadata=meta_dict, covars=covars) for c, channel in enumerate(channels): covars = gic.get_img_covars_cell_int("_", paths["img_paths"][c], channel, mem_d, img_seg=img_seg, covars=covars) # Extract point cloud covariates if not no_tfor and not no_lms: covars = gcc.get_pcl_covars_sample("_", "_", tfor_lms=tfor_lms, metadata=meta_dict, covars=covars) covars = gcc.get_pcl_covars_tissue("_", "_", tfor_lms=tfor_lms, metadata=meta_dict, covars=covars) if not no_lms: for c, channel in enumerate(channels): covars = gcc.get_pcl_covars_cell( paths["lm_paths"][c], channel, M=M, no_cfor=no_cfor, fpath_lms_cfor=paths["cfor_paths"][c], covars=covars) # Saving the moments as a baseline feature space if save_baselines: # Prep base path bp = paths["lm_paths"][c][:-4] # Save TFOR baseline m = covars.pcl.cell._gad(channel).moments np.save(bp + "_baseline.npy", m) # Save CFOR baseline if not no_cfor: m = covars.pcl.cell._gad(channel).moments_cfor np.save(bp + "_CFORbaseline.npy", m) # Saving the extracted covariates mroot, mfile = os.path.split(paths["meta_path"]) outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl") with open(outfpath, 'wb') as outfile: pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL) # Report and return if verbose: print "Processing complete!" return #-------------------------------------------------------------------------- ### Prepare dask dict # Note: This is slightly suboptimal because some datasets have to be # reloaded multiple times. However, it seems difficult to solve this # in a way that permits carrying them over. if verbose: print "Processing target files in parallel..." dask_graph = dict() # For each dataset... 
for idx, paths in enumerate(all_paths): # Getting previous covariates: function def get_previous_covariates(prevfpath): with open(prevfpath, 'rb') as prevfile: covars = pickle.load(prevfile) return covars # Get previous covars (if existing and desired) has_previous = False if not overwrite_previous: mroot, mfile = os.path.split(paths["meta_path"]) prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl") if os.path.isfile(prevfpath): dask_graph['prev_covars_%i' % idx] = (get_previous_covariates, prevfpath) has_previous = True # Extract image covariates dask_graph["img_sample_%i" % idx] = (gic.get_img_covars_sample, paths["seg_path"]) dask_graph["img_tissue_%i" % idx] = (gic.get_img_covars_tissue, paths["seg_path"]) dask_graph["img_cell_seg_%i" % idx] = (gic.get_img_covars_cell_seg, paths["seg_path"], paths["meta_path"]) for c, channel in enumerate(channels): dask_graph["img_cell_int_%s_%i" % (channel, idx)] = (gic.get_img_covars_cell_int, paths["seg_path"], paths["img_paths"][c], channel, mem_d) # Extract point cloud covariates if not no_tfor and not no_lms: dask_graph["pcl_sample_%i" % idx] = (gcc.get_pcl_covars_sample, paths["tfor_path"], paths["meta_path"]) dask_graph["pcl_tissue_%i" % idx] = (gcc.get_pcl_covars_tissue, paths["tfor_path"], paths["meta_path"]) if not no_lms: for c, channel in enumerate(channels): dask_graph["pcl_cell_%s_%i" % (channel, idx)] = (gcc.get_pcl_covars_cell, paths["lm_paths"][c], channel, M, no_cfor, paths["cfor_paths"][c]) # Saving the moments as a baseline feature space if save_baselines: # Baseline saving function def save_baseline(covars, channel, basepath, no_cfor): # Save TFOR baseline m = covars.pcl.cell._gad(channel).moments np.save(basepath + "_baseline.npy", m) # Save CFOR baseline if not no_cfor: m = covars.pcl.cell._gad(channel).moments_cfor np.save(basepath + "_CFORbaseline.npy", m) # Forward result return covars # Add to graph basepath = paths["lm_paths"][c][:-4] dask_graph["pcl_cell_blsave_%s_%i" % (channel, idx)] = (save_baseline, "pcl_cell_%s_%i" % (channel, idx), channel, basepath, no_cfor) # Merging the extracted covariates: function def merge_covariates(covars_list): covars = covars_list[0] for cv in covars_list[1:]: covars._merge(cv) return covars # Merging the extracted covariates: input name list construction covars_list = [ "img_sample_%i" % idx, "img_tissue_%i" % idx, "img_cell_seg_%i" % idx ] covars_list += [ "img_cell_int_%s_%i" % (channel, idx) for channel in channels ] if not no_tfor and not no_lms: covars_list += ["pcl_sample_%i" % idx, "pcl_tissue_%i" % idx] if save_baselines and not no_lms: covars_list += [ "pcl_cell_blsave_%s_%i" % (channel, idx) for channel in channels ] elif not no_lms: covars_list += [ "pcl_cell_%s_%i" % (channel, idx) for channel in channels ] if has_previous: covars_list += ['prev_covars_%i' % idx] # Merging the extracted covariates: dask call dask_graph["merge_results_%i" % idx] = (merge_covariates, covars_list) # Saving the extracted covariates def save_covariates(covars, outfpath): with open(outfpath, 'wb') as outfile: pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL) mroot, mfile = os.path.split(paths["meta_path"]) outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl") dask_graph["save_results_%i" % idx] = (save_covariates, "merge_results_%i" % idx, outfpath) # Collecting the results dask_graph['done'] = (lambda x: "done", [ "save_results_%i" % idx for idx in range(len(all_paths)) ]) # Saving the graph visualization if dask_graph_path is not None: from dask.dot import dot_graph 
dot_graph(dask_graph, filename=dask_graph_path) #-------------------------------------------------------------------------- ### Run in parallel (with dask) # If necessary: choose number of threads (half of available cores) if processes is None: processes = cpu_count() // 2 # Set number of threads dask.set_options(pool=ThreadPool(processes)) # Run the pipeline (no profiling) if not profiling: with ProgressBar(dt=1): dask.threaded.get(dask_graph, 'done') # Run the pipeline (with resource profiling) if profiling: with ProgressBar(dt=1): with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof: dask.threaded.get(dask_graph, 'done') visualize([prof, rprof], save=False) # Report and return if verbose: print "Processing complete!" return
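The overall pattern of the pipeline above, building a plain dict graph, optionally rendering it with ``dot_graph``, then executing it with the threaded scheduler behind a ``ProgressBar``, can be summarized in miniature (the function names here are placeholders, not the real covariate extractors):

import dask.threaded
from dask.diagnostics import ProgressBar
from dask.dot import dot_graph

def load(path):
    return path.upper()   # stand-in for reading a dataset

def process(data):
    return len(data)      # stand-in for covariate extraction

def save(result):
    return result         # stand-in for pickling the output

paths = ['sample_a', 'sample_b']
graph = {}
for i, p in enumerate(paths):
    graph['load_%i' % i] = (load, p)
    graph['process_%i' % i] = (process, 'load_%i' % i)
    graph['save_%i' % i] = (save, 'process_%i' % i)
graph['done'] = (lambda results: "done",
                 ['save_%i' % i for i in range(len(paths))])

dot_graph(graph, filename='pipeline_sketch')   # optional graph image

with ProgressBar(dt=1):
    dask.threaded.get(graph, 'done')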