def inline_plugins(cls, components, start_from, log):
    plugins = components.plugins.copy()

    sub_plugins = {start_from: plugins[start_from]}
    del plugins[start_from]

    # Gather all plugins that do not rechunk and which branch out as a
    # simple tree from the input plugin.
    # We'll run these all together in one process.
    while True:
        # Scan for plugins we can inline
        for p in plugins.values():
            if (p.parallel
                    and all([d in sub_plugins for d in p.depends_on])):
                for d in p.provides:
                    sub_plugins[d] = p
                    if d in plugins:
                        del plugins[d]
                # Rescan
                break
        else:
            # No more plugins we can inline
            break

    if len(set(list(sub_plugins.values()))) == 1:
        # Just one plugin to inline: no use
        log.debug("Just one plugin to inline: skipping")
        return components

    # Which data types should we output? Three cases follow.
    outputs_to_send = set()

    # Case 1. Requested as a final target
    for p in sub_plugins.values():
        outputs_to_send.update(
            set(components.targets).intersection(set(p.provides)))

    # Case 2. Requested by a plugin we did not inline
    for d, p in plugins.items():
        outputs_to_send.update(set(p.depends_on))
    outputs_to_send &= sub_plugins.keys()

    # Inline savers that do not require rechunking
    savers = components.savers
    sub_savers = dict()
    for p in sub_plugins.values():
        for d in p.provides:
            if d not in savers:
                continue
            if p.can_rechunk(d):
                # Case 3. Has a saver we can't inline
                outputs_to_send.add(d)
                continue

            remaining_savers = []
            for s_i, s in enumerate(savers[d]):
                if not s.allow_fork:
                    # Case 3 again, cannot inline saver
                    outputs_to_send.add(d)
                    remaining_savers.append(s)
                    continue
                if d not in sub_savers:
                    sub_savers[d] = []
                s.is_forked = True
                sub_savers[d].append(s)
            savers[d] = remaining_savers

            if not len(savers[d]):
                del savers[d]

    p = cls(depends_on=sub_plugins[start_from].depends_on)
    p.sub_plugins = sub_plugins
    assert len(outputs_to_send)
    p.provides = tuple(outputs_to_send)
    p.sub_savers = sub_savers
    p.start_from = start_from
    if p.multi_output:
        p.dtype = {
            d: p.sub_plugins[d].dtype_for(d)
            for d in outputs_to_send}
    else:
        to_send = list(outputs_to_send)[0]
        p.dtype = p.sub_plugins[to_send].dtype_for(to_send)
    for d in p.provides:
        plugins[d] = p
    p.deps = {d: plugins[d] for d in p.depends_on}

    log.debug(f"Inlined plugins: {p.sub_plugins}. "
              f"Inlined savers: {p.sub_savers}")

    return strax.ProcessorComponents(
        plugins, components.loaders, savers, components.targets)
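# --- Illustrative sketch (not strax code) ---------------------------------
# A toy version of the gathering loop in inline_plugins above, using a
# minimal stand-in for plugins to show how a simple tree of parallel
# plugins collapses into sub_plugins. TinyPlugin and the data-type names
# below are invented for this example.
from collections import namedtuple

TinyPlugin = namedtuple('TinyPlugin', 'parallel depends_on provides')

plugins = {
    'records': TinyPlugin(True, ('raw_records',), ('records',)),
    'peaks': TinyPlugin(True, ('records',), ('peaks',)),
    'events': TinyPlugin(False, ('peaks',), ('events',)),  # not parallel
}
# start_from = 'raw_records'; its plugin is already in sub_plugins
sub_plugins = {'raw_records': TinyPlugin(True, (), ('raw_records',))}

changed = True
while changed:
    changed = False
    for p in list(plugins.values()):
        if p.parallel and all(d in sub_plugins for d in p.depends_on):
            for d in p.provides:
                sub_plugins[d] = p
                plugins.pop(d, None)
            changed = True
            break

print(sorted(sub_plugins))  # ['peaks', 'raw_records', 'records']
print(sorted(plugins))      # ['events'] stays out: it is not parallel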
def get_components(
        self,
        run_id: str,
        targets=tuple(),
        save=tuple()) -> strax.ProcessorComponents:
    """Return components for setting up a processor
    {get_docs}
    """
    save = strax.to_str_tuple(save)
    targets = strax.to_str_tuple(targets)

    plugins = self._get_plugins(targets, run_id)

    # Get savers/loaders, and meanwhile filter out plugins that do not
    # have to do computation (their instances will stick around
    # through the .deps attribute of plugins that do).
    loaders = dict()
    savers = collections.defaultdict(list)
    seen = set()
    to_compute = dict()

    def check_cache(d):
        nonlocal plugins, loaders, savers, seen
        if d in seen:
            return
        seen.add(d)
        p = plugins[d]
        key = strax.CacheKey(run_id, d, p.lineage)
        for sb_i, sb in enumerate(self.storage):
            try:
                loaders[d] = sb.loader(key)
                # Found it! No need to make it or save it
                del plugins[d]
                return
            except strax.NotCached:
                continue

        # Not in any cache. We will be computing it.
        to_compute[d] = p
        for dep_d in p.depends_on:
            check_cache(dep_d)

        # We're making this OR it gets fed in. Should we save it?
        if p.save_when == strax.SaveWhen.NEVER:
            if d in save:
                raise ValueError(f"Plugin forbids saving of {d}")
            return
        elif p.save_when == strax.SaveWhen.TARGET:
            if d not in targets:
                return
        elif p.save_when == strax.SaveWhen.EXPLICIT:
            if d not in save:
                return
        else:
            assert p.save_when == strax.SaveWhen.ALWAYS

        for sb_i, sb in enumerate(self.storage):
            if not sb.provides(d, write=True):
                continue
            s = sb.saver(key, p.metadata(run_id))
            s.meta_only = p.save_meta_only
            savers[d].append(s)

    for d in targets:
        check_cache(d)
    plugins = to_compute

    intersec = list(plugins.keys() & loaders.keys())
    if len(intersec):
        raise RuntimeError(f"{intersec} both computed and loaded?!")

    # Check all required options are available / set defaults
    for p in plugins.values():
        self._set_plugin_config(p, tolerant=False)

    return strax.ProcessorComponents(
        plugins=plugins,
        loaders=loaders,
        savers=dict(savers),
        targets=targets)
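# --- Illustrative sketch (not strax code) ---------------------------------
# The heart of check_cache above is a depth-first walk over the dependency
# graph that routes each data type either to `loaders` (already stored) or
# to `to_compute` (must be built, so its dependencies are visited in turn).
# The graph and the `stored` set below are invented for this example.
depends_on = {
    'events': ('peaks',),
    'peaks': ('records',),
    'records': (),
}
stored = {'records'}   # pretend only 'records' is already on disk

loaders, to_compute, seen = {}, {}, set()

def check_cache(d):
    if d in seen:
        return
    seen.add(d)
    if d in stored:
        loaders[d] = f"loader for {d}"   # stand-in for sb.loader(key)
        return
    to_compute[d] = f"plugin for {d}"    # stand-in for the plugin instance
    for dep in depends_on[d]:
        check_cache(dep)

check_cache('events')
print(sorted(loaders))      # ['records']
print(sorted(to_compute))   # ['events', 'peaks']
# Mirrors the final sanity check: nothing is both computed and loaded
assert not (loaders.keys() & to_compute.keys())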
def get_components(
        self,
        run_id: str,
        targets=tuple(),
        save=tuple(),
        time_range=None,
) -> strax.ProcessorComponents:
    """Return components for setting up a processor
    {get_docs}
    """
    save = strax.to_str_tuple(save)
    targets = strax.to_str_tuple(targets)

    plugins = self._get_plugins(targets, run_id)

    n_range = None
    if time_range is not None:
        # Ensure we have one data kind
        if len(set([plugins[t].data_kind for t in targets])) > 1:
            raise NotImplementedError(
                "Time range selection not implemented "
                "for multiple data kinds.")

        # Which plugin provides time information? We need it to map
        # the time range to row indices.
        for p in targets:
            if 'time' in plugins[p].dtype.names:
                break
        else:
            raise RuntimeError(f"No time info in targets, should have "
                               f"been caught earlier??")

        # Find a range of row numbers that contains the time range.
        # It's a bit too large: whole chunks are included.
        # Get the n <-> time mapping from the chunk metadata.
        if not self.is_stored(run_id, p):
            raise strax.DataNotAvailable(f"Time range selection needs time"
                                         f" info from {p}, but this data"
                                         f" is not yet available")
        meta = self.get_meta(run_id, p)
        times = np.array([c['first_time'] for c in meta['chunks']])
        # Reconstruct row numbers from row counts, which are in metadata.
        # n_end is last row + 1 in a chunk, n_start is the first.
        n_per_chunk = np.array([c['n'] for c in meta['chunks']])
        n_end = n_per_chunk.cumsum()
        n_start = n_end - n_per_chunk
        _inds = np.searchsorted(times, time_range) - 1
        # Clip to prevent out-of-range times causing
        # negative or nonexistent indices
        _inds = np.clip(_inds, 0, len(n_end) - 1)
        n_range = n_start[_inds[0]], n_end[_inds[1]]

    # Get savers/loaders, and meanwhile filter out plugins that do not
    # have to do computation (their instances will stick around
    # through the .deps attribute of plugins that do).
    loaders = dict()
    savers = collections.defaultdict(list)
    seen = set()
    to_compute = dict()

    def check_cache(d):
        nonlocal plugins, loaders, savers, seen
        if d in seen:
            return
        seen.add(d)
        p = plugins[d]
        key = strax.DataKey(run_id, d, p.lineage)

        for sb_i, sf in enumerate(self.storage):
            try:
                # Bit clunky... but allows specifying executor later
                sf.find(key, **self._find_options)
                loaders[d] = partial(sf.loader,
                                     key,
                                     n_range=n_range,
                                     **self._find_options)
                # Found it! No need to make it
                del plugins[d]
                break
            except strax.DataNotAvailable:
                continue
        else:
            if time_range is not None:
                # While the data type providing the time information is
                # available (else we'd have failed earlier), one of the
                # other requested data types is not.
                raise strax.DataNotAvailable(
                    f"Time range selection assumes data is already "
                    f"available, but {d} for {run_id} is not.")
            if d in self.context_config['forbid_creation_of']:
                raise strax.DataNotAvailable(
                    f"{d} for {run_id} not found in any storage, and "
                    "your context specifies it cannot be created.")
            # Not in any cache. We will be computing it.
            to_compute[d] = p
            for dep_d in p.depends_on:
                check_cache(dep_d)

        # Should we save this data?
        if time_range is not None:
            # No, since we're not even getting the whole data.
            # Without this check, saving could be attempted if the
            # storage converter mode is enabled.
            self.log.warning(f"Not saving {d} while "
                             f"selecting a time range in the run")
            return
        if any([len(v) > 0
                for k, v in self._find_options.items()
                if 'fuzzy' in k]):
            # In fuzzy matching mode, we cannot (yet) derive the lineage
            # of any data we are creating. To avoid creating false data
            # entries, we currently do not save at all.
            self.log.warning(f"Not saving {d} while fuzzy matching is "
                             f"turned on.")
            return
        if self.context_config['allow_incomplete']:
            self.log.warning(f"Not saving {d} while loading incomplete "
                             f"data is allowed.")
            return

        if p.save_when == strax.SaveWhen.NEVER:
            if d in save:
                raise ValueError(f"Plugin forbids saving of {d}")
            return
        elif p.save_when == strax.SaveWhen.TARGET:
            if d not in targets:
                return
        elif p.save_when == strax.SaveWhen.EXPLICIT:
            if d not in save:
                return
        else:
            assert p.save_when == strax.SaveWhen.ALWAYS

        for sf in self.storage:
            if sf.readonly:
                continue
            if d not in to_compute:
                if not self.context_config['storage_converter']:
                    continue
                try:
                    sf.find(key, **self._find_options)
                    # Already have this data in this backend
                    continue
                except strax.DataNotAvailable:
                    # Don't have it, so let's convert it!
                    pass
            try:
                savers[d].append(sf.saver(key,
                                          metadata=p.metadata(run_id)))
            except strax.DataNotAvailable:
                # This frontend cannot save. Too bad.
                pass

    for d in targets:
        check_cache(d)
    plugins = to_compute

    intersec = list(plugins.keys() & loaders.keys())
    if len(intersec):
        raise RuntimeError(f"{intersec} both computed and loaded?!")

    # For the plugins which will run computations,
    # check all required options are available or set defaults.
    # Also run any user-defined setup.
    for p in plugins.values():
        self._set_plugin_config(p, run_id, tolerant=False)
        p.setup()
    return strax.ProcessorComponents(
        plugins=plugins,
        loaders=loaders,
        savers=dict(savers),
        targets=targets)
def get_components(self,
                   run_id: str,
                   targets=tuple(),
                   save=tuple(),
                   time_range=None,
                   chunk_number=None) -> strax.ProcessorComponents:
    """Return components for setting up a processor
    {get_docs}
    """
    save = strax.to_str_tuple(save)
    targets = strax.to_str_tuple(targets)

    # Although targets is a tuple, we only support one target at the moment
    # TODO: just make it a string!
    assert len(targets) == 1, f"Found {len(targets)} instead of 1 target"
    if len(targets[0]) == 1:
        raise ValueError(
            f"Plugin names must be more than one letter, not {targets[0]}")

    plugins = self._get_plugins(targets, run_id)
    target = targets[0]  # See above, already restricted to one target
    targetp = plugins[target]

    # Get savers/loaders, and meanwhile filter out plugins that do not
    # have to do computation (their instances will stick around
    # through the .deps attribute of plugins that do).
    loaders = dict()
    savers = collections.defaultdict(list)
    seen = set()
    to_compute = dict()

    def check_cache(d):
        nonlocal plugins, loaders, savers, seen
        if d in seen:
            return
        seen.add(d)
        p = plugins[d]

        # Can we load this data?
        loading_this_data = False
        key = strax.DataKey(run_id, d, p.lineage)

        ldr = self._get_partial_loader_for(
            key,
            chunk_number=chunk_number,
            time_range=time_range)

        if not ldr and run_id.startswith('_'):
            if time_range is not None:
                raise NotImplementedError("time range loading not yet "
                                          "supported for superruns")
            sub_run_spec = self.run_metadata(
                run_id, 'sub_run_spec')['sub_run_spec']
            self.make(list(sub_run_spec.keys()), d)

            ldrs = []
            for subrun in sub_run_spec:
                sub_key = strax.DataKey(
                    subrun,
                    d,
                    self._get_plugins((d,), subrun)[d].lineage)
                if sub_run_spec[subrun] == 'all':
                    _subrun_time_range = None
                else:
                    _subrun_time_range = sub_run_spec[subrun]
                ldr = self._get_partial_loader_for(
                    sub_key,
                    time_range=_subrun_time_range,
                    chunk_number=chunk_number)
                if not ldr:
                    raise RuntimeError(
                        f"Could not load {d} for subrun {subrun} "
                        f"even though we made it??")
                ldrs.append(ldr)

            def concat_loader(*args, **kwargs):
                for x in ldrs:
                    yield from x(*args, **kwargs)

            ldr = lambda *args, **kwargs: concat_loader(*args, **kwargs)

        if ldr:
            # Found it! No need to make it or look in other frontends
            loading_this_data = True
            loaders[d] = ldr
            del plugins[d]
        else:
            # Data not found anywhere. We will be computing it.
            if (time_range is not None
                    and plugins[d].save_when != strax.SaveWhen.NEVER):
                # While the data type providing the time information is
                # available (else we'd have failed earlier), one of the
                # other requested data types is not.
                raise strax.DataNotAvailable(
                    f"Time range selection assumes data is already "
                    f"available, but {d} for {run_id} is not.")
            if '*' in self.context_config['forbid_creation_of']:
                raise strax.DataNotAvailable(
                    f"{d} for {run_id} not found in any storage, and "
                    "your context specifies no new data can be created.")
            if d in self.context_config['forbid_creation_of']:
                raise strax.DataNotAvailable(
                    f"{d} for {run_id} not found in any storage, and "
                    "your context specifies it cannot be created.")
            to_compute[d] = p
            for dep_d in p.depends_on:
                check_cache(dep_d)

        # Should we save this data? If not, return.
        if (loading_this_data
                and not self.context_config['storage_converter']):
            return
        if p.save_when == strax.SaveWhen.NEVER:
            if d in save:
                raise ValueError(f"Plugin forbids saving of {d}")
            return
        elif p.save_when == strax.SaveWhen.TARGET:
            if d not in targets:
                return
        elif p.save_when == strax.SaveWhen.EXPLICIT:
            if d not in save:
                return
        else:
            assert p.save_when == strax.SaveWhen.ALWAYS

        # Warn about conditions that preclude saving, but which the user
        # might not expect.
        if time_range is not None:
            # We're not even getting the whole data.
            # Without this check, saving could be attempted if the
            # storage converter mode is enabled.
            self.log.warning(f"Not saving {d} while "
                             f"selecting a time range in the run")
            return
        if any([len(v) > 0
                for k, v in self._find_options.items()
                if 'fuzzy' in k]):
            # In fuzzy matching mode, we cannot (yet) derive the
            # lineage of any data we are creating. To avoid creating
            # false data entries, we currently do not save at all.
            self.log.warning(f"Not saving {d} while fuzzy matching is"
                             f" turned on.")
            return
        if self.context_config['allow_incomplete']:
            self.log.warning(f"Not saving {d} while loading incomplete"
                             f" data is allowed.")
            return

        # Save the target and any other outputs of the plugin.
        for d_to_save in set([d] + list(p.provides)):
            if d_to_save in savers and len(savers[d_to_save]):
                # This multi-output plugin was scanned before;
                # don't create doubled savers.
                assert p.multi_output
                continue

            key = strax.DataKey(run_id, d_to_save, p.lineage)

            for sf in self.storage:
                if sf.readonly:
                    continue
                if loading_this_data:
                    # Usually, we don't save if we're loading
                    if not self.context_config['storage_converter']:
                        continue
                    # ... but in storage converter mode we do:
                    try:
                        sf.find(key, **self._find_options)
                        # Already have this data in this backend
                        continue
                    except strax.DataNotAvailable:
                        # Don't have it, so let's save it!
                        pass
                # If we get here, we must try to save
                try:
                    savers[d_to_save].append(
                        sf.saver(key,
                                 metadata=p.metadata(run_id, d_to_save)))
                except strax.DataNotAvailable:
                    # This frontend cannot save. Too bad.
                    pass

    for d in targets:
        check_cache(d)
    plugins = to_compute

    intersec = list(plugins.keys() & loaders.keys())
    if len(intersec):
        raise RuntimeError(f"{intersec} both computed and loaded?!")

    # For the plugins which will run computations,
    # check all required options are available or set defaults.
    # Also run any user-defined setup.
    for p in plugins.values():
        self._set_plugin_config(p, run_id, tolerant=False)
        p.setup()
    return strax.ProcessorComponents(
        plugins=plugins,
        loaders=loaders,
        savers=dict(savers),
        targets=targets)
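# --- Illustrative sketch (not strax code) ---------------------------------
# The superrun branch above chains the per-subrun loaders into a single
# generator via concat_loader. Here each "loader" is an invented generator
# of chunk placeholders rather than a real strax loader.
def make_loader(subrun):
    def loader():
        for i in range(2):
            yield f"{subrun}:chunk{i}"   # stand-in for a strax chunk
    return loader

ldrs = [make_loader(r) for r in ('run_a', 'run_b')]

def concat_loader(*args, **kwargs):
    for x in ldrs:
        yield from x(*args, **kwargs)

print(list(concat_loader()))
# ['run_a:chunk0', 'run_a:chunk1', 'run_b:chunk0', 'run_b:chunk1']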