@classmethod
def merge(cls, chunks, data_type='<UNKNOWN>'):
    """Create a chunk by merging columns of chunks of the same data kind.

    :param chunks: Chunks to merge. None entries are allowed and will
        be ignored.
    :param data_type: data_type name of the newly created chunk.
        Defaults to '<UNKNOWN>' if not provided.
    """
    chunks = [c for c in chunks if c is not None]
    if not chunks:
        raise ValueError("Need at least one chunk to merge")
    if len(chunks) == 1:
        return chunks[0]

    data_kinds = [c.data_kind for c in chunks]
    if len(set(data_kinds)) != 1:
        raise ValueError(f"Cannot merge chunks {chunks} of different"
                         f" data kinds: {data_kinds}")
    data_kind = data_kinds[0]

    run_ids = [c.run_id for c in chunks]
    if len(set(run_ids)) != 1:
        raise ValueError(
            f"Cannot merge chunks of different run_ids: {chunks}")
    run_id = run_ids[0]

    if len(set(len(c) for c in chunks)) != 1:
        raise ValueError(
            f"Cannot merge chunks with different numbers of items: {chunks}")

    tranges = [(c.start, c.end) for c in chunks]
    if len(set(tranges)) != 1:
        raise ValueError("Cannot merge chunks with different time "
                         f"ranges: {tranges}")
    start, end = tranges[0]

    data = strax.merge_arrs(
        [c.data for c in chunks],
        # Make sure the dtype field order is consistent, regardless of
        # the order in which chunks are passed to merge:
        dtype=strax.merged_dtype(
            [c.dtype for c in sorted(chunks, key=lambda x: x.data_type)]))

    return cls(
        start=start,
        end=end,
        dtype=data.dtype,
        data_type=data_type,
        data_kind=data_kind,
        run_id=run_id,
        data=data,
        target_size_mb=max(c.target_size_mb for c in chunks))
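# Example (an illustrative sketch, not from the strax test suite: the field
# names, data_type names, and values below are made up). Two chunks covering
# the same run and time range, holding different columns of the same data
# kind, merge into one chunk whose data has the union of the columns. This
# assumes the keyword Chunk constructor used by merge() above.
import numpy as np
import strax

a = np.zeros(2, dtype=[(('Start time', 'time'), np.int64),
                       (('End time', 'endtime'), np.int64),
                       (('Area', 'area'), np.float32)])
b = np.zeros(2, dtype=[(('Start time', 'time'), np.int64),
                       (('End time', 'endtime'), np.int64),
                       (('Width', 'width'), np.float32)])
a['time'] = b['time'] = [0, 5]
a['endtime'] = b['endtime'] = [5, 10]

def _as_chunk(data, data_type):
    # Hypothetical helper for this example only
    return strax.Chunk(start=0, end=10, run_id='0',
                       data_type=data_type, data_kind='peaks',
                       dtype=data.dtype, data=data, target_size_mb=200)

merged = strax.Chunk.merge(
    [_as_chunk(a, 'peak_basics'), _as_chunk(b, 'peak_widths'), None],
    data_type='merged_peaks')
# merged.data.dtype now contains time, endtime, area, and width;
# the None entry was ignored, as documented above.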
def do_compute(self, chunk_i=None, **kwargs):
    results = kwargs

    # Run the different plugin computations
    while True:
        for output_name, p in self.sub_plugins.items():
            if output_name in results:
                continue

            # Sort deps, since otherwise the input field order depends
            # on the order in which the computations happened, which
            # might be bad?
            deps = sorted(p.depends_on)
            if any(d not in results for d in deps):
                continue

            compute_kwargs = dict(chunk_i=chunk_i)
            for kind, d_of_kind in p.dependencies_by_kind().items():
                compute_kwargs[kind] = strax.merge_arrs(
                    [results[d] for d in d_of_kind])

            # Store the compute result(s)
            r = p.do_compute(**compute_kwargs)
            if p.multi_output:
                for d in r:
                    results[d] = r[d]
            else:
                results[output_name] = r

            # Rescan plugins to see if we can compute anything more
            break

        else:
            # Nothing further to compute
            break

    for d in self.provides:
        assert d in results, f"Output {d} missing!"

    # Save anything we can through the inlined savers
    for d, savers in self.sub_savers.items():
        for s in savers:
            s.save(data=results[d], chunk_i=chunk_i)

    # Remove results we do not need to send
    for d in list(results.keys()):
        if d not in self.provides:
            del results[d]

    if not self.multi_output:
        results = results[self.provides[0]]
    return self._fix_output(results)
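# The resolution loop above is just "scan until nothing new computes":
# each pass computes any sub-plugin whose dependencies are all available,
# then rescans from the top. A self-contained sketch of the same pattern
# (the names and the sum() stand-in for do_compute are illustrative):
sub_plugins = {'b': ('a',),        # 'b' depends on 'a'
               'c': ('a', 'b')}    # 'c' depends on 'a' and 'b'
results = {'a': 1}                 # seed inputs, like kwargs above

while True:
    for name, deps in sub_plugins.items():
        if name in results:
            continue
        if any(d not in results for d in deps):
            continue
        results[name] = sum(results[d] for d in deps)
        break          # a new result may unblock others: rescan
    else:
        break          # a full pass produced nothing new: done

assert set(results) == {'a', 'b', 'c'}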
def merge_iters(iters):
    """Return an iterator over merged arrays from several iterators.

    :param iters: list, tuple, or dict of iterators.
        The iterators must already be synced to produce same-size chunks.
    """
    if isinstance(iters, dict):
        iters = list(iters.values())
    iters = list(iters)

    if len(iters) == 1:
        # Nothing to merge: pass the single iterator through directly
        yield from iters[0]
        return

    try:
        while True:
            yield strax.merge_arrs([next(it) for it in iters])
    except StopIteration:
        return
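# Example (illustrative; the dtypes are made up): two synced iterators,
# each yielding equal-length structured arrays, merged chunk by chunk.
import numpy as np

times = iter([np.zeros(3, dtype=[('time', np.int64)]) for _ in range(2)])
areas = iter([np.zeros(3, dtype=[('area', np.float32)]) for _ in range(2)])

for merged in merge_iters([times, areas]):
    print(merged.dtype.names)    # ('time', 'area'), printed twice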
def multi_run(f, run_ids, *args, max_workers=None, **kwargs):
    """Execute f(run_id, *args, **kwargs) over multiple runs,
    then return a list of results.

    :param run_ids: list/tuple of run ids
    :param max_workers: number of worker threads/processes to spawn

    Other (kw)args will be passed to f.
    """
    # Get a numpy array of run ids, cast to int if possible
    try:
        run_id_numpy = np.array([int(x) for x in run_ids],
                                dtype=np.int32)
    except ValueError:
        # If there are string ids among them, numpy will autocast
        # all the run ids to fixed-width unicode
        run_id_numpy = np.array(run_ids)

    # Probably we'll want to use dask for this in the future,
    # to enable cut history tracking and multiprocessing.
    # For some reason the ProcessPoolExecutor doesn't work??
    with ThreadPoolExecutor(max_workers=max_workers) as exc:
        futures = [exc.submit(f, r, *args, **kwargs)
                   for r in run_ids]
        for _ in tqdm(as_completed(futures),
                      total=len(futures),
                      desc="Loading %d runs" % len(run_ids)):
            pass

    result = []
    for i, future in enumerate(futures):
        r = future.result()
        # Label each row of the result with its run id
        ids = np.array([run_id_numpy[i]] * len(r),
                       dtype=[('run_id', run_id_numpy.dtype)])
        result.append(strax.merge_arrs([ids, r]))
    return result
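# Toy example (load_toy is a made-up stand-in, not a real data-loading
# function): each per-run result is a small structured array, and
# multi_run prepends a run_id column before returning the list of
# labeled arrays.
import numpy as np

def load_toy(run_id):
    return np.zeros(2, dtype=[('area', np.float32)])

arrays = multi_run(load_toy, ['0', '1'], max_workers=2)
# arrays[0]['run_id'] == [0, 0]; arrays[1]['run_id'] == [1, 1]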