Example #1
    def merge(cls, chunks, data_type='<UNKNOWN>'):
        """Create chunk by merging columns of chunks of same data kind

        :param chunks: Chunks to merge. None is allowed and will be ignored.
        :param data_type: data_type name of new created chunk. Set to <UNKNOWN>
        if not provided.
        """
        chunks = [c for c in chunks if c is not None]
        if not chunks:
            raise ValueError("Need at least one chunk to merge")
        if len(chunks) == 1:
            return chunks[0]

        data_kinds = [c.data_kind for c in chunks]
        if len(set(data_kinds)) != 1:
            raise ValueError(f"Cannot merge chunks {chunks} of different"
                             f" data kinds: {data_kinds}")
        data_kind = data_kinds[0]

        run_ids = [c.run_id for c in chunks]
        if len(set(run_ids)) != 1:
            raise ValueError(
                f"Cannot merge chunks of different run_ids: {chunks}")
        run_id = run_ids[0]

        if len(set([len(c) for c in chunks])) != 1:
            raise ValueError(
                f"Cannot merge chunks with different number of items: {chunks}")

        tranges = [(c.start, c.end) for c in chunks]
        if len(set(tranges)) != 1:
            raise ValueError("Cannot merge chunks with different time "
                             f"ranges: {tranges}")
        start, end = tranges[0]

        data = strax.merge_arrs(
            [c.data for c in chunks],
            # Make sure dtype field order is consistent, regardless of the
            # order in which chunks are passed to merge:
            dtype=strax.merged_dtype(
                [c.dtype
                 for c in sorted(chunks,
                                 key=lambda x: x.data_type)]))

        return cls(
            start=start,
            end=end,
            dtype=data.dtype,
            data_type=data_type,
            data_kind=data_kind,
            run_id=run_id,
            data=data,
            target_size_mb=max([c.target_size_mb for c in chunks]))
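Here strax.merge_arrs does a column-wise (horizontal) merge of same-length structured arrays, and strax.merged_dtype combines their field lists. Below is a minimal pure-numpy sketch of that behavior; it uses numpy.lib.recfunctions as an illustrative stand-in, not strax itself:

import numpy as np
from numpy.lib import recfunctions as rfn

# Two same-length structured arrays of the same data kind:
a = np.array([(0, 10.0), (1, 20.0)],
             dtype=[('time', np.int64), ('area', np.float64)])
b = np.array([(5,), (6,)], dtype=[('width', np.int64)])

# Column-wise merge, analogous to strax.merge_arrs:
merged = rfn.merge_arrays([a, b], flatten=True)
print(merged.dtype.names)  # ('time', 'area', 'width')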
Example #2
    def do_compute(self, chunk_i=None, **kwargs):
        results = kwargs

        # Run the different plugin computations
        while True:
            for output_name, p in self.sub_plugins.items():
                if output_name in results:
                    continue
                # Sort deps so the input field order does not depend on
                # the order in which computations happened to finish.
                deps = sorted(p.depends_on)
                if any([d not in results for d in deps]):
                    continue
                compute_kwargs = dict(chunk_i=chunk_i)

                for kind, d_of_kind in p.dependencies_by_kind().items():
                    compute_kwargs[kind] = strax.merge_arrs(
                        [results[d] for d in d_of_kind])

                # Store compute result(s)
                r = p.do_compute(**compute_kwargs)
                if p.multi_output:
                    for d in r:
                        results[d] = r[d]
                else:
                    results[output_name] = r

                # Rescan plugins to see if we can compute anything more
                break

            else:
                # Nothing further to compute
                break
        for d in self.provides:
            assert d in results, f"Output {d} missing!"

        # Save anything we can through the inlined savers
        for d, savers in self.sub_savers.items():
            for s in savers:
                s.save(data=results[d], chunk_i=chunk_i)

        # Remove results we do not need to send
        for d in list(results.keys()):
            if d not in self.provides:
                del results[d]

        if not self.multi_output:
            results = results[self.provides[0]]

        return self._fix_output(results)
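The while True loop with the for ... else clause implements a simple rescan-until-done dependency resolver: each pass computes any plugin whose inputs are all available, restarts the scan after every success, and stops once a full pass produces nothing new. A self-contained sketch of the same pattern (the tasks mapping here is hypothetical, not part of strax):

def resolve(tasks, results):
    # tasks: {name: (list_of_dependency_names, function)}
    while True:
        for name, (deps, func) in tasks.items():
            if name in results:
                continue
            if any(d not in results for d in deps):
                continue
            results[name] = func(*(results[d] for d in deps))
            break  # rescan from the start after each new result
        else:
            break  # a full pass computed nothing more: done
    return results

print(resolve(
    {'a': ([], lambda: 1),
     'b': (['a'], lambda a: a + 1),
     'c': (['a', 'b'], lambda a, b: a + b)},
    {}))  # {'a': 1, 'b': 2, 'c': 3}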
Example #3
def merge_iters(iters):
    """Return iterator over merged arrays from several iterators
    :param iters: list, tuple, or dict of iters

    Iterators must already be synced to produce same-size chunks
    """
    if isinstance(iters, dict):
        iters = list(iters.values())
    iters = list(iters)

    if len(iters) == 1:
        yield from iters[0]
        return

    try:
        while True:
            yield strax.merge_arrs([next(it) for it in iters])
    except StopIteration:
        return
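A hedged usage sketch, assuming strax is installed and that strax.merge_arrs column-merges same-length structured arrays with distinct fields (as the examples above suggest): two synced generators each yield one chunk, and merge_iters merges them in lock-step:

import numpy as np

def one_chunk(name, values):
    # Toy iterator producing a single structured-array chunk
    yield np.array([(v,) for v in values], dtype=[(name, np.int64)])

for chunk in merge_iters([one_chunk('time', [0, 1]),
                          one_chunk('area', [9, 8])]):
    print(chunk['time'], chunk['area'])  # [0 1] [9 8]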
Example #4
def multi_run(f, run_ids, *args, max_workers=None, **kwargs):
    """Execute f(run_id, **kwargs) over multiple runs,
    then return list of results.

    :param run_ids: list/tuple of runids
    :param max_workers: number of worker threads/processes to spawn

    Other (kw)args will be passed to f
    """
    # Get a numpy array of run ids; try to convert them all to ints first.
    try:
        run_id_numpy = np.array([int(x) for x in run_ids],
                                dtype=np.int32)
    except ValueError:
        # If there are string ids among them, numpy will autocast
        # all the run ids to a fixed-width Unicode dtype.
        run_id_numpy = np.array(run_ids)

    # Probably we'll want to use dask for this in the future,
    # to enable cut history tracking and multiprocessing.
    # For some reason the ProcessPoolExecutor doesn't work??
    with ThreadPoolExecutor(max_workers=max_workers) as exc:
        futures = [exc.submit(f, r, *args, **kwargs)
                   for r in run_ids]
        for _ in tqdm(as_completed(futures),
                      desc="Loading %d runs" % len(run_ids)):
            pass

        result = []
        for i, future in enumerate(futures):
            r = future.result()
            ids = np.array([run_id_numpy[i]] * len(r),
                           dtype=[('run_id', run_id_numpy.dtype)])
            r = strax.merge_arrs([ids, r])
            result.append(r)
        return result
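The run_id tagging at the end is itself just a column merge: build a one-field structured array that repeats the run id to the result's length, then merge it in as an extra column. A pure-numpy stand-in (with numpy.lib.recfunctions in place of strax.merge_arrs):

import numpy as np
from numpy.lib import recfunctions as rfn

r = np.array([(0.5,), (0.7,)], dtype=[('area', np.float64)])
ids = np.array([(180215,)] * len(r), dtype=[('run_id', np.int32)])
tagged = rfn.merge_arrays([ids, r], flatten=True)
print(tagged.dtype.names)  # ('run_id', 'area')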
Example #5
def merge_iters(iters):
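    """Yield column-merged arrays from iterators that produce same-size
    chunks; a minimal variant of Example #3 above."""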
    try:
        while True:
            yield strax.merge_arrs([next(it) for it in iters])
    except StopIteration:
        return
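Note the explicit try/except StopIteration: since PEP 479 (the default from Python 3.7 on), a StopIteration that escapes a generator body is converted into a RuntimeError, so catching the exception that next() raises once the first input runs dry is what lets this generator end cleanly.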