Example No. 1
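A unit test for straxen's RucioLocalFrontend: keys whose lineage was changed are not found by find or find_several, but are found again once fuzzy matching on the data type is enabled.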
    def test_find_fuzzy(self):
        changed_keys = []
        rucio_local = straxen.RucioLocalFrontend(path=self.rucio_path)
        for key in self.test_keys:
            changed_key = strax.DataKey(run_id=key.run_id,
                                        data_type=key.data_type,
                                        lineage={
                                            'dtype': ['Plugin', '1.0.0', {}],
                                        })
            changed_keys += [changed_key]

            # We shouldn't find this data
            with self.assertRaises(strax.DataNotAvailable):
                rucio_local.find(changed_key)

        # find_several shouldn't find any of them either
        find_several_keys = rucio_local.find_several(changed_keys)
        self.assertFalse(any(find_several_keys))

        # Now test fuzzy
        with self.assertWarns(UserWarning):
            find_several_keys_fuzzy = rucio_local.find_several(
                changed_keys,
                fuzzy_for=changed_keys[0].data_type,
            )
        self.assertTrue(all(find_several_keys_fuzzy))
Example No. 2
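The metadata method of a single-output plugin: the lineage_hash saved alongside the data comes from a DataKey built from the run id, the provided data type, and the plugin's lineage.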
 def metadata(self, run_id):
     """Metadata to save along with produced data"""
     return dict(run_id=run_id,
                 data_type=self.provides,
                 data_kind=self.data_kind,
                 dtype=self.dtype,
                 lineage_hash=strax.DataKey(run_id, self.provides,
                                            self.lineage).lineage_hash,
                 compressor=self.compressor,
                 lineage=self.lineage)
Example No. 3
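A test fixture that builds two DataKeys with dummy run ids and a minimal lineage, then writes test data into a local Rucio-style directory.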
 def setUp(self) -> None:
     self.test_keys = [
         strax.DataKey(run_id=run_id,
                       data_type='dtype',
                       lineage={
                           'dtype': ['Plugin', '0.0.0', {}],
                       }) for run_id in ('-1', '-2')
     ]
     self.rucio_path = './.test_rucio'
     self.write_test_data()
Example No. 4
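A merge helper: for every data type the plugin provides, a saver is created from its DataKey, the saver's directories are redirected to the merge path, and the result is either rechunked or copied to the staging directory.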
def merge(
        runid_str,  # run number padded with 0s
        dtype,  # data type 'level' e.g. records, peaklets
        st,  # strax context
        path  # path where the data is stored
):

    # get the storage paths, since we will need to reset them later
    _storage_paths = [storage.path for storage in st.storage]

    # initialize plugin needed for processing
    plugin = st._get_plugins((dtype, ), runid_str)[dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        saver = st.storage[0].saver(key, plugin.metadata(runid_str, keystring))
        # monkey patch the saver
        tmpname = os.path.split(saver.tempdirname)[1]
        dirname = os.path.split(saver.dirname)[1]
        saver.tempdirname = os.path.join(path, tmpname)
        saver.dirname = os.path.join(path, dirname)
        saver.is_forked = True
        # merge the jsons
        saver.close()

    # change the storage frontend to use the merged data
    st.storage[0] = strax.DataDirectory(path)

    # rechunk the data if we can
    for keystring in plugin.provides:
        rechunk = True
        if isinstance(plugin.rechunk_on_save, immutabledict):
            if not plugin.rechunk_on_save[keystring]:
                rechunk = False
        else:
            if not plugin.rechunk_on_save:
                rechunk = False

        if rechunk:
            print(f"Rechunking {keystring}")
            st.copy_to_frontend(runid_str, keystring, 1, rechunk=True)
        else:
            print(
                f"Not rechunking {keystring}. Just copy to the staging directory."
            )
            key = st.key_for(runid_str, keystring)
            src = os.path.join(st.storage[0].path, str(key))
            dest = os.path.join(st.storage[1].path, str(key))
            shutil.copytree(src, dest)

    # reset in case we need to merge more data
    st.storage = [strax.DataDirectory(p) for p in _storage_paths]
Example No. 5
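The multi-output variant of the metadata method: the DataKey, and therefore the lineage_hash, is built per requested data type rather than from self.provides as a whole.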
 def metadata(self, run_id, data_type):
     """Metadata to save along with produced data"""
     if data_type not in self.provides:
         raise RuntimeError(f"{data_type} not in {self.provides}?")
     return dict(run_id=run_id,
                 data_type=data_type,
                 data_kind=self.data_kind_for(data_type),
                 dtype=self.dtype_for(data_type),
                 lineage_hash=strax.DataKey(run_id, data_type,
                                            self.lineage).lineage_hash,
                 compressor=self.compressor,
                 lineage=self.lineage)
Example No. 6
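A class-level test fixture defining deliberately non-existing DataKeys that the RucioFrontend tests will try to find.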
 def setUpClass(cls) -> None:
     """
     For testing purposes, slightly alter the RucioFrontend such that
      we can run tests outside of dali too
     """
     # Some non-existing keys that we will try finding in the test cases.
     cls.test_keys = [
         strax.DataKey(run_id=run_id,
                       data_type='dtype',
                       lineage={
                           'dtype': ['Plugin', '0.0.0.', {}],
                       }) for run_id in ('-1', '-2')
     ]
Example No. 7
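A command-line script that combines strax output: for each provided data type it builds the DataKey, creates a saver, swaps the saver's temporary directory for the pre-combined data, and closes the saver to write the merged metadata.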
def main():
    parser = argparse.ArgumentParser(description="Combine strax output")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('dtype', help='dtype to combine')
    parser.add_argument('--context', help='Strax context')
    parser.add_argument('--input_path',
                        help='path where the temp directory is')
    parser.add_argument('--output_path',
                        help='final location of combined data')

    args = parser.parse_args()

    if os.path.exists(args.output_path):
        raise FileExistsError("Output path %s already exists" %
                              args.output_path)

    runid = args.dataset
    runid_str = "%06d" % runid
    dtype = args.dtype
    path = args.input_path
    output_path = args.output_path

    # get context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(output_path)]

    # initialize plugin needed for processing
    plugin = st._get_plugins((dtype, ), runid_str)[dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # setup rucio client
    rc = RucioSummoner()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        # the lineage hash names the output directory for this data type
        dirname = f"{runid_str}-{keystring}-{key.lineage_hash}"
        upload_path = os.path.join(output_path, dirname)

        saver = st.storage[0].saver(key, plugin.metadata(runid_str, keystring))
        saver.is_forked = True

        # swap the saver's temporary directory for the pre-combined data,
        # then close the saver so it writes the merged metadata
        tmpdir, tmpname = os.path.split(saver.tempdirname)
        rmtree(saver.tempdirname)
        copytree(os.path.join(path, tmpname), saver.tempdirname)
        saver.close()
Example No. 8
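The same fixture as above, extended to patch RucioFrontend with a dummy RSE when the tests do not run on RCC.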
    def setUpClass(cls) -> None:
        """
        For testing purposes, slightly alter the RucioFrontend such that
         we can run tests outside of dali too
        """
        if not straxen.utilix_is_configured():
            return
        if 'rcc' not in socket.getfqdn():
            # If we are not on RCC, for testing, add some dummy site
            straxen.RucioFrontend.local_rses = {
                'UC_DALI_USERDISK': r'.rcc.',
                'test_rucio': f'{socket.getfqdn()}'
            }
            straxen.RucioFrontend.get_rse_prefix = lambda *x: 'test_rucio'

        # Some non-existing keys that we will try finding in the test cases.
        cls.test_keys = [
            strax.DataKey(run_id=run_id,
                          data_type='dtype',
                          lineage={
                              'dtype': ['Plugin', '0.0.0.', {}],
                          }) for run_id in ('-1', '-2')
        ]
Example No. 9
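A context method returning DataKeys for many runs at once; when per-run defaults are disabled, the plugin lineage is computed only once and reused for every run.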
def keys_for_runs(self,
                  target: str,
                  run_ids: ty.Union[np.ndarray, list, tuple, str]
                  ) -> ty.List[strax.DataKey]:
    """
    Get the data-keys for a multitude of runs. If use_per_run_defaults
        is False (which it preferably is, see #246), getting many keys
        should be fast as we only compute the lineage once.

    :param run_ids: Runs to get datakeys for
    :param target: datatype requested
    :return: list of datakeys of the target for the given runs.
    """
    run_ids = strax.to_str_tuple(run_ids)

    if self.context_config['use_per_run_defaults']:
        return [self.key_for(r, target) for r in run_ids]
    elif len(run_ids):
        # Get the lineage once, since the context specifies that the
        # defaults may not change!
        p = self._get_plugins((target,), run_ids[0])[target]
        return [strax.DataKey(r, target, p.lineage) for r in run_ids]
    else:
        return []
Example No. 10
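The simplest case: build the DataKey for one run and target directly from the plugin's lineage.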
 def _key_for(self, run_id, target):
     p = self._get_plugins((target, ), run_id)[target]
     return strax.DataKey(run_id, target, p.lineage)
Example No. 11
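A version of the context's check_cache helper: the DataKey is used to look the data up in each storage frontend, to decide what still has to be computed, and to register savers.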
        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]
            key = strax.DataKey(run_id, d, p.lineage)

            for sb_i, sf in enumerate(self.storage):
                try:
                    # Bit clunky... but allows specifying executor later
                    sf.find(key, **self._find_options)
                    loaders[d] = partial(sf.loader,
                                         key,
                                         n_range=n_range,
                                         **self._find_options)
                    # Found it! No need to make it
                    del plugins[d]
                    break
                except strax.DataNotAvailable:
                    continue
            else:
                if time_range is not None:
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                if d in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies it cannot be created.")
                # Not in any cache. We will be computing it.
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data?
            if time_range is not None:
                # No, since we're not even getting the whole data.
                # Without this check, saving could be attempted if the
                # storage converter mode is enabled.
                self.log.warning(f"Not saving {d} while "
                                 f"selecting a time range in the run")
                return
            if any([
                    len(v) > 0 for k, v in self._find_options.items()
                    if 'fuzzy' in k
            ]):
                # In fuzzy matching mode, we cannot (yet) derive the lineage
                # of any data we are creating. To avoid creating false
                # data entries, we currently do not save at all.
                self.log.warning(f"Not saving {d} while fuzzy matching is "
                                 f"turned on.")
                return
            if self.context_config['allow_incomplete']:
                self.log.warning(f"Not saving {d} while loading incomplete "
                                 f"data is allowed.")
                return

            elif p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            for sf in self.storage:
                if sf.readonly:
                    continue
                if d not in to_compute:
                    if not self.context_config['storage_converter']:
                        continue
                    try:
                        sf.find(key, **self._find_options)
                        # Already have this data in this backend
                        continue
                    except strax.DataNotAvailable:
                        # Don't have it, so let's convert it!
                        pass
                try:
                    savers[d].append(sf.saver(key,
                                              metadata=p.metadata(run_id)))
                except strax.DataNotAvailable:
                    # This frontend cannot save. Too bad.
                    pass
Example No. 12
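A later revision of the same helper: the same DataKey lookup, but savers are now created for every output of a multi-output plugin, each with its own DataKey.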
        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]

            # Can we load this data, or must we compute it?
            loading_this_data = False
            key = strax.DataKey(run_id, d, p.lineage)
            for sb_i, sf in enumerate(self.storage):
                try:
                    # Partial is clunky... but allows specifying executor later
                    # Since it doesn't run until later, we must do a find now
                    # that we can still handle DataNotAvailable
                    sf.find(key, **self._find_options)
                    loaders[d] = partial(sf.loader,
                                         key,
                                         n_range=n_range,
                                         **self._find_options)
                except strax.DataNotAvailable:
                    continue
                else:
                    # Found it! No need to make it or look in other frontends
                    loading_this_data = True
                    del plugins[d]
                    break
            else:
                # Data not found anywhere. We will be computing it.
                if time_range is not None and not d.startswith('_temp'):
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                if d in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies it cannot be created.")
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data? If not, return.
            if (loading_this_data
                    and not self.context_config['storage_converter']):
                return
            if p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            # Warn about conditions that preclude saving, but the user
            # might not expect.
            if time_range is not None:
                # We're not even getting the whole data.
                # Without this check, saving could be attempted if the
                # storage converter mode is enabled.
                self.log.warning(f"Not saving {d} while "
                                 f"selecting a time range in the run")
                return
            if any([
                    len(v) > 0 for k, v in self._find_options.items()
                    if 'fuzzy' in k
            ]):
                # In fuzzy matching mode, we cannot (yet) derive the
                # lineage of any data we are creating. To avoid creating
                # false data entries, we currently do not save at all.
                self.log.warning(f"Not saving {d} while fuzzy matching is"
                                 f" turned on.")
                return
            if self.context_config['allow_incomplete']:
                self.log.warning(f"Not saving {d} while loading incomplete"
                                 f" data is allowed.")
                return

            # Save the target and any other outputs of the plugin.
            for d_to_save in set([d] + list(p.provides)):
                if d_to_save in savers and len(savers[d_to_save]):
                    # This multi-output plugin was scanned before
                    # let's not create doubled savers
                    assert p.multi_output
                    continue

                key = strax.DataKey(run_id, d_to_save, p.lineage)

                for sf in self.storage:
                    if sf.readonly:
                        continue
                    if loading_this_data:
                        # Usually, we don't save if we're loading
                        if not self.context_config['storage_converter']:
                            continue
                        # ... but in storage converter mode we do:
                        try:
                            sf.find(key, **self._find_options)
                            # Already have this data in this backend
                            continue
                        except strax.DataNotAvailable:
                            # Don't have it, so let's save it!
                            pass
                    # If we get here, we must try to save
                    try:
                        savers[d_to_save].append(
                            sf.saver(key,
                                     metadata=p.metadata(run_id, d_to_save)))
                    except strax.DataNotAvailable:
                        # This frontend cannot save. Too bad.
                        pass
Example No. 13
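A script that uploads combined output to Rucio: the DataKey's lineage_hash names both the upload directory and the Rucio DID, and the run database is updated once the upload succeeds.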
def main():
    parser = argparse.ArgumentParser(description="Upload combined output to rucio")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('dtype', help='dtype to upload')
    parser.add_argument('rse', help='Target RSE')
    parser.add_argument('--context', help='Strax context')

    args = parser.parse_args()

    tmp_path = tempfile.mkdtemp()


    runid = args.dataset
    runid_str = "%06d" % runid
    dtype = args.dtype
    rse = args.rse

    # get context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(tmp_path)]

    plugin = st._get_plugins((dtype,), runid_str)[dtype]

    rc = RucioSummoner()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        hash = key.lineage_hash
        # TODO check with utilix DB call that the hashes match?

        dirname = f"{runid_str}-{keystring}-{hash}"
        upload_path = os.path.join('combined', dirname)


        print(f"Uploading {dirname}")
        os.listdir(upload_path)

        # make a rucio DID
        did = make_did(runid, keystring, hash)

        # check if a rule already exists for this DID
        rucio_rule = rc.GetRule(upload_structure=did)

        # if not in rucio already and no rule exists, upload into rucio
        if not rucio_rule['exists']:
            result = rc.Upload(did,
                               upload_path,
                               rse,
                               lifetime=None)

            # check that upload was successful
            new_rule = rc.GetRule(upload_structure=did, rse=rse)

            # TODO check number of files

            new_data_dict = {}
            new_data_dict['location'] = rse
            new_data_dict['did'] = did
            new_data_dict['status'] = "transferred"
            new_data_dict['host'] = "rucio-catalogue"
            new_data_dict['type'] = keystring
            new_data_dict['lifetime'] = new_rule['expires']
            new_data_dict['protocol'] = 'rucio'
            new_data_dict['creation_time'] = datetime.datetime.utcnow().isoformat()
            new_data_dict['checksum'] = 'shit'
            db.update_data(runid, new_data_dict)
        else:
            print(f"Rucio rule already exists for {did}")
Example No. 14
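Another check_cache variant, here passing fuzzy-matching options to the storage loaders and savers.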
        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]
            key = strax.DataKey(run_id, d, p.lineage)

            for sb_i, sf in enumerate(self.storage):
                try:
                    loaders[d] = sf.loader(key,
                                           n_range=n_range,
                                           **self._fuzzy_options)
                    # Found it! No need to make it
                    del plugins[d]
                    break
                except strax.DataNotAvailable:
                    continue
            else:
                if time_range is not None:
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                # Not in any cache. We will be computing it.
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data?
            if time_range is not None:
                # No, since we're not even getting the whole data
                return
            elif p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            for sf in self.storage:
                if sf.readonly:
                    continue
                if d not in to_compute:
                    if not self.context_config['storage_converter']:
                        continue
                    try:
                        sf.find(key, **self._fuzzy_options)
                        # Already have this data in this backend
                        continue
                    except strax.DataNotAvailable:
                        pass
                try:
                    savers[d].append(
                        sf.saver(key,
                                 metadata=p.metadata(run_id),
                                 meta_only=p.save_meta_only))
                except strax.DataNotAvailable:
                    # This frontend cannot save. Too bad.
                    pass
Example No. 15
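The superrun-aware check_cache: for a superrun, a DataKey is built per subrun and the per-subrun loaders are concatenated into a single loader.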
        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]

            # Can we load this data?
            loading_this_data = False
            key = strax.DataKey(run_id, d, p.lineage)

            ldr = self._get_partial_loader_for(key,
                                               chunk_number=chunk_number,
                                               time_range=time_range)

            if not ldr and run_id.startswith('_'):
                if time_range is not None:
                    raise NotImplementedError("time range loading not yet "
                                              "supported for superruns")

                sub_run_spec = self.run_metadata(
                    run_id, 'sub_run_spec')['sub_run_spec']
                self.make(list(sub_run_spec.keys()), d)

                ldrs = []
                for subrun in sub_run_spec:
                    sub_key = strax.DataKey(
                        subrun, d,
                        self._get_plugins((d, ), subrun)[d].lineage)
                    if sub_run_spec[subrun] == 'all':
                        _subrun_time_range = None
                    else:
                        _subrun_time_range = sub_run_spec[subrun]
                    ldr = self._get_partial_loader_for(
                        sub_key,
                        time_range=_subrun_time_range,
                        chunk_number=chunk_number)
                    if not ldr:
                        raise RuntimeError(
                            f"Could not load {d} for subrun {subrun} "
                            f"even though we made it??")
                    ldrs.append(ldr)

                def concat_loader(*args, **kwargs):
                    for x in ldrs:
                        yield from x(*args, **kwargs)

                ldr = lambda *args, **kwargs: concat_loader(*args, **kwargs)

            if ldr:
                # Found it! No need to make it or look in other frontends
                loading_this_data = True
                loaders[d] = ldr
                del plugins[d]
            else:
                # Data not found anywhere. We will be computing it.
                if (time_range is not None
                        and plugins[d].save_when != strax.SaveWhen.NEVER):
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                if '*' in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies no new data can be created.")
                if d in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies it cannot be created.")
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data? If not, return.
            if (loading_this_data
                    and not self.context_config['storage_converter']):
                return
            if p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            # Warn about conditions that preclude saving, but the user
            # might not expect.
            if time_range is not None:
                # We're not even getting the whole data.
                # Without this check, saving could be attempted if the
                # storage converter mode is enabled.
                self.log.warning(f"Not saving {d} while "
                                 f"selecting a time range in the run")
                return
            if any([
                    len(v) > 0 for k, v in self._find_options.items()
                    if 'fuzzy' in k
            ]):
                # In fuzzy matching mode, we cannot (yet) derive the
                # lineage of any data we are creating. To avoid creating
                # false data entries, we currently do not save at all.
                self.log.warning(f"Not saving {d} while fuzzy matching is"
                                 f" turned on.")
                return
            if self.context_config['allow_incomplete']:
                self.log.warning(f"Not saving {d} while loading incomplete"
                                 f" data is allowed.")
                return

            # Save the target and any other outputs of the plugin.
            for d_to_save in set([d] + list(p.provides)):
                if d_to_save in savers and len(savers[d_to_save]):
                    # This multi-output plugin was scanned before
                    # let's not create doubled savers
                    assert p.multi_output
                    continue

                key = strax.DataKey(run_id, d_to_save, p.lineage)

                for sf in self.storage:
                    if sf.readonly:
                        continue
                    if loading_this_data:
                        # Usually, we don't save if we're loading
                        if not self.context_config['storage_converter']:
                            continue
                        # ... but in storage converter mode we do:
                        try:
                            sf.find(key, **self._find_options)
                            # Already have this data in this backend
                            continue
                        except strax.DataNotAvailable:
                            # Don't have it, so let's save it!
                            pass
                    # If we get here, we must try to save
                    try:
                        savers[d_to_save].append(
                            sf.saver(key,
                                     metadata=p.metadata(run_id, d_to_save)))
                    except strax.DataNotAvailable:
                        # This frontend cannot save. Too bad.
                        pass
Example No. 16
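A chunk-wise processing job: the input DataKey is built from the stored metadata's lineage, while the output DataKeys come from the freshly initialized plugin's lineage.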
def main():
    parser = argparse.ArgumentParser(
        description="Strax Processing With Outsource")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('--input_dtype', help='strax input')
    parser.add_argument('--output_dtype', help='strax output')
    parser.add_argument('--context', help='name of context')
    parser.add_argument('--chunks', nargs='*', help='chunk ids to download')

    args = parser.parse_args()

    # directory where we will be putting everything
    data_dir = './data'

    # get context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(data_dir)]

    runid = args.dataset
    in_dtype = args.input_dtype
    out_dtype = args.output_dtype
    hash = db.get_hash(args.context, in_dtype)

    # download the input data
    admix.download(runid,
                   in_dtype,
                   hash,
                   chunks=args.chunks,
                   location=data_dir)

    runid_str = "%06d" % runid
    input_metadata = st.get_metadata(runid_str, in_dtype)
    input_key = strax.DataKey(runid_str, in_dtype, input_metadata['lineage'])

    # initialize plugin needed for processing
    plugin = st._get_plugins((out_dtype, ), runid_str)[out_dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # setup savers
    savers = dict()
    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        saver = st.storage[0].saver(key, plugin.metadata(runid, keystring))
        saver.is_forked = True
        savers[keystring] = saver

    # setup a few more variables
    backend = st.storage[0].backends[0]
    dtype = literal_eval(input_metadata['dtype'])
    chunk_kwargs = dict(data_type=input_metadata['data_type'],
                        data_kind=input_metadata['data_kind'],
                        dtype=dtype)

    # process the chunks
    for chunk in args.chunks:
        # read in the input data for this chunk
        in_data = backend._read_and_format_chunk(
            backend_key=st.storage[0].find(input_key)[1],
            metadata=input_metadata,
            chunk_info=input_metadata['chunks'][int(chunk)],
            dtype=dtype,
            time_range=None,
            chunk_construction_kwargs=chunk_kwargs)

        # process this chunk
        output_data = plugin.do_compute(chunk_i=chunk, **{in_dtype: in_data})

        # save the output -- loop because the plugin can provide more than one output dtype
        for keystring, strax_chunk in output_data.items():
            savers[keystring].save(strax_chunk, chunk_i=int(chunk))
Example No. 17
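The same pattern wrapped in a reusable function: whole-run processing via st.make, or chunk-by-chunk processing with per-data-type savers built from DataKeys.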
def process(runid,
            out_dtype,
            st,
            chunks,
            close_savers=False,
            tmp_path='.tmp_for_strax'
            ):
    runid_str = "%06d" % runid
    t0 = time.time()

    # initialize plugin needed for processing this output type
    plugin = st._get_plugins((out_dtype,), runid_str)[out_dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # now move on to processing
    # if no chunks were passed, process the whole run; otherwise process only the listed chunks
    if chunks is None:
        print("Chunks is none -- processing whole thing!")
        # then we just process the whole thing
        for keystring in plugin.provides:
            print(f"Making {keystring}")
            st.make(runid_str, keystring,
                    max_workers=8,
                    allow_multiple=True,
                    )
            print(f"DONE processing {keystring}")

    # process chunk-by-chunk
    else:
        # setup savers
        savers = dict()
        for keystring in plugin.provides:
            print(f"Making {keystring}")
            key = strax.DataKey(runid_str, keystring, plugin.lineage)
            saver = st.storage[0].saver(key, plugin.metadata(runid, keystring))
            saver.is_forked = True
            savers[keystring] = saver

        # setup a few more variables
        # TODO not sure exactly how this works when an output plugin depends on >1 plugin
        # maybe that doesn't matter?
        in_dtype = plugin.depends_on[0]
        input_metadata = st.get_metadata(runid_str, in_dtype)
        input_key = strax.DataKey(runid_str, in_dtype, input_metadata['lineage'])
        backend = st.storage[0].backends[0]
        dtype = literal_eval(input_metadata['dtype'])
        chunk_kwargs = dict(data_type=input_metadata['data_type'],
                            data_kind=input_metadata['data_kind'],
                            dtype=dtype)

        for chunk in chunks:
            # read in the input data for this chunk
            chunk_info = None
            for chunk_md in input_metadata['chunks']:
                if chunk_md['chunk_i'] == int(chunk):
                    chunk_info = chunk_md
                    break
            assert chunk_info is not None, f"Could not find chunk_id: {chunk}"
            in_data = backend._read_and_format_chunk(backend_key=st.storage[0].find(input_key)[1],
                                                     metadata=input_metadata,
                                                     chunk_info=chunk_info,
                                                     dtype=dtype,
                                                     time_range=None,
                                                     chunk_construction_kwargs=chunk_kwargs
                                                    )
            # process this chunk
            output_data = plugin.do_compute(chunk_i=chunk, **{in_dtype: in_data})

            # save the output -- loop because the plugin can provide more than one output dtype
            for keystring, strax_chunk in output_data.items():
                savers[keystring].save(strax_chunk, chunk_i=int(chunk))

        if close_savers:
            for dtype, saver in savers.items():
                # copy the metadata to a tmp directory
                tmpdir = os.path.join(tmp_path, os.path.split(saver.tempdirname)[1])
                os.makedirs(tmpdir, exist_ok=True)
                for file in os.listdir(saver.tempdirname):
                    if file.endswith('json'):
                        src = os.path.join(saver.tempdirname, file)
                        dest = os.path.join(tmpdir, file)
                        copyfile(src, dest)
                saver.close()
    process_time = time.time() - t0
    print(f"=== Processing time for {out_dtype}: {process_time/60:0.2f} minutes === ")