Example No. 1
import io
import os
import re
from tempfile import TemporaryFile

# NOTE: `s3` (a boto3 S3 service resource, e.g. `s3 = boto3.resource('s3')`)
# and `s3errors` (a context manager that translates S3 errors for a given
# path) are assumed to be defined elsewhere in this module.

class S3File(io.IOBase):
    """File-like proxy for S3 objects: manages upload and download of a
    locally managed temporary file.
    """

    def __init__(self, bucket, key, mode='w+b', *args, **kwargs):
        super(S3File, self).__init__(*args, **kwargs)
        self.bucket = bucket
        self.key = key
        self.mode = mode
        self.path = self.bucket + '/' + self.key

        # converts mode to readable/writable to enable the temporary file to have S3 data
        # read or written to it even if the S3File is read/write/append
        # i.e. "r" => "r+", "ab" => "a+b"
        updatable_mode = re.sub(r'^([rwa]+)(b?)$', r'\1+\2', mode)
        self._tempfile = TemporaryFile(updatable_mode)

        try:
            with s3errors(self.path):
                if 'a' in mode:
                    # append mode: start with the existing S3 content,
                    # positioned at the end of the file
                    s3.Object(bucket, key).download_fileobj(self._tempfile)
                    self.seek(0, os.SEEK_END)
                elif 'w' not in mode and 'x' not in mode:
                    # not a create mode, so it must be a read mode: start
                    # with the existing S3 content, positioned at the start
                    s3.Object(bucket, key).download_fileobj(self._tempfile)
                    self.seek(0, os.SEEK_SET)
        except Exception:
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        # io.IOBase requires close() to be idempotent; without this guard a
        # second call would seek on an already-closed temporary file
        if self.closed:
            return
        try:
            if self.writable():
                # push the local temporary file's contents back to S3
                self.seek(0)
                with s3errors(self.path):
                    s3.Object(self.bucket, self.key).upload_fileobj(self._tempfile)
        finally:
            self._tempfile.close()

    @property
    def closed(self):
        return self._tempfile.closed

    def fileno(self):
        return self._tempfile.fileno()

    def flush(self):
        return self._tempfile.flush()

    def isatty(self):
        return self._tempfile.isatty()

    def readable(self):
        return 'r' in self.mode or '+' in self.mode

    def read(self, n=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.read(n)

    def readinto(self, b):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.readinto(b)

    def readline(self, limit=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.readline(limit)

    def readlines(self, hint=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.readlines(hint)

    def seek(self, offset, whence=os.SEEK_SET):
        self._tempfile.seek(offset, whence)
        return self.tell()

    def seekable(self):
        return True

    def tell(self):
        return self._tempfile.tell()

    def writable(self):
        return 'w' in self.mode or 'a' in self.mode or '+' in self.mode or 'x' in self.mode

    def write(self, b):
        if not self.writable():
            raise IOError('not open for writing')
        self._tempfile.write(b)
        return len(b)

    def writelines(self, lines):
        if not self.writable():
            raise IOError('not open for writing')
        return self._tempfile.writelines(lines)

    def truncate(self, size=None):
        if not self.writable():
            raise IOError('not open for writing')

        if size is None:
            size = self.tell()

        self._tempfile.truncate(size)
        return size
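
A minimal usage sketch, assuming the stand-ins below live in the same module as S3File (the real code presumably has its own `s3` resource and `s3errors` context manager; the bucket and key here are hypothetical):

import contextlib

import boto3

s3 = boto3.resource('s3')

@contextlib.contextmanager
def s3errors(path):
    # stand-in: tag any S3 failure with the path it happened on
    try:
        yield
    except Exception as exc:
        raise IOError('{}: {}'.format(path, exc))

# write an object, then read it back; binary modes are used because
# upload_fileobj/download_fileobj operate on byte streams
with S3File('my-bucket', 'example/report.bin', mode='wb') as f:
    f.write(b'hello from S3File\n')

with S3File('my-bucket', 'example/report.bin', mode='rb') as f:
    print(f.read())

On exit from the first `with` block, close() uploads the temporary file to s3://my-bucket/example/report.bin; the second block downloads it into a fresh temporary file for reading.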
Example No. 2
import re
import sys
from os.path import basename, splitext
from tempfile import TemporaryFile

import h5py
import yaml

# NOTE: `datasets` (regex patterns for dataset names) and `StatsCounter` are
# assumed to come from this project's own modules.

class Normalizer:
    """
    Provide an iterator which yields h5 files together with a normalization
    factor.
    """
    signal_name_template_met_filter = (
        '{}-{stop_mass_gev}-{lsp_mass_gev}-TMF{met_filter_gev:.0f}')
    signal_name_template = '{}-{stop_mass_gev}-{lsp_mass_gev}'

    def __init__(self, meta_path, hfiles, lumi_fb=20.3, quiet=False):
        self.hfiles = hfiles
        with open(meta_path) as yml:
            self.filter_meta = yaml.safe_load(yml)
        self._lumi_fb = lumi_fb
        self.signal_prestring = 'scharm'
        self.outstream = sys.stdout
        self.bugstream = sys.stderr
        if quiet:
            self.outstream = TemporaryFile('w+')
            self.bugstream = TemporaryFile('w+')
        self.out_prepend = ''

    def _get_matched_signame(self, ds):
        """
        This is slightly hackish, but does the right thing by renaming the
        physics type.
        """
        scharm_finder = re.compile(datasets.scharm_re)
        stop_finder = re.compile(datasets.stop_re)
        stop_finder2 = re.compile(datasets.stop2_re)
        for finder in [scharm_finder, stop_finder, stop_finder2]:
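            # a failed search() returns None, so the `.group` lookup below
            # raises AttributeError, which is caught to try the next pattern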
            try:
                found = finder.search(ds).group
                generator_info = {
                    'stop_mass_gev': int(found(1)),
                    'lsp_mass_gev': int(found(2)),
                    }
                namestring = self.signal_name_template
                return namestring.format(self.signal_prestring,
                                         **generator_info)
            except AttributeError:
                pass
        return None

    def _get_physics_type(self, file_meta):
        full_name = file_meta['full_name']
        if 'physics_type' not in file_meta:
            if full_name.startswith('data'):
                physics_type = 'data'
            else:
                raise OSError('got unknown physics in {}'.format(full_name))
        else:
            physics_type = file_meta['physics_type']

        if 'signal' in physics_type:
            physics_type = self._get_matched_signame(full_name)
        if not physics_type:
            raise OSError("couldn't classify {}".format(full_name))
        return physics_type

    def _check_for_bugs(self, ds):
        full_name = ds['full_name']
        if 'total_xsec_fb' not in ds and not full_name.startswith('data'):
            self.bugstream.write(
                'no cross section for {}, skipping\n'.format(full_name))
            return True

        if 'n_corrupted_files' in ds:
            self.bugstream.write(
                '{} bad files in {}\n'.format(ds['n_corrupted_files'],
                                              full_name))

        return False

    def _get_hist_scaler(self, file_meta):
        """
        Factory for per-file scale factors: the returned function, when
        called on an open h5 file, returns the histogram normalization.
        """
        ds_name = file_meta['full_name']

        if ds_name.startswith('data'):
            def scaler_fact(hfile):
                return 1.0
        else:
            filteff = file_meta['filteff']
            xsec = file_meta['total_xsec_fb']
            kfactor = file_meta.get('kfactor', 1)
            # expected yield: xsec [fb] * k-factor * filter eff * lumi [fb^-1]
            n_before_sel = xsec * kfactor * filteff * self._lumi_fb
            def scaler_fact(hfile):
                sum_evt_weight = hfile.attrs['total_event_weight']
                return n_before_sel / sum_evt_weight
        return scaler_fact

    def _print_prog(self, filenum, numfiles):
        if self.outstream and self.outstream.isatty():
            self.outstream.write(
                '\r{}adding file {} of {}'.format(
                    self.out_prepend, filenum, numfiles))
            self.outstream.flush()

    def __iter__(self):
        """
        iterates over files, yielding (physics_type, file, normalization) tuples
        """
        stats = StatsCounter()

        numfiles = len(self.hfiles)
        for filenum, f in enumerate(self.hfiles):
            self._print_prog(filenum, numfiles)
            meta_name = basename(splitext(f)[0])

            file_meta = self.filter_meta[meta_name]
            if self._check_for_bugs(file_meta):
                continue

            scaler_fact = self._get_hist_scaler(file_meta)
            physics_type = self._get_physics_type(file_meta)

            with h5py.File(f, 'r') as hfile:
                stats.update(physics_type, file_meta, hfile)
                norm = scaler_fact(hfile)
                yield physics_type, hfile, norm

        if self.outstream and self.outstream.isatty():
            self.outstream.write('\n')
        stats.write_to(self.bugstream)

    def byid(self):
        """iterates, gets (dsid, file, normalization)"""

        numfiles = len(self.hfiles)
        for filenum, f in enumerate(self.hfiles):
            self._print_prog(filenum, numfiles)
            meta_name = basename(splitext(f)[0])

            file_meta = self.filter_meta[meta_name]
            if self._check_for_bugs(file_meta):
                continue

            scaler_fact = self._get_hist_scaler(file_meta)
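            # meta names are assumed to look like 's<dsid>' (a one-character
            # prefix before the numeric dataset id), hence the slice below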
            dsid = meta_name[1:]

            with h5py.File(f, 'r') as hfile:
                norm = scaler_fact(hfile)
                yield int(dsid), hfile, norm
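
A minimal usage sketch (hypothetical file names; assumes the metadata YAML maps each h5 file's basename to an entry with `full_name`, `total_xsec_fb`, and friends, as the class expects):

from glob import glob

hfiles = glob('hists/*.h5')
normalizer = Normalizer('filter_meta.yml', hfiles, lumi_fb=20.3, quiet=True)

for physics_type, hfile, norm in normalizer:
    # each histogram in `hfile` should be scaled by `norm` before merging
    # into the per-physics_type totals
    print(physics_type, norm)

Note that each `hfile` is only valid inside the loop body: the `with h5py.File(...)` block in `__iter__` closes it as soon as the generator advances.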