class S3File(io.IOBase):
    """File-like proxy for an S3 object.

    All reads and writes go to a locally managed temporary file; the S3
    object is downloaded into it on open (for read/append modes) and
    uploaded from it on close (for writable modes).
    """

    def __init__(self, bucket, key, mode='w+b', *args, **kwargs):
        """Open an S3-backed file.

        :param bucket: S3 bucket name
        :param key: S3 object key
        :param mode: open mode, same semantics as builtin open()
        """
        super(S3File, self).__init__(*args, **kwargs)
        self.bucket = bucket
        self.key = key
        self.mode = mode
        self.path = self.bucket + '/' + self.key

        # Convert the mode to a readable/writable ("updatable") one so the
        # temporary file can have S3 data read or written to it even if the
        # S3File itself is read-only, write-only or append,
        # i.e. "r" => "r+", "ab" => "a+b".
        updatable_mode = re.sub(r'^([rwa]+)(b?)$', r'\1+\2', mode)
        self._tempfile = TemporaryFile(updatable_mode)

        try:
            with s3errors(self.path):
                if 'a' in mode:
                    # Append mode: start with the existing object content,
                    # positioned at the end.
                    s3.Object(bucket, key).download_fileobj(self._tempfile)
                    self.seek(0, os.SEEK_END)
                elif 'a' not in mode and 'w' not in mode and 'x' not in mode:
                    # Not a create mode, so it is a read mode: start with
                    # the existing content, positioned at the beginning.
                    s3.Object(bucket, key).download_fileobj(self._tempfile)
                    self.seek(0, os.SEEK_SET)
        except Exception:
            # Close the temp file directly rather than via self.close():
            # close() would try to upload a partially-downloaded buffer
            # back to S3 for writable modes.
            self._tempfile.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Upload the local content to S3 (if writable) and close.

        Safe to call more than once, per the io.IOBase contract.
        """
        if self._tempfile.closed:
            # Already closed: io.IOBase requires close() to be idempotent.
            return
        try:
            if self.writable():
                self.seek(0)
                with s3errors(self.path):
                    s3.Object(self.bucket, self.key).upload_fileobj(
                        self._tempfile)
        finally:
            self._tempfile.close()

    @property
    def closed(self):
        return self._tempfile.closed

    def fileno(self):
        return self._tempfile.fileno()

    def flush(self):
        return self._tempfile.flush()

    def isatty(self):
        return self._tempfile.isatty()

    def readable(self):
        return 'r' in self.mode or '+' in self.mode

    def read(self, n=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.read(n)

    def readinto(self, b):
        return self._tempfile.readinto(b)

    def readline(self, limit=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.readline(limit)

    def readlines(self, hint=-1):
        if not self.readable():
            raise IOError('not open for reading')
        return self._tempfile.readlines(hint)

    def seek(self, offset, whence=os.SEEK_SET):
        self._tempfile.seek(offset, whence)
        return self.tell()

    def seekable(self):
        return True

    def tell(self):
        return self._tempfile.tell()

    def writable(self):
        return ('w' in self.mode or 'a' in self.mode
                or '+' in self.mode or 'x' in self.mode)

    def write(self, b):
        if not self.writable():
            raise IOError('not open for writing')
        self._tempfile.write(b)
        return len(b)

    def writelines(self, lines):
        if not self.writable():
            raise IOError('not open for writing')
        return self._tempfile.writelines(lines)

    def truncate(self, size=None):
        if not self.writable():
            raise IOError('not open for writing')
        if size is None:
            size = self.tell()
        self._tempfile.truncate(size)
        return size
class Normalizer:
    """
    Provide an iterator which returns h5 files with a normalizer function.
    """
    signal_name_template_met_filter = (
        '{}-{stop_mass_gev}-{lsp_mass_gev}-TMF{met_filter_gev:.0f}')
    signal_name_template = '{}-{stop_mass_gev}-{lsp_mass_gev}'

    def __init__(self, meta_path, hfiles, lumi_fb=20.3, quiet=False):
        """
        :param meta_path: path to the yaml dataset metadata file
        :param hfiles: iterable of h5 file paths
        :param lumi_fb: integrated luminosity (fb^-1) used for scaling
        :param quiet: if True, divert progress/bug output to temp files
        """
        self.hfiles = hfiles
        with open(meta_path) as yml:
            # safe_load: the metadata is plain data, and yaml.load without
            # an explicit Loader is deprecated and can execute arbitrary
            # python tags.
            self.filter_meta = yaml.safe_load(yml)
        self._lumi_fb = lumi_fb
        self.signal_prestring = 'scharm'
        self.outstream = sys.stdout
        self.bugstream = sys.stderr
        if quiet:
            self.outstream = TemporaryFile('w+')
            self.bugstream = TemporaryFile('w+')
        self.out_prepend = ''

    def _get_matched_signame(self, ds):
        """
        This is slightly hackish, but does the right thing by renaming
        the physics type from the dataset name.  Returns None when no
        signal regex matches.
        """
        finders = [
            re.compile(datasets.scharm_re),
            re.compile(datasets.stop_re),
            re.compile(datasets.stop2_re),
        ]
        for finder in finders:
            try:
                # .search() returns None on no match, so found(...) below
                # raises AttributeError and we fall through to the next
                # regex (EAFP).
                found = finder.search(ds).group
                generator_info = {
                    'stop_mass_gev': int(found(1)),
                    'lsp_mass_gev': int(found(2)),
                }
                return self.signal_name_template.format(
                    self.signal_prestring, **generator_info)
            except AttributeError:
                pass
        return None

    def _get_physics_type(self, file_meta):
        """Classify a dataset as data / signal / its declared physics type."""
        full_name = file_meta['full_name']
        if 'physics_type' not in file_meta:
            if full_name.startswith('data'):
                physics_type = 'data'
            else:
                raise OSError('got unknown physics in {}'.format(full_name))
        else:
            physics_type = file_meta['physics_type']
        if 'signal' in physics_type:
            physics_type = self._get_matched_signame(full_name)
            if not physics_type:
                raise OSError("couldn't classify {}".format(full_name))
        return physics_type

    def _check_for_bugs(self, ds):
        """Return True if the dataset should be skipped; warn on bad files."""
        full_name = ds['full_name']
        if 'total_xsec_fb' not in ds and not full_name.startswith('data'):
            self.bugstream.write(
                'no cross section for {}, skipping\n'.format(full_name))
            return True
        if 'n_corrupted_files' in ds:
            # Only a warning: corrupted files don't invalidate the dataset.
            self.bugstream.write(
                '{} bad files in {}\n'.format(ds['n_corrupted_files'],
                                              full_name))
        return False

    def _get_hist_scaler(self, file_meta):
        """
        Factory of scaler factories: the returned function returns a
        scale factor after it's been called on the h5 file.  Data gets a
        unit scale; MC is scaled by xsec * kfactor * filteff * lumi over
        the file's total event weight.
        """
        ds_name = file_meta['full_name']
        if ds_name.startswith('data'):
            def scalar_fact(hfile):
                return 1.0
        else:
            filteff = file_meta['filteff']
            xsec = file_meta['total_xsec_fb']
            kfactor = file_meta.get('kfactor', 1)
            n_before_sel = xsec * kfactor * filteff * self._lumi_fb

            def scalar_fact(hfile):
                sum_evt_weight = hfile.attrs['total_event_weight']
                return n_before_sel / sum_evt_weight
        return scalar_fact

    def _print_prog(self, filenum, numfiles):
        """Write an in-place progress line (tty only)."""
        if self.outstream and self.outstream.isatty():
            self.outstream.write(
                '\r{}adding file {} of {}'.format(
                    self.out_prepend, filenum, numfiles))
            self.outstream.flush()

    def _valid_files(self):
        """
        Shared pipeline for __iter__ / byid: report progress, look up the
        file metadata, skip buggy datasets, and build the scaler factory.
        Yields (path, meta_name, file_meta, scaler_fact).
        """
        numfiles = len(self.hfiles)
        for filenum, fpath in enumerate(self.hfiles):
            self._print_prog(filenum, numfiles)
            meta_name = basename(splitext(fpath)[0])
            file_meta = self.filter_meta[meta_name]
            if self._check_for_bugs(file_meta):
                continue
            yield fpath, meta_name, file_meta, self._get_hist_scaler(file_meta)

    def __iter__(self):
        """
        Iterates over files, gets (physics_type, file, normalization)
        tuples.
        """
        stats = StatsCounter()
        for fpath, _, file_meta, scaler_fact in self._valid_files():
            physics_type = self._get_physics_type(file_meta)
            with h5py.File(fpath, 'r') as hfile:
                stats.update(physics_type, file_meta, hfile)
                norm = scaler_fact(hfile)
                yield physics_type, hfile, norm
        if self.outstream and self.outstream.isatty():
            self.outstream.write('\n')
        stats.write_to(self.bugstream)

    def byid(self):
        """Iterates, gets (dsid, file, normalization) tuples."""
        for fpath, meta_name, _, scaler_fact in self._valid_files():
            # dataset id is the meta name minus its leading prefix character
            dsid = meta_name[1:]
            with h5py.File(fpath, 'r') as hfile:
                norm = scaler_fact(hfile)
                yield int(dsid), hfile, norm