def test_lz4frame_open_write_read_text():
    data = u'This is a test string'
    with lz4frame.open('testfile', mode='wt') as fp:
        fp.write(data)
    with lz4frame.open('testfile', mode='rt') as fp:
        data_out = fp.read()
    assert data_out == data
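A small variation on the text-mode round trip above (a sketch, not part of the original test suite): in text mode, lz4.frame.open also accepts the usual encoding/errors/newline arguments, which it forwards to the underlying io.TextIOWrapper.

def test_lz4frame_open_write_read_text_encoding():
    # Sketch: same round trip with an explicit encoding; non-ASCII text
    # exercises the codec on both the write and the read path.
    data = u'This is a test string \u00e9'
    with lz4frame.open('testfile', mode='wt', encoding='utf-8') as fp:
        fp.write(data)
    with lz4frame.open('testfile', mode='rt', encoding='utf-8') as fp:
        assert fp.read() == data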
def test_lz4frame_open_write_read(data, compression_level, block_linked,
                                  block_checksum, block_size,
                                  content_checksum, auto_flush, store_size,
                                  return_bytearray):
    kwargs = {}

    if store_size is True:
        kwargs['source_size'] = len(data)

    kwargs['compression_level'] = compression_level
    kwargs['block_size'] = block_size
    kwargs['block_linked'] = block_linked
    kwargs['content_checksum'] = content_checksum
    kwargs['block_checksum'] = block_checksum
    kwargs['auto_flush'] = auto_flush
    kwargs['return_bytearray'] = return_bytearray
    kwargs['mode'] = 'wb'

    with lz4frame.open('testfile', **kwargs) as fp:
        fp.write(data)

    with lz4frame.open('testfile', mode='r') as fp:
        data_out = fp.read()

    assert data_out == data
def test_lz4frame_open_write_read_text_iter():
    data = u'This is a test string'
    with lz4frame.open('testfile', mode='wt') as fp:
        fp.write(data)
    data_out = ''
    with lz4frame.open('testfile', mode='rt') as fp:
        for line in fp:
            data_out += line
    assert data_out == data
def iter_yaml_lz4_reports(fn):
    """Iterate YAML reports from a lz4 file
    """
    assert str(fn).endswith("lz4")

    fd = lz4frame.open(fn)
    blobgen = stream_yaml_blobs(fd)

    off, header = next(blobgen)
    headsha = hashlib.sha1(header)
    # XXX: bad header kills whole bucket
    header = yaml.load(header, Loader=CLoader)
    if not header.get("report_id"):
        header["report_id"] = generate_report_id(header)

    for off, entry in blobgen:
        entry_len = len(entry)
        esha = headsha.copy()
        esha.update(entry)
        esha = esha.digest()
        try:
            entry = yaml.load(entry, Loader=CLoader)
            if not entry:  # e.g. '---\nnull\n...\n'
                continue
            if "test_start_time" in entry and "test_start_time" in header:
                header.pop("test_start_time")
            entry.update(header)
            yield off, entry_len, esha, entry, None
        except Exception as exc:
            yield off, entry_len, esha, None, exc

    fd.close()
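A hedged usage sketch for iter_yaml_lz4_reports (the filename is hypothetical): each yielded tuple carries either a parsed entry or the exception that broke it, so a caller can tally failures without aborting the scan.

ok = failed = 0
for off, entry_len, esha, entry, exc in iter_yaml_lz4_reports("report.yaml.lz4"):
    if exc is None:
        ok += 1      # 'entry' is a parsed dict merged with the report header
    else:
        failed += 1  # 'entry' is None; 'exc' explains the YAML failure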
def writeout_measurement(msm_jstr, fn, update):
    """Safely write measurement to disk
    """
    # Different processes might be trying to write the same file at the same
    # time due to naming collisions. Use a safe tmpfile and atomic link
    # NamedTemporaryFile creates files with permissions 600
    # but we want other users (Nginx) to be able to read the measurement
    suffix = ".{}.tmp".format(os.getpid())
    with NamedTemporaryFile(suffix=suffix, dir=conf.msmtdir) as f:
        with lz4frame.open(f, "w") as lzf:
            lzf.write(msm_jstr)
            # os.fsync(lzf.fileno())

        final_fname = conf.msmtdir.joinpath(fn)
        try:
            os.chmod(f.name, 0o644)
            os.link(f.name, final_fname)
            metrics.incr("msmt_output_file_created")
        except FileExistsError:
            if update:
                # update access time - used for cache cleanup
                # no need to overwrite the file
                os.utime(final_fname)
                metrics.incr("msmt_output_file_updated")
            else:
                log.info("Refusing to overwrite %s", final_fname)
                metrics.incr("report_id_input_file_collision")
                metrics.incr("msmt_output_file_skipped")
                os.utime(final_fname)

    metrics.incr("wrote_uncompressed_bytes", len(msm_jstr))
def test_normalize_yaml_dns_consistency_2018(cans):
    can = cans["yaml18"]
    canfn = can.as_posix()
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
def test_lz4frame_flush():
    data_1 = b"This is a..."
    data_2 = b" test string!"

    with lz4frame.open("testfile", mode="w") as fp_write:
        fp_write.write(data_1)
        fp_write.flush()
        fp_write.write(data_2)

        # Only data flushed so far is readable while the writer is still open
        with lz4frame.open("testfile", mode="r") as fp_read:
            assert fp_read.read() == data_1

        fp_write.flush()

        with lz4frame.open("testfile", mode="r") as fp_read:
            assert fp_read.read() == data_1 + data_2
def load_multiple(fn, touch=True) -> tuple:
    """Load contents of cans. Decompress tar archives if found.
    Yields measurements one by one as:
    (string of JSON, None) or (None, msmt dict)
    """
    if touch:
        os.utime(fn)  # update access time - used for cache cleanup
    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
            while True:
                m = tf.next()
                if m is None:
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                if m.name.endswith(".json"):
                    for line in k:
                        yield (line, None)
                elif m.name.endswith(".yaml"):
                    continue  # FIXME
                    bucket_tstamp = "FIXME"
                    for msm in iter_yaml_msmt_normalized(k, bucket_tstamp):
                        yield (None, msm)
    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None)
    elif fn.endswith(".yaml.lz4"):
        with lz4frame.open(fn) as f:
            raise Exception("Unsupported format: YAML")
            bucket_tstamp = "FIXME"
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp):
                metrics.incr("yaml_normalization")
                yield (None, msm)
    else:
        raise RuntimeError(fn)
def openfile(filename, attr):
    """ Open file with different compression types """
    if filename.lower().endswith('.bz2'):
        return bz2.BZ2File(filename, attr)
    if filename.lower().endswith('.xz'):
        return lzma.open(filename, attr)
    if filename.lower().endswith('.gz'):
        return gzip.open(filename, attr)
    if filename.lower().endswith('.lz4'):
        return frame.open(filename, attr)
    return open(filename, attr)
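Since openfile() dispatches purely on the file extension, calling code stays identical across formats. A minimal sketch (filenames are hypothetical; the imports bz2, lzma, gzip and lz4.frame-as-frame from the snippet above are assumed):

for name in ("data.txt", "data.txt.gz", "data.txt.bz2",
             "data.txt.xz", "data.txt.lz4"):
    with openfile(name, "wb") as f:
        f.write(b"one line\n")  # same call regardless of compression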
def get_fd():
    for idx in get_next_index():
        fn = f"{args.output}_{idx:015d}"
        if args.lz4:
            outfile = lz4f.open(f"{fn}.lz4", mode='wt', encoding='utf-8',
                                compression_level=16)
        elif args.gzip:
            outfile = gzip.open(f"{fn}.gz", "wt", encoding='utf-8')
        else:
            # plain-text fallback; use the same indexed filename scheme
            # as the lz4/gzip branches
            outfile = open(fn, "w")
        yield (idx, fn, outfile)
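A usage sketch for the rotating-output generator above (args and get_next_index come from the surrounding script and are assumed here): each next() call hands back a fresh indexed file handle.

fds = get_fd()
idx, fn, outfile = next(fds)  # first output file, e.g. <output>_000000000000000
outfile.write("first batch\n")
outfile.close()
idx, fn, outfile = next(fds)  # rotate to the next indexed file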
def test_normalize_yaml_2016(cans):
    can = cans["yaml16"]
    canfn = can.as_posix()
    assert canfn.startswith("testdata/2016-07-07/20160706T000046Z-GB")
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
            if n == 0:
                with open("fastpath/tests/data/yaml16_0.json") as f:
                    exp = ujson.load(f)
                assert entry == exp
            elif n > 20:
                break
def test_normalize_yaml_dns_consistency_2017(cans):
    can = cans["yaml17"]
    canfn = can.as_posix()
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    # s3://ooni-data/autoclaved/jsonl.tar.lz4/2017-12-21/20171220T153044Z-BE-AS5432-dns_consistency-mnKRlHuqk8Eo6XMJt5ZkVQrgReaEXPEWaO9NafgXxSVIhAswTXT7QJc6zhsuttpK-0.1.0-probe.yaml.lz4
    # lz4cat <fn> | head -n1 | jq -S . > fastpath/tests/data/yaml17_0.json
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
            if n == 0:
                with open("fastpath/tests/data/yaml17_0.json") as f:
                    exp = ujson.load(f)
                assert entry == exp
            elif n > 20:
                break
def open_maybe_gzip(filename, mode='r'):
    # this _must_ be a str
    filename = str(filename)
    if filename.endswith(h5_constants.GZIP_SUFFIX):
        raw = gzip.open(filename, mode + 'b', 2)
    elif filename.endswith(h5_constants.LZ4_SUFFIX):
        raw = lz4.open(filename, mode + 'b')
    else:
        return open(filename, mode)

    bufsize = 1024 * 1024  # 1MB of buffering
    if mode == 'r':
        return io.BufferedReader(raw, buffer_size=bufsize)
    elif mode == 'w':
        return io.BufferedWriter(raw, buffer_size=bufsize)
    else:
        raise ValueError("Unsupported mode for compression: %s" % mode)
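A brief usage sketch, assuming h5_constants.LZ4_SUFFIX is '.lz4' and with a hypothetical path and handler: the wrapper returns a buffered binary stream for compressed files, so line iteration works the same as for a plain file.

with open_maybe_gzip("matrix.tsv.lz4", mode='r') as f:
    for line in f:        # bytes lines, read through the 1MB BufferedReader
        handle_line(line) # 'handle_line' is a placeholder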
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of cans. Decompress tar archives if found.
    Yields measurements one by one as:
    (string of JSON, None, None) or (None, msmt dict, None)
    """
    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
            while True:
                m = tf.next()
                if m is None:  # end of tarball
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        yield (line, None, None)
                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        yield (None, msm, None)
    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None, None)
    elif fn.endswith(".yaml.lz4"):
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                yield (None, msm, None)
    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        tf = tarfile.open(fn)
        while True:
            m = tf.next()
            if m is None:  # end of tarball
                tf.close()
                break
            log.debug("Loading %s", m.name)
            k = tf.extractfile(m)
            assert k is not None
            if not m.name.endswith(".post"):
                log.error("Unexpected filename")
                continue
            try:
                j = ujson.loads(k.read())
            except Exception:
                log.error(repr(k[:100]), exc_info=1)
                continue  # skip the bad post; 'j' is unbound here
            fmt = j.get("format", "")
            if fmt == "json":
                msm = j.get("content", {})
                yield (None, msm, None)
            elif fmt == "yaml":
                log.info("Skipping YAML")
            else:
                log.info("Ignoring invalid post")
    elif fn.endswith("/index.json.gz"):
        pass
    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")
def test_lz4frame_open_write_read_defaults(data):
    with lz4frame.open('testfile', mode='wb') as fp:
        fp.write(data)
    with lz4frame.open('testfile', mode='r') as fp:
        data_out = fp.read()
    assert data_out == data
def test_lz4frame_open_write(data):
    with lz4frame.open('testfile', mode='wb') as fp:
        fp.write(data)
from collections import defaultdict
import gzip
import lz4.frame as lz4f
import cloudpickle as cpkl
import json
import re
import os

import uproot
import numpy as np
from coffea import hist
from coffea.hist import export

import processmap

with lz4f.open("hists.cpkl.lz4") as fin:
    hists_unmapped = cpkl.load(fin)

hists = {}
for key, val in hists_unmapped.items():
    if isinstance(val, hist.Hist):
        hists[key] = processmap.apply(val)

if os.path.exists("templates.root"):
    os.remove("templates.root")
fout = uproot.create("templates.root")

nodata = re.compile("(?!data_obs)")
h = hists['templates_signalregion'][nodata]
lumi = 41.1
h.scale({p: lumi for p in h[nodata].identifiers('process')}, axis="process")
    1.27212, -0.000571640, 8.37289e-07, -5.20433e-10, 1.45375e-13,
    -1.50389e-17
])


def msd_weight(pt, eta):
    genw = gpar[0] + gpar[1] * np.power(pt * gpar[2], -gpar[3])
    ptpow = np.power.outer(pt, np.arange(cpar.size))
    cenweight = np.dot(ptpow, cpar)
    forweight = np.dot(ptpow, fpar)
    weight = np.where(np.abs(eta) < 1.3, cenweight, forweight)
    return genw * weight


corrections['msdweight'] = msd_weight

with lz4f.open("correction_files/pileup_mc.cpkl.lz4", "rb") as fin:
    pileup_corr = cloudpickle.load(fin)

with uproot.open(
    "correction_files/pileup_Cert_294927-306462_13TeV_PromptReco_Collisions17_withVar.root"
) as fin_pileup:
    norm = lambda x: x / x.sum()
    data_pu = norm(fin_pileup["pileup"].values)
    data_pu_puUp = norm(fin_pileup["pileup_plus"].values)
    data_pu_puDown = norm(fin_pileup["pileup_minus"].values)

    pileup_corr_puUp = {}
    pileup_corr_puDown = {}
    for k in pileup_corr.keys():
        mc_pu = norm(pileup_corr[k].value)
        mask = mc_pu > 0.
        else:
            hout['sumw'][dataset] += np.sum(df['scale1fb'])

        return hout

    def postprocess(self, accumulator):
        # set everything to 1/fb scale
        lumi = 1000  # [1/pb]

        scale = {}
        for dataset, dataset_sumw in accumulator['sumw'].items():
            scale[dataset] = lumi * self._corrections['xsections'][dataset] / dataset_sumw.value

        for h in accumulator.values():
            if isinstance(h, hist.Hist):
                h.scale(scale, axis="dataset")

        return accumulator


if __name__ == '__main__':
    with lz4f.open("corrections.cpkl.lz4", mode="rb") as fin:
        corrections = cloudpickle.load(fin)

    processor_instance = BoostedHbbProcessor(corrections=corrections)

    with lz4f.open('boostedHbbProcessor.cpkl.lz4', mode='wb',
                   compression_level=5) as fout:
        cloudpickle.dump(processor_instance, fout)
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of legacy cans and minicans.
    Decompress tar archives if found.
    Yields measurements one by one as:
    (string of JSON, None, uid) or (None, msmt dict, uid)
    The uid is either taken from the filename or generated by trivial_id
    for legacy cans
    """
    # TODO: split this and handle legacy cans and post/minicans independently
    if fn.endswith(".tar.lz4"):
        # Legacy lz4 cans
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
            while True:
                m = tf.next()
                if m is None:  # end of tarball
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        msm = ujson.loads(line)
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)
                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)
    elif fn.endswith(".json.lz4"):
        # Legacy lz4 json files
        with lz4frame.open(fn) as f:
            for line in f:
                msm = ujson.loads(line)
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)
    elif fn.endswith(".yaml.lz4"):
        # Legacy lz4 yaml files
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)
    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        tf = tarfile.open(fn)
        while True:
            m = tf.next()
            if m is None:  # end of tarball
                tf.close()
                break
            log.debug("Loading %s", m.name)
            k = tf.extractfile(m)
            assert k is not None
            if not m.name.endswith(".post"):
                log.error("Unexpected filename")
                continue
            try:
                j = ujson.loads(k.read())
            except Exception:
                log.error(repr(k[:100]), exc_info=1)
                continue
            fmt = j.get("format", "")
            if fmt == "json":
                msm = j.get("content", {})
                # extract msmt_uid from filename e.g:
                # ... /20210614004521.999962_JO_signal_68eb19b439326d60.post
                msmt_uid = m.name.rsplit("/", 1)[1]
                msmt_uid = msmt_uid[:-5]
                yield (None, msm, msmt_uid)
            elif fmt == "yaml":
                log.info("Skipping YAML")
            else:
                log.info("Ignoring invalid post")
    elif fn.endswith("/index.json.gz"):
        pass
    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")
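A consumption sketch for the loader above (the path and the store() helper are hypothetical): per the docstring, each yielded tuple is (string of JSON or None, msmt dict or None, uid), so a caller dispatches on which slot is filled.

for msm_jstr, msm, msmt_uid in load_multiple("testdata/2021-06-14/batch.tar.gz"):
    if msm is None:
        msm = ujson.loads(msm_jstr)  # raw JSON line from a legacy can
    store(msmt_uid, msm)  # 'store' is a placeholder for downstream handling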
import lz4.frame as lz4f
import cloudpickle
from coffea import hist

with lz4f.open("hists.cpkl.lz4", mode="r", compression_level=5) as fin:
    hists = cloudpickle.load(fin)

fig, ax, _ = hist.plot1d(hists["sr_met"], overlay="dataset")
# ax.set_xscale('log')
ax.set_yscale('log')
ax.set_ylim(0.1, 1e5)
fig.savefig("test.pdf")
# print(hists)
                    default=None,
                    help='Filename for the pyinstrument HTML profile output')
args = parser.parse_args()

# Set a list of preloaded columns, to profile the execution separately
# from the uproot deserialization
preload_items = {}

with open(args.samplejson) as fin:
    samplefiles = json.load(fin)
sample = samplefiles[args.sample]
filelist = []
for dataset, files in sample.items():
    for file in files[:args.limit]:
        filelist.append((dataset, file))

with lz4f.open(args.processor, mode="rb") as fin:
    processor_instance = cloudpickle.load(fin)

combined_accumulator = processor.dict_accumulator({
    'stats': processor.dict_accumulator({
        'nentries': processor.accumulator(0),
        'bytesread': processor.accumulator(0),
        'sumworktime': processor.accumulator(0.),
        'columns_accessed': processor.set_accumulator(),
    }),
    'job':
def get_pileup(item):
    dataset, filename = item
    file = uproot.open(filename)
    puhist = file["Pu"]
    pileup = processor.accumulator(np.zeros_like(puhist.values))
    pileup += puhist.values
    sumwhist = file["SumWeights"]
    sumw = processor.accumulator(np.zeros(1))
    sumw += sumwhist.values[0]
    return processor.dict_accumulator({
        'pileup': processor.dict_accumulator({dataset: pileup}),
        'sumw': processor.dict_accumulator({dataset: sumw}),
    })


final_accumulator = processor.dict_accumulator({
    'pileup': processor.dict_accumulator(),
    'sumw': processor.dict_accumulator(),
})
processor.futures_executor(filelist, get_pileup, final_accumulator, workers=8)

with lz4f.open("correction_files/pileup_mc.cpkl.lz4", "wb") as fout:
    cloudpickle.dump(final_accumulator['pileup'], fout)

with lz4f.open("correction_files/sumw_mc.cpkl.lz4", "wb") as fout:
    cloudpickle.dump(final_accumulator['sumw'], fout)