Example #1
def test_lz4frame_open_write_read_text():
    data = u'This is a test string'
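    # 'wt'/'rt' modes wrap the frame file in a TextIOWrapper, so str round-trips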
    with lz4frame.open('testfile', mode='wt') as fp:
        fp.write(data)
    with lz4frame.open('testfile', mode='rt') as fp:
        data_out = fp.read()
    assert data_out == data
Example #2
def test_lz4frame_open_write_read(data, compression_level, block_linked,
                                  block_checksum, block_size, content_checksum,
                                  auto_flush, store_size, return_bytearray):

    kwargs = {}

    if store_size is True:
        kwargs['source_size'] = len(data)

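    # the remaining options are forwarded by lz4frame.open() to the
    # underlying LZ4FrameFile and control the frame header and compression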
    kwargs['compression_level'] = compression_level
    kwargs['block_size'] = block_size
    kwargs['block_linked'] = block_linked
    kwargs['content_checksum'] = content_checksum
    kwargs['block_checksum'] = block_checksum
    kwargs['auto_flush'] = auto_flush
    kwargs['return_bytearray'] = return_bytearray
    kwargs['mode'] = 'wb'

    with lz4frame.open('testfile', **kwargs) as fp:
        fp.write(data)

    with lz4frame.open('testfile', mode='r') as fp:
        data_out = fp.read()

    assert data_out == data
Example #3
def test_lz4frame_open_write_read_text_iter():
    data = u'This is a test string'
    with lz4frame.open('testfile', mode='wt') as fp:
        fp.write(data)
    data_out = ''
    with lz4frame.open('testfile', mode='rt') as fp:
        for line in fp:
            data_out += line
    assert data_out == data
Example #4
def iter_yaml_lz4_reports(fn):
    """Iterate YAML reports from a lz4 file
    """
    assert str(fn).endswith("lz4")

    fd = lz4frame.open(fn)
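    # default mode is 'rb', so stream_yaml_blobs() sees decompressed bytes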
    blobgen = stream_yaml_blobs(fd)

    off, header = next(blobgen)
    headsha = hashlib.sha1(header)
    # XXX: bad header kills whole bucket
    header = yaml.load(header, Loader=CLoader)
    if not header.get("report_id"):
        header["report_id"] = generate_report_id(header)

    for off, entry in blobgen:
        entry_len = len(entry)
        esha = headsha.copy()
        esha.update(entry)
        esha = esha.digest()
        try:
            entry = yaml.load(entry, Loader=CLoader)
            if not entry:  # e.g. '---\nnull\n...\n'
                continue
            if "test_start_time" in entry and "test_start_time" in header:
                header.pop("test_start_time")
            entry.update(header)
            yield off, entry_len, esha, entry, None
        except Exception as exc:
            yield off, entry_len, esha, None, exc

    fd.close()
Example #5
def writeout_measurement(msm_jstr, fn, update):
    """Safely write measurement to disk
    """
    # Different processes might be trying to write the same file at the same
    # time due to naming collisions. Use a safe tmpfile and atomic link
    # NamedTemporaryFile creates files with permissions 600
    # but we want other users (Nginx) to be able to read the measurement

    suffix = ".{}.tmp".format(os.getpid())
    with NamedTemporaryFile(suffix=suffix, dir=conf.msmtdir) as f:
        with lz4frame.open(f, "w") as lzf:
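            # lz4frame.open() accepts an already-open file object (here the
            # tempfile) as well as a path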
            lzf.write(msm_jstr)
            # os.fsync(lzf.fileno())

            final_fname = conf.msmtdir.joinpath(fn)
            try:
                os.chmod(f.name, 0o644)
                os.link(f.name, final_fname)
                metrics.incr("msmt_output_file_created")
            except FileExistsError:
                if update:
                    # update access time - used for cache cleanup
                    # no need to overwrite the file
                    os.utime(final_fname)
                    metrics.incr("msmt_output_file_updated")
                else:
                    log.info("Refusing to overwrite %s", final_fname)
                    metrics.incr("report_id_input_file_collision")
                    metrics.incr("msmt_output_file_skipped")
                    os.utime(final_fname)

    metrics.incr("wrote_uncompressed_bytes", len(msm_jstr))
Example #6
def test_normalize_yaml_dns_consistency_2018(cans):
    can = cans["yaml18"]
    canfn = can.as_posix()
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
Example #7
def test_lz4frame_flush():
    data_1 = b"This is a..."
    data_2 = b" test string!"

    with lz4frame.open("testfile", mode="w") as fp_write:
        fp_write.write(data_1)
        fp_write.flush()
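        # the frame data written so far is flushed to the file, so a separate
        # reader sees exactly data_1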

        fp_write.write(data_2)

        with lz4frame.open("testfile", mode="r") as fp_read:
            assert fp_read.read() == data_1

        fp_write.flush()

        with lz4frame.open("testfile", mode="r") as fp_read:
            assert fp_read.read() == data_1 + data_2
Example #8
def load_multiple(fn, touch=True) -> tuple:
    """Load contents of cans. Decompress tar archives if found.
    Yields measurements one by one as:
        (string of JSON, None) or (None, msmt dict)
    """
    if touch:
        os.utime(fn)  # update access time - used for cache cleanup

    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
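            # stream the tar directly out of the decompressed lz4 stream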
            while True:
                m = tf.next()
                if m is None:
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                if m.name.endswith(".json"):
                    for line in k:
                        yield (line, None)

                elif m.name.endswith(".yaml"):
                    continue  # FIXME
                    bucket_tstamp = "FIXME"
                    for msm in iter_yaml_msmt_normalized(k, bucket_tstamp):
                        yield (None, msm)

    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None)

    elif fn.endswith(".yaml.lz4"):
        with lz4frame.open(fn) as f:
            raise Exception("Unsupported format: YAML")
            bucket_tstamp = "FIXME"
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp):
                metrics.incr("yaml_normalization")
                yield (None, msm)

    else:
        raise RuntimeError(fn)
Example #9
def openfile(filename, attr):
    """
    Open file with different compression types
    """
    if filename.lower().endswith('.bz2'):
        return bz2.BZ2File(filename, attr)
    if filename.lower().endswith('.xz'):
        return lzma.open(filename, attr)
    if filename.lower().endswith('.gz'):
        return gzip.open(filename, attr)
    if filename.lower().endswith('.lz4'):
        return frame.open(filename, attr)
    return open(filename, attr)
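
A brief usage sketch (hedged: the filename and handle_line are placeholders,
and the .lz4 branch assumes frame is lz4.frame):

with openfile('measurements.json.lz4', 'rb') as fh:
    for line in fh:
        handle_line(line)  # placeholder consumer

Binary mode is the safe common denominator: bz2.BZ2File accepts only binary
modes, while the other branches accept either binary or text.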
Example #10
def get_fd():
    for idx in get_next_index():
        fn = f"{args.output}_{idx:015d}"
        if args.lz4:
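            # text-mode lz4 writer: encoding is applied by the TextIOWrapper,
            # compression_level by the underlying LZ4 frame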
            outfile = lz4f.open(f"{fn}.lz4",
                                mode='wt',
                                encoding='utf-8',
                                compression_level=16)
        elif args.gzip:
            outfile = gzip.open(f"{fn}.gz", "wt", encoding='utf-8')
        else:
            outfile = open(args.output, "w")
        yield (idx, fn, outfile)
Example #11
def test_normalize_yaml_2016(cans):
    can = cans["yaml16"]
    canfn = can.as_posix()
    assert canfn.startswith("testdata/2016-07-07/20160706T000046Z-GB")
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
            if n == 0:
                with open("fastpath/tests/data/yaml16_0.json") as f:
                    exp = ujson.load(f)
                assert entry == exp
            elif n > 20:
                break
Example #12
def test_normalize_yaml_dns_consistency_2017(cans):
    can = cans["yaml17"]
    canfn = can.as_posix()
    day = canfn.split("/")[1]
    rfn = canfn.split("/", 1)[1][:-4]  # remove testdata/ and .lz4
    # s3://ooni-data/autoclaved/jsonl.tar.lz4/2017-12-21/20171220T153044Z-BE-AS5432-dns_consistency-mnKRlHuqk8Eo6XMJt5ZkVQrgReaEXPEWaO9NafgXxSVIhAswTXT7QJc6zhsuttpK-0.1.0-probe.yaml.lz4
    # lz4cat <fn> | head -n1 | jq -S . > fastpath/tests/data/yaml17_0.json
    with lz4frame.open(can) as f:
        for n, entry in enumerate(norm.iter_yaml_msmt_normalized(f, day, rfn)):
            ujson.dumps(entry)  # ensure it's serializable
            if n == 0:
                with open("fastpath/tests/data/yaml17_0.json") as f:
                    exp = ujson.load(f)
                assert entry == exp
            elif n > 20:
                break
Example #13
def open_maybe_gzip(filename, mode='r'):
    # this _must_ be a str
    filename = str(filename)
    if filename.endswith(h5_constants.GZIP_SUFFIX):
        raw = gzip.open(filename, mode + 'b', 2)
    elif filename.endswith(h5_constants.LZ4_SUFFIX):
        raw = lz4.open(filename, mode + 'b')
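        # assumes lz4 is bound to the lz4.frame module (e.g. import lz4.frame as lz4)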
    else:
        return open(filename, mode)

    bufsize = 1024 * 1024  # 1MB of buffering
    if mode == 'r':
        return io.BufferedReader(raw, buffer_size=bufsize)
    elif mode == 'w':
        return io.BufferedWriter(raw, buffer_size=bufsize)
    else:
        raise ValueError("Unsupported mode for compression: %s" % mode)
Example #14
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of cans. Decompress tar archives if found.
    Yields measurements one by one as:
        (string of JSON, None, None) or (None, msmt dict, None)
    """
    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
            while True:
                m = tf.next()
                if m is None:
                    # end of tarball
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        yield (line, None, None)

                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        yield (None, msm, None)

    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None, None)

    elif fn.endswith(".yaml.lz4"):
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                yield (None, msm, None)

    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        tf = tarfile.open(fn)
        while True:
            m = tf.next()
            if m is None:
                # end of tarball
                tf.close()
                break
            log.debug("Loading %s", m.name)
            k = tf.extractfile(m)
            assert k is not None
            if not m.name.endswith(".post"):
                log.error("Unexpected filename")
                continue

            try:
                data = k.read()
                j = ujson.loads(data)
            except Exception:
                # log the first bytes of the unparsable payload, then skip it
                log.error(repr(data[:100]), exc_info=1)
                continue

            fmt = j.get("format", "")
            if fmt == "json":
                msm = j.get("content", {})
                yield (None, msm, None)

            elif fmt == "yaml":
                log.info("Skipping YAML")

            else:
                log.info("Ignoring invalid post")

    elif fn.endswith("/index.json.gz"):
        pass

    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")
Example #15
def test_lz4frame_open_write_read_defaults(data):
    with lz4frame.open('testfile', mode='wb') as fp:
        fp.write(data)
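    # mode='r' defaults to binary, so read() returns bytes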
    with lz4frame.open('testfile', mode='r') as fp:
        data_out = fp.read()
    assert data_out == data
Example #16
def test_lz4frame_open_write(data):
    with lz4frame.open('testfile', mode='wb') as fp:
        fp.write(data)
Example #17
from collections import defaultdict
import gzip
import lz4.frame as lz4f
import cloudpickle as cpkl
import json
import re
import os

import uproot
import numpy as np

from coffea import hist
from coffea.hist import export
import processmap

with lz4f.open("hists.cpkl.lz4") as fin:
    hists_unmapped = cpkl.load(fin)

hists = {}
for key, val in hists_unmapped.items():
    if isinstance(val, hist.Hist):
        hists[key] = processmap.apply(val)

if os.path.exists("templates.root"):
    os.remove("templates.root")
fout = uproot.create("templates.root")

nodata = re.compile("(?!data_obs)")
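# negative lookahead: selects every process except data_obs, so data stays unscaled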
h = hists['templates_signalregion'][nodata]
lumi = 41.1
h.scale({p: lumi for p in h[nodata].identifiers('process')}, axis="process")
Example #18
    1.27212, -0.000571640, 8.37289e-07, -5.20433e-10, 1.45375e-13, -1.50389e-17
])


def msd_weight(pt, eta):
    genw = gpar[0] + gpar[1] * np.power(pt * gpar[2], -gpar[3])
    ptpow = np.power.outer(pt, np.arange(cpar.size))
    cenweight = np.dot(ptpow, cpar)
    forweight = np.dot(ptpow, fpar)
    weight = np.where(np.abs(eta) < 1.3, cenweight, forweight)
    return genw * weight


corrections['msdweight'] = msd_weight

with lz4f.open("correction_files/pileup_mc.cpkl.lz4", "rb") as fin:
    pileup_corr = cloudpickle.load(fin)

with uproot.open(
        "correction_files/pileup_Cert_294927-306462_13TeV_PromptReco_Collisions17_withVar.root"
) as fin_pileup:
    norm = lambda x: x / x.sum()
    data_pu = norm(fin_pileup["pileup"].values)
    data_pu_puUp = norm(fin_pileup["pileup_plus"].values)
    data_pu_puDown = norm(fin_pileup["pileup_minus"].values)

    pileup_corr_puUp = {}
    pileup_corr_puDown = {}
    for k in pileup_corr.keys():
        mc_pu = norm(pileup_corr[k].value)
        mask = mc_pu > 0.
Example #19
            else:
                hout['sumw'][dataset] += np.sum(df['scale1fb'])
        return hout

    def postprocess(self, accumulator):
        # set everything to 1/fb scale
        lumi = 1000  # [1/pb]

        scale = {}
        for dataset, dataset_sumw in accumulator['sumw'].items():
            scale[dataset] = lumi * self._corrections['xsections'][
                dataset] / dataset_sumw.value
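        # per-dataset factor: lumi * cross-section / generated sum of weights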

        for h in accumulator.values():
            if isinstance(h, hist.Hist):
                h.scale(scale, axis="dataset")

        return accumulator


if __name__ == '__main__':
    with lz4f.open("corrections.cpkl.lz4", mode="rb") as fin:
        corrections = cloudpickle.load(fin)

    processor_instance = BoostedHbbProcessor(corrections=corrections)

    with lz4f.open('boostedHbbProcessor.cpkl.lz4',
                   mode='wb',
                   compression_level=5) as fout:
        cloudpickle.dump(processor_instance, fout)
Example #20
File: s3feeder.py Project: ooni/pipeline
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of legacy cans and minicans.
    Decompress tar archives if found.
    Yields measurements one by one as:
        (string of JSON, None, uid) or (None, msmt dict, uid)
    The uid is either taken from the filename or generated by trivial_id for
    legacy cans
    """
    # TODO: split this and handle legacy cans and post/minicans independently
    if fn.endswith(".tar.lz4"):
        # Legacy lz4 cans
        with lz4frame.open(fn) as f:
            tf = tarfile.TarFile(fileobj=f)
            while True:
                m = tf.next()
                if m is None:
                    # end of tarball
                    break
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        msm = ujson.loads(line)
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)

                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)

    elif fn.endswith(".json.lz4"):
        # Legacy lz4 json files
        with lz4frame.open(fn) as f:
            for line in f:
                msm = ujson.loads(line)
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)

    elif fn.endswith(".yaml.lz4"):
        # Legacy lz4 yaml files
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)

    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        tf = tarfile.open(fn)
        while True:
            m = tf.next()
            if m is None:
                # end of tarball
                tf.close()
                break
            log.debug("Loading %s", m.name)
            k = tf.extractfile(m)
            assert k is not None
            if not m.name.endswith(".post"):
                log.error("Unexpected filename")
                continue

            try:
                data = k.read()
                j = ujson.loads(data)
            except Exception:
                # log the first bytes of the unparsable payload before skipping
                log.error(repr(data[:100]), exc_info=1)
                continue

            fmt = j.get("format", "")
            if fmt == "json":
                msm = j.get("content", {})
                # extract msmt_uid from filename e.g:
                # ... /20210614004521.999962_JO_signal_68eb19b439326d60.post
                msmt_uid = m.name.rsplit("/", 1)[1]
                msmt_uid = msmt_uid[:-5]
                yield (None, msm, msmt_uid)

            elif fmt == "yaml":
                log.info("Skipping YAML")

            else:
                log.info("Ignoring invalid post")

    elif fn.endswith("/index.json.gz"):
        pass

    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")
Example #21
import lz4.frame as lz4f
import cloudpickle
from coffea import hist

with lz4f.open("hists.cpkl.lz4", mode="r", compression_level=5) as fin:
    hists = cloudpickle.load(fin)



fig, ax, _ = hist.plot1d(hists["sr_met"],overlay="dataset")
# ax.set_xscale('log')
ax.set_yscale('log')
ax.set_ylim(0.1, 1e5)
fig.savefig("test.pdf")

# print(hists)
Example #22
        default=None,
        help='Filename for the pyinstrument HTML profile output')
    args = parser.parse_args()

    # Set a list of preloaded columns, to profile the execution separately from the uproot deserialization
    preload_items = {}

    with open(args.samplejson) as fin:
        samplefiles = json.load(fin)
    sample = samplefiles[args.sample]
    filelist = []
    for dataset, files in sample.items():
        for file in files[:args.limit]:
            filelist.append((dataset, file))

    with lz4f.open(args.processor, mode="rb") as fin:
        processor_instance = cloudpickle.load(fin)

    combined_accumulator = processor.dict_accumulator({
        'stats':
        processor.dict_accumulator({
            'nentries':
            processor.accumulator(0),
            'bytesread':
            processor.accumulator(0),
            'sumworktime':
            processor.accumulator(0.),
            'columns_accessed':
            processor.set_accumulator(),
        }),
        'job':
Example #23
def get_pileup(item):
    dataset, filename = item
    file = uproot.open(filename)
    puhist = file["Pu"]
    pileup = processor.accumulator(np.zeros_like(puhist.values))
    pileup += puhist.values
    sumwhist = file["SumWeights"]
    sumw = processor.accumulator(np.zeros(1))
    sumw += sumwhist.values[0]
    return processor.dict_accumulator({
        'pileup':
        processor.dict_accumulator({dataset: pileup}),
        'sumw':
        processor.dict_accumulator({dataset: sumw}),
    })


final_accumulator = processor.dict_accumulator({
    'pileup':
    processor.dict_accumulator(),
    'sumw':
    processor.dict_accumulator(),
})
processor.futures_executor(filelist, get_pileup, final_accumulator, workers=8)
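# futures_executor runs get_pileup over filelist in a worker pool and sums
# each returned dict_accumulator into final_accumulator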

with lz4f.open("correction_files/pileup_mc.cpkl.lz4", "wb") as fout:
    cloudpickle.dump(final_accumulator['pileup'], fout)

with lz4f.open("correction_files/sumw_mc.cpkl.lz4", "wb") as fout:
    cloudpickle.dump(final_accumulator['sumw'], fout)