Example No. 1
import numpy as np  # assumed imports; the snippet also relies on pysge
import pysge


def parallel_draw(drawer, jobs, mode, ncores, batch_opts):
    if len(jobs) == 0:
        return

    # One task per job under multiprocessing; otherwise one task per core.
    njobs = ncores
    if mode == "multiprocessing":
        njobs = len(jobs)

    # np.array_split yields njobs roughly equal chunks; convert each back to a list.
    grouped_jobs = [list(x) for x in np.array_split(jobs, njobs)]
    tasks = [{
        "task": multidraw,  # module-level helper in the source project
        "args": (drawer, args),
        "kwargs": {}
    } for args in grouped_jobs]

    # ncores == 0 selects a serial in-process run via local_submit.
    if mode == "multiprocessing" and ncores == 0:
        pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        pysge.sge_submit(
            tasks,
            "zdb-draw",
            "_ccsp_temp/",
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
            return_files=True,
        )
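
Every example on this page groups work with the same np.array_split idiom, so a short, self-contained illustration of what it produces may help (plain numpy; the job names are placeholders):

import numpy as np

jobs = ["job-a", "job-b", "job-c", "job-d", "job-e"]
# Five jobs split across three workers give chunks of sizes 2, 2 and 1.
grouped = [list(x) for x in np.array_split(jobs, 3)]
print([len(g) for g in grouped])  # [2, 2, 1]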
Example No. 2
import pysge


def submit_tasks(tasks, mode="multiprocessing", ncores=0, batch_opts=""):
    """Dispatch a list of pysge-style task dicts to the chosen backend."""
    if mode == "multiprocessing" and ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks,
            "zdb",
            "_ccsp_temp/",
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
            return_files=True,
        )
    elif mode == "condor":
        import conpy
        results = conpy.condor_submit(
            "zdb",
            "_ccsp_temp/",
            tasks=tasks,
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
        )
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    return results
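
A minimal usage sketch, assuming the task-dict shape used throughout these examples (the add function and its arguments are placeholders; with ncores=0 the tasks run serially in-process via local_submit):

def add(x, y):
    return x + y

tasks = [
    {"task": add, "args": (1, 2), "kwargs": {}},
    {"task": add, "args": (3, 4), "kwargs": {}},
]
results = submit_tasks(tasks, mode="multiprocessing", ncores=0)
print(results)  # expected: [3, 7], one result per task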
Example No. 3
def run(
    sequence, datasets, name, outdir, tempdir, mode, batch_opts, ncores,
    nblocks_per_dataset, nblocks_per_process, nfiles_per_dataset,
    nfiles_per_process, blocksize, cachesize, quiet, dryrun, sample,
    predetermined_nevents_in_file,
):
    # AtUproot, LFUCache and get_size are imported elsewhere in the source
    # project; cachesize is interpreted in GiB (1024**3 bytes per GiB).
    process = AtUproot(
        outdir,
        quiet=quiet,
        max_blocks_per_dataset=nblocks_per_dataset,
        max_blocks_per_process=nblocks_per_process,
        max_files_per_dataset=nfiles_per_dataset,
        max_files_per_process=nfiles_per_process,
        nevents_per_block=blocksize,
        predetermined_nevents_in_file=predetermined_nevents_in_file,
        branch_cache=LFUCache(int(cachesize * 1024**3), get_size),
    )
    tasks = process.run(datasets, sequence)

    if mode == "multiprocessing" and ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks, name, tempdir, options=batch_opts, dryrun=dryrun,
            sleep=5, request_resubmission_options=True,
            return_files=True,
        )
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    return results
Example No. 4
import numpy as np  # assumed imports; the snippet also relies on pysge and yaml
import pysge
import yaml


def main():
    options = parse_args()
    mode = options.mode
    njobs = options.ncores

    # setup jobs
    with open(options.config, 'r') as f:
        cfg = yaml.full_load(f)

    # group jobs
    files = cfg["files"]
    if options.nfiles > 0:
        files = files[:options.nfiles]
    if mode == "multiprocessing" or njobs < 0:
        njobs = len(files)

    grouped_files = [list(x) for x in np.array_split(files, njobs)]
    tasks = [
        {"task": df_skim, "args": (fs,cfg,options.output.format(idx)), "kwargs": {}}
        for idx, fs in enumerate(grouped_files)
    ]

    if mode == "multiprocessing" and options.ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=options.ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            "zdb", "_ccsp_temp/", tasks=tasks, options=options.sge_opts,
            sleep=5, request_resubmission_options=True,
        )
    print("Finished!")
Example No. 5
import numpy as np  # assumed imports; the snippet also relies on pysge
import pysge


def parallel_draw(draw, jobs, options):
    if len(jobs) == 0:
        return []

    mode = options.mode
    njobs = options.ncores
    if mode == "multiprocessing":
        # One task per job, mirroring the parallel_draw variant in Example No. 1.
        njobs = len(jobs)

    grouped_jobs = [list(x) for x in np.array_split(jobs, njobs)]
    tasks = [{
        "task": multidraw,  # module-level helper in the source project
        "args": (draw, args),
        "kwargs": {}
    } for args in grouped_jobs]

    if mode == "multiprocessing" and options.ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=options.ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks,
            "zdb",
            "_ccsp_temp/",
            options=options.sge_opts,
            request_resubmission_options=True,
            return_files=True,
        )
    else:
        results = []
    return results
Example No. 6
import numpy as np  # assumed imports; the snippet also relies on pandas and pysge
import pandas as pd
import pysge


def main():
    options = parse_args()

    results = pysge.sge_resume(
        "zdb",
        options.path,
        options=options.sge_opts,
        sleep=5,
        request_resubmission_options=True,
    )

    # One merge task per resumed result under multiprocessing (or when
    # ncores is negative); otherwise one per core.
    njobs = options.ncores
    if options.mode == "multiprocessing" or options.ncores < 0:
        njobs = len(results)

    grouped_args = [list(x) for x in np.array_split(results, njobs)]
    tasks = [{
        "task": df_open_merge,
        "args": (args, ),
        "kwargs": {
            "quiet": True
        }
    } for args in grouped_args]

    if options.mode == "multiprocessing" and options.ncores == 0:
        merge_results = pysge.local_submit(tasks)
        df = pd.DataFrame()
        for result in merge_results:
            df = df_merge(df, result)
    elif options.mode == "multiprocessing":
        merge_results = pysge.mp_submit(tasks, ncores=options.ncores)
        df = pd.DataFrame()
        for result in merge_results:
            df = df_merge(df, result)
    elif options.mode == "sge":
        merge_results = pysge.sge_submit(
            "zdb-merge",
            "_ccsp_temp/",
            tasks=tasks,
            options=options.sge_opts,
            sleep=5,
            request_resubmission_options=True,
        )
        df = df_open_merge(merge_results)
    else:
        df = pd.DataFrame()

    print(df)
    path, table = options.output.split(":")
    df.to_hdf(
        path,
        table,
        format='table',
        append=False,
        complevel=9,
        complib='zlib',
    )
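
The output spec packs an HDF5 file path and a table key into one colon-separated string; reading the result back is symmetric, using nothing beyond pandas (file and table names below are placeholders):

import pandas as pd

path, table = "results.h5:mytable".split(":")  # hypothetical output spec
df = pd.read_hdf(path, table)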
Example No. 7
import os  # assumed imports; `paths` and `job` are defined elsewhere in the source script
import pysge

outpaths = [
    "data/hists_qcd_estimation.h5:DataAggEvents",
    "data/hists_qcd_estimation.h5:MCAggEvents",
    "data/hists_qcd_estimation.h5:MCAggEvents_jes",
    "data/hists_qcd_estimation.h5:MCAggEvents_jer",
    "data/hists_qcd_estimation.h5:MCAggEvents_unclust",
    "data/hists_qcd_estimation.h5:MCAggEvents_lepscales",
]

tasks = []
for idx, outpath in enumerate(outpaths):
    # Merge the input paths in chunks of 10 per output file.
    start = 10 * idx
    stop = min(10 * (idx + 1), len(paths))

    tasks.append({
        "task": job,
        "args": (
            [os.path.abspath(p) for p in paths[start:stop]],
            os.path.abspath(outpath),
        ),
        "kwargs": {},
    })

pysge.sge_submit(
    tasks,
    "merge",
    "_ccsp_temp",
    options="-q hep.q -l h_vmem=24G -pe hep.pe 8",
)
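
The options string is handed through to the scheduler as qsub arguments: -q hep.q selects the queue, -l h_vmem=24G requests 24 GB of virtual memory, and -pe hep.pe 8 asks for an 8-slot parallel environment. A lighter single-slot submission might look like this (the memory value is a placeholder):

pysge.sge_submit(tasks, "merge", "_ccsp_temp", options="-q hep.q -l h_vmem=4G")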
Example No. 8
import functools

import numpy as np  # assumed imports; the snippet also relies on pysge and yaml
import pysge
import yaml


def analyse(
    config,
    mode="multiprocessing",
    ncores=0,
    nfiles=-1,
    batch_opts="",
    output=None,
    chunksize=500000,
    merge_opts=None,
):
    merge_opts = merge_opts or {}  # avoid a shared mutable default argument
    if output is not None and len(output.split(":")) != 2:
        raise ValueError(
            "The output kwarg should be None or a string with the format "
            "'{file_name}:{table_name}' instead of " + "{}".format(output))

    njobs = ncores

    # setup jobs
    with open(config, 'r') as f:
        cfg = yaml.full_load(f)

    # group jobs
    files = cfg["files"]
    if nfiles > 0:
        files = files[:nfiles]
    if mode == "multiprocessing" or njobs < 0:
        njobs = len(files)

    grouped_files = [list(x) for x in np.array_split(files, njobs)]
    tasks = [{
        "task": df_process,
        "args": (fs, cfg["query"]),
        "kwargs": {
            "chunksize": chunksize
        },
    } for fs in grouped_files]
    results = submit_tasks(
        tasks, mode=mode, ncores=ncores, batch_opts=batch_opts,
    )
    if mode == 'multiprocessing':
        df = functools.reduce(lambda x, y: df_merge(x, y), results)
    else:
        # grouped multi-merge
        merge_njobs = merge_opts.get("ncores", 100)
        grouped_merges = [
            list(x) for x in np.array_split(results, merge_njobs)
        ]
        tasks = [{
            "task": df_open_merge,
            "args": (r, ),
            "kwargs": {},
        } for r in grouped_merges]
        merge_mode = merge_opts.get("mode", "multiprocessing")
        if merge_mode == "multiprocessing" and ncores == 0:
            semimerged_results = pysge.local_submit(tasks)
            df = functools.reduce(lambda x, y: df_merge(x, y), semimerged_results)
        elif merge_mode == "multiprocessing":
            semimerged_results = pysge.mp_submit(tasks, ncores=ncores)
            df = functools.reduce(lambda x, y: df_merge(x, y), semimerged_results)
        elif merge_mode == "sge":
            semimerged_results = pysge.sge_submit(
                tasks,
                "zdb-merge",
                "_ccsp_temp",
                options=merge_opts.get("batch_opts", "-q hep.q"),
                sleep=5,
                request_resubmission_options=True,
                return_files=True,
            )
            df = df_open_merge(semimerged_results)

    if output is not None:
        path, table = output.split(":")
        df.to_hdf(
            path,
            table,
            format='table',
            append=False,
            complevel=9,
            complib='zlib',
        )
    else:
        return df
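
A hypothetical call showing how merge_opts steers the second-stage merge; every literal below (config path, queue, core counts) is a placeholder:

df = analyse(
    "config.yaml",
    mode="sge",
    batch_opts="-q hep.q",
    output=None,  # return the DataFrame instead of writing it to HDF5
    merge_opts={"mode": "sge", "ncores": 50, "batch_opts": "-q hep.q"},
)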
Example No. 9
# Assumed imports; get_parent, get_nevents_sumweights and the xrd_redir
# constant are defined elsewhere in the source script.
import pysge
import yaml


def main():
    with open("data_v2.txt", 'r') as f:
        datain = f.read()

    datasets, tasks = [], []
    for block in datain.split("\n\n"):
        if len(block) == 0:
            continue
        lines = block.split("\n")

        # lines[1] holds the DAS name, lines[3] the file list and
        # lines[5] a printable summary structure.
        das = lines[1]
        files = sorted(set(lines[3].split(" ")))
        summary = eval(lines[5])[0]  # trusts the input file; see the note at the end

        parent = get_parent(das)
        print(parent)
        #runyear, runletter, ver = get_runera(das)

        tasks.extend([{
            "task": get_nevents_sumweights,
            "args": (f'{xrd_redir}{p}', ),
            "kwargs": {
                "param": None
            },
        } for p in files])

        isdata = True
        tree = "Events"
        xsec = None

        datasets.append({
            "name": parent,
            "parent": parent,
            "isdata": isdata,
            "nevents": int(summary["nevents"]),
            "sumweights": None,
            "files": [f'{xrd_redir}{f}' for f in files],
            "file_nevents": [],
            "DAS": das,
            "tree": tree,
            "xsection": xsec,
        })

    #results = pysge.local_submit(tasks)
    #results = pysge.mp_submit(tasks, 6)
    results = pysge.sge_submit(tasks, "dasq", "_ccsp_temp")

    all_files_results = {}
    for r in results:
        all_files_results.update(r)

    new_datasets = []
    for d in datasets:
        tot_nevts = 0
        tot_sumw = 0.
        fnevts = []
        for p in d["files"]:
            #nevts, sumw = all_files_results[p]
            nevts = all_files_results[p]
            sumw = nevts
            tot_nevts += nevts
            tot_sumw += sumw
            fnevts.append(nevts)
        if tot_nevts != d["nevents"]:
            print("Mismatch in nevents between files ({}) and summary ({}) for {}".format(
                tot_nevts, d["nevents"], d["DAS"],
            ))
        new_datasets.append({
            "name": d["name"],
            "parent": d["parent"],
            "isdata": d["isdata"],
            "nevents": tot_nevts,
            "sumweights": tot_sumw,
            "files": d["files"],
            "file_nevents": fnevts,
            "DAS": d["DAS"],
            "tree": d["tree"],
            "xsection": d["xsection"],
        })

    with open("data_v2.yaml", 'w') as f:
        yaml.dump(new_datasets, f, indent=4)
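
One caution on the parsing step above: eval executes arbitrary code from the input file. If the summary line is a plain Python literal (a list or dict of numbers and strings), the standard library's ast.literal_eval is a safer drop-in:

import ast

summary = ast.literal_eval(lines[5])[0]  # rejects anything but a literal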