Example #1
def run_multicore(fn, items, config, parallel=None):
    """Run the function using multiple cores on the given items to process.
    """
    if len(items) == 0:
        return []
    if parallel is None or "num_jobs" not in parallel:
        if parallel is None:
            parallel = {
                "type": "local",
                "cores": config["algorithm"].get("num_cores", 1)
            }
        sysinfo = system.get_info({}, parallel)
        parallel = resources.calculate(
            parallel,
            items,
            sysinfo,
            config,
            parallel.get("multiplier", 1),
            max_multicore=int(parallel.get("max_multicore", sysinfo["cores"])))
    items = [
        config_utils.add_cores_to_config(x, parallel["cores_per_job"])
        for x in items
    ]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(parallel["num_jobs"],
                                batch_size=1,
                                backend="multiprocessing")(
                                    joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
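
A minimal usage sketch; process_sample, the config shape, and the item dicts
are illustrative stand-ins, and the call still depends on bcbio's system,
resources, and config_utils modules resolving this configuration:

def process_sample(data):
    # Each worker returns a list of results; run_multicore flattens them
    # via out.extend(data).
    return [{"name": data["name"], "status": "done"}]

config = {"algorithm": {"num_cores": 4}}  # read for the default local setup
items = [{"name": "sample-%d" % i} for i in range(8)]
results = run_multicore(process_sample, items, config)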
Example #2
def start(parallel,
          items,
          config,
          dirs=None,
          name=None,
          multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process items in parallel with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to prefer less multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    parallel = resources.calculate(
        parallel,
        items,
        sysinfo,
        config,
        multiplier=multiplier,
        max_multicore=int(max_multicore or sysinfo.get("cores", 1)))
    try:
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
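
Since start is a generator (note the yield statements), it is used as a
context manager that hands back a runner, as in Example #6 below. A sketch of
the calling pattern; the parallel settings and checkpoint name here are
illustrative:

parallel = {"type": "local", "cores": 4}  # assumed scheduling request
dirs = {"work": os.path.abspath(os.getcwd())}
with prun.start(parallel, samples, config, dirs, name="prepare") as run_parallel:
    # run_parallel dispatches the named function across items on whichever
    # runner start yielded (local, ipython, or clusterk).
    samples = run_parallel("prepare_bcbio_samples", samples)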
Example #3
def run_multicore(fn, items, config, parallel=None):
    """Run the function using multiple cores on the given items to process.
    """
    if len(items) == 0:
        return []
    if parallel is None or "num_jobs" not in parallel:
        if parallel is None:
            parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1)}
        sysinfo = system.get_info({}, parallel)
        parallel = resources.calculate(
            parallel,
            items,
            sysinfo,
            config,
            parallel.get("multiplier", 1),
            max_multicore=int(parallel.get("max_multicore", sysinfo["cores"])),
        )
    items = [config_utils.add_cores_to_config(x, parallel["cores_per_job"]) for x in items]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(parallel["num_jobs"])(joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
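
This variant differs from Example #1 only in the joblib.Parallel call: it
omits batch_size=1 and backend="multiprocessing", so joblib's defaults apply
(automatic batching and, in recent releases, the loky backend). A
self-contained sketch of the same consume-and-flatten pattern:

from joblib import Parallel, delayed

def square(x):
    return [x * x]  # return a list so results can be flattened, as above

out = []
for data in Parallel(n_jobs=2)(delayed(square)(x) for x in range(5)):
    if data:
        out.extend(data)
print(out)  # [0, 1, 4, 9, 16]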
Example #4
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote
    functions.

    Returns a function used to process items in parallel with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
    underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
    used to prefer less multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"],
                                                         "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
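
Compared with Example #2, this version tracks the ipython view so it can call
ipython.stop(view) on failure and flags checkpointed runs in the parallel
dict. The sentinel-file checkpoint itself is a small reusable pattern; a
simplified standalone sketch (note that bcbio still yields a single-core local
runner for checkpointed sections rather than skipping them outright):

import os

def run_checkpointed(name, work_dir, do_work):
    # Skip work whose sentinel file already exists; write it on success.
    checkpoint = os.path.join(work_dir, "checkpoints_parallel", "%s.done" % name)
    if os.path.exists(checkpoint):
        return
    do_work()
    os.makedirs(os.path.dirname(checkpoint), exist_ok=True)
    with open(checkpoint, "w") as out_handle:
        out_handle.write("done\n")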
Example #5
def _calculate_resources(data, args, resources):
    parallel = clargs.to_parallel(args)
    config = data[0][0]['config']
    config['resources'].update({resources['name']: {
        'memory': "%sg" % resources['mem'],
        'cores': resources['cores']}})
    parallel.update({'progs': [resources['name']]})
    log.setup_log(config, parallel)
    dirs = {'work': os.path.abspath(os.getcwd())}
    system.write_info(dirs, parallel, config)
    sysinfo = system.machine_info()[0]
    log.logger.info("Number of items %s" % len(data))
    parallel = res.calculate(parallel, data, sysinfo, config)
    log.logger.info(parallel)
    return parallel
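
Across these examples, resources.calculate takes the requested parallelization,
the items to process, machine info, and the run configuration, and returns the
parallel dict annotated with scheduling keys ("num_jobs", "cores_per_job",
"mem" -- the same keys Example #2 pops once a run finishes). A hypothetical
call showing the shapes involved; the values are illustrative:

sysinfo = {"cores": 16, "memory": 64.0}  # shape as parsed in Example #7
parallel = {"type": "local", "cores": 16, "progs": ["samtools"]}
parallel = resources.calculate(parallel, items, sysinfo, config,
                               multiplier=1, max_multicore=16)
# Expected additions, per the examples above:
#   parallel["num_jobs"], parallel["cores_per_job"], parallel["mem"]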
                                     "bcbio_system.yaml")
    except ValueError as err:
        print(err)
        print(
            "WARNING: Attempting to read bcbio_system.yaml in the current directory."
        )
        system_config = "bcbio_system.yaml"

    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
        res = {'cores': args.cores_per_job}
        config["algorithm"] = {"num_cores": args.cores_per_job}
        config["resources"].update({'sambamba': res, 'samtools': res})
        config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log")
    parallel = clargs.to_parallel(args)
    parallel.update({'progs': ['samtools', 'sambamba']})
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = {'work': os.path.abspath(os.getcwd())}
    system.write_info(dirs, parallel, config)
    sysinfo = system.machine_info()[0]
    samples = _get_samples_to_process(args.csv, out_dir, config,
                                      args.force_single, args.separators)
    parallel = resources.calculate(parallel, [samples], sysinfo, config)

    with prun.start(parallel, samples, config, dirs) as run_parallel:
        with profile.report("prepare bcbio samples", dirs):
            samples = run_parallel("prepare_bcbio_samples", samples)

    create_new_csv(samples, args)
Example #7
        system_config = args.galaxy
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)

    parallel = clargs.to_parallel(args)
    parallel.update({'progs': args.progs})
    dirs = {'work': os.path.abspath(os.getcwd())}
    if args.sys_info.find(";") > -1:
        info = args.sys_info.split(";")
        sysinfo = {'cores': int(info[0]), 'memory': float(info[1])}
    else:
        if utils.file_exists(args.sys_info):
            sysinfo = yaml.safe_load(open(args.sys_info))[0]
        else:
            # Fail early instead of hitting a NameError on the print below.
            raise ValueError("Unrecognized sys_info: %s" % args.sys_info)
    print("system info %s" % sysinfo)
    samples = []
    pipelines, config = _pair_samples_with_pipelines(args.yaml_file, config)
    for s in pipelines:
        # Collect samples across all pipelines instead of keeping only the last.
        samples.extend(pipelines[s])
    print("number of samples %s" % len(samples))
    parallel = resources.calculate(parallel, samples, sysinfo, config)
    print(parallel)
    if args.fixed:
        parallel = ipython_fn(parallel, config)
    else:
        parallel = ipython_current(parallel, config)
    print(parallel)
    ipc_fn(parallel)
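
From the branch near the top of this example, args.sys_info accepts either a
"cores;memory" string or a path to a YAML file; hypothetical values (and flag
spelling) for illustration:

# --sys_info "16;64"        -> sysinfo == {"cores": 16, "memory": 64.0}
# --sys_info machine.yaml   -> first entry of the YAML document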