Example No. 1
def static_avocado_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing Avocado preprocessing on a single sample
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        # Static, external Spark cluster
        spark_on_toil = False
        spark_work = job.wrapJobFn(download_run_and_upload, inputs.master_ip,
                                   inputs, spark_on_toil)
        job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(
            job,
            inputs.num_nodes - 1,
            cores=cores,
            memory=inputs.memory,
            sparkMasterContainer="fnothaft/apache-spark-master",
            sparkWorkerContainer="fnothaft/apache-spark-worker")
        spark_work = job.wrapJobFn(download_run_and_upload, master_ip, inputs,
                                   spark_on_toil)
        job.addChild(spark_work)
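A minimal sketch of how a DAG function like this might be launched from a Toil driver script; the Namespace fields, sample name, and output location are illustrative assumptions, and static_avocado_dag is assumed to be importable from the module above:

from argparse import Namespace

from toil.common import Toil
from toil.job import Job

# Illustrative inputs object; real pipelines build this from CLI arguments.
inputs = Namespace(master_ip=None, run_local=False, num_nodes=3, memory='8g')

options = Job.Runner.getDefaultOptions('./avocado-jobstore')
root = Job.wrapJobFn(static_avocado_dag, inputs, 'sample.bam', 's3://bucket/out/')

with Toil(options) as workflow:
    workflow.start(root)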
Example No. 2
def _count(job, workers):

    # if we are on Mac OS X and using docker-machine to run docker, we need to
    # get the IP of the docker-machine box
    #
    # this is necessary because docker-machine runs docker in a virtualbox
    # vm which has a different IP address from localhost
    ip = None
    if os.uname()[0] == "Darwin":
        # check what machines docker-machine is running
        # strip leading and trailing whitespace, and split lines
        machines = check_output(["docker-machine", "ls"],
                                universal_newlines=True).strip().split("\n")

        # we take the first docker-machine environment that is running
        # this means two lines including the header
        if len(machines) != 2:
            raise RuntimeError(
                'Expected a single docker-machine to be running. '
                'Got %d:\n%r.' % (len(machines) - 1, machines))

        machine = machines[1].split()[0]
        ip = check_output(["docker-machine", "ip", machine],
                          universal_newlines=True).strip()

    # set up cluster
    masterHostname = spawn_spark_cluster(job,
                                         workers,
                                         cores=1,
                                         overrideLeaderIP=ip)

    job.addChildJobFn(_count_child, masterHostname)
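A small sketch of how _count might be kicked off as a standalone workflow with two Spark workers; the job-store path is a placeholder, and Docker (or docker-machine on Mac OS X) is assumed to be available:

from toil.common import Toil
from toil.job import Job

options = Job.Runner.getDefaultOptions('./count-jobstore')
with Toil(options) as workflow:
    # Spawn a two-worker Spark-on-Toil cluster and run the counting child job.
    workflow.start(Job.wrapJobFn(_count, 2))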
Example No. 3
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        # an external Spark configuration was provided, so no cluster is spawned
        master_hostname = None
        spark_conf = shlex.split(spark_conf)

    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
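A hypothetical invocation of kmer_dag, spawning a two-worker cluster because spark_conf is left unset; all paths and sizing values are placeholders:

from toil.common import Toil
from toil.job import Job

options = Job.Runner.getDefaultOptions('./kmer-jobstore')
root = Job.wrapJobFn(kmer_dag,
                     'file:///data/sample.adam',   # input_file
                     'file:///data/sample.kmers',  # output_path
                     21,                           # kmer_length
                     None,                         # spark_conf (unset, so a cluster is spawned)
                     2,                            # workers
                     4,                            # cores per worker
                     '8g',                         # memory per worker
                     False)                        # sudo
with Toil(options) as workflow:
    workflow.start(root)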
Example No. 4
def kmer_dag(job, input_file, output_path, kmer_length, spark_conf, workers,
             cores, memory, sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and
             memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job, sudo, workers, cores)
    else:
        # an external Spark configuration was provided, so no cluster is spawned
        master_hostname = None
        spark_conf = shlex.split(spark_conf)

    job.addChildJobFn(download_count_upload, master_hostname, input_file,
                      output_path, kmer_length, spark_conf, memory, sudo)
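require() above is an argument-validation helper supplied by the surrounding library; a minimal stand-in, assuming only raise-on-false semantics, would be:

def require(expression, message):
    # Raise if the precondition does not hold; mirrors an assert-style check
    # that is not stripped out under python -O.
    if not expression:
        raise ValueError(message)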
Example No. 5
def static_adam_preprocessing_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing ADAM preprocessing on a single sample
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        if not inputs.run_local and inputs.master_ip == 'auto':
            # Static, standalone Spark cluster managed by uberscript
            spark_on_toil = False
            scale_up = job.wrapJobFn(scale_external_spark_cluster, 1)
            job.addChild(scale_up)
            spark_work = job.wrapJobFn(download_run_and_upload,
                                       inputs.master_ip, inputs, spark_on_toil)
            scale_up.addChild(spark_work)
            scale_down = job.wrapJobFn(scale_external_spark_cluster, -1)
            spark_work.addChild(scale_down)
        else:
            # Static, external Spark cluster
            spark_on_toil = False
            spark_work = job.wrapJobFn(download_run_and_upload,
                                       inputs.master_ip, inputs, spark_on_toil)
            job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(
            job,
            False,  # Sudo
            inputs.num_nodes - 1,
            cores=cores,
            memory=inputs.memory)
        spark_work = job.wrapJobFn(download_run_and_upload, master_ip, inputs,
                                   spark_on_toil)
        job.addChild(spark_work)
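The scale-up / run / scale-down bracket relies on Toil's parent-child ordering: a child job only starts after its parent finishes. A stripped-down sketch of that chaining pattern, with placeholder job functions standing in for the real ones:

def scale_up(job):
    print('scale the external Spark cluster up')

def run_work(job):
    print('run the Spark work against the cluster')

def scale_down(job):
    print('scale the external Spark cluster back down')

def chained_dag(job):
    # Chaining addChild across jobs enforces scale_up -> run_work -> scale_down.
    up = job.wrapJobFn(scale_up)
    work = job.wrapJobFn(run_work)
    down = job.wrapJobFn(scale_down)
    job.addChild(up)
    up.addChild(work)
    work.addChild(down)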