Example #1
import os
from subprocess import check_output

# spawn_spark_cluster and _count_child are assumed to be defined in, or
# imported by, the surrounding module.
def _count(job, workers):

    # if we are on Mac OS X and using docker-machine to run docker, we need to
    # get the IP of the docker-machine box
    #
    # this is necessary because docker-machine runs docker in a virtualbox
    # vm which has a different IP address from localhost
    ip = None
    if os.uname()[0] == "Darwin":
        # check what machines docker-machine is running
        # strip leading and trailing whitespace, and split lines
        machines = check_output(["docker-machine", "ls"]).strip().split("\n")

        # we take the first docker-machine environment that is running
        # this means two lines including the header
        if len(machines) != 2:
            raise RuntimeError(
                "Expected a single docker-machine to be running." "Got %d:\n%r." % (len(machines) - 1, machines)
            )

        machine = machines[1].split()[0]
        ip = check_output(["docker-machine", "ip", machine]).strip()

    # set up cluster
    masterHostname = spawn_spark_cluster(job, False, workers, cores=1, overrideLeaderIP=ip)

    job.addChildJobFn(_count_child, masterHostname)
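
For context, here is a minimal sketch of how _count might be launched as a Toil root job; the job store path and worker count are illustrative assumptions, not part of the original example.

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # hypothetical driver: run _count as the root job with two Spark workers
    options = Job.Runner.getDefaultOptions("./toil-jobstore")  # assumed job store path
    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(_count, 2))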
Example #2
import shlex

# require, spawn_spark_cluster, and download_count_upload are assumed to be
# defined in, or imported by, the surrounding module.
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        # a Spark configuration was provided, so no cluster is launched here
        master_hostname = None
        spark_conf = shlex.split(spark_conf)

    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
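
Similarly, a minimal sketch of how kmer_dag might be wired up as a Toil root job; the file URLs, k-mer length, and resource figures below are illustrative assumptions:

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toil-jobstore")  # assumed job store path
    root = Job.wrapJobFn(kmer_dag,
                         "file:///data/reads.adam",    # hypothetical input URL
                         "file:///data/reads.kmers",   # hypothetical output URL
                         kmer_length=21,
                         spark_conf=None,
                         workers=2,
                         cores=4,
                         memory=8192,
                         sudo=False)
    with Toil(options) as toil:
        toil.start(root)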