def static_avocado_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing Avocado preprocessing on a single sample.

    :param job: Toil job this function runs in.
    :param inputs: Shared pipeline configuration; mutated here to record the
        sample, output directory, and suffix for the downstream job.
    :param sample: The sample to preprocess.
    :param output_dir: URL/path where output is stored.
    :param suffix: Optional suffix appended to output file names.
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        # Static, external Spark cluster (or a purely local run): no
        # Spark-on-Toil subcluster gets spawned.
        spark_on_toil = False
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   inputs.master_ip, inputs, spark_on_toil)
        job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil: one worker node is
        # reserved for the master, hence num_nodes - 1.
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(
            job,
            inputs.num_nodes - 1,
            cores=cores,
            memory=inputs.memory,
            sparkMasterContainer="fnothaft/apache-spark-master",
            sparkWorkerContainer="fnothaft/apache-spark-worker")
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   master_ip, inputs, spark_on_toil)
        job.addChild(spark_work)
def _count(job, workers):
    """
    Toil job that spawns a Spark cluster and schedules the child count job.

    :param job: Toil job this function runs in.
    :param workers: Number of Spark workers to launch.
    :raises RuntimeError: if more than one docker-machine is running on
        Mac OS X, since we cannot tell which VM hosts docker.
    """
    # If we are on Mac OS X and using docker-machine to run docker, we need
    # to get the IP of the docker-machine box.
    #
    # This is necessary because docker-machine runs docker in a virtualbox
    # VM which has a different IP address from localhost.
    ip = None
    if os.uname()[0] == "Darwin":
        # Check what machines docker-machine is running; strip surrounding
        # whitespace and split into lines.
        machines = check_output(["docker-machine", "ls"]).strip().split("\n")

        # We take the first docker-machine environment that is running,
        # which means exactly two lines including the header.
        if len(machines) != 2:
            # Fixed: the two adjacent literals previously concatenated
            # without a separating space ("...running.Got %d...").
            raise RuntimeError(
                'Expected a single docker-machine to be running. '
                'Got %d:\n%r.' % (len(machines) - 1, machines))

        machine = machines[1].split()[0]
        ip = check_output(["docker-machine", "ip", machine]).strip()

    # Set up the cluster and hand its master hostname to the child job.
    masterHostname = spawn_spark_cluster(job, workers, cores=1,
                                         overrideLeaderIP=ip)
    job.addChildJobFn(_count_child, masterHostname)
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers
    on an input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provided to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''
    # Fixed: the second disjunct previously required spark_conf to ALSO be
    # set, making the workers-only path (promised by the error message)
    # unsatisfiable.
    require((spark_conf is not None and workers is None) or
            (workers is not None and
             cores is not None and
             memory is not None and
             spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # If we do not have a spark configuration, then we must spawn a cluster.
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        spark_conf = shlex.split(spark_conf)
        # No cluster is spawned when the user supplies their own Spark
        # configuration; fixed a NameError by assigning the hostname here.
        # NOTE(review): presumably ignored downstream when spark_conf is
        # set — confirm against download_count_upload.
        master_hostname = None

    # Fixed undefined names: masterHostname -> master_hostname,
    # output_file -> output_path.
    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers
    on an input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provided to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''
    # Fixed: the second disjunct previously required spark_conf to ALSO be
    # set, making the workers-only path (promised by the error message)
    # unsatisfiable.
    require((spark_conf is not None and workers is None) or
            (workers is not None and
             cores is not None and
             memory is not None and
             spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # If we do not have a spark configuration, then we must spawn a cluster.
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        spark_conf = shlex.split(spark_conf)
        # No cluster is spawned when the user supplies their own Spark
        # configuration; fixed a NameError by assigning the hostname here.
        # NOTE(review): presumably ignored downstream when spark_conf is
        # set — confirm against download_count_upload.
        master_hostname = None

    # Fixed undefined names: masterHostname -> master_hostname,
    # output_file -> output_path.
    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
def static_adam_preprocessing_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing ADAM preprocessing on a single sample.

    :param job: Toil job this function runs in.
    :param inputs: Shared pipeline configuration; mutated here to record the
        sample, output directory, and suffix for downstream jobs.
    :param sample: The sample to preprocess.
    :param output_dir: URL/path where output is stored.
    :param suffix: Optional suffix appended to output file names.
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        if not inputs.run_local and inputs.master_ip == 'auto':
            # Static, standalone Spark cluster managed by uberscript:
            # scale the external cluster up before the work, and back
            # down once the work has completed.
            spark_on_toil = False
            scale_up = job.wrapJobFn(scale_external_spark_cluster, 1)
            job.addChild(scale_up)
            spark_work = job.wrapJobFn(download_run_and_upload,
                                       inputs.master_ip, inputs, spark_on_toil)
            scale_up.addChild(spark_work)
            scale_down = job.wrapJobFn(scale_external_spark_cluster, -1)
            spark_work.addChild(scale_down)
        else:
            # Static, external Spark cluster (or a purely local run).
            spark_on_toil = False
            spark_work = job.wrapJobFn(download_run_and_upload,
                                       inputs.master_ip, inputs, spark_on_toil)
            job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil: one node is reserved
        # for the master, hence num_nodes - 1.
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(job,
                                        False,  # Sudo
                                        inputs.num_nodes - 1,
                                        cores=cores,
                                        memory=inputs.memory)
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   master_ip, inputs, spark_on_toil)
        job.addChild(spark_work)
def static_adam_preprocessing_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing ADAM preprocessing on a single sample.

    :param job: Toil job this function runs in.
    :param inputs: Shared pipeline configuration; mutated here to record the
        sample, output directory, and suffix for downstream jobs.
    :param sample: The sample to preprocess.
    :param output_dir: URL/path where output is stored.
    :param suffix: Optional suffix appended to output file names.
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    uses_external_cluster = inputs.master_ip is not None or inputs.run_local
    if not uses_external_cluster:
        # Dynamic subclusters, i.e. Spark-on-Toil: one node is reserved
        # for the master, hence num_nodes - 1.
        spark_on_toil = True
        worker_cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(job,
                                        False,  # Sudo
                                        inputs.num_nodes - 1,
                                        cores=worker_cores,
                                        memory=inputs.memory)
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   master_ip, inputs, spark_on_toil)
        job.addChild(spark_work)
    elif not inputs.run_local and inputs.master_ip == 'auto':
        # Static, standalone Spark cluster managed by uberscript: scale
        # the external cluster up before the work and back down after.
        spark_on_toil = False
        scale_up = job.wrapJobFn(scale_external_spark_cluster, 1)
        job.addChild(scale_up)
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   inputs.master_ip, inputs, spark_on_toil)
        scale_up.addChild(spark_work)
        scale_down = job.wrapJobFn(scale_external_spark_cluster, -1)
        spark_work.addChild(scale_down)
    else:
        # Static, external Spark cluster (or a purely local run).
        spark_on_toil = False
        spark_work = job.wrapJobFn(download_run_and_upload,
                                   inputs.master_ip, inputs, spark_on_toil)
        job.addChild(spark_work)