def download_sample(job, sample, config):
    """
    Download sample and store unique attributes

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list sample: Information pertaining to a sample: filetype, paired/unpaired, UUID, and URL
    :param Namespace config: Argparse Namespace object containing argument inputs
    """
    # Create copy of config that is sample specific
    config = argparse.Namespace(**vars(config))
    config.file_type, config.paired, config.uuid, config.url = sample
    config.paired = config.paired == 'paired'
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    disk = '2G' if config.ci_test else '20G'
    job.fileStore.logToMaster('UUID: {}\nURL: {}\nPaired: {}\nFile Type: {}\nCores: {}\nCIMode: {}'.format(
        config.uuid, config.url, config.paired, config.file_type, config.cores, config.ci_test))
    # Download or locate local file and place in the jobStore
    tar_id, r1_id, r2_id = None, None, None
    if config.file_type == 'tar':
        tar_id = job.addChildJobFn(download_url_job, config.url, cghub_key_path=config.gtkey,
                                   s3_key_path=config.ssec, disk=disk).rv()
    else:
        if config.paired:
            require(len(config.url.split(',')) == 2, 'Fastq pairs must have two URLs separated by a comma')
            r1_url, r2_url = config.url.split(',')
            r1_id = job.addChildJobFn(download_url_job, r1_url, cghub_key_path=config.gtkey,
                                      s3_key_path=config.ssec, disk=disk).rv()
            r2_id = job.addChildJobFn(download_url_job, r2_url, cghub_key_path=config.gtkey,
                                      s3_key_path=config.ssec, disk=disk).rv()
        else:
            r1_id = job.addChildJobFn(download_url_job, config.url, cghub_key_path=config.gtkey,
                                      s3_key_path=config.ssec, disk=disk).rv()
    job.addFollowOnJobFn(preprocessing_declaration, config, tar_id, r1_id, r2_id)
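# Example (hypothetical values): a manifest row for a paired fastq sample would
# arrive here as
#     sample = ['fq', 'paired', 'UUID-1234', 's3://bucket/r1.fq.gz,s3://bucket/r2.fq.gz']
# so config.file_type == 'fq', config.paired is True, and the comma-separated
# URLs are split and downloaded as child jobs.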
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        spark_conf = shlex.split(spark_conf)
        # Assumption: when an external Spark configuration is supplied, no
        # master is spawned here and the child job receives None instead.
        master_hostname = None

    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
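# Usage sketch (hypothetical values): kmer_dag runs in one of two modes.
# Either pass an existing Spark configuration (workers must then be None):
#     kmer_dag(job, 'hdfs://host/in.adam', 'hdfs://host/out', 21,
#              '--master spark://host:7077', None, None, None, False)
# or have it spawn a cluster by giving workers/cores/memory and no spark_conf:
#     kmer_dag(job, 'hdfs://host/in.adam', 'hdfs://host/out', 21,
#              None, 4, 8, 15, False)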
def generate_file(file_path, generate_func):
    """
    Checks for file existence, generates the file, and prints a confirmation message

    :param str file_path: File location to generate file
    :param function generate_func: Function used to generate file
    """
    require(not os.path.exists(file_path), file_path + ' already exists!')
    with open(file_path, 'w') as f:
        f.write(generate_func())
    print('\t{} has been generated in the current working directory.'.format(os.path.basename(file_path)))
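# Example (hypothetical generator): generate_func is any zero-argument callable
# returning the file's contents as a string, e.g.
#     generate_file('config-toil-rnaseq.yaml', generate_config)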
def s3am_upload(fpath, s3_dir, num_cores=1, s3_key_path=None):
    """
    Uploads a file to s3 via S3AM
    For SSE-C encryption: provide a path to a 32-byte file

    :param str fpath: Path to file to upload
    :param str s3_dir: Output S3 path. Format: s3://bucket/[directory]
    :param int num_cores: Number of cores to use for up/download with S3AM
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    require(s3_dir.startswith('s3://'), 'Format of s3_dir (s3://) is incorrect: {}'.format(s3_dir))
    s3_dir = os.path.join(s3_dir, os.path.basename(fpath))
    _s3am_with_retry(num_cores, file_path=fpath, s3_url=s3_dir, mode='upload', s3_key_path=s3_key_path)
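# Example (hypothetical bucket and key): upload a BAM with 4 concurrent
# transfers, SSE-C encrypted with a 32-byte key file:
#     s3am_upload('/data/sample.bam', 's3://my-bucket/alignments',
#                 num_cores=4, s3_key_path='/keys/master.key')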
def process_sample_tar(job, config, tar_id):
    """
    Converts sample.tar(.gz) into a fastq pair, or a single fastq if the sample is single-end.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str tar_id: fileStoreID of the tarball
    :return: FileStoreID(s) from Cutadapt, or the untrimmed R1/R2 FileStoreIDs when cutadapt is disabled
    :rtype: str|tuple
    """
    job.fileStore.logToMaster('Processing sample: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    r1_id, r2_id = None, None
    # I/O
    tar_path = os.path.join(work_dir, 'sample.tar')
    job.fileStore.readGlobalFile(tar_id, tar_path)
    # Untar file and concatenate
    subprocess.check_call(['tar', '-xvf', tar_path, '-C', work_dir], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    os.remove(tar_path)
    fastqs = []
    for root, subdir, files in os.walk(work_dir):
        fastqs.extend([os.path.join(root, x) for x in files])
    if config.paired:
        r1 = sorted([x for x in fastqs if 'R1' in x])
        r2 = sorted([x for x in fastqs if 'R2' in x])
        if not r1 or not r2:
            r1 = sorted([x for x in fastqs if '_1' in x])
            r2 = sorted([x for x in fastqs if '_2' in x])
        require(len(r1) == len(r2), 'Check fastq naming, uneven number of pairs found: r1: {}, r2: {}'.format(r1, r2))
        # Concatenate fastqs
        command = 'zcat' if r1[0].endswith('.gz') and r2[0].endswith('.gz') else 'cat'
        with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f1:
            p1 = subprocess.Popen([command] + r1, stdout=f1)
        with open(os.path.join(work_dir, 'R2.fastq'), 'w') as f2:
            p2 = subprocess.Popen([command] + r2, stdout=f2)
        p1.wait()
        p2.wait()
        r1_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
        r2_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fastq'))
    else:
        command = 'zcat' if fastqs[0].endswith('.gz') else 'cat'
        with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f:
            subprocess.check_call([command] + fastqs, stdout=f)
        r1_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
    job.fileStore.deleteGlobalFile(tar_id)
    # Start cutadapt step
    disk = '2G' if config.ci_test else '125G'
    if config.cutadapt:
        return job.addChildJobFn(cutadapt, r1_id, r2_id, config.fwd_3pr_adapter, config.rev_3pr_adapter, disk=disk).rv()
    else:
        return r1_id, r2_id
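# Example (hypothetical layout): for paired data the tarball is expected to
# hold fastqs distinguishable by 'R1'/'R2' (or '_1'/'_2') in their names, e.g.
#     lane1_R1.fastq.gz, lane1_R2.fastq.gz, lane2_R1.fastq.gz, lane2_R2.fastq.gz
# which are concatenated per mate into R1.fastq and R2.fastq above.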
def main():
    """
    GATK Pre-processing Script
    """
    # Define Parser object and add to Toil
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK preprocessing pipeline')
    group = parser_run.add_mutually_exclusive_group(required=True)
    parser_run.add_argument('--config', default='gatk-preprocessing.config', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    group.add_argument('--manifest', default='gatk-preprocessing-manifest.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s".')
    group.add_argument('--sample', default=None, nargs=2, type=str,
                       help='Space delimited sample UUID and BAM file in the format: uuid url.')
    parser_run.add_argument('--output-dir', default=None, help='Full path to directory or filename where '
                                                               'final results will be output')    
    parser_run.add_argument('-s', '--suffix', default='.bqsr', help='Additional suffix to add to the names of the output files')
    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'gatk-preprocessing.config'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'gatk-preprocessing-manifest.tsv'), generate_manifest)

    # Pipeline execution
    elif options.command == 'run':
        require(os.path.exists(options.config), '{} not found. Please run '
                                             '"generate-config"'.format(options.config))
        if not options.sample:
            require(os.path.exists(options.manifest), '{} not found and no sample provided. Please '
                                                       'run "generate-manifest"'.format(options.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(options.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)
        if options.manifest:
            inputs.manifest = options.manifest

        inputs.cpu_count = multiprocessing.cpu_count()  # FIXME: should not be called from toil-leader, see #186
        inputs.memory = '15'

        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_gatk_files, inputs, options.sample, options.output_dir, options.suffix), options)
def parse_samples(path_to_manifest=None, sample_urls=None):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to manifest file
    :param list sample_urls: Sample URLs
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    if sample_urls:
        for url in sample_urls:
            samples.append(['tar', 'paired', os.path.basename(url.split('.')[0]), url])
    elif path_to_manifest:
        with open(path_to_manifest, 'r') as f:
            for line in f.readlines():
                if not line.isspace() and not line.startswith('#'):
                    sample = line.strip().split('\t')
                    require(len(sample) == 4, 'Bad manifest format! '
                                              'Expected 4 tab separated columns, got: {}'.format(sample))
                    file_type, paired, uuid, url = sample
                    require(file_type == 'tar' or file_type == 'fq',
                            '1st column must be "tar" or "fq": {}'.format(sample[0]))
                    require(paired == 'paired' or paired == 'single',
                            '2nd column must be "paired" or "single": {}'.format(sample[1]))
                    if file_type == 'fq' and paired == 'paired':
                        require(len(url.split(',')) == 2, 'Fastq pair requires two URLs separated'
                                                          ' by a comma: {}'.format(url))
                    samples.append(sample)
    return samples
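# Example manifest line for parse_samples (hypothetical values; columns are tab separated):
#     fq    paired    UUID-1234    s3://bucket/r1.fq.gz,s3://bucket/r2.fq.gz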
def parse_manifest(path_to_manifest):
    """
    Parses tumor/normal sample pairs specified in the manifest

    :param str path_to_manifest: Path to manifest file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(len(sample) == 3, 'Bad manifest format! '
                                          'Expected 3 tab separated columns, got: {}'.format(sample))
                uuid, normal, tumor = sample
                for url in [normal, tumor]:
                    require(urlparse(url).scheme, 'Invalid URL passed for {}'.format(url))
                samples.append(sample)
    return samples
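# Example manifest line for this parser (hypothetical values; columns are tab separated):
#     UUID-1234    s3://bucket/normal.bam    s3://bucket/tumor.bam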
def parse_manifest(manifest_path):
    """
    Parse manifest file

    :param str manifest_path: Path to manifest file
    :return: samples
    :rtype: list[str, list]
    """
    samples = []
    with open(manifest_path, 'r') as f:
        for line in f:
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(2 <= len(sample) <= 3, 'Bad manifest format! '
                                               'Expected UUID\tURL1\t[URL2] (tab separated), got: {}'.format(sample))
                uuid = sample[0]
                urls = sample[1:]
                for url in urls:
                    require(urlparse(url).scheme, 'Invalid URL passed for {}'.format(url))
                samples.append([uuid, urls])
    return samples
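# Example manifest lines for this parser (hypothetical values; URL2 is optional):
#     UUID-1234    s3://bucket/reads_1.fq.gz    s3://bucket/reads_2.fq.gz
#     UUID-5678    s3://bucket/single_end.fq.gz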
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.")
    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter,
                  '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-A', rev_3pr_adapter,
                           '-o', '/data/R1_cutadapt.fastq',
                           '-p', '/data/R2_cutadapt.fastq',
                           '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
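    # Note: docker_call presumably mounts work_dir at /data inside the
    # container, which is why the parameters above use /data/... paths.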
    # Call: CutAdapt
    docker_call(tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
    r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2_cutadapt.fastq')) if r1_id and r2_id else None
    return r1_cut_id, r2_cut_id
def parse_config(job, shared_ids, inputs):
    """
    Stores the UUID and urls associated with the input files to be retrieved.
    Configuration file has one sample per line, with the following format:  UUID,1st_url,2nd_url

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param dict shared_ids: stores FileStoreIDs
    :param Namespace inputs: Input arguments
    """
    samples = []

    with open(inputs.config, 'r') as f_in:
        for line in f_in:
            if line.strip():
                line = line.strip().split(',')
                require(len(line) == 3, 'Improper formatting. Expected UUID,URL1,URL2. Received: {}'.format(line))
                uuid = line[0]
                urls = line[1:]
                mock_bam = '.'.join(line[1].split('.')[:-2])[:-2] + ".bam"
                samples.append((uuid, urls, mock_bam))
    inputs.maxCores = int(inputs.maxCores) if inputs.maxCores else sys.maxint
    inputs.cores = min(inputs.maxCores, multiprocessing.cpu_count())
    job.fileStore.logToMaster('Parsed configuration file.')
    job.addChildJobFn(map_job, download_sample, samples, inputs, shared_ids, cores=1, disk=inputs.file_size)
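# Example configuration line for parse_config (hypothetical values):
#     UUID-1234,s3://bucket/reads_1.fq.gz,s3://bucket/reads_2.fq.gz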
def main():

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the ADAM preprocessing pipeline')
    parser_run.add_argument('--config', default='adam_preprocessing.config', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--sample', help='The full s3 url of the input SAM or BAM file')
    parser_run.add_argument('--output-dir', default=None,
                            help='full path where final results will be output')
    parser_run.add_argument('-s', '--suffix', default='',
                            help='Additional suffix to add to the names of the output files')

    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    cwd = os.getcwd()
    if args.command == 'generate-config':
        generate_file(os.path.join(cwd, 'adam-preprocessing.config'), generate_config)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             'generate-config'.format(args.config))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)

        require(not (inputs.master_ip and inputs.num_nodes),
            'Only one of master_ip and num_nodes can be provided.')

        if not hasattr(inputs, 'master_ip'):
            require(inputs.num_nodes > 1,
                'num_nodes allocates one Spark/HDFS master and n-1 workers, and '
                'thus must be greater than 1. %d was passed.' % inputs.num_nodes)

        for arg_name in ['dbsnp', 'memory']:
            require(getattr(inputs, arg_name, None), 'Required argument {} missing from config'.format(arg_name))

        Job.Runner.startToil(Job.wrapJobFn(static_adam_preprocessing_dag, inputs,
                                           args.sample, args.output_dir), args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 -------- 17
                    |       |        |
                   ---      16 -------
                   | |
                   6 7
                   | |
                   8 9
                   | |
                  10 11
                   | |
                  12 13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline')
    parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the '
                                                                   '"--tumor" and "--normal" options.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-exome generate-config"'.format(args.config))
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid, '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(config.reference and config.phase and config.mills and config.dbsnp,
                    'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Program checks
        for program in ['curl', 'docker']:
            require(next(which(program), None), '{} must be installed on every node.'.format(program))

        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General usage:
    1. Type "toil-rnaseq generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-rnaseq run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/rnaseq_cgl

    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 7

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq pipeline')
    group = parser_run.add_mutually_exclusive_group(required=True)
    parser_run.add_argument('--config', default='config-toil-rnaseq.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default='manifest-toil-rnaseq.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    group.add_argument('--samples', default=None, nargs='+', type=str,
                       help='Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
                            'fastq files. URLs follow the format: http://foo.com/sample.tar, '
                            'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file. '
                            'Samples passed in this way will be assumed to be paired-end; if using single-end data, '
                            'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        if not args.samples:
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
            samples = parse_samples(sample_urls=args.samples)
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(config.kallisto_index or config.star_index,
                'URLs not provided for Kallisto or STAR, so there is nothing to do!')
        if config.star_index or config.rsem_ref:
            require(config.star_index and config.rsem_ref, 'Input provided for STAR or RSEM but not both. STAR: '
                                                           '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
        require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
                                                           'otherwise sample output is not stored anywhere!')
        for input_url in [x for x in [config.kallisto_index, config.star_index, config.rsem_ref] if x]:
            require(urlparse(input_url).scheme in schemes,
                    'Input in config must have the appropriate URL prefix: {}'.format(schemes))
        # Program checks
        for program in ['curl', 'docker']:
            require(next(which(program), None), '{} must be installed on every node.'.format(program))

        # Start the workflow by using map_job() to run the pipeline for each sample
        Job.Runner.startToil(Job.wrapJobFn(map_job, download_sample, samples, config), args)
    group.add_argument('--sample', default=None, nargs=2, type=str,
                       help='Space delimited sample UUID and BAM file in the format: uuid url')
    parser_run.add_argument('--output-dir', default=None, help='Full path to directory or filename where '
                                                               'final results will be output')    
    parser_run.add_argument('-s', '--suffix', default='.bqsr', help='Additional suffix to add to the names of the output files')
    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'gatk-preprocessing.config'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'gatk-preprocessing-manifest.tsv'), generate_manifest)

    # Pipeline execution
    elif options.command == 'run':
        require(os.path.exists(options.config), '{} not found. Please run '
                                             '"generate-config"'.format(options.config))
        if not options.sample:
            require(os.path.exists(options.manifest), '{} not found and no sample provided. Please '
                                                       'run "generate-manifest"'.format(options.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(options.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)
        if options.manifest:
            inputs.manifest = options.manifest
    
        inputs.cpu_count = multiprocessing.cpu_count()  # FIXME: should not be called from toil-leader, see #186
        inputs.memory = '15'

        Job.Runner.startToil(Job.wrapJobFn(batch_start, inputs, options.sample, options.output_dir, options.suffix), options)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil BWA pipeline

    Alignment of fastq reads via BWA-kit

    General usage:
    1. Type "toil-bwa generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-bwa run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/bwa_alignment

    Structure of the BWA pipeline (per sample)

        0 --> 1

    0 = Download sample
    1 = Run BWA-kit
    ===================================================================
    :Dependencies:
    cURL:       apt-get install curl
    Toil:       pip install toil
    Docker:     wget -qO- https://get.docker.com/ | sh

    Optional:
    S3AM:       pip install s3am (requires ~/.boto config file)
    Boto:       pip install boto
    """
    # Define Parser object and add to Toil
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the BWA alignment pipeline')
    group = parser_run.add_mutually_exclusive_group(required=True)
    parser_run.add_argument('--config', default='config-toil-bwa.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    group.add_argument('--manifest', default='manifest-toil-bwa.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s".')
    group.add_argument('--sample', nargs='+', action=required_length(2, 3),
                       help='Space delimited sample UUID and fastq files in the format: uuid url1 [url2].')
    # Print docstring help if no arguments provided
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-bwa.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-bwa.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run generate-config'.format(args.config))
        if not args.sample:
            require(os.path.exists(args.manifest), '{} not found and no sample provided. '
                                                   'Please run "generate-manifest"'.format(args.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        samples = [args.sample] if args.sample else parse_manifest(args.manifest)
        # Sanity checks
        require(config.ref, 'Missing URL for reference file: {}'.format(config.ref))
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_reference_files, config, samples), args)
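# Example invocation (hypothetical jobStore path), following the docstring's
# general usage:
#     toil-bwa generate
#     # edit config-toil-bwa.yaml and manifest-toil-bwa.tsv, then:
#     toil-bwa run ./my-jobstore --config config-toil-bwa.yaml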