Example #1
def star_index(job, args):
    download_url(url=args.ref, name='ref.fa', work_dir=job.tempDir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=job.tempDir)

    # Run STAR to generate index
    star_dir = os.path.join(job.tempDir, args.star_name)
    os.mkdir(star_dir)
    parameters = [
        '--runThreadN',
        str(args.cores), '--runMode', 'genomeGenerate', '--genomeDir',
        '/data/' + args.star_name, '--genomeFastaFiles', 'ref.fa',
        '--sjdbGTFfile', 'annotation.gtf'
    ]
    dockerCall(
        job,
        tool=
        'quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
        workDir=job.tempDir,
        parameters=parameters)

    # Compress starIndex into a tarball
    star_tar = '{}.tar.gz'.format(args.star_name)
    tarball_files(star_tar, file_paths=[star_dir], output_dir=job.tempDir)

    # Move to output dir or return
    tar_path = os.path.join(job.tempDir, star_tar)
    if _move_instead_of_return:
        move_files([tar_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(tar_path)
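The job functions in these examples assume they run inside a Toil workflow. Below is a minimal, hypothetical driver sketch for star_index; the argument names mirror the snippet above, while the jobstore path and resource hints are illustrative assumptions rather than values taken from the original pipeline.

# Hypothetical driver for star_index; argument names follow the snippet above,
# resource hints and the jobstore location are assumptions.
import argparse

from toil.common import Toil
from toil.job import Job


def main():
    parser = argparse.ArgumentParser(description='Build a STAR index with Toil')
    parser.add_argument('--ref', required=True, help='URL of the reference FASTA')
    parser.add_argument('--gtf', required=True, help='URL of the annotation GTF')
    parser.add_argument('--star-name', dest='star_name', default='starIndex')
    parser.add_argument('--cores', type=int, default=8)
    parser.add_argument('--output-dir', dest='output_dir', default='.')
    args = parser.parse_args()

    # Wrap the job function; cores/memory/disk are consumed by Toil as resource hints
    root = Job.wrapJobFn(star_index, args, cores=args.cores, memory='40G', disk='100G')

    options = Job.Runner.getDefaultOptions('./jobstore')
    with Toil(options) as workflow:
        workflow.start(root)


if __name__ == '__main__':
    main()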
Example #2
def convert_bam_to_fastq(job,
                         bam_path,
                         check_paired=True,
                         ignore_validation_errors=True):
    """
    Converts BAM to a pair of FASTQ files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    :param bool check_paired: If True, checks whether BAM is paired-end
    :param bool ignore_validation_errors: If True, ignores validation errors from picardTools
    :return: FileStoreIDs for R1 and R2
    :rtype: tuple
    """
    if check_paired:
        assert_bam_is_paired_end(job, bam_path)

    work_dir = os.path.dirname(os.path.abspath(bam_path))
    parameters = [
        'SamToFastq', 'I={}'.format(docker_path(bam_path)), 'F=/data/R1.fq',
        'F2=/data/R2.fq'
    ]
    if ignore_validation_errors:
        parameters.append('VALIDATION_STRINGENCY=SILENT')
    dockerCall(job=job,
               workDir=work_dir,
               parameters=parameters,
               tool=picardtools_version)
    r1 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fq'))
    r2 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fq'))
    return r1, r2
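Because convert_bam_to_fastq takes a local path rather than a FileStoreID, a parent job can call it inline after staging the BAM, then hand the resulting FileStoreIDs to children. A hedged sketch follows; run_cutadapt is one of the example jobs later in this collection, and the adapter sequences are placeholders.

# Hedged sketch of a parent job that stages a BAM, converts it inline, and
# fans the FASTQs out to a downstream trimming job.  Adapter sequences are
# placeholders, not values from the original pipeline.
import os


def prepare_sample(job, bam_id):
    work_dir = job.fileStore.getLocalTempDir()
    bam_path = os.path.join(work_dir, 'sample.bam')
    job.fileStore.readGlobalFile(bam_id, bam_path)

    # Runs Picard SamToFastq via dockerCall and returns two FileStoreIDs
    r1_id, r2_id = convert_bam_to_fastq(job, bam_path)

    # .rv() is a promise for the child's return value (the trimmed FASTQ IDs)
    trimmed = job.addChildJobFn(run_cutadapt, r1_id, r2_id,
                                fwd_3pr_adapter='AGATCGGAAGAG',
                                rev_3pr_adapter='AGATCGGAAGAG')
    return trimmed.rv()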
Example #3
def _testDockerCleanFn(job, workDir, detached=None, rm=None, defer=None, containerName=None):
    """
    Test function for the docker_clean test. Runs a container with the given flags and then dies,
    leaving behind a zombie container.
    :param toil.job.Job job: job
    :param workDir: See `work_dir=` in :func:`dockerCall`
    :param bool rm: See `rm=` in :func:`dockerCall`
    :param bool detached: See `detached=` in :func:`dockerCall`
    :param int defer: See `defer=` in :func:`dockerCall`
    :param str containerName: See `container_name=` in :func:`dockerCall`
    :return:
    """
    dockerParameters = ['--log-driver=none', '-v', os.path.abspath(workDir) + ':/data',
                        '--name', containerName]
    if detached:
        dockerParameters.append('-d')
    if rm:
        dockerParameters.append('--rm')

    def killSelf():
        test_file = os.path.join(workDir, 'test.txt')
        # This will kill the worker once we are sure the docker container started
        while not os.path.exists(test_file):
            _log.debug('Waiting on the file created by spooky_container.')
            time.sleep(1)
        # By the time we reach here, we are sure the container is running.
        os.kill(os.getpid(), signal.SIGKILL)  # signal.SIGINT)
    t = Thread(target=killSelf)
    # Make it a daemon thread so that thread failure doesn't hang tests.
    t.daemon = True
    t.start()
    dockerCall(job, tool='quay.io/ucsc_cgl/spooky_test', workDir=workDir, defer=defer, dockerParameters=dockerParameters)
Example #4
def rsem_index(job, args):
    download_url(url=args.ref, name='ref.fa', work_dir=job.tempDir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=job.tempDir)

    # Run RSEM to generate reference
    rsem_dir = os.path.join(job.tempDir, args.rsem_name)
    os.mkdir(rsem_dir)
    docker_parameters = [
        '--entrypoint', 'rsem-prepare-reference', '-v',
        '{}:/data'.format(job.tempDir), '--rm', '--log-driver=none'
    ]
    parameters = [
        '-p',
        str(args.cores), '--gtf', '/data/annotation.gtf', '/data/ref.fa',
        os.path.join('/data', args.rsem_name, args.rsem_name)
    ]
    dockerCall(
        job,
        tool=
        'quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
        parameters=parameters,
        dockerParameters=docker_parameters)

    # Compress rsemRef into a tarball
    rsem_tar = '{}.tar.gz'.format(args.rsem_name)
    tarball_files(rsem_tar, file_paths=[rsem_dir], output_dir=job.tempDir)

    # Move to output dir
    tar_path = os.path.join(job.tempDir, rsem_tar)
    if _move_instead_of_return:
        move_files([tar_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(tar_path)
Example #5
def run_sambamba_sort(job, bam, sort_by_name=False):
    """
    Sorts BAM file using Sambamba sort

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param boolean sort_by_name: If true, sorts by read name instead of coordinate.
    :return: FileStoreID for sorted BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    command = ['/usr/local/bin/sambamba',
               'sort',
               '-t', str(int(job.cores)),
               '-m', str(job.memory),
               '-o', '/data/output.bam',
               '/data/input.bam']

    if sort_by_name:
        command.append('-n')

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/biocontainers/sambamba:0.6.6--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "sambamba sort")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
Example #6
def sort_and_save_bam(job, config, bam_id, skip_sort=True):
    """
    Sorts STAR's output BAM using samtools

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID bam_id: FileID for STAR's genome-aligned BAM
    :param bool skip_sort: If True, skips the sort step and uploads the BAM as-is
    """
    bam_path = os.path.join(job.tempDir, 'aligned.bam')
    sorted_bam = os.path.join(job.tempDir, '{}.sorted.bam'.format(config.uuid))
    job.fileStore.readGlobalFile(bam_id, bam_path)

    parameters = [
        'sort', '-o', '/data/{}.sorted.bam'.format(config.uuid), '-O', 'bam',
        '-T', 'temp', '-@',
        str(job.cores), '/data/aligned.bam'
    ]

    if skip_sort:
        job.log('Skipping samtools sort as STAR already sorted BAM')
        os.rename(bam_path, sorted_bam)
    else:
        dockerCall(job,
                   tool=samtools_version,
                   parameters=parameters,
                   workDir=job.tempDir)

    move_or_upload(config, files=[sorted_bam])
Example #7
def run_bamqc(job, aligned_bam_id, config, save_bam=False):
    """
    Run BAMQC as specified by Treehouse (UCSC)
    https://github.com/UCSC-Treehouse/bam-umend-qc

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of aligned bam from STAR
    :param Expando config: Contains sample information
    :param bool save_bam: Option to save mark-duplicate bam from BAMQC
    :return: FileStoreID for output tar
    :rtype: str
    """
    job.fileStore.readGlobalFile(aligned_bam_id, os.path.join(job.tempDir, 'input.bam'))
    dockerCall(job, tool=bamqc_version, workDir=job.tempDir, parameters=['/data/input.bam', '/data'])

    # Tar Output files
    output_names = ['readDist.txt', 'bam_umend_qc.tsv', 'bam_umend_qc.json']
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=job.tempDir)
    tar_path = os.path.join(job.tempDir, 'bam_qc.tar.gz')

    # Save output BAM - this step is done here instead of in its own job for efficiency
    if save_bam:
        # Tag bam with sample UUID, upload, and delete
        bam_path = os.path.join(job.tempDir, 'sortedByCoord.md.bam')
        new_bam = os.path.join(job.tempDir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam)
        move_or_upload(config, [new_bam])
        os.remove(new_bam)  # new_bam is a local path, not a FileStoreID; drop the local copy after upload

    # Delete intermediates
    job.fileStore.deleteGlobalFile(aligned_bam_id)

    return job.fileStore.writeGlobalFile(tar_path)
Example #8
def star_index(job, args):
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=args.ref, name='ref.fa', work_dir=work_dir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=work_dir)

    # Run STAR to generate index
    star_dir = os.path.join(work_dir, args.star_name)
    os.mkdir(star_dir)
    parameters = [
        '--runThreadN',
        str(args.cores), '--runMode', 'genomeGenerate', '--genomeDir',
        '/data/' + args.star_name, '--genomeFastaFiles', 'ref.fa',
        '--sjdbGTFfile', 'annotation.gtf'
    ]
    dockerCall(
        job,
        tool=
        'quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
        workDir=work_dir,
        parameters=parameters)

    # Compress starIndex into a tarball
    subprocess.check_call(['tar', '-zcvf', star_dir + '.tar.gz', star_dir])

    # Move to output dir or return
    if _move_instead_of_return:
        move_files([star_dir + '.tar.gz'], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(star_dir + '.tar.gz')
Example #9
def run_fastqc(job, r1_id, r2_id):
    """
    Run Fastqc on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of fastQC output (tarball)
    :rtype: str
    """
    # Read in files and set parameters
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id,
                                     os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])

    # Call fastQC
    dockerCall(job=job,
               tool=fastqc_version,
               workDir=job.tempDir,
               parameters=parameters)

    # Package output files and return FileStoreID
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz',
                  file_paths=output_files,
                  output_dir=job.tempDir)
    return job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'fastqc.tar.gz'))
Example #10
def download_bam_from_gdc(job, work_dir, url, token):
    """
    Downloads BAM file from the GDC using an url (format: "gdc://<GDC ID>") and a GDC access token

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str work_dir: Directory being mounted into Docker
    :param str url: gdc URL to be downloaded
    :param str token: Full path to token
    :return: Path to BAM
    :rtype: str
    """
    assert token, 'gdc_token is missing which is required for downloading. Check config.'
    copy_files([os.path.abspath(token)], work_dir)

    parsed_url = urlparse(url)
    parameters = [
        'download', '-d', '/data', '-t',
        '/data/{}'.format(os.path.basename(token)), parsed_url.netloc
    ]
    dockerCall(job, tool=gdc_version, parameters=parameters, workDir=work_dir)
    files = [
        x for x in os.listdir(os.path.join(work_dir, parsed_url.netloc))
        if x.lower().endswith('.bam')
    ]
    assert len(files) == 1, 'Expected exactly one BAM from GDC URL, found: {}'.format(
        files)
    bam_path = os.path.join(work_dir, parsed_url.netloc, files[0])
    return bam_path
Example #11
def kallisto_index(job, args):
    if args.transcriptome:
        download_url(url=args.transcriptome,
                     name='transcriptome.fa',
                     work_dir=job.tempDir)
    else:
        _create_transcriptome(job, args, job.tempDir)

    # Run Kallisto Index
    parameters = [
        'index', 'transcriptome.fa', '-i',
        '/data/{}.index'.format(args.kallisto_name)
    ]
    dockerCall(
        job,
        tool=
        'quay.io/ucsc_cgl/kallisto:0.43.1--355c19b1fb6fbb85f7f8293e95fb8a1e9d0da163',
        workDir=job.tempDir,
        parameters=parameters)

    # Move to output dir
    output_path = os.path.join(job.tempDir, args.kallisto_name + '.index')
    if _move_instead_of_return:
        move_files([output_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(output_path)
Example #12
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of aligned bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: Failure flag and FileStoreID for the output QC tarball
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(
        aligned_bam_id,
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    dockerCall(job,
               tool='hbeale/treehouse_bam_qc:1.0',
               workDir=work_dir,
               parameters=['runQC.sh', str(job.cores)])

    # Tar Output files
    output_names = [
        'readDist.txt', 'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
        'rnaAligned.out.md.sorted.geneBodyCoverage.txt'
    ]
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir,
                                    config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    # Delete intermediates
    job.fileStore.deleteGlobalFile(aligned_bam_id)

    return fail_flag, job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'bam_qc.tar.gz'))
Example #13
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    # Retrieve RSEM reference
    download_url(url=rsem_ref_url,
                 name='rsem_ref.tar.gz',
                 work_dir=job.tempDir)
    subprocess.check_call([
        'tar', '-xvf',
        os.path.join(job.tempDir, 'rsem_ref.tar.gz'), '-C', job.tempDir
    ])
    os.remove(os.path.join(job.tempDir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(job.tempDir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [
        os.path.basename(os.path.splitext(x)[0]) for x in rsem_files
        if 'grp' in x
    ][0]
    ref_folder = os.path.join('/data',
                              os.listdir(job.tempDir)[0]) if len(
                                  os.listdir(job.tempDir)) == 1 else '/data'
    # Read bam from fileStore
    job.fileStore.readGlobalFile(
        bam_id, os.path.join(job.tempDir, 'transcriptome.bam'))

    # Call: RSEM
    output_prefix = 'rsem'
    parameters = [
        '--quiet', '--no-qualities', '-p',
        str(job.cores), '--forward-prob', '0.5', '--seed-length', '25',
        '--fragment-length-mean', '-1.0', '--bam', '/data/transcriptome.bam',
        os.path.join(ref_folder, ref_prefix), output_prefix
    ]
    if paired:
        parameters = ['--paired-end'] + parameters
    dockerCall(job,
               parameters=parameters,
               workDir=job.tempDir,
               tool=rsem_version)

    # Store output in fileStore and return
    gene_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, output_prefix + '.genes.results'))
    isoform_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, output_prefix + '.isoforms.results'))
    return gene_id, isoform_id
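The two FileStoreIDs returned by run_rsem can feed the run_rsem_gene_mapping job shown later in this collection. A hedged sketch of that wiring follows; the resource hints are illustrative assumptions.

# Hedged sketch: chain RSEM quantification into HUGO gene-name mapping.
# rv(0)/rv(1) index into the (gene_id, isoform_id) tuple promised by run_rsem.
def quantify(job, transcriptome_bam_id, rsem_ref_url):
    rsem = job.addChildJobFn(run_rsem, transcriptome_bam_id, rsem_ref_url,
                             paired=True, cores=8, disk='60G')
    mapping = rsem.addChildJobFn(run_rsem_gene_mapping, rsem.rv(0), rsem.rv(1))
    return mapping.rv()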
Example #14
def _testSubprocessDockerPermissions(job):
    testDir = job.fileStore.getLocalTempDir()
    dockerCall(job,
               tool='ubuntu',
               workDir=testDir,
               parameters=[['touch', '/data/test.txt']])
    outFile = os.path.join(testDir, 'test.txt')
    assert os.path.exists(outFile)
    assert not ownerName(outFile) == "root"
Example #15
def run_base_recalibration(job, bam, bai, ref, ref_dict, fai, dbsnp, mills, unsafe=False):
    """
    Creates recalibration table for Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str fai: FileStoreID for reference genome fasta index file
    :param str dbsnp: FileStoreID for dbSNP VCF file
    :param str mills: FileStoreID for Mills VCF file
    :param bool unsafe: If True, runs GATK in UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the recalibration table file
    :rtype: str
    """
    inputs = {'ref.fasta': ref,
              'ref.fasta.fai': fai,
              'ref.dict': ref_dict,
              'input.bam': bam,
              'input.bai': bai,
              'dbsnp.vcf': dbsnp,
              'mills.vcf': mills}

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    # Call: GATK -- BaseRecalibrator
    parameters = ['-T', 'BaseRecalibrator',
                  '-nct', str(int(job.cores)),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/input.bam',
                  # Recommended known sites:
                  # https://software.broadinstitute.org/gatk/guide/article?id=1247
                  '-knownSites', '/data/dbsnp.vcf',
                  '-knownSites', '/data/mills.vcf',
                  '-o', '/data/recal_data.table']

    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])

    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm',
                         '--log-driver', 'none',
                         '-e', 'JAVA_OPTS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]
    start_time = time.time()
    dockerCall(job=job, tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
               workDir=work_dir,
               parameters=parameters,
               dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 BaseRecalibrator")

    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'recal_data.table'))
Example #16
def apply_bqsr_recalibration(job, table, bam, bai, ref, ref_dict, fai, unsafe=False):
    """
    Creates BAM file with recalibrated base quality scores

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str table: FileStoreID for BQSR recalibration table file
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str fai: FileStoreID for reference genome fasta index file
    :param bool unsafe: If True, runs GATK in UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for recalibrated BAM and BAI files
    :rtype: tuple(str, str)
    """
    inputs = {'ref.fasta': ref,
              'ref.fasta.fai': fai,
              'ref.dict': ref_dict,
              'recal.table': table,
              'input.bam': bam,
              'input.bai': bai}

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    # Call: GATK -- PrintReads
    parameters = ['-T', 'PrintReads',
                  '-nct', str(int(job.cores)),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/input.bam',
                  '-BQSR', '/data/recal.table',
                  '-o', '/data/bqsr.bam']

    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])

    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm',
                         '--log-driver', 'none',
                         '-e', 'JAVA_OPTS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]
    start_time = time.time()
    dockerCall(job=job, tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
               workDir=work_dir,
               parameters=parameters,
               dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 BQSR PrintReads")

    output_bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bqsr.bam'))
    output_bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bqsr.bai'))
    return output_bam, output_bai
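run_base_recalibration and apply_bqsr_recalibration form a natural two-step pipeline: the first produces the recalibration table, the second applies it. A hedged sketch of that wiring is below; the resource hints are illustrative assumptions.

# Hedged sketch: chain the two GATK3 jobs above into a small BQSR pipeline.
def bqsr_pipeline(job, bam, bai, ref, ref_dict, fai, dbsnp, mills):
    table = job.addChildJobFn(run_base_recalibration,
                              bam, bai, ref, ref_dict, fai, dbsnp, mills,
                              cores=8, memory='20G', disk='50G')
    # Making the second step a child of the first guarantees it runs afterwards
    recal = table.addChildJobFn(apply_bqsr_recalibration,
                                table.rv(), bam, bai, ref, ref_dict, fai,
                                cores=8, memory='20G', disk='50G')
    return recal.rv()  # promise for (bqsr.bam FileStoreID, bqsr.bai FileStoreID)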
Example #17
def _create_transcriptome(job, args, work_dir):
    # Download files to generate transcriptome
    download_url(url=args.ref, name='ref.fa', work_dir=work_dir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=work_dir)

    parameters = [
        'gtf_to_fasta', '/data/annotation.gtf', '/data/ref.fa',
        '/data/transcriptome.fa'
    ]
    dockerCall(job,
               tool='limesbonn/tophat2',
               workDir=work_dir,
               parameters=parameters)
Example #18
def index_bam(job, bam_path):
    """
    Creates a BAM index (.bai) in the same directory as the BAM
    Indexing is necessary for viewing slices of the BAM

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    """
    work_dir = os.path.dirname(os.path.abspath(bam_path))
    parameters = ['index', docker_path(bam_path)]
    dockerCall(job,
               workDir=work_dir,
               parameters=parameters,
               tool=samtools_version)
Example #19
def run_samtools_faidx(job, ref_id):
    """
    Use Samtools to create reference index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for reference index
    :rtype: str
    """
    job.fileStore.logToMaster('Created reference index')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['faidx', '/data/ref.fasta']
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.fasta.fai'))
Example #20
def run_picard_create_sequence_dictionary(job, ref_id):
    """
    Uses Picard to create reference sequence dictionary

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome fasta file
    :return: FileStoreID for sequence dictionary file
    :rtype: str
    """
    job.fileStore.logToMaster('Created reference dictionary')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['CreateSequenceDictionary', 'R=ref.fasta', 'O=ref.dict']
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.dict'))
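The samtools faidx and Picard CreateSequenceDictionary jobs above are independent of each other, so a parent job can run them as parallel children of the same reference FileStoreID. A minimal sketch, assuming both job functions are importable alongside it:

# Hedged sketch: build the .fai and .dict for a reference in parallel.
def prepare_reference(job, ref_id):
    fai = job.addChildJobFn(run_samtools_faidx, ref_id)
    ref_dict = job.addChildJobFn(run_picard_create_sequence_dictionary, ref_id)
    # Return promises; a follow-on job (e.g. GATK) can consume all three IDs
    return ref_id, fai.rv(), ref_dict.rv()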
Example #21
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    # Retrieve files and define parameters
    download_url(url=kallisto_index_url,
                 name='kallisto_hg38.idx',
                 work_dir=job.tempDir)
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = [
        'quant', '-i', '/data/kallisto_hg38.idx', '-t',
        str(job.cores), '-o', '/data/', '-b', '100', '--fusion'
    ]

    # If R2 fastq is present...
    if r2_id:
        job.fileStore.readGlobalFile(r2_id,
                                     os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend(['/data/R1.fastq', '/data/R2.fastq'])
    else:
        parameters.extend(
            ['--single', '-l', '200', '-s', '15', '/data/R1.fastq'])

    # Call: Kallisto
    dockerCall(job,
               workDir=job.tempDir,
               parameters=parameters,
               tool=kallisto_version)

    # Tar output files together, store in fileStore, and return
    output_names = [
        'run_info.json', 'abundance.tsv', 'abundance.h5', 'fusion.txt'
    ]
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='kallisto.tar.gz',
                  file_paths=output_files,
                  output_dir=job.tempDir)
    return job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'kallisto.tar.gz'))
Example #22
def run_samtools_index(job, bam):
    """
    Runs SAMtools index to create a BAM index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID of the BAM file
    :return: FileStoreID for BAM index file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sample.bam'))
    # Call: index the bam
    parameters = ['index', '/data/sample.bam']
    dockerCall(job=job, workDir=work_dir, parameters=parameters,
               tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai'))
Example #23
def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'):
    """
    Runs Picard MarkDuplicates on a BAM file. Requires that the BAM file be coordinate sorted.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str validation_stringency: BAM file validation stringency, default is LENIENT
    :return: FileStoreIDs for BAM and BAI files
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()

    # Retrieve file path
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sorted.bam'))
    job.fileStore.readGlobalFile(bai, os.path.join(work_dir, 'sorted.bai'))

    # Call: picardtools
    command = ['MarkDuplicates',
               'INPUT=sorted.bam',
               'OUTPUT=mkdups.bam',
               'METRICS_FILE=metrics.txt',
               'ASSUME_SORTED=true',
               'CREATE_INDEX=true',
               'VALIDATION_STRINGENCY=%s' % validation_stringency.upper()]

    # picard-tools container doesn't have JAVA_OPTS variable
    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm',
                         '--log-driver', 'none',
                         '-e', 'JAVA_OPTIONS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
               dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "Picard MarkDuplicates")

    bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bam'))
    bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bai'))
    return bam, bai
Example #24
def run_bwa_index(job, ref_id):
    """
    Use BWA to create reference index files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreIDs for BWA index files
    :rtype: tuple(str, str, str, str, str)
    """
    job.fileStore.logToMaster('Created BWA index files')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
    command = ['index', '/data/ref.fa']
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c')
    ids = {}
    for output in ['ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa']:
        ids[output.split('.')[-1]] = (job.fileStore.writeGlobalFile(os.path.join(work_dir, output)))
    return ids['amb'], ids['ann'], ids['bwt'], ids['pac'], ids['sa']
Example #25
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple(str, str)
    """
    # Retrieve files and define parameters
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']

    # If R2 fastq is present...
    if r2_id:
        require(rev_3pr_adapter,
                "Paired end data requires a reverse 3' adapter sequence.")
        job.fileStore.readGlobalFile(r2_id,
                                     os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend([
            '-A', rev_3pr_adapter, '-o', '/data/R1_cutadapt.fastq', '-p',
            '/data/R2_cutadapt.fastq', '/data/R1.fastq', '/data/R2.fastq'
        ])
    else:
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])

    # Call: CutAdapt
    dockerCall(job=job,
               tool=cutadapt_version,
               workDir=job.tempDir,
               parameters=parameters)

    # Write to fileStore
    r1_cut_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'R1_cutadapt.fastq'))
    r2_cut_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'R2_cutadapt.fastq')) if r2_id else None

    return r1_cut_id, r2_cut_id
Example #26
def call_conductor(job,
                   master_ip,
                   src,
                   dst,
                   memory=None,
                   override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS and vice versa.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param toil.job.Job job: The Toil Job calling this function
    :param master_ip: The Spark leader IP address.
    :param src: URL of the file to copy.
    :param dst: URL of the location to copy the file to.
    :param memory: Gigabytes of memory to provision for the Spark driver/worker.
    :param override_parameters: Parameters passed by the user that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """

    arguments = ["-C", src, dst]

    docker_parameters = [
        '--log-driver', 'none',
        master_ip.docker_parameters(["--net=host"])
    ]
    dockerCall(
        job=job,
        tool="quay.io/ucsc_cgl/conductor",
        parameters=_make_parameters(
            master_ip,
            [],  # no conductor specific spark configuration
            memory,
            arguments,
            override_parameters),
        dockerParameters=docker_parameters)
Example #27
def run_samtools_rmdup(job, bam):
    """
    Mark reads as PCR duplicates using SAMtools rmdup

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :return: FileStoreID for the deduplicated BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    command = ['rmdup',
               '/data/input.bam',
               '/data/output.bam']

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "samtools rmdup")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
Example #28
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.")
    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter,
                  '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-A', rev_3pr_adapter,
                           '-o', '/data/R1_cutadapt.fastq',
                           '-p', '/data/R2_cutadapt.fastq',
                           '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
    # Call: CutAdapt
    dockerCall(job=job, tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
               workDir=work_dir, parameters=parameters)
    # Write to fileStore
    if r1_id and r2_id:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2_cutadapt.fastq'))
    else:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = None
    return r1_cut_id, r2_cut_id
Example #29
def run_rsem_gene_mapping(job, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM output files to map ENSEMBL IDs to Gencode HUGO gene names

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs for the RSEM tarball and the HUGO-mapped tarball
    :rtype: tuple(str, str)
    """
    # Retrieve input files
    genes = job.fileStore.readGlobalFile(
        rsem_gene_id, os.path.join(job.tempDir, 'rsem_genes.results'))
    iso = job.fileStore.readGlobalFile(
        rsem_isoform_id, os.path.join(job.tempDir, 'rsem_isoforms.results'))

    # Perform HUGO gene / isoform name mapping
    command = ['-g', 'rsem_genes.results', '-i', 'rsem_isoforms.results']
    dockerCall(job,
               parameters=command,
               workDir=job.tempDir,
               tool=rsemgenemapping_version)
    hugo_files = [
        os.path.join(job.tempDir, x)
        for x in ['rsem_genes.hugo.results', 'rsem_isoforms.hugo.results']
    ]

    # Create tarballs for output, store in fileStore, and return
    tarball_files('rsem.tar.gz',
                  file_paths=[genes, iso],
                  output_dir=job.tempDir)
    tarball_files('rsem_hugo.tar.gz',
                  file_paths=hugo_files,
                  output_dir=job.tempDir)
    rsem_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
Example #30
def run_samblaster(job, sam):
    """
    Marks reads as PCR duplicates using SAMBLASTER

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str sam: FileStoreID for SAM file
    :return: FileStoreID for deduped SAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(sam, os.path.join(work_dir, 'input.sam'))
    command = ['/usr/local/bin/samblaster',
               '-i', '/data/input.sam',
               '-o', '/data/output.sam',
               '--ignoreUnmated']

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/biocontainers/samblaster:0.1.24--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "SAMBLASTER")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.sam'))
Example #31
def _testDockerPermissions(job):
    testDir = job.fileStore.getLocalTempDir()
    dockerCall(job, tool='ubuntu', workDir=testDir, parameters=[['touch', '/data/test.txt']])
    outFile = os.path.join(testDir, 'test.txt')
    assert os.path.exists(outFile)
    assert not ownerName(outFile) == "root"
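All of the examples above share the same convention: the directory passed as workDir is mounted into the container at /data, so a file written to workDir on the host is addressed as /data/<name> inside the container, and vice versa. A minimal sketch of that pattern, assuming dockerCall is importable the same way it is in these pipelines (e.g. from toil_lib in older Toil-based workflows); the samtools image tag is reused from the examples above, and stdout capture is intentionally omitted.

# Hedged sketch of the shared dockerCall convention: host workDir <-> /data.
import os


def count_reads(job, bam_id):
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'input.bam'))
    # 'samtools view -c' prints the read count to the container's stdout;
    # the point here is only that /data/input.bam maps to work_dir/input.bam.
    dockerCall(job=job,
               workDir=work_dir,
               parameters=['view', '-c', '/data/input.bam'],
               tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')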