def create_reference_index(job, ref_id):
    """
    Uses Samtools to create reference index file (.fasta.fai)

    ref_id: str     The fileStore ID of the reference
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path to reference
    try:
        job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))  
    except Exception:
        sys.stderr.write("Failed to read global file %s to %s. Retrying with dict index.\n"
                         % (ref_id, os.path.join(work_dir, 'ref.fa')))
        
        try:
            job.fileStore.readGlobalFile(ref_id['ref.fa'], os.path.join(work_dir, 'ref.fa'))  
        except Exception:
            sys.stderr.write("Reading %s on retry failed.\n" % ref_id['ref.fa'])
            raise

    # Call: Samtools
    command = ['faidx', 'ref.fa']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                inputs=['ref.fa'],
                outputs={'ref.fa.fai': None})
    output = os.path.join(work_dir, 'ref.fa.fai')
    assert os.path.exists(output)
    # Write to fileStore
    return job.fileStore.writeGlobalFile(output)
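# For orientation, a minimal sketch (an assumption, not the actual toil_scripts.lib.programs
# helper) of what a docker_call-style wrapper could look like: mount work_dir at /data,
# pass through extra docker flags and environment variables, and run the tool's parameters.
# The real helper also handles inputs/outputs bookkeeping, sudo, and a mock mode.
import os
import subprocess

def docker_call_sketch(tool, parameters=None, work_dir='.', env=None,
                       docker_parameters=None, rm=True, outfile=None, check_output=False):
    """Hypothetical docker runner: work_dir is mounted at /data inside the container."""
    base = ['docker', 'run']
    if rm:
        base.append('--rm')
    for key, value in (env or {}).items():
        base += ['-e', '{}={}'.format(key, value)]
    base += list(docker_parameters or [])
    base += ['-v', '{}:/data'.format(os.path.abspath(work_dir)), tool]
    base += list(parameters or [])
    if check_output:
        return subprocess.check_output(base)
    subprocess.check_call(base, stdout=outfile)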
    def start(self, fileStore):
        """
        Start spark and hdfs master containers

        fileStore: Unused
        """
        
        self.IP = check_output(["hostname", "-f",])[:-1]

        _log.info("Started Spark master container.")
        self.sparkContainerID = docker_call(tool = "quay.io/ucsc_cgl/apache-spark-master:1.5.2",
                                            docker_parameters = ["--net=host",
                                                                 "-d",
                                                                 "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                                                                 "-e", "SPARK_MASTER_IP="+self.IP,
                                                                 "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                                                                 "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
                                            rm=False,
                                            sudo = self.sudo,
                                            check_output = True,
                                            mock = False)[:-1]
        _log.info("Started HDFS Datanode.")
        self.hdfsContainerID = docker_call(tool = "quay.io/ucsc_cgl/apache-hadoop-master:2.6.2",
                                           docker_parameters = ["--net=host",
                                                                "-d"],
                                           parameters = [self.IP],
                                           rm=False,
                                           sudo = self.sudo,
                                           check_output = True,
                                           mock = False)[:-1]
        return self.IP
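# A hedged sketch (not from the pipeline) of how a Job.Service subclass such as the
# Spark/HDFS master above can be attached to a Toil workflow: Toil calls start() and
# hands its return value (here the leader IP) to downstream jobs as a promise, then
# calls stop() when the parent job finishes. EchoService and the job functions below
# are illustrative stand-ins; the exact start()/stop() signatures vary by Toil version.
from toil.job import Job

class EchoService(Job.Service):
    """Toy stand-in for the Spark master service above."""
    def start(self, job):
        return "127.0.0.1"   # the real service returns the Spark leader IP

    def stop(self, job):
        pass                 # the real service tears down the containers

    def check(self):
        return True

def analysis(job, master_ip):
    job.fileStore.logToMaster("Spark master would be at %s" % master_ip)

def setup_spark(job):
    master_ip = job.addService(EchoService())   # promise for start()'s return value
    job.addChildJobFn(analysis, master_ip)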
def index(job, shared_ids, input_args):
    """
    Index sample bam using samtools, calls haplotypeCaller.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    bam_path = return_input_paths(job, work_dir, shared_ids, 'toil.bam')
    output_path = os.path.join(work_dir, 'toil.bam.bai')
    # Call: index the normal.bam
    parameters = ['index', 'toil.bam']
    inputs=['toil.bam']
    outputs={'toil.bam.bai': None}
    docker_call(work_dir = work_dir,
                parameters = parameters,
                tool = 'quay.io/ucsc_cgl/samtools',
                inputs=inputs,
                outputs=outputs,
                sudo = input_args['sudo'])
    # Update FileStore and call child
    shared_ids['toil.bam.bai'] = job.fileStore.writeGlobalFile(output_path)
    job.addChildJobFn(haplotype_caller, shared_ids, input_args, cores = input_args['cpu_count'])
def print_reads(job, cores, table, indel_bam, indel_bai, ref, ref_dict, fai, mem):
    """
    Creates BAM that has had the base quality scores recalibrated

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Maximum number of cores on host node
    :param str table: Recalibration table FileStoreID
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :return: FileStoreID for the processed bam
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, table, indel_bam, indel_bai]
    file_names = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.recal.table',
                  'sample.indel.bam', 'sample.indel.bai']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- PrintReads
    parameters = ['-T', 'PrintReads',
                  '-nct', str(cores),
                  '-R', '/data/ref.fasta',
                  '--emit_original_quals',
                  '-I', '/data/sample.indel.bam',
                  '-BQSR', '/data/sample.recal.table',
                  '-o', '/data/sample.bqsr.bam']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.4--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                work_dir=work_dir, parameters=parameters, env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bqsr.bam'))
    return bam_id
def base_recalibration(job, shared_ids, input_args):
    """
    Creates recal table to perform Base Quality Score Recalibration

    shared_ids: dict        Dictionary of shared file promises
    input_args: Namespace   Input arguments (cpu_count, memory, ...)
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve input file paths
    return_input_paths(job, work_dir, shared_ids, 'ref.fa', 'sample.indel.bam',
                       'dbsnp.vcf', 'ref.fa.fai',
                       'ref.dict', 'sample.indel.bam.bai')
    # Output file path
    output = os.path.join(work_dir, 'sample.recal.table')
    # Call: GATK -- BaseRecalibrator
    parameters = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY', # RISKY! (?) See #189
                  '-T', 'BaseRecalibrator',
                  '-nct', str(input_args.cpu_count),
                  '-R', 'ref.fa',
                  '-I', 'sample.indel.bam',
                  '-knownSites', 'dbsnp.vcf',
                  '-o', 'sample.recal.table']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                work_dir=work_dir, parameters=parameters,
                inputs=['ref.fa', 'sample.indel.bam', 'dbsnp.vcf', 'ref.fa.fai',
                        'ref.dict', 'sample.indel.bam.bai'],
                outputs={'sample.recal.table': None},
                env={'JAVA_OPTS':'-Xmx%sg' % input_args.memory})
    # Write to fileStore
    shared_ids['sample.recal.table'] = job.fileStore.writeGlobalFile(output)
    job.addChildJobFn(print_reads, shared_ids, input_args, cores = input_args.cpu_count)
def mark_dups_sample(job, shared_ids, input_args):
    """
    Uses picardtools MarkDuplicates
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    read_from_filestore(job, work_dir, shared_ids, 'sample.sorted.bam')
    outpath = os.path.join(work_dir, 'sample.mkdups.bam')
    # Call: picardtools
    command = ['MarkDuplicates',
               'INPUT=sample.sorted.bam',
               'OUTPUT=sample.mkdups.bam',
               'METRICS_FILE=metrics.txt',
               'ASSUME_SORTED=true',
               'CREATE_INDEX=true']
    docker_call(work_dir=work_dir, parameters=command,
                env={'JAVA_OPTS':'-Xmx%sg' % input_args.memory},
                tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                inputs=['sample.sorted.bam'],
                outputs={'sample.mkdups.bam': None, 'sample.mkdups.bai': None})
    shared_ids['sample.mkdups.bam'] = job.fileStore.writeGlobalFile(outpath)

    # picard writes the index for file.bam at file.bai, not file.bam.bai
    _move_bai(outpath)
    shared_ids['sample.mkdups.bam.bai'] = job.fileStore.writeGlobalFile(outpath + ".bai")
    job.addChildJobFn(realigner_target_creator, shared_ids, input_args, cores = input_args.cpu_count)
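# `_move_bai` is a pipeline helper not shown in this listing. A plausible sketch
# (assumption) based on the comment above: Picard's CREATE_INDEX=true writes
# `sample.mkdups.bai`, while downstream steps expect `sample.mkdups.bam.bai`.
import os

def _move_bai_sketch(bam_path):
    """Rename <prefix>.bai (Picard's naming) to <prefix>.bam.bai."""
    picard_index = os.path.splitext(bam_path)[0] + '.bai'   # e.g. sample.mkdups.bai
    os.rename(picard_index, bam_path + '.bai')              # e.g. sample.mkdups.bam.bai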
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce separate .tab files (TPM, FPKM, counts) for both genes and isoforms.
    These are two-column files: Genes and Quantifications.
    HUGO files are also provided, with names mapped from Gencode/Ensembl identifiers.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs for the RSEM and HUGO output tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(work_dir, 'rsem_gene.tab'))
    job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(work_dir, 'rsem_isoform.tab'))
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess', parameters=[uuid], work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'), os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'), os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = ['rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab', 'rsem.isoform.norm_counts.tab',
                    'rsem.isoform.raw_counts.tab', 'rsem_genes.results', 'rsem_isoforms.results']
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping', parameters=command, work_dir=work_dir)
    hugo_files = [os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1] for x in genes + isoforms]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz', file_paths=[os.path.join(work_dir, x) for x in output_files], output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz', [os.path.join(work_dir, x) for x in hugo_files], output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
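# `tarball_files` is a shared pipeline helper; a minimal sketch of the behaviour the
# calls above rely on (bundle the given files, by basename, into one gzipped tarball
# under output_dir). This is an illustration, not the library implementation.
import os
import tarfile

def tarball_files_sketch(tar_name, file_paths, output_dir):
    """Write output_dir/tar_name containing each path in file_paths (basenames only)."""
    with tarfile.open(os.path.join(output_dir, tar_name), 'w:gz') as tar:
        for path in file_paths:
            tar.add(path, arcname=os.path.basename(path))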
def realigner_target_creator(job, shared_ids, input_args):
    """
    Creates <type>.intervals file needed for indel realignment

    shared_ids: dict        Dictionary of shared file promises
    input_args: Namespace   Input arguments (cpu_count, memory, ...)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve input file paths
    read_from_filestore(job, work_dir, shared_ids, 'ref.fa',
                        'sample.mkdups.bam', 'ref.fa.fai', 'ref.dict',
                        'sample.mkdups.bam.bai', 'phase.vcf', 'mills.vcf')

    # Output file path
    output = os.path.join(work_dir, 'sample.intervals')
    # Call: GATK -- RealignerTargetCreator
    parameters = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY', # RISKY! (?) See #189
                  '-T', 'RealignerTargetCreator',
                  '-nt', str(input_args.cpu_count),
                  '-R', 'ref.fa',
                  '-I', 'sample.mkdups.bam',
                  '-known', 'phase.vcf',
                  '-known', 'mills.vcf',
                  '--downsampling_type', 'NONE',
                  '-o', 'sample.intervals']

    docker_call(work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=['ref.fa','sample.mkdups.bam', 'ref.fa.fai', 'ref.dict',
                        'sample.mkdups.bam.bai', 'phase.vcf', 'mills.vcf'],
                outputs={'sample.intervals': None},
                env={'JAVA_OPTS':'-Xmx%sg' % input_args.memory})
    shared_ids['sample.intervals'] = job.fileStore.writeGlobalFile(output)
    job.addChildJobFn(indel_realignment, shared_ids, input_args)
def call_adam(master_ip, arguments, memory=None, override_parameters=None):
    """
    Invokes the ADAM container. Find ADAM at https://github.com/bigdatagenomics/adam.

    :param master_ip: The Spark leader IP address.
    :param arguments: Arguments to pass to ADAM.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user, that override our defaults.

    :type master_ip: MasterAddress
    :type arguments: list of string
    :type memory: int or None
    :type override_parameters: list of string or None
    """
    default_params = ["--conf", "spark.driver.maxResultSize=0"] # set max result size to unlimited, see #177

    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters=master_ip.docker_parameters(["--net=host"]),
                parameters=_make_parameters(master_ip,
                                            default_params,
                                            memory,
                                            arguments,
                                            override_parameters),
                mock=False)
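# `_make_parameters` is a helper not shown in this listing. A hedged sketch of what it
# plausibly assembles, inferred from the older call_adam/call_conductor examples further
# below: spark-submit style flags pointing at the leader, optional driver/executor memory,
# any tool-specific defaults, then "--" followed by the tool's own arguments. The 7077
# port and the override behaviour are assumptions.
def _make_parameters_sketch(master_ip, default_parameters, memory, arguments, override_parameters):
    if override_parameters is not None:
        # trust the user's Spark configuration wholesale
        return list(override_parameters) + ["--"] + list(arguments)
    parameters = ["--master", "spark://%s:7077" % master_ip]
    if memory is not None:
        parameters += ["--conf", "spark.driver.memory=%dg" % memory,
                       "--conf", "spark.executor.memory=%dg" % memory]
    return parameters + list(default_parameters) + ["--"] + list(arguments)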
def run_kallisto(job, cores, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run Kallisto with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL pointing to the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    # Retrieve files
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(cores),
                  '-o', '/data/',
                  '-b', '100']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])

    # Call: Kallisto
    docker_call(tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir, parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [os.path.join(work_dir, x) for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto.tar.gz'))
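# A hedged usage sketch: wiring run_kallisto into a small Toil workflow. The FileStoreIDs
# would normally come from upstream download/cutadapt jobs; the job-store path and index
# URL are placeholders. Note the pipeline convention of passing `cores` both as a
# positional function argument and as the Job's resource keyword.
from toil.job import Job

def kallisto_example(r1_id, r2_id, kallisto_index_url):
    options = Job.Runner.getDefaultOptions('./kallisto-jobstore')
    root_job = Job.wrapJobFn(run_kallisto, 4, r1_id, r2_id, kallisto_index_url,
                             cores=4, disk='20G')
    return Job.Runner.startToil(root_job, options)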
def base_recalibration(job, cores, indel_bam, indel_bai, ref, ref_dict, fai, dbsnp, mem):
    """
    Creates recal table used in Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Maximum number of cores on a worker node
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :return: FileStoreID for the processed bam
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, indel_bam, indel_bai, dbsnp]
    file_names = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.indel.bam', 'sample.indel.bai', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- BaseRecalibrator
    parameters = ['-T', 'BaseRecalibrator',
                  '-nct', str(cores),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.indel.bam',
                  '-knownSites', '/data/dbsnp.vcf',
                  '-o', '/data/sample.recal.table']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.4--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                work_dir=work_dir, parameters=parameters, env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    table = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.recal.table'))
    return job.addChildJobFn(print_reads, cores, table, indel_bam, indel_bai, ref, ref_dict, fai, mem,
                             cores=cores, memory=mem, disk='25G').rv()
def call_conductor(master_ip, src, dst, memory=None, override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS and vice versa.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param master_ip: The Spark leader IP address.
    :param src: URL of file to copy.
    :param dst: URL of location to copy file to.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user, that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """

    arguments = ["--", "-C", src, dst]

    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/conductor",
                docker_parameters=master_ip.docker_parameters(["--net=host"]),
                parameters=_make_parameters(master_ip,
                                            [], # no conductor specific spark configuration
                                            memory,
                                            arguments,
                                            override_parameters),
                mock=False)
def create_reference_dict_hc(job, shared_ids, input_args):
    """
    Uses Picardtools to create sequence dictionary for reference genome.
    Calls next step in pipeline - spawn batch jobs

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    ref_path = return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    # Call: picardtools
    picard_output = os.path.join(work_dir, 'ref.dict')
    command = ['CreateSequenceDictionary', 'R=ref.fa', 'O=ref.dict']
    inputs=['ref.fa']
    outputs={'ref.dict': None}
    docker_call(work_dir = work_dir,
                env={'JAVA_OPTS':'-Xmx%sg' % input_args.memory},
                parameters = command,
                tool = 'quay.io/ucsc_cgl/picardtools',
                inputs=inputs,
                outputs=outputs)
    # Update fileStore for output
    shared_ids['ref.dict'] = job.fileStore.writeGlobalFile(picard_output)
    job.addChildJobFn(spawn_batch_variant_calling, shared_ids, input_args)
def create_reference_index_hc(job, shared_ids, input_args):
    """
    Uses samtools to create reference index file in working directory,
    spawns next job in pipeline - create reference dictionary

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    ref_path = return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    faidx_output = os.path.join(work_dir, 'ref.fa.fai')
    # Call: Samtools
    faidx_command = ['faidx', 'ref.fa']
    inputs=['ref.fa']
    outputs={'ref.fa.fai': None}
    docker_call(work_dir = work_dir,
                parameters = faidx_command,
                tool = 'quay.io/ucsc_cgl/samtools',
                inputs=inputs,
                outputs=outputs)
    # Update fileStore for output
    shared_ids['ref.fa.fai'] = job.fileStore.writeGlobalFile(faidx_output)
    job.addChildJobFn(create_reference_dict_hc, shared_ids, input_args)
def run_star(job, cores, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run star with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs for the transcriptome bam, the sorted bam, and (if wiggle) the wiggle file
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - the star index files are either in a subdirectory or at the tarball root
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
def test_docker_call(tmpdir):
    from toil_scripts.lib.programs import docker_call
    work_dir = str(tmpdir)
    parameter = ['--help']
    tool = 'quay.io/ucsc_cgl/samtools'
    docker_call(work_dir=work_dir, parameters=parameter, tool=tool)
    # Test outfile
    fpath = os.path.join(work_dir, 'test')
    with open(fpath, 'w') as f:
        docker_call(tool='ubuntu', env=dict(foo='bar'), parameters=['printenv', 'foo'], outfile=f)
    assert open(fpath).read() == 'bar\n'
def _download_with_genetorrent(url, file_path, cghub_key_path):
    parsed_url = urlparse(url)
    analysis_id = parsed_url.path[1:]
    assert parsed_url.scheme == 'gnos', 'Improper format. gnos://cghub/ID. User supplied: {}'.format(parsed_url)
    work_dir = os.path.dirname(file_path)
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    parameters = ['-vv', '-c', cghub_key_path, '-d', analysis_id]
    docker_call(tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    sample = glob.glob(os.path.join(folder_path, '*tar*'))
    assert len(sample) == 1, 'Expected exactly one sample tar in CGHub download: {}'.format(analysis_id)
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print(os.listdir(work_dir))
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def haplotype_caller(job, shared_ids, input_args):
    """
    Uses GATK HaplotypeCaller to identify SNPs and Indels and writes a gVCF.
    Calls per-sample genotyper to genotype gVCF.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'toil.bam', 'toil.bam.bai']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'],
                                     input_args['suffix'])
    
    # Call GATK -- HaplotypeCaller
    command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY', # RISKY! (?) See #189
               '-nct', str(input_args['cpu_count']),
               '-R', 'ref.fa',
               '-T', 'HaplotypeCaller',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF',
               '-I', 'toil.bam',
               '-o', output,
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--annotation', 'QualByDepth',
               '--annotation', 'DepthPerSampleHC',
               '--annotation', 'FisherStrand',
               '--annotation', 'ReadPosRankSumTest']
    try:
        inputs=input_files
        outputs={output: None}
        docker_call(work_dir = work_dir,
                    env={'JAVA_OPTS':'-Xmx%sg' % input_args['memory']},
                    parameters = command,
                    tool = 'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                    inputs=inputs,
                    outputs=outputs,
                    sudo = input_args['sudo'])
    except Exception:
        sys.stderr.write("Running haplotype caller with %s in %s failed.\n" % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # upload gvcf
    upload_or_move_hc(work_dir, input_args, output)

    # call variants prior to vqsr
    job.addChildJobFn(genotype_gvcf, shared_ids, input_args, cores = input_args['cpu_count'])
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def run_bwa(job, inputs, ids):
    """
    Aligns two fastqs into a BAMFILE via BWA

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param Namespace inputs: Input arguments (see main)
    :param list ids: list of FileStore IDs (R1, R2, reference inputs)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_names = ['r1.fq.gz', 'r2.fq.gz', 'ref.fa', 'ref.fa.amb', 'ref.fa.ann',
                  'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa', 'ref.fa.fai']
    if inputs.alt:
        file_names.append('ref.fa.alt')

    for fileStoreID, name in zip(ids, file_names):
        job.fileStore.readGlobalFile(fileStoreID, os.path.join(work_dir, name))

    # Add read group line
    rg = "@RG\\tID:{0}\\tLB:{1}\\tPL:{2}\\tPU:{3}\\tSM:{0}".format(inputs.uuid, inputs.library,
                                                                   inputs.platform, inputs.program_unit)

    # BWA Options
    opt_args = []
    if not inputs.skip_sort:
        opt_args.append('-s')
    if inputs.trim:
        opt_args.append('-a')
    # Call: bwakit
    parameters = (['-t', str(inputs.cores),
                   '-R', rg] +
                  opt_args +
                  ['-o', '/data/aligned',
                   '/data/ref.fa',
                   '/data/r1.fq.gz',
                   '/data/r2.fq.gz'])
    outputs = {'aligned.aln.bam': inputs.mock_bam}

    docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)

    # bwakit insists on adding an `*.aln.bam` suffix, so rename the output file
    output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
    os.rename(os.path.join(work_dir, 'aligned.aln.bam'),
              output_file)

    # Either write file to local output directory or upload to S3 cloud storage
    job.fileStore.logToMaster('Aligned sample: {}'.format(inputs.uuid))
    if inputs.output_dir:
        move_files([output_file], inputs.output_dir)
    if inputs.s3_dir:
        s3am_upload(output_file, inputs.s3_dir, s3_key_path=inputs.ssec)
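# `move_files` and `s3am_upload` are shared helpers not shown in this listing. A plausible
# sketch (assumption) of the local-output branch only: move each produced file into the
# user-requested output directory.
import os
import shutil

def move_files_sketch(file_paths, output_dir):
    for path in file_paths:
        shutil.move(path, os.path.join(output_dir, os.path.basename(path)))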
def call_conductor(masterIP, inputs, src, dst):
    """
    Invokes the Conductor container to copy files between S3 and HDFS

    :type masterIP: MasterAddress
    """
    docker_call(rm = False,
                tool = "quay.io/ucsc_cgl/conductor",
                docker_parameters = masterIP.docker_parameters(["--net=host"]),
                parameters = ["--master", "spark://"+masterIP+":"+SPARK_MASTER_PORT,
                 "--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
                 "--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
                 "--", "-C", src, dst],
                mock=False)
def run_picard_create_sequence_dictionary(job, ref_id):
    """
    Use Picard-tools to create reference dictionary

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for reference dictionary
    :rtype: str
    """
    job.fileStore.logToMaster('Creating reference dictionary')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['CreateSequenceDictionary', 'R=ref.fasta', 'O=ref.dict']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.dict'))
def run_samtools_faidx(job, ref_id):
    """
    Use Samtools to create reference index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for reference index
    :rtype: str
    """
    job.fileStore.logToMaster('Creating reference index')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['faidx', 'ref.fasta']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.fasta.fai'))
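# A hedged sketch of how the two small reference-preparation jobs above could be chained
# from a parent job, returning the FileStoreIDs as promises. The parent function name is
# illustrative, not part of the pipeline.
def prepare_reference(job, ref_id):
    fai_id = job.addChildJobFn(run_samtools_faidx, ref_id).rv()
    ref_dict_id = job.addChildJobFn(run_picard_create_sequence_dictionary, ref_id).rv()
    return ref_id, fai_id, ref_dict_id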
def genotype_gvcf(job, shared_ids, input_args):
    """
    Genotypes the gVCF generated by the HaplotypeCaller.
    Calls variant quality score recalibration functions.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """

    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['%s.raw.BOTH%s.gvcf' % (input_args['uuid'],
                                           input_args['suffix']),
                   'ref.fa', 'ref.fa.fai', 'ref.dict']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = 'unified.raw.BOTH.gatk.vcf'
    
    command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY', # RISKY! (?) See #189
               '-nt', str(input_args['cpu_count']),
               '-R', 'ref.fa',
               '-T', 'GenotypeGVCFs',
               '--variant', '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix']),
               '--out', output,
               '-stand_emit_conf', '10.0',
               '-stand_call_conf', '30.0']

    try:
        inputs=input_files
        outputs={output: None}
        docker_call(work_dir = work_dir,
                    env={'JAVA_OPTS':'-Xmx%sg' % input_args['memory']},
                    parameters = command,
                    tool = 'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                    inputs=inputs,
                    outputs=outputs,
                    sudo = input_args['sudo'])
    except Exception:
        sys.stderr.write("Running GenotypeGVCFs with %s in %s failed.\n" % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # run vqsr
    job.addChildJobFn(vqsr_snp, shared_ids, input_args, cores = input_args['cpu_count'])
    job.addChildJobFn(vqsr_indel, shared_ids, input_args, cores = input_args['cpu_count'])
def call_adam(masterIP, inputs, arguments):
    """
    Invokes the ADAM container

    :type masterIP: MasterAddress
    """
    default_params = ["--master", ("spark://%s:%s" % (masterIP, SPARK_MASTER_PORT)),
                      "--conf", ("spark.driver.memory=%sg" % inputs["driverMemory"]),
                      "--conf", ("spark.executor.memory=%sg" % inputs["executorMemory"]),
                      "--conf", ("spark.hadoop.fs.default.name=hdfs://%s:%s" % (masterIP, HDFS_MASTER_PORT)),
                      "--conf", "spark.driver.maxResultSize=0", # set max result size to unlimited, see #177
                      "--"]
    docker_call(rm = False,
                tool = "quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters = masterIP.docker_parameters(["--net=host"]),
                parameters = default_params + arguments,
                mock=False)
def run_merge_vcf(job, options, index_dir_id, work_dir, vcf_file_key_list):
    """
    Merges the per-path VCFs into a single VCF, indexes it with tabix, and writes
    both files to the output IO store.
    """
    RealTimeLogger.get().info("Completed gam merging and gam path variant calling.")
    RealTimeLogger.get().info("Starting vcf merging...")
    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)
    
    # Download local input files from the remote storage container
    graph_dir = work_dir
    read_global_directory(job.fileStore, index_dir_id, graph_dir)
   
    vcf_merging_file_key_list = [] 
    for vcf_file_key in vcf_file_key_list:
        vcf_file = "{}/{}".format(work_dir, vcf_file_key)
        vcf_file_idx = "{}.tbi".format(vcf_file)
        out_store.read_input_file(vcf_file_key, vcf_file)
        out_store.read_input_file(vcf_file_key + ".tbi", vcf_file_idx)
        vcf_merging_file_key_list.append(vcf_file)

    vcf_merged_file_key = "" 
    if len(vcf_file_key_list) > 1:
        # merge vcf files
        vcf_merged_file_key = "{}.vcf.gz".format(options.sample_name)
        command = ['concat', '-O', 'z', '-o', '{}/{}'.format(work_dir, vcf_merged_file_key)] + vcf_merging_file_key_list
        docker_call(work_dir=work_dir, parameters=command,
                    tool='biodckr/bcftools')
#        run("bcftools concat -O z -o {}/{} {}".format(work_dir, vcf_merged_file_key, ' '.join(vcf_merging_file_key_list)))
        run("tabix -f -p vcf {}/{}".format(work_dir, vcf_merged_file_key))
    else:
        vcf_merged_file_key = vcf_file_key_list[0]

    # save variant calling results to the output store
    vcf_file = "{}/{}".format(work_dir, vcf_merged_file_key)
    vcf_file_idx = "{}/{}.tbi".format(work_dir, vcf_merged_file_key)

    out_store.write_output_file(vcf_file, vcf_merged_file_key)
    out_store.write_output_file(vcf_file_idx, vcf_merged_file_key + ".tbi")
    
    # Run downloader to download output IO store files to local output directory.
    vcf_file_id = job.fileStore.writeGlobalFile(vcf_file)
    vcf_file_idx_id = job.fileStore.writeGlobalFile(vcf_file_idx) 
    downloadList = [[vcf_file_id, vcf_merged_file_key], [vcf_file_idx_id, vcf_merged_file_key+".tbi"]]

    return downloadList
def run_rsem(job, cores, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param int cores: Number of cores to run RSEM with
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'rsem_ref.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [os.path.basename(os.path.splitext(x)[0]) for x in rsem_files if 'grp' in x][0]
    ref_folder = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
                parameters=parameters, work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'), os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'), os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
def run_samtools_index(job, bam_id):
    """
    Runs samtools index to create (.bai) files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_id: FileStoreID of the bam file
    :return: BAM index FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'sample.bam'))
    # Call: index the bam
    parameters = ['index', '/data/sample.bam']
    docker_call(work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai'))
def run_indel_realignment(job, intervals, bam, bai, ref, ref_dict, fai, phase, mills, mem, unsafe=False):
    """
    Creates realigned bams using the intervals file from previous step

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str intervals: Indel interval FileStoreID
    :param str bam: Sample BAM FileStoreID
    :param str bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str phase: Phase VCF FileStoreID
    :param str mills: Mills VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for the realigned bam and its bam index
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, intervals, bam, bai, phase, mills]
    inputs = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.intervals',
              'sample.bam', 'sample.bam.bai', 'phase.vcf', 'mills.vcf']
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- IndelRealigner
    parameters = ['-T', 'IndelRealigner',
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.bam',
                  '-known', '/data/phase.vcf',
                  '-known', '/data/mills.vcf',
                  '-targetIntervals', '/data/sample.intervals',
                  '--downsampling_type', 'NONE',
                  '-maxReads', str(720000), # Taken from MC3 pipeline
                  '-maxInMemory', str(5400000), # Taken from MC3 pipeline
                  '-o', '/data/sample.indel.bam']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs,
                outputs={'sample.indel.bam': None, 'sample.indel.bai': None},
                work_dir=work_dir, parameters=parameters, env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write to fileStore
    indel_bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.indel.bam'))
    indel_bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.indel.bai'))
    return indel_bam, indel_bai
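# A hedged sketch of chaining the job above with the base_recalibration(job, cores, ...)
# variant shown earlier, using .rv() promises to pass the realigned bam/bai along,
# mirroring how base_recalibration itself chains print_reads. The core/disk values
# here are placeholders.
def realign_then_recalibrate(job, intervals, bam, bai, ref, ref_dict, fai, phase, mills, dbsnp, mem):
    realign = job.addChildJobFn(run_indel_realignment, intervals, bam, bai, ref, ref_dict, fai,
                                phase, mills, mem, cores=1, memory=mem, disk='25G')
    return job.addFollowOnJobFn(base_recalibration, 4, realign.rv(0), realign.rv(1), ref, ref_dict,
                                fai, dbsnp, mem, cores=4, memory=mem, disk='25G').rv()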