def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Process every BAM file, retrying unfinished ones with more RAM.

        antecedents is unpacked as (bam_node, ref_node).  The shell
        commands are built by make_commands, which is defined outside
        this block; the example command line kept below suggests they
        run GATK SplitNCigarReads (TODO confirm against make_commands).

        Returns a metadata dict with "commands" (every command issued,
        including retries) and "num_procs" (the process count used for
        the first pass).
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        START_RAM = 5  # Gb of RAM requested on the first attempt.
        MAX_RAM = 64   # maximum amount of ram to use in Gb.

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # One (in_filename, log_filename, out_filename) tuple per BAM
        # file.  The output keeps the input's file name; the log uses
        # the same name with the extension replaced by ".log".
        jobs = []
        for bam_filename in bam_filenames:
            basename = os.path.basename(bam_filename)
            stem = os.path.splitext(basename)[0]
            jobs.append((
                bam_filename,
                os.path.join(out_path, "%s.log" % stem),
                os.path.join(out_path, basename),
            ))

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
        #   -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
        #   -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
        #   -U ALLOW_N_CIGAR_READS

        # First pass with the small RAM allowance.
        first_commands = make_commands(jobs, ref.fasta_file_full, START_RAM)
        first_procs = mlib.calc_max_procs_from_ram(
            START_RAM, upper_max=num_cores)
        parallel.pshell(first_commands, max_procs=first_procs)
        metadata["commands"] = first_commands
        metadata["num_procs"] = first_procs

        # Second pass, with MAX_RAM, for every job whose output file is
        # still missing or empty.
        unfinished = [job for job in jobs if not filelib.exists_nz(job[-1])]
        if unfinished:
            retry_commands = make_commands(
                unfinished, ref.fasta_file_full, MAX_RAM)
            retry_procs = mlib.calc_max_procs_from_ram(
                MAX_RAM, upper_max=num_cores)
            parallel.pshell(retry_commands, max_procs=retry_procs)
            metadata["commands"] += retry_commands

        # Every job, retried or not, must have produced its output.
        filelib.assert_exists_nz_many([job[-1] for job in jobs])

        return metadata
示例#2
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        """Symlink the BAM files into out_path and index them with samtools.

        in_data.identifier must be an existing directory.  Every file
        in it ending in ".bam" (case insensitive) is symlinked into
        out_path under the same name, and "samtools index" is then run
        on each symlink.

        Returns a metadata dict with "tool" and "commands".
        """
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bam_path = in_data.identifier
        assert os.path.exists(bam_path)
        assert os.path.isdir(bam_path)
        filelib.safe_mkdir(out_path)

        metadata = {"tool": "samtools %s" % alignlib.get_samtools_version()}

        # Find all the BAM files.
        bam_filenames = filelib.list_files_in_path(
            bam_path, endswith=".bam", case_insensitive=True)

        # Pair each BAM file with its destination in out_path.  All the
        # destinations are checked before any symlink is created.
        links = []  # list of (source, target)
        for source in bam_filenames:
            target = os.path.join(out_path, os.path.basename(source))
            assert not os.path.exists(target)
            links.append((source, target))

        # Symlink the BAM files to the output path.
        for source, target in links:
            os.symlink(source, target)

        # Index each of the symlinked files.
        quote = parallel.quote
        samtools = filelib.which_assert(config.samtools)
        commands = [
            " ".join([quote(samtools), "index", quote(target)])
            for _, target in links]
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # TODO: Check for output files.

        return metadata
示例#3
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Reorder BAM files with Picard ReorderSam.

        antecedents is unpacked as (bam_node, ref_node).  For each BAM
        file found under bam_node.identifier, one ReorderSam command is
        run that writes a file of the same name into out_path.  After
        the commands finish, each expected output is checked with
        filelib.assert_exists_nz.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents

        in_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
        #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
        picard_jar = alignlib.find_picard_jar("picard")

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in in_filenames:
            out_filename = os.path.join(
                out_path, os.path.basename(in_filename))
            jobs.append((in_filename, out_filename))

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for in_filename, out_filename in jobs:
            cmd = [
                "java",
                "-Xmx5g",
                "-jar",
                sq(picard_jar),
                "ReorderSam",
                "I=%s" % sq(in_filename),
                "O=%s" % sq(out_filename),
                # Quote the reference like the other paths so that a
                # path containing spaces or shell metacharacters does
                # not break the command line.
                "REFERENCE=%s" % sq(ref.fasta_file_full),
            ]
            commands.append(" ".join(cmd))

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for in_filename, out_filename in jobs:
            filelib.assert_exists_nz(out_filename)
示例#4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run "samtools mpileup" on every BAM file.

        antecedents is unpacked as (bam_node, ref_node).  For each BAM
        file, stdout of mpileup is redirected to
        <out_path>/<sample>.pileup and stderr to <out_path>/<sample>.log,
        where <sample> is the BAM file name without its extension.

        Returns a metadata dict with "tool", "num_cores" and "commands".
        """
        import os
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            sample = os.path.splitext(os.path.basename(in_filename))[0]
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            jobs.append((in_filename, err_filename, out_filename))

        # samtools mpileup -f [reference sequence] [BAM file(s)]
        #   > myData.mpileup
        samtools = mlib.findbin("samtools")
        sq = mlib.sq
        commands = []
        for in_filename, err_filename, out_filename in jobs:
            cmd = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
                sq(in_filename),
            ]
            cmd = " ".join(map(str, cmd))
            # Quote the redirect targets too: they are derived from
            # out_path and the sample name, either of which may contain
            # spaces or other shell metacharacters.
            cmd = "%s 2> %s 1> %s" % (cmd, sq(err_filename), sq(out_filename))
            commands.append(cmd)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure every pileup file was produced.
        out_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(out_filenames)

        return metadata
示例#5
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Convert every SAM file to BAM with "samtools view -bS".

        Each <name>.sam found under in_data.identifier is converted to
        <out_path>/<name>.bam.

        Returns a metadata dict with "commands" and "num_cores".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        SAM_EXT = ".sam"
        BAM_EXT = ".bam"

        sam_filenames = mlib.find_sam_files(in_data.identifier)
        assert sam_filenames, "No .sam files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        samtools = mlib.findbin("samtools")

        jobs = []  # list of (sam_filename, bam_filename)
        for sam_filename in sam_filenames:
            f = os.path.basename(sam_filename)
            assert f.endswith(SAM_EXT)
            # Swap only the trailing extension.  str.replace would also
            # rewrite any earlier ".sam" in the name, e.g.
            # "x.sample.sam" -> "x.bample.bam".
            f = f[:-len(SAM_EXT)] + BAM_EXT
            bam_filename = os.path.join(out_path, f)
            jobs.append((sam_filename, bam_filename))

        # Make a list of samtools commands.
        sq = parallel.quote
        commands = []
        for sam_filename, bam_filename in jobs:
            # samtools view -bS -o <bam_filename> <sam_filename>
            cmd = [
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(bam_filename),
                sq(sam_filename),
            ]
            commands.append(" ".join(cmd))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        bam_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(bam_filenames)
        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align FASTQ files with bowtie2, one command per sample.

        antecedents is unpacked as (fastq_node, sample_node,
        reference_node).  For each sample the alignment is written to
        <out_path>/<sample>.sam and the command's output is redirected
        to <out_path>/<sample>.log.

        Returns a metadata dict with "tool", "commands" and "num_cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Make a list of the jobs to run.
        # list of (sample, pair1, pair2, sam_filename, log_filename)
        jobs = []
        for sample, pair1, pair2 in fastq_files:
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            jobs.append((sample, pair1, pair2, sam_filename, log_filename))

        # Threads per job.  Floor division keeps this an integer even
        # under true division (Python 3 or "from __future__ import
        # division").  The inner max() guards the hoisted computation
        # against an empty job list.
        nc = max(1, num_cores // max(1, len(jobs)))

        sq = mlib.sq
        commands = []
        for sample, pair1, pair2, sam_filename, log_filename in jobs:
            cmd = alignlib.make_bowtie2_command(ref.fasta_file_full,
                                                pair1,
                                                fastq_file2=pair2,
                                                sam_file=sam_filename,
                                                num_threads=nc)
            cmd = "%s >& %s" % (cmd, sq(log_filename))
            commands.append(cmd)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        sam_filenames = [job[-2] for job in jobs]
        filelib.assert_exists_nz_many(sam_filenames)

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Annotate every .vcf file under in_data.identifier with Annovar.

        The commands are built by alignlib.make_annovar_command.  For an
        input <stem>.vcf the log goes to <out_path>/<stem>.log, and the
        file <out_path>/<stem>.<buildver>_multianno.vcf is expected to
        exist afterwards.  buildver comes from the "buildver" user
        option; only "hg19" is allowed.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        filelib.safe_mkdir(out_path)

        buildver = module_utils.get_user_option(user_options,
                                                "buildver",
                                                allowed_values=["hg19"],
                                                not_empty=True)

        # list of (in_filename, log_filename, out_filestem)
        # Annovar takes a filestem, without the ".vcf".
        jobs = []
        for vcf_filename in vcf_filenames:
            stem = os.path.splitext(os.path.basename(vcf_filename))[0]
            jobs.append((
                vcf_filename,
                os.path.join(out_path, "%s.log" % stem),
                os.path.join(out_path, stem),
            ))

        # One Annovar command per job.
        commands = [
            alignlib.make_annovar_command(
                in_filename, log_filename, out_filestem, buildver)
            for (in_filename, log_filename, out_filestem) in jobs]

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        expected = [
            "%s.%s_multianno.vcf" % (out_filestem, buildver)
            for (_, _, out_filestem) in jobs]
        filelib.assert_exists_nz_many(expected)
示例#8
0
def run_many_pybinreg(jobs, num_procs):
    """Run a batch of pybinreg commands in parallel, then check each one.

    jobs should be a list of (cmd, outpath, outfile).  outpath is
    created if it does not exist, and the output of cmd is redirected
    to outfile.  Once all the commands have run, check_pybinreg is
    called for each job.
    """
    from genomicode import parallel

    commands = []
    for cmd, outpath, outfile in jobs:
        if not os.path.exists(outpath):
            os.mkdir(outpath)
        commands.append("%s >& %s" % (cmd, outfile))

    parallel.pshell(commands, max_procs=num_procs)

    for cmd, outpath, outfile in jobs:
        check_pybinreg(outpath, outfile, cmd)
示例#9
0
def _run_genomecov(jobs, reference_file, num_cores):
    """Run one bedtools genomecov command per job and return the commands.

    Each job must provide bam_filename and genomecov_filename
    attributes.  The commands are built by
    ngslib.make_bedtools_genomecov_command and run with at most
    num_cores processes.  Afterwards every genomecov_filename is
    checked with filelib.assert_exists_nz_many.
    """
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import ngslib

    # Set up the commands to run.
    commands = [
        ngslib.make_bedtools_genomecov_command(
            job.bam_filename, reference_file, job.genomecov_filename)
        for job in jobs]
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job.genomecov_filename for job in jobs])

    return commands
示例#10
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Run FastQC on every FASTQ file found under in_data.identifier.

        Results are written to out_path with --extract.  Afterwards,
        for every entry in out_path that has a sibling "<entry>.zip",
        the .zip file is deleted.

        Returns a metadata dict with "commands" and "num_cores".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "FASTQ files not found: %s" % in_data.identifier
        filelib.safe_mkdir(out_path)
        metadata = {}

        fastqc = mlib.findbin("fastqc")
        sq = parallel.quote

        # Quote the output directory and the input files as well as the
        # executable, so that paths with spaces or shell metacharacters
        # do not break the command line.
        commands = [
            "%s --outdir=%s --extract %s" % (
                sq(fastqc), sq(out_path), sq(fastq_filename))
            for fastq_filename in filenames
        ]
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Fastqc generates files:
        # <file>_fastqc/
        # <file>_fastqc.zip
        # The contents of the .zip file are identical to the directories.
        # If this happens, then delete the .zip files because they are
        # redundant.
        for entry in os.listdir(out_path):
            zip_filename = "%s.zip" % os.path.join(out_path, entry)
            if os.path.exists(zip_filename):
                os.unlink(zip_filename)

        # The metadata was being collected but then dropped; return it
        # like the sibling modules do.
        return metadata
示例#11
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Call variants with MuTect on each normal/cancer pair.

        antecedents is unpacked as (bam_node, nc_node, ref_node,
        interval_node).  For each (normal_sample, cancer_sample) pair
        read from nc_node, one MuTect command is run.  Its output is
        redirected to <out_path>/<sample>.log, where <sample> is taken
        from the cancer BAM file name.  The logs are then scanned for
        "##### ERROR" lines, the raw VCF files are checked for
        existence, and alignlib.clean_mutect_vcf writes the final
        <out_path>/<sample>.vcf.

        Requires the user options "mutect_cosmic_vcf" and
        "mutect_dbsnp_vcf".

        Returns a metadata dict with "num_cores" and "commands".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        _, _, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #    cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
        #    vcf_outfile, log_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            call_outfile = opj(out_path, "%s.call_stats.out" % sample)
            cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile
            jobs.append(x)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x

            # These option names contain ":" and so cannot be passed as
            # keyword arguments; they are handed over as a list of
            # pairs instead.
            UNHASHABLE = [
                ("input_file:normal", sq(normal_bamfile)),
                ("input_file:tumor", sq(cancer_bamfile)),
                ]
            x = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(call_outfile),
                coverage_file=sq(cov_outfile),
                vcf=sq(raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            # Quote the log file like every other path in the command.
            x = "%s >& %s" % (x, sq(log_outfile))
            commands.append(x)
        assert len(commands) == len(jobs)
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            # Pull out the error lines.  The context manager closes the
            # log file even when the assertion below fires.
            with open(log_outfile) as handle:
                error_lines = [
                    line for line in handle
                    if line.startswith("##### ERROR")]
            errors = "".join(error_lines)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                cancer_sample, commands[i], errors)
            assert not errors, msg

        # Make sure output VCF files exist.  Index 6 is raw_vcf_outfile.
        x = [x[6] for x in jobs]
        filelib.assert_exists_many(x)

        # Fix the files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            alignlib.clean_mutect_vcf(
                normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
                raw_vcf_outfile, vcf_outfile)

        return metadata
示例#12
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run "bwa aln" on every FASTQ file, producing one .sai file each.

        antecedents is unpacked as (fastq_node, group_node,
        reference_node).  Both mates of a paired sample are aligned as
        separate jobs.  For an input <name>.fq or <name>.fastq
        (case-insensitive suffix), the outputs are <out_path>/<name>.sai
        and <out_path>/<name>.log; any other file name is used
        unchanged as <name>.

        Returns a metadata dict with "tool", "commands" and "num cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, group_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(group_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "No FASTQ files found."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

        # Make sure no duplicate samples.
        samples = [x[0] for x in fastq_files]
        assert len(samples) == len(set(samples)), "dup sample"

        # Make a list of all FASTQ files to align.
        fastq_filenames = []
        for sample, pair1, pair2 in fastq_files:
            assert pair1
            fastq_filenames.append(pair1)
            if pair2:
                fastq_filenames.append(pair2)

        # Make a list of all the jobs to do.
        jobs = []  # list of (fastq_filename, sai_filename, log_filename)
        for in_filename in fastq_filenames:
            stem = os.path.basename(in_filename)
            if stem.lower().endswith(".fq"):
                stem = stem[:-3]
            elif stem.lower().endswith(".fastq"):
                stem = stem[:-6]
            sai_filename = os.path.join(out_path, "%s.sai" % stem)
            log_filename = os.path.join(out_path, "%s.log" % stem)
            jobs.append((in_filename, sai_filename, log_filename))

        # Calculate the number of threads per job.  Floor division
        # keeps this an integer even under true division (Python 3 or
        # "from __future__ import division").  jobs cannot be empty
        # here: fastq_files is non-empty and every entry has a pair1.
        nc = max(1, num_cores // len(jobs))

        # Make the bwa commands.
        commands = []
        for fastq_filename, sai_filename, log_filename in jobs:
            cmd = alignlib.make_bwa_aln_command(ref.fasta_file_full,
                                                fastq_filename,
                                                sai_filename,
                                                log_filename,
                                                num_threads=nc)
            commands.append(cmd)
        metadata["commands"] = commands
        # NOTE(review): this key is "num cores" (with a space); kept
        # as-is in case something downstream reads it.
        metadata["num cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for in_filename, sai_filename, log_filename in jobs:
            assert filelib.exists_nz(sai_filename), \
                   "Missing: %s" % sai_filename
        return metadata
示例#13
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        ## Make sure the BAM files do not contain spaces in the
        ## filenames.  Radia doesn't work well with spaces.
        #filenames = dna_bam_filenames + rna_bam_filenames
        #has_spaces = []
        #for filename in filenames:
        #    if filename.find(" ") >= 0:
        #        has_spaces.append(filename)
        #x = has_spaces
        #if len(x) > 5:
        #    x = x[:5] + ["..."]
        #x = ", ".join(x)
        #msg = "Radia breaks if there are spaces in filenames: %s" % x
        #assert not has_spaces, msg

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Pick an arbitrarily
        # BAM file.  Look at only the chromosomes that are present in
        # all files.
        all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"

        if not os.path.exists(radia_outpath):
            os.mkdir(radia_outpath)
        if not os.path.exists(filter_outpath):
            os.mkdir(filter_outpath)
        if not os.path.exists(merge_outpath):
            os.mkdir(merge_outpath)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        opj = os.path.join
        jobs = []
        for i, (normal_sample, cancer_sample) in enumerate(nc_match):
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        if not os.path.exists(normal_path):
            os.mkdir(normal_path)
        if not os.path.exists(dna_path):
            os.mkdir(dna_path)
        if not os.path.exists(rna_path):
            os.mkdir(rna_path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
            x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            clean_normal, clean_dna, clean_rna = x1, x2, x3
            x = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs[i] = x

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add it to the path.
            x = mlib.get_config("blat", which_assert_file=True)
            path, x = os.path.split(x)
            if os.environ["PATH"]:
                path = "%s:%s" % (os.environ["PATH"], path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        # -n bam04/PIM001_G.bam \
        # -t bam04/196B-MG.bam \
        # -r bam34/196B-MG.bam \
        # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        # -o test32.vcf
        # --dnaTumorMitochon MT \
        # --rnaTumorMitochon MT \
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Only uses ~200 Mb of ram.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure log files are empty.
        logfiles = [x[10] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.
        assert len(commands) == len(jobs)
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Takes ~10 Gb each.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[11] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # Make sure filter_vcf_outfile exists.
        outfiles = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # this will generate one merge for each chromosome.  This is
        # unnecessary, since we only need to merge once per sample.
        # Get rid of duplicates.
        commands = sorted({}.fromkeys(commands))
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[12] for x in jobs]
        logfiles = sorted({}.fromkeys(logfiles))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure output VCF files exist.
        x = [x[9] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
示例#14
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        """Convert every BAM file found under in_data to FASTA.

        One bam2fastx command is built per BAM file, and the commands
        are run in parallel on up to num_cores processes.  The output
        file is <out_path>/<root>.fa, where <root> is the middle
        element returned by mlib.splitpath for the BAM file.

        Returns a metadata dict with the keys "tool", "commands" (the
        shell commands that were run) and "num_cores".  Before
        returning, every expected FASTA file is checked with
        filelib.assert_exists_nz_many.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from Betsy import module_utils as mlib

        bam_filenames = mlib.find_bam_files(in_data.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bam2fastx (unknown version)"

        # NOTE: bam2fastx was once seen to fail on filenames that
        # contain spaces (it may not have been bam2fastx's fault).
        # An earlier version of this module worked around that with
        # temporary space-free symlinks; that workaround is disabled,
        # and the filenames are shell-quoted instead.

        jobs = []  # list of GenericObject(bam_filename, fa_filename)
        for bam_filename in bam_filenames:
            p, f, e = mlib.splitpath(bam_filename)
            fa_filename = os.path.join(out_path, "%s.fa" % f)
            x = filelib.GenericObject(
                bam_filename=bam_filename,
                fa_filename=fa_filename)
            jobs.append(x)
        bam2fastx = mlib.findbin("bam2fastx")

        commands = []
        for j in jobs:
            # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
            x = [
                mlib.sq(bam2fastx),
                "-A",
                "--fasta",
                "-o", mlib.sq(j.fa_filename),
                mlib.sq(j.bam_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Check the output of every job.  (This used to read the
        # leftover loop variable, so only the last job's file was
        # actually checked.)
        fa_filenames = [j.fa_filename for j in jobs]
        filelib.assert_exists_nz_many(fa_filenames)

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Call SNPs or indels with VarScan on each .pileup file.

        For every file ending in ".pileup" under in_data.identifier:
          1. awk drops the rows whose 4th column is 0 (VarScan raises
             a "Parsing Exception" on them) -> <sample>.tmp1
          2. VarScan mpileup2snp / mpileup2indel (chosen from
             out_attributes["vartype"]) -> <sample>.tmp2
          3. alignlib.clean_varscan_vcf -> <out_path>/<sample>.vcf
        The two temporary files are removed at the end.

        All file paths placed on a shell command line are quoted, so
        that paths containing spaces do not break the commands.

        This method has no return statement, so it returns None.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import config
        from Betsy import module_utils as mlib

        mpileup_node = in_data
        mpileup_filenames = filelib.list_files_in_path(
            mpileup_node.identifier, endswith=".pileup")
        assert mpileup_filenames, "No .pileup files."
        filelib.safe_mkdir(out_path)

        # Choose the VarScan subcommand from the requested variant
        # type.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        tool = "mpileup2snp"
        if vartype == "indel":
            tool = "mpileup2indel"

        # list of (sample, in_filename, tmp1_filename, tmp2_filename,
        #          out_filename)
        jobs = []
        for in_filename in mpileup_filenames:
            p, sample, ext = mlib.splitpath(in_filename)
            tmp1_filename = os.path.join(out_path, "%s.tmp1" % sample)
            tmp2_filename = os.path.join(out_path, "%s.tmp2" % sample)
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, tmp1_filename, tmp2_filename, out_filename
            jobs.append(x)

        # VarScan will generate a "Parsing Exception" if there are 0
        # reads in a location.  Filter those out.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = "awk -F'\t' '$4 != 0 {print}' %s > %s" % (
                sq(in_filename), sq(tmp1_filename))
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)
        x = [x[2] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # java -jar /usr/local/bin/VarScan.jar <tool> $i --output_vcf 1 > $j
        varscan = filelib.which_assert(config.varscan_jar)

        # Make a list of commands.
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = [
                "java",
                "-jar",
                sq(varscan),
                tool,
                sq(tmp1_filename),
                "--p-value",
                0.05,
                "--output-vcf",
                1,
            ]
            x = " ".join(map(str, x))
            # ">&" sends both stdout and stderr to tmp2; the extra
            # lines are removed by clean_varscan_vcf below.
            x = "%s >& %s" % (x, sq(tmp2_filename))
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)
        x = [x[3] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Clean up the VCF files.  VarScan leaves extraneous lines
        # there.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            alignlib.clean_varscan_vcf(sample, tmp2_filename, out_filename)
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # The tmp files are really big.  Don't save those.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            filelib.safe_unlink(tmp1_filename)
            filelib.safe_unlink(tmp2_filename)
示例#16
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with MuSE for each normal/cancer pair.

        antecedents is (bam_node, nc_node, ref_node): the BAM files,
        the normal/cancer sample pairing, and the reference genome.

        For every (normal_sample, cancer_sample) pair:
          1. "MuSE call" writes <cancer_sample>.call.MuSE.txt
          2. "MuSE sump" (-G for "wgs", -E for "wes") writes
             <cancer_sample>.vcf.raw, using the muse_dbsnp_vcf file
          3. alignlib.clean_muse_vcf writes <cancer_sample>.vcf
        The log file of each MuSE step must be empty, and is deleted
        at the end.

        All file paths placed on a shell command line are quoted, so
        that paths containing spaces do not break the commands.

        Returns a metadata dict with the keys "tool", "num_cores" and
        "commands".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        x = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #   muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
        #   logfile1, logfile2)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            muse_call_file = "%s.MuSE.txt" % muse_call_stem
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
            vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
            log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
            log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2
            jobs.append(x)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            # The tumor BAM is given before the normal BAM.
            x = [
                sq(MuSE),
                "call",
                "-O",
                sq(muse_call_stem),
                "-f",
                sq(ref.fasta_file_full),
                sq(cancer_bamfile),
                sq(normal_bamfile),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, sq(log_outfile1))
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[8] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the call files are created and not empty.
        call_files = [x[5] for x in jobs]
        filelib.assert_exists_nz_many(call_files)

        # Run the "sump" step.  The flag depends only on the user
        # option, so choose it once.
        assert wgs_or_wes in ["wgs", "wes"]
        if wgs_or_wes == "wgs":
            sump_flag = "-G"
        else:
            sump_flag = "-E"
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "sump",
                "-I",
                sq(muse_call_file),
                sump_flag,
                "-O",
                sq(raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, sq(log_outfile2))
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[9] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the raw files are created and not empty.
        vcf_files = [x[6] for x in jobs]
        filelib.assert_exists_nz_many(vcf_files)

        # Fix the files.
        commands = []  # Should be python commands.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
            x = alignlib.clean_muse_vcf, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Delete the log files.  Both sets were already checked to be
        # empty above, so nothing is lost.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            if os.path.exists(log_outfile1):
                os.unlink(log_outfile1)
            if os.path.exists(log_outfile2):
                os.unlink(log_outfile2)

        # Make sure output VCF files exist.
        x = [x[7] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
示例#17
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Run Picard CollectAlignmentSummaryMetrics on each BAM file.

        antecedents is (bam_node, ref_node).  For every BAM file
        <sample>.bam, Picard writes
        <out_path>/<sample>.alignment_metrics.txt, and its console
        output goes to <out_path>/<sample>.log.  Afterwards
        _summarize_alignment_summary_metrics is called on all the jobs
        with <out_path>/summary.txt as its output file.

        Returns a metadata dict whose "commands" entry lists the shell
        commands that were run.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # java -jar picard.jar CollectAlignmentSummaryMetrics \
        #   R=reference_sequence.fasta \
        #   I=input.bam \
        #   O=output.txt

        # One GenericObject per BAM file, named <in_path>/<sample>.bam.
        jobs = []
        for bam_filename in bam_filenames:
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            metrics_name = "%s.alignment_metrics.txt" % sample
            log_name = "%s.log" % sample
            job = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                out_filename=os.path.join(out_path, metrics_name),
                log_filename=os.path.join(out_path, log_name))
            jobs.append(job)

        # Build one shell command per job.
        picard_jar = alignlib.find_picard_jar("picard")
        quote = parallel.quote
        commands = []
        for job in jobs:
            # Should have better way of getting java path.
            parts = ["java", "-Xmx10g", "-jar", quote(picard_jar)]
            parts.append("CollectAlignmentSummaryMetrics")
            parts.append("I=%s" % quote(job.bam_filename))
            parts.append("R=%s" % quote(ref.fasta_file_full))
            parts.append("O=%s" % quote(job.out_filename))
            picard_cmd = " ".join(parts)
            commands.append(
                "%s >& %s" % (picard_cmd, quote(job.log_filename)))

        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)
        metrics_files = [job.out_filename for job in jobs]
        filelib.assert_exists_nz_many(metrics_files)

        # Combine the per-sample metrics files into a single summary.
        summary_file = os.path.join(out_path, "summary.txt")
        _summarize_alignment_summary_metrics(jobs, summary_file)
        filelib.assert_exists_nz(summary_file)

        return metadata
示例#18
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run RSEM on the merged FASTQ files of each sample.

        antecedents is (fastq_node, sample_node, strand_node,
        reference_node).  out_attributes["align_to"] must be "genome"
        or "transcriptome" and controls whether a genome BAM file is
        requested from RSEM.

        RSEM does not work right if the sample name contains a space,
        so each sample (and each FASTQ file, via a symlink in
        out_path) is given a hashed name for the run.  Afterwards the
        output files are renamed back to the original sample name and
        the symlinks are removed.

        Single-end samples (no second FASTQ file) are supported: the
        second hashed filename is None for them.

        Returns a metadata dict with the keys "tool", "commands" and
        "num cores".
        """
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            # Reset for every sample.  Otherwise a single-end sample
            # raises a NameError, or silently reuses the second file
            # of the previous sample.
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.  A missing second file
        # (None) is not a name, so it is left out of the check.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            if j.pair2_h is not None:
                assert j.pair2_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            if j.pair2_h is not None:
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        # If using the STAR aligner, then most memory efficient way is
        # to let STAR take care of the multiprocessing: run one job at
        # a time and give it all the cores.  Otherwise, split the
        # cores among the jobs (floor division keeps this an integer).
        if use_STAR:
            threads_per_job = num_cores
        else:
            threads_per_job = max(1, num_cores // len(jobs))

        keywds = {}
        if use_STAR:
            keywds["align_with_star"] = True
        else:
            keywds["align_with_bowtie2"] = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # NOTE(review): for single-end samples fastq_file2 is
            # None; this assumes make_rsem_command treats None as "no
            # second file" -- confirm.
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=threads_per_job,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            if j.pair2_h is not None:
                filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
def get_paired_orientation_bowtie2(
    reference_genome, filename1, filename2, outpath=None):
    """Guess the orientation of paired-end reads by trial alignment.

    Runs bowtie2 on the pair of FASTQ files once for each orientation
    ("ff", "fr", "rf") and once with no orientation given, limiting
    each run to NUM_READS reads (passed as max_reads), and picks the
    orientation whose log reports the most concordant reads.

    reference_genome, filename1 and filename2 are handed unchanged to
    alignlib.make_bowtie2_command.  If outpath is None, the SAM and
    log files go into a temporary directory created under the current
    directory and removed before returning; otherwise they are written
    to outpath and left there.

    Returns a tuple of (orientation, reads_ns, reads_fr, reads_rf,
    reads_ff), where orientation is "ff", "fr", or "rf" and each
    reads_* value is the "concordant_reads" entry parsed from the
    corresponding bowtie2 log.

    """
    import os
    import shutil
    import tempfile
    from genomicode import alignlib
    from genomicode import parallel

    # Strategy: run bowtie2 in all orientations.  Return the one with
    # most reads aligned.  Do with a subset of the data, so this
    # doesn't take a long time.

    # 100 is too low.  Gave the wrong result (fr instead of rf) on the
    # Thunderbolts miSEQ data.  Minimum number that gives right answer
    # is 250.
    # NUM_READS  Time (s)  1 core
    #     50       2.4
    #    100       2.4
    #    250       2.5
    #    500       2.7
    #   1000       2.9
    #   2000       3.3
    #   5000       4.6
    #  10000       7.8
    # 100000      52.6
    NUM_READS = 1000

    # One bowtie2 run per entry: (file label, orientation argument).
    # "ns" is the run where no orientation is specified.
    RUNS = [("ff", "ff"), ("fr", "fr"), ("rf", "rf"), ("ns", None)]

    sq = parallel.quote
    path = outpath   # where to write the results
    tempdir = None   # temporary directory to be deleted
    reads = {}       # file label -> number of concordant reads
    try:
        # If outpath is None, then put everything into a temporary
        # directory.
        if path is None:
            tempdir = tempfile.mkdtemp(dir=".")
            path = tempdir

        commands = []
        log_files = []
        for label, orientation in RUNS:
            sam_file = os.path.join(path, "orient_%s.sam" % label)
            log_file = os.path.join(path, "orient_%s.log" % label)
            # Each alignment is single threaded; the four runs are
            # started together by pshell below.
            cmd = alignlib.make_bowtie2_command(
                reference_genome, fastq_file1=filename1,
                fastq_file2=filename2, sam_file=sam_file,
                orientation=orientation, max_reads=NUM_READS,
                num_threads=1)
            # Quote the log path so that an outpath containing spaces
            # does not break the redirection.
            commands.append("%s >& %s" % (cmd, sq(log_file)))
            log_files.append(log_file)

        parallel.pshell(commands)

        # Read the results while the log files still exist.
        for (label, orientation), log_file in zip(RUNS, log_files):
            output = alignlib.parse_bowtie2_output(log_file)
            reads[label] = output["concordant_reads"]
    finally:
        if tempdir is not None and os.path.exists(tempdir):
            shutil.rmtree(tempdir)

    reads_ff = reads["ff"]
    reads_fr = reads["fr"]
    reads_rf = reads["rf"]
    reads_ns = reads["ns"]
    assert type(reads_ff) is type(0)

    # Highest read count wins.  On a tie the tuples compare on the
    # orientation name, exactly as a descending sort would.
    best = max([
        (reads_ff, "ff"),
        (reads_fr, "fr"),
        (reads_rf, "rf"),
        ])

    # The unstranded count is reported but never selected.
    return best[-1], reads_ns, reads_fr, reads_rf, reads_ff
示例#20
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run GATK HaplotypeCaller on every BAM file of the input node.

        antecedents is a (bam_node, ref_node) pair.  For each .bam file
        found under bam_node.identifier, a <sample>.vcf and a
        <sample>.log file are written to out_path.  A sample whose VCF
        file already passes filelib.exists_nz is not run again.

        Returns the metadata dictionary, which this module leaves
        empty.  Raises AssertionError if no BAM files are found.

        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # One job per BAM file, named after the file minus extension.
        jobs = []
        for bam_filename in bam_filenames:
            basename = os.path.split(bam_filename)[1]
            sample = os.path.splitext(basename)[0]
            job = filelib.GenericObject(
                bam_filename=bam_filename,
                vcf_outfile=os.path.join(out_path, "%s.vcf" % sample),
                log_filename=os.path.join(out_path, "%s.log" % sample))
            jobs.append(job)

        # Equivalent command line:
        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
        #   -T HaplotypeCaller -R ucsc.hg19.fasta
        #   -dontUseSoftClippedBases -stand_call_conf 20.0
        #   -stand_emit_conf 20.0 -I $i -o $j
        commands = []
        for job in jobs:
            # For debugging.  Only build a command when the output is
            # not there yet.
            if not filelib.exists_nz(job.vcf_outfile):
                gatk = alignlib.make_GATK_command(
                    T="HaplotypeCaller",
                    R=ref.fasta_file_full,
                    dontUseSoftClippedBases=None,
                    stand_call_conf=20.0,
                    stand_emit_conf=20.0,
                    I=job.bam_filename,
                    o=job.vcf_outfile)
                commands.append("%s >& %s" % (gatk, job.log_filename))

        parallel.pshell(commands, max_procs=num_cores)

        # Every sample must have a VCF file, whether it was produced
        # now or found from an earlier run.
        vcf_outfiles = [job.vcf_outfile for job in jobs]
        filelib.assert_exists_nz_many(vcf_outfiles)

        return metadata
示例#21
0
def main():
    import os
    import shutil
    import argparse
    from genomicode import filelib
    from genomicode import parallel

    p = filelib.tswrite
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("mapability_file", help="PeakSeq mapability file.")
    parser.add_argument("treatment_bam", help="BAM file of treated sample.")
    parser.add_argument("control_bam", help="BAM file of background sample.")
    parser.add_argument("outpath", help="Directory to store the results.")

    parser.add_argument("--experiment_name", help="Name of experiment.")
    parser.add_argument("--fragment_length", type=int, help="")
    #group.add_argument(
    #    "--noclobber", action="store_true",
    #    help="Don't overwrite files if they already exist.")

    args = parser.parse_args()
    filelib.assert_exists_nz(args.mapability_file)
    filelib.assert_exists_nz(args.treatment_bam)
    filelib.assert_exists_nz(args.control_bam)

    if args.fragment_length:
        assert args.fragment_length > 0 and args.fragment_length < 10000

    # Set up directories to run it on.
    p("Setting up directories.\n")
    if not os.path.exists(args.outpath):
        os.mkdir(args.outpath)

    # Copy the mapability file to the outpath.
    shutil.copy2(args.mapability_file, args.outpath)

    # Do preprocessing for PeakSeq.
    p("Preprocessing.\n")
    treatment_preproc_path = os.path.join(args.outpath, "preprocess.treatment")
    control_preproc_path = os.path.join(args.outpath, "preprocess.control")
    if not os.path.exists(treatment_preproc_path):
        os.mkdir(treatment_preproc_path)
    if not os.path.exists(control_preproc_path):
        os.mkdir(control_preproc_path)
    x1 = make_peakseq_preproc_command(args.treatment_bam,
                                      treatment_preproc_path)
    x2 = make_peakseq_preproc_command(args.control_bam, control_preproc_path)
    x = parallel.pshell([x1, x2])
    print x
    # Make sure expected files exist.
    x1 = os.path.join(treatment_preproc_path, "chr_ids.txt")
    x2 = os.path.join(control_preproc_path, "chr_ids.txt")
    filelib.assert_exists_nz(x1)
    filelib.assert_exists_nz(x2)

    # Make configuration file.
    p("Making configuration file.\n")
    config_file = os.path.join(args.outpath, "config.dat")
    make_config_file(config_file,
                     treatment_preproc_path,
                     control_preproc_path,
                     args.mapability_file,
                     experiment_name=args.experiment_name,
                     fragment_length=args.fragment_length)

    # Run PeakSeq.
    p("Running PeakSeq in %s.\n" % args.outpath)
    cmd = make_peakseq_run_command(config_file)
    x = parallel.sshell(cmd, path=args.outpath)
    print x

    p("Done.\n")
示例#22
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run GATK VariantRecalibrator in SNP mode on each VCF file.

        antecedents is a (vcf_node, ref_node) pair.  For every .vcf
        file under vcf_node.identifier, the recal, tranches and R
        script file names are placed in out_path along with a log file.
        The four resource files are taken from the user options
        vcf_recal_dbsnp, vcf_recal_mills_indels, vcf_recal_1kg_indels
        and vcf_recal_omni.

        Returns nothing.  Raises AssertionError if no VCF files are
        found.

        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        vcf_node, ref_node = antecedents
        vcf_filenames = filelib.list_files_in_path(
            vcf_node.identifier, endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # list of (in_filename, log_filename, recal_filename,
        #          tranches_filename, rscript_filename)
        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            stem = os.path.splitext(os.path.split(in_filename)[1])[0]
            out_filename = opj(out_path, "%s.grp" % stem)
            assert in_filename != out_filename
            job = (
                in_filename,
                opj(out_path, "%s.log" % stem),
                opj(out_path, "%s.recalibrate_SNP.recal" % stem),
                opj(out_path, "%s.recalibrate_SNP.tranches" % stem),
                opj(out_path, "%s.recalibrate_SNP_plots.R" % stem),
            )
            jobs.append(job)

        # Each resource is given to GATK as, for example:
        # -resource:dbsnp,known=true,training=false,truth=false,prior=6.0
        #    dbsnp_135.b37.vcf
        # -resource:hapmap,known=false,training=true,truth=true,prior=15.0
        #    hapmap_3.3.b37.sites.vcf
        # -resource:1000G,known=false,training=true,truth=false,prior=10.0
        #    1000G_phase1.snps.high_confidence.vcf
        # -resource:omni,known=false,training=true,truth=false,prior=12.0
        #    1000G_omni2.5.b37.sites.vcf
        #
        # list of (user option holding the file, resource argument)
        RESOURCES = [
            ("vcf_recal_dbsnp",
             "resource:dbsnp,known=true,training=false,truth=false,prior=6.0"),
            ("vcf_recal_mills_indels",
             "resource:hapmap,known=false,training=true,truth=true,prior=15.0"),
            ("vcf_recal_1kg_indels",
             "resource:1000G,known=false,training=true,truth=false,prior=10.0"),
            ("vcf_recal_omni",
             "resource:omni,known=false,training=true,truth=false,prior=12.0"),
        ]
        known_sites = []
        for option_name, resource in RESOURCES:
            resource_file = module_utils.get_user_option(
                user_options, option_name, not_empty=True, check_file=True)
            known_sites.append((resource, resource_file))

        # Names of the annotations passed with "an".
        AN = [
            "DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum",
            "InbreedingCoeff"
        ]
        TRANCHE = ["100.0", "99.9", "99.0", "90.0"]

        # Make a list of commands.
        commands = []
        for job in jobs:
            (in_filename, log_filename, recal_filename, tranches_filename,
             rscript_filename) = job
            # Arguments that occur more than once on the command line
            # go through _UNHASHABLE as (name, value) pairs.
            repeated = list(known_sites)
            repeated.extend(("an", name) for name in AN)
            repeated.extend(("tranche", level) for level in TRANCHE)
            gatk = alignlib.make_GATK_command(
                T="VariantRecalibrator",
                R=ref.fasta_file_full,
                input=in_filename,
                mode="SNP",
                recalFile=recal_filename,
                tranchesFile=tranches_filename,
                rscriptFile=rscript_filename,
                _UNHASHABLE=repeated)
            commands.append("%s >& %s" % (gatk, log_filename))

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.  The file
        # checked is the last entry of each job, the R script.
        rscript_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(rscript_filenames)
示例#23
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run pindel on each BAM file of the input node.

        antecedents is a (bam_node, ref_node, insert_size_node,
        alignment_node) tuple.  Each BAM file (and its .bai index) is
        symlinked into a local "bam" directory under a name without
        spaces, a one-line pindel config file is written to out_path,
        and pindel is run once per sample with the output prefix
        out_path/<sample>.

        Returns a metadata dictionary with the keys "num_cores" and
        "commands".  Raises AssertionError if there are no BAM files,
        or if a sample is missing from either summary file.

        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node, insert_size_node, alignment_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # ./pindel -f <reference.fa> -i <bam_configuration_file>
        #   -c <chromosome_name> -o <out_prefix>
        #   -T <num threads>
        #
        # Creates files:
        # <out_prefix>_D     Deletion
        # <out_prefix>_SI    Short insertion
        # <out_prefix>_LI    Long insertion
        # <out_prefix>_INV   Inversion
        # <out_prefix>_TD    Tandem deletion
        # <out_prefix>_BP    Breakpoint
        # <out_prefix>_RP    ??? read pair???
        # <out_prefix>_CloseEndMapped   Only one end could be mapped.

        # Pindel cannot handle spaces in the BAM filenames (because of
        # the config file).  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            safe_name = sample.replace(" ", "_")
            job = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                bai_filename=bai_filename,
                local_bam=opj(bam_path, "%s.bam" % safe_name),
                local_bai=opj(bam_path, "%s.bam.bai" % safe_name),
                config_filename=opj(out_path, "%s.config.txt" % sample),
                out_prefix=opj(out_path, sample),
                log_filename=opj(out_path, "%s.log" % sample))
            jobs.append(job)

        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # Read the insert sizes.
        summary_file = opj(insert_size_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2size = _read_insert_sizes(summary_file)
        # Make sure all the samples have inserts.
        for j in jobs:
            assert j.sample in sample2size, \
                   "Missing in insert size file: %s" % j.sample

        # Read the fragment sizes.
        summary_file = opj(alignment_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2readlen = _read_fragment_sizes(summary_file)
        # Make sure all the samples have read lengths.
        for j in jobs:
            assert j.sample in sample2readlen, \
                   "Missing in alignment summary file: %s" % j.sample

        # Make the config file.
        for j in jobs:
            # <insert size> is the whole length to be sequenced, including
            # the length of the pair of reads.  Picard only counts the
            # sequence between the reads.
            size = sample2size[j.sample]
            read_length = sample2readlen[j.sample]
            insert_size = size + read_length * 2
            # The with block closes the file even if the write fails.
            with open(j.config_filename, 'w') as handle:
                handle.write(
                    "%s %s %s\n" % (j.local_bam, insert_size, j.sample))

        # Make a list of commands.
        pindel = mlib.get_config("pindel", which_assert_file=True)
        sq = parallel.quote
        commands = []
        for j in jobs:
            cmd = [
                sq(pindel),
                "-f",
                sq(ref.fasta_file_full),
                "-i",
                sq(j.config_filename),
                "-c",
                "ALL",
                "-T",
                1,
                "-o",
                sq(j.out_prefix),
            ]
            cmd = " ".join(map(str, cmd))
            # The log file shares its directory and sample name with
            # out_prefix, so it needs the same quoting.
            cmd = "%s >& %s" % (cmd, sq(j.log_filename))
            commands.append(cmd)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.  If not, try
        # to diagnose.
        log_filenames = [j.log_filename for j in jobs]
        filelib.assert_exists_nz_many(log_filenames)
        # These outputs only have to exist; they are not checked for
        # content.
        out_filenames = []
        for suffix in ["D", "SI", "LI", "INV", "TD", "BP"]:
            for j in jobs:
                out_filenames.append("%s_%s" % (j.out_prefix, suffix))
        filelib.assert_exists_many(out_filenames)

        return metadata
示例#24
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run museq's classify.py on each normal/cancer pair of BAM files.

        antecedents is a (bam_node, nc_node, ref_node) tuple.  nc_node
        names the file listing (normal_sample, cancer_sample) pairs.
        For every pair, the raw classifier output is written to
        out_path/<sample>.raw and then converted by fix_vcf_file into
        out_path/<sample>.vcf, where <sample> comes from the cancer
        BAM file name.

        Returns a metadata dictionary with the keys "num_cores" and
        "commands".  Raises AssertionError if there are no BAM files.

        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job per normal/cancer pair.
        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            job = filelib.GenericObject(
                cancer_sample=cancer_sample,
                normal_bamfile=normal_bamfile,
                cancer_bamfile=cancer_bamfile,
                orig_outfile=opj(out_path, "%s.raw" % sample),
                fix_outfile=opj(out_path, "%s.vcf" % sample))
            jobs.append(job)

        # python /usr/local/museq/classify.py \
        #   normal:test31/normal.bam tumour:test31/tumor.bam \
        #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   model:/usr/local/museq/model_v4.1.2.npz \
        #   --config /usr/local/museq/metadata.config \
        #   -o test51.vcf
        museq = mlib.get_config("museq", assert_exists=True)
        classify_py = opj(museq, "classify.py")
        model_file = opj(museq, "model_v4.1.2.npz")
        config_file = opj(museq, "metadata.config")
        filelib.assert_exists_nz(classify_py)
        filelib.assert_exists_nz(model_file)
        filelib.assert_exists_nz(config_file)

        # museq's config file generates a broken VCF file.  Fix it.
        fixed_config_file = "fixed.config"
        fix_config_file(config_file, fixed_config_file)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            x = [
                "python",  # should allow user to specify python
                sq(classify_py),
                sq("normal:%s" % j.normal_bamfile),
                sq("tumour:%s" % j.cancer_bamfile),
                sq("reference:%s" % ref.fasta_file_full),
                sq("model:%s" % model_file),
                "--config",
                sq(fixed_config_file),
                "-o",
                sq(j.orig_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.  On Thunderbolts test,
        # took < 1 Gb.
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Convert each raw output file so that it will work with
        # other programs downstream.
        for j in jobs:
            fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

        # Check the converted file of every job.  The loop variable
        # must be the one the expression reads; otherwise only the
        # last job's file would be checked, once per job.
        fix_outfiles = [j.fix_outfile for j in jobs]
        filelib.assert_exists_many(fix_outfiles)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run GATK RealignerTargetCreator on each BAM file.

        antecedents is a (bam_node, ref_node) pair.  For every .bam
        file under bam_node.identifier, a <name>.intervals file and a
        <name>.log file are written to out_path.  The known-sites
        files come from the user options realign_known_sites1,
        realign_known_sites2 and realign_known_sites3; at least one of
        them must be set.

        Returns a metadata dictionary with the keys "num_procs" and
        "commands".  Raises AssertionError if there are no BAM files
        or no known-sites files.

        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        in_filenames = mlib.find_bam_files(bam_node.identifier)
        assert in_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # list of (in_filename, log_filename, out_filename)
        for in_filename in in_filenames:
            p, f = os.path.split(in_filename)
            f, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            out_filename = os.path.join(out_path, "%s.intervals" % f)
            jobs.append((in_filename, log_filename, out_filename))

        filter_reads_with_N_cigar = mlib.get_user_option(
            user_options,
            "filter_reads_with_N_cigar",
            allowed_values=["no", "yes"])

        # Keep only the known-sites options that were actually given.
        known_site_options = [
            "realign_known_sites1",
            "realign_known_sites2",
            "realign_known_sites3",
        ]
        known_sites = []
        for option_name in known_site_options:
            filename = mlib.get_user_option(
                user_options, option_name, check_file=True)
            if filename:
                known_sites.append(filename)
        assert known_sites

        # I/O bound, so not likely to get a big speedup with nt.

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
        #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
        #   --known <known_vcf_file>

        # RealignerTargetCreator takes ~10Gb per process.  Each thread
        # takes the full amount of memory.
        nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

        # Threads per job.  This is the same for every job, so work it
        # out once.  Floor division keeps it a whole number whichever
        # way "/" behaves.  jobs is never empty here because of the
        # assertion on in_filenames above.
        num_threads = max(1, nc // len(jobs))

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for in_filename, log_filename, out_filename in jobs:
            # Arguments that can repeat are passed as (name, value)
            # pairs.  Build a fresh list for every command.
            repeated = [("-known", filename) for filename in known_sites]
            if filter_reads_with_N_cigar == "yes":
                repeated.append(("-filter_reads_with_N_cigar", None))
            cmd = alignlib.make_GATK_command(nt=num_threads,
                                             T="RealignerTargetCreator",
                                             R=ref.fasta_file_full,
                                             I=in_filename,
                                             o=out_filename,
                                             _UNHASHABLE=repeated)
            # Quote the log path so spaces in out_path do not break
            # the redirection.
            commands.append("%s >& %s" % (cmd, sq(log_filename)))

        parallel.pshell(commands, max_procs=nc)
        metadata["num_procs"] = nc
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.
        out_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(out_filenames)
        return metadata
示例#26
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Recompute the alignment tags of each BAM file with samtools calmd.

        antecedents is a (bam_node, ref_node) pair.  Every .bam file
        under bam_node.identifier is piped through "samtools calmd -b"
        against the reference FASTA.  The new BAM file is written to
        out_path under the same file name, and stderr goes to
        <name>.log next to it.

        Returns nothing.  Raises AssertionError if there are no BAM
        files or if an output file would overwrite its input.

        """
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # list of (in_filename, log_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            basename = os.path.split(in_filename)[1]
            stem = os.path.splitext(basename)[0]
            log_filename = os.path.join(out_path, "%s.log" % stem)
            out_filename = os.path.join(out_path, basename)
            assert in_filename != out_filename
            jobs.append((in_filename, log_filename, out_filename))

        # Files that already have MD tags are not skipped.  MD, NM and
        # NH are needed in summarize_alignment_cigar, so to be sure,
        # the tags are always redone.

        # Make a list of samtools commands.
        # Takes ~200 Mb per process, so should not be a big issue.
        samtools = filelib.which_assert(config.samtools)
        sq = parallel.quote
        commands = []
        for in_filename, log_filename, out_filename in jobs:
            # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>
            #
            # May generate error:
            # [bam_fillmd1] different NM for read
            #   'ST-J00106:118:H75L3BBXX:3:2128:21846:47014': 0 -> 19
            # so stderr is sent to a different file than the BAM output.
            calmd = " ".join([
                samtools,
                "calmd",
                "-b",
                sq(in_filename),
                sq(ref.fasta_file_full),
            ])
            redirect = "2> %s 1> %s" % (sq(log_filename), sq(out_filename))
            commands.append("%s %s" % (calmd, redirect))
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        out_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(out_filenames)
示例#27
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Build and run one htseq-count command per BAM file in a folder.

        antecedents is unpacked as (bam_folder, sample_node, gene_node,
        strand_node); sample_node is not used in this method.  The GTF
        file is taken from gene_node.identifier and the strandedness is
        read from strand_node.identifier.

        For every "*.bam" file (case insensitive) found under
        bam_folder.identifier, a command is created with
        alignlib.make_htseq_count_command and its output is redirected
        (with ">&") to "<bam name without extension>.count".  The
        commands are handed to parallel.pshell with path=out_path, and
        the count files are afterwards required to exist, non-empty,
        under out_path.

        Returns a dict with the keys "commands" and "num_cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_folder, sample_node, gene_node, strand_node = antecedents
        bam_path = bam_folder.identifier
        assert filelib.dir_exists(bam_path)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}

        # Map the "sorted" attribute of the BAM folder to the sort
        # order name given to make_htseq_count_command.
        sorted_attr = bam_folder.data.attributes["sorted"]
        sort_order = {"name": "name", "coordinate": "pos"}.get(sorted_attr)
        assert sort_order, "Cannot handle sorted: %s" % sorted_attr

        # Map the strandedness that was read to the value given to
        # make_htseq_count_command.  Anything else fails the assert.
        strand2htseq = {
            "unstranded": "no",
            "firststrand": "reverse",
            "secondstrand": "yes",
        }
        ht_stranded = strand2htseq.get(stranded.stranded)
        assert ht_stranded is not None

        mode = mlib.get_user_option(
            user_options, "htseq_count_mode",
            allowed_values=[
                "union", "intersection-strict", "intersection-nonempty"
            ])

        # One job per BAM file: (bam_filename, count file name).  The
        # count file name has no directory part.
        jobs = []
        for bam_filename in filelib.list_files_in_path(
                bam_path, endswith=".bam", case_insensitive=True):
            stem = os.path.splitext(os.path.basename(bam_filename))[0]
            jobs.append((bam_filename, "%s.count" % stem))

        # One shell command per job; everything the command prints is
        # redirected into the count file.
        commands = []
        for bam_filename, count_file in jobs:
            cmd = alignlib.make_htseq_count_command(
                bam_filename, gtf_file, sort_order, ht_stranded, mode=mode)
            commands.append("%s >& %s" % (cmd, parallel.quote(count_file)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # Every count file must now exist under out_path and be
        # non-empty.
        output_filenames = [
            os.path.join(out_path, count_file) for _, count_file in jobs
        ]
        filelib.assert_exists_nz_many(output_filenames)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run GATK MuTect2 on each normal/cancer BAM pair.

        antecedents is unpacked as (bam_node, nc_node, ref_node,
        interval_node).  The normal/cancer pairs are read from
        nc_node.identifier, the BAM files are found under
        bam_node.identifier, and interval_node.identifier must be a
        non-empty file whose extension is one of .bed, .list, .picard,
        .interval_list or .intervals.

        The user options "mutect_cosmic_vcf" and "mutect_dbsnp_vcf"
        are required.

        For every pair, a VCF file and a log file named after the
        cancer BAM file are written to out_path.  After all commands
        have run, each log is scanned for "##### ERROR" lines, the VCF
        files are required to exist, and the sample names inside each
        VCF are fixed with
        call_somatic_varscan._fix_normal_cancer_names.

        Returns a dict with the keys "num_cores" and "commands".
        Raises AssertionError if an input is missing or a log file
        contains GATK error lines.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        valid_exts = [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]
        _, _, ext = mlib.splitpath(interval_node.identifier)
        assert ext in valid_exts, \
            "Unsupported intervals file extension: %s" % ext

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(normal_sample=normal_sample,
                                      cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      vcf_outfile=vcf_outfile,
                                      log_outfile=log_outfile)
            jobs.append(x)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            # Read the BAM files from the job itself.  The loop
            # variables of the job-building loop above still hold the
            # values of the LAST pair and must not be used here.
            UNHASHABLE = [
                ("I:normal", sq(j.normal_bamfile)),
                ("I:tumor", sq(j.cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            x = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            # Quote the log file like every other path in the command.
            x = "%s >& %s" % (x, sq(j.log_outfile))
            commands.append(x)
        assert len(commands) == len(jobs)

        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for j, command in zip(jobs, commands):
            # Pull out the error lines.
            with open(j.log_outfile) as handle:
                errors = [x for x in handle if x.startswith("##### ERROR")]
            errors = "".join(errors)
            # Report the sample and command of the job that failed.
            msg = "MuTect2 error [%s]:\n%s\n%s" % (
                j.cancer_sample, command, errors)
            assert not errors, msg

        # Make sure output VCF files exist.
        x = [x.vcf_outfile for x in jobs]
        filelib.assert_exists_many(x)

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
示例#29
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Build and run one Platypus command per BAM file.

        antecedents is unpacked as (bam_node, ref_node).  Every BAM
        file found under bam_node.identifier must have a non-empty
        "<bam>.bai" index next to it.  The BAM and index are symlinked
        into a local "bam" directory under names without spaces, and
        the command from alignlib.make_platypus_command is run on the
        symlink.

        For each sample, "<sample>.vcf", "<sample>.log" and
        "<sample>.err" are written to out_path.  If a VCF is missing
        or empty after the run, any "Too many reads" warnings from the
        .err file are printed before the final assertion fails.

        Returns the metadata dict, which is currently left empty.
        """
        import os
        import sys
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # NOTE: Filtering the calls by variant type (SNP vs. INDEL) is
        # currently disabled; the VCF from Platypus is the final output.

        # Platypus generates an error if there are spaces in the BAM
        # filename.  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            x = sample.replace(" ", "_")
            local_bam = os.path.join(bam_path, "%s.bam" % x)
            local_bai = os.path.join(bam_path, "%s.bam.bai" % x)
            # The output names keep the original sample name, which
            # may contain spaces.
            log_filename = os.path.join(out_path, "%s.log" % sample)
            err_filename = os.path.join(out_path, "%s.err" % sample)
            # Final VCF file.
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(bam_filename=bam_filename,
                                      bai_filename=bai_filename,
                                      local_bam=local_bam,
                                      local_bai=local_bai,
                                      log_filename=log_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # TODO: Keep better track of the metadata.
        buffer_size = 100000
        max_reads = 5E6
        # Running into errors sometimes, so increase these numbers.
        #   WARNING - Too many reads (5000000) in region
        #   1:500000-600000. Quitting now. Either reduce --bufferSize or
        #   increase --maxReads.
        buffer_size = buffer_size * 10
        max_reads = max_reads * 10

        # Make a list of commands.
        commands = []
        for j in jobs:
            x = alignlib.make_platypus_command(bam_file=j.local_bam,
                                               ref_file=ref.fasta_file_full,
                                               log_file=j.log_filename,
                                               out_file=j.out_filename,
                                               buffer_size=buffer_size,
                                               max_reads=max_reads)
            # Quote the redirect target.  It is built from the
            # original sample name, so it may contain spaces.
            x = "%s >& %s" % (x, parallel.quote(j.err_filename))
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.  If not, try
        # to diagnose.
        for j in jobs:
            if filelib.exists_nz(j.out_filename):
                continue
            # The diagnosis is best effort.  A missing .err file must
            # not hide the assertion on the output files below.
            if not os.path.exists(j.err_filename):
                continue
            with open(j.err_filename) as handle:
                for line in handle:
                    if line.find("WARNING - Too many reads") >= 0:
                        # The line already ends with its own newline.
                        sys.stdout.write(line)
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Build and run one GATK IndelRealigner command per BAM file.

        antecedents is unpacked as (bam_node, ref_node, target_node).
        The BAM files found under bam_node.identifier are paired with
        the ".intervals" files found under target_node.identifier by
        file name without extension; every BAM file needs a matching
        intervals file.

        At least one of the user options "realign_known_sites1",
        "realign_known_sites2" and "realign_known_sites3" must be
        set.  Each one that is set is passed to the command as a
        "known" argument.

        For each sample, "<sample>.bam" and "<sample>.log" are written
        to out_path, and the BAM outputs are required to exist and be
        non-empty after the commands have run.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        def index_by_sample(filenames):
            # Map "file name without directory and extension" to the
            # full file name.  Two files with the same sample name are
            # an error.
            sample2filename = {}
            for filename in filenames:
                sample = os.path.splitext(os.path.basename(filename))[0]
                assert sample not in sample2filename
                sample2filename[sample] = filename
            return sample2filename

        bam_node, ref_node, target_node = antecedents

        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        target_filenames = filelib.list_files_in_path(
            target_node.identifier, endswith=".intervals")
        assert target_filenames, "No .intervals files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        assert len(bam_filenames) == len(target_filenames), \
            "Should have an .intervals file for each bam file."
        sample2bamfilename = index_by_sample(bam_filenames)
        sample2targetfilename = index_by_sample(target_filenames)
        assert len(sample2bamfilename) == len(sample2targetfilename)

        missing = [
            s for s in sample2bamfilename if s not in sample2targetfilename
        ]
        assert not missing, "Missing interval files for %d bam files." % \
            len(missing)

        # list of (bam_filename, target_filename, log_filename, out_filename)
        jobs = []
        for sample, bam_filename in sample2bamfilename.items():
            jobs.append((
                bam_filename,
                sample2targetfilename[sample],
                os.path.join(out_path, "%s.log" % sample),
                os.path.join(out_path, "%s.bam" % sample),
            ))

        # Collect the known-sites files the user supplied; unset
        # options are dropped.
        option_names = [
            "realign_known_sites1",
            "realign_known_sites2",
            "realign_known_sites3",
        ]
        known_sites = [
            module_utils.get_user_option(user_options, name, check_file=True)
            for name in option_names
        ]
        known_sites = [site for site in known_sites if site]
        assert known_sites

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar \
        #   -T IndelRealigner -R <ref.fa> \
        #   -I <bam_file> -targetIntervals <target_file> -o <bam_file>

        # One command per job, with its output redirected to the log.
        commands = []
        for bam_filename, target_filename, log_filename, out_filename in jobs:
            # Build a fresh list of "known" arguments for every call.
            known_args = [("known", site) for site in known_sites]
            cmd = alignlib.make_GATK_command(
                T="IndelRealigner",
                R=ref.fasta_file_full,
                I=bam_filename,
                targetIntervals=target_filename,
                o=out_filename,
                _UNHASHABLE=known_args)
            commands.append("%s >& %s" % (cmd, log_filename))

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        out_filenames = [job[-1] for job in jobs]
        filelib.assert_exists_nz_many(out_filenames)