Example #1
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        import shutil
        from genomicode import parallel

        vcf_folder = in_data
        vcf_files = find_vcf_files(vcf_folder.identifier)
        metadata = {}

        TEMPFILE = "temp.txt"
        handle = open(TEMPFILE, 'w')
        header = ("Caller", "File", "Sample", "Chrom", "Pos", "Ref", "Alt",
                  "Source", "Num Ref", "Num Alt", "Total Reads", "VAF",
                  "Filter", "Call", "GQ")
        print >> handle, "\t".join(header)
        handle.close()

        # Write out data from each of the VCF files.
        jobs = []
        for x in vcf_files:
            filestem, filename = x

            # filestem   197B-MG
            # filename   /data/jchang/biocore/call01/radia.vcf/197B-MG.vcf

            args = filename, filestem, header, TEMPFILE
            x = summarize_vcf_file, args, {}
            jobs.append(x)
        parallel.pyfun(jobs, num_procs=num_cores, lock_keyword="lock")
        metadata["num_cores"] = num_cores

        shutil.move(TEMPFILE, out_filename)

        return metadata
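
A note on the calling convention: every parallel.pyfun call in these examples takes a list of (function, args, keywds) tuples.  Below is a minimal sketch, not genomicode's actual implementation, of what that contract appears to be from the usage here: fan the jobs out over num_procs worker processes and, when lock_keyword is given, inject a shared lock under that keyword so workers can serialize writes to a shared file like TEMPFILE above.  pyfun_sketch and _call are hypothetical names.

import multiprocessing

def _call(job):
    # job is a (function, args, keywds) tuple; keywds may be None in
    # some examples, so tolerate that.
    fn, args, keywds = job
    return fn(*args, **(keywds or {}))

def pyfun_sketch(jobs, num_procs=1, lock_keyword=None):
    if lock_keyword:
        # Manager locks are proxy objects, so they can be pickled into
        # the worker processes as part of each job's keywds.
        lock = multiprocessing.Manager().Lock()
        jobs = [(fn, args, dict(keywds or {}, **{lock_keyword: lock}))
                for (fn, args, keywds) in jobs]
    pool = multiprocessing.Pool(num_procs)
    try:
        # Results come back in job order, which is what lets later
        # examples zip(jobs, results).
        return pool.map(_call, jobs)
    finally:
        pool.close()
        pool.join()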
Example #2
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils

        # This is I/O heavy, so don't use too many cores.
        MAX_CORES = 4

        filelib.safe_mkdir(out_path)
        filenames = module_utils.find_fastq_files(in_data.identifier)
        assert filenames, "I could not find any FASTQ files."

        REMOVE = [".gz", ".bz2", ".xz"]

        # Uncompress the files to the new directory in parallel.
        commands = []
        for in_filename in filenames:
            in_path, in_file = os.path.split(in_filename)
            x = in_file
            for r in REMOVE:
                if x.lower().endswith(r):
                    x = x[:-len(r)]
            out_file = x
            out_filename = os.path.join(out_path, out_file)

            args = in_filename, out_filename
            keywds = {}
            x = uncompress_file, args, keywds
            commands.append(x)

        nc = min(MAX_CORES, num_cores)
        parallel.pyfun(commands, num_procs=nc)
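
uncompress_file is defined elsewhere in this module.  A minimal sketch with the same (in_filename, out_filename) signature, assuming it simply decompresses based on the extension; note that lzma is only in the Python 3 standard library, so under Python 2 the .xz branch would need a backport or a subprocess call to xz.

import bz2
import gzip
import shutil

def uncompress_file(in_filename, out_filename):
    x = in_filename.lower()
    if x.endswith(".gz"):
        in_handle = gzip.open(in_filename, "rb")
    elif x.endswith(".bz2"):
        in_handle = bz2.BZ2File(in_filename, "rb")
    elif x.endswith(".xz"):
        import lzma  # Python 3 standard library.
        in_handle = lzma.open(in_filename, "rb")
    else:
        raise AssertionError("Unknown compression: %s" % in_filename)
    out_handle = open(out_filename, "wb")
    try:
        shutil.copyfileobj(in_handle, out_handle)
    finally:
        in_handle.close()
        out_handle.close()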
Example #3
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils

        bam_filenames = module_utils.find_bam_files(in_data.identifier)
        assert bam_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            out_filename = os.path.join(out_path, "%s.matches.txt" % s)
            x = in_filename, out_filename
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            in_filename, out_filename = x
            x = summarize_bam_file, (in_filename, out_filename), None
            jobs2.append(x)

        parallel.pyfun(jobs2, num_procs=num_cores, DELAY=0.1)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
Example #4
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []
        for bam_filename in bam_filenames:
            x = count_duplicates, (bam_filename,), {}
            jobs.append(x)
        results = parallel.pyfun(jobs, num_procs=num_cores)
        metadata["num_cores"] = num_cores
        assert len(results) == len(bam_filenames)

        handle = open(outfile, 'w')
        header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
        print >>handle, "\t".join(header)
        for i in range(len(bam_filenames)):
            x, sample, x = mlib.splitpath(bam_filenames[i])
            total_reads, dup_reads = results[i]
            perc_dup = float(dup_reads) / total_reads * 100
            perc_dup = "%.2f" % perc_dup
            x = sample, dup_reads, total_reads, perc_dup
            print >>handle, "\t".join(map(str, x))
        handle.close()

        return metadata
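
count_duplicates is not shown.  One way to produce the (total_reads, dup_reads) pair unpacked above is to parse `samtools flagstat`; this sketch assumes the "N + M <category>" flagstat output format and is only an illustration, not the module's actual helper.

import subprocess

def count_duplicates(bam_filename):
    output = subprocess.check_output(
        ["samtools", "flagstat", bam_filename])
    total_reads = dup_reads = None
    for line in output.decode("ascii", "replace").splitlines():
        # flagstat lines look like: "<passed> + <failed> <category>".
        cols = line.split()
        if len(cols) < 3 or not cols[0].isdigit():
            continue
        count = int(cols[0]) + int(cols[2])
        if "in total" in line:
            total_reads = count
        elif "duplicates" in line and dup_reads is None:
            dup_reads = count
    assert total_reads is not None and dup_reads is not None
    return total_reads, dup_reads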
Example #5
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel

        vcf_node = in_data
        vcf_files = filelib.list_files_in_path(vcf_node.identifier,
                                               endswith=".vcf",
                                               case_insensitive=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # in_vcf_filename, out_vcf_filename
        for vcf_file in vcf_files:
            path, file_ = os.path.split(vcf_file)
            out_vcf_file = os.path.join(out_path, file_)
            x = vcf_file, out_vcf_file
            jobs.append(x)

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel"]

        # Generate the commands.
        commands = []
        for x in jobs:
            in_vcf_file, out_vcf_file = x

            args = vartype, in_vcf_file, out_vcf_file
            x = filter_by_vartype, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        x = [x[-1] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
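
filter_by_vartype is defined in this module (a later example imports it as filter_variants_GATK.filter_by_vartype).  A sketch of the usual REF/ALT length rule, assuming a site counts as a SNP when REF and every ALT allele are a single base, and as an indel otherwise.

def filter_by_vartype(vartype, in_vcf_file, out_vcf_file):
    assert vartype in ["all", "snp", "indel"]
    in_handle = open(in_vcf_file)
    out_handle = open(out_vcf_file, 'w')
    for line in in_handle:
        if line.startswith("#"):    # Keep all header lines.
            out_handle.write(line)
            continue
        # VCF columns: CHROM, POS, ID, REF, ALT, ...
        cols = line.rstrip("\r\n").split("\t")
        ref, alt = cols[3], cols[4]
        is_snp = len(ref) == 1 and \
                 all(len(a) == 1 for a in alt.split(","))
        if vartype == "all" or \
           (vartype == "snp" and is_snp) or \
           (vartype == "indel" and not is_snp):
            out_handle.write(line)
    in_handle.close()
    out_handle.close()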
Example #6
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        import filter_variants_GATK

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        assert vcf_filenames, "No VCF files found."
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        metadata["filter"] = vartype

        jobs = []  # list of filelib.GenericObject
        for in_filename in vcf_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(in_filename=in_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        # Filter each of the VCF files.
        jobs2 = []
        for j in jobs:
            args = vartype, j.in_filename, j.out_filename
            x = filter_variants_GATK.filter_by_vartype, args, {}
            jobs2.append(x)
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
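
filelib.GenericObject, used here and in several later examples, serves as a simple attribute bag for per-sample filenames.  A minimal sketch of that idea; filelib's actual class may do more validation.

class GenericObject(object):
    def __init__(self, **keywds):
        # Turn each keyword into an attribute, e.g. j.in_filename.
        for name, value in keywds.items():
            setattr(self, name, value)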
Example #7
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # This is I/O heavy, so don't use too many cores.
        MAX_CORES = 2

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "I could not find any FASTQ files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_samples = mlib.get_user_option(user_options,
                                           "num_samples",
                                           not_empty=True,
                                           type=int)
        metadata["num_samples"] = num_samples

        jobs = []
        for in_filename in filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = in_filename, out_filename
            jobs.append(x)

        cmds = []
        for x in jobs:
            in_filename, out_filename = x
            x = copy_fastq_file, (in_filename, out_filename, num_samples), {}
            cmds.append(x)

        nc = min(MAX_CORES, num_cores)
        metadata["num_cores"] = nc
        parallel.pyfun(cmds, num_procs=nc)

        return metadata
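
copy_fastq_file is not shown.  Given the num_samples user option, a plausible reading is that it keeps the first num_samples reads from each file; the sketch below assumes that, plus uncompressed input in the standard 4-lines-per-record FASTQ layout.

import itertools

def copy_fastq_file(in_filename, out_filename, num_samples):
    in_handle = open(in_filename)
    out_handle = open(out_filename, 'w')
    # Each FASTQ record is exactly 4 lines: @name, sequence, +, quality.
    for line in itertools.islice(in_handle, num_samples * 4):
        out_handle.write(line)
    in_handle.close()
    out_handle.close()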
Example #8
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_filename):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        fastq_node, sample_node, align_node = antecedents
        fastq_data = mlib.find_merged_fastq_files(sample_node.identifier,
                                                  fastq_node.identifier)
        assert fastq_data, "I could not find any FASTQ files."
        align_filenames = filelib.list_files_in_path(align_node.identifier,
                                                     endswith=".matches.txt")
        assert align_filenames, "No .matches.txt files."
        align_filenames.sort()
        metadata = {}

        assert len(fastq_data) == len(align_filenames), \
               "Mismatch: num samples %d %d" % (
            len(fastq_data), len(align_filenames))

        num_mismatches = mlib.get_user_option(user_options,
                                              "num_mismatches",
                                              type=int)
        assert num_mismatches >= 0 and num_mismatches < 25
        metadata["num_mismatches"] = num_mismatches

        sample2fastqdata = {}
        for x in fastq_data:
            sample, f1, f2 = x
            sample2fastqdata[sample] = x

        # list of (sample, align_filename, summary_filename,
        #   fastq_filename1, fastq_filename2)
        jobs = []
        for in_filename in align_filenames:
            p, f = os.path.split(in_filename)
            # <sample>.matches.txt
            ext = ".matches.txt"
            assert f.endswith(ext)
            sample = f[:-len(ext)]
            assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
            summary_filename = "%s.summary.txt" % sample
            x, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
            x = sample, in_filename, summary_filename, \
                fastq_filename1, fastq_filename2
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, align_filename, summary_filename, \
                    fastq_file1, fastq_file2 = x
            args = align_filename, fastq_file1, fastq_file2, num_mismatches
            keywds = {
                "temp_path": ".",
                "outfile": summary_filename,
            }
            x = summarize_matches_file, args, keywds
            jobs2.append(x)

        # Since this can take a lot of memory (depending on the number
        # of reads, can easily take 8 Gb), do just 1 process at a
        # time.  Also, I/O intensive.  Don't do too many at a time.
        #MAX_PROCS = 1
        MAX_PROCS = 4
        nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
        #nc = min(MAX_PROCS, num_cores)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.1)
        metadata["num_cores"] = nc
        assert len(results) == len(jobs2)

        # Put together the results in a table.
        handle = open(out_filename, 'w')
        header = ("sample", "match", "total", "RPM", "perc match",
                  "perc mismatch")
        print >> handle, "\t".join(header)
        for x in zip(jobs, results):
            x, d = x
            sample, in_filename, summary_filename, \
                    fastq_filename1, fastq_filename2 = x
            match = d["perfect_alignments"]
            total = d["total_alignments"]
            rpm = int(float(match) / total * 1E6)
            perc_match = d["perc_perfect"]
            perc_mismatch = 1 - d["perc_perfect"]
            x = sample, match, total, rpm, perc_match, perc_mismatch
            assert len(x) == len(header)
            print >> handle, "\t".join(map(str, x))
        handle.close()
        return metadata
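
mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS) caps the worker count by a RAM budget, here 30 Gb per process.  A sketch of the likely arithmetic, assuming it reads total physical memory from the OS (the sysconf names below are Linux-specific) and divides by the per-process estimate.

import os

def calc_max_procs_from_ram(gb_per_proc, upper_max=None):
    # Total physical memory in Gb.
    pages = os.sysconf("SC_PHYS_PAGES")
    page_size = os.sysconf("SC_PAGE_SIZE")
    total_gb = pages * page_size / float(1024 ** 3)
    nc = max(1, int(total_gb / gb_per_proc))
    if upper_max is not None:
        nc = min(nc, upper_max)
    return nc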
Example #9
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        MAX_CORES = 4  # I/O intensive.

        fastq_node, sample_node, bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        sample2fastq = mlib.find_merged_fastq_files(sample_node.identifier,
                                                    fastq_node.identifier,
                                                    as_dict=True)

        metadata = {}

        jobs = []  # list of (sample, bam_file, fastq_file)
        for filename in bam_filenames:
            path, sample, ext = mlib.splitpath(filename)
            assert sample in sample2fastq, "Missing fastq: %s" % sample
            fastq1, fastq2 = sample2fastq[sample]
            x = sample, filename, fastq1
            jobs.append(x)

        funcalls = []
        for x in jobs:
            sample, bam_filename, fastq_filename = x
            # Count the number of reads.
            x1 = count_reads, (fastq_filename, ), {}
            # Count the number of alignments.
            x2 = count_alignments, (bam_filename, ), {}
            funcalls.append(x1)
            funcalls.append(x2)
        assert len(funcalls) == len(jobs) * 2

        nc = min(num_cores, MAX_CORES)
        results = parallel.pyfun(funcalls, num_procs=nc)
        metadata["num_cores"] = nc

        # list of (sample, alignments, aligned_reads, total_reads,
        #   perc_aligned).
        results2 = []
        for i, x in enumerate(jobs):
            sample, bam_filename, fastq_filename = x
            x1 = results[i * 2]
            x2 = results[i * 2 + 1]
            total_reads = x1
            aligned_reads, alignments = x2
            perc_aligned = float(aligned_reads) / total_reads
            x = sample, alignments, aligned_reads, total_reads, perc_aligned
            results2.append(x)
        results = results2

        # sort by sample name
        results.sort()

        # Make table where the rows are the samples and the columns
        # are the statistics.
        table = []
        header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
                  "Perc Aligned")
        table.append(header)
        for x in results:
            sample, alignments, aligned_reads, total_reads, perc_aligned = x

            x1 = parselib.pretty_int(alignments)
            x2 = parselib.pretty_int(aligned_reads)
            x3 = parselib.pretty_int(total_reads)
            x4 = "%.2f%%" % (perc_aligned * 100)
            x = sample, x1, x2, x3, x4
            assert len(x) == len(header)
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(x)
        handle.close()

        txt2xls = mlib.findbin("txt2xls", quote=True)
        parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))
        return metadata
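
count_reads and count_alignments are defined elsewhere.  A sketch of count_reads based on the 4-lines-per-record FASTQ layout, assuming uncompressed, well-formed input; count_alignments would be a similar pass over the BAM file, e.g. with samtools.

def count_reads(fastq_filename):
    num_lines = 0
    handle = open(fastq_filename)
    for line in handle:
        num_lines += 1
    handle.close()
    assert num_lines % 4 == 0, "Truncated FASTQ: %s" % fastq_filename
    return num_lines // 4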
Example #10
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_vcf_node, bf_vcf_node = antecedents
        in_vcf_filenames = filelib.list_files_in_path(in_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        bf_vcf_filenames = filelib.list_files_in_path(bf_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        common_only = mlib.get_user_option(user_options,
                                           "backfill_common_only",
                                           allowed_values=["no", "yes"],
                                           not_empty=True)

        in_vcf_samples = [mlib.splitpath(x)[1] for x in in_vcf_filenames]
        bf_vcf_samples = [mlib.splitpath(x)[1] for x in bf_vcf_filenames]

        # Make sure there are no duplicate sample names.
        x1 = {}.fromkeys(in_vcf_samples).keys()
        x2 = {}.fromkeys(bf_vcf_samples).keys()
        assert len(in_vcf_samples) == len(x1), "Duplicate samples"
        assert len(bf_vcf_samples) == len(x2), "Duplicate samples"

        # Find the samples.
        common = [x for x in in_vcf_samples if x in bf_vcf_samples]
        in_only = [x for x in in_vcf_samples if x not in common]
        bf_only = [x for x in bf_vcf_samples if x not in common]
        assert common, "No common samples."

        pretty_in = parselib.pretty_list(in_only, max_items=5)
        pretty_bf = parselib.pretty_list(bf_only, max_items=5)
        if common_only == "no":
            assert not (in_only and bf_only), \
                   "Extra samples in both sets:\n%s\n%s" % (
                pretty_in, pretty_bf)
            assert not in_only, "Target VCF file has extra samples: %s" % \
                   pretty_in
            assert not bf_only, "Source VCF file has extra samples: %s." % \
                   pretty_bf
        SAMPLES = common

        # list of sample, in_vcf_filename, bf_vcf_filename, out_filename
        jobs = []
        for sample in SAMPLES:
            assert sample in in_vcf_samples
            assert sample in bf_vcf_samples
            i = in_vcf_samples.index(sample)
            j = bf_vcf_samples.index(sample)
            in_filename = in_vcf_filenames[i]
            bf_filename = bf_vcf_filenames[j]
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, bf_filename, out_filename
            jobs.append(x)

        jobs2 = []
        for x in jobs:
            sample, in_filename, bf_filename, out_filename = x
            fn = backfill_vcf
            args = in_filename, bf_filename, out_filename
            keywds = {}
            jobs2.append((fn, args, keywds))
        #num_cores = 1
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
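
mlib.splitpath is used throughout these examples to get (path, stem, ext) from a filename.  A sketch consistent with that usage, assuming it just combines os.path.split and os.path.splitext.

import os

def splitpath(filename):
    path, f = os.path.split(filename)
    stem, ext = os.path.splitext(f)
    return path, stem, ext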
Example #11
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        x = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #   cancer_bamfile, muse_call_stem, muse_call_file, raw_vcf_outfile,
        #   vcf_outfile, log_outfile1, log_outfile2)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            muse_call_file = "%s.MuSE.txt" % muse_call_stem
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
            vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
            log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
            log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2
            jobs.append(x)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "call",
                "-O",
                muse_call_stem,
                "-f",
                sq(ref.fasta_file_full),
                cancer_bamfile,
                normal_bamfile,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile1)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[8] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the call files are created and not empty.
        call_files = [x[5] for x in jobs]
        filelib.assert_exists_nz_many(call_files)

        # Run the "sump" step.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "sump",
                "-I",
                sq(muse_call_file),
            ]
            assert wgs_or_wes in ["wgs", "wes"]
            if wgs_or_wes == "wgs":
                x += ["-G"]
            else:
                x += ["-E"]
            x += [
                "-O",
                sq(raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile2)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[9] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the raw files are created and not empty.
        vcf_files = [x[6] for x in jobs]
        filelib.assert_exists_nz_many(vcf_files)

        # Fix the files.
        commands = []  # Should be python commands.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
            x = alignlib.clean_muse_vcf, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Delete the log_outfiles if empty.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            if os.path.exists(log_outfile1):
                os.unlink(log_outfile1)
            if os.path.exists(log_outfile2):
                os.unlink(log_outfile2)

        # Make sure output VCF files exist.
        x = [x[7] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
Example #12
def main():
    import os
    import sys
    import itertools
    import argparse

    import arrayio
    import analyze_clinical_outcome as aco
    import boxplot
    from genomicode import parallel

    parser = argparse.ArgumentParser(
        description="Associate gene expression patterns with a "
        "categorical phenotype.")
    
    parser.add_argument(
        'expression_file',
        help='Either a gene expression file (GCT,CDT,PCL format) or gene set '
        'scores from score_geneset.py.')
    parser.add_argument(
        'phenotype_file', help="Table of phenotypes (tab-delimited text "
        "file).")
    parser.add_argument(
        "--ignore_samples", help="Ignore the samples where an annotation "
        "(a column in the phenotype file) matches a specific value.  "
        "Format: <header>,<value>")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of processors to use.")
    
    group = parser.add_argument_group(title='Analysis')
    group.add_argument(
        '--phenotype', default=[], action='append',
        help='Header in the phenotype file (MULTI).  Format: <header>')
    group.add_argument(
        '--all_phenotypes', action="store_true",
        help="Analyze all phenotypes in the file.")
    group.add_argument(
        "--ignore_phenotype", default=[], action="append",
        help="Ignore this column in the phenotype file.  "
        "Helpful to get rid of the sample column when using "
        "--all_phenotypes.  Format: <header>  (MULTI)")
    group.add_argument(
        '--ignore_insufficient_groups', action="store_true",
        help="If a phenotype only has one group, then ignore it rather "
        "than raising an error.")
    group.add_argument(
        '--gene', default=[], action='append',
        help='Comma separated name or ID of genes to analyze.  '
        'I will search for this gene in the annotations of the '
        'expression_file.  '
        'You can use this parameter multiple times to search more genes.')
    group.add_argument(
        "--empty_vs_filled", action="store_true",
        help="Instead of categorizing by the contents of the cells, "
        "compare the ones that are empty against the ones that are filled.")
    group.add_argument(
        "--all_genes", action="store_true",
        help="Run analysis on all genes in this file.")
    group.add_argument(
        '--geneset', default=[], action='append',
        help='Name of the geneset to analyze. To specify multiple gene sets, '
        'use this parameter multiple times.')
    group.add_argument(
        "--center_by_phenotype",
        help="Center the scores or gene expression values seen for a "
        "phenotype to 0.  Only one --phenotype can be analyzed in this way "
        "at a time.  This phenotype should have two possible values.  "
        "If there are more values, they need to be merged into two groups.  "
        "Each phenotype must be seen in each BATCH.  "
        "Format: <BATCH_HEADER>;<PHENO 1 VALUE>[,<PHENO 1 VALUE>,...];"
        "<PHENO 2 VALUE>[,<PHENO 2 VALUE>,...]")
    group = parser.add_argument_group(title='Output')
    group.add_argument(
        '-o', dest='filestem', default=None,
        help='Prefix used to name files.  e.g. "myanalysis".')
    group.add_argument(
        "--gene_header", action="append", default=[],
        help="When naming the output file, use the gene name(s) under this "
        "Header (MULTI).  If not given, will try to use a combination of the "
        "probe ID and gene symbol.")

    group = parser.add_argument_group(title='Formatting the boxplot')
    group.add_argument(
        "--box_mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--box_mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--box_mar_top", default=1.0, type=float,
        help="Scale margin at top of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--water_mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--water_mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--water_mar_top", default=1.0, type=float,
        help="Scale margin at top of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--water_xlabel_off", action="store_true",
        help="Do not label the X axis on the waterfall plot.")

    ## group.add_argument(
    ##     '--km_title', default=None, help='Title for the Kaplan-Meier plot.')
    ## group.add_argument(
    ##     '--km_title_size', default=1.0, type=float,
    ##     help='Scale the size of the title.  Default 1.0 (no scaling).')
    ## group.add_argument(
    ##     '--km_mar_title', default=1.0, type=float, 
    ##     help="Scale margin for the title.  Default 1.0 (no scaling).")
    ## group.add_argument(
    ##     '--km_subtitle_size', default=1.0, type=float,
    ##     help='Scale the size of the subtitle.  Default 1.0 (no scaling).')
    ## group.add_argument(
    ##     '--km_mar_subtitle', default=1.0, type=float, 
    ##     help="Scale margin for the subtitle.  Default 1.0 (no scaling).")
    ## group.add_argument(
    ##     '--km_xlab', default=None, 
    ##     help='x-axis label for the Kaplan-Meier plot.')
    ## group.add_argument(
    ##     '--km_ylab', default=None, 
    ##     help='y-axis label for the Kaplan-Meier plot.')
    ## group.add_argument(
    ##     '--km_legend_size', default=1.0, type=float,
    ##     help='Scale the size of the legend.  Default 1.0 (no scaling).')
    
    args = parser.parse_args()

    # Check inputs.
    assert args.expression_file, (
        'Please specify a gene expression or gene set score file.')
    assert os.path.exists(args.expression_file), "File not found: %s" % \
           args.expression_file
    assert args.phenotype_file, 'Please specify a phenotype file.'
    assert os.path.exists(args.phenotype_file), "File not found: %s" % \
           args.phenotype_file
    assert args.num_procs >= 1 and args.num_procs < 100

    assert args.phenotype or args.all_phenotypes, \
           'Please specify the phenotype to analyze.'
    assert not (args.phenotype and args.all_phenotypes)
    assert args.gene or args.geneset or args.all_genes, \
           'Please specify a gene or gene set.'
    assert not (args.gene and args.all_genes)
    has_gene = args.gene or args.all_genes
    assert not (has_gene and args.geneset), \
        'Please specify either a gene or a gene set, not both.'

    assert args.box_mar_bottom > 0 and args.box_mar_bottom < 10
    assert args.box_mar_left > 0 and args.box_mar_left < 10
    assert args.box_mar_top > 0 and args.box_mar_top < 10
    assert args.water_mar_bottom > 0 and args.water_mar_bottom < 10
    assert args.water_mar_left > 0 and args.water_mar_left < 10
    assert args.water_mar_top > 0 and args.water_mar_top < 10
    ## assert args.km_title_size > 0 and args.km_title_size < 10
    ## assert args.km_mar_title > 0 and args.km_mar_title < 10
    ## assert args.km_subtitle_size > 0 and args.km_subtitle_size < 10
    ## assert args.km_mar_subtitle > 0 and args.km_mar_subtitle < 10
    ## assert args.km_legend_size > 0 and args.km_legend_size < 10
    
    # Clean up the input.
    phenotypes = parse_phenotypes(args.phenotype)
    genes = aco.parse_genes(args.gene)
    gene_sets = aco.parse_gene_sets(args.geneset)
    x = parse_groups(args.center_by_phenotype)
    center_batch, center_group1, center_group2 = x
    filestem = aco.parse_filestem(args.filestem)

    if center_batch:
        assert len(phenotypes) == 1, \
               "Only 1 phenotype can be centered by groups."

    # Read the input files.
    M = aco.read_expression_or_geneset_scores(
        genes, args.all_genes, gene_sets, args.expression_file)
    x = aco.read_clinical_annotations(M, args.phenotype_file)
    M, clinical_annots = x

    # Filter the phenotype files.
    if args.ignore_samples:
        x = ignore_samples(M, clinical_annots, args.ignore_samples)
        M, clinical_annots = x

    if args.all_phenotypes:
        phenotypes = sorted(clinical_annots)
    phenotypes = [x for x in phenotypes if x not in args.ignore_phenotype]

    # Make sure at least one of the phenotypes is in the clinical
    # annotations.
    x = [x for x in phenotypes if x in clinical_annots]
    assert x, "Could not find phenotypes: %s" % ", ".join(phenotypes)
    phenotypes = x

    # Select the genes or gene sets of interest.
    if not args.all_genes:
        x = genes or gene_sets
        M = M.matrix(row=x)
    assert M.nrow(), "I could not find any of the genes or gene sets."

    # Make sure the batch information is valid.
    if center_batch:
        assert center_batch in clinical_annots, "Missing annotation: %s" % \
               center_batch
        assert len(phenotypes) == 1
        pheno = phenotypes[0]
        values = clinical_annots[pheno]
        for x in values:
            assert x in center_group1 or x in center_group2, \
                   "Unknown phenotype: %s" % x

    # Calculate the association of each gene and each phenotype.
    #expression_or_score = "Expression"
    #if gene_sets:
    #    expression_or_score = "Score"

    jobs = []  # list of (function, args, keywds)
    keys = []
    for x in itertools.product(phenotypes, range(M.nrow())):
        pheno_header, i = x
        phenotype = clinical_annots[pheno_header]
        if args.empty_vs_filled:
            x = ["0"] * len(phenotype)
            for j in range(len(phenotype)):
                if phenotype[j].strip():
                    x[j] = "1"
            phenotype = x
        
        scores = M.value(i, None)
        if center_batch:
            batch = clinical_annots[center_batch]
            scores = center_scores(
                scores, batch, phenotype, center_group1, center_group2)

        x = phenotype, scores, args.ignore_insufficient_groups
        x = calc_association, x, {}
        jobs.append(x)
        keys.append((pheno_header, i))
    retvals = parallel.pyfun(jobs, num_procs=args.num_procs)
    assert len(retvals) == len(keys)

    # (header, gene_index) -> returned from calc_association
    gene_phenotype_scores = {}
    for (pheno_header, i), x in zip(keys, retvals):
        if x is None:
            continue
        gene_phenotype_scores[(pheno_header, i)] = x
        

    # Files generated:
    # <filestem>.stats.txt      Or to STDOUT if no <filestem> given.
    # <filestem>.<outcome>.<gene_id>.waterfall.png
    # <filestem>.<outcome>.<gene_id>.boxplot.png
    # <filestem>.<outcome>.<gene_id>.prism.txt        Prism format.

    # Write the output in a table with headers:
    # <headers>            # From the expression or gene set file.
    # Phenotype
    # Groups               # one for each group
    # Num Samples          # one for each group, separated by semicolon
    # Average Expression   # one for each group, separated by semicolon
    # Relationship
    # p-value

    outhandle = sys.stdout
    if filestem:
        outhandle = open("%s.stats.txt" % filestem, 'w')

    # Figure out the header for the table.
    header = M.row_names() + [
        "Phenotype", "Groups", "Num Samples", "Average Expression",
        "Delta", "Relationship", "p-value"]
    print >>outhandle, "\t".join(header)

    # Write out each row of the table.
    for x in itertools.product(phenotypes, range(M.nrow())):
        pheno_header, gene_i = x
        SCORE = gene_phenotype_scores.get((pheno_header, gene_i))
        if not SCORE:   # couldn't calculate.
            continue

        gene_names = [M.row_names(x)[gene_i] for x in M.row_names()]
        phenotype = pheno_header
        group_names = SCORE["group_names"]
        I = range(len(group_names))
        num_samples = [SCORE["num_samples"][x] for x in I]
        mean_score = [SCORE["mean_score"][x] for x in I]
        delta = ""
        if len(group_names) == 2:
            delta = SCORE["delta"]
        relationship = SCORE["relationship"]
        p_value = SCORE["p_value"]

        _fmt = aco._format_list
        x = gene_names + [
            phenotype, _fmt(group_names), _fmt(num_samples), _fmt(mean_score),
            delta, relationship, p_value]
        assert len(x) == len(header)
        print >>outhandle, "\t".join(map(str, x))
    if filestem:
        outhandle.close()

    # Write out other files.
    if not filestem:
        return

    jobs = []  # list of (fn, args, keywds)
    for x in itertools.product(phenotypes, range(M.nrow())):
        pheno_header, gene_i = x
        SCORE = gene_phenotype_scores.get((pheno_header, gene_i))
        if not SCORE:
            continue
        
        # Write the PRISM file.
        gene_id = aco.format_gene_name(M, None, gene_i)
        sample_names = M.col_names(arrayio.COL_ID)
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "prism", "txt")
        x1 = (filename,
                SCORE["scores"], SCORE["phenotypes"], SCORE["group_names"])
        x = write_prism_file, x1, {}
        jobs.append(x)

        
        # Make a boxplot.
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "boxplot", "png")
        pretty_gene = aco.pretty_gene_name(M, args.gene_header, gene_i)

        group_names = SCORE["group_names"]
        pheno2scores = {}
        for pheno, score in zip(SCORE["phenotypes"], SCORE["scores"]):
            if pheno not in pheno2scores:
                pheno2scores[pheno] = []
            pheno2scores[pheno].append(score)
        p_value = "p=%.2g" % SCORE["p_value"]
        x1 = (filename, group_names, pheno2scores)
        x2 = {
            "height" : 1600,
            "width" : 1600,
            "title" : pretty_gene,
            "subtitle" : p_value,
            "subtitle_col" : "#A60400",
            "subtitle_size" : 1.2,
            "subtitle_line" : 0.5,
            "ylab" : "Gene Expression",
            "mar_bottom" : args.box_mar_bottom,
            "mar_left" : args.box_mar_left,
            "mar_top" : 1.25,
            }
        x = boxplot.plot_boxplot, x1, x2
        jobs.append(x)
            
        # Make a waterfall plot.
        #filename = "%s%s.%s.waterfall.png" % (
        #    filestem, pheno_header, gene_id_h)
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "waterfall", "png")
        pretty = aco.pretty_gene_name(M, args.gene_header, gene_i)
        x1 = (
            filename, SCORE["scores"], SCORE["phenotypes"],
            SCORE["group_names"], sample_names, SCORE["p_value"], pretty,
            args.water_mar_bottom, args.water_mar_left, args.water_mar_top,
            args.water_xlabel_off)
        x = plot_waterfall, x1, {}
        jobs.append(x)
    parallel.pyfun(jobs, num_procs=args.num_procs)
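
parse_groups is a helper in this script.  A sketch matching the --center_by_phenotype format documented in the argument help, "<BATCH_HEADER>;<PHENO 1 VALUE>[,...];<PHENO 2 VALUE>[,...]", and returning (None, None, None) when the option is not given, which the unconditional unpacking above requires.

def parse_groups(center_by_phenotype):
    if not center_by_phenotype:
        return None, None, None
    x = center_by_phenotype.split(";")
    assert len(x) == 3, "Format: <BATCH_HEADER>;<PHENO 1>;<PHENO 2>"
    batch_header = x[0]
    group1 = [v.strip() for v in x[1].split(",")]
    group2 = [v.strip() for v in x[2].split(",")]
    return batch_header, group1, group2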
Example #13
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        vcf_node, nc_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Filenames:
        # <caller>.vcf

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        genome = mlib.get_user_option(user_options,
                                      "snpeff_genome",
                                      not_empty=True)
        databases = list_snpeff_databases()
        assert genome in databases, "Unknown genome database: %s" % genome

        # For each caller, do the SnpEFF calls.  Some callers include
        # the somatic information, others do not.  If germline samples
        # are present, then do with _cancer.  Otherwise, do not.

        # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
        #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

        # Don't bother annotating positions that do not pass filter.
        # Filter them out first based on FILTER column.

        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            path, stem, ext = mlib.splitpath(in_filename)
            samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
            filtered_filename = opj(out_path, "%s.filtered_input" % stem)
            out_filename = opj(out_path, "%s.vcf" % stem)
            log_filename = opj(out_path, "%s.log" % stem)
            x = filelib.GenericObject(in_filename=in_filename,
                                      stem=stem,
                                      samples_file=samples_file,
                                      filtered_filename=filtered_filename,
                                      out_filename=out_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # First, filter each of the VCF files.
        commands = []
        for j in jobs:
            # For debugging.  If this file exists, don't filter it again.
            if os.path.exists(j.filtered_filename):
                continue
            args = j.in_filename, j.filtered_filename, wgs_or_wes
            x = vcflib.filter_vcf_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Make the cancer_samples files.
        for j in jobs:
            # Will generate this if there are cancer samples.
            make_cancer_samples_file(j.filtered_filename, nc_match,
                                     j.samples_file)

        # Make a list of commands.
        commands = []
        for j in jobs:
            cancer = False
            if os.path.exists(j.samples_file):
                cancer = True
            x = make_snpeff_command(j.filtered_filename,
                                    genome,
                                    j.out_filename,
                                    j.log_filename,
                                    is_cancer=cancer,
                                    cancer_samples_file=j.samples_file)
            commands.append(x)

        nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        # Make sure the analysis completed successfully.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Log files should be empty.
        for j in jobs:
            filelib.assert_exists(j.log_filename)
            assert not filelib.exists_nz(j.log_filename), \
                   "Error with %s.\n%s" % (j.stem, j.log_filename)
            filelib.safe_unlink(j.log_filename)

        return metadata
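
make_snpeff_command is defined elsewhere in the module.  A sketch mirroring the java command in the comment above; where snpEff.jar lives is an assumption (the comment uses a $SNPEFF variable, so honor it if set), and the -cancer/-cancerSamples flags are only added when germline samples are present.

import os

def make_snpeff_command(in_file, genome, out_file, log_file,
                        is_cancer=False, cancer_samples_file=None):
    snpeff = os.environ.get("SNPEFF", "snpEff.jar")  # assumed location
    cmd = ["java", "-Xmx16g", "-jar", snpeff, "-v"]
    if is_cancer:
        assert cancer_samples_file
        cmd += ["-cancer", "-cancerSamples", cancer_samples_file]
    cmd += [genome, in_file]
    return " ".join(cmd) + " 1> %s 2> %s" % (out_file, log_file)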
Example #14
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import ngslib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        features_bed = mlib.get_user_option(user_options,
                                            "features_bed",
                                            check_file=True)
        if features_bed:
            metadata["features_bed"] = features_bed

        # Applies to genomecov.
        min_coverage = user_options.get("ignore_coverage_below")
        if min_coverage == "":
            min_coverage = None
        if min_coverage is not None:
            min_coverage = int(min_coverage)
            assert min_coverage >= 0

        metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
        metadata["num_cores"] = num_cores
        metadata["commands"] = []

        # Set up the filenames.
        # list of (
        #   sample,
        #   orig_bam_filename,    Original bam filename.
        #   bam_filename,         bam file, after filtering out unmapped reads.
        #   genomecov_filename,   Generated by genomecov.  Histogram.
        #   histo_datafile,       Data file to generate histogram (from cov).
        #   histo_plotfile,       Histogram plot.
        #   histo_prismfile,      To make histogram in PRISM.
        #
        #   ONLY USED IF features_bed
        #   intervallist_file,    Made from BED file.
        #   cov_filename,         Generated by Picard.
        #   targetcov_filename,   Generated by Picard.  Per target coverage.
        #   log_filename,         Output from Picard.
        #   )
        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            clean_bam_filename = opj(out_path, "%s.bam" % sample)
            assert clean_bam_filename != bam_filename
            genomecov_filename = opj(out_path, "%s.genomecov.txt" % sample)
            histo_datafile = opj(out_path, "%s.histo.txt" % sample)
            histo_plotfile = opj(out_path, "%s.histo.png" % sample)
            histo_prismfile = opj(out_path, "%s.prism.txt" % sample)

            intervallist_file = opj(out_path, "%s.interval.txt" % sample)
            cov_filename = opj(out_path, "%s.coverage.txt" % sample)
            targetcov_filename = opj(out_path, "%s.targetcov.txt" % sample)
            log_filename = opj(out_path, "%s.picard.log" % sample)

            x = filelib.GenericObject(sample=sample,
                                      orig_bam_filename=bam_filename,
                                      bam_filename=clean_bam_filename,
                                      genomecov_filename=genomecov_filename,
                                      histo_datafile=histo_datafile,
                                      histo_plotfile=histo_plotfile,
                                      histo_prismfile=histo_prismfile,
                                      intervallist_file=intervallist_file,
                                      cov_filename=cov_filename,
                                      targetcov_filename=targetcov_filename,
                                      log_filename=log_filename)
            #x = sample, bam_filename, genomecov_filename, \
            #    histo_datafile, histo_plotfile, histo_prismfile, \
            #    intervallist_file, cov_filename, targetcov_filename, \
            #    log_filename
            jobs.append(x)

        # Remove unmapped reads from the BAM files.
        # Need to remove the unmapped reads or Picard might complain:
        # Exception in thread "main"
        # htsjdk.samtools.SAMFormatException: SAM validation error:
        # ERROR: Record 154286082, Read name
        # DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0
        # for unmapped read.
        #
        # This can happen with BWA generated alignments.
        cmds = []
        for x in jobs:
            x = _make_samtools_filter_cmd(x.orig_bam_filename, x.bam_filename)
            cmds.append(x)
        parallel.pshell(cmds, max_procs=num_cores)
        x = [x.bam_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Generate the intervallist_file(s).
        if features_bed:
            cmds = []
            for x in jobs:
                args = x.intervallist_file, features_bed, x.bam_filename
                x = _make_intervallist_file, args, {}
                cmds.append(x)
            parallel.pyfun(cmds, num_procs=num_cores)

        # Make the commands to run picard.
        if features_bed:
            commands = []
            for x in jobs:
                x = _make_calculatehsmetrics_command(
                    x.intervallist_file, x.bam_filename, x.cov_filename,
                    x.targetcov_filename, ref.fasta_file_full, x.log_filename)
                commands.append(x)
            metadata["commands"].append(commands)
            parallel.pshell(commands, max_procs=num_cores)

            x1 = [x.cov_filename for x in jobs]
            x2 = [x.targetcov_filename for x in jobs]
            filelib.assert_exists_nz_many(x1 + x2)

        # Use genomecov to count read depth.
        x = _run_genomecov(jobs, ref_node.identifier, num_cores)
        metadata["commands"].append(x)

        # Summarize the average read depth.
        summary_file = opj(out_path, "summary.xls")
        _summarize_average_read_depth(jobs, min_coverage, summary_file)

        # Make histograms of the distribution of the read depth for
        # each sample.
        for x in jobs:
            _make_histo_file(x.genomecov_filename, x.histo_datafile)

        # Delete the filtered BAM files to save space.
        for x in jobs:
            filelib.assert_exists_nz(x.bam_filename)
            os.unlink(x.bam_filename)
        return metadata
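
_make_samtools_filter_cmd is not shown.  Dropping unmapped reads is conventionally done with samtools view -F 4 (flag 0x4 marks an unmapped read); this sketch assumes the helper just returns a shell command string for parallel.pshell.

def _make_samtools_filter_cmd(in_bam, out_bam):
    # -b  output BAM; -F 4  skip reads with the unmapped flag set.
    return "samtools view -b -F 4 %s > %s" % (in_bam, out_bam)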
Example #15
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        svm_node, vcf_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        metadata = {}

        # 1.  vcf_filenames
        # 2.  parsed_snpeff_files   one for each VCF file
        # 3.  merged_snpeff_file    just one file
        # 4.  clean_snpeff_file     clean up the annotations to final form
        # 5.  outfile

        merged_snpeff_file = "snpeff.merged.txt"
        cleaned_snpeff_file = "snpeff.clean.txt"

        jobs = []
        for vcf_filename in vcf_filenames:
            path, caller, ext = mlib.splitpath(vcf_filename)
            parsed_snpeff_file = "%s.parsed.txt" % caller
            j = filelib.GenericObject(
                caller=caller,
                vcf_filename=vcf_filename,
                parsed_snpeff_file=parsed_snpeff_file,
            )
            jobs.append(j)

        # Parse each of the snpeff files.
        commands = []
        for j in jobs:
            args = j.vcf_filename, j.parsed_snpeff_file
            # Debugging.  If this file exists, do not generate it
            # again.
            if os.path.exists(j.parsed_snpeff_file):
                continue
            x = parse_snpeff_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        # Merge the parsed files.
        x = [j.parsed_snpeff_file for j in jobs]
        x = [x for x in x if os.path.exists(x)]
        parsed_files = x
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(merged_snpeff_file):
            merge_parsed_files(parsed_files, merged_snpeff_file)

        # Clean up the snpEff file.  Coordinates should be unique.
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(cleaned_snpeff_file):
            clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

        # Merge the snpEff annotations into the SimpleVariantMatrix.
        add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)

        return metadata
Example #16
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out Strelka version.

        skip_depth_filter = False
        x = mlib.get_user_option(user_options,
                                 "strelka_skip_depth_filter",
                                 allowed_values=["no", "yes"],
                                 not_empty=True)
        if x == "yes":
            skip_depth_filter = True
        assert "vartype" in out_attributes, "Missing attribute: vartype"
        x = out_attributes["vartype"]
        assert x in ["snp", "indel"]
        vartype = x

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # Make sure each cancer sample is unique.  Otherwise, the
        # analysis directories will conflict.
        tumor_samples = [x[-1] for x in nc_match]
        dups = {}
        for i in range(1, len(tumor_samples)):
            if tumor_samples[i] in tumor_samples[:i]:
                dups[tumor_samples[i]] = 1
        assert not dups, "NormalCancerFile contains multiple instances of: %s"\
               % ", ".join(sorted(dups))

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #          cancer_bamfile, config_file, analysis_path)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            config_file = opj(out_path, "config.%s.ini" % cancer_sample)
            analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                config_file, analysis_path
            jobs.append(x)

        # Make each of the config files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            _make_config_file(config_file, skip_depth_filter=skip_depth_filter)
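
# A hypothetical sketch of _make_config_file.  Strelka 1.x workflows
# read an ini-style config whose [user] section includes an
# isSkipDepthFilters option; a real config normally carries many more
# settings, so only the flag wired to skip_depth_filter is shown here.
def _make_config_file_sketch(config_file, skip_depth_filter=False):
    lines = [
        "[user]",
        "isSkipDepthFilters = %d" % int(skip_depth_filter),
    ]
    handle = open(config_file, 'w')
    handle.write("\n".join(lines) + "\n")
    handle.close()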

        # Make the analysis directories.
        jobs2 = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            fn = _make_analysis_directory
            args = (analysis_path, config_file, ref.fasta_file_full,
                    normal_bamfile, cancer_bamfile)
            keywds = None
            jobs2.append((fn, args, keywds))
        parallel.pyfun(jobs2, num_procs=num_cores)
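
# A sketch of _make_analysis_directory, assuming it wraps Strelka 1.x's
# configureStrelkaWorkflow.pl.  The flag names follow that script's
# documentation; treat them as assumptions if your version differs.
def _make_analysis_directory_sketch(
        analysis_path, config_file, ref_fasta, normal_bamfile,
        cancer_bamfile):
    import subprocess
    cmd = [
        "configureStrelkaWorkflow.pl",
        "--normal=%s" % normal_bamfile,
        "--tumor=%s" % cancer_bamfile,
        "--ref=%s" % ref_fasta,
        "--config=%s" % config_file,
        "--output-dir=%s" % analysis_path,
    ]
    subprocess.check_call(cmd)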

        # Run the analysis.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            cmd = "make -j %d" % num_cores
            parallel.sshell(cmd, path=analysis_path)
        metadata["num_cores"] = num_cores

        # Make sure the output files exist.
        x = [x[-1] for x in jobs]
        x = [os.path.join(x, "results", "all.somatic.snvs.vcf") for x in x]
        filelib.assert_exists_nz_many(x)

        # Clean the VCF files and save into the out_path.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            # <analysis_path>/results/all.somatic.snvs.vcf
            # <analysis_path>/results/all.somatic.indels.vcf
            vartype2file = {
                "snp": "all.somatic.snvs.vcf",
                "indel": "all.somatic.indels.vcf",
            }
            assert vartype in vartype2file
            x = vartype2file[vartype]
            src_file = os.path.join(analysis_path, "results", x)
            dst_file = os.path.join(out_path, "%s.vcf" % cancer_sample)
            alignlib.clean_strelka_vcf(normal_sample, cancer_sample, src_file,
                                       dst_file)

        return metadata
Example #17
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils

        # This step is I/O heavy, so don't use too many cores.
        MAX_CORES = 2

        fastq_node, group_node = antecedents
        fastq_path = fastq_node.identifier
        sample_group_file = group_node.identifier
        filelib.safe_mkdir(out_path)
        metadata = {}

        module_utils.assert_sample_group_file(sample_group_file, fastq_path)
        x = module_utils.read_sample_group_file(group_node.identifier)
        x = module_utils.fix_sample_group_filenames(x, fastq_path)
        sample_groups = x

        # For merging, the order of the files in the sample_group_file
        # must be maintained.  Otherwise, the reads will be merged out
        # of order.

        # The new files should be named:
        # <Sample>.fastq          # if single end
        # <Sample>_<Pair>.fastq   # if paired end
        jobs = []
        for x in sample_groups:
            in_filename, sample, pair = x
            #in_filename = os.path.join(fastq_path, file_)
            assert os.path.exists(in_filename)

            out_file = "%s.fastq" % sample
            if pair:
                out_file = "%s_%s.fastq" % (sample, pair)
            out_filename = os.path.join(out_path, out_file)
            x = in_filename, sample, pair, out_filename
            jobs.append(x)

        out2ins = {}  # out_filename -> list of in_filenames
        for x in jobs:
            in_filename, sample, pair, out_filename = x
            if out_filename not in out2ins:
                out2ins[out_filename] = []
            out2ins[out_filename].append(in_filename)

        commands = []
        for out_filename, in_filenames in out2ins.iteritems():
            # Debugging.  Don't merge again if it already exists.
            if os.path.exists(out_filename):
                continue
            args = in_filenames, out_filename
            keywds = {}
            x = merge_or_symlink_files, args, keywds
            commands.append(x)
        commands.sort()
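
# A sketch of merge_or_symlink_files, assuming its name reflects its
# behavior: a single input file is symlinked, several are concatenated
# in the order given (which is why the sample group order matters).
def merge_or_symlink_files_sketch(in_filenames, out_filename):
    import os
    if len(in_filenames) == 1:
        os.symlink(os.path.abspath(in_filenames[0]), out_filename)
        return
    out = open(out_filename, 'w')
    for filename in in_filenames:
        handle = open(filename)
        for line in handle:
            out.write(line)
        handle.close()
    out.close()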

        nc = min(MAX_CORES, num_cores)
        parallel.pyfun(commands, nc)
        metadata["num_cores"] = nc

        # If the files are paired, make sure they are paired
        # correctly.
        sample2outfiles = {}  # sample -> list of out filenames
        for x in jobs:
            in_filename, sample, pair, out_filename = x
            if sample not in sample2outfiles:
                sample2outfiles[sample] = []
            if out_filename not in sample2outfiles[sample]:
                sample2outfiles[sample].append(out_filename)
        commands = []
        all_samples = sorted(sample2outfiles)
        for sample in all_samples:
            out_filenames = sorted(sample2outfiles[sample])
            if len(out_filenames) == 1:
                continue
            # Make sure they are aligned.
            x = check_fastq_alignment, (sample, out_filenames), {}
            commands.append(x)
        commands.sort()
        retvals = parallel.pyfun(commands, nc)
        assert len(retvals) == len(commands)

        errors = [x for x in retvals if x]
        assert not errors, "\n".join(errors)
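
# A sketch of what check_fastq_alignment might verify: paired FASTQ
# files must list the same reads in the same order.  It returns an
# error message (truthy) on a mismatch and None on success, which is
# how the retvals are interpreted above.
def check_fastq_alignment_sketch(sample, fastq_filenames):
    handles = [open(x) for x in fastq_filenames]
    record_num = 0
    while True:
        names = []
        for h in handles:
            line = h.readline().strip()
            h.readline(); h.readline(); h.readline()  # seq, "+", quality
            name = line.split()[0] if line else None
            if name and name[-2:] in ("/1", "/2"):
                name = name[:-2]  # drop old-style pair suffix
            names.append(name)
        if names == [None] * len(names):
            return None  # all files ended together; reads are aligned
        record_num += 1
        if len(set(names)) != 1:
            return "%s: reads out of order at record %d: %s" % (
                sample, record_num, names)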

        return metadata
Example #18
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        # (An explicit check for spaces in the BAM filenames used to
        # live here.  It is unnecessary now: the files are symlinked to
        # space-free names below.)

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Look at only the
        # chromosomes that are present in all of the BAM files.
        all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)
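
# A sketch of list_common_chromosomes using pysam: intersect the
# reference names declared in each BAM header.  (The real helper may
# shell out to samtools instead.)
def list_common_chromosomes_sketch(bam_filenames):
    import pysam
    common = None
    for filename in bam_filenames:
        bam = pysam.AlignmentFile(filename, "rb")
        refs = set(bam.references)
        bam.close()
        common = refs if common is None else (common & refs)
    return sorted(common or [])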

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"

        if not os.path.exists(radia_outpath):
            os.mkdir(radia_outpath)
        if not os.path.exists(filter_outpath):
            os.mkdir(filter_outpath)
        if not os.path.exists(merge_outpath):
            os.mkdir(merge_outpath)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        opj = os.path.join
        jobs = []
        for i, (normal_sample, cancer_sample) in enumerate(nc_match):
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        if not os.path.exists(normal_path):
            os.mkdir(normal_path)
        if not os.path.exists(dna_path):
            os.mkdir(dna_path)
        if not os.path.exists(rna_path):
            os.mkdir(rna_path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
            x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            clean_normal, clean_dna, clean_rna = x1, x2, x3
            x = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs[i] = x
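
# A sketch of hash_and_symlink_bamfile: give each BAM a space-free,
# collision-resistant alias inside clean_path and return the symlink.
# The hashing scheme is an assumption, and the real helper probably
# also links the .bai index alongside the BAM.
def hash_and_symlink_bamfile_sketch(bam_filename, clean_path):
    import os
    import hashlib
    x = hashlib.md5(os.path.abspath(bam_filename)).hexdigest()
    clean_filename = os.path.join(clean_path, "%s.bam" % x)
    if not os.path.exists(clean_filename):
        os.symlink(os.path.abspath(bam_filename), clean_filename)
    return clean_filename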

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add it to the path.
            x = mlib.get_config("blat", which_assert_file=True)
            path, x = os.path.split(x)
            if os.environ["PATH"]:
                path = "%s:%s" % (os.environ["PATH"], path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        # -n bam04/PIM001_G.bam \
        # -t bam04/196B-MG.bam \
        # -r bam34/196B-MG.bam \
        # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        # -o test32.vcf \
        # --dnaTumorMitochon MT \
        # --rnaTumorMitochon MT
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Only uses ~200 Mb of RAM.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure log files are empty.
        logfiles = [x[10] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.
        assert len(commands) == len(jobs)
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Takes ~10 Gb each; budget 25 Gb per process to leave headroom.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[11] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # Make sure filter_vcf_outfile exists.
        outfiles = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # the loop above generates one identical merge command per
        # chromosome.  Only one merge per sample is needed, so get rid
        # of the duplicates.
        commands = sorted({}.fromkeys(commands))
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[12] for x in jobs]
        logfiles = sorted({}.fromkeys(logfiles))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure output VCF files exist.
        x = [x[9] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Example #19
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # This step is I/O heavy, so don't use too many cores.  Also,
        # it takes 4-5 Gb of RAM per process.
        MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)
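
# A sketch of what calc_max_procs_from_ram likely computes: how many
# processes with a given RAM footprint (in Gb) fit into system memory,
# capped by upper_max.  Reading memory via os.sysconf is an assumption
# (Linux-specific).
def calc_max_procs_from_ram_sketch(gb_per_proc, upper_max=None):
    import os
    total_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    total_gb = total_bytes / float(1024 ** 3)
    n = max(1, int(total_gb / gb_per_proc))
    if upper_max is not None:
        n = min(n, upper_max)
    return n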

        fastq_node, sample_node, summary_node = antecedents
        fastq_path = fastq_node.identifier
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_path)
        assert fastq_files, "I could not find any FASTQ files."
        summary_filenames = filelib.list_files_in_path(
            summary_node.identifier, endswith=".matches.txt")
        assert summary_filenames, "No .matches.txt files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_mismatches = mlib.get_user_option(
            user_options, "num_mismatches", type=int)
        assert num_mismatches >= 0 and num_mismatches < 25
        metadata["num_mismatches"] = num_mismatches

        sample2summary = {}  # sample -> summary_filename
        for filename in summary_filenames:
            # <sample>.matches.txt
            p, f = os.path.split(filename)
            assert f.endswith(".matches.txt")
            sample = f.replace(".matches.txt", "")
            assert sample not in sample2summary
            sample2summary[sample] = filename

        # list of (sample, fastq_file1, fastq_file2, summary_filename,
        #          out_file1, out_file2, subtracted_file1, subtracted_file2)
        jobs = []
        for x in fastq_files:
            sample, pair1_fastq, pair2_fastq = x
            assert sample in sample2summary, \
                   "Missing summary for sample: %s" % sample
            p1, f1 = os.path.split(pair1_fastq)
            if pair2_fastq:
                p2, f2 = os.path.split(pair2_fastq)
                assert p1 == p2
            out1_fastq = os.path.join(out_path, f1)
            sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
            out2_fastq = None
            sub2_fastq = None
            if pair2_fastq:
                out2_fastq = os.path.join(out_path, f2)
                sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
            x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
                out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, pair1_fastq, pair2_fastq, summary_file, \
                    out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
            x = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
                num_mismatches
            x = subtract_mouse_reads, x, {}
            jobs2.append(x)
            if pair2_fastq:
                x = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                    num_mismatches
                x = subtract_mouse_reads, x, {}
                jobs2.append(x)
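
# A sketch of subtract_mouse_reads under stated assumptions: each line
# of the .matches.txt summary holds a read name and its number of
# mismatches against the mouse genome, and reads that match mouse with
# <= num_mismatches are diverted to the .subtracted file.  The real
# file format and policy may differ.
def subtract_mouse_reads_sketch(summary_file, in_fastq, out_fastq,
                                sub_fastq, num_mismatches):
    mouse_reads = {}  # read name -> 1
    for line in open(summary_file):
        cols = line.rstrip("\r\n").split("\t")
        try:
            name, mismatches = cols[0], int(cols[1])
        except (IndexError, ValueError):
            continue  # skip the header or malformed lines
        if mismatches <= num_mismatches:
            mouse_reads[name] = 1
    out = open(out_fastq, 'w')
    sub = open(sub_fastq, 'w')
    handle = open(in_fastq)
    while True:
        record = [handle.readline() for i in range(4)]
        if not record[0]:
            break
        name = record[0][1:].split()[0]  # drop "@", keep first token
        dest = sub if name in mouse_reads else out
        dest.writelines(record)
    out.close()
    sub.close()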

        nc = min(MAX_CORES, num_cores)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
        assert len(results) == len(jobs2)
        metadata["num_cores"] = nc
        
        # Make sure the fastq files were generated.
        x1 = [x[4] for x in jobs]
        x2 = [x[5] for x in jobs]
        x = x1 + x2
        x = [x for x in x if x]
        # BUG: If all reads were removed, then this will fail incorrectly.
        filelib.assert_exists_nz_many(x)

        return metadata