Example #1
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        #from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        # TODO: Merge with merge_variants_snp.py.
        #CALLERS = [
        #    "gatk", "platypus", "varscan",
        #    ]
        vcf_paths = [x.identifier for x in antecedents]
        nodes = [x.data for x in antecedents]
        CALLERS = [x.attributes["caller"] for x in nodes]
        assert len(CALLERS) == len(vcf_paths)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
        jobs = []
        for i, caller in enumerate(CALLERS):
            inpath = vcf_paths[i]
            caller_h = hashlib.hash_var(caller)
            
            vcf_files = filelib.list_files_in_path(
                inpath, endswith=".vcf", toplevel_only=True)
            for file_ in vcf_files:
                # IN_FILE:   <inpath>/<sample>.vcf
                # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
                p, sample, e = mlib.splitpath(file_)
                assert e == ".vcf"
                out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
                out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)

                x = filelib.GenericObject(
                    sample=sample, caller=caller,
                    out_vcf_path=out_vcf_path, in_vcf_file=file_,
                    out_vcf_file=out_vcf_file)
                jobs.append(x)
                
        # Make sure the same samples are found in all callers.
        caller2samples = {}
        for j in jobs:
            if j.caller not in caller2samples:
                caller2samples[j.caller] = []
            caller2samples[j.caller].append(j.sample)
        comp_samples = None
        for caller, samples in caller2samples.iteritems():
            samples = sorted(samples)
            if comp_samples is None:
                comp_samples = samples
            assert comp_samples == samples, "%s %s" % (comp_samples, samples)

        for j in jobs:
            filelib.safe_mkdir(j.out_vcf_path)
            os.symlink(j.in_vcf_file, j.out_vcf_file)

        return metadata
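
A note on the symlinking step above: os.symlink records the source path verbatim, so a relative in_vcf_file would leave a broken link inside out_vcf_path. A minimal, hypothetical helper (not part of the module above) that guards against this could look like:

import os

def symlink_into_dir(src_file, dst_file):
    # Hypothetical helper, not from the Betsy module above.  Resolve the
    # source to an absolute path so the link stays valid regardless of
    # where dst_file lives, and skip links that already exist.
    src_file = os.path.abspath(src_file)
    if not os.path.lexists(dst_file):
        os.symlink(src_file, dst_file)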
Example #2
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, gene_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        assert bam_filenames, "No bam files found."
        metadata = {}

        # Make output filenames.
        p, r, e = mlib.splitpath(gtf_file)
        bed_file = "%s.bed" % r

        # Make bed file.
        alignlib.gtf_to_bed(gtf_file, bed_file)
        #bed_file = "/data/jchang/biocore/gtf02.txt"

        # Figure out the orientation.
        x = get_paired_stranded_rseqc(bed_file, bam_filenames[0])
        single_or_paired, stranded, frac_failed, frac_first, frac_second = x

        x = mlib.Stranded(single_or_paired, stranded, frac_failed, frac_first,
                          frac_second)
        mlib.write_stranded(x, outfile)
        return metadata
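
get_paired_stranded_rseqc is defined elsewhere and not shown. A hedged sketch of what it likely wraps, assuming it shells out to RSeQC's infer_experiment.py; parsing the printed fractions into the 5-tuple unpacked above is left out here:

import subprocess

def run_infer_experiment(bed_file, bam_file):
    # Hypothetical sketch: assumes get_paired_stranded_rseqc wraps RSeQC's
    # infer_experiment.py.  The tool reports whether the library is paired
    # and the fraction of reads following each strand convention; the real
    # helper turns that report into (single_or_paired, stranded,
    # frac_failed, frac_first, frac_second).
    cmd = ["infer_experiment.py", "-i", bam_file, "-r", bed_file]
    return subprocess.check_output(cmd).splitlines()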
Example #3
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []
        for bam_filename in bam_filenames:
            x = count_duplicates, (bam_filename,), {}
            jobs.append(x)
        results = parallel.pyfun(jobs, num_procs=num_cores)
        metadata["num_cores"] = num_cores
        assert len(results) == len(bam_filenames)

        handle = open(outfile, 'w')
        header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
        print >>handle, "\t".join(header)
        for i in range(len(bam_filenames)):
            x, sample, x = mlib.splitpath(bam_filenames[i])
            total_reads, dup_reads = results[i]
            perc_dup = float(dup_reads) / total_reads * 100
            perc_dup = "%.2f" % perc_dup
            x = sample, dup_reads, total_reads, perc_dup
            print >>handle, "\t".join(map(str, x))
       
        return metadata
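
count_duplicates is defined elsewhere in this module and not shown. A sketch of what it plausibly does, using pysam (an assumption; the real helper may call samtools instead), returning (total_reads, dup_reads) to match the unpacking above:

import pysam

def count_duplicates(bam_filename):
    # Hypothetical sketch of the count_duplicates helper used above: it is
    # not shown in the example, so this assumes it tallies reads flagged as
    # PCR/optical duplicates.  Returns (total_reads, dup_reads) to match
    # the unpacking in run().
    total = dups = 0
    with pysam.AlignmentFile(bam_filename, "rb") as handle:
        for read in handle:
            total += 1
            if read.is_duplicate:
                dups += 1
    return total, dups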
Example #4
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import merge_vcf_folder

        vcffolders_node = antecedents
        filelib.safe_mkdir(out_path)
        metadata = {}

        x = os.listdir(vcffolders_node.identifier)
        x = [x for x in x if x.endswith(".vcf")]
        assert x, "No VCF folders found: %s" % vcffolders_node.identifier
        x = [os.path.join(vcffolders_node.identifier, x) for x in x]
        vcf_folders = x

        jobs = []
        for folder in vcf_folders:
            path, root, ext = mlib.splitpath(folder)
            assert ext == ".vcf"
            caller = root
            vcf_filenames = filelib.list_files_in_path(folder,
                                                       endswith=".vcf",
                                                       toplevel_only=True)
            assert vcf_filenames, "No .vcf files: %s" % folder
            out_filename = os.path.join(out_path, "%s.vcf" % root)
            tmp_path = "%s.indexed.vcf" % caller
            x = filelib.GenericObject(caller=caller,
                                      vcf_filenames=vcf_filenames,
                                      out_filename=out_filename,
                                      tmp_path=tmp_path)
            jobs.append(x)

        for j in jobs:
            m = merge_vcf_folder.merge_vcf_files(j.vcf_filenames,
                                                 j.out_filename, num_cores,
                                                 j.tmp_path)
            if "commands" not in metadata:
                metadata["commands"] = []
            metadata["commands"].extend(m["commands"])

        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
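
merge_vcf_folder.merge_vcf_files is not shown. A rough sketch of an equivalent, under the assumption that it bgzip/tabix-indexes the per-sample VCFs into tmp_path and merges the samples with bcftools, returning a metadata dict with a "commands" list as used above:

import os
import subprocess

def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    # Rough sketch only; the actual Betsy module is not shown.  num_cores
    # is accepted to match the call signature but unused here.
    if not os.path.exists(tmp_path):
        os.makedirs(tmp_path)
    gz_files = []
    for filename in vcf_filenames:
        gz = os.path.join(tmp_path, os.path.basename(filename) + ".gz")
        with open(gz, "wb") as handle:
            subprocess.check_call(["bgzip", "-c", filename], stdout=handle)
        subprocess.check_call(["tabix", "-p", "vcf", gz])
        gz_files.append(gz)
    cmd = ["bcftools", "merge", "-o", out_filename] + gz_files
    subprocess.check_call(cmd)
    return {"commands": [" ".join(cmd)]}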
Example #5
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_vcf_node, bf_vcf_node = antecedents
        in_vcf_filenames = filelib.list_files_in_path(in_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        bf_vcf_filenames = filelib.list_files_in_path(bf_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        common_only = mlib.get_user_option(user_options,
                                           "backfill_common_only",
                                           allowed_values=["no", "yes"],
                                           not_empty=True)

        in_vcf_samples = [mlib.splitpath(x)[1] for x in in_vcf_filenames]
        bf_vcf_samples = [mlib.splitpath(x)[1] for x in bf_vcf_filenames]

        # Make sure there are no duplicate sample names.
        x1 = {}.fromkeys(in_vcf_samples).keys()
        x2 = {}.fromkeys(bf_vcf_samples).keys()
        assert len(in_vcf_samples) == len(x1), "Duplicate samples"
        assert len(bf_vcf_samples) == len(x2), "Duplicate samples"

        # Find the samples.
        common = [x for x in in_vcf_samples if x in bf_vcf_samples]
        in_only = [x for x in in_vcf_samples if x not in common]
        bf_only = [x for x in bf_vcf_samples if x not in common]
        assert common, "No common samples."

        pretty_in = parselib.pretty_list(in_only, max_items=5)
        pretty_bf = parselib.pretty_list(bf_only, max_items=5)
        if common_only == "no":
            assert not (in_only and bf_only), \
                   "Extra samples in both sets:\n%s\n%s" % (
                pretty_in, pretty_bf)
            assert not in_only, "Target VCF file has extra samples: %s" % \
                   pretty_in
            assert not bf_only, "Source VCF file has extra samples: %s." % \
                   pretty_bf
        SAMPLES = common

        # list of sample, in_vcf_filename, bf_vcf_filename, out_filename
        jobs = []
        for sample in SAMPLES:
            assert sample in in_vcf_samples
            assert sample in bf_vcf_samples
            i = in_vcf_samples.index(sample)
            j = bf_vcf_samples.index(sample)
            in_filename = in_vcf_filenames[i]
            bf_filename = bf_vcf_filenames[j]
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, bf_filename, out_filename
            jobs.append(x)

        jobs2 = []
        for x in jobs:
            sample, in_filename, bf_filename, out_filename = x
            fn = backfill_vcf
            args = in_filename, bf_filename, out_filename
            keywds = {}
            jobs2.append((fn, args, keywds))
        #num_cores = 1
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
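
Several of these modules hand parallel.pyfun a list of (function, args, keywds) tuples. parallel.pyfun itself is not shown; a stand-in built on the standard multiprocessing module, illustrating the same job format (not the genomicode implementation):

import multiprocessing

def run_pyfun_jobs(jobs, num_procs=1):
    # Stand-in for parallel.pyfun, shown only to illustrate the
    # (fn, args, keywds) job format used throughout these examples.
    pool = multiprocessing.Pool(num_procs)
    results = []
    for fn, args, keywds in jobs:
        keywds = keywds or {}
        results.append(pool.apply_async(fn, args, keywds))
    pool.close()
    pool.join()
    return [r.get() for r in results]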
Example #6
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # Figure out whether the user wants SNPs or INDELs.
        #assert "vartype" in out_attributes
        #vartype = out_attributes["vartype"]
        #assert vartype in ["all", "snp", "indel"]

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (cancer_sample, normal_bamfile, tumor_bamfile, orig_outfile,
        #          fixed_outfile, filtered_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            orig_outfile = opj(out_path, "%s.raw" % sample)
            fix_outfile = opj(out_path, "%s.vcf" % sample)
            #filter_outfile = opj(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      orig_outfile=orig_outfile,
                                      fix_outfile=fix_outfile)
            jobs.append(x)

        # python /usr/local/museq/classify.py \
        #   normal:test31/normal.bam tumour:test31/tumor.bam \
        #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   model:/usr/local/museq/model_v4.1.2.npz \
        #   --config /usr/local/museq/metadata.config \
        #   -o test51.vcf
        opj = os.path.join
        museq = mlib.get_config("museq", assert_exists=True)
        classify_py = opj(museq, "classify.py")
        model_file = opj(museq, "model_v4.1.2.npz")
        config_file = opj(museq, "metadata.config")
        filelib.assert_exists_nz(classify_py)
        filelib.assert_exists_nz(model_file)
        filelib.assert_exists_nz(config_file)

        # museq's config file generates a broken VCF file.  Fix it.
        fixed_config_file = "fixed.config"
        fix_config_file(config_file, fixed_config_file)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            #cancer_sample, normal_bamfile, cancer_bamfile, \
            #               raw_outfile, fix_outfile, vcf_outfile = x

            x = [
                "python",  # should allow user to specify python
                sq(classify_py),
                sq("normal:%s" % j.normal_bamfile),
                sq("tumour:%s" % j.cancer_bamfile),
                sq("reference:%s" % ref.fasta_file_full),
                sq("model:%s" % model_file),
                "--config",
                sq(fixed_config_file),
                "-o",
                sq(j.orig_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.  On Thunderbolts test,
        # took < 1 Gb.
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # MuSeq produces non-standard VCF files.  Fix this so it
        # will work with other programs downstream.
        for j in jobs:
            #cancer_sample, normal_bamfile, cancer_bamfile, \
            #               raw_outfile, fix_outfile, vcf_outfile = x
            fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

        # Filter each of the VCF files.
        #for x in jobs:
        #    cancer_sample, normal_bamfile, cancer_bamfile, \
        #                   raw_outfile, fix_outfile, vcf_outfile = x
        #    filter_by_vartype(vartype, fix_outfile, vcf_outfile)
        #metadata["filter"] = vartype

        #x = [x[-1] for x in jobs]
        x = [j.fix_outfile for j in jobs]
        filelib.assert_exists_many(x)

        return metadata
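
mlib.calc_max_procs_from_ram(5, upper_max=num_cores) caps the process count by available memory. A guess at its logic (the real module_utils code is not shown; assumes a Linux-style os.sysconf):

import os

def calc_max_procs_from_ram(gb_per_proc, upper_max=None):
    # Hedged sketch of what calc_max_procs_from_ram probably does: cap the
    # number of parallel processes so gb_per_proc * procs fits in physical
    # RAM.  Assumes a Linux-style os.sysconf.
    total_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    total_gb = total_bytes / float(1024 ** 3)
    nprocs = max(1, int(total_gb / gb_per_proc))
    if upper_max is not None:
        nprocs = min(nprocs, upper_max)
    return nprocs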
Example #7
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        x = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #   muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
        #   logfile1, logfile2)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            muse_call_file = "%s.MuSE.txt" % muse_call_stem
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
            vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
            log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
            log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2
            jobs.append(x)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "call",
                "-O",
                muse_call_stem,
                "-f",
                sq(ref.fasta_file_full),
                cancer_bamfile,
                normal_bamfile,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile1)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[8] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the call files are created and not empty.
        call_files = [x[5] for x in jobs]
        filelib.assert_exists_nz_many(call_files)

        # Run the "sump" step.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "sump",
                "-I",
                sq(muse_call_file),
            ]
            assert wgs_or_wes in ["wgs", "wes"]
            if wgs_or_wes == "wgs":
                x += ["-G"]
            else:
                x += ["-E"]
            x += [
                "-O",
                sq(raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile2)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[9] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the raw files are created and not empty.
        vcf_files = [x[6] for x in jobs]
        filelib.assert_exists_nz_many(vcf_files)

        # Fix the files.
        commands = []  # Should be python commands.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
            x = alignlib.clean_muse_vcf, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Delete the log_outfiles if empty.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            if os.path.exists(log_outfile1):
                os.unlink(log_outfile1)
            if os.path.exists(log_outfile2):
                os.unlink(log_outfile2)

        # Make sure output VCF files exist.
        x = [x[7] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
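
The module asserts that muse_dbsnp_vcf is bgzip-compressed and tabix-indexed but does not create those files. A small sketch of how one might prepare them, assuming bgzip and tabix are on the PATH:

import subprocess

def prepare_dbsnp_for_muse(vcf_filename):
    # Not part of the module above; just one way to generate the
    # bgzip-compressed, tabix-indexed file the muse_dbsnp_vcf option needs.
    subprocess.check_call(["bgzip", vcf_filename])
    gz_filename = vcf_filename + ".gz"
    subprocess.check_call(["tabix", "-p", "vcf", gz_filename])
    return gz_filename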
Example #8
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(normal_sample=normal_sample,
                                      cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      vcf_outfile=vcf_outfile,
                                      log_outfile=log_outfile)
            jobs.append(x)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            UNHASHABLE = [
                ("I:normal", sq(normal_bamfile)),
                ("I:tumor", sq(cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            x = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            x = "%s >& %s" % (x, j.log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, j in enumerate(jobs):
            # Pull out the error lines.
            x = [x for x in open(j.log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect2 error [%s]:\n%s\n%s" % (cancer_sample, commands[i],
                                                   x)
            assert not x, msg

        # Make sure output VCF files exist.
        x = [x.vcf_outfile for x in jobs]
        filelib.assert_exists_many(x)

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
Example #9
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import parselib
        from Betsy import module_utils as mlib

        mpileup_node, nc_node = antecedents
        mpileup_filenames = filelib.list_files_in_path(mpileup_node.identifier,
                                                       endswith=".pileup")
        assert mpileup_filenames, "No .pileup files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        #ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether to call SNPs or indels.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]

        sample2pufile = {}  # sample -> mpileup filename
        for filename in mpileup_filenames:
            path, sample, ext = mlib.splitpath(filename)
            sample2pufile[sample] = filename

        # Make sure files exist for all the samples.
        all_samples = []
        for (normal_sample, cancer_sample) in nc_match:
            if normal_sample not in all_samples:
                all_samples.append(normal_sample)
            if cancer_sample not in all_samples:
                all_samples.append(cancer_sample)
        missing = [x for x in all_samples if x not in sample2pufile]
        x = parselib.pretty_list(missing, max_items=5)
        assert not missing, "Missing pileup files for samples: %s" % x

        # list of (sample, normal_sample, cancer_sample,
        #          normal_pileup, cancer_pileup,
        #          tmp1_normal, tmp1_cancer, log_filename, out_filename)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_pileup = sample2pufile[normal_sample]
            cancer_pileup = sample2pufile[cancer_sample]
            p, sample, ext = mlib.splitpath(cancer_pileup)
            tmp1_normal = opj(out_path, "%s.normal.tmp1" % sample)
            tmp1_cancer = opj(out_path, "%s.cancer.tmp1" % sample)
            log_filename = opj(out_path, "%s.log" % sample)
            out_filename = opj(out_path, "%s.vcf" % sample)
            x = sample, normal_sample, cancer_sample, \
                normal_pileup, cancer_pileup, \
                tmp1_normal, tmp1_cancer, log_filename, out_filename
            jobs.append(x)

        # VarScan will generate a "Parsing Exception" if there are 0
        # reads in a location.  Will be either "0" or blank.  Filter
        # those lines out.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            x1 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (normal_pileup,
                                                           tmp1_normal)
            x2 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (cancer_pileup,
                                                           tmp1_cancer)
            commands.extend([x1, x2])
        parallel.pshell(commands, max_procs=num_cores)
        x = [x[5] for x in jobs] + [x[6] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # java -jar VarScan.jar somatic [normal_pileup] [tumor_pileup]
        #   [output] OPTIONS
        varscan = mlib.findbin("varscan_jar")

        # Use parameters from:
        # Using VarScan 2 for Germline Variant Calling and Somatic
        # Mutation Detection

        # Make a list of commands.
        commands = []
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            x = [
                "java",
                "-jar",
                sq(varscan),
                "somatic",
                sq(tmp1_normal),
                sq(tmp1_cancer),
                sample,
                "--min-coverage",
                10,
                "--min-avg-qual",
                15,
                "--min-normal-coverage",
                10,
                "--min-tumor-coverage",
                10,
                "--min-var-freq",
                0.05,
                "--somatic-p-value",
                0.05,
                "--output-vcf",
                1,
            ]
            x = " ".join(map(str, x))
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)
        x = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Files in out_path can get very big.  Clean them up.
        # <sample>.normal.tmp1    Very big (10's Gb).
        # <sample>.cancer.tmp1    Very big (10's to 100 Gb).
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            if os.path.exists(tmp1_normal):
                os.unlink(tmp1_normal)
            if os.path.exists(tmp1_cancer):
                os.unlink(tmp1_cancer)

        # Copy the final file to the right place.
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            # Will be written in current directory.
            varscan_out = "%s.snp.vcf" % sample
            if vartype == "indel":
                varscan_out = "%s.indel.vcf" % sample
            filelib.assert_exists(varscan_out)
            shutil.copy2(varscan_out, out_filename)

        # VarScan names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            _fix_normal_cancer_names(out_filename, normal_sample,
                                     cancer_sample)

        return metadata
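
_fix_normal_cancer_names is defined elsewhere in this module. A hedged sketch of what it presumably does: VarScan labels the genotype columns NORMAL and TUMOR, and those names live on the #CHROM header line of the VCF, so rewriting that line is enough:

def _fix_normal_cancer_names(vcf_filename, normal_sample, cancer_sample):
    # Sketch only; the real helper is not shown in the example above.
    # Replace the NORMAL/TUMOR column labels on the #CHROM header line
    # with the actual sample names, in place.
    lines = open(vcf_filename).readlines()
    for i, line in enumerate(lines):
        if line.startswith("#CHROM"):
            cols = line.rstrip("\r\n").split("\t")
            cols = [normal_sample if x == "NORMAL" else x for x in cols]
            cols = [cancer_sample if x == "TUMOR" else x for x in cols]
            lines[i] = "\t".join(cols) + "\n"
    open(vcf_filename, "w").writelines(lines)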
Example #10
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          vcf_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                vcf_outfile
            jobs.append(x)

        # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   test31/tumor.bam test31/normal.bam test41.vcf
        somaticsniper = mlib.get_config("somaticsniper",
                                        which_assert_file=True)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x

            x = [
                sq(somaticsniper),
                "-q",
                1,
                "-Q",
                15,
                "-G",
                "-L",
                "-F",
                "vcf",
                "-f",
                sq(ref.fasta_file_full),
                sq(cancer_bamfile),
                sq(normal_bamfile),
                sq(vcf_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # SomaticSniper names the samples "NORMAL" and "TUMOR".
        # Replace them with the actual names.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x
            call_somatic_varscan._fix_normal_cancer_names(
                vcf_outfile, normal_sample, cancer_sample)

        x = [x[-1] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
Example #11
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            if j.pair2_h:
                assert j.pair2_h not in seen
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # If using the STAR aligner, then most memory efficient
            # way is to let STAR take care of the multiprocessing.
            nc = max(1, num_cores / len(jobs))
            if use_STAR:
                nc = num_cores

            keywds = {}
            if use_STAR:
                keywds["align_with_star"] = True
            else:
                keywds["align_with_bowtie2"] = True
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=nc,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
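
Note that hashlib here is genomicode.hashlib, not the standard library module. hash_var is not shown; a guess at its behavior, sanitizing a sample name into something safe for filenames and shell commands:

import re

def hash_var(name):
    # Assumption: genomicode.hashlib.hash_var just rewrites a string so it
    # contains only filesystem- and shell-safe characters; the real
    # implementation may differ in detail.
    x = re.sub(r"[^a-zA-Z0-9_]", "_", name)
    if x and x[0].isdigit():
        x = "X" + x
    return x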
Example #12
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        svm_node, vcf_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        metadata = {}

        # 1.  vcf_filenames
        # 2.  parsed_snpeff_files   one for each VCF file
        # 3.  merged_snpeff_file    just one file
        # 4.  clean_snpeff_file     clean up the annotations to final form
        # 5.  outfile

        merged_snpeff_file = "snpeff.merged.txt"
        cleaned_snpeff_file = "snpeff.clean.txt"

        jobs = []
        for vcf_filename in vcf_filenames:
            path, caller, ext = mlib.splitpath(vcf_filename)
            parsed_snpeff_file = "%s.parsed.txt" % caller
            j = filelib.GenericObject(
                caller=caller,
                vcf_filename=vcf_filename,
                parsed_snpeff_file=parsed_snpeff_file,
            )
            jobs.append(j)

        # Parse each of the snpeff files.
        commands = []
        for j in jobs:
            args = j.vcf_filename, j.parsed_snpeff_file
            # Debugging.  If this file exists, do not generate it
            # again.
            if os.path.exists(j.parsed_snpeff_file):
                continue
            x = parse_snpeff_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        # Merge the parsed files.
        x = [j.parsed_snpeff_file for j in jobs]
        x = [x for x in x if os.path.exists(x)]
        parsed_files = x
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(merged_snpeff_file):
            merge_parsed_files(parsed_files, merged_snpeff_file)

        # Clean up the snpEff file.  Coordinates should be unique.
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(cleaned_snpeff_file):
            clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

        # Merge the snpEff annotations into the SimpleVariantMatrix.
        add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)

        return metadata
Example #13
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out Strelka version.

        skip_depth_filter = False
        x = mlib.get_user_option(user_options,
                                 "strelka_skip_depth_filter",
                                 allowed_values=["no", "yes"],
                                 not_empty=True)
        if x == "yes":
            skip_depth_filter = True
        assert "vartype" in out_attributes, "Missing attribute: vartype"
        x = out_attributes["vartype"]
        assert x in ["snp", "indel"]
        vartype = x

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # Make sure each cancer sample is unique.  Otherwise, the
        # analysis directories will conflict.
        tumor_samples = [x[-1] for x in nc_match]
        dups = {}
        for i in range(1, len(tumor_samples)):
            if tumor_samples[i] in tumor_samples[:i]:
                dups[tumor_samples[i]] = 1
        assert not dups, "NormalCancerFile contains multiple instances of: %s"\
               % ", ".join(sorted(dups))

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          config_file, analysis_path)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            config_file = opj(out_path, "config.%s.ini" % cancer_sample)
            analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                config_file, analysis_path
            jobs.append(x)

        # Make each of the config files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            _make_config_file(config_file, skip_depth_filter=skip_depth_filter)

        # Make the analysis directories.
        jobs2 = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            fn = _make_analysis_directory
            args = (analysis_path, config_file, ref.fasta_file_full,
                    normal_bamfile, cancer_bamfile)
            keywds = None
            jobs2.append((fn, args, keywds))
        parallel.pyfun(jobs2, num_procs=num_cores)

        # Run the analysis.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            cmd = "make -j %d" % num_cores
            parallel.sshell(cmd, path=analysis_path)
        metadata["num_cores"] = num_cores

        # Make sure files exists.
        x = [x[-1] for x in jobs]
        x = [os.path.join(x, "results", "all.somatic.snvs.vcf") for x in x]
        filelib.assert_exists_nz_many(x)

        # Clean the VCF files and save into the out_path.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            # <analysis_path>/results/all.somatic.snvs.vcf
            # <analysis_path>/results/all.somatic.indels.vcf
            vartype2file = {
                "snp": "all.somatic.snvs.vcf",
                "indel": "all.somatic.indels.vcf",
            }
            assert vartype in vartype2file
            x = vartype2file[vartype]
            src_file = os.path.join(analysis_path, "results", x)
            dst_file = os.path.join(out_path, "%s.vcf" % cancer_sample)
            alignlib.clean_strelka_vcf(normal_sample, cancer_sample, src_file,
                                       dst_file)

        #metadata["commands"] = commands
        return metadata
Example #14
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sai_node, orient_node, sample_node, reference_node = \
                    antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        sai_path = sai_node.identifier
        assert filelib.dir_exists(sai_path)
        orient = mlib.read_orientation(orient_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

        # Technically, doesn't need the SampleGroupFile, since that's
        # already reflected in the sai data.  But better, because the
        # sai data might not always be generated by BETSY.

        # Find the merged fastq files.

        # Find the sai files.
        sai_filenames = filelib.list_files_in_path(
            sai_path, endswith=".sai", case_insensitive=True)
        assert sai_filenames, "No .sai files."

        bwa = mlib.findbin("bwa")
        # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
        # bwa sampe -f <output.sam> <reference.fa> <input_1.sai> <input_2.sai>
        #   <input_1.fq> <input_2.fq>

        # list of (pair1.fq, pair1.sai, pair2.fq, pair2.sai, output.sam)
        # all full paths
        jobs = []
        for x in fastq_files:
            sample, pair1_fq, pair2_fq = x

            # The sai file should be in the format:
            # <sai_path>/<sample>.sai    Single end read
            # <sai_path>/<sample>_1.sai  Paired end read
            # <sai_path>/<sample>_2.sai  Paired end read
            # Look for pair1_sai and pair2_sai.
            pair1_sai = pair2_sai = None
            for sai_filename in sai_filenames:
                p, s, e = mlib.splitpath(sai_filename)
                assert e == ".sai"
                if s == sample:
                    assert not pair1_sai
                    pair1_sai = sai_filename
                elif s == "%s_1" % (sample):
                    assert not pair1_sai
                    pair1_sai = sai_filename
                elif s == "%s_2" % (sample):
                    assert not pair2_sai
                    pair2_sai = sai_filename
            assert pair1_sai, "Missing .sai file: %s" % sample
            if pair2_fq:
                assert pair2_sai, "Missing .sai file 2: %s" % sample
            if pair2_sai:
                assert pair2_fq, "Missing .fq file 2: %s" % sample
                
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
                sam_filename, log_filename
            jobs.append(x)

        orientation = orient.orientation
        #orientation = sample_node.data.attributes["orientation"]
        assert orientation in ["single", "paired_fr", "paired_rf"]

        # Make a list of bwa commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
                    sam_filename, log_filename = x
            if orientation == "single":
                assert not pair2_fq
                assert not pair2_sai

            samse = "samse"
            if orientation.startswith("paired"):
                samse = "sampe"

            x = [
                sq(bwa),
                samse,
                "-f", sq(sam_filename),
                sq(ref.fasta_file_full),
                ]
            if orientation == "single":
                x += [
                    sq(pair1_sai),
                    sq(pair1_fq),
                ]
            else:
                y = [
                    sq(pair1_sai),
                    sq(pair2_sai),
                    sq(pair1_fq),
                    sq(pair2_fq),
                    ]
                if orientation == "paired_rf":
                    y = [
                        sq(pair2_sai),
                        sq(pair1_sai),
                        sq(pair2_fq),
                        sq(pair1_fq),
                        ]
                x += y
            x += [
                ">&", sq(log_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-2] for x in jobs]
        filelib.assert_exists_nz_many(x)
        
        return metadata
Example #15
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import ngslib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        features_bed = mlib.get_user_option(user_options,
                                            "features_bed",
                                            check_file=True)
        if features_bed:
            metadata["features_bed"] = features_bed

        # Applies to genomecov.
        min_coverage = user_options.get("ignore_coverage_below")
        if min_coverage == "":
            min_coverage = None
        if min_coverage is not None:
            min_coverage = int(min_coverage)
            assert min_coverage >= 0

        metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
        metadata["num_cores"] = num_cores
        metadata["commands"] = []

        # Set up the filenames.
        # list of (
        #   sample,
        #   orig_bam_filename,    Original bam filename.
        #   bam_filename,         bam file, after filtering out unmapped reads.
        #   genomecov_filename,   Generated by genomecov.  Histogram.
        #   histo_datafile,       Data file to generate histogram (from cov).
        #   histo_plotfile,       Histogram plot.
        #   histo_prismfile,      To make histogram in PRISM.
        #
        #   ONLY USED IF features_bed
        #   intervallist_file,    Made from BED file.
        #   cov_filename,         Generated by Picard.
        #   targetcov_filename,   Generated by Picard.  Per target coverage.
        #   log_filename,         Output from Picard.
        #   )
        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            clean_bam_filename = opj(out_path, "%s.bam" % sample)
            assert clean_bam_filename != bam_filename
            genomecov_filename = opj(out_path, "%s.genomecov.txt" % sample)
            histo_datafile = opj(out_path, "%s.histo.txt" % sample)
            histo_plotfile = opj(out_path, "%s.histo.png" % sample)
            histo_prismfile = opj(out_path, "%s.prism.txt" % sample)

            intervallist_file = opj(out_path, "%s.interval.txt" % sample)
            cov_filename = opj(out_path, "%s.coverage.txt" % sample)
            targetcov_filename = opj(out_path, "%s.targetcov.txt" % sample)
            log_filename = opj(out_path, "%s.picard.log" % sample)

            x = filelib.GenericObject(sample=sample,
                                      orig_bam_filename=bam_filename,
                                      bam_filename=clean_bam_filename,
                                      genomecov_filename=genomecov_filename,
                                      histo_datafile=histo_datafile,
                                      histo_plotfile=histo_plotfile,
                                      histo_prismfile=histo_prismfile,
                                      intervallist_file=intervallist_file,
                                      cov_filename=cov_filename,
                                      targetcov_filename=targetcov_filename,
                                      log_filename=log_filename)
            #x = sample, bam_filename, genomecov_filename, \
            #    histo_datafile, histo_plotfile, histo_prismfile, \
            #    intervallist_file, cov_filename, targetcov_filename, \
            #    log_filename
            jobs.append(x)

        # Remove unmapped reads from the BAM files.
        # Need to remove the unmapped reads or Picard might complain:
        # Exception in thread "main"
        # htsjdk.samtools.SAMFormatException: SAM validation error:
        # ERROR: Record 154286082, Read name
        # DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0
        # for unmapped read.
        #
        # This can happen with BWA generated alignments.
        cmds = []
        for x in jobs:
            x = _make_samtools_filter_cmd(x.orig_bam_filename, x.bam_filename)
            cmds.append(x)
        parallel.pshell(cmds, max_procs=num_cores)
        x = [x.bam_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Generate the intervallist_file(s).
        if features_bed:
            cmds = []
            for x in jobs:
                args = x.intervallist_file, features_bed, x.bam_filename
                x = _make_intervallist_file, args, {}
                cmds.append(x)
            parallel.pyfun(cmds, num_procs=num_cores)

        # Make the commands to run picard.
        if features_bed:
            commands = []
            for x in jobs:
                x = _make_calculatehsmetrics_command(
                    x.intervallist_file, x.bam_filename, x.cov_filename,
                    x.targetcov_filename, ref.fasta_file_full, x.log_filename)
                commands.append(x)
            metadata["commands"].append(commands)
            parallel.pshell(commands, max_procs=num_cores)

            x1 = [x.cov_filename for x in jobs]
            x2 = [x.targetcov_filename for x in jobs]
            filelib.assert_exists_nz_many(x1 + x2)

        # Use genomecov to count read depth.
        x = _run_genomecov(jobs, ref_node.identifier, num_cores)
        metadata["commands"].append(x)

        # Summarize the average read depth.
        summary_file = opj(out_path, "summary.xls")
        _summarize_average_read_depth(jobs, min_coverage, summary_file)

        # Make histograms of the distribution of the read depth for
        # each sample.
        for x in jobs:
            _make_histo_file(x.genomecov_filename, x.histo_datafile)

        # Delete the filtered BAM files to save space.
        for x in jobs:
            filelib.assert_exists_nz(x.bam_filename)
            os.unlink(x.bam_filename)
        return metadata
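# A minimal sketch (assumption; the original helper is not shown here) of
# what _make_samtools_filter_cmd above might produce: a samtools command
# that drops the unmapped reads that make Picard complain.  It assumes
# samtools is on the PATH.
def _make_samtools_filter_cmd_sketch(in_bam, out_bam):
    from genomicode import parallel
    sq = parallel.quote
    x = [
        "samtools", "view",
        "-b",             # write BAM output
        "-F", "4",        # skip reads with the "unmapped" FLAG bit (0x4) set
        "-o", sq(out_bam),
        sq(in_bam),
        ]
    return " ".join(x)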
Example #16
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import genomelib
        from genomicode import config
        from Betsy import module_utils as mlib

        fasta_node, bam_node, sample_node, orient_node = antecedents
        fasta_data = mlib.find_merged_fastq_files(sample_node.identifier,
                                                  fasta_node.identifier,
                                                  find_fasta=True)
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        # TODO: Try to figure out version.
        metadata = {}
        metadata["tool"] = "RSeQC (unknown version)"

        pyrseqc = mlib.findbin("pyrseqc")

        gene_model = mlib.get_user_option(user_options,
                                          "gene_model",
                                          not_empty=True,
                                          allowed_values=["hg19"])
        if gene_model == "hg19":
            gene_path = config.rseqc_hg19
        else:
            raise AssertionError, "Unhandled: %s" % gene_model

        assert filelib.dir_exists(gene_path)
        gene_model_bed = os.path.join(gene_path, "RefSeq.bed12")
        housekeeping_model_bed = os.path.join(gene_path,
                                              "HouseKeepingGenes.bed")

        sample2fastadata = {}
        for x in fasta_data:
            sample, f1, f2 = x
            sample2fastadata[sample] = x

        is_paired = orient.orientation.startswith("paired")

        # Guess the read length.  Read the first fasta.
        assert sample2fastadata
        x = sample2fastadata.keys()[0]
        filename = sample2fastadata[x][1]
        lengths = {}  # length -> count
        for i, x in enumerate(genomelib.read_fasta_many(filename)):
            if i >= 100:
                break
            title, sequence = x
            l = len(sequence)
            lengths[l] = lengths.get(l, 0) + 1
        # Use the most common length.
        c_length = c_count = None
        for (l, c) in lengths.iteritems():
            if c_count is None or c > c_count:
                c_length, c_count = l, c
        assert c_length
        read_length = c_length

        jobs = []  # sample, bam_filename, fasta_file1, fasta_file2, outdir
        for bam_filename in bam_filenames:
            # <path>/<sample>.bam
            p, sample, e = mlib.splitpath(bam_filename)
            assert sample in sample2fastadata
            x, f1, f2 = sample2fastadata[sample]
            outdir = os.path.join(out_path, sample)
            x = sample, bam_filename, f1, f2, outdir
            jobs.append(x)

        # Some of the RSeQC modules use a lot of memory.  We have seen a
        # Python process take 33 Gb, and an R process take 200 Gb.
        # However, most of the modules use much less memory.  So run one
        # pyrseqc job at a time, and let each job parallelize its own
        # processes.  This is probably slower than running multiple
        # pyrseqc jobs at once, but it uses less memory.
        commands = []
        for x in jobs:
            sample, bam_filename, fasta_filename1, fasta_filename2, outdir = x

            # pyrseqc.py -j 20 --paired_end rqc11.bam rqc14.fa 76 \
            #   mod07.txt hg19.HouseKeepingGenes.bed rqc21 --dry_run
            x = [
                mlib.sq(pyrseqc),
                "-j",
                str(num_cores),
            ]
            if is_paired:
                x += ["--paired_end"]
            x += [
                mlib.sq(bam_filename),
                mlib.sq(fasta_filename1),
                str(read_length),
                mlib.sq(gene_model_bed),
                mlib.sq(housekeeping_model_bed),
                mlib.sq(outdir),
            ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        # pyrseqc takes up to ~40 Gb per process.
        # read_distribution.py takes 33 Gb.
        # read_quality.py spins off an R process that takes ~200 Gb.
        # Make sure we don't use up more memory than is available on
        # the machine.
        #nc = mlib.calc_max_procs_from_ram(60, upper_max=num_cores)
        #metadata["num cores"] = nc
        #x = parallel.pshell(commands, max_procs=nc)

        # Because of memory, just run one at a time, but each one, use
        # multiple cores.
        for cmd in commands:
            x = parallel.sshell(cmd)
            assert x.find("Traceback") < 0, x

        filelib.assert_exists_nz(out_path)

        return metadata
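# Equivalent sketch (illustrative only): the read-length guess above can be
# written more compactly with collections.Counter.  It assumes, as the loop
# above implies, that genomelib.read_fasta_many yields (title, sequence)
# tuples.
def _guess_read_length_sketch(fasta_filename, max_reads=100):
    from collections import Counter
    from genomicode import genomelib
    counts = Counter()
    for i, (title, sequence) in enumerate(
        genomelib.read_fasta_many(fasta_filename)):
        if i >= max_reads:
            break
        counts[len(sequence)] += 1
    assert counts, "Empty fasta file: %s" % fasta_filename
    # most_common(1) returns [(length, count)] for the most frequent length.
    return counts.most_common(1)[0][0]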
Example #17
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        MAX_CORES = 4  # I/O intensive.

        fastq_node, sample_node, bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        sample2fastq = mlib.find_merged_fastq_files(sample_node.identifier,
                                                    fastq_node.identifier,
                                                    as_dict=True)

        metadata = {}

        jobs = []  # list of (sample, bam_file, fastq_file)
        for filename in bam_filenames:
            path, sample, ext = mlib.splitpath(filename)
            assert sample in sample2fastq, "Missing fastq: %s" % sample
            fastq1, fastq2 = sample2fastq[sample]
            x = sample, filename, fastq1
            jobs.append(x)

        funcalls = []
        for x in jobs:
            sample, bam_filename, fastq_filename = x
            # Count the number of reads.
            x1 = count_reads, (fastq_filename, ), {}
            # Count the number of alignments.
            x2 = count_alignments, (bam_filename, ), {}
            funcalls.append(x1)
            funcalls.append(x2)
        assert len(funcalls) == len(jobs) * 2

        nc = min(num_cores, MAX_CORES)
        results = parallel.pyfun(funcalls, num_procs=nc)
        metadata["num_cores"] = nc

        # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
        results2 = []
        for i, x in enumerate(jobs):
            sample, bam_filename, fastq_filename = x
            x1 = results[i * 2]
            x2 = results[i * 2 + 1]
            total_reads = x1
            aligned_reads, alignments = x2
            perc_aligned = float(aligned_reads) / total_reads
            x = sample, alignments, aligned_reads, total_reads, perc_aligned
            results2.append(x)
        results = results2

        # sort by sample name
        results.sort()

        # Make table where the rows are the samples and the columns
        # are the statistics.
        table = []
        header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
                  "Perc Aligned")
        table.append(header)
        for x in results:
            sample, alignments, aligned_reads, total_reads, perc_aligned = x

            x1 = parselib.pretty_int(alignments)
            x2 = parselib.pretty_int(aligned_reads)
            x3 = parselib.pretty_int(total_reads)
            x4 = "%.2f%%" % (perc_aligned * 100)
            x = sample, x1, x2, x3, x4
            assert len(x) == len(header)
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(x)
        handle.close()

        txt2xls = mlib.findbin("txt2xls", quote=True)
        parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))
        return metadata
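# Hypothetical sketches (assumptions; the originals are not shown) of the
# two helpers used above.  count_reads assumes a plain, uncompressed FASTQ
# file where each record is exactly four lines.  count_alignments is shown
# as a thin wrapper around "samtools view -c", which may differ from the
# original implementation; it returns (aligned_reads, alignments) to match
# the unpacking above.
def count_reads_sketch(fastq_filename):
    n = 0
    for line in open(fastq_filename):
        n += 1
    assert n % 4 == 0, "Truncated FASTQ file: %s" % fastq_filename
    return n // 4

def count_alignments_sketch(bam_filename):
    import subprocess
    # Total number of alignment records in the BAM file.
    x = subprocess.check_output(["samtools", "view", "-c", bam_filename])
    alignments = int(x.strip())
    # Reads with a mapped, primary alignment.  2308 = 0x4 (unmapped) +
    # 0x100 (secondary) + 0x800 (supplementary).
    x = subprocess.check_output(
        ["samtools", "view", "-c", "-F", "2308", bam_filename])
    aligned_reads = int(x.strip())
    return aligned_reads, alignments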
Example #18
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_filenames = mlib.find_bam_files(in_data.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bam2fastx (unknown version)"

        # bam2fastx appeared to fail when the filenames contained spaces
        # (although that may not have been bam2fastx's fault).  The
        # workaround of copying to space-free temporary names and renaming
        # them afterwards is left commented out below.

        jobs = []
        for i, bam_filename in enumerate(bam_filenames):
            p, f, e = mlib.splitpath(bam_filename)
            #bai_filename = alignlib.find_bai_file(bam_filename)
            #assert bai_filename, "Missing index for: %s" % bam_filename
            #temp_bam_filename = "%d.bam" % i
            #temp_bai_filename = "%d.bam.bai" % i
            #temp_fa_filename = "%d.fa" % i
            fa_filename = os.path.join(out_path, "%s.fa" % f)
            x = filelib.GenericObject(
                bam_filename=bam_filename,
                #bai_filename=bai_filename,
                #temp_bam_filename=temp_bam_filename,
                #temp_bai_filename=temp_bai_filename,
                #temp_fa_filename=temp_fa_filename,
                fa_filename=fa_filename)
            jobs.append(x)
        bam2fastx = mlib.findbin("bam2fastx")

        # Link all the bam files.
        #for j in jobs:
        #    assert not os.path.exists(j.temp_bam_filename)
        #    #assert not os.path.exists(j.temp_bai_filename)
        #    os.symlink(j.bam_filename, j.temp_bam_filename)
        #    #os.symlink(j.bai_filename, j.temp_bai_filename)

        commands = []
        for j in jobs:
            # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
            x = [
                mlib.sq(bam2fastx),
                "-A",
                "--fasta",
                #"-o", mlib.sq(j.temp_fa_filename),
                #mlib.sq(j.temp_bam_filename),
                "-o", mlib.sq(j.fa_filename),
                mlib.sq(j.bam_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_fa_filename, j.fa_filename)
        #    # Remove the link to the BAM file.
        #    os.unlink(j.temp_bam_filename)
        
        x = [x.fa_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
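    # Side note (not part of the original module): recent samtools releases
    # can do the same conversion directly, e.g. "samtools fasta reads.bam >
    # reads.fa", which may be a simpler fallback if bam2fastx is not
    # available.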
Example #19
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        #import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_node = in_data
        summary_filename = summary_node.identifier
        metadata = {}

        buildver = mlib.get_user_option(user_options,
                                        "annovar_buildver",
                                        allowed_values=["hg19"],
                                        not_empty=True)

        # Name files.
        p, root, ext = mlib.splitpath(summary_filename)
        annovar_infile = "pos.txt"
        log_filename = "annovar.log"
        # Annovar takes a filestem, without the ".vcf".
        annovar_outstem = "annotations"
        # Produces file:
        # <annovar_outstem>.hg19_multianno.txt
        multianno_file = "%s.hg19_multianno.txt" % annovar_outstem
        #temp_file = "temp.txt"

        # Make the infile for Annovar.
        # <chrom> <start> <end> <ref> <alt>
        handle = open(annovar_infile, 'w')
        for d in filelib.read_row(summary_filename, skip=2, header=1):
            x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
            print >> handle, "\t".join(x)
        handle.close()

        cmd = alignlib.make_annovar_command(annovar_infile,
                                            log_filename,
                                            annovar_outstem,
                                            buildver,
                                            vcf_input=False)
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        filelib.assert_exists_nz(log_filename)
        filelib.assert_exists_nz(multianno_file)

        matrix = SimpleVariantMatrix.read(summary_filename)
        annot_matrix = matrix.annot_matrix
        #headers = annot_matrix.headers + anno_header[5:]
        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        pos = [int(x) for x in pos]

        # Read in the multianno output file.
        pos2d = {}  # (chrom, start, ref, alt) -> d
        anno_header = None
        for d in filelib.read_row(multianno_file, header=1):
            key = d.Chr, int(d.Start), d.Ref, d.Alt
            assert key not in pos2d, "Duplicate pos: %s" % str(key)
            pos2d[key] = d
            if not anno_header:
                anno_header = d._header
        assert anno_header

        # Multianno starts with:
        # Chr Start End Ref Alt
        # Ignore these.
        assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
        headers = anno_header[5:]

        all_annots = []
        #for h in annot_matrix.headers_h:
        #    x = annot_matrix.header2annots[h]
        #    all_annots.append(x)
        for i in range(5, len(anno_header)):
            annots = []
            for coord in zip(chrom, pos, ref, alt):
                d = pos2d.get(coord)
                x = ""
                if d:
                    x = d._cols[i]
                annots.append(x)
            all_annots.append(annots)
        x = AnnotationMatrix.create_from_annotations(headers, all_annots)
        matrix.named_matrices.insert(0, ("Annovar", x))

        SimpleVariantMatrix.write(out_filename, matrix)

        ## cols_to_add = len(anno_header) - 5
        ## assert cols_to_add > 0

        ## # Merge the multianno file with the simple call summary.  Add
        ## # these columns before the <Sample>.
        ## # Sample                <Sample>
        ## # Caller                <Caller>
        ## # Chrom  Pos  Ref  Alt  Ref/Alt/VAF
        ## handle = open(temp_file, 'w')
        ## it = filelib.read_cols(summary_filename)
        ## header1 = it.next()
        ## header2 = it.next()
        ## header3 = it.next()
        ## assert len(header1) == len(header2), "%d %d %d %s" % (
        ##     len(header1), len(header2), len(header3), summary_filename)
        ## assert len(header1) == len(header3), "%d %d %d %s" % (
        ##     len(header1), len(header2), len(header3), summary_filename)
        ## assert header1[0] == "Sample"
        ## assert header2[0] == "Caller"
        ## assert header3[:4] == ["Chrom", "Pos", "Ref", "Alt"]
        ## header1 = header1[:4] + [""]*cols_to_add + header1[4:]
        ## header2 = header2[:4] + [""]*cols_to_add + header2[4:]
        ## header3 = header3[:4] + anno_header[5:] + header3[4:]
        ## print >>handle, "\t".join(header1)
        ## print >>handle, "\t".join(header2)
        ## print >>handle, "\t".join(header3)
        ## for cols in it:
        ##     chrom, pos, ref, alt = cols[:4]
        ##     pos = int(pos)
        ##     d = pos2d.get((chrom, pos))
        ##     if not d:
        ##         cols = cols[:4] + [""]*cols_to_add + cols[4:]
        ##         continue
        ##     assert ref == d.Ref, "%s %s %s %s %s %s" % (
        ##         chrom, pos, ref, alt, d.Ref, d.Alt)
        ##     assert alt == d.Alt, "%s %s %s %s %s %s" % (
        ##         chrom, pos, ref, alt, d.Ref, d.Alt)
        ##     x = d._cols[5:]
        ##     assert len(x) == cols_to_add
        ##     cols = cols[:4] + x + cols[4:]
        ##     print >>handle, "\t".join(cols)
        ## handle.close()

        ## shutil.move(temp_file, out_filename)

        return metadata
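        # Sketch only: alignlib.make_annovar_command is not shown here.
        # Given the "<outstem>.hg19_multianno.txt" output named above, it
        # presumably wraps ANNOVAR's table_annovar.pl, e.g. something like:
        #   table_annovar.pl pos.txt <humandb/> -buildver hg19 \
        #     -out annotations -remove -protocol refGene -operation g \
        #     >& annovar.log
        # The database path and the protocol/operation lists are guesses.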
Example #20
0
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    # Put indexed files in tmp_path.
    import os
    import stat
    import shutil
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # TODO: find the version number of these tools.
    bgzip = mlib.findbin("bgzip")
    tabix = mlib.findbin("tabix")
    bcftools = mlib.findbin("bcftools")
    sq = parallel.quote

    tmp_path = os.path.realpath(tmp_path)
    filelib.safe_mkdir(tmp_path)

    # Keep track of all commands run.
    metadata = {}
    metadata["commands"] = []

    # Ignore VCF files that don't have any variants.
    vcf_filenames = [x for x in vcf_filenames if os.stat(x)[stat.ST_SIZE] > 0]

    # If there are no VCF files with any variants, then just create an
    # empty outfile and return.
    if not vcf_filenames:
        open(out_filename, 'w').close()
        return metadata

    # 1.  Copy VCF files to temporary directory.             tmp_filename
    # 2.  Fix VCF files (e.g. NextGENe, JointSNVMix broken)
    # 3.  Sort the VCF files (needed for tabix)
    # 4.  Compress  (bgzip)
    # 5.  Index     (tabix)
    # 6.  Merge

    jobs = []
    for in_filename in vcf_filenames:
        path, root, ext = mlib.splitpath(in_filename)
        sample = root
        x = "%s%s" % (hashlib.hash_var(root), ext)
        tmp_filename = os.path.join(tmp_path, x)
        x = filelib.GenericObject(
            sample=sample,
            in_filename=in_filename,
            tmp_filename=tmp_filename,
        )
        jobs.append(x)

    # Make sure temporary files are unique.
    seen = {}
    for j in jobs:
        assert j.tmp_filename not in seen
        seen[j.tmp_filename] = 1

    # Merge them in order of sample.  The germline sample will be
    # duplicated, and we will know the order of the germline sample.
    schwartz = [(x.sample, x) for x in jobs]
    schwartz.sort()
    jobs = [x[-1] for x in schwartz]

    # Copy all the VCF files to a temporary directory.
    for j in jobs:
        shutil.copy2(j.in_filename, j.tmp_filename)

    #for j in jobs:
    #    make_file_smaller(j.tmp_filename, 1000)

    for j in jobs:
        # NextGENe creates broken VCF files.  Fix them.
        fix_nextgene_vcf(j.tmp_filename)
        # JointSNVMix creates broken VCF files.  Fix them.
        fix_jointsnvmix_vcf(j.tmp_filename)

    for j in jobs:
        sort_vcf_file(j.tmp_filename)

    ## # Since we are merging the files, we need to make sure that
    ## # each file has a unique name.  If the names aren't unique,
    ## # then make them unique by adding the name of the file.
    ## all_unique = True
    ## seen = {}
    ## for x in jobs:
    ##     sample, in_filename, tmp_filename = x
    ##     samples = _get_samples_from_vcf(tmp_filename)
    ##     for s in samples:
    ##         if s in seen:
    ##             all_unique = False
    ##             break
    ##         seen[s] = 1
    ##     if not all_unique:
    ##         break
    ## if not all_unique:
    ##     for x in jobs:
    ##         sample, in_filename, tmp_filename = x
    ##         _uniquify_samples_in_vcf(tmp_filename, sample)

    # Compress the VCF files.
    # bgzip file.vcf
    commands = []
    for j in jobs:
        x = "%s %s" % (sq(bgzip), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    metadata["num_cores"] = num_cores
    x = ["%s.gz" % x.tmp_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Index the VCF files.
    # tabix -p vcf file.vcf.gz
    commands = []
    for j in jobs:
        x = "%s -p vcf %s.gz" % (sq(tabix), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    x = ["%s.gz.tbi" % j.tmp_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    # Run bcftools
    ## For VCF files from somatic calls, the germline sample will
    ## be duplicated.  Add --force-samples to make sure this is
    ## still merged.

    # Since we need to append all the VCF files, it's easy to run
    # into error:
    # OSError: [Errno 7] Argument list too long
    #
    # To reduce the chance of this, figure out the path of the
    # tmp_filename, and run the analysis in that path so we can
    # use relative filenames.
    tmp_path = None
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        if tmp_path is None:
            tmp_path = path
        assert path == tmp_path

    cmd = [
        sq(bcftools),
        "merge",
        "-o %s" % sq(out_filename),
        "-O v",
        "--force-samples",
    ]
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        assert path == tmp_path
        cmd.append("%s.gz" % file_)
    x = " ".join(cmd)
    parallel.sshell(x, path=tmp_path)
    metadata["commands"].append(x)

    return metadata
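# A minimal sketch (assumption; the original sort_vcf_file is not shown) of
# the sorting step used above: keep the "#" header lines first and sort the
# records by chromosome and position, which is the ordering tabix expects.
# It reads the whole file into memory, so it is only suitable for
# modestly-sized VCF files.
def sort_vcf_file_sketch(filename):
    lines = open(filename).readlines()
    header = [x for x in lines if x.startswith("#")]
    records = [x for x in lines if not x.startswith("#")]
    def sortkey(line):
        cols = line.rstrip("\r\n").split("\t")
        return cols[0], int(cols[1])    # (CHROM, POS)
    records.sort(key=sortkey)
    handle = open(filename, 'w')
    handle.writelines(header + records)
    handle.close()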
Example #21
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # java -jar picard.jar CollectAlignmentSummaryMetrics \
        #   R=reference_sequence.fasta \
        #   I=input.bam \
        #   O=output.txt
        opj = os.path.join
        jobs = []   # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            out_filename = opj(out_path, "%s.alignment_metrics.txt" % sample)
            log_filename = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                out_filename=out_filename,
                log_filename=log_filename)
            jobs.append(x)

        # Make the commands to run picard.
        picard_jar = alignlib.find_picard_jar("picard")
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Should have better way of getting java path.
            cmd = [
                "java",
                "-Xmx10g",
                "-jar", sq(picard_jar), "CollectAlignmentSummaryMetrics",
                "I=%s" % sq(j.bam_filename),
                "R=%s" % sq(ref.fasta_file_full),
                "O=%s" % sq(j.out_filename),
                ]
            cmd = " ".join(cmd)
            cmd = "%s >& %s" % (cmd, sq(j.log_filename))
            commands.append(cmd)

        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Summarize the insert size files.
        outfile = opj(out_path, "summary.txt")
        _summarize_alignment_summary_metrics(jobs, outfile)
        filelib.assert_exists_nz(outfile)

        return metadata
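# A possible sketch (assumption; the original helper is not shown) of
# _summarize_alignment_summary_metrics.  Picard writes its metrics as a
# small tab-delimited table after the "## METRICS CLASS" line; this sketch
# pulls that table out of each sample's file and writes one combined table
# with the sample name prepended.
def _summarize_alignment_summary_metrics_sketch(jobs, outfile):
    handle = open(outfile, 'w')
    wrote_header = False
    for j in jobs:
        lines = [x.rstrip("\r\n") for x in open(j.out_filename)]
        # Drop comment and blank lines; the first remaining line is the
        # header of the metrics table, followed by one row per category.
        lines = [x for x in lines if x and not x.startswith("#")]
        assert lines, "No metrics found: %s" % j.out_filename
        header, rows = lines[0], lines[1:]
        if not wrote_header:
            print >> handle, "Sample\t%s" % header
            wrote_header = True
        for row in rows:
            print >> handle, "%s\t%s" % (j.sample, row)
    handle.close()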
Example #22
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        vcf_node, nc_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Filenames:
        # <caller>.vcf

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        genome = mlib.get_user_option(user_options,
                                      "snpeff_genome",
                                      not_empty=True)
        databases = list_snpeff_databases()
        assert genome in databases, "Unknown genome database: %s" % genome

        # For each caller, do the SnpEFF calls.  Some callers include
        # the somatic information, others do not.  If germline samples
        # are present, then do with _cancer.  Otherwise, do not.

        # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
        #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

        # Don't bother annotating positions that do not pass filter.
        # Filter them out first based on FILTER column.

        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            path, stem, ext = mlib.splitpath(in_filename)
            samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
            filtered_filename = opj(out_path, "%s.filtered_input" % stem)
            out_filename = opj(out_path, "%s.vcf" % stem)
            log_filename = opj(out_path, "%s.log" % stem)
            x = filelib.GenericObject(stem=stem,
                                      in_filename=in_filename,
                                      samples_file=samples_file,
                                      filtered_filename=filtered_filename,
                                      out_filename=out_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # First, filter each of the VCF files.
        commands = []
        for j in jobs:
            # For debugging.  If this file exists, don't filter it again.
            if os.path.exists(j.filtered_filename):
                continue
            args = j.in_filename, j.filtered_filename, wgs_or_wes
            x = vcflib.filter_vcf_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Make the cancer_samples files.
        for j in jobs:
            # Will generate this if there are cancer samples.
            make_cancer_samples_file(j.filtered_filename, nc_match,
                                     j.samples_file)

        # Make a list of commands.
        commands = []
        for j in jobs:
            cancer = False
            if os.path.exists(j.samples_file):
                cancer = True
            x = make_snpeff_command(j.filtered_filename,
                                    genome,
                                    j.out_filename,
                                    j.log_filename,
                                    is_cancer=cancer,
                                    cancer_samples_file=j.samples_file)
            commands.append(x)

        nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        # Make sure the analysis completed successfully.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Log files should be empty.
        for j in jobs:
            filelib.assert_exists(j.log_filename)
            assert not filelib.exists_nz(j.log_filename), \
                   "Error with %s.\n%s" % (j.stem, j.log_filename)
            filelib.safe_unlink(j.log_filename)

        return metadata
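# A sketch of make_snpeff_command, reconstructed from the example command
# in the comment above (assumptions: snpeff_jar points at snpEff.jar, and
# -Xmx16g matches the 16 Gb per process assumed by calc_max_procs_from_ram).
def make_snpeff_command_sketch(
    in_file, genome, out_file, log_file, is_cancer=False,
    cancer_samples_file=None, snpeff_jar="snpEff.jar"):
    from genomicode import parallel
    sq = parallel.quote
    cmd = ["java", "-Xmx16g", "-jar", sq(snpeff_jar), "-v"]
    if is_cancer:
        assert cancer_samples_file
        cmd += ["-cancer", "-cancerSamples", sq(cancer_samples_file)]
    cmd += [genome, sq(in_file)]
    cmd += ["1>", sq(out_file), "2>", sq(log_file)]
    return " ".join(cmd)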
Example #23
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #   cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
        #   vcf_outfile, log_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            call_outfile = opj(out_path, "%s.call_stats.out" % sample)
            cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile
            jobs.append(x)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x

            UNHASHABLE = [
                ("input_file:normal", sq(normal_bamfile)),
                ("input_file:tumor", sq(cancer_bamfile)),
                ]
            x = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(call_outfile),
                coverage_file=sq(cov_outfile),
                vcf=sq(raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            x = "%s >& %s" % (x, log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            # Pull out the error lines.
            x = [x for x in open(log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                cancer_sample, commands[i], x)
            assert not x, msg

        # Make sure output VCF files exist.
        x = [x[6] for x in jobs]
        filelib.assert_exists_many(x)

        # Fix the files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            alignlib.clean_mutect_vcf(
                normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
                raw_vcf_outfile, vcf_outfile)
            
        return metadata
Example #24
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import config
        from Betsy import module_utils as mlib

        mpileup_node = in_data
        mpileup_filenames = filelib.list_files_in_path(mpileup_node.identifier,
                                                       endswith=".pileup")
        assert mpileup_filenames, "No .pileup files."
        #nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        #ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # Figure out whether we are calling SNPs or indels, and pick the
        # matching VarScan tool.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        tool = "mpileup2snp"
        if vartype == "indel":
            tool = "mpileup2indel"

        # list of (sample, in_filename, tmp1_filename, tmp2_filename,
        #          out_filename)
        jobs = []
        for in_filename in mpileup_filenames:
            p, sample, ext = mlib.splitpath(in_filename)
            tmp1_filename = os.path.join(out_path, "%s.tmp1" % sample)
            tmp2_filename = os.path.join(out_path, "%s.tmp2" % sample)
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, tmp1_filename, tmp2_filename, out_filename
            jobs.append(x)

        # VarScan will generate a "Parsing Exception" if there are 0
        # reads in a location.  Filter those out.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = "awk -F'\t' '$4 != 0 {print}' %s > %s" % (in_filename,
                                                          tmp1_filename)
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)
        x = [x[2] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # java -jar /usr/local/bin/VarScan.jar <tool> $i --output_vcf 1 > $j
        varscan = filelib.which_assert(config.varscan_jar)

        # Make a list of commands.
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = [
                "java",
                "-jar",
                sq(varscan),
                tool,
                tmp1_filename,
                "--p-value",
                0.05,
                "--output-vcf",
                1,
            ]
            x = " ".join(map(str, x))
            x = "%s >& %s" % (x, tmp2_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)
        x = [x[3] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Clean up the VCF files.  VarScan leaves extraneous lines
        # there.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            alignlib.clean_varscan_vcf(sample, tmp2_filename, out_filename)
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # The tmp files are really big.  Don't save those.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            filelib.safe_unlink(tmp1_filename)
            filelib.safe_unlink(tmp2_filename)