def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Merge each caller's folder of VCF files into a single VCF file.

    antecedents is a single node whose identifier is a directory that
    contains sub-folders named "<caller>.vcf".  The top-level .vcf files
    of every sub-folder are merged into <out_path>/<caller>.vcf with
    merge_vcf_folder.merge_vcf_files.

    Returns a metadata dict; its "commands" entry accumulates the
    "commands" reported by each merge.  Raises AssertionError if no
    caller folder is found, a folder is empty, or an output is missing.
    """
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib
    import merge_vcf_folder

    vcffolders_node = antecedents
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Every caller's results live in a folder whose name ends in ".vcf".
    top_dir = vcffolders_node.identifier
    vcf_folders = [
        os.path.join(top_dir, name)
        for name in os.listdir(top_dir)
        if name.endswith(".vcf")
    ]
    assert vcf_folders, \
        "No VCF folders found: %s" % vcffolders_node.identifier

    # Describe all the merges first, so that a bad folder is detected
    # before any merging starts.
    jobs = []
    for folder in vcf_folders:
        _, caller, ext = mlib.splitpath(folder)
        assert ext == ".vcf"
        vcf_filenames = filelib.list_files_in_path(
            folder, endswith=".vcf", toplevel_only=True)
        assert vcf_filenames, "No .vcf files: %s" % folder
        job = filelib.GenericObject(
            caller=caller,
            vcf_filenames=vcf_filenames,
            out_filename=os.path.join(out_path, "%s.vcf" % caller),
            tmp_path="%s.indexed.vcf" % caller,
        )
        jobs.append(job)

    for job in jobs:
        result = merge_vcf_folder.merge_vcf_files(
            job.vcf_filenames, job.out_filename, num_cores, job.tmp_path)
        metadata.setdefault("commands", []).extend(result["commands"])

    filelib.assert_exists_many([job.out_filename for job in jobs])
    return metadata
def get_radia_files(radia_path, assembly):
    """Locate the scripts and data directories of a RADIA installation.

    radia_path is the root of the RADIA distribution and assembly is the
    genome assembly name used in the "data/<assembly>/..." sub-paths.
    The three scripts and two blacklist files must exist and be
    non-empty, and the directories must exist; the checks are done by
    filelib.  Returns a RadiaFiles object built from the located paths.
    """
    import os
    from genomicode import filelib

    join = os.path.join

    def data_path(template):
        # Path below radia_path for a template with one %s (assembly).
        return join(radia_path, template % assembly)

    # Scripts.
    scripts_dir = join(radia_path, "scripts")
    radia_py = join(radia_path, "scripts", "radia.py")
    filterRadia_py = join(radia_path, "scripts", "filterRadia.py")
    mergeChroms_py = join(radia_path, "scripts", "mergeChroms.py")

    # Assembly-specific data.  The original code notes: for hg19 only.
    blacklist_dir = data_path("data/%s/blacklists/1000Genomes/phase1")
    snp_dir = data_path("data/%s/snp135")
    retro_dir = data_path("data/%s/retroGenes")
    pseudo_dir = data_path("data/%s/pseudoGenes")
    cosmic_dir = data_path("data/%s/cosmic")
    target_dir = data_path("data/%s/gaf/2_1")

    # Assembly-independent blacklists.
    rnageneblck_file = join(radia_path, "data/rnaGeneBlacklist.tab")
    rnagenefamilyblck_file = join(
        radia_path, "data/rnaGeneFamilyBlacklist.tab")

    filelib.assert_exists_nz_many([
        radia_py,
        filterRadia_py,
        mergeChroms_py,
        rnageneblck_file,
        rnagenefamilyblck_file,
    ])
    filelib.assert_exists_many([
        scripts_dir,
        blacklist_dir,
        snp_dir,
        retro_dir,
        pseudo_dir,
        cosmic_dir,
        target_dir,
    ])

    return RadiaFiles(
        radia_py, filterRadia_py, mergeChroms_py,
        scripts_dir, blacklist_dir, snp_dir, retro_dir, pseudo_dir,
        cosmic_dir, target_dir, rnageneblck_file, rnagenefamilyblck_file)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Filter every VCF file under in_data by variant type.

    Each .vcf file (case-insensitive) found under in_data.identifier is
    passed through filter_by_vartype and written to out_path under the
    same file name.  The variant type comes from
    out_attributes["vartype"] and must be "all", "snp" or "indel".

    Returns a metadata dict with "num_cores".  Raises AssertionError on
    a bad vartype or a missing output file.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel

    vcf_node = in_data
    vcf_files = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", case_insensitive=True)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # (in_vcf_filename, out_vcf_filename) pairs; outputs keep the name
    # of the input file.
    jobs = [
        (in_file, os.path.join(out_path, os.path.basename(in_file)))
        for in_file in vcf_files
    ]

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel"]

    # One (function, args, keywds) call per file.
    commands = [
        (filter_by_vartype, (vartype, in_file, out_file), {})
        for (in_file, out_file) in jobs
    ]
    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    out_files = [out_file for (_, out_file) in jobs]
    filelib.assert_exists_many(out_files)
    return metadata
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with MuTect for every normal/cancer pair.

    antecedents is (bam_node, nc_node, ref_node, interval_node): a
    folder of BAM files, the normal/cancer pairing file, the reference
    genome and the intervals file.  The user options "mutect_cosmic_vcf"
    and "mutect_dbsnp_vcf" are required.  For each pair, named after the
    cancer BAM file, this writes <sample>.call_stats.out,
    <sample>.coverage.wig.txt, <sample>.vcf.raw, <sample>.log and the
    cleaned <sample>.vcf into out_path.

    Returns a metadata dict with "num_cores" and "commands".  Raises
    AssertionError on missing inputs, on a MuTect error found in a log
    file, or on a missing raw VCF file.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out MuTect version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    _, _, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job per normal/cancer pair.  Named attributes are used rather
    # than positional tuples so that no step depends on a magic index.
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        # Output files are named after the cancer BAM file.
        _, sample, _ = mlib.splitpath(cancer_bamfile)
        job = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            call_outfile=opj(out_path, "%s.call_stats.out" % sample),
            cov_outfile=opj(out_path, "%s.coverage.wig.txt" % sample),
            raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % sample),
            vcf_outfile=opj(out_path, "%s.vcf" % sample),
            log_outfile=opj(out_path, "%s.log" % sample),
        )
        jobs.append(job)

    # java -Xmx2g -jar muTect.jar
    #   --analysis_type MuTect
    #   --reference_sequence <reference>
    #   --cosmic <cosmic.vcf>
    #   --dbsnp <dbsnp.vcf>
    #   --intervals <intervals_to_process>
    #   --input_file:normal <normal.bam>
    #   --input_file:tumor <tumor.bam>
    #   --out <call_stats.out>
    #   --coverage_file <coverage.wig.txt>

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for job in jobs:
        # "input_file:normal" is not a valid keyword argument name, so
        # these options are passed as a list of pairs.
        UNHASHABLE = [
            ("input_file:normal", sq(job.normal_bamfile)),
            ("input_file:tumor", sq(job.cancer_bamfile)),
        ]
        cmd = alignlib.make_MuTect_command(
            analysis_type="MuTect",
            reference_sequence=sq(ref.fasta_file_full),
            cosmic=sq(cosmic_file),
            dbsnp=sq(dbsnp_file),
            intervals=sq(interval_node.identifier),
            out=sq(job.call_outfile),
            coverage_file=sq(job.cov_outfile),
            vcf=sq(job.raw_vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
        )
        cmd = "%s >& %s" % (cmd, job.log_outfile)
        commands.append(cmd)
    assert len(commands) == len(jobs)

    # Limit the number of processes by RAM (15 is presumably Gb per
    # process -- confirm against calc_max_procs_from_ram).
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for job, command in zip(jobs, commands):
        # Pull out the error lines.  The log is closed as soon as it
        # has been scanned.
        with open(job.log_outfile) as handle:
            errors = [
                line for line in handle if line.startswith("##### ERROR")]
        errors = "".join(errors)
        msg = "MuTect error [%s]:\n%s\n%s" % (
            job.cancer_sample, command, errors)
        assert not errors, msg

    # Make sure the raw VCF files written by MuTect exist.
    filelib.assert_exists_many([job.raw_vcf_outfile for job in jobs])

    # Fix the files.
    for job in jobs:
        alignlib.clean_mutect_vcf(
            job.normal_bamfile, job.cancer_bamfile,
            job.normal_sample, job.cancer_sample,
            job.raw_vcf_outfile, job.vcf_outfile)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run Pindel on every BAM file.

    antecedents is (bam_node, ref_node, insert_size_node,
    alignment_node).  The last two are directories holding a
    "summary.txt" that is read with _read_insert_sizes and
    _read_fragment_sizes.  For each sample this writes
    <sample>.config.txt, <sample>.log and the Pindel outputs
    <sample>_D, _SI, _LI, _INV, _TD and _BP into out_path.  BAM files
    and their indexes are symlinked into a local "bam" directory.

    Returns a metadata dict with "num_cores" and "commands".  Raises
    AssertionError when an input, a sample's size information or an
    output file is missing.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node, insert_size_node, alignment_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # ./pindel -f <reference.fa> -i <bam_configuration_file>
    #   -c <chromosome_name> -o <out_prefix>
    #   -T <num threads>
    #
    # Creates files:
    # <out_prefix>_D     Deletion
    # <out_prefix>_SI    Short insertion
    # <out_prefix>_LI    Long insertion
    # <out_prefix>_INV   Inversion
    # <out_prefix>_TD    Tandem deletion
    # <out_prefix>_BP    Breakpoint
    # <out_prefix>_RP    ??? read pair???
    # <out_prefix>_CloseEndMapped   Only on end could be mapped.

    # Pindel cannot handle spaces in the BAM filenames (because of
    # the config file).  Symlink the file to a local directory to make
    # sure there are no spaces.
    bam_path = "bam"

    join = os.path.join
    jobs = []   # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        sample = os.path.splitext(os.path.basename(bam_filename))[0]
        bai_filename = "%s.bai" % bam_filename
        filelib.assert_exists_nz(bai_filename)
        # The symlinks use the sample name with spaces replaced.
        safe_name = sample.replace(" ", "_")
        job = filelib.GenericObject(
            sample=sample,
            bam_filename=bam_filename,
            bai_filename=bai_filename,
            local_bam=join(bam_path, "%s.bam" % safe_name),
            local_bai=join(bam_path, "%s.bam.bai" % safe_name),
            config_filename=join(out_path, "%s.config.txt" % sample),
            out_prefix=join(out_path, sample),
            log_filename=join(out_path, "%s.log" % sample),
        )
        jobs.append(job)

    filelib.safe_mkdir(bam_path)
    for job in jobs:
        assert " " not in job.local_bam
        filelib.assert_exists_nz(job.bam_filename)
        filelib.assert_exists_nz(job.bai_filename)
        # Links left over from a previous run are reused.
        for source, link in [
                (job.bam_filename, job.local_bam),
                (job.bai_filename, job.local_bai)]:
            if not os.path.exists(link):
                os.symlink(source, link)

    # Read the insert sizes.
    summary_file = join(insert_size_node.identifier, "summary.txt")
    filelib.assert_exists_nz(summary_file)
    sample2size = _read_insert_sizes(summary_file)
    # Make sure all the samples have inserts.
    for job in jobs:
        assert job.sample in sample2size, \
            "Missing in insert size file: %s" % job.sample

    # Read the fragment sizes.
    summary_file = join(alignment_node.identifier, "summary.txt")
    filelib.assert_exists_nz(summary_file)
    sample2readlen = _read_fragment_sizes(summary_file)
    # Make sure all the samples have read lengths.
    for job in jobs:
        assert job.sample in sample2readlen, \
            "Missing in alignment summary file: %s" % job.sample

    # Make the config file.
    for job in jobs:
        # <insert size> is the whole length to be sequenced, including
        # the length of the pair of reads.  Picard only counts the
        # sequence between the reads.
        insert_size = (
            sample2size[job.sample] + sample2readlen[job.sample] * 2)
        config_line = "%s %s %s" % (job.local_bam, insert_size, job.sample)
        handle = open(job.config_filename, 'w')
        handle.write(config_line + "\n")
        handle.close()

    # Make a list of commands.
    pindel = mlib.get_config("pindel", which_assert_file=True)
    sq = parallel.quote
    commands = []
    for job in jobs:
        words = [
            sq(pindel),
            "-f", sq(ref.fasta_file_full),
            "-i", sq(job.config_filename),
            "-c", "ALL",
            "-T", 1,
            "-o", sq(job.out_prefix),
        ]
        command = " ".join(str(word) for word in words)
        commands.append("%s >& %s" % (command, job.log_filename))
    parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.  If not, try
    # to diagnose.
    filelib.assert_exists_nz_many([job.log_filename for job in jobs])
    # Checked in the same order as before: all _D files, then all _SI
    # files, and so on.
    templates = ["%s_D", "%s_SI", "%s_LI", "%s_INV", "%s_TD", "%s_BP"]
    outputs = [
        template % job.out_prefix for template in templates for job in jobs]
    filelib.assert_exists_many(outputs)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with museq for every normal/cancer pair.

    antecedents is (bam_node, nc_node, ref_node): a folder of BAM files,
    the normal/cancer pairing file and the reference genome.  For each
    pair, named after the cancer BAM file, museq's classify.py writes
    <sample>.raw into out_path, and fix_vcf_file converts it to
    <sample>.vcf.  A corrected copy of museq's metadata.config is
    written to "fixed.config" in the current directory.

    Returns a metadata dict with "num_cores" and "commands".  Raises
    AssertionError when an input, a museq file or an output VCF file is
    missing.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job per normal/cancer pair.
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        # Output files are named after the cancer BAM file.
        _, sample, _ = mlib.splitpath(cancer_bamfile)
        job = filelib.GenericObject(
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            orig_outfile=opj(out_path, "%s.raw" % sample),
            fix_outfile=opj(out_path, "%s.vcf" % sample),
        )
        jobs.append(job)

    # python /usr/local/museq/classify.py \
    #   normal:test31/normal.bam tumour:test31/tumor.bam \
    #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   model:/usr/local/museq/model_v4.1.2.npz \
    #   --config /usr/local/museq/metadata.config \
    #   -o test51.vcf
    museq = mlib.get_config("museq", assert_exists=True)
    classify_py = opj(museq, "classify.py")
    model_file = opj(museq, "model_v4.1.2.npz")
    config_file = opj(museq, "metadata.config")
    filelib.assert_exists_nz(classify_py)
    filelib.assert_exists_nz(model_file)
    filelib.assert_exists_nz(config_file)

    # museq's config file generates a broken VCF file.  Fix it.
    fixed_config_file = "fixed.config"
    fix_config_file(config_file, fixed_config_file)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for job in jobs:
        words = [
            "python",   # should allow user to specify python
            sq(classify_py),
            sq("normal:%s" % job.normal_bamfile),
            sq("tumour:%s" % job.cancer_bamfile),
            sq("reference:%s" % ref.fasta_file_full),
            sq("model:%s" % model_file),
            "--config", sq(fixed_config_file),
            "-o", sq(job.orig_outfile),
        ]
        commands.append(" ".join(map(str, words)))
    # Not sure how much RAM this takes.  On Thunderbolts test,
    # took < 1 Gb.
    nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # The raw output is not a standard VCF file.  Fix it so it will
    # work with other programs downstream.  (The original comment named
    # JointSNVMix here, presumably copied from another module.)
    for job in jobs:
        fix_vcf_file(job.cancer_sample, job.orig_outfile, job.fix_outfile)

    # Check the output of every job.  The comprehension must iterate
    # over its own variable; previously it reused the loop variable
    # left over from the loop above, so only the last job was checked.
    filelib.assert_exists_many([job.fix_outfile for job in jobs])

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with MuSE for every normal/cancer pair.

    antecedents is (bam_node, nc_node, ref_node): a folder of BAM files,
    the normal/cancer pairing file and the reference genome.  The user
    options "wgs_or_wes" ("wgs" or "wes") and "muse_dbsnp_vcf" (a
    .vcf.gz file with a .tbi index next to it) are required.

    For each pair, "MuSE call" writes <cancer_sample>.call.MuSE.txt,
    "MuSE sump" writes <cancer_sample>.vcf.raw, and
    alignlib.clean_muse_vcf writes <cancer_sample>.vcf, all in out_path.
    The two log files of each pair must be empty and are deleted at the
    end.

    Returns a metadata dict with "tool", "num_cores" and "commands".
    Raises AssertionError on missing inputs, non-empty log files or
    missing outputs.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    dbsnp_file = mlib.get_user_option(
        user_options, "muse_dbsnp_vcf", not_empty=True, check_file=True)

    # Make sure dbsnp_file is compressed and indexed.
    assert dbsnp_file.endswith(".vcf.gz"), \
        "muse_dbsnp_vcf must be bgzip compressed."
    index_file = "%s.tbi" % dbsnp_file
    assert filelib.exists_nz(index_file), \
        "muse_dbsnp_vcf must be tabix indexed."

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job per normal/cancer pair.  Output files are named after the
    # cancer sample.
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
        job = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=sample2bamfile[normal_sample],
            cancer_bamfile=sample2bamfile[cancer_sample],
            muse_call_stem=muse_call_stem,
            # "MuSE call -O <stem>" writes <stem>.MuSE.txt.
            muse_call_file="%s.MuSE.txt" % muse_call_stem,
            raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % cancer_sample),
            vcf_outfile=opj(out_path, "%s.vcf" % cancer_sample),
            log_outfile1=opj(out_path, "%s.call.log" % cancer_sample),
            log_outfile2=opj(out_path, "%s.sump.log" % cancer_sample),
        )
        jobs.append(job)

    # Generate the commands.
    # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
    #   bam04/196B-MG.bam bam04/PIM001_G.bam
    # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
    #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz
    MuSE = mlib.findbin("muse")

    sq = mlib.sq
    commands = []
    for job in jobs:
        # Every path is quoted so that file names containing spaces do
        # not break the shell command.
        words = [
            sq(MuSE),
            "call",
            "-O", sq(job.muse_call_stem),
            "-f", sq(ref.fasta_file_full),
            sq(job.cancer_bamfile),
            sq(job.normal_bamfile),
        ]
        cmd = " ".join(words)
        commands.append("%s >& %s" % (cmd, job.log_outfile1))
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    filelib.assert_exists_z_many([job.log_outfile1 for job in jobs])

    # Make sure the call files are created and not empty.
    filelib.assert_exists_nz_many([job.muse_call_file for job in jobs])

    # Run the "sump" step.
    assert wgs_or_wes in ["wgs", "wes"]
    if wgs_or_wes == "wgs":
        data_type_flag = "-G"
    else:
        data_type_flag = "-E"
    commands = []
    for job in jobs:
        words = [
            sq(MuSE),
            "sump",
            "-I", sq(job.muse_call_file),
            data_type_flag,
            "-O", sq(job.raw_vcf_outfile),
            "-D", sq(dbsnp_file),
        ]
        cmd = " ".join(words)
        commands.append("%s >& %s" % (cmd, job.log_outfile2))
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = metadata["commands"] + commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    filelib.assert_exists_z_many([job.log_outfile2 for job in jobs])

    # Make sure the raw files are created and not empty.
    filelib.assert_exists_nz_many([job.raw_vcf_outfile for job in jobs])

    # Fix the files.  These are python calls: (function, args, keywds).
    commands = []
    for job in jobs:
        args = (
            job.normal_sample, job.cancer_sample,
            job.raw_vcf_outfile, job.vcf_outfile)
        commands.append((alignlib.clean_muse_vcf, args, {}))
    parallel.pyfun(commands, num_procs=num_cores)

    # Delete the log files.  They were verified to be empty above.
    for job in jobs:
        for log_file in (job.log_outfile1, job.log_outfile2):
            if os.path.exists(log_file):
                os.unlink(log_file)

    # Make sure output VCF files exist.
    filelib.assert_exists_many([job.vcf_outfile for job in jobs])

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with GATK MuTect2 for every normal/cancer pair.

    antecedents is (bam_node, nc_node, ref_node, interval_node): a
    folder of BAM files, the normal/cancer pairing file, the reference
    genome and the intervals file.  The user options "mutect_cosmic_vcf"
    and "mutect_dbsnp_vcf" are required.  For each pair, named after the
    cancer BAM file, this writes <sample>.vcf and <sample>.log into
    out_path, then replaces the sample names in the VCF file with the
    actual normal and cancer sample names.

    Returns a metadata dict with "num_cores" and "commands".  Raises
    AssertionError on missing inputs, on a GATK error found in a log
    file, or on a missing VCF file.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    _, _, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job per normal/cancer pair.
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        # Output files are named after the cancer BAM file.
        _, sample, _ = mlib.splitpath(cancer_bamfile)
        job = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            vcf_outfile=opj(out_path, "%s.vcf" % sample),
            log_outfile=opj(out_path, "%s.log" % sample),
        )
        jobs.append(job)

    # java -jar GenomeAnalysisTK.jar \
    #   -T MuTect2 \
    #   -R reference.fasta \
    #   -I:tumor tumor.bam \
    #   -I:normal normal.bam \
    #   [--dbsnp dbSNP.vcf] \
    #   [--cosmic COSMIC.vcf] \
    #   [-L targets.interval_list] \
    #   -o output.vcf

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for job in jobs:
        # Use this job's own BAM files.  Reading the bare
        # normal_bamfile/cancer_bamfile names here would pick up the
        # values left over from the last pair of the loop above and run
        # every command on the same two files.
        UNHASHABLE = [
            ("I:normal", sq(job.normal_bamfile)),
            ("I:tumor", sq(job.cancer_bamfile)),
            # --dbsnp and --cosmic use two dashes, for some
            # reason.  Since make_GATK_command only uses one dash,
            # add one manually.
            ("-dbsnp", sq(dbsnp_file)),
            ("-cosmic", sq(cosmic_file)),
        ]
        cmd = alignlib.make_GATK_command(
            T="MuTect2",
            R=sq(ref.fasta_file_full),
            L=sq(interval_node.identifier),
            o=sq(job.vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
        )
        cmd = "%s >& %s" % (cmd, job.log_outfile)
        commands.append(cmd)
    assert len(commands) == len(jobs)

    # Limit the number of processes by RAM (25 is presumably Gb per
    # process -- confirm against calc_max_procs_from_ram).
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for job, command in zip(jobs, commands):
        # Pull out the error lines.  The log is closed as soon as it
        # has been scanned.
        with open(job.log_outfile) as handle:
            errors = [
                line for line in handle if line.startswith("##### ERROR")]
        errors = "".join(errors)
        # Report the sample of the job that failed.
        msg = "MuTect2 error [%s]:\n%s\n%s" % (
            job.cancer_sample, command, errors)
        assert not errors, msg

    # Make sure output VCF files exist.
    filelib.assert_exists_many([job.vcf_outfile for job in jobs])

    # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
    # them with the actual names.
    for job in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            job.vcf_outfile, job.normal_sample, job.cancer_sample)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run "samtools mpileup" on every BAM file at the given positions.

    antecedents is (bam_node, ref_node, pos_node): a folder of BAM
    files, the reference genome and a two-column file of 0-based
    (chrom, pos) positions.  The positions are rewritten with 1-based
    coordinates to "positions.txt" in the current directory.  For each
    BAM file this writes <sample>.pileup and <sample>.log into out_path.
    out_attributes["vartype"] must be "all", "snp", "indel" or
    "consensus".

    Returns a metadata dict with "commands".  Raises AssertionError on
    missing inputs, malformed positions, a bad vartype or missing
    outputs; the log files are checked with check_log_file.
    """
    import os
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils

    bam_node, ref_node, pos_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Positions file has 0-based coordinates (like BAM files).
    # But samtools requires 1-based coordinates.  Convert to
    # 1-based coordinates.
    positions_filename = "positions.txt"
    outhandle = open(positions_filename, 'w')
    for cols in filelib.read_cols(pos_node.identifier):
        assert len(cols) == 2
        chrom, pos = cols
        one_based = int(pos) + 1   # convert from 0- to 1-based coords.
        outhandle.write("%s\t%s\n" % (str(chrom), str(one_based)))
    outhandle.close()

    # One job per BAM file, named after the BAM file.
    jobs = []
    for in_filename in bam_filenames:
        sample = os.path.splitext(os.path.basename(in_filename))[0]
        job = filelib.GenericObject(
            in_filename=in_filename,
            err_filename=os.path.join(out_path, "%s.log" % sample),
            out_filename=os.path.join(out_path, "%s.pileup" % sample),
        )
        jobs.append(job)

    # The variant type is validated, but currently does not change the
    # samtools parameters.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel", "consensus"]

    # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
    #   $i > $j"
    samtools = filelib.which_assert(config.samtools)

    # Get an error if the BAM files are not indexed.
    # [W::bam_hdr_read] EOF marker is absent. The input is probably
    # truncated.

    args = [
        "-R",           # Ignore read group tags.
        "-B",           # Disable BAQ (base quality) computation.
        "-q", 0,        # Skip bases with mapQ smaller than this.
        "-Q", 0,        # Skip bases with BAQ smaller than this.
        "-d10000000",   # Allow deep reads.
    ]

    sq = parallel.quote
    commands = []
    for job in jobs:
        # positions_filename is always set, so -l is always given.
        words = [
            sq(samtools), "mpileup",
            "-f", sq(ref.fasta_file_full),
            "-l", positions_filename,
        ]
        words += args
        words.append(sq(job.in_filename))
        command = " ".join(str(word) for word in words)
        commands.append(
            "%s 2> %s 1> %s" % (command, job.err_filename, job.out_filename))

    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands

    # File may be empty if there are no reads.
    filelib.assert_exists_many([job.out_filename for job in jobs])

    # Make sure there's no errors in the log files.
    for job in jobs:
        check_log_file(job.err_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with SomaticSniper for every normal/cancer pair.

    antecedents is (bam_node, nc_node, ref_node): a folder of BAM files,
    the normal/cancer pairing file and the reference genome.  For each
    pair this writes <sample>.vcf into out_path, named after the cancer
    BAM file, and then replaces the sample names in it with the actual
    normal and cancer sample names.

    Returns a metadata dict with "num_cores" and "commands".  Raises
    AssertionError when an input or an output VCF file is missing.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job per normal/cancer pair.
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        cancer_bamfile = sample2bamfile[cancer_sample]
        # The output file is named after the cancer BAM file.
        _, sample, _ = mlib.splitpath(cancer_bamfile)
        job = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=sample2bamfile[normal_sample],
            cancer_bamfile=cancer_bamfile,
            vcf_outfile=os.path.join(out_path, "%s.vcf" % sample),
        )
        jobs.append(job)

    # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   test31/tumor.bam test31/normal.bam test41.vcf
    somaticsniper = mlib.get_config("somaticsniper", which_assert_file=True)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for job in jobs:
        words = [
            sq(somaticsniper),
            "-q", 1,
            "-Q", 15,
            "-G",
            "-L",
            "-F", "vcf",
            "-f", sq(ref.fasta_file_full),
            sq(job.cancer_bamfile),
            sq(job.normal_bamfile),
            sq(job.vcf_outfile),
        ]
        commands.append(" ".join(str(word) for word in words))
    # Not sure how much RAM this takes.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # SomaticSniper names the samples "NORMAL" and "TUMOR".
    # Replace them with the actual names.
    for job in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            job.vcf_outfile, job.normal_sample, job.cancer_sample)

    filelib.assert_exists_many([job.vcf_outfile for job in jobs])
    return metadata