def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Symlink per-caller VCFs into <out_path>/<caller>.vcf/<sample>.vcf.

    Each antecedent is a folder of VCF files from one variant caller.
    Verifies that every caller produced the same set of samples.
    Returns a metadata dict (currently empty).
    """
    import os
    from genomicode import filelib
    #from genomicode import parallel
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    # TODO: Merge with merge_variants_snp.py.

    vcf_paths = [x.identifier for x in antecedents]
    nodes = [x.data for x in antecedents]
    CALLERS = [x.attributes["caller"] for x in nodes]
    assert len(CALLERS) == len(vcf_paths)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
    jobs = []
    for i, caller in enumerate(CALLERS):
        inpath = vcf_paths[i]
        # Hash the caller name so it is safe to use in a path.
        caller_h = hashlib.hash_var(caller)
        vcf_files = filelib.list_files_in_path(
            inpath, endswith=".vcf", toplevel_only=True)
        for file_ in vcf_files:
            # IN_FILE:   <inpath>/<sample>.vcf
            # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
            p, sample, e = mlib.splitpath(file_)
            assert e == ".vcf"
            out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
            out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)
            x = filelib.GenericObject(
                sample=sample, caller=caller, out_vcf_path=out_vcf_path,
                in_vcf_file=file_, out_vcf_file=out_vcf_file)
            jobs.append(x)

    # Make sure the same samples are found in all callers.
    caller2samples = {}
    for j in jobs:
        if j.caller not in caller2samples:
            caller2samples[j.caller] = []
        caller2samples[j.caller].append(j.sample)
    comp_samples = None
    # BUG FIX: iteritems() is Python 2 only.  items() behaves the
    # same here and works on both Python 2 and 3.
    for caller, samples in caller2samples.items():
        samples = sorted(samples)
        if comp_samples is None:
            comp_samples = samples
        assert comp_samples == samples, "%s %s" % (comp_samples, samples)

    for j in jobs:
        filelib.safe_mkdir(j.out_vcf_path)
        os.symlink(j.in_vcf_file, j.out_vcf_file)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Uncompress FASTQ files (.gz/.bz2/.xz) into out_path in parallel.

    Files without a recognized compression suffix keep their names.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    # This is I/O heavy, don't use so many cores.
    MAX_CORES = 4

    filelib.safe_mkdir(out_path)
    filenames = module_utils.find_fastq_files(in_data.identifier)
    assert filenames, "I could not find any FASTQ files."
    REMOVE = [".gz", ".bz2", ".xz"]

    # Uncompress the files to the new directory in parallel.
    commands = []       # list of (function, args, keywds) for pyfun
    out_filenames = []  # track the outputs so we can verify them below
    for in_filename in filenames:
        in_path, in_file = os.path.split(in_filename)
        # Strip a single compression suffix, if present.
        x = in_file
        for r in REMOVE:
            if x.lower().endswith(r):
                x = x[:-len(r)]
        out_file = x
        out_filename = os.path.join(out_path, out_file)
        out_filenames.append(out_filename)
        args = in_filename, out_filename
        keywds = {}
        x = uncompress_file, args, keywds
        commands.append(x)
    nc = min(MAX_CORES, num_cores)
    parallel.pyfun(commands, num_procs=nc)

    # BUG FIX: verify the uncompressed files were actually created,
    # consistent with the other modules in this pipeline.
    filelib.assert_exists_nz_many(out_filenames)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Run k-means clustering (cluster 3.0) and copy results to out_path."""
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils as mlib
    import cluster_genes_by_hierarchical as clust

    filelib.safe_mkdir(out_path)
    metadata = {}

    k = mlib.get_user_option(
        user_options, "kmeans_k", not_empty=True, type=int)
    assert k >= 2 and k < 100

    cmd, cluster_files = clust.run_cluster30(
        in_data.identifier, "kmeans", user_options, kmeans_k=k)
    metadata["command"] = cmd

    # Map the cluster30 output keys onto our output file names.
    # "cdt" is required; "kag" and "kgg" are copied only if present.
    assert "cdt" in cluster_files
    key2outname = [
        ("cdt", "signal.cdt"),
        ("kag", "array_cluster.kag"),
        ("kgg", "gene_cluster.kgg"),
        ]
    for key, out_name in key2outname:
        if key in cluster_files:
            shutil.copy2(
                cluster_files[key], os.path.join(out_path, out_name))
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Summarize each BAM file into <sample>.matches.txt, in parallel."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    bam_filenames = module_utils.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)

    def _out_name(bam_file):
        # <path>/<sample>.bam  ->  <out_path>/<sample>.matches.txt
        stem = os.path.splitext(os.path.split(bam_file)[1])[0]
        return os.path.join(out_path, "%s.matches.txt" % stem)

    # list of (in_filename, out_filename)
    jobs = [(f, _out_name(f)) for f in bam_filenames]

    # (function, args, keywds) triples for parallel.pyfun.
    jobs2 = [(summarize_bam_file, (src, dst), None) for src, dst in jobs]
    parallel.pyfun(jobs2, num_procs=num_cores, DELAY=0.1)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([dst for (src, dst) in jobs])
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Run GATK SplitNCigarReads on each BAM, retrying failures with more RAM.

    First pass runs every file with 5 Gb per process; any file whose
    output is missing or empty is rerun once with MAX_RAM.  Raises if
    any output is still missing after the retry.  Returns metadata with
    the commands run and the process count.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    MAX_RAM = 64  # maximum amount of ram to use in Gb.

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % s)
        # Output keeps the same file name as the input BAM.
        out_filename = os.path.join(out_path, f)
        x = in_filename, log_filename, out_filename
        jobs.append(x)

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
    #   -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
    #   -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
    #   -U ALLOW_N_CIGAR_READS

    # Start with 5 Gb RAM.  make_commands is defined elsewhere in
    # this module.
    commands = make_commands(jobs, ref.fasta_file_full, 5)
    # Limit concurrency by available RAM (5 Gb per process).
    nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_procs"] = nc

    # If any of the analyses didn't finish (probably ran out of
    # memory), try again with more RAM.
    jobs2 = []
    for x in jobs:
        in_filename, log_filename, out_filename = x
        if filelib.exists_nz(out_filename):
            continue  # this one succeeded; no retry needed
        jobs2.append(x)
    if jobs2:
        commands = make_commands(jobs2, ref.fasta_file_full, MAX_RAM)
        # Fewer processes fit when each gets MAX_RAM.
        nc = mlib.calc_max_procs_from_ram(MAX_RAM, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        # Record the retry commands alongside the first-pass ones.
        metadata["commands"] += commands

    # Make sure the analysis completed successfully.
    out_filenames = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(out_filenames)
    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Symlink BAM files into out_path and index them with samtools."""
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bam_path = in_data.identifier
    assert os.path.exists(bam_path)
    assert os.path.isdir(bam_path)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # Find all the BAM files.
    bam_filenames = filelib.list_files_in_path(
        bam_path, endswith=".bam", case_insensitive=True)

    jobs = []  # list of in_filename, out_filename
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        assert not os.path.exists(out_filename)
        x = in_filename, out_filename
        jobs.append(x)

    # Symlink the BAM files to the output path.
    for x in jobs:
        in_filename, out_filename = x
        os.symlink(in_filename, out_filename)

    # Index each of the files.
    sq = parallel.quote
    samtools = filelib.which_assert(config.samtools)
    commands = []
    for x in jobs:
        in_filename, out_filename = x
        cmd = [
            sq(samtools),
            "index",
            sq(out_filename),
            ]
        x = " ".join(cmd)
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores, path=out_path)

    # BUG FIX (was a TODO): verify the indexes were created.
    # "samtools index <f>.bam" writes the index to <f>.bam.bai by
    # default.
    x = ["%s.bai" % x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run samtools mpileup on each BAM file against the reference genome."""
    import os
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # One (in_filename, err_filename, out_filename) per sample.
    jobs = []
    for bam_filename in bam_filenames:
        sample = os.path.splitext(os.path.split(bam_filename)[1])[0]
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        jobs.append((bam_filename, err_filename, out_filename))

    # samtools mpileup -f [reference sequence] [BAM file(s)]
    #   > myData.mpileup
    samtools = mlib.findbin("samtools")
    sq = mlib.sq
    commands = []
    for bam_filename, err_filename, out_filename in jobs:
        cmd = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            ]
        cmd.append(sq(bam_filename))
        cmd = " ".join(map(str, cmd))
        # stderr to the log, stdout to the pileup file.
        cmd = "%s 2> %s 1> %s" % (cmd, err_filename, out_filename)
        commands.append(cmd)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([x[-1] for x in jobs])
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run Picard ReorderSam on each BAM file against the reference."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    in_filenames = module_utils.find_bam_files(bam_node.identifier)
    # Robustness: fail early with a clear message, like the sibling
    # modules do.
    assert in_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
    #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
    picard_jar = alignlib.find_picard_jar("picard")

    jobs = []  # list of (in_filename, out_filename)
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        x = in_filename, out_filename
        jobs.append(x)

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for x in jobs:
        in_filename, out_filename = x
        x = [
            "java", "-Xmx5g",
            "-jar", sq(picard_jar), "ReorderSam",
            "I=%s" % sq(in_filename),
            "O=%s" % sq(out_filename),
            # BUG FIX: quote the reference path like the other
            # arguments, so a path with spaces or shell
            # metacharacters cannot break the command.
            "REFERENCE=%s" % sq(ref.fasta_file_full),
            ]
        x = " ".join(x)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for x in jobs:
        in_filename, out_filename = x
        filelib.assert_exists_nz(out_filename)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Align merged FASTQ files with bowtie2, one SAM file per sample."""
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Make a list of the jobs to run.
    jobs = []  # (sample, pair1, pair2, sam_filename, log_filename)
    for x in fastq_files:
        sample, pair1, pair2 = x
        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, sam_filename, log_filename
        jobs.append(x)

    # Give each concurrent job an equal share of the cores.
    # BUG FIX: this calculation is loop-invariant, so hoist it out of
    # the loop; use floor division so the thread count stays an int
    # under Python 3 (num_cores / len(jobs) is a float there).
    nc = max(1, num_cores // len(jobs)) if jobs else 1

    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1, pair2, sam_filename, log_filename = x
        x = alignlib.make_bowtie2_command(
            ref.fasta_file_full, pair1, fastq_file2=pair2,
            sam_file=sam_filename, num_threads=nc)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Symlink STAR pass-2 BAM files to <out_path>/<sample>.bam."""
    import os
    from genomicode import filelib

    # If align_with_star is run with two_pass=yes, this will leave
    # two BAM files for every sample.
    # p1.<sample>.Aligned.out.bam   pass 1
    # <sample>.Aligned.out.bam      pass 2
    # Make sure to ignore the pass1 files.
    bam_filenames = filelib.list_files_in_path(
        in_data.identifier, endswith=".Aligned.out.bam",
        file_not_startswith="p1.")
    if not bam_filenames:
        # Give a more informative error if the aligner left SAM
        # files instead of BAM files.
        sam_filenames = filelib.list_files_in_path(
            in_data.identifier, endswith=".Aligned.out.sam")
        if sam_filenames:
            assert bam_filenames, \
                "No .Aligned.out.bam files. Looks like .sam generated."
    assert bam_filenames, "No .Aligned.out.bam files."
    filelib.safe_mkdir(out_path)

    jobs = []  # list of (in_filename, out_filename)
    for in_filename in bam_filenames:
        # in_filename has format:
        # <path>/<sample>.Aligned.out.sam
        f = os.path.split(in_filename)[1]
        sample, suffix = f.split(".", 1)
        assert suffix == "Aligned.out.bam", f
        out_filename = os.path.join(out_path, "%s.bam" % sample)
        assert in_filename != out_filename
        jobs.append((in_filename, out_filename))

    # Make sure outfiles are unique.
    out_filenames = [dst for (src, dst) in jobs]
    assert len(jobs) == len({}.fromkeys(out_filenames)), \
        "Duplicate sample names."

    for src, dst in jobs:
        os.symlink(src, dst)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Convert each .sam file to .bam with "samtools view", in parallel."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sam_filenames = mlib.find_sam_files(in_data.identifier)
    assert sam_filenames, "No .sam files."
    filelib.safe_mkdir(out_path)
    metadata = {}
    samtools = mlib.findbin("samtools")

    jobs = []  # list of (sam_filename, bam_filename)
    for sam_filename in sam_filenames:
        f = os.path.split(sam_filename)[1]
        assert f.endswith(".sam")
        bam_filename = os.path.join(out_path, f.replace(".sam", ".bam"))
        jobs.append((sam_filename, bam_filename))

    # samtools view -bS -o <bam_filename> <sam_filename>
    sq = parallel.quote
    commands = [
        " ".join([
            sq(samtools), "view", "-bS",
            "-o", sq(bam_filename),
            sq(sam_filename),
            ])
        for sam_filename, bam_filename in jobs]
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([x[-1] for x in jobs])
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Annotate each VCF file with Annovar.

    Output files are named <filestem>.<buildver>_multianno.vcf in
    out_path, plus a per-file log.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    vcf_node = in_data
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    filelib.safe_mkdir(out_path)

    buildver = module_utils.get_user_option(
        user_options, "buildver", allowed_values=["hg19"], not_empty=True)

    jobs = []  # list of (in_filename, log_filename, out_filestem)
    for in_filename in vcf_filenames:
        # Annovar takes a filestem, without the ".vcf".
        p, f = os.path.split(in_filename)
        # CLEANUP: the extension was previously bound to an unused
        # local; just discard it.
        f = os.path.splitext(f)[0]
        log_filename = os.path.join(out_path, "%s.log" % f)
        out_filestem = os.path.join(out_path, f)
        x = in_filename, log_filename, out_filestem
        jobs.append(x)

    # Make a list of commands.
    commands = []
    for x in jobs:
        in_filename, log_filename, out_filestem = x
        x = alignlib.make_annovar_command(
            in_filename, log_filename, out_filestem, buildver)
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-1] for x in jobs]  # out_filestems
    x = ["%s.%s_multianno.vcf" % (x, buildver) for x in x]
    filelib.assert_exists_nz_many(x)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Merge each <caller>.vcf folder of per-sample VCFs into one VCF.

    The antecedent is a folder containing one <caller>.vcf
    subdirectory per variant caller; each is merged into
    <out_path>/<caller>.vcf.  Returns metadata with the commands run.
    """
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib
    import merge_vcf_folder

    vcffolders_node = antecedents
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Each ".vcf" entry here is a per-caller FOLDER of VCF files.
    x = os.listdir(vcffolders_node.identifier)
    x = [x for x in x if x.endswith(".vcf")]
    assert x, "No VCF folders found: %s" % vcffolders_node.identifier
    x = [os.path.join(vcffolders_node.identifier, x) for x in x]
    vcf_folders = x

    jobs = []
    for folder in vcf_folders:
        path, root, ext = mlib.splitpath(folder)
        assert ext == ".vcf"
        # Folder name (minus ".vcf") is the caller name.
        caller = root
        vcf_filenames = filelib.list_files_in_path(
            folder, endswith=".vcf", toplevel_only=True)
        assert vcf_filenames, "No .vcf files: %s" % folder
        out_filename = os.path.join(out_path, "%s.vcf" % root)
        # NOTE(review): tmp_path is relative, so it resolves against
        # the current working directory rather than out_path --
        # presumably the framework runs each module in a scratch
        # directory; confirm before changing.
        tmp_path = "%s.indexed.vcf" % caller
        x = filelib.GenericObject(
            caller=caller, vcf_filenames=vcf_filenames,
            out_filename=out_filename, tmp_path=tmp_path)
        jobs.append(x)

    for j in jobs:
        # merge_vcf_files returns a dict that includes the shell
        # commands it ran.
        m = merge_vcf_folder.merge_vcf_files(
            j.vcf_filenames, j.out_filename, num_cores, j.tmp_path)
        if "commands" not in metadata:
            metadata["commands"] = []
        metadata["commands"].extend(m["commands"])

    # Make sure the merged files were created.
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Gather the RNA-seq pipeline outputs into one folder of symlinks."""
    import os
    from genomicode import filelib

    (bam_node,
     fastqc_summary1_node, fastqc_folder1_node,
     fastqc_summary2_node, fastqc_folder2_node,
     rseqc_node,
     signal1_node,        # TPM
     signal2_node,        # TPM, isoform
     aligned_reads_node,
     signal3_node,        # count
     htseq_reads_node) = antecedents
    filelib.safe_mkdir(out_path)

    # (source path, source is a file, name of the link in out_path)
    FILES = [
        (bam_node.identifier, False, "alignment.bam"),
        (fastqc_summary1_node.identifier, True, "fastqc.no_trim.xls"),
        (fastqc_folder1_node.identifier, False, "fastqc.no_trim"),
        (fastqc_summary2_node.identifier, True, "fastqc.trim.xls"),
        (fastqc_folder2_node.identifier, False, "fastqc.trim"),
        (rseqc_node.identifier, False, "RSeQC"),
        (signal1_node.identifier, True, "expression.gene.tpm"),
        (signal2_node.identifier, True, "expression.isoform.tpm"),
        (aligned_reads_node.identifier, True, "aligned.xls"),
        (signal3_node.identifier, True, "expression.counts"),
        (htseq_reads_node.identifier, True, "mapped.htseq.txt"),
        ]

    # Link each result into the right place, checking the source
    # exists first.
    for orig_filename, is_file, new_file in FILES:
        new_filename = os.path.join(out_path, new_file)
        if is_file:
            filelib.assert_exists_nz(orig_filename)
        else:
            assert filelib.dir_exists(orig_filename), \
                "Directory not found or not directory: %s" % \
                orig_filename
        os.symlink(orig_filename, new_filename)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Symlink tophat accepted_hits.bam files to <out_path>/<sample>.bam."""
    import os
    from genomicode import filelib
    from Betsy import module_utils

    align_node = in_data
    found = module_utils.find_bam_files(align_node.identifier)
    bam_filenames = [f for f in found if f.endswith("accepted_hits.bam")]
    assert bam_filenames, "No accepted_hits.bam files."
    filelib.safe_mkdir(out_path)

    jobs = []  # list of (in_filename, out_filename)
    for in_filename in bam_filenames:
        # Names must in the format:
        # <path>/<sample>.tophat/accepted_hits.bam
        full_path, file_ = os.path.split(in_filename)
        tophat_dir = os.path.split(full_path)[1]
        assert file_ == "accepted_hits.bam"
        assert tophat_dir.endswith(".tophat")
        sample = tophat_dir[:-len(".tophat")]
        out_filename = os.path.join(out_path, "%s.bam" % sample)
        assert in_filename != out_filename
        jobs.append((in_filename, out_filename))

    # Make sure outfiles are unique.
    out_filenames = [dst for (src, dst) in jobs]
    assert len(out_filenames) == len(set(out_filenames)), \
        "Duplicate sample names."

    for src, dst in jobs:
        os.symlink(src, dst)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Build a STAR genome index from the reference FASTA and GTF."""
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    ref_node, gene_node = antecedents
    # Symlink the reference files into out_path so STAR sees a
    # standard layout.
    ref = alignlib.standardize_reference_genome(
        ref_node.identifier, out_path, use_symlinks=True)
    filelib.safe_mkdir(out_path)

    cmd = alignlib.make_STAR_index_command(
        ref.fasta_file_full, out_path, gtf_file=gene_node.identifier,
        num_cores=num_cores)
    cmd = "%s >& out.txt" % cmd
    parallel.sshell(cmd, path=out_path)

    # Check to make sure index was created successfully.
    alignlib.assert_is_STAR_reference(out_path)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Filter each VCF file down to SNPs, INDELs, or all variants."""
    import os
    from genomicode import filelib
    from genomicode import parallel

    vcf_node = in_data
    vcf_files = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", case_insensitive=True)
    # ROBUSTNESS FIX: fail with a clear message if there is nothing
    # to filter, consistent with the sibling modules.
    assert vcf_files, "No .vcf files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []  # in_vcf_filename, out_vcf_filename
    for vcf_file in vcf_files:
        path, file_ = os.path.split(vcf_file)
        out_vcf_file = os.path.join(out_path, file_)
        x = vcf_file, out_vcf_file
        jobs.append(x)

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel"]

    # Generate the commands.
    commands = []
    for x in jobs:
        in_vcf_file, out_vcf_file = x
        args = vartype, in_vcf_file, out_vcf_file
        x = filter_by_vartype, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    # Make sure the analysis completed successfully.
    x = [x[-1] for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Filter GATK VCF files down to SNPs or INDELs, in parallel."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    import filter_variants_GATK

    vcf_node = in_data
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", not_empty=True)
    assert vcf_filenames, "No VCF files found."
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]
    metadata["filter"] = vartype

    jobs = []  # list of filelib.GenericObject
    for in_filename in vcf_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        x = filelib.GenericObject(
            in_filename=in_filename, out_filename=out_filename)
        jobs.append(x)

    # Filter each of the VCF files.
    jobs2 = []
    for j in jobs:
        args = vartype, j.in_filename, j.out_filename
        x = filter_variants_GATK.filter_by_vartype, args, {}
        jobs2.append(x)
    parallel.pyfun(jobs2, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    # ROBUSTNESS FIX: verify the filtered files were created,
    # consistent with the other filtering modules.
    x = [j.out_filename for j in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Copy the first num_samples reads of each FASTQ file to out_path."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # This is I/O heavy, so don't use many cores.
    MAX_CORES = 2

    filenames = mlib.find_fastq_files(in_data.identifier)
    assert filenames, "I could not find any FASTQ files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_samples = mlib.get_user_option(
        user_options, "num_samples", not_empty=True, type=int)
    metadata["num_samples"] = num_samples

    # (in_filename, out_filename) pairs; outputs keep the same names.
    jobs = [
        (f, os.path.join(out_path, os.path.split(f)[1]))
        for f in filenames]

    # (function, args, keywds) triples for parallel.pyfun.
    cmds = [
        (copy_fastq_file, (src, dst, num_samples), {})
        for src, dst in jobs]
    nc = min(MAX_CORES, num_cores)
    metadata["num cores"] = nc
    parallel.pyfun(cmds, num_procs=nc)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run hierarchical clustering (cluster 3.0) and copy results."""
    import os
    import shutil
    from genomicode import filelib
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    filelib.safe_mkdir(out_path)
    metadata = {}

    linkage = mlib.get_user_option(
        user_options, "linkage", not_empty=True,
        allowed_values=cluster30.METHOD2ID.keys())

    cmd, cluster_files = run_cluster30(
        in_data.identifier, "hierarchical", user_options, method=linkage)
    metadata["command"] = cmd

    # Map the cluster30 output keys onto our output file names.
    # "cdt" is required; "atr" and "gtr" are copied only if present.
    assert "cdt" in cluster_files
    key2outname = [
        ("cdt", "signal.cdt"),
        ("atr", "array_tree.atr"),
        ("gtr", "gene_tree.gtr"),
        ]
    for key, out_name in key2outname:
        if key in cluster_files:
            shutil.copy2(
                cluster_files[key], os.path.join(out_path, out_name))
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run FastQC on each FASTQ file; drop the redundant .zip outputs."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filenames = mlib.find_fastq_files(in_data.identifier)
    assert filenames, "FASTQ files not found: %s" % in_data.identifier
    filelib.safe_mkdir(out_path)

    fastqc = mlib.findbin("fastqc")
    sq = parallel.quote
    fastqc_q = sq(fastqc)
    # BUG FIX: quote the output path and the FASTQ filenames, not
    # just the binary, so names with spaces or shell metacharacters
    # cannot break (or corrupt) the shell command.
    commands = [
        "%s --outdir=%s --extract %s" % (fastqc_q, sq(out_path), sq(x))
        for x in filenames
        ]
    parallel.pshell(commands, max_procs=num_cores)

    # Fastqc generates files:
    # <file>_fastqc/
    # <file>_fastqc.zip
    # The contents of the .zip file are identical to the directories.
    # If this happens, then delete the .zip files because they are
    # redundant.
    files = os.listdir(out_path)
    filenames = [os.path.join(out_path, x) for x in files]
    for filename in filenames:
        zip_filename = "%s.zip" % filename
        if os.path.exists(zip_filename):
            os.unlink(zip_filename)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Run Picard CollectAlignmentSummaryMetrics on each BAM file."""
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # java -jar picard.jar CollectAlignmentSummaryMetrics \
    #   R=reference_sequence.fasta \
    #   I=input.bam \
    #   O=output.txt

    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        # <in_path>/<sample>.bam
        in_path, sample, ext = mlib.splitpath(bam_filename)
        assert ext == ".bam"
        j = filelib.GenericObject(
            sample=sample,
            bam_filename=bam_filename,
            out_filename=opj(
                out_path, "%s.alignment_metrics.txt" % sample),
            log_filename=opj(out_path, "%s.log" % sample),
            )
        jobs.append(j)

    # Make the commands to run picard.
    picard_jar = alignlib.find_picard_jar("picard")
    sq = parallel.quote
    commands = []
    for j in jobs:
        # Should have better way of getting java path.
        cmd = [
            "java", "-Xmx10g",
            "-jar", sq(picard_jar), "CollectAlignmentSummaryMetrics",
            "I=%s" % sq(j.bam_filename),
            "R=%s" % sq(ref.fasta_file_full),
            "O=%s" % sq(j.out_filename),
            ]
        commands.append("%s >& %s" % (" ".join(cmd), sq(j.log_filename)))
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores)
    filelib.assert_exists_nz_many([j.out_filename for j in jobs])

    # Summarize the insert size files.
    outfile = opj(out_path, "summary.txt")
    _summarize_alignment_summary_metrics(jobs, outfile)
    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run htseq-count on each BAM file against a GTF annotation.

    Works out the BAM sort order and strandedness arguments for
    htseq-count from the input nodes, runs one command per BAM file,
    and returns metadata with the commands run.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_folder, sample_node, gene_node, strand_node = antecedents
    bam_path = bam_folder.identifier
    assert filelib.dir_exists(bam_path)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Map the folder's "sorted" attribute onto htseq-count's
    # --order values.
    attr2order = {
        "name": "name",
        "coordinate": "pos",
        }
    x = bam_folder.data.attributes["sorted"]
    sort_order = attr2order.get(x)
    assert sort_order, "Cannot handle sorted: %s" % x

    # Previous approach: derive strandedness from the sample
    # orientation attribute.  Kept for reference.
    #attr2stranded = {
    #    "single" : "no",
    #    "paired" : "no",
    #    "paired_ff" : None,
    #    "paired_fr" : "yes",
    #    "paired_rf" : "reverse",
    #    }
    #x = sample_node.data.attributes["orientation"]
    #stranded = attr2stranded.get(x)
    #assert stranded, "Cannot handle orientation: %s" % x

    # Map our strandedness vocabulary onto htseq-count's --stranded
    # values.
    ht_stranded = None
    if stranded.stranded == "unstranded":
        ht_stranded = "no"
    elif stranded.stranded == "firststrand":
        ht_stranded = "reverse"
    elif stranded.stranded == "secondstrand":
        ht_stranded = "yes"
    assert ht_stranded is not None

    #gtf_file = mlib.get_user_option(
    #    user_options, "gtf_file", not_empty=True)
    #assert os.path.exists(gtf_file), "File not found: %s" % gtf_file

    mode = mlib.get_user_option(
        user_options, "htseq_count_mode",
        allowed_values=[
            "union", "intersection-strict", "intersection-nonempty"])

    # Make a list of the jobs to run.
    jobs = []
    for bam_filename in filelib.list_files_in_path(
            bam_path, endswith=".bam", case_insensitive=True):
        # out_file is RELATIVE; pshell below runs in out_path, so the
        # redirection lands in the right directory.
        x = os.path.split(bam_filename)[1]
        x = os.path.splitext(x)[0]
        x = "%s.count" % x
        out_file = x
        x = bam_filename, out_file
        jobs.append(x)

    # Generate commands for each of the files.
    sq = parallel.quote
    commands = []
    for x in jobs:
        bam_filename, out_file = x
        x = alignlib.make_htseq_count_command(
            bam_filename, gtf_file, sort_order, ht_stranded, mode=mode)
        x = "%s >& %s" % (x, sq(out_file))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    # Run inside out_path so the relative out_files resolve there.
    parallel.pshell(commands, max_procs=num_cores, path=out_path)

    # Make sure the analysis completed successfully.
    x = [x[1] for x in jobs]
    x = [os.path.join(out_path, x) for x in x]
    output_filenames = x
    filelib.assert_exists_nz_many(output_filenames)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run GATK RealignerTargetCreator on each BAM file.

    Produces one .intervals file (and a log) per input BAM in
    out_path.  Concurrency is limited by RAM (~12 Gb per process).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    in_filenames = mlib.find_bam_files(bam_node.identifier)
    assert in_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []  # list of (in_filename, log_filename, out_filename)
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        f, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % f)
        out_filename = os.path.join(out_path, "%s.intervals" % f)
        x = in_filename, log_filename, out_filename
        jobs.append(x)

    filter_reads_with_N_cigar = mlib.get_user_option(
        user_options, "filter_reads_with_N_cigar",
        allowed_values=["no", "yes"])

    # At least one known-sites VCF is required.
    # CLEANUP: removed a dead "known_sites = []" assignment that was
    # immediately overwritten.
    x1 = mlib.get_user_option(
        user_options, "realign_known_sites1", check_file=True)
    x2 = mlib.get_user_option(
        user_options, "realign_known_sites2", check_file=True)
    x3 = mlib.get_user_option(
        user_options, "realign_known_sites3", check_file=True)
    x = [x1, x2, x3]
    known_sites = [x for x in x if x]
    assert known_sites

    # I/O bound, so not likely to get a big speedup with nt.
    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
    #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
    #   --known <known_vcf_file>
    # RealignerTargetCreator takes ~10Gb per process.  Each thread
    # takes the full amount of memory.
    nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

    # BUG FIX: the threads-per-job calculation is loop-invariant, so
    # hoist it out of the loop; use floor division so it stays an int
    # under Python 3 (nc / len(jobs) is a float there).
    nt = max(1, nc // len(jobs))

    # Make a list of commands.
    commands = []
    for x in jobs:
        in_filename, log_filename, out_filename = x
        unhashable = [("-known", x) for x in known_sites]
        if filter_reads_with_N_cigar == "yes":
            unhashable.append(("-filter_reads_with_N_cigar", None))
        x = alignlib.make_GATK_command(
            nt=nt, T="RealignerTargetCreator", R=ref.fasta_file_full,
            I=in_filename, o=out_filename, _UNHASHABLE=unhashable)
        x = "%s >& %s" % (x, log_filename)
        commands.append(x)

    parallel.pshell(commands, max_procs=nc)
    metadata["num_procs"] = nc
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.
    out_filenames = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(out_filenames)
    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Call somatic variants with MuTect on normal/cancer BAM pairs.

    antecedents is (bam_node, nc_node, ref_node, interval_node).
    For each (normal, cancer) pair, writes call stats, a coverage wig,
    a raw VCF, and a cleaned final VCF into out_path.  Returns a
    metadata dict with the commands run and process count.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out MuTect version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #   cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
    #   vcf_outfile, log_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        call_outfile = opj(out_path, "%s.call_stats.out" % sample)
        cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile
        jobs.append(x)

    # java -Xmx2g -jar muTect.jar
    #   --analysis_type MuTect
    #   --reference_sequence <reference>
    #   --cosmic <cosmic.vcf>
    #   --dbsnp <dbsnp.vcf>
    #   --intervals <intervals_to_process>
    #   --input_file:normal <normal.bam>
    #   --input_file:tumor <tumor.bam>
    #   --out <call_stats.out>
    #   --coverage_file <coverage.wig.txt>

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x

        # "input_file:normal"/"input_file:tumor" contain a colon, so
        # they cannot be passed as Python keyword arguments.
        UNHASHABLE = [
            ("input_file:normal", sq(normal_bamfile)),
            ("input_file:tumor", sq(cancer_bamfile)),
            ]
        x = alignlib.make_MuTect_command(
            analysis_type="MuTect",
            reference_sequence=sq(ref.fasta_file_full),
            cosmic=sq(cosmic_file),
            dbsnp=sq(dbsnp_file),
            intervals=sq(interval_node.identifier),
            out=sq(call_outfile),
            coverage_file=sq(cov_outfile),
            vcf=sq(raw_vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        x = "%s >& %s" % (x, log_outfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # MuTect takes ~15 Gb per process.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        # Pull out the error lines.
        x = [x for x in open(log_outfile)]
        x = [x for x in x if x.startswith("##### ERROR")]
        x = "".join(x)
        msg = "MuTect error [%s]:\n%s\n%s" % (
            cancer_sample, commands[i], x)
        assert not x, msg

    # Make sure the raw output VCF files exist (tuple index 6 is
    # raw_vcf_outfile); the cleaned VCFs are produced below.
    x = [x[6] for x in jobs]
    filelib.assert_exists_many(x)

    # Fix the files.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        alignlib.clean_mutect_vcf(
            normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
            raw_vcf_outfile, vcf_outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants with Radia in three stages: call, filter, merge.

    antecedents is (dna_bam_node, rna_bam_node, nc_node, ref_node).
    Work proceeds per (cancer sample, chromosome):
      1. radia.py        -> radia1.tmp/<sample>_chr<chrom>.vcf
      2. filterRadia.py  -> radia2.tmp/<sample>_chr<chrom>.vcf
      3. mergeChroms.py  -> radia3.tmp/<sample>.vcf
    then clean_radia_vcf writes the final <sample>.vcf into out_path.
    Returns a metadata dict (tool version, commands, process count).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # For debugging: each stage can be skipped independently when
    # re-running after a partial failure.
    RUN_VARIANT_CALLING = True
    FILTER_CALLS = True
    MERGE_CALLS = True
    FIX_VCF_FILES = True

    dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
    dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
    assert dna_bam_filenames, "No DNA .bam files."
    rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
    assert rna_bam_filenames, "No RNA .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

    ## Make sure the BAM files do not contain spaces in the
    ## filenames.  Radia doesn't work well with spaces.
    #filenames = dna_bam_filenames + rna_bam_filenames
    #has_spaces = []
    #for filename in filenames:
    #    if filename.find(" ") >= 0:
    #        has_spaces.append(filename)
    #x = has_spaces
    #if len(x) > 5:
    #    x = x[:5] + ["..."]
    #x = ", ".join(x)
    #msg = "Radia breaks if there are spaces in filenames: %s" % x
    #assert not has_spaces, msg

    # sample -> bam filename
    dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
    rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
    # Make sure files exist for all the samples.  The DNA-Seq
    # should have both normal and cancer.  RNA is not needed for
    # normal sample.
    mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
    mlib.assert_normal_cancer_samples(
        nc_match, rnasample2bamfile, ignore_normal_sample=True)

    # Make sure Radia and snpEff are configured.
    radia_genome_assembly = mlib.get_user_option(
        user_options, "radia_genome_assembly", not_empty=True)
    assert radia_genome_assembly == "hg19", "Only hg19 handled."
    snp_eff_genome = mlib.get_user_option(
        user_options, "snp_eff_genome", not_empty=True)

    radia_path = mlib.get_config("radia_path", assert_exists=True)
    snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
    radia_files = get_radia_files(radia_path, radia_genome_assembly)

    # Make a list of the chromosomes to use.  Pick an arbitrarily
    # BAM file.  Look at only the chromosomes that are present in
    # all files.
    all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
    chroms = list_common_chromosomes(all_bamfiles)
    assert chroms, "No chromosomes found in all files."
    # Only use the chromosomes that can be filtered by Radia.
    chroms = filter_radia_chromosomes(chroms, radia_files)

    # Make output directories.  These are relative to the current
    # working directory, not out_path.
    radia_outpath = "radia1.tmp"
    filter_outpath = "radia2.tmp"
    merge_outpath = "radia3.tmp"

    if not os.path.exists(radia_outpath):
        os.mkdir(radia_outpath)
    if not os.path.exists(filter_outpath):
        os.mkdir(filter_outpath)
    if not os.path.exists(merge_outpath):
        os.mkdir(merge_outpath)

    # Steps:
    # 1.  Call variants (radia.py)
    #     -o <file.vcf>
    # 2.  Filter variants (filterRadia.py)
    #     <outpath>
    #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
    # 3.  Merge (mergeChroms.py)
    #     Takes as input: <filter_outpath>
    #     Produces: <merge_outpath>/<patient_id>.vcf

    # list of (normal_sample, cancer_sample, chrom,
    #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
    #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
    #   final_vcf_outfile,
    #   radia_logfile, filter_logfile, merge_logfile)
    opj = os.path.join
    jobs = []
    for i, (normal_sample, cancer_sample) in enumerate(nc_match):
        normal_bamfile = dnasample2bamfile[normal_sample]
        dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
        rna_tumor_bamfile = rnasample2bamfile[cancer_sample]
        merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
        merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
        final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
        # Steps 1 and 2 run per chromosome; steps 3+ run per sample.
        for chrom in chroms:
            radia_vcf_outfile = opj(
                radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            filter_vcf_outfile = opj(
                filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            radia_logfile = opj(
                radia_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            filter_logfile = opj(
                filter_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            x = normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs.append(x)

    # Since Radia doesn't work well if there are spaces in the
    # filenames, symlink these files here to guarantee that there
    # are no spaces.
    normal_path = "normal.bam"
    dna_path = "dna.bam"
    rna_path = "rna.bam"
    if not os.path.exists(normal_path):
        os.mkdir(normal_path)
    if not os.path.exists(dna_path):
        os.mkdir(dna_path)
    if not os.path.exists(rna_path):
        os.mkdir(rna_path)
    # Replace the BAM filenames in each job with the space-free
    # symlinked versions.
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
        x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
        x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
        clean_normal, clean_dna, clean_rna = x1, x2, x3
        x = normal_sample, cancer_sample, chrom, \
            clean_normal, clean_dna, clean_rna, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile
        jobs[i] = x

    # Generate the commands for doing variant calling.
    python = mlib.get_config("python", which_assert_file=True)

    # filterRadia.py calls the "blat" command, and there's no way
    # to set the path.  Make sure "blat" is executable.
    if not filelib.which("blat"):
        # Find "blat" in the configuration and add it to the path.
        x = mlib.get_config("blat", which_assert_file=True)
        path, x = os.path.split(x)
        if os.environ["PATH"]:
            path = "%s:%s" % (os.environ["PATH"], path)
        os.environ["PATH"] = path
        # Make sure it's findable now.
        filelib.which_assert("blat")

    # STEP 1.  Call variants with radia.py.
    # python radia.py test31 5 \
    #   -n bam04/PIM001_G.bam \
    #   -t bam04/196B-MG.bam \
    #   -r bam34/196B-MG.bam \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   -o test32.vcf
    # --dnaTumorMitochon MT \
    # --rnaTumorMitochon MT \
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.radia_py),
            cancer_sample,
            chrom,
            "-n", sq(normal_bamfile),
            "-t", sq(dna_tumor_bamfile),
            "-r", sq(rna_tumor_bamfile),
            "-f", sq(ref.fasta_file_full),
            "-o", radia_vcf_outfile,
            ]
        if "MT" in chroms:
            x += [
                "--dnaNormalMitochon MT",
                "--dnaTumorMitochon MT",
                "--rnaTumorMitochon MT",
                ]
        x = " ".join(x)
        x = "%s >& %s" % (x, radia_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Only uses ~200 Mb of ram.
    if RUN_VARIANT_CALLING:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure log files are empty (Radia writes errors there).
    logfiles = [x[10] for x in jobs]
    filelib.assert_exists_z_many(logfiles)

    # STEP 2.  Filter variants with filterRadia.py.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.filterRadia_py),
            cancer_sample,
            chrom,
            sq(radia_vcf_outfile),
            sq(filter_outpath),
            sq(radia_files.scripts_dir),
            "-b", sq(radia_files.blacklist_dir),
            "-d", sq(radia_files.snp_dir),
            "-r", sq(radia_files.retro_dir),
            "-p", sq(radia_files.pseudo_dir),
            "-c", sq(radia_files.cosmic_dir),
            "-t", sq(radia_files.target_dir),
            "-s", sq(snp_eff_path),
            "-e", snp_eff_genome,
            "--rnaGeneBlckFile", sq(radia_files.rnageneblck_file),
            "--rnaGeneFamilyBlckFile", sq(radia_files.rnagenefamilyblck_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, filter_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Sometimes samtools crashes in the middle of a run.  Detect
    # this case, and re-run the analysis if needed.
    assert len(commands) == len(jobs)
    py_commands = []
    for x, cmd in zip(jobs, commands):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = cmd, cancer_sample, chrom, filter_logfile
        x = _run_filterRadia_with_restart, args, {}
        py_commands.append(x)
    # Takes ~10 Gb each.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    if FILTER_CALLS:
        parallel.pyfun(py_commands, num_procs=nc)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[11] for x in jobs]
    filelib.assert_exists_z_many(logfiles)

    # Make sure filter_vcf_outfile exists.
    outfiles = [x[7] for x in jobs]
    filelib.assert_exists_nz_many(outfiles)

    # STEP 3.  Merge the results.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
        #   radia2.tmp/ radia3.tmp
        # The "/" after radia2.tmp is important.  If not given,
        # will generate some files with only newlines.
        fo = filter_outpath
        if not fo.endswith("/"):
            fo = "%s/" % fo
        x = [
            sq(python),
            sq(radia_files.mergeChroms_py),
            cancer_sample,
            fo,
            merge_outpath,
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, merge_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Since the chromosomes were separated for the previous steps,
    # this will generate one merge for each chromosome.  This is
    # unnecessary, since we only need to merge once per sample.
    # Get rid of duplicates.
    commands = sorted({}.fromkeys(commands))
    if MERGE_CALLS:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[12] for x in jobs]
    logfiles = sorted({}.fromkeys(logfiles))
    filelib.assert_exists_z_many(logfiles)

    # Fix the VCF files (run in-process via parallel.pyfun).
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = normal_sample, cancer_sample, \
               merge_vcf_outfile, final_vcf_outfile
        x = alignlib.clean_radia_vcf, args, {}
        commands.append(x)
    if FIX_VCF_FILES:
        parallel.pyfun(commands, num_procs=num_cores)

    # Make sure output VCF files exist (tuple index 9 is
    # final_vcf_outfile).
    x = [x[9] for x in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Align FASTQ files with "bwa aln", one .sai file per FASTQ.

    antecedents is (fastq_node, group_node, reference_node).  Writes
    one ".sai" (and ".log") file per FASTQ file into out_path and
    returns a metadata dict with the tool version and commands run.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, group_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        group_node.identifier, fastq_node.identifier)
    assert fastq_files, "No FASTQ files found."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Make sure no duplicate samples.
    x1 = [x[0] for x in fastq_files]
    x2 = {}.fromkeys(x1).keys()
    assert len(x1) == len(x2), "dup sample"

    # Make a list of all FASTQ files to align.  pair2 is optional
    # (single-end data).
    fastq_filenames = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        assert pair1
        fastq_filenames.append(pair1)
        if pair2:
            fastq_filenames.append(pair2)

    # Make a list of all the jobs to do.
    # BUG FIX: comment previously said 2-tuples; jobs are 3-tuples.
    jobs = []   # list of (fastq_filename, sai_filename, log_filename)
    for in_filename in fastq_filenames:
        in_path, in_file = os.path.split(in_filename)
        # Strip the FASTQ extension to name the output files.
        x = in_file
        if x.lower().endswith(".fq"):
            x = x[:-3]
        elif x.lower().endswith(".fastq"):
            x = x[:-6]
        sai_filename = os.path.join(out_path, "%s.sai" % x)
        log_filename = os.path.join(out_path, "%s.log" % x)
        x = in_filename, sai_filename, log_filename
        jobs.append(x)

    # Calculate the number of threads per job.  (Python 2 "/" is
    # floor division here.)
    nc = max(1, num_cores / len(jobs))

    # Make the bwa commands.
    commands = []
    for x in jobs:
        fastq_filename, sai_filename, log_filename = x
        x = alignlib.make_bwa_aln_command(
            ref.fasta_file_full, fastq_filename, sai_filename, log_filename,
            num_threads=nc)
        commands.append(x)
    metadata["commands"] = commands
    # BUG FIX: was metadata["num cores"] (with a space); sibling
    # modules in this file use "num_cores".
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for x in jobs:
        in_filename, sai_filename, log_filename = x
        assert filelib.exists_nz(sai_filename), \
            "Missing: %s" % sai_filename

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run "samtools calmd -b" on each BAM file to add MD/NM tags.

    antecedents is (bam_node, ref_node).  For each input BAM, writes
    the recalculated BAM (same basename) and a ".log" of samtools'
    stderr into out_path.
    NOTE(review): unlike the sibling modules in this file, this does
    not build or return a metadata dict (returns None) -- confirm
    that is intended.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    ## Importing pysam is hard!
    #import sys
    #sys_path_old = sys.path[:]
    #sys.path = [x for x in sys.path if x.find("RSeQC") < 0]
    #import pysam
    #sys.path = sys_path_old

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % s)
        out_filename = os.path.join(out_path, f)
        # Output keeps the same basename, so in and out must be in
        # different directories.
        assert in_filename != out_filename
        x = in_filename, log_filename, out_filename
        jobs.append(x)

    # Don't do this.  Need MD, NM, NH in
    # summarize_alignment_cigar.  To be sure, just redo it.
    ## If the files already have MD tags, then just symlink the
    ## files.  Don't add again.
    #i = 0
    #while i < len(jobs):
    #    in_filename, out_filename = jobs[i]
    #
    #    handle = pysam.AlignmentFile(in_filename, "rb")
    #    align = handle.next()
    #    tag_dict = dict(align.tags)
    #    if "MD" not in tag_dict:
    #        i += 1
    #        continue
    #    # Has MD tags.  Just symlink and continue.
    #    os.symlink(in_filename, out_filename)
    #    del jobs[i]

    # Make a list of samtools commands.
    # Takes ~200 Mb per process, so should not be a big issue.
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    commands = []
    for x in jobs:
        in_filename, log_filename, out_filename = x

        # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>
        # May generate error:
        # [bam_fillmd1] different NM for read
        #   'ST-J00106:118:H75L3BBXX:3:2128:21846:47014': 0 -> 19
        # Pipe stderr to different file: stdout is the BAM output,
        # stderr is the log.
        x = [
            samtools,
            "calmd",
            "-b",
            sq(in_filename),
            sq(ref.fasta_file_full),
            ]
        x = " ".join(x)
        x = "%s 2> %s 1> %s" % (x, sq(log_filename), sq(out_filename))
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants with Platypus on each BAM file.

    antecedents is (bam_node, ref_node).  Writes one ".vcf" (plus
    ".log" and ".err") per input BAM into out_path and returns a
    metadata dict.  BAM files are symlinked into a local "bam"
    directory with spaces stripped from the names, because Platypus
    fails on paths containing spaces.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    #import call_variants_GATK

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Figure out whether the user wants SNPs or INDELs.
    #assert "vartype" in out_attributes
    #vartype = out_attributes["vartype"]
    #assert vartype in ["all", "snp", "indel"]

    # Platypus generates an error if there are spaces in the BAM
    # filename.  Symlink the file to a local directory to make
    # sure there are no spaces.
    bam_path = "bam"

    jobs = []   # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        p, f = os.path.split(bam_filename)
        sample, ext = os.path.splitext(f)
        # A .bam.bai index must already exist next to each BAM.
        bai_filename = "%s.bai" % bam_filename
        filelib.assert_exists_nz(bai_filename)
        x = sample.replace(" ", "_")
        local_bam = os.path.join(bam_path, "%s.bam" % x)
        local_bai = os.path.join(bam_path, "%s.bam.bai" % x)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        err_filename = os.path.join(out_path, "%s.err" % sample)
        # Unfiltered file.
        #raw_filename = os.path.join(out_path, "%s.raw" % sample)
        # Final VCF file.
        out_filename = os.path.join(out_path, "%s.vcf" % sample)
        x = filelib.GenericObject(
            bam_filename=bam_filename,
            bai_filename=bai_filename,
            local_bam=local_bam,
            local_bai=local_bai,
            log_filename=log_filename,
            err_filename=err_filename,
            out_filename=out_filename)
        jobs.append(x)

    filelib.safe_mkdir(bam_path)
    for j in jobs:
        assert " " not in j.local_bam
        filelib.assert_exists_nz(j.bam_filename)
        filelib.assert_exists_nz(j.bai_filename)
        # Idempotent: skip links that already exist from a prior run.
        if not os.path.exists(j.local_bam):
            os.symlink(j.bam_filename, j.local_bam)
        if not os.path.exists(j.local_bai):
            os.symlink(j.bai_filename, j.local_bai)

    # TODO: Keep better track of the metadata.
    buffer_size = 100000
    max_reads = 5E6
    # Running into errors sometimes, so increase these numbers.
    #   WARNING - Too many reads (5000000) in region
    #   1:500000-600000. Quitting now. Either reduce --bufferSize or
    #   increase --maxReads.
    buffer_size = buffer_size * 10
    max_reads = max_reads * 10

    # Make a list of commands.
    commands = []
    for j in jobs:
        #nc = max(1, num_cores/len(jobs))
        x = alignlib.make_platypus_command(
            bam_file=j.local_bam,
            ref_file=ref.fasta_file_full,
            log_file=j.log_filename,
            out_file=j.out_filename,
            buffer_size=buffer_size,
            max_reads=max_reads)
        x = "%s >& %s" % (x, j.err_filename)
        commands.append(x)
    #for x in commands:
    #    print x
    #import sys; sys.exit(0)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.  If not, try
    # to diagnose by printing the "Too many reads" warnings from the
    # .err files before the assertion below fails.
    for j in jobs:
        if filelib.exists_nz(j.out_filename):
            continue
        for line in open(j.err_filename):
            if line.find("WARNING - Too many reads") >= 0:
                print line,
    x = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    # Filter each of the VCF files.
    #for j in jobs:
    #    call_variants_GATK.filter_by_vartype(
    #        vartype, j.raw_filename, j.out_filename)
    #metadata["filter"] = vartype

    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Convert each BAM file to FASTA with bam2fastx.

    Writes one "<root>.fa" file per input BAM into out_path and
    returns a metadata dict with the commands run.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    bam_filenames = mlib.find_bam_files(in_data.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bam2fastx (unknown version)"

    # Historical note: filenames with spaces were once symlinked to
    # temporary space-free names before conversion.  That workaround
    # was disabled; bam2fastx is now run on the original paths.
    jobs = []   # list of GenericObject(bam_filename, fa_filename)
    for i, bam_filename in enumerate(bam_filenames):
        p, f, e = mlib.splitpath(bam_filename)
        fa_filename = os.path.join(out_path, "%s.fa" % f)
        x = filelib.GenericObject(
            bam_filename=bam_filename,
            fa_filename=fa_filename)
        jobs.append(x)

    bam2fastx = mlib.findbin("bam2fastx")

    commands = []
    for j in jobs:
        # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
        x = [
            mlib.sq(bam2fastx),
            "-A",
            "--fasta",
            "-o", mlib.sq(j.fa_filename),
            mlib.sq(j.bam_filename),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    # BUG FIX: was "[j.fa_filename for x in jobs]", which repeated the
    # last loop's j N times and so only ever checked one output file.
    x = [j.fa_filename for j in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata