def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Summarize the VCF files under the input folder into one table.

    A scratch file ("temp.txt" in the current directory) is created
    containing only the header row.  One ``summarize_vcf_file`` job is
    then dispatched per VCF file, each receiving the file name, the file
    stem, the header tuple and the scratch file name.  Finally the
    scratch file is moved to ``out_filename``.

    Returns a metadata dict holding the number of cores used.
    """
    import shutil
    from genomicode import parallel

    scratch_file = "temp.txt"
    columns = (
        "Caller", "File", "Sample", "Chrom", "Pos", "Ref", "Alt", "Source",
        "Num Ref", "Num Alt", "Total Reads", "VAF", "Filter", "Call", "GQ")

    vcf_files = find_vcf_files(in_data.identifier)

    # Start the scratch file off with just the header row.
    with open(scratch_file, 'w') as handle:
        handle.write("\t".join(columns) + "\n")

    # Each entry of vcf_files is a (filestem, filename) pair, e.g.
    #   filestem   197B-MG
    #   filename   /data/jchang/biocore/call01/radia.vcf/197B-MG.vcf
    jobs = [
        (summarize_vcf_file, (filename, filestem, columns, scratch_file), {})
        for (filestem, filename) in vcf_files]
    # NOTE(review): lock_keyword="lock" presumably makes pyfun pass a
    # shared lock to each job so writes to the scratch file do not
    # interleave -- confirm against genomicode.parallel.
    parallel.pyfun(jobs, num_procs=num_cores, lock_keyword="lock")

    shutil.move(scratch_file, out_filename)
    return {"num_cores": num_cores}
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Uncompress the FASTQ files found under the input into ``out_path``.

    The output file keeps the input's base name with any trailing
    ".gz", ".bz2" or ".xz" extension removed.  One ``uncompress_file``
    job is run per input file, using at most MAX_CORES processes.

    Returns None.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    # This is I/O heavy, so don't use too many cores.
    MAX_CORES = 4
    COMPRESSION_EXTS = [".gz", ".bz2", ".xz"]

    def strip_compression_exts(name):
        # Extensions are tested in list order, ignoring case; a match is
        # removed before the next extension is tested.
        for ext in COMPRESSION_EXTS:
            if name.lower().endswith(ext):
                name = name[:-len(ext)]
        return name

    filelib.safe_mkdir(out_path)

    filenames = module_utils.find_fastq_files(in_data.identifier)
    assert filenames, "I could not find any FASTQ files."

    # Build the list of (function, args, keywds) jobs.
    commands = []
    for in_filename in filenames:
        out_file = strip_compression_exts(os.path.basename(in_filename))
        out_filename = os.path.join(out_path, out_file)
        commands.append((uncompress_file, (in_filename, out_filename), {}))

    parallel.pyfun(commands, num_procs=min(MAX_CORES, num_cores))
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run ``summarize_bam_file`` on every BAM file under the input.

    For an input ``<dir>/<name>.bam`` the result is written to
    ``<out_path>/<name>.matches.txt``.  After all jobs finish, every
    output file is checked with ``filelib.assert_exists_nz_many``.

    Returns None.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    bam_filenames = module_utils.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)

    # Output file name for each BAM file, in the same order.
    out_filenames = []
    for bam_filename in bam_filenames:
        root = os.path.splitext(os.path.basename(bam_filename))[0]
        out_filenames.append(os.path.join(out_path, "%s.matches.txt" % root))

    # list of (function, args, keywds); the keywds slot is None here.
    calls = [
        (summarize_bam_file, (bam_filename, out_filename), None)
        for (bam_filename, out_filename) in zip(bam_filenames, out_filenames)]
    parallel.pyfun(calls, num_procs=num_cores, DELAY=0.1)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many(out_filenames)
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Write a tab-delimited table of duplicated-read counts per BAM file.

    ``count_duplicates`` is run on each BAM file found under the input
    node; each result is unpacked as (total_reads, dup_reads).  The table
    written to ``outfile`` has the columns Sample, Duplicated Reads,
    Total Reads and % Duplicated (two decimal places).

    Returns a metadata dict with the samtools version string ("tool")
    and the number of cores used ("num_cores").

    Raises ZeroDivisionError if a BAM file reports zero total reads.
    """
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    jobs = [(count_duplicates, (bam_filename,), {})
            for bam_filename in bam_filenames]
    results = parallel.pyfun(jobs, num_procs=num_cores)
    metadata["num_cores"] = num_cores
    assert len(results) == len(bam_filenames)

    header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
    # Use a context manager so the table is flushed and closed even if a
    # row fails; the handle was previously left open on return.
    with open(outfile, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for bam_filename, result in zip(bam_filenames, results):
            # splitpath returns three parts; only the middle one is used.
            sample = mlib.splitpath(bam_filename)[1]
            total_reads, dup_reads = result
            perc_dup = "%.2f" % (float(dup_reads) / total_reads * 100)
            row = sample, dup_reads, total_reads, perc_dup
            handle.write("\t".join(map(str, row)) + "\n")
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Filter every VCF file under the input by variant type.

    Files ending in ".vcf" (case-insensitive) are listed from the input
    node.  Each is passed to ``filter_by_vartype`` together with the
    "vartype" output attribute ("all", "snp" or "indel"); the result is
    written under ``out_path`` with the same base name.

    Returns a metadata dict holding the number of cores used.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel

    vcf_files = filelib.list_files_in_path(
        in_data.identifier, endswith=".vcf", case_insensitive=True)
    filelib.safe_mkdir(out_path)

    # Output file for each input file, in the same order.
    out_vcf_files = [
        os.path.join(out_path, os.path.basename(vcf_file))
        for vcf_file in vcf_files]

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel"]

    # list of (function, args, keywds)
    commands = [
        (filter_by_vartype, (vartype, in_vcf_file, out_vcf_file), {})
        for (in_vcf_file, out_vcf_file) in zip(vcf_files, out_vcf_files)]
    parallel.pyfun(commands, num_procs=num_cores)

    filelib.assert_exists_many(out_vcf_files)
    return {"num_cores": num_cores}
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Keep only SNPs or only INDELs in every VCF file under the input.

    Non-empty files ending in ".vcf" are listed from the input node and
    each is run through ``filter_variants_GATK.filter_by_vartype`` with
    the "vartype" output attribute ("snp" or "indel").  Results go under
    ``out_path`` with the same base name.

    Returns a metadata dict with the filter applied ("filter") and the
    number of cores used ("num_cores").
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    import filter_variants_GATK

    vcf_filenames = filelib.list_files_in_path(
        in_data.identifier, endswith=".vcf", not_empty=True)
    assert vcf_filenames, "No VCF files found."
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]
    metadata["filter"] = vartype

    # One job object per VCF file, pairing the input with its output.
    jobs = []
    for vcf_filename in vcf_filenames:
        target = os.path.join(out_path, os.path.basename(vcf_filename))
        jobs.append(filelib.GenericObject(
            in_filename=vcf_filename, out_filename=target))

    # Filter each of the VCF files.
    calls = [
        (filter_variants_GATK.filter_by_vartype,
         (vartype, job.in_filename, job.out_filename), {})
        for job in jobs]
    parallel.pyfun(calls, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run ``copy_fastq_file`` on every FASTQ file under the input.

    Each file is written under ``out_path`` with the same base name.
    The integer "num_samples" user option is passed to every job.  At
    most MAX_CORES processes are used.

    Returns a metadata dict with "num_samples" and "num cores".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # This is I/O heavy, so don't use too many cores.
    MAX_CORES = 2

    filenames = mlib.find_fastq_files(in_data.identifier)
    assert filenames, "I could not find any FASTQ files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_samples = mlib.get_user_option(
        user_options, "num_samples", not_empty=True, type=int)
    metadata["num_samples"] = num_samples

    # list of (function, args, keywds)
    cmds = []
    for fastq_filename in filenames:
        target = os.path.join(out_path, os.path.basename(fastq_filename))
        cmds.append((copy_fastq_file, (fastq_filename, target, num_samples), {}))

    nc = min(MAX_CORES, num_cores)
    # NOTE: this key is spelled with a space ("num cores").
    metadata["num cores"] = nc
    parallel.pyfun(cmds, num_procs=nc)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
    """Summarize the ``<sample>.matches.txt`` files into one table.

    ``antecedents`` is (fastq_node, sample_node, align_node).  Each
    ``<sample>.matches.txt`` file under the align node is paired with
    that sample's merged FASTQ files and handed to
    ``summarize_matches_file``, which writes ``<sample>.summary.txt`` in
    the current directory and returns a dict of statistics.  One row per
    sample is then written to ``out_filename``.

    Returns a metadata dict with "num_mismatches" and "num_cores".
    Note that ``num_cores`` is not used to size the process pool.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MATCHES_EXT = ".matches.txt"

    fastq_node, sample_node, align_node = antecedents
    fastq_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_data, "I could not find any FASTQ files."
    align_filenames = filelib.list_files_in_path(
        align_node.identifier, endswith=".matches.txt")
    assert align_filenames, "No .matches.txt files."
    align_filenames.sort()
    metadata = {}

    assert len(fastq_data) == len(align_filenames), \
        "Mismatch: num samples %d %d" % (
            len(fastq_data), len(align_filenames))

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert 0 <= num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    # sample -> (sample, fastq_filename1, fastq_filename2)
    sample2fastqdata = {}
    for record in fastq_data:
        sample, f1, f2 = record
        sample2fastqdata[sample] = record

    # list of (sample, align_filename, summary_filename,
    #          fastq_filename1, fastq_filename2)
    jobs = []
    for align_filename in align_filenames:
        # File names look like <sample>.matches.txt
        basename = os.path.basename(align_filename)
        assert basename.endswith(MATCHES_EXT)
        sample = basename[:-len(MATCHES_EXT)]
        assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
        summary_filename = "%s.summary.txt" % sample
        _, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
        jobs.append((
            sample, align_filename, summary_filename,
            fastq_filename1, fastq_filename2))

    # list of (function, args, keywds)
    calls = []
    for job in jobs:
        sample, align_filename, summary_filename, fastq1, fastq2 = job
        args = align_filename, fastq1, fastq2, num_mismatches
        keywds = {
            "temp_path": ".",
            "outfile": summary_filename,
        }
        calls.append((summarize_matches_file, args, keywds))

    # This can take a lot of memory (depending on the number of reads,
    # easily 8 Gb) and is I/O intensive, so the number of processes is
    # derived from available RAM and capped at MAX_PROCS.
    # NOTE(review): the 30 passed below is presumably Gb of RAM per
    # process -- confirm against mlib.calc_max_procs_from_ram.
    MAX_PROCS = 4
    nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
    results = parallel.pyfun(calls, num_procs=nc, DELAY=0.1)
    metadata["num_cores"] = nc
    assert len(results) == len(calls)

    # Put together the results in a table.
    header = "sample", "match", "total", "RPM", "match", "mismatch"
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for job, stats in zip(jobs, results):
            sample = job[0]
            match = stats["perfect_alignments"]
            total = stats["total_alignments"]
            rpm = int(float(match) / total * 1E6)
            perc_match = stats["perc_perfect"]
            perc_mismatch = 1 - stats["perc_perfect"]
            row = sample, match, total, rpm, perc_match, perc_mismatch
            assert len(row) == len(header)
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Tabulate alignment statistics for each BAM file and export to xls.

    ``antecedents`` is (fastq_node, sample_node, bam_node).  For every
    BAM file, ``count_reads`` is run on the sample's first FASTQ file and
    ``count_alignments`` on the BAM file.  The rows, sorted by sample
    name, are written to "summary.txt" in the current directory, which
    is then converted to ``outfile`` with the ``txt2xls`` program.

    Returns a metadata dict holding the number of cores used.
    """
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MAX_CORES = 4  # I/O intensive.
    TXT_FILE = "summary.txt"

    fastq_node, sample_node, bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    sample2fastq = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier, as_dict=True)

    metadata = {}

    # list of (sample, bam_file, fastq_file)
    jobs = []
    for bam_filename in bam_filenames:
        sample = mlib.splitpath(bam_filename)[1]
        assert sample in sample2fastq, "Missing fastq: %s" % sample
        # Only the first FASTQ file of the pair is counted.
        fastq1, fastq2 = sample2fastq[sample]
        jobs.append((sample, bam_filename, fastq1))

    # Two calls per job, interleaved: reads first, then alignments.
    funcalls = []
    for sample, bam_filename, fastq_filename in jobs:
        funcalls.append((count_reads, (fastq_filename, ), {}))
        funcalls.append((count_alignments, (bam_filename, ), {}))
    assert len(funcalls) == len(jobs) * 2

    nc = min(num_cores, MAX_CORES)
    raw_results = parallel.pyfun(funcalls, num_procs=nc)
    metadata["num_cores"] = nc

    # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
    stats = []
    for i, job in enumerate(jobs):
        sample = job[0]
        total_reads = raw_results[i * 2]
        aligned_reads, alignments = raw_results[i * 2 + 1]
        perc_aligned = float(aligned_reads) / total_reads
        stats.append(
            (sample, alignments, aligned_reads, total_reads, perc_aligned))
    # Sort by sample name.
    stats = sorted(stats)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
              "Perc Aligned")
    table = [header]
    for sample, alignments, aligned_reads, total_reads, perc_aligned in stats:
        row = (
            sample,
            parselib.pretty_int(alignments),
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % (perc_aligned * 100),
        )
        assert len(row) == len(header)
        table.append(row)

    # Write out the table as text file.
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = mlib.findbin("txt2xls", quote=True)
    parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run ``backfill_vcf`` for every sample present in both VCF folders.

    ``antecedents`` is (in_vcf_node, bf_vcf_node).  Top-level ".vcf"
    files are listed from each node and matched by sample name (the
    middle element returned by ``mlib.splitpath``).  When the
    "backfill_common_only" user option is "no", any sample found in
    only one of the two folders is an error.  Output for each common
    sample is written to ``<out_path>/<sample>.vcf``.

    Returns a metadata dict holding the number of cores used.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_vcf_node, bf_vcf_node = antecedents
    in_vcf_filenames = filelib.list_files_in_path(
        in_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    bf_vcf_filenames = filelib.list_files_in_path(
        bf_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    filelib.safe_mkdir(out_path)
    metadata = {}

    common_only = mlib.get_user_option(
        user_options, "backfill_common_only",
        allowed_values=["no", "yes"], not_empty=True)

    in_vcf_samples = [mlib.splitpath(f)[1] for f in in_vcf_filenames]
    bf_vcf_samples = [mlib.splitpath(f)[1] for f in bf_vcf_filenames]

    # Make sure there are no duplicate sample names.
    in_sample_set = set(in_vcf_samples)
    bf_sample_set = set(bf_vcf_samples)
    assert len(in_vcf_samples) == len(in_sample_set), "Duplicate samples"
    assert len(bf_vcf_samples) == len(bf_sample_set), "Duplicate samples"

    # Find the samples.  Order follows the original file listings.
    common = [s for s in in_vcf_samples if s in bf_sample_set]
    in_only = [s for s in in_vcf_samples if s not in bf_sample_set]
    bf_only = [s for s in bf_vcf_samples if s not in in_sample_set]
    assert common, "No common samples."

    pretty_in = parselib.pretty_list(in_only, max_items=5)
    pretty_bf = parselib.pretty_list(bf_only, max_items=5)
    if common_only == "no":
        assert not (in_only and bf_only), \
            "Extra samples in both sets:\n%s\n%s" % (pretty_in, pretty_bf)
        assert not in_only, \
            "Target VCF file has extra samples: %s" % pretty_in
        assert not bf_only, \
            "Source VCF file has extra samples: %s." % pretty_bf

    # Sample names are unique, so a dict lookup finds the same file as
    # searching the parallel lists.
    sample2in = dict(zip(in_vcf_samples, in_vcf_filenames))
    sample2bf = dict(zip(bf_vcf_samples, bf_vcf_filenames))

    # list of (function, args, keywds)
    calls = []
    for sample in common:
        target = os.path.join(out_path, "%s.vcf" % sample)
        args = sample2in[sample], sample2bf[sample], target
        calls.append((backfill_vcf, args, {}))

    parallel.pyfun(calls, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with MuSE for each normal/cancer pair.

    ``antecedents`` is (bam_node, nc_node, ref_node).  Three steps are
    run for every (normal_sample, cancer_sample) pair read from the
    normal/cancer file:

      1. ``MuSE call``  writes ``<out_path>/<cancer>.call.MuSE.txt``
      2. ``MuSE sump``  writes ``<out_path>/<cancer>.vcf.raw``
      3. ``alignlib.clean_muse_vcf`` writes ``<out_path>/<cancer>.vcf``

    Output of each shell command is redirected to a log file, which
    must end up empty; the log files are removed at the end.

    User options read: "wgs_or_wes" ("wgs" adds -G, "wes" adds -E to
    ``MuSE sump``) and "muse_dbsnp_vcf" (must end in ".vcf.gz" and have
    a non-empty ".tbi" index beside it).

    Returns a metadata dict with "tool", "num_cores" and "commands"
    (the shell commands of steps 1 and 2).

    Every path placed on a shell command line is quoted with
    ``mlib.sq``, including the BAM files, the ``-O`` stem and the log
    redirect targets.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    dbsnp_file = mlib.get_user_option(
        user_options, "muse_dbsnp_vcf", not_empty=True, check_file=True)

    # Make sure dbsnp_file is compressed and indexed.
    assert dbsnp_file.endswith(".vcf.gz"), \
        "muse_dbsnp_vcf must be bgzip compressed."
    x = "%s.tbi" % dbsnp_file
    assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #          muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
    #          logfile1, logfile2)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
        muse_call_file = "%s.MuSE.txt" % muse_call_stem
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
        vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
        log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
        log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2
        jobs.append(x)

    # Generate the commands.
    # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
    #   bam04/196B-MG.bam bam04/PIM001_G.bam
    # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
    #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz
    MuSE = mlib.findbin("muse")
    sq = mlib.sq

    # Run the "call" step.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x

        x = [
            sq(MuSE),
            "call",
            "-O", sq(muse_call_stem),
            "-f", sq(ref.fasta_file_full),
            # The cancer BAM file comes first, then the normal one.
            sq(cancer_bamfile),
            sq(normal_bamfile),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(log_outfile1))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[8] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the call files are created and not empty.
    call_files = [x[5] for x in jobs]
    filelib.assert_exists_nz_many(call_files)

    # Run the "sump" step.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x

        x = [
            sq(MuSE),
            "sump",
            "-I", sq(muse_call_file),
        ]
        assert wgs_or_wes in ["wgs", "wes"]
        if wgs_or_wes == "wgs":
            x += ["-G"]
        else:
            x += ["-E"]
        x += [
            "-O", sq(raw_vcf_outfile),
            "-D", sq(dbsnp_file),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(log_outfile2))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = metadata["commands"] + commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[9] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the raw files are created and not empty.
    vcf_files = [x[6] for x in jobs]
    filelib.assert_exists_nz_many(vcf_files)

    # Fix the files.  These are python calls, not shell commands.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x
        args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
        x = alignlib.clean_muse_vcf, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)

    # Delete the log files.  They were checked to be empty above.
    for x in jobs:
        log_outfile1, log_outfile2 = x[8], x[9]
        if os.path.exists(log_outfile1):
            os.unlink(log_outfile1)
        if os.path.exists(log_outfile2):
            os.unlink(log_outfile2)

    # Make sure output VCF files exist.
    x = [x[7] for x in jobs]
    filelib.assert_exists_many(x)

    return metadata
def main():
    """Associate gene expression (or gene set scores) with phenotypes.

    Parses the command line, reads the expression/score matrix and the
    phenotype table, runs ``calc_association`` for every
    (phenotype, gene) combination, and writes a tab-delimited statistics
    table to ``<filestem>.stats.txt`` (or to STDOUT when no ``-o`` is
    given).  When a file stem is given, a Prism file, a boxplot and a
    waterfall plot are also generated for every combination that
    produced a score.
    """
    import os
    import sys
    import itertools
    import argparse

    import arrayio
    import analyze_clinical_outcome as aco
    import boxplot
    from genomicode import parallel

    parser = argparse.ArgumentParser(
        description="Associate gene expression patterns with a "
        "categorical phenotype.")

    parser.add_argument(
        'expression_file',
        help='Either a gene expression file (GCT,CDT,PCL format) or gene set '
        'scores from score_geneset.py.')
    parser.add_argument(
        'phenotype_file',
        help="Table of phenotypes (tab-delimited text "
        "file).")
    parser.add_argument(
        "--ignore_samples",
        help="Ignore the samples where an annotation "
        "(a column in the phenotype file) matches a specific value. "
        "Format:<header>,<value>")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of processors to use.")

    group = parser.add_argument_group(title='Analysis')
    group.add_argument(
        '--phenotype', default=[], action='append',
        help='Header in the phenotype file (MULTI). Format: <header>')
    group.add_argument(
        '--all_phenotypes', action="store_true",
        help="Analyze all phenotypes in the file.")
    # NOTE: this option is registered on the parser itself, not on the
    # 'Analysis' group.
    parser.add_argument(
        "--ignore_phenotype", default=[], action="append",
        help="Ignore this column in the phenotype file. "
        "Helpful to get rid of the sample column when using "
        "--all_phenotypes. Format: <header> (MULTI)")
    group.add_argument(
        '--ignore_insufficient_groups', action="store_true",
        help="If a phenotype only has one group, then ignore it rather "
        "than raising an error.")
    group.add_argument(
        '--gene', default=[], action='append',
        help='Comma separated name or ID of genes to analyze. '
        'I will search for this gene in the annotations of the '
        'expression_file. '
        'You can use this parameter multiple times to search more genes.')
    group.add_argument(
        "--empty_vs_filled", action="store_true",
        help="Instead of categorizing by the contents of the cells, "
        "compare the ones that are empty against the ones that are filled.")
    group.add_argument(
        "--all_genes", action="store_true",
        help="Run analysis on all genes in this file.")
    group.add_argument(
        '--geneset', default=[], action='append',
        help='Name of the geneset to analyze. To specify multiple gene sets, '
        'use this parameter multiple times.')
    group.add_argument(
        "--center_by_phenotype",
        help="Center the scores or gene expression values seen for a "
        "phenotype to 0. Only one --phenotype can be analyzed in this way "
        "at a time. This phenotype should have two possible values. "
        "If there are more values, they need to be merged into two groups. "
        "Each phenotype must be seen in each BATCH. "
        "Format: <BATCH_HEADER>;<PHENO 1 VALUE>[,<PHENO 1 VALUE>,...];"
        "<PHENO 2 VALUE>[,<PHENO 2 VALUE>,...]")

    group = parser.add_argument_group(title='Output')
    group.add_argument(
        '-o', dest='filestem', default=None,
        help='Prefix used to name files. e.g. "myanalysis".')
    group.add_argument(
        "--gene_header", action="append", default=[],
        help="When naming the output file, use the gene name(s) under this "
        "Header (MULTI). If not given, will try to use a combination of the "
        "probe ID and gene symbol.")

    group = parser.add_argument_group(title='Formatting the boxplot')
    # --box_mar_{left,bottom,top} followed by --water_mar_{left,bottom,top}
    for plot in ("box", "water"):
        for side in ("left", "bottom", "top"):
            group.add_argument(
                "--%s_mar_%s" % (plot, side), default=1.0, type=float,
                help="Scale margin at %s of plot. Default 1.0 (no scaling)."
                % side)
    group.add_argument(
        "--water_xlabel_off", action="store_true",
        help="Do not label the X axis on the waterfall plot.")
    # (The Kaplan-Meier formatting options, --km_*, are disabled.)

    args = parser.parse_args()

    # Check inputs.
    assert args.expression_file, (
        'Please specify a gene expression or gene set score file.')
    assert os.path.exists(args.expression_file), \
        "File not found: %s" % args.expression_file
    assert args.phenotype_file, 'Please specify a phenotype file.'
    assert os.path.exists(args.phenotype_file), \
        "File not found: %s" % args.phenotype_file
    assert 1 <= args.num_procs < 100
    assert args.phenotype or args.all_phenotypes, \
        'Please specify the phenotype to analyze.'
    assert not (args.phenotype and args.all_phenotypes)
    assert args.gene or args.geneset or args.all_genes, \
        'Please specify a gene or gene set.'
    assert not (args.gene and args.all_genes)
    has_gene = args.gene or args.all_genes
    assert not (has_gene and args.geneset), \
        'Please specify either a gene or a gene set, not both.'
    for plot in ("box", "water"):
        for side in ("bottom", "left", "top"):
            scale = getattr(args, "%s_mar_%s" % (plot, side))
            assert scale > 0 and scale < 10

    # Clean up the input.
    phenotypes = parse_phenotypes(args.phenotype)
    genes = aco.parse_genes(args.gene)
    gene_sets = aco.parse_gene_sets(args.geneset)
    center_batch, center_group1, center_group2 = parse_groups(
        args.center_by_phenotype)
    filestem = aco.parse_filestem(args.filestem)

    if center_batch:
        assert len(phenotypes) == 1, \
            "Only 1 phenotype can be centered by groups."

    # Read the input files.
    M = aco.read_expression_or_geneset_scores(
        genes, args.all_genes, gene_sets, args.expression_file)
    M, clinical_annots = aco.read_clinical_annotations(
        M, args.phenotype_file)

    # Filter the phenotype files.
    if args.ignore_samples:
        M, clinical_annots = ignore_samples(
            M, clinical_annots, args.ignore_samples)

    if args.all_phenotypes:
        phenotypes = sorted(clinical_annots)
    phenotypes = [p for p in phenotypes if p not in args.ignore_phenotype]

    # Make sure at least one of the phenotypes are in the clinical
    # annotations.
    found = [p for p in phenotypes if p in clinical_annots]
    assert found, "Could not find phenotypes: %s" % ", ".join(phenotypes)
    phenotypes = found

    # Select the genes or gene sets of interest.
    if not args.all_genes:
        M = M.matrix(row=(genes or gene_sets))
    assert M.nrow(), "I could not find any of the genes or gene sets."

    # Make sure the batch information is valid.
    if center_batch:
        assert center_batch in clinical_annots, \
            "Missing annotation: %s" % center_batch
        assert len(phenotypes) == 1
        for value in clinical_annots[phenotypes[0]]:
            assert value in center_group1 or value in center_group2, \
                "Unknown phenotype: %s" % value

    # Calculate the association of each gene and each phenotype.
    jobs = []   # list of (function, args, keywds)
    keys = []   # list of (pheno_header, gene index), parallel to jobs
    for pheno_header, i in itertools.product(phenotypes, range(M.nrow())):
        phenotype = clinical_annots[pheno_header]
        if args.empty_vs_filled:
            # Recode each cell as "1" if it has content, "0" if blank.
            recoded = ["0"] * len(phenotype)
            for j in range(len(phenotype)):
                if phenotype[j].strip():
                    recoded[j] = "1"
            phenotype = recoded
        scores = M.value(i, None)
        if center_batch:
            batch = clinical_annots[center_batch]
            scores = center_scores(
                scores, batch, phenotype, center_group1, center_group2)
        fn_args = phenotype, scores, args.ignore_insufficient_groups
        jobs.append((calc_association, fn_args, {}))
        keys.append((pheno_header, i))
    retvals = parallel.pyfun(jobs, num_procs=args.num_procs)
    assert len(retvals) == len(keys)

    # (header, gene_index) -> returned from calc_association
    gene_phenotype_scores = {}
    for key, retval in zip(keys, retvals):
        if retval is None:
            continue
        gene_phenotype_scores[key] = retval

    # Files generated:
    # <filestem>.stats.txt      Or to STDOUT if no <filestem> given.
    # <filestem>.<outcome>.<gene_id>.waterfall.png
    # <filestem>.<outcome>.<gene_id>.boxplot.png
    # <filestem>.<outcome>.<gene_id>.prism.txt    Prism format.

    # Write the output in a table with headers:
    # <headers>            # From the expression or gene set file.
    # Phenotype
    # Groups               # one for each group
    # Num Samples          # one for each group, separated by semicolon
    # Average Expression   # one for each group, separated by semicolon
    # Relationship
    # p-value
    outhandle = sys.stdout
    if filestem:
        outhandle = open("%s.stats.txt" % filestem, 'w')

    # Figure out the header for the table.
    header = M.row_names() + [
        "Phenotype", "Groups", "Num Samples", "Average Expression",
        "Delta", "Relationship", "p-value"]
    outhandle.write("\t".join(header) + "\n")

    # Write out each row of the table.
    _fmt = aco._format_list
    for pheno_header, gene_i in itertools.product(
            phenotypes, range(M.nrow())):
        SCORE = gene_phenotype_scores.get((pheno_header, gene_i))
        if not SCORE:   # couldn't calculate.
            continue

        gene_names = [M.row_names(name)[gene_i] for name in M.row_names()]
        group_names = SCORE["group_names"]
        indexes = range(len(group_names))
        num_samples = [SCORE["num_samples"][g] for g in indexes]
        mean_score = [SCORE["mean_score"][g] for g in indexes]
        # Delta is only reported when there are exactly two groups.
        delta = ""
        if len(group_names) == 2:
            delta = SCORE["delta"]

        row = gene_names + [
            pheno_header, _fmt(group_names), _fmt(num_samples),
            _fmt(mean_score), delta, SCORE["relationship"],
            SCORE["p_value"]]
        assert len(row) == len(header)
        outhandle.write("\t".join(map(str, row)) + "\n")
    if filestem:
        outhandle.close()

    # Write out other files.
    if not filestem:
        return

    jobs = []   # list of (fn, args, keywds)
    for pheno_header, gene_i in itertools.product(
            phenotypes, range(M.nrow())):
        SCORE = gene_phenotype_scores.get((pheno_header, gene_i))
        if not SCORE:
            continue

        # NOTE: gene_id is computed but not used below.
        gene_id = aco.format_gene_name(M, None, gene_i)
        sample_names = M.col_names(arrayio.COL_ID)

        # Write the PRISM file.
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "prism", "txt")
        prism_args = (
            filename, SCORE["scores"], SCORE["phenotypes"],
            SCORE["group_names"])
        jobs.append((write_prism_file, prism_args, {}))

        # Make a boxplot.
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "boxplot", "png")
        pretty_gene = aco.pretty_gene_name(M, args.gene_header, gene_i)
        pheno2scores = {}
        for pheno, score in zip(SCORE["phenotypes"], SCORE["scores"]):
            pheno2scores.setdefault(pheno, []).append(score)
        box_args = (filename, SCORE["group_names"], pheno2scores)
        # NOTE(review): --box_mar_top is parsed and range-checked above,
        # but mar_top here is the fixed value 1.25 -- confirm intent.
        box_keywds = {
            "height" : 1600,
            "width" : 1600,
            "title" : pretty_gene,
            "subtitle" : "p=%.2g" % SCORE["p_value"],
            "subtitle_col" : "#A60400",
            "subtitle_size" : 1.2,
            "subtitle_line" : 0.5,
            "ylab" : "Gene Expression",
            "mar_bottom" : args.box_mar_bottom,
            "mar_left" : args.box_mar_left,
            "mar_top" : 1.25,
        }
        jobs.append((boxplot.plot_boxplot, box_args, box_keywds))

        # Make a waterfall plot.
        filename = aco._make_filename(
            M, gene_i, filestem, pheno_header, args.gene_header,
            "waterfall", "png")
        pretty = aco.pretty_gene_name(M, args.gene_header, gene_i)
        water_args = (
            filename, SCORE["scores"], SCORE["phenotypes"],
            SCORE["group_names"], sample_names, SCORE["p_value"], pretty,
            args.water_mar_bottom, args.water_mar_left, args.water_mar_top,
            args.water_xlabel_off)
        jobs.append((plot_waterfall, water_args, {}))

    parallel.pyfun(jobs, num_procs=args.num_procs)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Annotate each caller's VCF file with snpEff.

    antecedents is a (vcf_node, nc_node) pair: vcf_node.identifier is
    a directory of <caller>.vcf files and nc_node.identifier is a
    normal/cancer sample file.  Each VCF file is filtered with
    vcflib.filter_vcf_file and then annotated.  The annotated file is
    written to <out_path>/<caller>.vcf.

    Returns a metadata dict holding the snpEff commands and the number
    of processes used.

    Raises AssertionError if no .vcf files are found, the requested
    genome database is unknown, an output file is missing or empty, or
    snpEff left anything in a log file.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import vcflib
    from Betsy import module_utils as mlib

    vcf_node, nc_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Filenames:
    # <caller>.vcf

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    genome = mlib.get_user_option(
        user_options, "snpeff_genome", not_empty=True)
    databases = list_snpeff_databases()
    assert genome in databases, "Unknown genome database: %s" % genome

    # For each caller, do the SnpEFF calls.  Some callers include
    # the somatic information, others do not.  If germline samples
    # are present, then do with _cancer.  Otherwise, do not.

    # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
    #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

    # Don't bother annotating positions that do not pass filter.
    # Filter them out first based on FILTER column.
    opj = os.path.join
    jobs = []
    for in_filename in vcf_filenames:
        path, stem, ext = mlib.splitpath(in_filename)
        # Keep the stem on the job.  It is needed to report which
        # caller failed if snpEff writes to its log file.
        job = filelib.GenericObject(
            stem=stem,
            in_filename=in_filename,
            samples_file=opj(out_path, "%s.cancerSamples.txt" % stem),
            filtered_filename=opj(out_path, "%s.filtered_input" % stem),
            out_filename=opj(out_path, "%s.vcf" % stem),
            log_filename=opj(out_path, "%s.log" % stem),
        )
        jobs.append(job)

    # First, filter each of the VCF files.
    commands = []
    for j in jobs:
        # For debugging.  If this file exists, don't filter it again.
        if os.path.exists(j.filtered_filename):
            continue
        args = j.in_filename, j.filtered_filename, wgs_or_wes
        commands.append((vcflib.filter_vcf_file, args, {}))
    parallel.pyfun(commands, num_procs=num_cores)

    # Make the cancer_samples files.
    for j in jobs:
        # Will generate this if there are cancer samples.
        make_cancer_samples_file(
            j.filtered_filename, nc_match, j.samples_file)

    # Make a list of commands.  Use snpEff's cancer mode only for the
    # files that got a cancer samples file.
    commands = []
    for j in jobs:
        cancer = os.path.exists(j.samples_file)
        cmd = make_snpeff_command(
            j.filtered_filename, genome, j.out_filename, j.log_filename,
            is_cancer=cancer, cancer_samples_file=j.samples_file)
        commands.append(cmd)
    nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_cores"] = nc

    # Make sure the analysis completed successfully.
    out_filenames = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(out_filenames)

    # Log files should be empty.
    for j in jobs:
        filelib.assert_exists(j.log_filename)
        assert not filelib.exists_nz(j.log_filename), \
            "Error with %s.\n%s" % (j.stem, j.log_filename)
        filelib.safe_unlink(j.log_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Compute read coverage for every BAM file.

    antecedents is a (bam_node, ref_node) pair.  Unmapped reads are
    filtered out of each BAM file, coverage is counted with genomecov,
    and the average read depth is summarized in
    <out_path>/summary.xls.  If the "features_bed" option is given,
    per-target coverage is also calculated with Picard.

    Returns a metadata dict (tool, num_cores, commands and, when set,
    features_bed).
    """
    import os
    from genomicode import filelib
    from genomicode import ngslib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    features_bed = mlib.get_user_option(
        user_options, "features_bed", check_file=True)
    if features_bed:
        metadata["features_bed"] = features_bed

    # Coverage threshold.  Only applies to genomecov.  A missing or
    # blank option means no threshold.
    min_coverage = user_options.get("ignore_coverage_below")
    if min_coverage is None or min_coverage == "":
        min_coverage = None
    else:
        min_coverage = int(min_coverage)
        assert min_coverage >= 0

    metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
    metadata["num_cores"] = num_cores
    metadata["commands"] = []

    # Attributes of each job, as (attribute name, filename template).
    # The template is filled in with the sample name.
    #   bam_filename        BAM file after filtering out unmapped reads.
    #   genomecov_filename  Generated by genomecov.  Histogram.
    #   histo_datafile      Data file to generate histogram (from cov).
    #   histo_plotfile      Histogram plot.
    #   histo_prismfile     To make histogram in PRISM.
    # ONLY USED IF features_bed:
    #   intervallist_file   Made from BED file.
    #   cov_filename        Generated by Picard.
    #   targetcov_filename  Generated by Picard.  Per target coverage.
    #   log_filename        Output from Picard.
    templates = [
        ("bam_filename", "%s.bam"),
        ("genomecov_filename", "%s.genomecov.txt"),
        ("histo_datafile", "%s.histo.txt"),
        ("histo_plotfile", "%s.histo.png"),
        ("histo_prismfile", "%s.prism.txt"),
        ("intervallist_file", "%s.interval.txt"),
        ("cov_filename", "%s.coverage.txt"),
        ("targetcov_filename", "%s.targetcov.txt"),
        ("log_filename", "%s.picard.log"),
    ]

    jobs = []  # list of filelib.GenericObject
    for orig_bam in bam_filenames:
        # <in_path>/<sample>.bam
        in_path, sample, ext = mlib.splitpath(orig_bam)
        assert ext == ".bam"
        # The filtered BAM file must not overwrite the original.
        assert os.path.join(out_path, "%s.bam" % sample) != orig_bam
        attrs = {"sample": sample, "orig_bam_filename": orig_bam}
        for attr_name, template in templates:
            attrs[attr_name] = os.path.join(out_path, template % sample)
        jobs.append(filelib.GenericObject(**attrs))

    # Remove unmapped reads from the BAM files.  Otherwise, Picard
    # might complain about alignments generated by BWA:
    #   htsjdk.samtools.SAMFormatException: SAM validation error:
    #   ERROR: Record ..., MAPQ should be 0 for unmapped read.
    filter_cmds = [
        _make_samtools_filter_cmd(job.orig_bam_filename, job.bam_filename)
        for job in jobs]
    parallel.pshell(filter_cmds, max_procs=num_cores)
    filelib.assert_exists_nz_many([job.bam_filename for job in jobs])

    if features_bed:
        # Generate the intervallist_file(s).
        interval_jobs = []
        for job in jobs:
            fn_args = job.intervallist_file, features_bed, job.bam_filename
            interval_jobs.append((_make_intervallist_file, fn_args, {}))
        parallel.pyfun(interval_jobs, num_procs=num_cores)

        # Run Picard.
        picard_cmds = [
            _make_calculatehsmetrics_command(
                job.intervallist_file, job.bam_filename, job.cov_filename,
                job.targetcov_filename, ref.fasta_file_full,
                job.log_filename)
            for job in jobs]
        metadata["commands"].append(picard_cmds)
        parallel.pshell(picard_cmds, max_procs=num_cores)
        expected = [job.cov_filename for job in jobs]
        expected = expected + [job.targetcov_filename for job in jobs]
        filelib.assert_exists_nz_many(expected)

    # Count the read depth with genomecov.
    genomecov_cmds = _run_genomecov(jobs, ref_node.identifier, num_cores)
    metadata["commands"].append(genomecov_cmds)

    # Summarize the average read depth.
    summary_file = os.path.join(out_path, "summary.xls")
    _summarize_average_read_depth(jobs, min_coverage, summary_file)

    # Write the data for a histogram of the read depth of each sample.
    for job in jobs:
        _make_histo_file(job.genomecov_filename, job.histo_datafile)

    # The filtered BAM files are no longer needed.  Delete them to
    # save space.
    for job in jobs:
        filelib.assert_exists_nz(job.bam_filename)
        os.unlink(job.bam_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Add the snpEff annotations to a SimpleVariantMatrix.

    antecedents is a (svm_node, vcf_node) pair.  The pipeline is:
      1. vcf_filenames         the VCF files found in vcf_node
      2. <caller>.parsed.txt   one parsed file for each VCF file
      3. snpeff.merged.txt     all the parsed files merged together
      4. snpeff.clean.txt      annotations cleaned up to final form
      5. outfile               the matrix with the annotations added
    Intermediate files are written to the current directory and are
    not regenerated if they already exist (for debugging).

    Returns a metadata dict with the number of cores used.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MERGED_FILE = "snpeff.merged.txt"
    CLEANED_FILE = "snpeff.clean.txt"

    svm_node, vcf_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", not_empty=True)
    metadata = {}

    jobs = []
    for vcf_filename in vcf_filenames:
        path, caller, ext = mlib.splitpath(vcf_filename)
        job = filelib.GenericObject(
            caller=caller,
            vcf_filename=vcf_filename,
            parsed_snpeff_file="%s.parsed.txt" % caller,
        )
        jobs.append(job)

    # Parse each of the snpEff files.  Skip the ones that have already
    # been parsed.
    todo = [job for job in jobs
            if not os.path.exists(job.parsed_snpeff_file)]
    commands = []
    for job in todo:
        fn_args = job.vcf_filename, job.parsed_snpeff_file
        commands.append((parse_snpeff_file, fn_args, {}))
    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    # Merge whichever parsed files were actually produced.
    parsed_files = []
    for job in jobs:
        if os.path.exists(job.parsed_snpeff_file):
            parsed_files.append(job.parsed_snpeff_file)
    if not filelib.exists_nz(MERGED_FILE):
        merge_parsed_files(parsed_files, MERGED_FILE)

    # Clean up the merged file.  Coordinates should be unique.
    if not filelib.exists_nz(CLEANED_FILE):
        clean_snpeff_file(MERGED_FILE, CLEANED_FILE)

    # Finally, add the annotations to the SimpleVariantMatrix.
    add_snpeff_to_svm(svm_node.identifier, CLEANED_FILE, outfile)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with Strelka for each normal/cancer pair.

    antecedents is a (bam_node, nc_node, ref_node) triple.  For each
    pair in the normal/cancer file, a config file and an analysis
    directory are created under out_path, the analysis is run with
    "make", and the resulting VCF file is cleaned and saved as
    <out_path>/<cancer_sample>.vcf.  out_attributes["vartype"]
    ("snp" or "indel") selects which Strelka result file is used.

    Returns a metadata dict with the number of cores used.

    Raises AssertionError if there are no BAM files, vartype is
    missing or invalid, a cancer sample is listed more than once, or
    an expected Strelka result file is missing or empty.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # <analysis_path>/results/all.somatic.snvs.vcf
    # <analysis_path>/results/all.somatic.indels.vcf
    SNV_FILE = "all.somatic.snvs.vcf"
    vartype2file = {
        "snp": SNV_FILE,
        "indel": "all.somatic.indels.vcf",
    }

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out Strelka version.

    x = mlib.get_user_option(
        user_options, "strelka_skip_depth_filter",
        allowed_values=["no", "yes"], not_empty=True)
    skip_depth_filter = (x == "yes")

    assert "vartype" in out_attributes, "Missing attribute: vartype"
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]
    assert vartype in vartype2file
    result_file = vartype2file[vartype]

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # Make sure each cancer sample is unique.  Otherwise, the
    # analysis directories will conflict.
    seen = set()
    dups = set()
    for normal_sample, cancer_sample in nc_match:
        if cancer_sample in seen:
            dups.add(cancer_sample)
        seen.add(cancer_sample)
    assert not dups, "NormalCancerFile contains multiple instances of: %s"\
        % ", ".join(sorted(dups))

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #          config_file, output_dir)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        config_file = opj(out_path, "config.%s.ini" % cancer_sample)
        analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path
        jobs.append(x)

    # Make each of the config files.
    for x in jobs:
        config_file = x[4]
        _make_config_file(config_file, skip_depth_filter=skip_depth_filter)

    # Make the analysis directories.
    jobs2 = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        fn = _make_analysis_directory
        args = (analysis_path, config_file, ref.fasta_file_full,
                normal_bamfile, cancer_bamfile)
        keywds = None
        jobs2.append((fn, args, keywds))
    parallel.pyfun(jobs2, num_procs=num_cores)

    # Run the analysis.  The samples are processed one at a time, and
    # each "make" is given all the cores.
    for x in jobs:
        analysis_path = x[-1]
        cmd = "make -j %d" % num_cores
        parallel.sshell(cmd, path=analysis_path)
    metadata["num_cores"] = num_cores

    # Make sure files exist.  Always check the SNV file, and also the
    # file that will actually be read below.  For vartype "indel",
    # that is a different file.
    to_check = [SNV_FILE]
    if result_file not in to_check:
        to_check.append(result_file)
    expected = []
    for x in jobs:
        analysis_path = x[-1]
        for f in to_check:
            expected.append(opj(analysis_path, "results", f))
    filelib.assert_exists_nz_many(expected)

    # Clean the VCF files and save into the out_path.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        src_file = opj(analysis_path, "results", result_file)
        dst_file = opj(out_path, "%s.vcf" % cancer_sample)
        alignlib.clean_strelka_vcf(
            normal_sample, cancer_sample, src_file, dst_file)

    #metadata["commands"] = commands
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Merge the FASTQ files that belong to the same sample.

    antecedents is a (fastq_node, group_node) pair: a directory of
    FASTQ files and a sample group file.  The merged files are named:
      <Sample>.fastq          if single end
      <Sample>_<Pair>.fastq   if paired end
    If a sample ends up with more than one output file, the files are
    checked with check_fastq_alignment.

    Returns a metadata dict with the number of cores used.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    # This is I/O heavy, so don't use so many cores.
    MAX_CORES = 2

    fastq_node, group_node = antecedents
    fastq_path = fastq_node.identifier
    sample_group_file = group_node.identifier
    filelib.safe_mkdir(out_path)
    metadata = {}

    module_utils.assert_sample_group_file(sample_group_file, fastq_path)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)
    sample_groups = module_utils.fix_sample_group_filenames(
        sample_groups, fastq_path)

    # For merging, the order of the files in the sample_group_file
    # must be maintained.  Otherwise, they will be merged out of
    # order.
    jobs = []  # list of (in_filename, sample, pair, out_filename)
    for in_filename, sample, pair in sample_groups:
        assert os.path.exists(in_filename)
        if pair:
            out_file = "%s_%s.fastq" % (sample, pair)
        else:
            out_file = "%s.fastq" % sample
        out_filename = os.path.join(out_path, out_file)
        jobs.append((in_filename, sample, pair, out_filename))

    # out_filename -> list of in_filenames, in their original order.
    out2ins = {}
    for in_filename, sample, pair, out_filename in jobs:
        out2ins.setdefault(out_filename, []).append(in_filename)

    commands = []
    for out_filename, in_filenames in out2ins.items():
        # Debugging.  Don't merge again if it already exists.
        if not os.path.exists(out_filename):
            fn_args = in_filenames, out_filename
            commands.append((merge_or_symlink_files, fn_args, {}))
    commands.sort()
    nc = min(MAX_CORES, num_cores)
    parallel.pyfun(commands, nc)
    metadata["num_cores"] = nc

    # If the files are paired, make sure they are paired correctly.
    # sample -> list of unique out filenames
    sample2outfiles = {}
    for in_filename, sample, pair, out_filename in jobs:
        outfiles = sample2outfiles.setdefault(sample, [])
        if out_filename not in outfiles:
            outfiles.append(out_filename)

    commands = []
    for sample in sorted(sample2outfiles):
        out_filenames = sorted(sample2outfiles[sample])
        # Nothing to align if there is only one file.
        if len(out_filenames) != 1:
            fn_args = sample, out_filenames
            commands.append((check_fastq_alignment, fn_args, {}))
    commands.sort()
    retvals = parallel.pyfun(commands, nc)
    assert len(retvals) == len(commands)
    # A true return value is an error message.
    errors = [msg for msg in retvals if msg]
    assert not errors, "\n".join(errors)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants with Radia from matched DNA and RNA BAM files.

    antecedents is a (dna_bam_node, rna_bam_node, nc_node, ref_node)
    tuple.  For each normal/cancer pair and each usable chromosome,
    this runs three steps:
      1. radia.py         call variants     -> radia1.tmp/
      2. filterRadia.py   filter variants   -> radia2.tmp/
      3. mergeChroms.py   merge chromosomes -> radia3.tmp/
    and then cleans the merged VCF file into
    <out_path>/<cancer_sample>.vcf.  The radia*.tmp directories and
    the directories of symlinked BAM files are created relative to
    the current working directory.

    Returns a metadata dict with the tool version, the number of
    cores, and the shell commands that were generated.

    Raises AssertionError if BAM files are missing, the genome
    assembly is not hg19, no chromosome is common to all BAM files, a
    log file is not empty, or an expected output file is missing or
    empty.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # For debugging.  Set one to False to skip that step and reuse
    # the files from a previous run.
    RUN_VARIANT_CALLING = True
    FILTER_CALLS = True
    MERGE_CALLS = True
    FIX_VCF_FILES = True

    dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
    dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
    assert dna_bam_filenames, "No DNA .bam files."
    rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
    assert rna_bam_filenames, "No RNA .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

    ## Make sure the BAM files do not contain spaces in the
    ## filenames.  Radia doesn't work well with spaces.
    #filenames = dna_bam_filenames + rna_bam_filenames
    #has_spaces = []
    #for filename in filenames:
    #    if filename.find(" ") >= 0:
    #        has_spaces.append(filename)
    #x = has_spaces
    #if len(x) > 5:
    #    x = x[:5] + ["..."]
    #x = ", ".join(x)
    #msg = "Radia breaks if there are spaces in filenames: %s" % x
    #assert not has_spaces, msg

    # sample -> bam filename
    dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
    rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
    # Make sure files exist for all the samples.  The DNA-Seq
    # should have both normal and cancer.  RNA is not needed for
    # normal sample.
    mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
    mlib.assert_normal_cancer_samples(
        nc_match, rnasample2bamfile, ignore_normal_sample=True)

    # Make sure Radia and snpEff are configured.
    radia_genome_assembly = mlib.get_user_option(
        user_options, "radia_genome_assembly", not_empty=True)
    assert radia_genome_assembly == "hg19", "Only hg19 handled."
    snp_eff_genome = mlib.get_user_option(
        user_options, "snp_eff_genome", not_empty=True)

    radia_path = mlib.get_config("radia_path", assert_exists=True)
    snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
    radia_files = get_radia_files(radia_path, radia_genome_assembly)

    # Make a list of the chromosomes to use.  Pick an arbitrarily
    # BAM file.  Look at only the chromosomes that are present in
    # all files.
    # NOTE: Adding the results of values() only works in Python 2,
    # where values() returns a list.
    all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
    chroms = list_common_chromosomes(all_bamfiles)
    assert chroms, "No chromosomes found in all files."
    # Only use the chromosomes that can be filtered by Radia.
    chroms = filter_radia_chromosomes(chroms, radia_files)

    # Make output directories (in the current working directory).
    radia_outpath = "radia1.tmp"
    filter_outpath = "radia2.tmp"
    merge_outpath = "radia3.tmp"

    if not os.path.exists(radia_outpath):
        os.mkdir(radia_outpath)
    if not os.path.exists(filter_outpath):
        os.mkdir(filter_outpath)
    if not os.path.exists(merge_outpath):
        os.mkdir(merge_outpath)

    # Steps:
    # 1.  Call variants (radia.py)
    #     -o <file.vcf>
    # 2.  Filter variants (filterRadia.py)
    #     <outpath>
    #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
    # 3.  Merge (mergeChroms.py)
    #     Takes as input: <filter_outpath>
    #     Produces: <merge_outpath>/<patient_id>.vcf

    # list of (normal_sample, cancer_sample, chrom,
    #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
    #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
    #   final_vcf_outfile,
    #   radia_logfile, filter_logfile, merge_logfile)
    # The code below indexes into these tuples by position:
    #   [7] filter_vcf_outfile  [9] final_vcf_outfile
    #   [10] radia_logfile  [11] filter_logfile  [12] merge_logfile
    # Keep the field order in sync with those indexes.
    opj = os.path.join
    jobs = []
    for i, (normal_sample, cancer_sample) in enumerate(nc_match):
        normal_bamfile = dnasample2bamfile[normal_sample]
        dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
        rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

        # The merge and final files are per sample.  They are shared
        # by all the chromosome jobs for this sample.
        merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
        merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
        final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

        # One job for each (sample, chromosome).
        for chrom in chroms:
            radia_vcf_outfile = opj(
                radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            filter_vcf_outfile = opj(
                filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            radia_logfile = opj(
                radia_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            filter_logfile = opj(
                filter_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            x = normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs.append(x)

    # Since Radia doesn't work well if there are spaces in the
    # filenames, symlink these files here to guarantee that there
    # are no spaces.
    normal_path = "normal.bam"
    dna_path = "dna.bam"
    rna_path = "rna.bam"
    if not os.path.exists(normal_path):
        os.mkdir(normal_path)
    if not os.path.exists(dna_path):
        os.mkdir(dna_path)
    if not os.path.exists(rna_path):
        os.mkdir(rna_path)
    # Replace the BAM filenames in each job with the symlinked ones.
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
        x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
        x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
        clean_normal, clean_dna, clean_rna = x1, x2, x3
        x = normal_sample, cancer_sample, chrom, \
            clean_normal, clean_dna, clean_rna, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile
        jobs[i] = x

    # Generate the commands for doing variant calling.
    python = mlib.get_config("python", which_assert_file=True)

    # filterRadia.py calls the "blat" command, and there's no way
    # to set the path.  Make sure "blat" is executable.
    if not filelib.which("blat"):
        # Find "blat" in the configuration and add it to the path.
        x = mlib.get_config("blat", which_assert_file=True)
        path, x = os.path.split(x)
        # NOTE: This raises a KeyError if PATH is not set at all.
        if os.environ["PATH"]:
            path = "%s:%s" % (os.environ["PATH"], path)
        os.environ["PATH"] = path
        # Make sure it's findable now.
        filelib.which_assert("blat")

    # STEP 1.  Call variants with radia.py.
    # python radia.py test31 5 \
    #   -n bam04/PIM001_G.bam \
    #   -t bam04/196B-MG.bam \
    #   -r bam34/196B-MG.bam \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   -o test32.vcf
    #   --dnaTumorMitochon MT \
    #   --rnaTumorMitochon MT \
    # NOTE(review): cancer_sample, chrom, the -o file, and the log
    # file are not passed through sq().  Presumably sq() quotes its
    # argument for the shell.  If so, a sample name that contains a
    # space would break the command.  TODO confirm.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.radia_py),
            cancer_sample,
            chrom,
            "-n", sq(normal_bamfile),
            "-t", sq(dna_tumor_bamfile),
            "-r", sq(rna_tumor_bamfile),
            "-f", sq(ref.fasta_file_full),
            "-o", radia_vcf_outfile,
            ]
        # The mitochondrial flags are added to every command (not
        # just the one for MT) if MT is one of the chromosomes.
        if "MT" in chroms:
            x += [
                "--dnaNormalMitochon MT",
                "--dnaTumorMitochon MT",
                "--rnaTumorMitochon MT",
                ]
        x = " ".join(x)
        # Send both stdout and stderr to the log file.
        x = "%s >& %s" % (x, radia_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Only uses ~200 Mb of ram.
    if RUN_VARIANT_CALLING:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure log files are empty.  (x[10] is radia_logfile.)
    logfiles = [x[10] for x in jobs]
    filelib.assert_exists_z_many(logfiles)

    # STEP 2.  Filter variants with filterRadia.py.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.filterRadia_py),
            cancer_sample,
            chrom,
            sq(radia_vcf_outfile),
            sq(filter_outpath),
            sq(radia_files.scripts_dir),
            "-b", sq(radia_files.blacklist_dir),
            "-d", sq(radia_files.snp_dir),
            "-r", sq(radia_files.retro_dir),
            "-p", sq(radia_files.pseudo_dir),
            "-c", sq(radia_files.cosmic_dir),
            "-t", sq(radia_files.target_dir),
            "-s", sq(snp_eff_path),
            "-e", snp_eff_genome,
            "--rnaGeneBlckFile", sq(radia_files.rnageneblck_file),
            "--rnaGeneFamilyBlckFile", sq(
                radia_files.rnagenefamilyblck_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, filter_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Sometimes samtools crashes in the middle of a run.  Detect
    # this case, and re-run the analysis if needed.
    assert len(commands) == len(jobs)
    # Wrap each shell command in a Python function that can restart
    # it.
    py_commands = []
    for x, cmd in zip(jobs, commands):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = cmd, cancer_sample, chrom, filter_logfile
        x = _run_filterRadia_with_restart, args, {}
        py_commands.append(x)
    # Takes ~10 Gb each.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    if FILTER_CALLS:
        parallel.pyfun(py_commands, num_procs=nc)
    metadata["commands"] += commands

    # Make sure log files are empty.  (x[11] is filter_logfile.)
    logfiles = [x[11] for x in jobs]
    filelib.assert_exists_z_many(logfiles)

    # Make sure filter_vcf_outfile exists.  (x[7])
    outfiles = [x[7] for x in jobs]
    filelib.assert_exists_nz_many(outfiles)

    # STEP 3.  Merge the results.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
        #   radia2.tmp/ radia3.tmp
        # The "/" after radia2.tmp is important.  If not given,
        # will generate some files with only newlines.

        fo = filter_outpath
        if not fo.endswith("/"):
            fo = "%s/" % fo
        x = [
            sq(python),
            sq(radia_files.mergeChroms_py),
            cancer_sample,
            fo,
            merge_outpath,
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, merge_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Since the chromosomes were separated for the previous steps,
    # this will generate one merge for each chromosome.  This is
    # unnecessary, since we only need to merge once per sample.
    # Get rid of duplicates.
    commands = sorted({}.fromkeys(commands))
    if MERGE_CALLS:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] += commands

    # Make sure log files are empty.  (x[12] is merge_logfile, which
    # is shared by all the jobs of a sample, so remove duplicates.)
    logfiles = [x[12] for x in jobs]
    logfiles = sorted({}.fromkeys(logfiles))
    filelib.assert_exists_z_many(logfiles)

    # Fix the VCF files.
    # NOTE(review): Unlike the merge commands, these are not
    # de-duplicated, so there is one identical call per chromosome
    # for each sample, all writing to the same final_vcf_outfile.
    # Looks redundant -- confirm whether this is intended.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = normal_sample, cancer_sample, \
            merge_vcf_outfile, final_vcf_outfile
        x = alignlib.clean_radia_vcf, args, {}
        commands.append(x)
    if FIX_VCF_FILES:
        parallel.pyfun(commands, num_procs=num_cores)

    # Make sure output VCF files exist.  (x[9] is final_vcf_outfile.)
    x = [x[9] for x in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Subtract the mouse reads from each of the FASTQ files.

    antecedents is a (fastq_node, sample_node, summary_node) triple:
    a directory of merged FASTQ files, a sample group file, and a
    directory of <sample>.matches.txt summary files.  For each FASTQ
    file, subtract_mouse_reads is called to write:
      <out_path>/<file>             the output FASTQ file
      <out_path>/<file>.subtracted  the accompanying subtracted file
    The "num_mismatches" user option (0-24) is passed along to
    subtract_mouse_reads.

    Returns a metadata dict with num_mismatches and the number of
    cores used.

    Raises AssertionError if no FASTQ or summary files are found, a
    sample has no summary file, or an output FASTQ file was not
    created.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    SUMMARY_SUFFIX = ".matches.txt"

    # This is I/O heavy, so don't use so many cores.  Also, takes
    # 4-5 Gb RAM per process.
    MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)

    fastq_node, sample_node, summary_node = antecedents
    fastq_path = fastq_node.identifier
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_path)
    assert fastq_files, "I could not find any FASTQ files."
    summary_filenames = filelib.list_files_in_path(
        summary_node.identifier, endswith=SUMMARY_SUFFIX)
    assert summary_filenames, "No .matches.txt files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert num_mismatches >= 0 and num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    sample2summary = {}  # sample -> summary_filename
    for filename in summary_filenames:
        # <sample>.matches.txt
        p, f = os.path.split(filename)
        assert f.endswith(SUMMARY_SUFFIX)
        # Strip only the trailing suffix.  str.replace would also
        # remove the text from the middle of a sample name.
        sample = f[:-len(SUMMARY_SUFFIX)]
        assert sample not in sample2summary
        sample2summary[sample] = filename

    # list of (sample, fastq_file1, fastq_file2, summary_filename,
    #          out_file1, out_file2, subtracted_file1, subtracted_file2)
    # The second file of each pair is None for single end data.
    jobs = []
    for x in fastq_files:
        sample, pair1_fastq, pair2_fastq = x
        assert sample in sample2summary, \
            "Missing summary for sample: %s" % sample
        p1, f1 = os.path.split(pair1_fastq)
        out1_fastq = os.path.join(out_path, f1)
        sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
        out2_fastq = None
        sub2_fastq = None
        if pair2_fastq:
            p2, f2 = os.path.split(pair2_fastq)
            assert p1 == p2
            out2_fastq = os.path.join(out_path, f2)
            sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
        x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
            out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
        jobs.append(x)

    jobs2 = []  # list of (function, args, keywds)
    for x in jobs:
        sample, pair1_fastq, pair2_fastq, summary_file, \
            out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
        args = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
            num_mismatches
        jobs2.append((subtract_mouse_reads, args, {}))
        if pair2_fastq:
            args = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                num_mismatches
            jobs2.append((subtract_mouse_reads, args, {}))

    nc = min(MAX_CORES, num_cores)
    results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
    assert len(results) == len(jobs2)
    metadata["num_cores"] = nc

    # Make sure the fastq files were generated.  Only require that
    # they exist.  A file can legitimately be empty if all of its
    # reads were removed.
    # NOTE(review): This assumes subtract_mouse_reads creates the
    # output file even when no reads are kept.  Confirm.
    out_fastqs = [x[4] for x in jobs] + [x[5] for x in jobs]
    for filename in out_fastqs:
        if filename:
            filelib.assert_exists(filename)

    return metadata