def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Make a list of the jobs to run.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, sam_filename, log_filename
        jobs.append(x)

    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1, pair2, sam_filename, log_filename = x
        nc = max(1, num_cores / len(jobs))
        x = alignlib.make_bowtie2_command(
            ref.fasta_file_full, pair1, fastq_file2=pair2,
            sam_file=sam_filename, num_threads=nc)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]   # sam_filename
    filelib.assert_exists_nz_many(x)
    return metadata
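# For reference, a minimal sketch (an assumption, not alignlib's actual
# code) of the kind of shell command make_bowtie2_command is expected to
# build.  bowtie2's -x flag takes the index basename, which alignlib
# presumably derives from the fasta file; single-end reads use -U,
# paired-end use -1/-2.
def _example_bowtie2_command(index_base, pair1, pair2, sam_file, nc):
    import pipes
    cmd = ["bowtie2", "-p", str(nc), "-x", pipes.quote(index_base)]
    if pair2:
        cmd += ["-1", pipes.quote(pair1), "-2", pipes.quote(pair2)]
    else:
        cmd += ["-U", pipes.quote(pair1)]
    cmd += ["-S", pipes.quote(sam_file)]
    return " ".join(cmd)

# e.g. _example_bowtie2_command("hg19", "s_1.fq", "s_2.fq", "s.sam", 4)
# -> "bowtie2 -p 4 -x hg19 -1 s_1.fq -2 s_2.fq -S s.sam"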
def set_out_attributes(self, antecedents, out_attributes):
    from genomicode import alignlib
    from Betsy import module_utils

    group_node, fastq_node, reference_node = antecedents
    fastq_files = module_utils.find_merged_fastq_files(
        group_node.identifier, fastq_node.identifier)
    assert fastq_files, "No fastq files."
    ref = alignlib.create_reference_genome(reference_node.identifier)

    # Possibilities:
    # 1.  All single.
    # 2.  All paired.
    # 3.  Mixed.
    attrs = out_attributes.copy()

    all_pair2 = [x[-1] for x in fastq_files]
    uniq_pair2 = {}.fromkeys(all_pair2).keys()
    if uniq_pair2 == [None]:
        # All single.
        attrs["orientation"] = "single"
        return attrs
    if None in all_pair2:
        # Mixed.
        raise AssertionError, "Mixed single and paired-end."

    # All paired.
    # Optimization: check just the first group of FASTQ files and
    # assume they all have the same orientation.
    sample, pair1_filename, pair2_filename = fastq_files[0]
    x = get_paired_orientation(
        ref.fasta_file_full, pair1_filename, pair2_filename)
    orient, reads_ns, reads_fr, reads_rf, reads_ff = x
    attrs["orientation"] = "paired_%s" % orient

    key = (group_node.identifier, fastq_node.identifier,
           reference_node.identifier)
    self.cache[key] = reads_ns, reads_fr, reads_rf, reads_ff
    return attrs
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, group_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        group_node.identifier, fastq_node.identifier)
    assert fastq_files, "No fastq files."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    metadata = {}

    orientation = None
    reads_ns = reads_fr = reads_rf = reads_ff = None
    # Possibilities:
    # 1.  All single.
    # 2.  All paired.
    # 3.  Mixed.  (not handled)
    all_pair2 = [x[-1] for x in fastq_files]
    uniq_pair2 = {}.fromkeys(all_pair2).keys()
    if uniq_pair2 == [None]:
        # All single.
        orientation = "single"
    elif None in all_pair2:
        # Mixed.
        raise AssertionError, "Mixed single and paired-end."
    else:
        # All paired.
        # Optimization: check just the first group of FASTQ files and
        # assume they all have the same orientation.
        sample, pair1_filename, pair2_filename = fastq_files[0]
        x = get_paired_orientation(
            ref.fasta_file_full, pair1_filename, pair2_filename)
        orient, reads_ns, reads_fr, reads_rf, reads_ff = x
        orientation = "paired_%s" % orient
    assert orientation

    x = mlib.Orientation(
        orientation, reads_ns, reads_fr, reads_rf, reads_ff)
    mlib.write_orientation(x, outfile)
    return metadata
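# A standalone sketch of the single/paired/mixed classification used
# above, with the same {}.fromkeys(...) de-duplication idiom (Python 2;
# in Python 3, .keys() returns a view and the comparison would need
# list()).
def _classify_pairing(all_pair2):
    # all_pair2 is the list of pair2 filenames, None for single-end.
    uniq_pair2 = {}.fromkeys(all_pair2).keys()
    if uniq_pair2 == [None]:
        return "single"
    if None in all_pair2:
        raise AssertionError, "Mixed single and paired-end."
    return "paired"

assert _classify_pairing([None, None]) == "single"
assert _classify_pairing(["a_2.fq", "b_2.fq"]) == "paired"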
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import parallel
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_files, "I could not find any FASTQ files."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

    # Figure out whether to align to genome or transcriptome.
    x = out_attributes["align_to"]
    assert x in ["genome", "transcriptome"]
    align_to_genome = (x == "genome")

    # RSEM makes files:
    # <sample_name>.genome.bam
    # <sample_name>.transcript.bam
    # <sample_name>.genes.results
    # <sample_name>.isoforms.results
    # <sample_name>.stat
    #
    # Does not work right if there is a space in the sample name.
    # Therefore, give a hashed sample name, and then re-name later.

    # Make a list of the jobs to run.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        sample_h = hashlib.hash_var(sample)
        x1, x2, x3 = mlib.splitpath(pair1)
        x = "%s%s" % (hashlib.hash_var(x2), x3)
        pair1_h = os.path.join(out_path, x)
        pair2_h = None
        if pair2:
            x1, x2, x3 = mlib.splitpath(pair2)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair2_h = os.path.join(out_path, x)
        results_filename = os.path.join(
            out_path, "%s.genes.results" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample, sample_h=sample_h,
            pair1=pair1, pair2=pair2,
            pair1_h=pair1_h, pair2_h=pair2_h,
            results_filename=results_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Make sure hashed samples are unique.
    seen = {}
    for j in jobs:
        assert j.sample_h not in seen, \
            "Dup (%d): %s" % (len(jobs), j.sample_h)
        assert j.pair1_h not in seen
        seen[j.sample_h] = 1
        seen[j.pair1_h] = 1
        if j.pair2_h:
            assert j.pair2_h not in seen
            seen[j.pair2_h] = 1

    # Symlink the fastq files.
    for j in jobs:
        os.symlink(j.pair1, j.pair1_h)
        if j.pair2:
            os.symlink(j.pair2, j.pair2_h)

    s2fprob = {
        "unstranded": None,
        "firststrand": 0.0,
        "secondstrand": 1.0,
    }
    assert stranded.stranded in s2fprob, \
        "Unknown stranded: %s" % stranded.stranded
    forward_prob = s2fprob[stranded.stranded]

    # How much memory for bowtie.  May need to increase this if there
    # are lots of memory warnings in the log files:
    #   Warning: Exhausted best-first chunk memory for read
    #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
    #   (patid 2076693); skipping read
    # Default is 64.
    # Seems like too high a value can cause problems.
    #chunkmbs = 4*1024   # Generates warnings.
    chunkmbs = 512

    # Get lots of warnings with bowtie:
    #   Warning: Detected a read pair whose two mates have different names
    # Use STAR aligner instead.
    use_STAR = True

    sq = parallel.quote
    commands = []
    for j in jobs:
        # Debug: If the results file exists, don't run it again.
        if filelib.exists_nz(j.results_filename) and \
               filelib.exists(j.log_filename):
            continue
        # If using the STAR aligner, then the most memory efficient
        # way is to let STAR take care of the multiprocessing.
        nc = max(1, num_cores / len(jobs))
        if use_STAR:
            nc = num_cores
        keywds = {}
        if use_STAR:
            keywds["align_with_star"] = True
        else:
            keywds["align_with_bowtie2"] = True
        x = alignlib.make_rsem_command(
            ref.fasta_file_full, j.sample_h, j.pair1_h,
            fastq_file2=j.pair2_h, forward_prob=forward_prob,
            output_genome_bam=align_to_genome,
            bowtie_chunkmbs=chunkmbs, num_threads=nc, **keywds)
        x = "%s >& %s" % (x, sq(j.log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # Need to run in out_path.  Otherwise, files will be everywhere.
    nc = num_cores
    if use_STAR:
        nc = 1
    parallel.pshell(commands, max_procs=nc, path=out_path)

    # Rename the hashed sample names back to the original unhashed
    # ones.
    files = os.listdir(out_path)
    rename_files = []  # list of (src, dst)
    for j in jobs:
        if j.sample == j.sample_h:
            continue
        for f in files:
            if not f.startswith(j.sample_h):
                continue
            src = os.path.join(out_path, f)
            x = j.sample + f[len(j.sample_h):]
            dst = os.path.join(out_path, x)
            rename_files.append((src, dst))
    for src, dst in rename_files:
        filelib.assert_exists(src)
        os.rename(src, dst)

    # Delete the symlinked fastq files.
    for j in jobs:
        filelib.safe_unlink(j.pair1_h)
        if j.pair2_h:
            filelib.safe_unlink(j.pair2_h)

    # Make sure the analysis completed successfully.
    x1 = [x.results_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)
    return metadata
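# Sketch of the hash-then-rename round trip used above.  hash_var is
# assumed to map a string to a shell-safe identifier (e.g. spaces to
# underscores); its actual behavior is defined in genomicode.hashlib.
def _example_rename_back(sample, sample_h, output_files):
    # Returns (src, dst) pairs that map RSEM's hashed output names
    # back to the original sample name.
    renames = []
    for f in output_files:
        if f.startswith(sample_h):
            renames.append((f, sample + f[len(sample_h):]))
    return renames

assert _example_rename_back(
    "My Sample", "My_Sample", ["My_Sample.genes.results"]) == \
    [("My_Sample.genes.results", "My Sample.genes.results")]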
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, ref_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # Do a quick check to make sure the reference is correct.
    # Otherwise, the error may be hard to diagnose.
    alignlib.assert_is_STAR_reference(ref.path)

    metadata = {}
    metadata["tool"] = "STAR %s" % alignlib.get_STAR_version()

    # Figure out the strandedness.
    is_stranded = False

    # STAR --runThreadN 40 --genomeDir test05 \
    #   --readFilesIn test.fastq/test03_R1_001.fastq \
    #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
    # If unstranded, add --outSAMstrandField intronMotif.

    # Make a list of the jobs to run.
    jobs = []  # list of filelib.GenericObject objects
    for x in fastq_files:
        sample, pair1, pair2 = x
        out_prefix = "%s." % sample
        bam_filename = os.path.join(
            out_path, "%sAligned.out.bam" % out_prefix)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample, pair1=pair1, pair2=pair2,
            out_prefix=out_prefix, bam_filename=bam_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Run the alignments.
    commands = []
    for j in jobs:
        x = os.path.join(out_path, j.out_prefix)
        cmd = alignlib.make_STAR_command(
            ref.path, x, num_cores, is_stranded, j.pair1, j.pair2,
            j.log_filename)
        # For debugging.  If this file already exists, skip it.
        if not filelib.exists_nz(j.bam_filename):
            parallel.sshell(cmd, path=out_path)
        filelib.assert_exists_nz(j.bam_filename)
        commands.append(cmd)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    return metadata
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, ref_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    # Do a quick check to make sure the reference is correct.
    # Otherwise, the error may be hard to diagnose.
    alignlib.assert_is_STAR_reference(ref.path)

    metadata = {}
    metadata["tool"] = "STAR %s" % alignlib.get_STAR_version()

    x = mlib.get_user_option(
        user_options, "two_pass", allowed_values=["no", "yes"])
    two_pass = (x == "yes")

    # Figure out the strandedness.
    is_stranded = stranded.stranded != "unstranded"

    # STAR --runThreadN 40 --genomeDir test05 \
    #   --readFilesIn test.fastq/test03_R1_001.fastq \
    #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
    # If unstranded, add --outSAMstrandField intronMotif.

    # Make a list of the jobs to run.
    jobs = []  # list of filelib.GenericObject objects
    for x in fastq_files:
        sample, pair1, pair2 = x
        pass1_out_prefix = "p1.%s." % sample
        pass2_out_prefix = "%s." % sample
        pass1_bam_filename = os.path.join(
            out_path, "%sAligned.out.bam" % pass1_out_prefix)
        pass2_bam_filename = os.path.join(
            out_path, "%sAligned.out.bam" % pass2_out_prefix)
        sjdb_filename = os.path.join(
            out_path, "p1.%s.SJ.out.tab" % sample)
        log1_filename = os.path.join(out_path, "p1.%s.log" % sample)
        log2_filename = os.path.join(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample, pair1=pair1, pair2=pair2,
            pass1_out_prefix=pass1_out_prefix,
            pass2_out_prefix=pass2_out_prefix,
            pass1_bam_filename=pass1_bam_filename,
            pass2_bam_filename=pass2_bam_filename,
            sjdb_filename=sjdb_filename,
            log1_filename=log1_filename,
            log2_filename=log2_filename)
        jobs.append(x)

    # Run pass 1.
    commands = []
    for j in jobs:
        x = os.path.join(out_path, j.pass1_out_prefix)
        cmd = alignlib.make_STAR_command(
            ref.path, x, num_cores, is_stranded, j.pair1, j.pair2,
            j.log1_filename)
        # For debugging.  If this file already exists, skip it.
        if not filelib.exists_nz(j.pass1_bam_filename):
            parallel.sshell(cmd, path=out_path)
        filelib.assert_exists_nz(j.pass1_bam_filename)
        commands.append(cmd)

    if two_pass:
        # Make a new index with the splice junction information.
        sj_index = os.path.join(out_path, "genome.2pass")
        x = [x.sjdb_filename for x in jobs]
        filelib.assert_exists_nz_many(x)
        x = alignlib.make_STAR_index_command(
            ref.fasta_file_full, sj_index, sjdb_files=x,
            num_cores=num_cores)
        x = "%s >& genome.2pass.log" % x
        commands.append(x)
        # For debugging.  If this file already exists, skip it.
        if not filelib.exists_nz("genome.2pass.log"):
            parallel.sshell(x, path=out_path)
        alignlib.assert_is_STAR_reference(sj_index)

    # Run pass 2.
    for j in jobs:
        # For debugging.  If this file already exists, skip it.
        if os.path.exists(j.pass2_bam_filename):
            continue
        if two_pass:
            x = os.path.join(out_path, j.pass2_out_prefix)
            cmd = alignlib.make_STAR_command(
                sj_index, x, num_cores, is_stranded, j.pair1, j.pair2,
                j.log2_filename)
            parallel.sshell(cmd, path=out_path)
            commands.append(cmd)
        else:
            # Single pass.  Link pass1_bam_filename to
            # pass2_bam_filename.
            os.symlink(j.pass1_bam_filename, j.pass2_bam_filename)
        filelib.assert_exists_nz(j.pass2_bam_filename)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # STAR takes 28 Gb per process.  Make sure we don't use up more
    # memory than is available on the machine.
    # Defaults:
    # --limitGenomeGenerateRAM 31000000000
    # --outFilterMismatchNmax 10     Num mismatches.
    return metadata
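# For reference, the manual STAR two-pass flow the code above is
# assumed to generate (standard STAR flags; the exact command strings
# come from alignlib):
#   # Rebuild the index with the pass 1 splice junctions.
#   STAR --runMode genomeGenerate --genomeDir genome.2pass \
#     --genomeFastaFiles ref.fa --sjdbFileChrStartEnd p1.*.SJ.out.tab
#   # Re-align against the junction-aware index.
#   STAR --genomeDir genome.2pass --runThreadN <num_cores> \
#     --readFilesIn <pair1> <pair2> --outFileNamePrefix <sample>.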
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_filename):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    fastq_node, sample_node, align_node = antecedents
    fastq_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_data, "I could not find any FASTQ files."
    align_filenames = filelib.list_files_in_path(
        align_node.identifier, endswith=".matches.txt")
    assert align_filenames, "No .matches.txt files."
    align_filenames.sort()
    metadata = {}

    assert len(fastq_data) == len(align_filenames), \
        "Mismatch: num samples %d %d" % (
        len(fastq_data), len(align_filenames))

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert num_mismatches >= 0 and num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    sample2fastqdata = {}
    for x in fastq_data:
        sample, f1, f2 = x
        sample2fastqdata[sample] = x

    # list of (sample, align_filename, summary_filename,
    #   fastq_filename1, fastq_filename2)
    jobs = []
    for in_filename in align_filenames:
        p, f = os.path.split(in_filename)
        # <sample>.matches.txt
        ext = ".matches.txt"
        assert f.endswith(ext)
        sample = f[:-len(ext)]
        assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
        summary_filename = "%s.summary.txt" % sample
        x, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
        x = sample, in_filename, summary_filename, \
            fastq_filename1, fastq_filename2
        jobs.append(x)

    jobs2 = []  # list of (function, args, keywds)
    for x in jobs:
        sample, align_filename, summary_filename, \
            fastq_file1, fastq_file2 = x
        args = align_filename, fastq_file1, fastq_file2, num_mismatches
        keywds = {
            "temp_path": ".",
            "outfile": summary_filename,
        }
        x = summarize_matches_file, args, keywds
        jobs2.append(x)

    # Since this can take a lot of memory (depending on the number of
    # reads, can easily take 8 Gb), do just a few processes at a time.
    # Also, this is I/O intensive, so don't run too many at once.
    MAX_PROCS = 4
    nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
    results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.1)
    metadata["num_cores"] = nc
    assert len(results) == len(jobs2)

    # Put together the results in a table.
    handle = open(out_filename, 'w')
    header = ("sample", "match", "total", "RPM", "perc match",
              "perc mismatch")
    print >> handle, "\t".join(header)
    for x in zip(jobs, results):
        x, d = x
        sample, in_filename, summary_filename, \
            fastq_filename1, fastq_filename2 = x
        match = d["perfect_alignments"]
        total = d["total_alignments"]
        rpm = int(float(match) / total * 1E6)
        perc_match = d["perc_perfect"]
        perc_mismatch = 1 - d["perc_perfect"]
        x = sample, match, total, rpm, perc_match, perc_mismatch
        assert len(x) == len(header)
        print >> handle, "\t".join(map(str, x))
    handle.close()
    return metadata
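# Worked example of the reads-per-million (RPM) normalization above:
# 500 perfect matches out of 2,000,000 total alignments gives
# int(500 / 2e6 * 1e6) = 250 RPM.  Note that a total of 0 would raise
# ZeroDivisionError; the summary files are assumed to always report at
# least one alignment.
assert int(float(500) / 2000000 * 1E6) == 250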
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MAX_CORES = 4   # I/O intensive.

    fastq_node, sample_node, bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    sample2fastq = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier, as_dict=True)
    metadata = {}

    jobs = []  # list of (sample, bam_file, fastq_file)
    for filename in bam_filenames:
        path, sample, ext = mlib.splitpath(filename)
        assert sample in sample2fastq, "Missing fastq: %s" % sample
        fastq1, fastq2 = sample2fastq[sample]
        x = sample, filename, fastq1
        jobs.append(x)

    funcalls = []
    for x in jobs:
        sample, bam_filename, fastq_filename = x
        # Count the number of reads.
        x1 = count_reads, (fastq_filename,), {}
        # Count the number of alignments.
        x2 = count_alignments, (bam_filename,), {}
        funcalls.append(x1)
        funcalls.append(x2)
    assert len(funcalls) == len(jobs) * 2

    nc = min(num_cores, MAX_CORES)
    results = parallel.pyfun(funcalls, num_procs=nc)
    metadata["num_cores"] = nc

    # list of (sample, aligns, aligned_reads, total_reads,
    #   perc_aligned)
    results2 = []
    for i, x in enumerate(jobs):
        sample, bam_filename, fastq_filename = x
        x1 = results[i * 2]
        x2 = results[i * 2 + 1]
        total_reads = x1
        aligned_reads, alignments = x2
        perc_aligned = float(aligned_reads) / total_reads
        x = sample, alignments, aligned_reads, total_reads, \
            perc_aligned
        results2.append(x)
    results = results2

    # Sort by sample name.
    results.sort()

    # Make a table where the rows are the samples and the columns are
    # the statistics.
    table = []
    header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
              "Perc Aligned")
    table.append(header)
    for x in results:
        sample, alignments, aligned_reads, total_reads, \
            perc_aligned = x
        x1 = parselib.pretty_int(alignments)
        x2 = parselib.pretty_int(aligned_reads)
        x3 = parselib.pretty_int(total_reads)
        x4 = "%.2f%%" % (perc_aligned * 100)
        x = sample, x1, x2, x3, x4
        assert len(x) == len(header)
        table.append(x)

    # Write out the table as a text file.
    TXT_FILE = "summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(x)
    handle.close()

    txt2xls = mlib.findbin("txt2xls", quote=True)
    parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))
    return metadata
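# parallel.pyfun is assumed to return one result per function call, in
# the same order as funcalls.  Since each sample contributes two calls
# (count_reads, then count_alignments), sample i's values live at
# results[i*2] and results[i*2+1], e.g.:
#   results = [total_reads_0, (aligned_reads_0, alignments_0),
#              total_reads_1, (aligned_reads_1, alignments_1), ...]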
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils

    fastq_node, group_node, reference_node = antecedents
    fastq_path = fastq_node.identifier
    assert os.path.exists(fastq_path)
    assert os.path.isdir(fastq_path)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Find the merged fastq files.
    x = module_utils.find_merged_fastq_files(
        group_node.identifier, fastq_path)
    grouped_fastq_files = x

    # Make sure no duplicate samples.
    x1 = [x[0] for x in grouped_fastq_files]
    x2 = {}.fromkeys(x1).keys()
    assert len(x1) == len(x2), "dup sample"

    # Make a list of all the jobs to do.
    jobs = []  # list of (sample, pair1, pair2, bam_filename, log_filename)
    for x in grouped_fastq_files:
        sample, pair1, pair2 = x
        bam_filename = os.path.join(out_path, "%s.bam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, bam_filename, log_filename
        jobs.append(x)

    # Uses ~6 Gb per process.
    # Calculate the number of cores per job.
    nc = max(1, num_cores / len(jobs))
    metadata["num_cores"] = nc

    # Make the bwa commands.
    commands = []
    for x in jobs:
        sample, pair1, pair2, bam_filename, log_filename = x
        x = alignlib.make_bwa_mem_command(
            ref.fasta_file_full, log_filename, pair1,
            fastq_file2=pair2, bam_filename=bam_filename,
            num_threads=nc)
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x1 = [x[-2] for x in jobs]
    x2 = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)
    return metadata
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, reference_node, gene_node = \
        antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "TopHat %s" % alignlib.get_tophat_version()

    transcriptome_fa = mlib.get_user_option(
        user_options, "tophat_transcriptome_fa", check_file=True)
    assert gtf_file or transcriptome_fa, (
        "Either tophat_gtf_file or tophat_transcriptome_fa (preferred) "
        "must be provided.")

    # Make a list of the jobs to run.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        tophat_path = os.path.join(out_path, "%s.tophat" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, tophat_path, log_filename
        jobs.append(x)

    # Generate tophat commands for each of the files.
    s2ltype = {
        "unstranded": "fr-unstranded",
        "firststrand": "fr-firststrand",
        "secondstrand": "fr-secondstrand",
    }
    assert stranded.stranded in s2ltype, \
        "Unknown stranded: %s" % stranded.stranded
    library_type = s2ltype[stranded.stranded]

    # Takes ~3 Gb per process.
    sq = parallel.quote
    commands = []
    for x in jobs:
        sample, pair1, pair2, tophat_path, log_filename = x
        nc = max(1, num_cores / len(jobs))
        x = alignlib.make_tophat_command(
            ref.fasta_file_full, tophat_path, pair1, fastq_file2=pair2,
            gtf_file=gtf_file, transcriptome_fa=transcriptome_fa,
            library_type=library_type, num_threads=nc)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[3] for x in jobs]   # tophat_path
    x = [os.path.join(x, "accepted_hits.bam") for x in x]
    bam_filenames = x
    filelib.assert_exists_nz_many(bam_filenames)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import parallel from genomicode import filelib from genomicode import alignlib from Betsy import module_utils as mlib fastq_node, sample_node, orient_node, reference_node = antecedents fastq_files = mlib.find_merged_fastq_files(sample_node.identifier, fastq_node.identifier) ref = alignlib.create_reference_genome(reference_node.identifier) assert os.path.exists(ref.fasta_file_full) orient = mlib.read_orientation(orient_node.identifier) filelib.safe_mkdir(out_path) metadata = {} metadata["tool"] = "bowtie1 %s" % alignlib.get_bowtie1_version() # With low alignment percentages, might want to play around with: # - insert size # - maximum mismatch # Make a list of the jobs to run. jobs = [] for x in fastq_files: sample, pair1, pair2 = x sam_filename = os.path.join(out_path, "%s.sam" % sample) log_filename = os.path.join(out_path, "%s.log" % sample) x = sample, pair1, pair2, sam_filename, log_filename jobs.append(x) # Generate bowtie1 commands for each of the files. attr2orient = { "single": None, "paired_fr": "fr", "paired_rf": "rf", "paired_ff": "ff", } orientation = attr2orient[orient.orientation] #x = sample_node.data.attributes["orientation"] #orientation = attr2orient[x] sq = parallel.quote commands = [] for x in jobs: sample, pair1, pair2, sam_filename, log_filename = x nc = max(1, num_cores / len(jobs)) x = alignlib.make_bowtie1_command(ref.fasta_file_full, sam_filename, pair1, fastq_file2=pair2, orientation=orientation, num_threads=nc) x = "%s >& %s" % (x, sq(log_filename)) commands.append(x) metadata["commands"] = commands parallel.pshell(commands, max_procs=num_cores) # Make sure the analysis completed successfully. for x in jobs: sample, pair1, pair2, sam_filename, log_filename = x # Make sure sam file created. assert filelib.exists_nz(sam_filename), \ "Missing: %s" % sam_filename # Make sure there are some alignments. x = open(log_filename).read() assert x.find("No alignments") < 0, "No alignments" return metadata
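# For reference, a sketch (an assumption, not alignlib's actual code)
# of the kind of bowtie1 command generated above.  bowtie1 takes the
# index basename, -S for SAM output, and --fr/--rf/--ff for the paired
# orientation:
#   bowtie -S -p <nc> --fr <index> -1 <pair1.fq> -2 <pair2.fq> out.sam
# For single-end reads (orientation None), the -1/-2/--fr arguments
# are replaced by the single fastq file.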
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, orient_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Bowtie2 doesn't handle files with spaces in them.  Make
    # temporary files without spaces.

    # Make a list of the jobs to run.
    jobs = []
    for i, x in enumerate(fastq_files):
        sample, pair1, pair2 = x
        bam_filename = os.path.join(out_path, "%s.bam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        sample_h = hashlib.hash_var(sample)
        temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
        temp_pair2 = None
        if pair2:
            temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
        j = filelib.GenericObject(
            sample=sample, pair1=pair1, pair2=pair2,
            temp_pair1=temp_pair1, temp_pair2=temp_pair2,
            bam_filename=bam_filename, log_filename=log_filename)
        jobs.append(j)

    for j in jobs:
        os.symlink(j.pair1, j.temp_pair1)
        if j.pair2:
            os.symlink(j.pair2, j.temp_pair2)

    # Generate bowtie2 commands for each of the files.
    attr2orient = {
        "single": None,
        "paired_fr": "fr",
        "paired_rf": "rf",
        "paired_ff": "ff",
    }
    orientation = attr2orient[orient.orientation]

    # Takes ~4 Gb per job.
    samtools = mlib.findbin("samtools")
    sq = parallel.quote
    commands = []
    for j in jobs:
        nc = max(1, num_cores / len(jobs))
        # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
        #   2> test.log | samtools view -bS -o test.bam -
        x1 = alignlib.make_bowtie2_command(
            ref.fasta_file_full, j.temp_pair1,
            fastq_file2=j.temp_pair2, orientation=orientation,
            num_threads=nc)
        x2 = [
            sq(samtools),
            "view",
            "-bS",
            "-o", sq(j.bam_filename),
            "-",
        ]
        x2 = " ".join(x2)
        x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x1 = [x.bam_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)
    return metadata
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sai_node, orient_node, sample_node, reference_node = \
        antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    sai_path = sai_node.identifier
    assert filelib.dir_exists(sai_path)
    orient = mlib.read_orientation(orient_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Technically, this doesn't need the SampleGroupFile, since that's
    # already reflected in the sai data.  But it's better to require
    # it, because the sai data might not always be generated by BETSY.

    # Find the sai files.
    sai_filenames = filelib.list_files_in_path(
        sai_path, endswith=".sai", case_insensitive=True)
    assert sai_filenames, "No .sai files."

    bwa = mlib.findbin("bwa")
    # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
    # bwa sampe -f <output.sam> <reference.fa> <input_1.sai>
    #   <input_2.sai> <input_1.fq> <input_2.fq>

    # list of (sample, pair1.fq, pair1.sai, pair2.fq, pair2.sai,
    #   output.sam, log file); all full paths
    jobs = []
    for x in fastq_files:
        sample, pair1_fq, pair2_fq = x
        # The sai file should be in the format:
        #   <sai_path>/<sample>.sai     Single end read
        #   <sai_path>/<sample>_1.sai   Paired end read
        #   <sai_path>/<sample>_2.sai   Paired end read
        # Look for pair1_sai and pair2_sai.
        pair1_sai = pair2_sai = None
        for sai_filename in sai_filenames:
            p, s, e = mlib.splitpath(sai_filename)
            assert e == ".sai"
            if s == sample:
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_1" % (sample):
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_2" % (sample):
                assert not pair2_sai
                pair2_sai = sai_filename
        assert pair1_sai, "Missing .sai file: %s" % sample
        if pair2_fq:
            assert pair2_sai, "Missing .sai file 2: %s" % sample
        if pair2_sai:
            assert pair2_fq, "Missing .fq file 2: %s" % sample

        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
            sam_filename, log_filename
        jobs.append(x)

    orientation = orient.orientation
    assert orientation in ["single", "paired_fr", "paired_rf"]

    # Make a list of bwa commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
            sam_filename, log_filename = x
        if orientation == "single":
            assert not pair2_fq
            assert not pair2_sai
        samse = "samse"
        if orientation.startswith("paired"):
            samse = "sampe"
        x = [
            sq(bwa),
            samse,
            "-f", sq(sam_filename),
            sq(ref.fasta_file_full),
        ]
        if orientation == "single":
            x += [
                sq(pair1_sai),
                sq(pair1_fq),
            ]
        else:
            y = [
                sq(pair1_sai),
                sq(pair2_sai),
                sq(pair1_fq),
                sq(pair2_fq),
            ]
            if orientation == "paired_rf":
                y = [
                    sq(pair2_sai),
                    sq(pair1_sai),
                    sq(pair2_fq),
                    sq(pair1_fq),
                ]
            x += y
        x += [">&", sq(log_filename)]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]   # sam_filename
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import genomelib
    from genomicode import config
    from Betsy import module_utils as mlib

    fasta_node, bam_node, sample_node, orient_node = antecedents
    fasta_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fasta_node.identifier, find_fasta=True)
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)

    # TODO: Try to figure out version.
    metadata = {}
    metadata["tool"] = "RSeQC (unknown version)"

    pyrseqc = mlib.findbin("pyrseqc")

    gene_model = mlib.get_user_option(
        user_options, "gene_model", not_empty=True,
        allowed_values=["hg19"])
    if gene_model == "hg19":
        gene_path = config.rseqc_hg19
    else:
        raise AssertionError, "Unhandled: %s" % gene_model
    assert filelib.dir_exists(gene_path)
    gene_model_bed = os.path.join(gene_path, "RefSeq.bed12")
    housekeeping_model_bed = os.path.join(
        gene_path, "HouseKeepingGenes.bed")

    sample2fastadata = {}
    for x in fasta_data:
        sample, f1, f2 = x
        sample2fastadata[sample] = x

    is_paired = orient.orientation.startswith("paired")

    # Guess the read length.  Read the first fasta.
    assert sample2fastadata
    x = sample2fastadata.keys()[0]
    filename = sample2fastadata[x][1]
    lengths = {}  # length -> count
    for i, x in enumerate(genomelib.read_fasta_many(filename)):
        if i >= 100:
            break
        title, sequence = x
        l = len(sequence)
        lengths[l] = lengths.get(l, 0) + 1
    # Use the most common length.
    c_length = c_count = None
    for (l, c) in lengths.iteritems():
        if c_count is None or c > c_count:
            c_length, c_count = l, c
    assert c_length
    read_length = c_length

    # list of (sample, bam_filename, fasta_file1, fasta_file2, outdir)
    jobs = []
    for bam_filename in bam_filenames:
        # <path>/<sample>.bam
        p, sample, e = mlib.splitpath(bam_filename)
        assert sample in sample2fastadata
        x, f1, f2 = sample2fastadata[sample]
        outdir = os.path.join(out_path, sample)
        x = sample, bam_filename, f1, f2, outdir
        jobs.append(x)

    # Some of the modules of RSeQC use a lot of memory.  Have seen a
    # Python process take 33 Gb, and an R process take 200 Gb.
    # However, most of the modules use much less memory.  So run one
    # pyrseqc at a time, and let each one parallelize its own
    # processes.  This is probably slower than running multiple
    # pyrseqc jobs at once, but takes less memory.
    commands = []
    for x in jobs:
        sample, bam_filename, fasta_filename1, fasta_filename2, \
            outdir = x
        # pyrseqc.py -j 20 --paired_end rqc11.bam rqc14.fa 76 \
        #   mod07.txt hg19.HouseKeepingGenes.bed rqc21 --dry_run
        x = [
            mlib.sq(pyrseqc),
            "-j", str(num_cores),
        ]
        if is_paired:
            x += ["--paired_end"]
        x += [
            mlib.sq(bam_filename),
            mlib.sq(fasta_filename1),
            str(read_length),
            mlib.sq(gene_model_bed),
            mlib.sq(housekeeping_model_bed),
            mlib.sq(outdir),
        ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # pyrseqc takes up to ~40 Gb per process.
    # read_distribution.py takes 33 Gb.
    # read_quality.py spins off an R process that takes ~200 Gb.
    # Because of memory, just run one command at a time, and let each
    # one use multiple cores.
    for cmd in commands:
        x = parallel.sshell(cmd)
        assert x.find("Traceback") < 0, x

    filelib.assert_exists_nz(out_path)
    return metadata
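# Standalone sketch of the read-length guess above: tally the sequence
# lengths of the first 100 reads and take the most common one.
def _most_common_length(lengths):
    # lengths is a dict of length -> count, as built above.
    best_length = best_count = None
    for l, c in lengths.items():
        if best_count is None or c > best_count:
            best_length, best_count = l, c
    return best_length

# e.g. 98 reads of 76 bp and 2 shorter ones still guess 76.
assert _most_common_length({76: 98, 35: 2}) == 76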
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # Since this is I/O heavy, don't use too many cores.  Also, takes
    # 4-5 Gb RAM per process.
    MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)

    fastq_node, sample_node, summary_node = antecedents
    fastq_path = fastq_node.identifier
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_path)
    assert fastq_files, "I could not find any FASTQ files."
    summary_filenames = filelib.list_files_in_path(
        summary_node.identifier, endswith=".matches.txt")
    assert summary_filenames, "No .matches.txt files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert num_mismatches >= 0 and num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    sample2summary = {}  # sample -> summary_filename
    for filename in summary_filenames:
        # <sample>.matches.txt
        p, f = os.path.split(filename)
        assert f.endswith(".matches.txt")
        sample = f.replace(".matches.txt", "")
        assert sample not in sample2summary
        sample2summary[sample] = filename

    # list of (sample, fastq_file1, fastq_file2, summary_filename,
    #   out_file1, out_file2, subtracted_file1, subtracted_file2)
    jobs = []
    for x in fastq_files:
        sample, pair1_fastq, pair2_fastq = x
        assert sample in sample2summary, \
            "Missing summary for sample: %s" % sample
        p1, f1 = os.path.split(pair1_fastq)
        if pair2_fastq:
            p2, f2 = os.path.split(pair2_fastq)
            assert p1 == p2
        out1_fastq = os.path.join(out_path, f1)
        sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
        out2_fastq = None
        sub2_fastq = None
        if pair2_fastq:
            out2_fastq = os.path.join(out_path, f2)
            sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
        x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
            out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
        jobs.append(x)

    jobs2 = []  # list of (function, args, keywds)
    for x in jobs:
        sample, pair1_fastq, pair2_fastq, summary_file, \
            out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
        x = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
            num_mismatches
        x = subtract_mouse_reads, x, {}
        jobs2.append(x)
        if pair2_fastq:
            x = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                num_mismatches
            x = subtract_mouse_reads, x, {}
            jobs2.append(x)

    nc = min(MAX_CORES, num_cores)
    results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
    assert len(results) == len(jobs2)
    metadata["num_cores"] = nc

    # Make sure the fastq files were generated.
    x1 = [x[4] for x in jobs]
    x2 = [x[5] for x in jobs]
    x = x1 + x2
    x = [x for x in x if x]
    # BUG: If all reads were removed, then this will fail incorrectly.
    filelib.assert_exists_nz_many(x)
    return metadata
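# calc_max_procs_from_ram is internal to Betsy; a plausible sketch (an
# assumption, not the actual implementation) of the RAM-based cap it
# applies:
def _max_procs_from_ram(gb_per_proc, total_ram_gb, upper_max):
    # Never return fewer than 1 or more than upper_max processes.
    n = int(total_ram_gb / gb_per_proc)
    return max(1, min(n, upper_max))

# e.g. 5 Gb per process on a 32 Gb machine, capped at 4 -> 4.
assert _max_procs_from_ram(5, 32, 4) == 4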
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, group_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        group_node.identifier, fastq_node.identifier)
    assert fastq_files, "No FASTQ files found."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Make sure no duplicate samples.
    x1 = [x[0] for x in fastq_files]
    x2 = {}.fromkeys(x1).keys()
    assert len(x1) == len(x2), "dup sample"

    # Make a list of all FASTQ files to align.
    fastq_filenames = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        assert pair1
        fastq_filenames.append(pair1)
        if pair2:
            fastq_filenames.append(pair2)

    # Make a list of all the jobs to do.
    jobs = []  # list of (fastq_filename, sai_filename, log_filename)
    for in_filename in fastq_filenames:
        in_path, in_file = os.path.split(in_filename)
        x = in_file
        if x.lower().endswith(".fq"):
            x = x[:-3]
        elif x.lower().endswith(".fastq"):
            x = x[:-6]
        sai_filename = os.path.join(out_path, "%s.sai" % x)
        log_filename = os.path.join(out_path, "%s.log" % x)
        x = in_filename, sai_filename, log_filename
        jobs.append(x)

    # Calculate the number of threads per job.
    nc = max(1, num_cores / len(jobs))

    # Make the bwa commands.
    commands = []
    for x in jobs:
        fastq_filename, sai_filename, log_filename = x
        x = alignlib.make_bwa_aln_command(
            ref.fasta_file_full, fastq_filename, sai_filename,
            log_filename, num_threads=nc)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for x in jobs:
        in_filename, sai_filename, log_filename = x
        assert filelib.exists_nz(sai_filename), \
            "Missing: %s" % sai_filename
    return metadata
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    fastq_node, sample_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_files, "I could not find any FASTQ files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    adapters_filename = mlib.get_user_option(
        user_options, "adapters_fasta", not_empty=True,
        check_file=True)
    # Trimmomatic doesn't handle paths with spaces.  Symlink to a
    # space-free name if needed.
    if " " in adapters_filename:
        os.symlink(adapters_filename, "adapters.txt")
        adapters_filename = "adapters.txt"

    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        p1, f1 = os.path.split(pair1)
        trimmed1 = os.path.join(out_path, f1)
        trimmed2 = None
        if pair2:
            p2, f2 = os.path.split(pair2)
            trimmed2 = os.path.join(out_path, f2)
        # Include the sample name in the unpaired filenames so they
        # aren't overwritten when there are multiple samples.
        unpaired1 = os.path.join(
            out_path, "%s.unpaired_1.fasta" % sample)
        unpaired2 = os.path.join(
            out_path, "%s.unpaired_2.fasta" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, trimmed1, trimmed2, \
            unpaired1, unpaired2, log_filename
        jobs.append(x)

    sq = parallel.quote
    commands = []
    for x in jobs:
        sample, pair1, pair2, trimmed1, trimmed2, unpaired1, \
            unpaired2, log_filename = x
        nc = max(1, num_cores / len(jobs))
        x = _make_trimmomatic_cmd(
            pair1, pair2, trimmed1, trimmed2, unpaired1, unpaired2,
            adapters_filename, num_threads=nc)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for x in jobs:
        sample, pair1, pair2, trimmed1, trimmed2, unpaired1, \
            unpaired2, log_filename = x
        # Make sure the trimmed files were created.
        assert filelib.exists_nz(trimmed1), "Missing: %s" % trimmed1
        if trimmed2:
            assert filelib.exists_nz(trimmed2), \
                "Missing: %s" % trimmed2
        x = open(log_filename).read()
        assert not x.startswith("Usage:"), "usage problem"
    return metadata
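# _make_trimmomatic_cmd is defined elsewhere in this module.  For
# reference, the standard paired-end Trimmomatic invocation it is
# assumed to resemble (output order is paired then unpaired for each
# mate):
#   java -jar trimmomatic.jar PE -threads <nc> <pair1> <pair2> \
#     <trimmed1> <unpaired1> <trimmed2> <unpaired2> \
#     ILLUMINACLIP:<adapters.fa>:2:30:10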