def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Align each sample's merged FASTQ file(s) with bowtie2.

    Writes one <sample>.sam and one <sample>.log file into out_path.
    Returns a metadata dict with the tool version, the shell commands
    run, and the core count.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Make a list of the jobs to run.
    # Each job is: sample, pair1, pair2, sam_filename, log_filename.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = sample, pair1, pair2, sam_filename, log_filename
        jobs.append(x)

    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1, pair2, sam_filename, log_filename = x
        # Split the cores evenly across the jobs.  Use floor division
        # so num_threads stays an int under both Python 2 and Python 3
        # (plain "/" would yield a float on Python 3).
        nc = max(1, num_cores // len(jobs))
        x = alignlib.make_bowtie2_command(
            ref.fasta_file_full, pair1, fastq_file2=pair2,
            sam_file=sam_filename, num_threads=nc)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]   # sam_filename of each job
    filelib.assert_exists_nz_many(x)

    return metadata
def get_paired_orientation_bowtie2(
        reference_genome, filename1, filename2, outpath=None):
    """Determine the read orientation of a paired-end FASTQ data set.

    Strategy: run bowtie2 on a subset of the reads in each orientation
    and pick the one with the most concordantly aligned reads.  Using a
    subset keeps the run time short.

    Return a tuple of:
      ("ff", "fr", or "rf"; reads_ns; reads_fr; reads_rf; reads_ff)
    where reads_* are the concordant read counts for each orientation
    (ns = no orientation specified).

    If outpath is None, all intermediate files are written to a
    temporary directory that is deleted before returning; otherwise
    they are left in outpath.
    """
    import os
    import shutil
    import tempfile
    from genomicode import alignlib
    from genomicode import parallel

    # 100 is too low.  Gave the wrong result (fr instead of rf) on the
    # Thunderbolts miSEQ data.  Minimum number that gives the right
    # answer is 250.
    # NUM_READS  Time (s) 1 core
    #     50      2.4
    #    100      2.4
    #    250      2.5
    #    500      2.7
    #   1000      2.9
    #   2000      3.3
    #   5000      4.6
    #  10000      7.8
    # 100000     52.6
    NUM_READS = 1000

    path = outpath    # where to write the results
    tempdir = None    # temporary directory to be deleted
    try:
        if path is None:
            tempdir = tempfile.mkdtemp(dir=".")
            path = tempdir   # write into a temporary directory

        sam_ff = os.path.join(path, "orient_ff.sam")
        sam_fr = os.path.join(path, "orient_fr.sam")
        sam_rf = os.path.join(path, "orient_rf.sam")
        sam_ns = os.path.join(path, "orient_ns.sam")
        log_ff = os.path.join(path, "orient_ff.log")
        log_fr = os.path.join(path, "orient_fr.log")
        log_rf = os.path.join(path, "orient_rf.log")
        log_ns = os.path.join(path, "orient_ns.log")

        # One thread per bowtie2 process; the four orientations are run
        # in parallel by pshell instead.
        nc = 1
        x1 = alignlib.make_bowtie2_command(
            reference_genome, fastq_file1=filename1, fastq_file2=filename2,
            sam_file=sam_ff, orientation="ff", max_reads=NUM_READS,
            num_threads=nc)
        x2 = alignlib.make_bowtie2_command(
            reference_genome, fastq_file1=filename1, fastq_file2=filename2,
            sam_file=sam_fr, orientation="fr", max_reads=NUM_READS,
            num_threads=nc)
        x3 = alignlib.make_bowtie2_command(
            reference_genome, fastq_file1=filename1, fastq_file2=filename2,
            sam_file=sam_rf, orientation="rf", max_reads=NUM_READS,
            num_threads=nc)
        x4 = alignlib.make_bowtie2_command(
            reference_genome, fastq_file1=filename1, fastq_file2=filename2,
            sam_file=sam_ns, orientation=None, max_reads=NUM_READS,
            num_threads=nc)
        x1 += " >& %s" % log_ff
        x2 += " >& %s" % log_fr
        x3 += " >& %s" % log_rf
        x4 += " >& %s" % log_ns
        commands = [x1, x2, x3, x4]
        parallel.pshell(commands)

        # Read the results.
        output_ff = alignlib.parse_bowtie2_output(log_ff)
        output_fr = alignlib.parse_bowtie2_output(log_fr)
        output_rf = alignlib.parse_bowtie2_output(log_rf)
        output_ns = alignlib.parse_bowtie2_output(log_ns)
    finally:
        if tempdir is not None and os.path.exists(tempdir):
            shutil.rmtree(tempdir)

    reads_ff = output_ff["concordant_reads"]
    reads_fr = output_fr["concordant_reads"]
    reads_rf = output_rf["concordant_reads"]
    reads_ns = output_ns["concordant_reads"]
    assert isinstance(reads_ff, int)

    # Rank the explicit orientations by concordant read count; ties are
    # broken by the orientation string (reverse alphabetical).
    orient = [
        (reads_ff, "ff"),
        (reads_fr, "fr"),
        (reads_rf, "rf"),
        ]
    orient.sort(reverse=True)

    # NOTE(review): an earlier idea (left unimplemented) was to return
    # None when the best orientation is within 10% of the un-stranded
    # (ns) alignment count — i.e. the data looks unstranded.
    return orient[0][-1], reads_ns, reads_fr, reads_rf, reads_ff
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Align each sample's merged FASTQ file(s) with bowtie2, piping
    through samtools to produce BAM output.

    Writes one <sample>.bam and one <sample>.log file into out_path.
    The read orientation is taken from orient_node.  Returns a metadata
    dict with the tool version, the shell commands run, and the core
    count.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, orient_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Bowtie2 doesn't handle files with spaces in them.  Make
    # temporary symlinks (in the current directory) with hashed,
    # space-free names.
    # NOTE(review): the symlinks get a ".fa" extension although the
    # inputs are FASTQ — confirm bowtie2 does not infer the format
    # from the file name.

    # Make a list of the jobs to run.
    jobs = []
    for i, x in enumerate(fastq_files):
        sample, pair1, pair2 = x
        bam_filename = os.path.join(out_path, "%s.bam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        sample_h = hashlib.hash_var(sample)
        temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
        temp_pair2 = None
        if pair2:
            temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
        j = filelib.GenericObject(
            sample=sample, pair1=pair1, pair2=pair2,
            temp_pair1=temp_pair1, temp_pair2=temp_pair2,
            bam_filename=bam_filename, log_filename=log_filename)
        jobs.append(j)

    for j in jobs:
        os.symlink(j.pair1, j.temp_pair1)
        # BUG FIX: this previously tested the stale loop variable
        # "pair2" (left over from the loop above), so whether the
        # second symlink was created depended only on the *last*
        # sample.  Test this job's own pair2 instead.
        if j.pair2:
            os.symlink(j.pair2, j.temp_pair2)

    # Generate bowtie2 commands for each of the files.
    attr2orient = {
        "single": None,
        "paired_fr": "fr",
        "paired_rf": "rf",
        "paired_ff": "ff",
        }
    orientation = attr2orient[orient.orientation]

    # Takes ~4 Gb per job.
    samtools = mlib.findbin("samtools")
    sq = parallel.quote
    commands = []
    for j in jobs:
        # Split the cores evenly across the jobs.  Floor division
        # keeps num_threads an int under both Python 2 and Python 3.
        nc = max(1, num_cores // len(jobs))
        # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
        #   2> test.log | samtools view -bS -o test.bam -
        x1 = alignlib.make_bowtie2_command(
            ref.fasta_file_full, j.temp_pair1, fastq_file2=j.temp_pair2,
            orientation=orientation, num_threads=nc)
        x2 = [
            sq(samtools),
            "view",
            "-bS",
            "-o", sq(j.bam_filename),
            "-",
            ]
        x2 = " ".join(x2)
        x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
        commands.append(x)
    metadata["commands"] = commands
    # Record the core count, consistent with the .sam-producing module.
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x1 = [x.bam_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)

    return metadata