def main(args, outs):
    """Run the configured aligner on one chunk of reads.

    The chunk (``args.chunk``) supplies ``read1``, ``read2``,
    ``reads_interleaved`` and ``read_group``.  When the reads are interleaved
    ``read1`` is handed to the aligner as-is; otherwise ``read1`` and
    ``read2`` are passed together as a list and both must be present.

    The alignment is written to ``outs.default``.
    """
    chunk = args.chunk

    if chunk['reads_interleaved']:
        reads = chunk['read1']
    else:
        # Separate read files: both mates are mandatory.
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])

    a = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner

    fasta_path = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    a.output_alignment(
        aligner=aligner_name,
        aligner_params={
            'ref_fasta': fasta_path,
            'algorithm': args.aligner_method,
        },
        num_threads=args.__threads,
        read_group_header=rg_header)
def main(args, outs):
    """Run the configured aligner on one chunk of reads.

    The chunk (``args.chunk``) supplies ``read1``, ``read2``,
    ``reads_interleaved`` and ``read_group``.  When the reads are interleaved
    ``read1`` is handed to the aligner as-is; otherwise ``read1`` and
    ``read2`` are passed together as a list and both must be present.

    The alignment is written to ``outs.output``; ``outs.default`` is reset to
    an empty list.
    """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.default = []

    chunk = args.chunk

    if chunk['reads_interleaved']:
        reads = chunk['read1']
    else:
        # Separate read files: both mates are mandatory.
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])

    a = tenkit.align.Aligner(reads, outs.output)
    aligner_name = args.aligner

    fasta_path = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    a.output_alignment(
        aligner=aligner_name,
        aligner_params={
            'ref_fasta': fasta_path,
            'algorithm': args.aligner_method,
        },
        num_threads=args.__threads,
        read_group_header=rg_header)
def create_unaligned_bam(args, outs):
    """Write the reads of one FASTQ chunk to a BAM file as unaligned records.

    The BAM header is built from ``chrNameLength.txt`` in the STAR reference
    directory (one ``@SQ`` line per row) plus one ``@RG`` line per entry of
    ``args.read_groups``.  Every read from ``args.read_chunk`` and, when
    given, ``args.read2_chunk`` is written with flag 4 (unmapped) and an
    ``RG`` tag equal to ``args.read_group``.  Read 1 and read 2 records
    alternate in the output.

    Side effects: writes ``outs.genome_output`` and leaves a header file
    named ``tmphdr`` in the current working directory.

    All pysam / FASTQ handles are closed even if reading or writing fails.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()
    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as f:
        for line in f:
            chr_name, chr_len = line.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines; make_rg_header yields a literal backslash-t sequence,
    # which has to become a real tab inside the header.
    for packed_rg in args.read_groups:
        header_buf.write(
            re.sub('\\\\t', '\t', tk_bam.make_rg_header(packed_rg)) + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object
    with open('tmphdr', 'w') as f:
        f.write(header_buf.getvalue())

    samfile = outbam = fastq_file1 = fastq_file2 = None
    try:
        samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)
        outbam = pysam.AlignmentFile(outs.genome_output, 'wb',
                                     template=samfile)

        fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
        if args.read2_chunk:
            fastq_file2 = cr_io.open_maybe_gzip(args.read2_chunk)

        read1s = tk_fasta.read_generator_fastq(fastq_file1)
        read2s = (tk_fasta.read_generator_fastq(fastq_file2)
                  if fastq_file2 else [])

        # A single record object is reused for every read.
        record = pysam.AlignedSegment()
        record.flag = 4

        def write_unaligned(read):
            # Fill the shared record from a (name, seq, qual) tuple and
            # append it to the output BAM.
            name, seq, qual = read
            record.query_name, record.query_sequence = name.split(' ')[0], seq
            record.query_qualities = tk_fasta.get_qvs(qual)
            record.set_tag('RG', read_group, 'Z')
            outbam.write(record)

        for read1, read2 in itertools.izip_longest(read1s, read2s):
            write_unaligned(read1)
            # read2 is None when no second FASTQ was supplied.
            if read2:
                write_unaligned(read2)
    finally:
        for handle in (samfile, fastq_file1, fastq_file2, outbam):
            if handle is not None:
                handle.close()
def bwa_align_unpaired(ref_fasta,
                       read_fastq,
                       out_name,
                       algorithm='ALN',
                       max_hits=None,
                       read_group_header=None,
                       num_threads=24):
    """Run bwa on a single FASTQ file without using pairing information.

    Args:
        ref_fasta: reference passed to bwa.
        read_fastq: path of the reads file; must not be a list.
        out_name: path of the BAM file to produce.  Temporary files
            ``out_name + '.sam'`` (and ``out_name + '.sai'`` for ALN) are
            created next to it and removed on success.
        algorithm: ``'MEM'`` (``bwa mem``) or ``'ALN'`` (``bwa aln`` followed
            by ``bwa samse``).
        max_hits: ALN only; when truthy it is forwarded as ``bwa samse -n``.
        read_group_header: MEM only; forwarded as ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: forwarded as ``-t`` to ``bwa mem`` / ``bwa aln``.

    Raises:
        NotSupportedException: if ``algorithm`` is neither 'MEM' nor 'ALN'.
    """
    assert not isinstance(read_fastq, list)

    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        # Temp file names
        sam_name = out_name + '.sam'

        # 'with' guarantees the handle is released even if bwa fails.
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call([
                'bwa', 'mem', '-t',
                str(num_threads), '-M', '-R', read_group_header, ref_fasta,
                read_fastq
            ], stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', sam_name])

    elif algorithm == 'ALN':
        # Temp file names
        sam_name = out_name + '.sam'
        index_name = out_name + '.sai'

        with open(index_name, 'w') as index_file:
            log_subprocess.check_call(
                ['bwa', 'aln', '-t', str(num_threads), ref_fasta, read_fastq],
                stdout=index_file)

        samse_cmd = ['bwa', 'samse']
        if max_hits:
            samse_cmd += ['-n', str(max_hits)]
        samse_cmd += [ref_fasta, index_name, read_fastq]

        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(samse_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', index_name])
        subprocess.check_call(['rm', sam_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
def bwa_align_paired(ref_fasta,
                     read_fastq,
                     out_name,
                     algorithm='ALN',
                     max_hits=None,
                     read_group_header=None,
                     num_threads=24):
    """Run the bwa paired-end aligner on reads using pairing information.

    Algorithm choices are currently
        MEM: ``bwa mem`` (maximal exact matches; better for longer reads)
        ALN: ``bwa aln`` + ``bwa sampe`` (backtrack; better for shorter reads)
    BWA-SW is not implemented.

    Args:
        ref_fasta: reference passed to bwa.
        read_fastq: path of an interleaved FASTQ (mates alternate).  For MEM
            only, a list of exactly two FASTQ paths is also accepted.
        out_name: path of the BAM file to produce.  MEM additionally writes
            samtools' stderr to ``out_name + '_ERRORS'``; ALN creates
            temporary ``1.fastq`` / ``2.fastq`` / ``1.sai`` / ``2.sai`` /
            ``.sam`` files prefixed with ``out_name`` and removes them on
            success.
        algorithm: ``'MEM'`` or ``'ALN'``.
        max_hits: ALN only; when truthy it is forwarded as ``bwa sampe -n``.
        read_group_header: MEM only; forwarded as ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: forwarded as ``-t`` to ``bwa mem`` / ``bwa aln``.

    Raises:
        subprocess.CalledProcessError: if ``bwa mem`` exits with a non-zero
            status (in addition to whatever the ``check_call`` helpers raise).
        NotSupportedException: if ``algorithm`` is neither 'MEM' nor 'ALN'.
    """
    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        bwa_cmd = ['bwa', 'mem']
        if isinstance(read_fastq, list):
            assert len(read_fastq) == 2
            fastq_args = [read_fastq[0], read_fastq[1]]
        else:
            # -p: the single input file holds interleaved pairs
            bwa_cmd.append('-p')
            fastq_args = [read_fastq]
        # -M: mark shorter split hits as secondary
        bwa_cmd += [
            '-t', str(num_threads), '-M', '-R', read_group_header, ref_fasta
        ]
        bwa_cmd += fastq_args

        # samtools' stderr goes to a sidecar file as a workaround; the
        # original author intended to drop the redirect "once bug fixed".
        with open(out_name, 'w') as out_file, \
                open(out_name + '_ERRORS', 'w') as errors_file:
            ps = log_subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE)
            try:
                log_subprocess.check_call(['samtools', 'view', '-bSh', '-'],
                                          stdin=ps.stdout,
                                          stdout=out_file,
                                          stderr=errors_file)
            finally:
                # Drop our copy of the pipe and reap bwa so it neither
                # lingers as a zombie nor has its exit status ignored.
                ps.stdout.close()
                bwa_status = ps.wait()

        # A failed bwa run would otherwise leave a silently truncated BAM.
        if bwa_status != 0:
            raise subprocess.CalledProcessError(bwa_status, bwa_cmd)

    elif algorithm == 'ALN':
        # Temp file names
        temp_fastq_name1 = out_name + '1.fastq'
        temp_fastq_name2 = out_name + '2.fastq'
        index_name_1 = out_name + '1.sai'
        index_name_2 = out_name + '2.sai'
        sam_name = out_name + '.sam'

        # Create the temp non-interleaved files
        with open(read_fastq, 'r') as in_fastq, \
                open(temp_fastq_name1, 'w') as temp_fastq1, \
                open(temp_fastq_name2, 'w') as temp_fastq2:
            tk_fasta.uninterleave_fastq(in_fastq, temp_fastq1, temp_fastq2)

        # Create the bwa index (.sai) files, one per mate
        for fastq_name, index_name in ((temp_fastq_name1, index_name_1),
                                       (temp_fastq_name2, index_name_2)):
            with open(index_name, 'w') as index_file:
                log_subprocess.check_call([
                    'bwa', 'aln', '-t',
                    str(num_threads), ref_fasta, fastq_name
                ], stdout=index_file)

        # Create the paired SAM file
        sampe_cmd = ['bwa', 'sampe']
        if max_hits:
            sampe_cmd += ['-n', str(max_hits)]
        sampe_cmd += [
            ref_fasta, index_name_1, index_name_2, temp_fastq_name1,
            temp_fastq_name2
        ]
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(sampe_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Clean up temporary files
        for temp_name in (temp_fastq_name1, temp_fastq_name2, index_name_1,
                          index_name_2, sam_name):
            subprocess.check_call(['rm', temp_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)