def test_add_rg_to_bam(self):
    # Maps the arabidopsis test reads with BWA-MEM while passing a read
    # group, then checks through "samtools view -h" that the @RG header line
    # and an expected read sequence are present in the resulting BAM.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        lib_name = 'aa'
        log_fhand = NamedTemporaryFile()
        readgroup = {
            'ID': lib_name,
            'PL': 'illumina',
            'LB': lib_name,
            'SM': '{0}_illumina_pe'.format(lib_name),
            'PU': '0',
        }
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup,
                              log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)
        # -h makes samtools print the header, where the @RG line lives.
        out = subprocess.check_output(
            [get_binary_path('samtools'), 'view', '-h', bam_fhand.name],
            stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        # Always release the temporary directory, even when the mapping or
        # one of the assertions above fails.
        directory.close()
def test_rev_compl_fragmented_reads(self):
    # Smoke test: maps unpaired and then paired fasta reads against GENOME
    # with bowtie2 and opens the resulting BAM files with pysam. No
    # assertion is made on the alignments themselves.
    genome_text = GENOME

    # --- unpaired reads ---
    direct_record = (
        '>seq1\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG'
        'AGGACACCCAGTCTCCCGGGAGTCTTTTCCAAGGTGTGCTCCTGATCGCCGTGTTA\n'
    )
    revcomp_record = (
        '>seq2\nTAACACGGCGATCAGGAGCACACCTTGGAAAAGACTCCCGGGAGACTGGGTG'
        'TCCTCAGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    )
    in_fhand = NamedTemporaryFile()
    in_fhand.write(direct_record + revcomp_record)
    in_fhand.flush()

    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(genome_text)
    ref_fhand.flush()
    index_fpath = get_or_create_bowtie2_index(ref_fhand.name)

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bowtie2(index_fpath, extra_params=['-a', '-f'],
                              unpaired_fpaths=[in_fhand.name])
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)

    # --- paired reads ---
    # seq10 pair: the f read is reversed, the r read is direct
    seq10_f = (
        '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        '\n'
    )
    seq10_r = (
        '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
    )
    # seq11 pair: the f read is direct, the r read is reversed
    seq11_f = (
        '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        '\n'
    )
    seq11_r = (
        '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'
    )

    f_fhand = NamedTemporaryFile()
    f_fhand.write(seq10_f + seq11_f)
    f_fhand.flush()
    r_fhand = NamedTemporaryFile()
    r_fhand.write(seq10_r + seq11_r)
    r_fhand.flush()
    paired_fpaths = [[f_fhand.name], [r_fhand.name]]

    # A fresh reference file and index are built for the paired run.
    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(genome_text)
    ref_fhand.flush()
    index_fpath = get_or_create_bowtie2_index(ref_fhand.name)

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bowtie2(index_fpath, extra_params=['-a', '-f'],
                              paired_fpaths=paired_fpaths)
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)
def test_rev_compl_fragmented_reads(self):
    # Smoke test: maps unpaired and then paired fasta reads against the
    # ref_example.fasta test reference with bowtie2 and opens the resulting
    # BAM files with pysam. No assertion is made on the alignments.
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    # --- unpaired reads ---
    direct_record = (
        '>seq1\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG'
        'AGGACACCCAGTCTCCCGGGAGTCTTTTCCAAGGTGTGCTCCTGATCGCCGTGTTA\n'
    )
    revcomp_record = (
        '>seq2\nTAACACGGCGATCAGGAGCACACCTTGGAAAAGACTCCCGGGAGACTGGGTG'
        'TCCTCAGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    )
    in_fhand = NamedTemporaryFile()
    in_fhand.write(direct_record + revcomp_record)
    in_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bowtie2(index_fpath, extra_params=['-a', '-f'],
                              unpaired_fpath=in_fhand.name)
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)

    # --- paired reads ---
    # seq10 pair: the f read is reversed, the r read is direct
    seq10_f = (
        '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        '\n'
    )
    seq10_r = (
        '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
    )
    # seq11 pair: the f read is direct, the r read is reversed
    seq11_f = (
        '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        '\n'
    )
    seq11_r = (
        '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'
    )

    f_fhand = NamedTemporaryFile()
    f_fhand.write(seq10_f + seq11_f)
    f_fhand.flush()
    r_fhand = NamedTemporaryFile()
    r_fhand.write(seq10_r + seq11_r)
    r_fhand.flush()
    paired_fpaths = (f_fhand.name, r_fhand.name)

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bowtie2(index_fpath, extra_params=['-a', '-f'],
                              paired_fpaths=paired_fpaths)
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)
def test_map_with_bwa(self):
    # Maps the arabidopsis test reads with BWA-MEM and checks, through
    # "samtools view", that an expected read sequence shows up in the BAM.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bwa, bam_fhand.name)
        out = subprocess.check_output(
            [get_binary_path('samtools'), 'view', bam_fhand.name])
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        # Release the temporary directory even if mapping or the assertion
        # fails, so a failing run does not leave it behind.
        directory.close()
def test_map_with_bwa(self):
    # Runs BWA-MEM on the unpaired arabidopsis test reads, hands the mapping
    # process to map_process_to_bam and verifies with "samtools view" that
    # an expected read sequence is present in the output.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bwa, bam_fhand.name)
        samtools_cmd = [get_binary_path('samtools'), 'view', bam_fhand.name]
        out = subprocess.check_output(samtools_cmd)
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        # The temporary directory must be closed on every exit path; without
        # the finally a failed assertion would skip the cleanup.
        directory.close()
def test_add_rg_to_bam(self):
    # Runs BWA-MEM with a read group on the arabidopsis test reads and
    # verifies, from the "samtools view -h" output, that both the @RG header
    # line and an expected read sequence are in the BAM.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        lib_name = 'aa'
        log_fhand = NamedTemporaryFile()
        readgroup = {'ID': lib_name, 'PL': 'illumina', 'LB': lib_name,
                     'SM': '{0}_illumina_pe'.format(lib_name), 'PU': '0'}
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup,
                              log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)
        # The header (-h) is needed because that is where @RG is written.
        samtools_cmd = [get_binary_path('samtools'), 'view', '-h',
                        bam_fhand.name]
        out = subprocess.check_output(samtools_cmd, stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        # Close the temporary directory on every exit path; a failing
        # assertion would otherwise skip the cleanup.
        directory.close()
def _setup_checks(self, filterpacket):
    """Map the passed reads of a packet with bowtie2 and store the result.

    The sequences found in filterpacket[SEQS_PASSED] are flattened, written
    to a temporary reads file and mapped against self._index_fpath. The
    value returned by _get_mapped_reads for the resulting BAM (using
    self.min_mapq) is kept in self.mapped_reads.

    Raises:
        RuntimeError: for SeqItems whose format is neither fasta nor fastq.
        NotImplementedError: for any other kind of sequence.
    """
    index_fpath = self._index_fpath
    get_or_create_bowtie2_index(index_fpath)

    # Flatten the groups of passed sequences into a single list.
    seqs = []
    for seq_group in filterpacket[SEQS_PASSED]:
        seqs.extend(seq_group)

    first_seq = seqs[0]
    seq_class = first_seq.kind
    extra_params = []

    # The reads file format (fasta or fastq) decides which extra flags
    # bowtie2 needs.
    if seq_class == SEQRECORD:
        annotations = first_seq.object.letter_annotations
        if 'phred_quality' in annotations.viewkeys():
            file_format = 'fastq'
        else:
            file_format = 'fasta'
            extra_params.append('-f')
    elif seq_class == SEQITEM:
        file_format = get_file_format(first_seq)
        if 'illumina' in file_format:
            extra_params.append('--phred64')
        elif 'fasta' in file_format:
            extra_params.append('-f')
        elif 'fastq' not in file_format:
            raise RuntimeError(
                'For FilterBowtie2Match and SeqItems fastq or fasta '
                'files are required'
            )
    else:
        raise NotImplementedError()

    reads_fhand = NamedTemporaryFile(suffix=file_format)
    write_seqs(seqs, reads_fhand, file_format=file_format)
    reads_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    map_process = map_with_bowtie2(index_fpath,
                                   unpaired_fpaths=[reads_fhand.name],
                                   extra_params=extra_params)
    map_process_to_bam(map_process, bam_fhand.name)

    self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
def test_rev_compl_fragmented_reads(self):
    # Smoke test: maps three fasta read pairs against GENOME with BWA-MEM
    # and opens the resulting BAM with pysam. No assertion is made on the
    # alignments themselves.
    genome_text = GENOME

    # seq10 pair: the f read is reversed, the r read is direct
    seq10_f = (
        '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        '\n'
    )
    seq10_r = (
        '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
    )
    # seq11 pair: the f read is direct, the r read is reversed
    seq11_f = (
        '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        '\n'
    )
    seq11_r = (
        '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'
    )
    # seq4 pair: the f read is fragmented over two reference sequences,
    # the r read maps completely
    seq4_f = (
        '>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
        'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n'
    )
    seq4_r = (
        '>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
        'TGAGTAATATTATAGAAAGT\n'
    )

    f_fhand = NamedTemporaryFile()
    f_fhand.write(seq10_f + seq11_f + seq4_f)
    f_fhand.flush()
    r_fhand = NamedTemporaryFile()
    r_fhand.write(seq10_r + seq11_r + seq4_r)
    r_fhand.flush()
    paired_fpaths = [f_fhand.name, r_fhand.name]

    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(genome_text)
    ref_fhand.flush()
    index_fpath = get_or_create_bwa_index(ref_fhand.name)

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths)
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)
def _setup_checks(self, filterpacket):
    """Map the passed reads of a packet with bowtie2 and store the result.

    The sequences found in filterpacket[SEQS_PASSED] are flattened, written
    to a temporary reads file and mapped against self._index_fpath using
    self.threads. The value returned by _get_mapped_reads for the resulting
    BAM (using self.min_mapq) is kept in self.mapped_reads.

    Raises:
        RuntimeError: for SeqItems whose format is neither fasta nor fastq.
        NotImplementedError: for any other kind of sequence.
    """
    index_fpath = self._index_fpath

    # Flatten the groups of passed sequences into a single list.
    seqs = []
    for seq_group in filterpacket[SEQS_PASSED]:
        seqs.extend(seq_group)

    first_seq = seqs[0]
    seq_class = first_seq.kind
    extra_params = []

    # The reads file format (fasta or fastq) decides which extra flags
    # bowtie2 needs.
    if seq_class == SEQRECORD:
        annotations = first_seq.object.letter_annotations
        if 'phred_quality' in annotations.viewkeys():
            file_format = 'fastq'
        else:
            file_format = 'fasta'
            extra_params.append('-f')
    elif seq_class == SEQITEM:
        file_format = get_file_format(first_seq)
        if 'illumina' in file_format:
            extra_params.append('--phred64')
        elif 'fasta' in file_format:
            extra_params.append('-f')
        elif 'fastq' not in file_format:
            raise RuntimeError(
                'For FilterBowtie2Match and SeqItems fastq or fasta '
                'files are required'
            )
    else:
        raise NotImplementedError()

    reads_fhand = NamedTemporaryFile(suffix=file_format)
    write_seqs(seqs, reads_fhand, file_format=file_format)
    reads_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    map_process = map_with_bowtie2(index_fpath,
                                   unpaired_fpath=reads_fhand.name,
                                   extra_params=extra_params,
                                   threads=self.threads)
    map_process_to_bam(map_process, bam_fhand.name)

    self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
def test_map_with_bowtie2(self):
    # Smoke test: runs bowtie2 first on unpaired and then on paired reads
    # and hands each mapping process to map_process_to_bam. No assertion is
    # made on the alignments.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bowtie2_index(reference_fpath,
                                                  directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bowtie2 = map_with_bowtie2(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bowtie2, bam_fhand.name)
    finally:
        # Close the temporary directory even when the mapping fails.
        directory.close()

    # With paired_fpaths option
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    # Both mates must be real files. NamedTemporaryFile().name yields the
    # path of a file that is deleted as soon as the unreferenced handle is
    # closed, so it cannot serve as the reverse reads file.
    # NOTE(review): assumes arabreads_1.fastq and arabreads_2.fastq are
    # available in TEST_DATA_DIR - confirm.
    forward_fpath = os.path.join(TEST_DATA_DIR, 'arabreads_1.fastq')
    reverse_fpath = os.path.join(TEST_DATA_DIR, 'arabreads_2.fastq')
    paired_fpaths = (forward_fpath, reverse_fpath)
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bowtie2_index(reference_fpath,
                                                  directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bowtie2 = map_with_bowtie2(index_fpath, paired_fpaths=paired_fpaths)
        map_process_to_bam(bowtie2, bam_fhand.name)
    finally:
        directory.close()
def test_map_with_bowtie2(self):
    # Smoke test: runs bowtie2 first on unpaired and then on paired test
    # reads and hands each mapping process to map_process_to_bam. No
    # assertion is made on the alignments.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bowtie2_index(reference_fpath,
                                                  directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bowtie2 = map_with_bowtie2(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bowtie2, bam_fhand.name)
    finally:
        # Close the temporary directory even when the mapping fails.
        directory.close()

    # With paired_fpaths option
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    forward_fpath = os.path.join(TEST_DATA_DIR, 'arabreads_1.fastq')
    reverse_fpath = os.path.join(TEST_DATA_DIR, 'arabreads_2.fastq')
    paired_fpaths = (forward_fpath, reverse_fpath)
    directory = TemporaryDir()
    try:
        index_fpath = get_or_create_bowtie2_index(reference_fpath,
                                                  directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bowtie2 = map_with_bowtie2(index_fpath, paired_fpaths=paired_fpaths)
        map_process_to_bam(bowtie2, bam_fhand.name)
    finally:
        directory.close()
def test_rev_compl_fragmented_reads(self):
    # Smoke test: maps three fasta read pairs against the ref_example.fasta
    # test reference with BWA-MEM and opens the resulting BAM with pysam.
    # No assertion is made on the alignments themselves.
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    # seq10 pair: the f read is reversed, the r read is direct
    seq10_f = (
        '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        '\n'
    )
    seq10_r = (
        '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
    )
    # seq11 pair: the f read is direct, the r read is reversed
    seq11_f = (
        '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        '\n'
    )
    seq11_r = (
        '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'
    )
    # seq4 pair: the f read is fragmented over two reference sequences,
    # the r read maps completely
    seq4_f = (
        '>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
        'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n'
    )
    seq4_r = (
        '>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
        'TGAGTAATATTATAGAAAGT\n'
    )

    f_fhand = NamedTemporaryFile()
    f_fhand.write(seq10_f + seq11_f + seq4_f)
    f_fhand.flush()
    r_fhand = NamedTemporaryFile()
    r_fhand.write(seq10_r + seq11_r + seq4_r)
    r_fhand.flush()
    paired_fpaths = (f_fhand.name, r_fhand.name)

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths)
    map_process_to_bam(mapper, bam_fhand.name)
    samfile = pysam.Samfile(bam_fhand.name)