def merge_reads(reads):
    """ Generator over merged reads.

    A pair is only aligned when both reads have bases. Pairs with a missing
    or empty read are passed through with merged_bases set to None, without
    consulting the alignment score threshold or any alignment left over from
    the previous pair.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge fails):
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        merged = None
        if seq1 and seq2:
            # Read 2 is aligned against read 1 in read 1's orientation.
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1,
                                                 seq2_rev,
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
            # Only merge when the score is high enough and the alignment of
            # read 1 does not start with a gap.
            if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
                aligned_qual1 = align_quality(aligned1, qual1)
                # Read 2 was reversed, so its quality scores are too.
                aligned_qual2 = align_quality(aligned2, reversed(qual2))
                merged = merge_pairs(aligned1,
                                     aligned2,
                                     aligned_qual1,
                                     aligned_qual2,
                                     q_cutoff=Q_CUTOFF)
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
def main():
    """ Convert a CSV file of sequences into paired, gzipped FASTQ files.

    Rows are read from args.source and grouped by consecutive values of the
    'run' and 'enum' columns. Each group is written to a pair of files in
    args.dest: <sample name>_R1_001.fastq.gz holds the sequence from the
    'string' column with dashes removed, and <sample name>_R2_001.fastq.gz
    holds its reverse complement. Every base gets quality 'A'.
    """
    args = parse_args()
    with args.source as source:
        reader = csv.DictReader(source)
        for (run, sample), rows in groupby(reader, itemgetter('run', 'enum')):
            sample_name = format_sample_name(run, sample)
            filename1 = os.path.join(args.dest,
                                     sample_name + '_R1_001.fastq.gz')
            filename2 = os.path.join(args.dest,
                                     sample_name + '_R2_001.fastq.gz')
            print(filename1)
            # The gzip wrappers are listed last so that they are closed (and
            # their trailers flushed) before the underlying files, even when
            # an exception interrupts the writing.
            with open(filename1, 'wb') as dest1, \
                    open(filename2, 'wb') as dest2, \
                    GzipFile(fileobj=dest1) as dest1_zip, \
                    GzipFile(fileobj=dest2) as dest2_zip:
                for i, row in enumerate(rows):
                    seq = row['string'].replace('-', '')
                    seq_rev = reverse_and_complement(seq)
                    quality = 'A' * len(seq)
                    for j in range(3):
                        # Three duplicates so that G2P doesn't ignore it.
                        prefix = '@M454:01:000000000-AAAAA:1:1101:{}:{}'.format(
                            10*i + j,
                            row['count'])
                        record1 = '{} 1:N:0:1\n{}\n+\n{}\n'.format(prefix,
                                                                   seq,
                                                                   quality)
                        record2 = '{} 2:N:0:1\n{}\n+\n{}\n'.format(prefix,
                                                                   seq_rev,
                                                                   quality)
                        # GzipFile only accepts bytes, so encode the text.
                        dest1_zip.write(record1.encode('ascii'))
                        dest2_zip.write(record2.encode('ascii'))
    print('Done.')
def testSimple(self):
    """ A short sequence without mixtures is reversed and complemented. """
    self.assertEqual('CAGT', reverse_and_complement('ACTG'))
def merge_reads(reads):
    """ Generator over merged reads.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge fails):
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
    """
    for name, (name1, bases1, quals1), (name2, bases2, quals2) in reads:
        if bases1 and bases2:
            # Align read 1 against the reverse complement of read 2.
            gapped1, gapped2, score = align_it(
                bases1,
                reverse_and_complement(bases2),
                GAP_OPEN_COST,
                GAP_EXTEND_COST,
                USE_TERMINAL_COST)
        else:
            # At least one read is missing, so there is nothing to align.
            score = -1
            gapped1 = gapped2 = None

        can_merge = score >= MIN_PAIR_ALIGNMENT_SCORE and gapped1[0] != '-'
        if can_merge:
            # Quality scores of read 2 are reversed to match its bases.
            merged = merge_pairs(gapped1,
                                 gapped2,
                                 align_quality(gapped1, quals1),
                                 align_quality(gapped2, reversed(quals2)),
                                 q_cutoff=Q_CUTOFF)
        else:
            merged = None
        yield (name,
               (name1, bases1, quals1),
               (name2, bases2, quals2),
               merged)
def write_fastq(self, fields, fastq, is_reversed=False):
    """ Write a single four-line FASTQ record.

    :param fields: sequence of values; item 0 is used as the read name,
        item 9 as the bases, and item 10 as the quality string
    :param fastq: open file object to write the record to
    :param is_reversed: True if the bases should be reverse complemented and
        the quality string reversed before writing
    """
    template = '@{}\n{}\n+\n{}\n'
    read_name, bases, scores = fields[0], fields[9], fields[10]
    if is_reversed:
        bases = reverse_and_complement(bases)
        scores = ''.join(reversed(scores))
    fastq.write(template.format(read_name, bases, scores))
def write_fastq(fields, fastq, is_reversed=False):
    """ Write a single four-line FASTQ record.

    :param fields: sequence of values; item 0 is used as the read name,
        item 9 as the bases, and item 10 as the quality string
    :param fastq: open file object to write the record to
    :param is_reversed: True if the bases should be reverse complemented and
        the quality string reversed before writing
    """
    template = '@{}\n{}\n+\n{}\n'
    read_name, bases, scores = fields[0], fields[9], fields[10]
    if is_reversed:
        bases = reverse_and_complement(bases)
        scores = ''.join(reversed(scores))
    fastq.write(template.format(read_name, bases, scores))
def __init__(self, contig_seq: str, target_seq: str) -> None:
    """ Find the best alignment of a target sequence within a contig.

    Every candidate from unpack_mixtures_and_reverse(target_seq) is aligned
    against the contig, and the candidate with the highest alignment score
    is kept (the first one seen wins a tie). The results are stored as
    attributes:

    score - alignment score of the best candidate
    is_reversed - is_reversed flag of the best candidate
    start - index in the alignment where the target's first non-gap
        character sits
    contig_match - contig characters aligned under the target, gaps removed
    dist - Levenshtein distance between the best candidate and contig_match
    end_dist - same distance, after trimming the part of the candidate that
        hangs past the start or end of the contig

    :param contig_seq: sequence to search in
    :param target_seq: sequence to search for; it is expanded by
        unpack_mixtures_and_reverse() before aligning
    """
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    best_acontig = best_atarget = best_target = best_score = None
    best_reversed = None
    for target_nucs, is_reversed in unpack_mixtures_and_reverse(
            target_seq):
        aligned_contig, aligned_target, score = align_it(
            contig_seq,
            target_nucs,
            gap_open_penalty,
            gap_extend_penalty,
            use_terminal_gap_penalty)
        # Strictly greater, so the first candidate with the top score wins.
        if best_score is None or score > best_score:
            best_acontig = aligned_contig
            best_atarget = aligned_target
            best_target = target_nucs
            best_score = score
            best_reversed = is_reversed
    # From here on, work with the best candidate only.
    aligned_contig = best_acontig
    aligned_target = best_atarget
    target_nucs = best_target
    self.score = best_score
    self.is_reversed = best_reversed
    if self.is_reversed:
        # NOTE(review): the aligned strings are flipped here, but target_nucs
        # stays as the candidate chosen above, and is what the distances
        # below are measured against - confirm that is intended.
        aligned_contig = reverse_and_complement(aligned_contig)
        aligned_target = reverse_and_complement(aligned_target)
    # Group 1 spans from the first to the last non-gap character of the
    # aligned target, skipping any leading gaps.
    # NOTE(review): match is None when aligned_target has no non-gap
    # characters, and the next line would raise AttributeError - confirm
    # that can't happen for real inputs.
    match = re.match('-*([^-](.*[^-])?)', aligned_target)
    self.start = match.start(1)
    end = match.end(1)
    self.contig_match = aligned_contig[self.start:end].replace('-', '')
    self.dist = Levenshtein.distance(target_nucs, self.contig_match)
    # Leading gaps in the aligned contig mean the target hangs over the
    # start of the contig by that many positions.
    stripped_contig = aligned_contig.lstrip('-')
    overhang = len(aligned_contig) - len(stripped_contig)
    if overhang > 0:
        # Ignore the overhanging start of the target.
        stripped_target = target_nucs[overhang:]
        self.end_dist = Levenshtein.distance(stripped_target,
                                             self.contig_match)
    else:
        # No overhang at the start, so check for one at the end.
        stripped_contig = aligned_contig.rstrip('-')
        overhang = len(aligned_contig) - len(stripped_contig)
        if overhang == 0:
            # Target lies completely within the contig.
            self.end_dist = self.dist
        else:
            # Ignore the overhanging end of the target.
            stripped_target = target_nucs[:-overhang]
            self.end_dist = Levenshtein.distance(stripped_target,
                                                 self.contig_match)
def test_probe_finder_reversed():
    """ A target is still found when the contig is reverse complemented. """
    forward_contig = 'ATCGACCTGGCTAATTCCAGT'
    probe = 'ATCGACCTAGCT'

    finder = ProbeFinder(reverse_and_complement(forward_contig), probe)

    assert finder.contig_match == 'ATCGACCTGGCT'
    assert finder.is_reversed
def main():
    """ Write sample FASTQ files built from sections of project references.

    Each FastqFile lists the sections of a coordinate reference to copy, and
    the codon mutations to apply to them. Every section is written
    section.count times, with cluster numbers that keep increasing through
    the file. Files flagged as reversed hold the reverse complement of each
    section and are labelled as read 2.
    """
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'), ))
    ]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        # Section positions count codons, so scale the
                        # offset to nucleotides.
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # One quality score per base actually written, even if the
                # reference slice or a mutation changed the section length.
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                for cluster in range(section.count):
                    f.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num,
                                cluster + next_cluster,
                                file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
def unpack_mixtures_and_reverse(
        seq: str) -> typing.Set[typing.Tuple[str, bool]]:
    """ Unpack mixture nucleotide codes, and add reverse complements.

    :param seq: nucleotide sequence, possibly including mixture codes
    :return: unpacked and reversed sequences, along with is_reversed flag
    """
    # Grow every partial sequence by each nucleotide that the next code
    # can stand for. Codes missing from mixture_dict stand for themselves.
    unpacked = {''}
    for code in seq:
        choices = mixture_dict.get(code, code)
        unpacked = {prefix + nuc for nuc in choices for prefix in unpacked}

    results = set()
    for nucs in unpacked:
        results.add((nucs, False))
    for nucs in unpacked:
        results.add((reverse_and_complement(nucs), True))
    return results
def test_trim(tmpdir):
    """ Reads with nothing to trim come out of trim() unchanged. """
    forward_bases = 'TATCTACTAACTGTCGGTCTAC'
    reverse_bases = reverse_and_complement(forward_bases)
    expected1 = build_fastq(forward_bases)
    expected2 = build_fastq(reverse_bases)

    work_dir = Path(str(tmpdir))
    source_paths = [work_dir / 'read1.fastq', work_dir / 'read2.fastq']
    trimmed_paths = [work_dir / 'trimmed1.fastq',
                     work_dir / 'trimmed2.fastq']
    source_paths[0].write_text(expected1)
    source_paths[1].write_text(expected2)

    trim(source_paths,
         'no_bad_cycles.csv',
         [str(trimmed_path) for trimmed_path in trimmed_paths],
         use_gzip=False)

    trimmed1 = trimmed_paths[0].read_text()
    trimmed2 = trimmed_paths[1].read_text()
    assert trimmed1 == expected1
    assert trimmed2 == expected2
def main():
    """ Write sample FASTQ files built from sections of project references.

    Some section lists come from make_random_sections(), and the rest are
    listed explicitly. Each FastqSection names a coordinate reference, a
    start and end position, a read count, and optionally the mutations to
    apply. Every section is written section.count times, with cluster
    numbers that keep increasing through the file. Files flagged as reversed
    hold the reverse complement of each section and are labelled as read 2.

    As a side effect, an 'HXB2-with-deletion' region is added to the loaded
    project configuration.
    """
    projects = ProjectConfig.loadDefault()
    sections_2100hcv_1, sections_2100hcv_2 = make_random_sections(
        'HCV1A-H77-NS5a',
        1,
        300,
        projects,
        400)
    sections_2100v3_1, sections_2100v3_2 = ([
        FastqSection('HIV1-B-FR-K03455-seed', 7056, 7312, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7062, 7312, 50)
    ], [
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7373, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7376, 50)
    ])
    sections_2100hiv_1, sections_2100hiv_2 = make_random_sections(
        'RT',
        1,
        300,
        projects,
        400)
    sections_2160_1, sections_2160_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        1,
        230,
        projects,
        mutations=(CodonMutation(159, 'GTC'), ))
    sections_2160midi_1, sections_2160midi_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        231,
        561,
        projects,
        mutations=(CodonMutation(316, 'AGC'), ))
    sections_2170_1a_1, sections_2170_1a_2 = make_random_sections(
        'HCV-1a',
        6258,
        9375)
    sections_2170_2_1, sections_2170_2_2 = make_random_sections(
        'HCV-2a',
        6269,
        9440)
    sections_2180_1, sections_2180_2 = make_random_sections(
        'HIV1-B-FR-K03455-seed',
        6225,
        7757)
    # Register a temporary region made of two slices of the HXB2 reference
    # joined together, leaving out everything between them.
    hxb2_ref = projects.getReference('HIV1-B-FR-K03455-seed')
    projects.config['regions']['HXB2-with-deletion'] = dict(
        reference=hxb2_ref[617:928] + hxb2_ref[9358:9652],
        is_nucleotide=True,
        seed_group=None)
    sections_2210_1, sections_2210_2 = make_random_sections(
        'HXB2-with-deletion',
        projects=projects)

    # Fields are: file name, extract number, is reversed, and sections.
    fastq_files = [
        FastqFile('2010A-V3LOOP_S3_L001_R1_001.fastq',
                  '2010',
                  False,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2010A-V3LOOP_S3_L001_R2_001.fastq',
                  '2010',
                  True,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2020A-GP41_S4_L001_R1_001.fastq',
                  '2020',
                  False,
                  (FastqSection('HIV1-B-FR-KF716496-seed',
                                6957,
                                7065,
                                10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2020A-GP41_S4_L001_R2_001.fastq',
                  '2020',
                  True,
                  (FastqSection('HIV1-B-FR-KF716496-seed',
                                6957,
                                7065,
                                10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2040A-HLA-B_S6_L001_R1_001.fastq',
                  '2040',
                  False,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed',
                                201,
                                315,
                                20,
                                (CodonMutation(207, 'TCT'), )))),
        FastqFile('2040A-HLA-B_S6_L001_R2_001.fastq',
                  '2040',
                  True,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed',
                                201,
                                315,
                                20,
                                (CodonMutation(207, 'TCT'), )))),
        # An empty codon removes the three nucleotides at that position.
        FastqFile(
            '2070A-PR_S9_L001_R1_001.fastq',
            '2070',
            False,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR',
                          40,
                          80,
                          3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile(
            '2070A-PR_S9_L001_R2_001.fastq',
            '2070',
            True,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR',
                          40,
                          80,
                          3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R1_001.fastq',
                  '2100',
                  False,
                  sections_2100hcv_1 + sections_2100v3_1 + sections_2100hiv_1),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R2_001.fastq',
                  '2100',
                  True,
                  sections_2100hcv_2 + sections_2100v3_2 + sections_2100hiv_2),
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 66, 100),
                   FastqSection('HCV2-JFH-1-NS5b',
                                115,
                                181,
                                100,
                                (CodonMutation(159, 'GTC'), )))),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 51, 114, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 165, 230, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 315, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 398, 485, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b',
                                305,
                                397,
                                100,
                                (CodonMutation(316, 'AGC'), )),
                   FastqSection('HCV2-JFH-1-NS5b', 470, 561, 100))),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                  '2140',
                  False,
                  (FastqSection('PR',
                                1,
                                80,
                                100,
                                (CodonMutation(24, 'ATA'), )), )),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                  '2140',
                  True,
                  (FastqSection('PR',
                                20,
                                99,
                                100,
                                (CodonMutation(24, 'ATA'), )), )),
        # Simplify with one_contig.
        FastqFile('2160A-HCV_S19_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160_1),
        FastqFile('2160A-HCV_S19_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160_2),
        # Simplify with one_contig.
        FastqFile('2160AMIDI-MidHCV_S20_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160midi_1),
        FastqFile('2160AMIDI-MidHCV_S20_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160midi_2),
        # Simplify with two_long_contigs.
        FastqFile('2170A-HCV_S21_L001_R1_001.fastq',
                  '2170',
                  False,
                  sections_2170_1a_1 + sections_2170_2_1),
        FastqFile('2170A-HCV_S21_L001_R2_001.fastq',
                  '2170',
                  True,
                  sections_2170_1a_2 + sections_2170_2_2),
        FastqFile('2180A-HIV_S22_L001_R1_001.fastq',
                  '2180',
                  False,
                  sections_2180_1),
        FastqFile('2180A-HIV_S22_L001_R2_001.fastq',
                  '2180',
                  True,
                  sections_2180_2),
        FastqFile('2190A-SARSCOV2_S23_L001_R1_001.fastq',
                  '2190',
                  False,
                  (FastqSection('SARS-CoV-2-ORF1ab',
                                4393,
                                4429,
                                50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab',
                                4393,
                                4430,
                                50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2190A-SARSCOV2_S23_L001_R2_001.fastq',
                  '2190',
                  True,
                  (FastqSection('SARS-CoV-2-ORF1ab',
                                4393,
                                4429,
                                50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab',
                                4393,
                                4430,
                                50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2200A-SARSCOV2_S24_L001_R1_001.fastq',
                  '2200',
                  False,
                  (FastqSection('SARS-CoV-2-nsp1', 20, 66, 100), )),
        FastqFile('2200A-SARSCOV2_S24_L001_R2_001.fastq',
                  '2200',
                  True,
                  (FastqSection('SARS-CoV-2-nsp1', 56, 102, 100), )),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R1_001.fastq',
                  '2210',
                  False,
                  sections_2210_1),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R2_001.fastq',
                  '2210',
                  True,
                  sections_2210_2)
    ]
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            # Cluster numbers continue across the sections of one file.
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                # Positions that come back from find_coord_pos() unchanged
                # are treated as nucleotide positions; otherwise mutation
                # offsets are scaled by three below.
                # NOTE(review): presumably unchanged positions mean a
                # nucleotide reference - confirm against find_coord_pos().
                is_nucleotide = ((ref_start,
                                  ref_end) == (section.start_pos,
                                               section.end_pos))
                for mutation in section.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = mutation.pos - section.start_pos
                        if not is_nucleotide:
                            section_pos *= 3
                        # Three nucleotides are replaced by the mutation's
                        # codon, which may be a different length.
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # Same quality score for every base that gets written.
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                # noinspection PyTypeChecker
                for cluster in range(section.count):
                    f.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num,
                                cluster + next_cluster,
                                file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
""" Reverse a nucleotide sequence and replace with complementary nucleotides. Mixtures are allowed, as well as *, N, and -. If you want to compare the result to an expected sequence, put the expected sequence in reverse_compare. Source: https://github.com/ArtPoon/bioinfo/blob/master/seqUtils.py#L143 """ from micall.utils.translation import reverse_and_complement nuc_seq = ''.join([ "TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGC", "AACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT" ]) reverse_compare = ''.join([ "ACAATGTGCTTGTCTTATATCTCCTATTATTTCTCCTGTTGCATAAAATGCTCTCCCTGGTCCTA", "TATGTATACTTTTTCTTGTATTGTTGTTGGGTCTTGTACA" ]) reverse_seq = reverse_and_complement(nuc_seq) pairs = zip(reverse_seq, reverse_compare) diffs = [' ' if a == b else '*' for a, b in pairs] print 'result ', reverse_seq print 'diffs ', ''.join( diffs) if reverse_seq != reverse_compare else 'no diffs' print 'compare', reverse_compare
""" Reverse a nucleotide sequence and replace with complementary nucleotides. Mixtures are allowed, as well as *, N, and -. If you want to compare the result to an expected sequence, put the expected sequence in reverse_compare. Source: https://github.com/ArtPoon/bioinfo/blob/master/seqUtils.py#L143 """ from micall.utils.translation import reverse_and_complement nuc_seq = ''.join([ "TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGC", "AACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT" ]) reverse_compare = ''.join([ "ACAATGTGCTTGTCTTATATCTCCTATTATTTCTCCTGTTGCATAAAATGCTCTCCCTGGTCCTA", "TATGTATACTTTTTCTTGTATTGTTGTTGGGTCTTGTACA" ]) reverse_seq = reverse_and_complement(nuc_seq) pairs = zip(reverse_seq, reverse_compare) diffs = [' ' if a == b else '*' for a, b in pairs] print 'result ', reverse_seq print 'diffs ', ''.join(diffs) if reverse_seq != reverse_compare else 'no diffs' print 'compare', reverse_compare
def main():
    """ Write sample FASTQ files built from sections of project references.

    Each FastqFile lists the sections of a coordinate reference to copy, and
    the codon mutations to apply to them. Every section is written
    section.count times, with cluster numbers that keep increasing through
    the file. Files flagged as reversed hold the reverse complement of each
    section and are labelled as read 2.
    """
    fastq_files = [FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                             '2140',
                             False,
                             (FastqSection('PR', 1, 80, 100),),
                             (CodonMutation(24, 'ATA'),)),
                   FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                             '2140',
                             True,
                             (FastqSection('PR', 20, 99, 100),),
                             (CodonMutation(24, 'ATA'),))]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        # Section positions count codons, so scale the
                        # offset to nucleotides.
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos+3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # One quality score per base actually written, even if the
                # reference slice or a mutation changed the section length.
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                for cluster in range(section.count):
                    f.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num,
                                cluster + next_cluster,
                                file_num))
                    f.write(ref_nuc_section+'\n')
                    f.write('+\n')
                    f.write(phred_scores+'\n')
                next_cluster += section.count