def testTwoCodons(self):
    # Two complete codons should give one amino acid each.
    translated = translate('TTTCCT')

    self.assertEqual('FP', translated)
def testSingleDashAmbiguous(self):
    # A dash in the first base is expected to leave the amino acid
    # unresolved, reported as '?'.
    translated = translate('-TT')

    self.assertEqual('?', translated)
def testAmbiguousBasesThatAreSynonyms(self):
    # TTY is TTC or TTT: both map to F, so a single amino acid is expected.
    translated = translate('TTY')

    self.assertEqual('F', translated)
def load_reading_frames(self, seed_name):
    """ Calculate reading frames along a consensus sequence.

    The consensus is translated in each of the three reading frames and
    aligned against every coordinate reference of the seed. For each
    coordinate reference, the best alignment's frame is recorded for the
    three nucleotides of every consensus codon that is paired with a
    coordinate amino acid.

    :param seed_name: the name of the seed to look up
    :return: {pos: frame} zero-based position and reading frame for each
        position. Frame 1 needs one nucleotide inserted at start.
    """
    result = Counter()
    conseq = self.remap_conseqs[seed_name]
    coord_refs = self.projects.getCoordinateReferences(seed_name)
    for coord_ref in coord_refs.values():
        # Start from a placeholder with a very low score, then keep the
        # largest (score, aligned conseq, aligned coord, frame) tuple.
        best_alignment = (-1000000, '', '', 0)
        for frame_index in range(3):
            conseq_aminos = translate('-' * frame_index + conseq)
            aconseq, acoord, score = self._pair_align(
                conseq_aminos,
                coord_ref,
                GAP_OPEN_COORD,
                GAP_EXTEND_COORD)
            best_alignment = max(best_alignment,
                                 (score, aconseq, acoord, frame_index))
        _, aconseq, acoord, frame_index = best_alignment
        if frame_index == 0:
            continue  # defaults to 0, no need to record

        conseq_codon_index = -1
        for conseq_amino, coord_amino in zip(aconseq, acoord):
            if conseq_amino != '-':
                conseq_codon_index += 1
            if coord_amino == '-':
                # Not paired with the coordinate reference: nothing to record.
                continue
            # Undo the dashes that were prepended before translating.
            nuc_pos = conseq_codon_index * 3 - frame_index
            for i in range(3):
                result[nuc_pos + i] = frame_index
    return result
def testTwoAmbiguousBasesThatAreNotSynonyms(self):
    # RGR is GGA, GGG, AGA, or AGG: they map to G and R, respectively,
    # so the result is expected to be unresolved.
    translated = translate('RGR')

    self.assertEqual('?', translated)
def testTwoAmbiguousBasesThatAreSynonyms(self):
    # MGR is CGA, CGG, AGA, or AGG: all map to R.
    translated = translate('MGR')

    self.assertEqual('R', translated)
def testPartialCodon(self):
    # Five bases: only the complete first codon is expected in the output.
    translated = translate('TTTCC')

    self.assertEqual('F', translated)
def testTwoDashes(self):
    # Two dashes in one codon are expected to give an unresolved '?'.
    translated = translate('--T')

    self.assertEqual('?', translated)
def testAmbiguousAminosListed(self):
    # TTM is TTA or TTC: they map to L or F, and list_ambiguous asks for
    # the alternatives to be listed in brackets.
    translated = translate('TTM', list_ambiguous=True)

    self.assertEqual('[FL]', translated)
def testReturnList(self):
    # CGA maps to R; TTM is TTA or TTC, which map to L or F.
    # With return_list, one list of amino acids is expected per codon.
    translated = translate('CGATTM', return_list=True)

    self.assertEqual([['R'], ['F', 'L']], translated)
def testMixturesNotTranslated(self):
    # TTY is TTC or TTT: both map to F, but with translate_mixtures
    # switched off the mixture is expected to stay unresolved.
    translated = translate('TTY', translate_mixtures=False)

    self.assertEqual('?', translated)
def testListAmbiguousOverridesMixturesNotTranslated(self):
    # When both options are given, list_ambiguous is expected to win, so
    # the synonymous mixture TTY still resolves to F.
    translated = translate('TTY',
                           translate_mixtures=False,
                           list_ambiguous=True)

    self.assertEqual('F', translated)
def extract_target(seed_ref, coordinate_ref):
    """ Extract a portion of the seed that aligns with the coordinate reference.

    The seed is translated in all three reading frames, each translation is
    aligned to the coordinate reference, and the largest
    (score, aligned seed, aligned coordinate, frame) tuple is used.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    """
    # The first entry is a placeholder with a very low score.
    candidates = [(-1000000, '', '', 0)]
    for shift in range(3):
        shifted_aminos = translate('-' * shift + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            shifted_aminos,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        candidates.append((frame_score, aligned_seed, aligned_coord, shift))
    score, aseed, acoord, frame_index = max(candidates)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    # Start one codon before the first one, shifted back by the dashes that
    # were prepended for the chosen frame.
    codon_start = -frame_index - 3
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino == '-':
            continue
        codon_start += 3
        if coord_amino != '-':
            codons.append(seed_ref[codon_start:codon_start + 3])
    return ''.join(codons)
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate positions onto the best matching HCV-2 seed.

    Every HCV project seed whose name starts with 'HCV-2' is translated at
    nucleotide offsets 0, 1 and 2 and aligned to the coordinate reference;
    the first alignment with the highest positive score is used.

    :param projects: project configuration providing getReference() and
        getProjectSeeds()
    :param coord_name: name of the coordinate reference
    :param start_pos: coordinate position that marks the start
    :param end_pos: coordinate position that marks the end
    :return: (ref_name, ref_start, ref_end); either position is None when it
        was never matched in the alignment
    """
    coord_seq = projects.getReference(coord_name)
    gap_open, gap_extend, use_terminal_gap_penalty = 40, 10, 1

    highest_score = 0
    best_match = None
    hcv2_seeds = [seed
                  for seed in sorted(projects.getProjectSeeds('HCV'))
                  if seed.startswith('HCV-2')]
    for seed in hcv2_seeds:
        seed_nucs = projects.getReference(seed)
        for offset in range(3):
            seed_aminos = translate(seed_nucs, offset)
            coord_alignment, seed_alignment, score = align_it_aa(
                coord_seq,
                seed_aminos,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (seed, offset, coord_alignment, seed_alignment)

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = 0
    ref_pos = 0
    ref_start = None
    ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if coord_amino != '-':
            coord_pos += 1
        if ref_amino != '-':
            ref_pos += 1
        # Convert the amino acid count to nucleotides and remove the offset.
        if start_pos == coord_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    return ref_name, ref_start, ref_end
def check_hiv_wild_types(project_config):
    """ Compare the configured HIV wild types with Consensus B.

    Slices the PR, RT and INT regions out of the fetched CONSENSUS_B
    sequence, translates them, and compares them with the project
    configuration, using the wild types file as reference overrides.

    :param project_config: project configuration passed to compare_config()
    :return: the error count reported by compare_config()
    """
    print("""\
HIV wild types for resistance reports are extracted from Consensus B.
""")
    sequences = fetch_alignment_sequences(2004,
                                          'CON',  # Consensus/Ancestral
                                          'POL')
    consensus_b = sequences['CONSENSUS_B'].upper()
    with open(WILD_TYPES_PATH) as wild_types_file:
        wild_types = safe_load(wild_types_file)

    # Slice boundaries within the consensus sequence.
    boundaries = {'PR': (171, 468),
                  'RT': (468, 1788),
                  'INT': (2148, 3014)}
    ref_names = sorted(boundaries)
    source_wild_types = {
        ref_name: translate(consensus_b[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_wild_types,
                                         reference_overrides=wild_types)
    print(report)
    return error_count
def testThreeDashes(self):
    # A codon made only of dashes is expected to translate to a dash.
    translated = translate('---')

    self.assertEqual('-', translated)
def load_reading_frames(self, seed_name):
    """ Calculate reading frames along a consensus sequence.

    The consensus is translated in each of the three reading frames and
    aligned against every coordinate reference of the seed. For each
    coordinate reference, the best alignment's frame is recorded for the
    three nucleotides of every consensus codon that is paired with a
    coordinate amino acid.

    :param seed_name: the name of the seed to look up
    :return: {pos: frame} zero-based position and reading frame for each
        position. Frame 1 needs one nucleotide inserted at start.
    """
    result = Counter()
    conseq = self.remap_conseqs[seed_name]
    coord_refs = self.projects.getCoordinateReferences(seed_name)
    for coord_ref in coord_refs.values():
        # Start from a placeholder with a very low score, then keep the
        # largest (score, aligned conseq, aligned coord, frame) tuple.
        best_alignment = (-1000000, '', '', 0)
        for frame_index in range(3):
            conseq_aminos = translate('-' * frame_index + conseq)
            aconseq, acoord, score = self._pair_align(
                conseq_aminos,
                coord_ref,
                GAP_OPEN_COORD,
                GAP_EXTEND_COORD)
            best_alignment = max(best_alignment,
                                 (score, aconseq, acoord, frame_index))
        _, aconseq, acoord, frame_index = best_alignment
        if frame_index == 0:
            continue  # defaults to 0, no need to record

        conseq_codon_index = -1
        for conseq_amino, coord_amino in zip(aconseq, acoord):
            if conseq_amino != '-':
                conseq_codon_index += 1
            if coord_amino == '-':
                # Not paired with the coordinate reference: nothing to record.
                continue
            # Undo the dashes that were prepended before translating.
            nuc_pos = conseq_codon_index * 3 - frame_index
            for i in range(3):
                result[nuc_pos + i] = frame_index
    return result
def testSingleDashUnambiguous(self):
    # CG- could be CGA, CGC, CGG, or CGT: all map to R.
    translated = translate('CG-')

    self.assertEqual('R', translated)
def testLowerCase(self):
    # Lower case bases are expected to translate like upper case ones.
    translated = translate('TttCCT')

    self.assertEqual('FP', translated)
def extract_target(seed_ref, coordinate_ref):
    """ Extract a portion of the seed that aligns with the coordinate reference.

    The seed is translated in all three reading frames, each translation is
    aligned to the coordinate reference, and the largest
    (score, aligned seed, aligned coordinate, frame) tuple is used.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    """
    # The first entry is a placeholder with a very low score.
    candidates = [(-1000000, '', '', 0)]
    for shift in range(3):
        shifted_aminos = translate('-' * shift + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            shifted_aminos,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        candidates.append((frame_score, aligned_seed, aligned_coord, shift))
    score, aseed, acoord, frame_index = max(candidates)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    # Start one codon before the first one, shifted back by the dashes that
    # were prepended for the chosen frame.
    codon_start = -frame_index - 3
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino == '-':
            continue
        codon_start += 3
        if coord_amino != '-':
            codons.append(seed_ref[codon_start:codon_start + 3])
    return ''.join(codons)
def testSingleCodon(self):
    # One complete codon should give one amino acid.
    translated = translate('TTT')

    self.assertEqual('F', translated)
def testOffset(self):
    # The offset is passed positionally as the second argument; an offset
    # of 3 is expected to add a leading dash to the translation.
    translated = translate("TTTCCT", 3)

    self.assertEqual("-FP", translated)
def check_hcv_coordinates(project_config, unchecked_ref_names: set):
    """ Compare HCV coordinate references with translated source sequences.

    Fetches one nucleotide sequence per genotype by accession number, slices
    out each gene using the hard-coded boundary table, translates it, and
    compares the result with the project configuration.

    :param project_config: project configuration; its HCV project lists the
        coordinate regions to check
    :param unchecked_ref_names: set of reference names still to be checked;
        the HCV coordinate regions are removed from it in place
    :return: the error count reported by compare_config()
    """
    print("""\
Most HCV coordinate references were listed in the FDA guidance:
https://www.fda.gov/downloads/Drugs/GuidanceComplianceRegulatoryInformation/Guidances/UCM340712.pdf
This script contains a complete list of the reference accession numbers.
""")
    accession_numbers = {'HCV1A': 'NC_004102',
                         'HCV1B': 'AJ238799',
                         'HCV2': 'AB047639',
                         'HCV3': 'GU814263',
                         'HCV4': 'GU814265',
                         'HCV5': 'AF064490',
                         'HCV6': 'Y12083',
                         'HCV7': 'EF108306'}
    source_nuc_sequences = {}
    for genotype, accession_number in accession_numbers.items():
        source_nuc_sequences[genotype] = fetch_hcv_by_accession(
            accession_number)

    gene_names = ['Core', 'E1', 'E2', 'p7', 'NS2',
                  'NS3', 'NS4a', 'NS4b', 'NS5a', 'NS5b']

    # Boundary positions are from the European HCV database records.
    # https://euhcvdb.ibcp.fr/euHCVdb/do/displayHCVEntry?primaryAC=AF009606
    # That is the original H77 accession number for HCV1A. NC_004102 is the
    # curated and annotated version that was derived from the AF009606 entry.
    # All the other genotypes can be found by their regular accession numbers.
    # Each list holds the start of every gene, followed by the end of NS5b.
    genotype_boundaries = {
        #          Core  E1   E2    p7    NS2   NS3   NS4a  NS4b  NS5a  NS5b
        'HCV1A': [342, 915, 1491, 2580, 2769, 3420, 5313, 5475, 6258, 7602, 9375],
        'HCV1B': [342, 915, 1491, 2580, 2769, 3420, 5313, 5475, 6258, 7599, 9372],
        'HCV2': [341, 914, 1490, 2591, 2780, 3431, 5324, 5486, 6269, 7667, 9440],
        'HCV3': [340, 913, 1489, 2596, 2785, 3436, 5329, 5491, 6274, 7630, 9403],
        'HCV4': [341, 914, 1490, 2579, 2768, 3419, 5312, 5474, 6257, 7592, 9365],
        'HCV5': [247, 820, 1396, 2488, 2677, 3328, 5221, 5383, 6166, 7516, 9289],
        'HCV6': [284, 857, 1433, 2534, 2723, 3374, 5267, 5429, 6212, 7565, 9338],
        'HCV7': [309, 882, 1458, 2559, 2748, 3399, 5292, 5454, 6237, 7575, 9348]}

    hcv_project = project_config.config['projects']['HCV']
    ref_names = set()
    for project_region in hcv_project['regions']:
        ref_names.add(project_region['coordinate_region'])
    unchecked_ref_names.difference_update(ref_names)

    source_sequences = {}
    for ref_name in sorted(ref_names):
        # Names look like <genotype>-...-<gene>.
        name_parts = ref_name.split('-')
        genotype = name_parts[0]
        gene_name = name_parts[-1]
        gene_index = gene_names.index(gene_name)
        boundaries = genotype_boundaries[genotype]
        start, stop = boundaries[gene_index:gene_index + 2]
        # Shift both boundaries down by one before slicing.
        gene_nucs = source_nuc_sequences[genotype][start - 1:stop - 1]
        source_sequences[ref_name] = translate(gene_nucs)

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
def testReturnListWithoutMixtures(self):
    """ Don't know why you would use this combination, but stay sane. """
    # CGA maps to R; TTM is TTA or TTC, which map to L or F. With mixtures
    # not translated, the second codon is expected to be a single '?'.
    translated = translate('CGATTM',
                           return_list=True,
                           translate_mixtures=False)

    self.assertEqual([['R'], ['?']], translated)
def testStatisticsUnambiguous(self):
    # translate() is expected to fill in the stats dictionary it is given.
    stats = {}

    translated = translate('TTATTCTTTTTA', stats=stats, list_ambiguous=True)

    self.assertEqual('LFFL', translated)
    self.assertEqual({'length': 4, 'ambiguous': 0, 'max_aminos': 1}, stats)
def testStatisticsBlank(self):
    # An empty sequence should still report all the statistics, as zeroes.
    stats = {}

    translated = translate('', stats=stats, list_ambiguous=True)

    self.assertEqual('', translated)
    self.assertEqual({'length': 0, 'ambiguous': 0, 'max_aminos': 0}, stats)
def testStatisticsAmbiguous(self):
    # Two of the four codons are mixtures, and the larger mixture lists
    # four amino acids.
    stats = {}

    translated = translate('TTMTTCNTTTTA', stats=stats, list_ambiguous=True)

    self.assertEqual('[FL]F[FILV]L', translated)
    self.assertEqual({'length': 4, 'ambiguous': 2, 'max_aminos': 4}, stats)
def count_aminos(self, codon_seq, count):
    """ Record a set of reads at this position in the seed reference.

    The codon is translated in upper case and tallied when the result is in
    AMINO_ALPHABET; its three nucleotides are always tallied.

    @param codon_seq: a string of three nucleotides that were read at this
        position
    @param count: the number of times they were read
    """
    amino = translate(codon_seq.upper())
    if amino in AMINO_ALPHABET:
        self.counts[amino] += count

    for codon_pos in (0, 1, 2):
        seed_nucleotide = self.nucleotides[codon_pos]
        seed_nucleotide.count_nucleotides(codon_seq[codon_pos], count)
def check_hcv_coordinates(project_config, unchecked_ref_names: set):
    """ Compare HCV coordinate references with translated source sequences.

    Fetches one nucleotide sequence per genotype by accession number, looks
    up each gene's start and end in the landmarks, translates that slice,
    and compares the result with the project configuration.

    :param project_config: project configuration; its HCV project lists the
        coordinate regions to check
    :param unchecked_ref_names: set of reference names still to be checked;
        the HCV coordinate regions are removed from it in place
    :return: the error count reported by compare_config()
    """
    print("""\
Most HCV coordinate references were listed in the FDA guidance:
https://www.fda.gov/downloads/Drugs/GuidanceComplianceRegulatoryInformation/Guidances/UCM340712.pdf
This script contains a complete list of the reference accession numbers.
""")
    accession_numbers = {
        'HCV1A': 'NC_004102',
        'HCV1B': 'AJ238799',
        'HCV2': 'AB047639',
        'HCV3': 'GU814263',
        'HCV4': 'GU814265',
        'HCV5': 'AF064490',
        'HCV6': 'Y12083',
        # EF108306.2 is available, but only extends 5' and 3'.
        'HCV7': 'EF108306.1'
    }
    source_nuc_sequences = {}
    for genotype, accession_number in accession_numbers.items():
        source_nuc_sequences[genotype] = fetch_by_accession(accession_number)

    # Boundary positions in landmarks are from the European HCV database records.
    # https://euhcvdb.ibcp.fr/euHCVdb/do/displayHCVEntry?primaryAC=AF009606
    # That is the original H77 accession number for HCV1A. NC_004102 is the
    # curated and annotated version that was derived from the AF009606 entry.
    # All the other genotypes can be found by their regular accession numbers.

    hcv_project = project_config.config['projects']['HCV']
    ref_names = set()
    for project_region in hcv_project['regions']:
        ref_names.add(project_region['coordinate_region'])
    unchecked_ref_names.difference_update(ref_names)

    landmark_reader = LandmarkReader.load()
    source_sequences = {}
    for ref_name in sorted(ref_names):
        genotype = ref_name.split('-')[0]
        # Build the seed name from the genotype: drop its first three
        # characters and use lower case. A five-character name such as
        # 'HCV-2' gets a default 'a' suffix.
        seed_name = 'HCV-' + genotype[3:].lower()
        if len(seed_name) == 5:
            seed_name = seed_name + 'a'
        region = landmark_reader.get_gene(seed_name, ref_name)
        start = region['start']
        stop = region['end']
        # Shift both boundaries down by one before slicing.
        gene_nucs = source_nuc_sequences[genotype][start - 1:stop - 1]
        source_sequences[ref_name] = translate(gene_nucs)

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    """ Map a range of coordinate positions onto the best matching seed.

    Nucleotide coordinate regions are returned unchanged. For amino acid
    regions, every seed region that uses this coordinate region is translated
    at nucleotide offsets 0, 1 and 2 and aligned to the coordinate reference;
    the first alignment with the highest positive score is used to convert
    the positions.

    :param projects: project configuration
    :param coord_name: name of the coordinate reference
    :param start_pos: first coordinate position, defaults to 1
    :param end_pos: for amino acid regions, the last coordinate position to
        include, defaulting to the last position of the reference; for
        nucleotide regions it is returned as given, defaulting to
        len(reference) + 1
    :return: (name, start, end) - for nucleotide regions, the coordinate
        name and the positions as given or defaulted; otherwise, the seed
        name and the nucleotide range calculated from the alignment
    :raises AssertionError: if start_pos or end_pos was never matched in the
        alignment
    """
    coord_seq = projects.getReference(coord_name)
    is_end_defaulted = end_pos is None
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos
    if is_end_defaulted:
        # The loop below matches end_pos against a count of coordinate amino
        # acids, which never exceeds len(coord_seq). Defaulting to
        # len(coord_seq) + 1 could never match, and always failed the
        # assertion at the end.
        end_pos = len(coord_seq)

    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)

    # NOTE(review): best_match is still None here when no seed region gave a
    # positive score, and the unpacking raises TypeError.
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            # Convert the amino acid count to nucleotides and remove the
            # offset; the start also steps back to the codon's first base.
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
def count_aminos(self, codon_seq, count):
    """ Record a set of reads at this position in the seed reference.

    Exactly one tally is updated for the codon: low quality, deletion,
    partial deletion, or the translated amino acid. Codons that contain a
    space or a lower case 'n' update none of them. Every nucleotide that is
    not a space is then tallied separately.

    @param codon_seq: a string of three nucleotides that were read at this
        position, may be padded with spaces at the start or end of a
        sequence, or dashes for deletions
    @param count: the number of times they were read
    """
    if 'N' in codon_seq:
        self.low_quality += count
    elif codon_seq == '---':
        self.deletions += count
    elif '-' in codon_seq:
        self.partial += count
    elif not (' ' in codon_seq or 'n' in codon_seq):
        amino = translate(codon_seq.upper())
        self.counts[amino] += count

    for codon_pos, nuc in enumerate(codon_seq):
        if nuc == ' ':
            # Padding, not a real read at this nucleotide.
            continue
        self.nucleotides[codon_pos].count_nucleotides(nuc, count)
def check_hla_coordinates(project_config, unchecked_ref_names: set):
    """ Compare HLA coordinate references with translations of the seed.

    Looks up each exon in the landmarks for HLA-B-seed, translates that
    slice of the seed reference, and compares the result with the project
    configuration.

    :param project_config: project configuration that holds the seed
        reference
    :param unchecked_ref_names: set of reference names still to be checked;
        the HLA coordinate references are removed from it in place
    :return: the error count reported by compare_config()
    """
    print("""\
HLA coordinate references are translated from the seed reference.
""")
    seed_name = 'HLA-B-seed'
    ref_names = ('HLA-B-exon2', 'HLA-B-exon3')
    seed_sequence = project_config.getReference(seed_name)
    unchecked_ref_names.difference_update(ref_names)

    landmark_reader = LandmarkReader.load()
    source_sequences = {}
    for ref_name in ref_names:
        # Drop the first six characters ('HLA-B-') to get the landmark name.
        exon_name = ref_name[6:]
        region = landmark_reader.get_gene(seed_name, exon_name)
        exon_nucs = seed_sequence[region['start']:region['end']]
        source_sequences[ref_name] = translate(exon_nucs)

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate positions onto the best matching seed.

    Every seed region that uses this coordinate region is translated at
    nucleotide offsets 0, 1 and 2 and aligned to the coordinate reference;
    the first alignment with the highest positive score is used to convert
    the positions.

    :param projects: project configuration providing getReference() and the
        config dictionary
    :param coord_name: name of the coordinate reference
    :param start_pos: coordinate position that marks the start
    :param end_pos: coordinate position that marks the end
    :return: (ref_name, ref_start, ref_end) for the best matching seed
    :raises AssertionError: if start_pos or end_pos was never matched in the
        alignment
    """
    coord_seq = projects.getReference(coord_name)
    gap_open, gap_extend, use_terminal_gap_penalty = 40, 10, 1

    # Collect every seed region that is mapped to this coordinate region.
    ref_names = {seed_region_name
                 for project in projects.config['projects'].values()
                 for region in project['regions']
                 if coord_name == region['coordinate_region']
                 for seed_region_name in region['seed_region_names']}

    highest_score = 0
    best_match = None
    for seed in sorted(ref_names):
        seed_nucs = projects.getReference(seed)
        for offset in range(3):
            seed_aminos = translate(seed_nucs, offset)
            coord_alignment, seed_alignment, score = align_it_aa(
                coord_seq,
                seed_aminos,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (seed, offset, coord_alignment, seed_alignment)

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = 0
    ref_pos = 0
    ref_start = None
    ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            continue
        coord_pos += 1
        # Convert the amino acid count to nucleotides and remove the offset.
        if start_pos == coord_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
def check_hla_coordinates(project_config, unchecked_ref_names: set):
    """ Compare HLA coordinate references with translations of the seed.

    Slices each exon out of the HLA-B-seed reference using hard-coded
    boundaries, translates it, and compares the result with the project
    configuration.

    :param project_config: project configuration that holds the seed
        reference
    :param unchecked_ref_names: set of reference names still to be checked;
        the HLA coordinate references are removed from it in place
    :return: the error count reported by compare_config()
    """
    print("""\
HLA coordinate references are translated from the seed reference.
""")
    # Slice boundaries within the seed reference.
    boundaries = {'HLA-B-exon2': (200, 470),
                  'HLA-B-exon3': (716, 992)}
    seed_sequence = project_config.getReference('HLA-B-seed')
    ref_names = sorted(boundaries)
    unchecked_ref_names.difference_update(ref_names)

    source_sequences = {
        ref_name: translate(seed_sequence[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
def check_sars_coordinates(project_config, unchecked_ref_names: set):
    """ Compare SARS-CoV-2 coordinate references with translations of the seed.

    Looks up each region in the landmarks for SARS-CoV-2-seed, slices it out
    of the seed reference without its last three bases, duplicates one base
    in any region that contains position 13468, translates the result, and
    compares it with the project configuration.

    :param project_config: project configuration that holds the seed
        reference
    :param unchecked_ref_names: set of reference names still to be checked;
        the SARS-CoV-2 coordinate references are removed from it in place
    :return: the error count reported by compare_config()
    """
    print("""\
SARS-CoV-2 coordinate references are translated from the seed reference.
""")
    seed_name = 'SARS-CoV-2-seed'
    ref_names = ('SARS-CoV-2-ORF1ab',
                 'SARS-CoV-2-S',
                 'SARS-CoV-2-ORF3a',
                 'SARS-CoV-2-E',
                 'SARS-CoV-2-M',
                 'SARS-CoV-2-ORF6',
                 'SARS-CoV-2-ORF7a',
                 'SARS-CoV-2-ORF7b',
                 'SARS-CoV-2-ORF8',
                 'SARS-CoV-2-N',
                 'SARS-CoV-2-ORF10',
                 'SARS-CoV-2-nsp1',
                 'SARS-CoV-2-nsp2',
                 'SARS-CoV-2-nsp3',
                 'SARS-CoV-2-nsp4',
                 'SARS-CoV-2-nsp5',
                 'SARS-CoV-2-nsp6',
                 'SARS-CoV-2-nsp7',
                 'SARS-CoV-2-nsp8',
                 'SARS-CoV-2-nsp9',
                 'SARS-CoV-2-nsp10',
                 'SARS-CoV-2-nsp12',
                 'SARS-CoV-2-nsp13',
                 'SARS-CoV-2-nsp14',
                 'SARS-CoV-2-nsp15',
                 'SARS-CoV-2-nsp16')

    # Funky translation at this base: it gets duplicated.
    duplicated_base = 13468

    seed_sequence = project_config.getReference(seed_name)
    unchecked_ref_names.difference_update(ref_names)

    landmark_reader = LandmarkReader.load()
    source_sequences = {}
    for ref_name in ref_names:
        region = landmark_reader.get_gene(seed_name, ref_name)
        start, end = region['start'], region['end']
        region_nucs = seed_sequence[start - 1:end - 3]  # Trim stop codons.
        if start <= duplicated_base <= end:
            # Both pieces include the base at this index, so it appears
            # twice in the joined sequence.
            split_index = duplicated_base - start
            head = region_nucs[:split_index + 1]
            tail = region_nucs[split_index:]
            region_nucs = head + tail
        source_sequences[ref_name] = translate(region_nucs)
        print(ref_name, len(source_sequences[ref_name]))

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
def check_hiv_wild_types(project_config):
    """ Compare the configured HIV wild types with Consensus B.

    Slices the PR, RT and INT regions out of the fetched CONSENSUS_B
    sequence, translates them, and compares them with the project
    configuration, using the wild types file as reference overrides.

    :param project_config: project configuration passed to compare_config()
    :return: the error count reported by compare_config()
    """
    print("""\
HIV wild types for resistance reports are extracted from Consensus B.
""")
    sequences = fetch_alignment_sequences(2004,
                                          'CON',  # Consensus/Ancestral
                                          'POL')
    consensus_b = sequences['CONSENSUS_B'].upper()
    with open(WILD_TYPES_PATH) as wild_types_file:
        wild_types = safe_load(wild_types_file)

    # Slice boundaries within the consensus sequence.
    boundaries = {'PR': (171, 468),
                  'RT': (468, 1788),
                  'INT': (2148, 3014)}
    ref_names = sorted(boundaries)
    source_wild_types = {
        ref_name: translate(consensus_b[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_wild_types,
                                         reference_overrides=wild_types)
    print(report)
    return error_count
def write(self, inserts, region, report_aminos=None):
    """ Write any insert ranges to the file.

    Sequence data comes from the reads that were added to the current group.

    @param inserts: indexes of positions in the reads that should be
        reported as insertions.
    @param region: the name of the coordinate region the current group was
        mapped to
    @param report_aminos: a list of ReportAmino objects that represent the
        sequence that successfully mapped to the coordinate reference.
    """
    if len(inserts) == 0:
        return

    report_aminos = report_aminos or []
    region_insert_pos_counts = self.insert_pos_counts[(self.seed, region)]

    # Merge the sorted insert positions into contiguous [left, right] ranges:
    # a position equal to the previous range's end extends that range by one
    # codon, anything else starts a new range.
    insert_ranges = []
    for insert in sorted(inserts):
        if insert_ranges and insert == insert_ranges[-1][1]:
            insert_ranges[-1][1] += 3
        else:
            insert_ranges.append([insert, insert + 3])

    # Tally the translated insertion found in each read, for each range.
    insert_counts = OrderedDict()  # {left: {insert_seq: count}}
    insert_targets = {}  # {left: inserted_before_pos}
    for left, right in insert_ranges:
        # The first reported amino that starts where the range ends is the
        # position the insertion comes before.
        for report_amino in report_aminos:
            if report_amino.seed_amino.consensus_nuc_index == right:
                insert_targets[left] = report_amino.position
                break

        range_counts = Counter()
        insert_counts[left] = range_counts
        for nuc_seq, count in self.nuc_seqs.items():
            insert_nuc_seq = nuc_seq[left:right]
            if not insert_nuc_seq:
                continue
            if 'n' in insert_nuc_seq or '-' in insert_nuc_seq:
                continue
            insert_amino_seq = translate(insert_nuc_seq)
            if insert_amino_seq:
                range_counts[insert_amino_seq] += count

    # record insertions to CSV
    for left, counts in insert_counts.items():
        insert_before = insert_targets.get(left)
        # Only care about insertions in the middle of the sequence,
        # so ignore any that come before or after the reference.
        # Also report if we're in test mode (no report_aminos).
        is_reported = not report_aminos or insert_before not in (1, None)
        if not is_reported:
            continue
        for insert_seq, count in counts.most_common():
            self.insert_writer.writerow(dict(seed=self.seed,
                                             region=region,
                                             qcut=self.qcut,
                                             left=left + 1,
                                             insert=insert_seq,
                                             count=count,
                                             before=insert_before))
            if insert_before is not None:
                region_insert_pos_counts[insert_before - 1] += count
def align_aminos(self, seq, gapIns=3, removeinserts=False, qachecks=False):
    """ Align amino acids to a standard reference using gotoh.cpp

    :param seq: either a list of amino acid lists, one inner list per
        position, or a codon sequence that gets translated into that form.
        A list is modified in place.
    :param gapIns: passed through to gotoh.align_it_aa_rb()
    :param removeinserts: Whether to remove insertions relative to standard
    :param qachecks: whether to reject sequences that are None, have a
        length that is not a multiple of three or is under 96, start or end
        with four dashes, or contain a stop codon.
        NOTE(review): the length and dash checks are written for a codon
        string; a list that gets past the length check would fail on
        startswith() - confirm that lists are never passed with qachecks.
    :return: (aa_lists, indels) on success, (-1, None) when a QA check
        fails, or (-2, None) when the aligned standard differs from the
        standard and the insertions were not removed
    """
    std = self.std_v3

    if qachecks:
        if seq is None:
            return -1, None
        if (len(seq) % 3 != 0) or len(seq) < 96:
            return -1, None
        if seq.startswith('----') or seq.endswith('----'):
            return -1, None

    if isinstance(seq, list):
        aa_lists = seq  # aa_seq in pssm_lib.rb
    else:
        # assume this is a codon sequence
        aa_lists = translate(seq=seq,
                             offset=0,
                             resolve=False,
                             return_list=True,
                             ambig_char='X')

    for i, aa_list in enumerate(aa_lists):
        for j, aa in enumerate(aa_list):
            if aa == 'X':
                aa_list[j] = '-'
        if len(aa_list) > 1 and '*' in aa_list:
            # Drop stop codons from mixtures, but keep lone stop codons.
            aa_lists[i] = [aa for aa in aa_list if aa != '*']

    # Drop gap-only positions in place, so a caller's list is still updated.
    aa_lists[:] = [aa_list for aa_list in aa_lists if aa_list != ['-']]

    # resolve into string, using the first amino acid at each position
    aa_seq = ''.join(aa_list[0] for aa_list in aa_lists)  # aa_seq_s in pssm_lib.rb

    if qachecks:
        if any('*' in aa_list for aa_list in aa_lists):
            return -1, None

    std = std.replace('-', 'X')  # fix gaps in reference
    aligned_std, aligned_seq = gotoh.align_it_aa_rb(std, aa_seq, gapIns, 1)  # method_recall
    aligned_std = aligned_std.replace('X', '-')
    std = std.replace('X', '-')  # restore original state

    # apply alignment to lists
    for i, aligned_amino in enumerate(aligned_seq):
        if aligned_amino == '-':
            aa_lists.insert(i, ['-'])  # insert before index, like Ruby

    indels = False
    if removeinserts and '-' in aligned_std:
        new_aa_lists = []
        # range() has no reverse() method in Python 3, so use reversed().
        # NOTE(review): positions are visited last to first and appended, so
        # the returned list is in reverse order relative to the alignment.
        # That order is kept from the original code - confirm it is intended.
        for i in reversed(range(len(aligned_std))):
            if aligned_std[i] == '-':
                # skip positions that are insertions relative to standard
                indels = True
                continue
            new_aa_lists.append(aa_lists[i])
        aa_lists = new_aa_lists
    else:
        if aligned_std != std:
            # reject sequences with insertions relative to standard
            return -2, None

    return aa_lists, indels