def gapped_imgt_realignment(self):
    '''
    Aligns to gapped IMGT germline sequence. Used to determine
    IMGT-formatted position numberings so that identifying
    antibody regions is simplified.

    Sets the following attributes on ``self``:

        imgt_germline: result of ``get_imgt_germlines`` for this segment's
            species, gene type and full gene name.
        imgt_gapped_alignment: local alignment of the gap-stripped germline
            alignment against the germline's ``gapped_nt_sequence``.
        alignment_reading_frame: offset derived from where the alignment
            begins in the gapped germline and the germline's coding start.
        coding_region, aa_sequence: results of ``_get_coding_region`` and
            ``_get_aa_sequence``.

    Any error raised by ``_imgt_numbering`` is recorded through
    ``self.exception`` (with the formatted traceback) rather than propagated.
    '''
    self.imgt_germline = get_imgt_germlines(species=self.species,
                                            gene_type=self.gene_type,
                                            gene=self.full)
    query = self.germline_alignment.replace('-', '')

    # Work on a copy so that overriding the gap-open score here cannot leak
    # into whatever object _realignment_scoring_params() handed back.
    aln_params = dict(self._realignment_scoring_params(self.gene_type),
                      gap_open=-11)
    aln_matrix = self._get_gapped_imgt_substitution_matrix()
    self.imgt_gapped_alignment = local_alignment(
        query,
        self.imgt_germline.gapped_nt_sequence,
        matrix=aln_matrix,
        **aln_params)

    # (2 * b) % 3 is the same as (-b) % 3: the number of bases between the
    # alignment start and the next multiple of three in the gapped germline.
    target_offset = (2 * (self.imgt_gapped_alignment.target_begin % 3)) % 3
    # IMGT coding start is 1-based
    self.alignment_reading_frame = target_offset + (self.imgt_germline.coding_start - 1)

    self.coding_region = self._get_coding_region()
    self.aa_sequence = self._get_aa_sequence()
    try:
        self._imgt_numbering()
    except Exception:
        # Numbering is best-effort: log the failure and carry on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being swallowed.
        self.exception('IMGT NUMBERING', traceback.format_exc(), sep='\n')
def _fallback_find_junc_nt_start(self, antibody):
    '''
    Fallback locator for the 5' end of the junction.

    Marks ``self.fallback_5prime`` and aligns the oriented input against the
    germline V-gene FR3 nucleotides (gapped positions 196-309 with the '.'
    gap characters stripped). The returned position is the alignment's query
    end, extended by however many germline FR3 bases lie beyond the end of
    the aligned target.

    Args:

        antibody: object exposing ``v.imgt_germline.gapped_nt_sequence``,
            ``oriented_input`` and ``log``.

    Returns:

        The position in ``antibody.oriented_input`` taken as the end of FR3
        (i.e. the start of the junction).
    '''
    self.fallback_5prime = True

    # FR3 nucleotides of the gapped IMGT germline, with gaps removed
    gapped_v = antibody.v.imgt_germline.gapped_nt_sequence
    fr3_germline = gapped_v[196:309].replace('.', '')
    antibody.log('GERM FR3 SEQUENCE:', fr3_germline)

    # the junction begins immediately after the end of FR3
    fr3_aln = local_alignment(antibody.oriented_input, fr3_germline)
    antibody.log(' QUERY: ', fr3_aln.aligned_query)
    antibody.log(' ', fr3_aln.alignment_midline)
    antibody.log('GERM FR3:', fr3_aln.aligned_target)

    # germline FR3 bases left over past the end of the alignment
    unaligned_tail = len(fr3_germline) - fr3_aln.target_end
    fr3_end = fr3_aln.query_end + unaligned_tail

    first_codon = antibody.oriented_input[fr3_end:fr3_end + 3]
    antibody.log('JUNC START:', first_codon, codons[first_codon], fr3_end)
    return fr3_end
def assign_dgene(self, seq, species):
    '''
    Assigns a D-gene by locally aligning ``seq`` against every germline in
    ``<germline_directory>/ungapped/d.fasta``, in both orientations.

    Args:

        seq: query sequence passed to ``local_alignment``.

        species: passed through to each ``GermlineSegment``.

    Returns:

        A ``GermlineSegment`` for the highest-scoring germline, carrying up
        to five runner-up segments in ``others``, or ``None`` if no
        alignments were produced.
    '''
    d_fasta = os.path.join(self.germline_directory, 'ungapped/d.fasta')
    with open(d_fasta, 'r') as handle:
        forward = [Sequence(record) for record in SeqIO.parse(handle, 'fasta')]

    # search both strands: forward germlines followed by their reverse
    # complements (which keep the forward germline's id)
    targets = forward + [Sequence(g.reverse_complement, id=g.id) for g in forward]

    hits = local_alignment(seq, targets=targets, gap_open=-20, gap_extend=-2)
    hits.sort(key=lambda hit: hit.score, reverse=True)
    if not hits:
        return None

    ranked = [(hit.target.id, hit.score) for hit in hits]
    top_gl, top_score = ranked[0]
    others = []
    for germ, score in ranked[1:6]:
        others.append(GermlineSegment(germ, species, score=score))
    return GermlineSegment(top_gl,
                           species,
                           score=top_score,
                           others=others,
                           assigner_name=self.name)
def realign_germline(self, antibody, query_start=None, query_end=None):
    '''
    Due to restrictions on the available scoring parameters in BLASTn,
    incorrect truncation of the v-gene alignment can occur. This function
    re-aligns the query sequence with the identified germline variable gene
    using more appropriate alignment parameters.

    If the assigner has already annotated all four of ``self.query_start``,
    ``self.query_end``, ``self.germline_start`` and ``self.germline_end``,
    those slices are aligned globally. Otherwise the query is truncated with
    ``query_start``/``query_end`` and aligned locally against the whole
    germline sequence. A truthy alignment is handed to
    ``_process_realignment``; a falsy one is reported via ``antibody.log``.

    Args:

        antibody: object providing ``oriented_input`` (the raw input
            sequence, correctly oriented; its ``sequence`` attribute is
            sliced) and a ``log`` method.

        query_start (int): 5' position in the oriented input at which the
            sequence should be truncated prior to alignment with the
            germline sequence.

        query_end (int): 3' position in the oriented input at which the
            sequence should be truncated prior to alignment with the
            germline sequence.
    '''
    oriented_input = antibody.oriented_input
    germline_seq = self._get_germline_sequence_for_realignment()
    aln_params = self._realignment_scoring_params(self.gene_type)

    assigner_positions = (self.query_start, self.query_end,
                          self.germline_start, self.germline_end)
    if all(pos is not None for pos in assigner_positions):
        # if the alignment start/end positions have been annotated by the
        # assigner, force re-alignment using those parameters
        query = oriented_input.sequence[self.query_start:self.query_end]
        germline = germline_seq[self.germline_start:self.germline_end]
        alignment = global_alignment(query, germline, **aln_params)
    else:
        # use local alignment to determine alignment start/end positions if
        # they haven't already been determined by the assigner
        query = oriented_input.sequence[query_start:query_end]
        alignment = local_alignment(query, germline_seq, **aln_params)

    if alignment:
        # NOTE: the `query_start` argument (not self.query_start) is what
        # gets forwarded, regardless of which branch produced the alignment.
        self._process_realignment(antibody, alignment, query_start)
    else:
        antibody.log('GERMLINE REALIGNMENT ERROR')
        antibody.log('REALIGNMENT QUERY SEQUENCE:', query)
        antibody.log('QUERY START:', query_start)
        antibody.log('QUERY END:', query_end)
def _fallback_find_junc_nt_end(self, antibody):
    '''
    Fallback locator for the 3' end of the junction.

    Marks ``self.fallback_3prime``, finds the last anchor residue ('W' for
    heavy chains, 'F' otherwise) in the J germline's ungapped amino acid
    sequence, and aligns the oriented input against the germline nucleotides
    from that residue's codon onward.

    Args:

        antibody: object exposing ``chain``, ``j.imgt_germline``
            (``ungapped_aa_sequence``, ``coding_start``,
            ``gapped_nt_sequence``), ``oriented_input`` and ``log``.

    Returns:

        The position in ``antibody.oriented_input`` immediately after the
        first codon of FR4.

    Raises:

        ValueError: if the J germline amino acid sequence does not contain
            the anchor residue, so the start of FR4 cannot be located.
    '''
    self.fallback_3prime = True
    j_germline = antibody.j.imgt_germline

    # need to find the start of FR4 in the IMGT germline sequence, which is
    # taken to be the LAST occurrence of the anchor residue
    end_res = 'W' if antibody.chain == 'heavy' else 'F'
    anchor_idx = None
    for i, res in enumerate(j_germline.ungapped_aa_sequence):
        if res == end_res:
            anchor_idx = i
    if anchor_idx is None:
        raise ValueError(
            'no {!r} residue found in the J germline amino acid sequence; '
            'cannot locate the start of FR4'.format(end_res))

    # IMGT coding start is 1-based; each residue spans three nucleotides
    fr4_nt_start_pos = (j_germline.coding_start - 1) + (anchor_idx * 3)
    germ_fr4_sequence = j_germline.gapped_nt_sequence[fr4_nt_start_pos:]

    # find the end of the junction (end of the first codon of FR4)
    aln = local_alignment(antibody.oriented_input, germ_fr4_sequence)
    # back up by the number of germline FR4 bases preceding the alignment
    fr4_start = aln.query_begin - aln.target_begin
    junc_end_codon = antibody.oriented_input[fr4_start:fr4_start + 3]
    antibody.log('JUNC END:', junc_end_codon, codons[junc_end_codon], fr4_start)
    return fr4_start + 3
def _get_isotype_query_region(self, antibody):
    '''
    Returns the part of the oriented input used as the isotype query.

    The VDJ nucleotide sequence is locally aligned (as the query) against
    the full oriented input (as the target); everything in the oriented
    input from the alignment's ``target_end`` onward is returned.
    '''
    vdj_nt = antibody.vdj_nt
    full_input = antibody.oriented_input
    vdj_aln = local_alignment(vdj_nt, full_input)
    downstream_start = vdj_aln.target_end
    return full_input[downstream_start:]
def _get_alignments(self, antibody, isotype_seqs):
    '''
    Aligns the isotype query region against each candidate isotype sequence.

    Args:

        antibody: passed to ``_get_isotype_query_region`` to obtain the
            query.

        isotype_seqs: alignment targets handed to ``local_alignment``.

    Returns:

        A new list of the alignments, ordered from highest to lowest score.
    '''
    def by_score(aln):
        return aln.score

    region = self._get_isotype_query_region(antibody)
    ranked = list(local_alignment(region,
                                  targets=isotype_seqs,
                                  gap_open_penalty=22,
                                  gap_extend_penalty=1))
    ranked.sort(key=by_score, reverse=True)
    return ranked