def align_to_germline(self, alignment, avg_len=None, avg_mut=None):
    """Align ``alignment`` to its V/J germlines, mutating it in place.

    When both ``avg_len`` and ``avg_mut`` are supplied, the V and J gene
    calls on ``alignment`` are first replaced by the result of ``get_ties``
    on the respective germline collection.  The sequence is then padded or
    trimmed by ``alignment.seq_offset``, gapped wherever the pre-CDR3
    germline is gapped, and finally trimmed or right-padded so that it has
    the same length as the assembled germline.

    Args:
        alignment: The alignment object to update.
        avg_len: Optional value forwarded to ``get_ties``.
        avg_mut: Optional value forwarded to ``get_ties``.

    Raises:
        AlignmentException: If the computed CDR3 length is below 3.
    """
    ties_requested = avg_len is not None and avg_mut is not None
    if ties_requested:
        alignment.v_gene = self.v_germlines.get_ties(
            alignment.v_gene, avg_len, avg_mut)
        alignment.j_gene = self.j_germlines.get_ties(
            alignment.j_gene, avg_len, avg_mut)

    # The germline starts out as the common V sequence up to the CDR3.
    v_sequences = [self.v_germlines[v] for v in alignment.v_gene]
    v_consensus = get_common_seq(v_sequences, cutoff=False)
    alignment.germline = v_consensus[:CDR3_OFFSET]

    # A non-negative offset means left-padding; a negative one means the
    # sequence has a prefix that must be removed.
    if alignment.seq_offset < 0:
        alignment.sequence.remove_prefix(-alignment.seq_offset)
    else:
        alignment.sequence.pad(alignment.seq_offset)
    # Either way the J anchor shifts by the (signed) offset.
    alignment.j_anchor_pos += alignment.seq_offset

    # Mirror every pre-CDR3 germline gap into the sequence and keep the
    # anchor position and offset in sync.
    for pos, base in enumerate(alignment.germline):
        if base != '-':
            continue
        alignment.sequence.add_gap(pos)
        alignment.j_anchor_pos += 1
        if pos < alignment.seq_start:
            alignment.seq_offset += 1

    j_sequences = [self.j_germlines[j] for j in alignment.j_gene]
    j_consensus = get_common_seq(j_sequences, right=True)

    # Length of the CDR3 in nucleotides.
    alignment.cdr3_num_nts = (
        alignment.j_anchor_pos + self.j_germlines.anchor_len -
        self.j_germlines.upstream_of_cdr3 - alignment.cdr3_start
    )

    # Germline CDR3: the V portion past CDR3_OFFSET, gap filler, then the
    # germline CDR3 value already stored on the alignment.
    v_end = alignment.seq_start + alignment.num_gaps + alignment.v_length
    v_cdr3_part = v_consensus[CDR3_OFFSET:v_end]
    filler = '-' * (
        alignment.cdr3_num_nts - len(v_cdr3_part) -
        len(alignment.germline_cdr3)
    )
    alignment.germline_cdr3 = ''.join(
        (v_cdr3_part, filler, alignment.germline_cdr3))

    if alignment.cdr3_num_nts < 3:
        raise AlignmentException('CDR3 has no AAs')

    alignment.j_anchor_pos += alignment.cdr3_num_nts

    # The CDR3 region of the germline is all gaps, followed by the tail of
    # the common J sequence.
    alignment.germline += '-' * alignment.cdr3_num_nts
    alignment.germline += j_consensus[-self.j_germlines.upstream_of_cdr3:]

    # Bring the sequence to exactly the germline length.
    excess = len(alignment.sequence) - len(alignment.germline)
    if excess > 0:
        alignment.sequence.trim_right(len(alignment.germline))
    elif excess < 0:
        alignment.sequence.pad_right(-excess)
def all_ties(self, length, mutation, cutoff=True):
    """Collect the common sequence for every distinct tie group.

    For each name yielded by iterating ``self``, ``get_ties`` is called
    with that single name; the sorted tuple of the result identifies the
    group.  The first time a group is seen, its common sequence is computed
    with ``get_common_seq``.

    Args:
        length: Forwarded to ``get_ties``.
        mutation: Forwarded to ``get_ties``.
        cutoff: Forwarded to ``get_common_seq``.

    Returns:
        A dict mapping each sorted tuple of tied names to the value
        returned by ``get_common_seq`` for those names.
    """
    groups = {}
    for gene in self:
        group_key = tuple(sorted(self.get_ties([gene], length, mutation)))
        if group_key in groups:
            # This tie group was already handled via another member.
            continue
        member_seqs = [self[member] for member in group_key]
        groups[group_key] = get_common_seq(member_seqs, cutoff=cutoff)
    return groups
def all_ties(self, length, mutation, cutoff=True):
    """Return the common sequence of each unique set of tied names.

    Every name obtained by iterating ``self`` is passed on its own to
    ``get_ties``.  The sorted, tuple-ised result is used as the dictionary
    key, so a set of tied names is only processed once.

    Args:
        length: Passed through to ``get_ties``.
        mutation: Passed through to ``get_ties``.
        cutoff: Passed through to ``get_common_seq``.

    Returns:
        dict: sorted tuple of tied names -> result of ``get_common_seq``
        over the entries of ``self`` for those names.
    """
    common_by_tie = {}
    for candidate in self:
        tied = self.get_ties([candidate], length, mutation)
        key = tuple(sorted(tied))
        if key not in common_by_tie:
            sequences = [self[tied_name] for tied_name in key]
            common_by_tie[key] = get_common_seq(sequences, cutoff=cutoff)
    return common_by_tie
def create_alignment(seq, line, v_germlines, j_germlines):
    """Build a ``VDJAlignment`` for ``seq`` from one record of gene calls.

    Args:
        seq: The sequence wrapped by ``VDJAlignment``; it is also sliced
            and searched for ``'-'`` directly.
        line: Mapping providing ``V_CALL`` and ``J_CALL`` (comma-separated
            gene names) plus ``JUNCTION_LENGTH`` and ``V_SEQ_LENGTH``
            (values convertible with ``int``).
        v_germlines: Container of V germline sequences keyed by gene name.
        j_germlines: Container of J germline sequences keyed by gene name;
            must expose ``upstream_of_cdr3``.

    Returns:
        The populated alignment.

    Raises:
        AlignmentException: If none of the called V genes or none of the
            called J genes is present in the germlines, or if the sequence
            and germline lengths differ after right-padding.
    """
    alignment = VDJAlignment(seq)
    alignment.v_gene = {GeneName(call) for call in line['V_CALL'].split(',')}
    alignment.j_gene = {GeneName(call) for call in line['J_CALL'].split(',')}
    alignment.cdr3_num_nts = int(line['JUNCTION_LENGTH'])
    alignment.v_length = int(line['V_SEQ_LENGTH'])
    # The offset is the length of the leading run of '-' characters.
    leading_gaps = re.match(r'[-]*', alignment.sequence.sequence)
    alignment.seq_offset = leading_gaps.end()
    # TODO: Calculate these
    alignment.v_length = CDR3_OFFSET - seq[:CDR3_OFFSET].count('-')
    alignment.j_length = j_germlines.upstream_of_cdr3

    # Only genes that actually exist in the germline sets contribute.
    known_v = [v_germlines[g] for g in alignment.v_gene if g in v_germlines]
    known_j = [j_germlines[g] for g in alignment.j_gene if g in j_germlines]
    if not known_v or not known_j:
        v_names = ','.join(str(v) for v in alignment.v_gene)
        j_names = ','.join(str(j) for j in alignment.j_gene)
        raise AlignmentException(
            'Missing germlines: V={} J={}'.format(v_names, j_names))

    v_common = get_common_seq(known_v)
    j_common = get_common_seq(known_j)

    # Lay V and J out over the sequence length with gaps in between, then
    # cut out the CDR3 window.
    gap_count = len(alignment.sequence) - len(v_common) - len(j_common)
    spanning_germline = ''.join((v_common, '-' * gap_count, j_common))
    cdr3_end = CDR3_OFFSET + alignment.cdr3_num_nts
    alignment.germline_cdr3 = spanning_germline[CDR3_OFFSET:cdr3_end]

    germline_parts = [
        v_common[:CDR3_OFFSET],
        '-' * alignment.cdr3_num_nts,
        j_common[-j_germlines.upstream_of_cdr3:],
    ]
    alignment.germline = ''.join(germline_parts)

    shortfall = len(alignment.germline) - len(alignment.sequence.sequence)
    alignment.sequence.pad_right(shortfall)

    if len(alignment.germline) != len(alignment.sequence.sequence):
        raise AlignmentException('Sequence and germline differ in size')
    return alignment
def create_alignment(seq, line, v_germlines, j_germlines):
    """Create a ``VDJAlignment`` for ``seq`` using the gene calls in ``line``.

    Args:
        seq: The sequence handed to ``VDJAlignment``; also sliced and
            searched for ``'-'`` here.
        line: Mapping with ``V_CALL`` / ``J_CALL`` (comma-separated gene
            names) and ``JUNCTION_LENGTH`` / ``V_SEQ_LENGTH`` (values
            convertible with ``int``).
        v_germlines: V germline sequences keyed by gene name.
        j_germlines: J germline sequences keyed by gene name; must expose
            ``upstream_of_cdr3``.

    Returns:
        The populated alignment object.

    Raises:
        AlignmentException: If no called V gene or no called J gene is
            found in the germlines, or if the germline and the right-padded
            sequence end up with different lengths.
    """
    alignment = VDJAlignment(seq)
    alignment.v_gene = {GeneName(g) for g in line['V_CALL'].split(',')}
    alignment.j_gene = {GeneName(g) for g in line['J_CALL'].split(',')}
    alignment.cdr3_num_nts = int(line['JUNCTION_LENGTH'])
    alignment.v_length = int(line['V_SEQ_LENGTH'])
    # Raw string: '\-' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python.  The compiled
    # pattern is unchanged.
    alignment.seq_offset = re.match(
        r'[\-]*', alignment.sequence.sequence).end()
    # TODO: Calculate these
    alignment.v_length = CDR3_OFFSET - seq[:CDR3_OFFSET].count('-')
    alignment.j_length = j_germlines.upstream_of_cdr3

    # Keep only the called genes that are present in the germline sets.
    germ_v = [v_germlines[g] for g in alignment.v_gene if g in v_germlines]
    germ_j = [j_germlines[g] for g in alignment.j_gene if g in j_germlines]
    if not germ_v or not germ_j:
        raise AlignmentException('Missing germlines: V={} J={}'.format(
            ','.join(str(v) for v in alignment.v_gene),
            ','.join(str(j) for j in alignment.j_gene)))

    germ_v = get_common_seq(germ_v)
    germ_j = get_common_seq(germ_j)

    # Spread V and J across the sequence length with gaps in the middle and
    # take the CDR3 window out of that.
    alignment.germline_cdr3 = ''.join((
        germ_v,
        '-' * (len(alignment.sequence) - len(germ_v) - len(germ_j)),
        germ_j,
    ))[CDR3_OFFSET:CDR3_OFFSET + alignment.cdr3_num_nts]

    # Final germline: V up to the CDR3, a gapped CDR3, then the J tail.
    alignment.germline = ''.join([
        germ_v[:CDR3_OFFSET],
        '-' * alignment.cdr3_num_nts,
        germ_j[-j_germlines.upstream_of_cdr3:],
    ])
    alignment.sequence.pad_right(
        len(alignment.germline) - len(alignment.sequence.sequence))
    if len(alignment.germline) != len(alignment.sequence.sequence):
        raise AlignmentException('Sequence and germline differ in size')
    return alignment
def align_to_germline(self, avg_len=None, avg_mut=None, trim_to=None):
    """Align this sequence (and its quality string) to the V/J germlines.

    The method mutates the instance: it sets ``germline``, rewrites
    ``sequence`` / ``quality``, moves ``j_anchor_pos`` and records the
    match statistics (``pre_cdr3_*``, ``post_cdr3_*``, ``v_match``,
    ``j_match``).

    Args:
        avg_len: Optional value forwarded to ``get_ties``; ties are only
            recomputed when both ``avg_len`` and ``avg_mut`` are given.
        avg_mut: Optional value forwarded to ``get_ties``.
        trim_to: If given, every non-gap character in the first
            ``trim_to`` positions of the sequence is replaced with ``N``.

    Raises:
        AlignmentException: If the computed CDR3 length is below 3.
    """
    if avg_len is not None and avg_mut is not None:
        self._v = self.v_germlines.get_ties(self.v_gene, avg_len, avg_mut)
        self._j = self.j_germlines.get_ties(self.j_gene, avg_len, avg_mut)
    # Set the germline to the V gene up to the CDR3
    self.germline = get_common_seq(
        [self.v_germlines[v] for v in self._v])[:CDR3_OFFSET]
    # If we need to pad the sequence, do so, otherwise trim the sequence to
    # the germline length
    if self._pad_len >= 0:
        self.sequence = 'N' * self._pad_len + str(self.sequence)
        if self.quality is not None:
            self.quality = (' ' * self._pad_len) + self.quality
    else:
        self.removed_prefix = self.sequence[:-self._pad_len]
        self.sequence = str(self.sequence[-self._pad_len:])
        if self.quality is not None:
            self.removed_prefix_qual = self.quality[:-self._pad_len]
            self.quality = self.quality[-self._pad_len:]
    # Update the anchor positions after adding padding / trimming
    self.j_anchor_pos += self._pad_len

    # Add germline gaps to sequence before CDR3 and update anchor positions
    for i, c in enumerate(self.germline):
        if c == '-':
            self.sequence = self.sequence[:i] + '-' + self.sequence[i:]
            if self.quality is not None:
                self.quality = self.quality[:i] + ' ' + self.quality[i:]
            self.j_anchor_pos += 1

    # The J sequences are reversed before being passed to get_common_seq
    # and the result is reversed back.
    j_germ = get_common_seq(
        map(reversed, [self.j_germlines[j] for j in self.j_gene]))
    j_germ = ''.join(reversed(j_germ))
    # Calculate the length of the CDR3
    self._cdr3_len = (self.j_anchor_pos + self.j_germlines.anchor_len -
                      self.j_germlines.upstream_of_cdr3 - self.cdr3_start)

    if self._cdr3_len < 3:
        # The message has no placeholder, so no formatting is applied.
        raise AlignmentException('CDR3 has no AAs')

    self.j_anchor_pos += self._cdr3_len
    # Fill germline CDR3 with gaps
    self.germline += '-' * self._cdr3_len
    self.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]
    # If the sequence is longer than the germline, trim it; if shorter,
    # pad it on the right with N (and the quality with spaces).
    if len(self.sequence) > len(self.germline):
        self.sequence = self.sequence[:len(self.germline)]
        if self.quality is not None:
            self.quality = self.quality[:len(self.germline)]
    elif len(self.sequence) < len(self.germline):
        self.sequence += 'N' * (len(self.germline) - len(self.sequence))
        if self.quality is not None:
            self.quality += ' ' * (len(self.germline) - len(self.quality))

    if trim_to is not None:
        old_padding = max(self._pad_len, 0)
        # Mask the first trim_to positions, leaving gaps untouched.
        new_prefix = ''.join(
            [c if c == '-' else 'N' for c in self.sequence[:trim_to]])
        self.sequence = new_prefix + self.sequence[trim_to:]
        # Raw string: '\-' in a normal string literal is an invalid escape
        # sequence.  The compiled pattern is unchanged.
        v_start = re.match(r'[N\-]*', self.sequence).span()[1]
        self._pad_len = self.sequence[:v_start].count('N')
        self.v_length -= self._pad_len - old_padding

    # Get the pre-CDR3 germline
    pre_cdr3_germ = self.germline[:self.cdr3_start]
    pre_cdr3_seq = self.sequence[:self.cdr3_start]

    # If there is padding, get rid of it in the sequence and align the
    # germline
    if self._pad_len > 0:
        pre_cdr3_germ = pre_cdr3_germ[self._pad_len:]
        pre_cdr3_seq = pre_cdr3_seq[self._pad_len:]

    # Calculate the pre-CDR3 length and distance
    self.pre_cdr3_length = len(pre_cdr3_seq)
    self.pre_cdr3_match = self.pre_cdr3_length - dnautils.hamming(
        str(pre_cdr3_seq), str(pre_cdr3_germ))

    # Get the length of J after the CDR3
    self.post_cdr3_length = self.j_germlines.upstream_of_cdr3
    # Get the sequence and germline sequences after CDR3
    post_j = j_germ[-self.post_cdr3_length:]
    post_s = self.sequence[-self.post_cdr3_length:]

    # Calculate their match count
    self.post_cdr3_match = self.post_cdr3_length - dnautils.hamming(
        post_j, post_s)

    self.v_match = self.v_length - dnautils.hamming(
        self.germline[:self.cdr3_start],
        self.sequence[:self.cdr3_start])

    self.j_match = self.j_length - dnautils.hamming(
        self.germline[-len(j_germ):],
        self.sequence[-len(j_germ):])