def compare(self, other_v, max_extent, max_streak):
    """Measure the Hamming distance between this germline and ``other_v``.

    The two aligned sequences are truncated to ``max_extent`` and then cut
    inside the CDR3 at the position derived from
    ``dnautils.find_streak_position``; when no streak is reported the whole
    shared CDR3 is kept.

    :param other_v: The sequence handed to ``self.align``
    :param int max_extent: Maximum number of aligned positions to consider
    :param int max_streak: Streak length passed to ``find_streak_position``

    :returns: The distance and the number of positions compared
    :rtype: tuple

    :raises AlignmentException: If the shared CDR3 is empty, or the trimmed
        sequences are empty or of different lengths
    """
    aligned = self.align(other_v)
    this_seq = aligned['base'][:max_extent]
    other_seq = aligned['seq'][:max_extent]
    cdr3_offset = aligned['cdr3_start']

    # Restrict both CDR3 tails to the length they have in common
    this_cdr3 = this_seq[cdr3_offset:]
    other_cdr3 = other_seq[cdr3_offset:]
    shared_len = min(len(this_cdr3), len(other_cdr3))
    if shared_len == 0:
        raise AlignmentException('Empty CDR3 found after alignment')
    this_cdr3 = this_cdr3[:shared_len]
    other_cdr3 = other_cdr3[:shared_len]

    # Find the extent of the sequence's V into the CDR3
    streak = dnautils.find_streak_position(this_cdr3, other_cdr3, max_streak)
    if streak is None:
        # No streak reported: keep the entire shared CDR3
        max_index = cdr3_offset + shared_len
    else:
        # Back up by the streak length so the cut lands where it started
        max_index = cdr3_offset + (streak - max_streak)

    # Compare up to the computed end of V
    this_seq = this_seq[:max_index]
    other_seq = other_seq[:max_index]
    compared = len(this_seq)
    if compared != len(other_seq) or compared == 0:
        raise AlignmentException('Unequal sequences after alignment')

    return dnautils.hamming(this_seq, other_seq), len(other_seq)
def get_single_tie(self, gene, length, mutation):
    """Return the set of genes tied with ``gene``.

    Results are cached in ``self.ties`` under the key
    ``(int(length), self.mut_bucket(round(mutation, 3)))``.

    :param gene: The gene to find ties for
    :param length: Number of trailing positions compared (cast to ``int``)
    :param float mutation: Mutation value, rounded and then bucketed

    :returns: ``{gene}`` when ties are disabled or ``gene`` is not in this
        collection, otherwise the cached tie set
    :rtype: set
    """
    # Used to disable gene ties for genotyping
    if self.no_ties:
        return {gene}

    length = int(length)
    mutation = self.mut_bucket(round(mutation, 3))
    cache = self.ties.setdefault((length, mutation), {})

    if gene not in self:
        return {gene}

    if gene not in cache:
        def ungap(seq):
            # Gaps are stripped only when the collection is configured so
            return seq.replace('-', '') if self.remove_gaps else seq

        s_1 = ungap(self[gene])
        # Seed the cache entry before scanning, as the scan adds to it
        cache[gene] = {gene}
        for name, v in sorted(self.items()):
            s_2 = ungap(v)
            K = dnautils.hamming(s_1[-length:], s_2[-length:])
            if self._hypergeom(length, mutation, K) >= self.TIES_PROB_THRESHOLD:
                cache[gene].add(name)
        cache[gene] = self.all_alleles(cache[gene])

    return cache[gene]
def compare(self, other_v, max_extent, max_streak):
    """Return the distance between this germline and ``other_v``.

    After alignment both sequences are limited to ``max_extent`` positions.
    The comparison then stops inside the CDR3 at the offset derived from
    ``dnautils.find_streak_position``, or at the end of the shared CDR3
    when that helper returns ``None``.

    :param other_v: The sequence passed to ``self.align``
    :param int max_extent: Upper bound on the aligned positions used
    :param int max_streak: Streak length given to ``find_streak_position``

    :returns: ``(distance, compared_length)``
    :rtype: tuple

    :raises AlignmentException: When the shared CDR3 is empty, or the
        trimmed sequences are empty or differ in length
    """
    alignment = self.align(other_v)
    germ = alignment['base'][:max_extent]
    query = alignment['seq'][:max_extent]
    offset = alignment['cdr3_start']

    # Determine the CDR3 in the germline and sequence, cut to equal length
    germ_cdr3, query_cdr3 = germ[offset:], query[offset:]
    common = min(len(germ_cdr3), len(query_cdr3))
    germ_cdr3, query_cdr3 = germ_cdr3[:common], query_cdr3[:common]
    if common == 0:
        raise AlignmentException('Empty CDR3 found after alignment')

    # Find the extent of the sequence's V into the CDR3.  With no streak
    # the full shared CDR3 is used.
    streak = dnautils.find_streak_position(germ_cdr3, query_cdr3, max_streak)
    extent = common if streak is None else streak - max_streak
    max_index = offset + extent

    # Compare to the end of V
    germ, query = germ[:max_index], query[:max_index]
    if len(germ) == 0 or len(germ) != len(query):
        raise AlignmentException('Unequal sequences after alignment')

    # Determine the distance between the germline and sequence
    dist = dnautils.hamming(germ, query)
    return dist, len(query)
def get_single_tie(self, gene, length, mutation):
    """Return the set of genes tied with ``gene``.

    Results are cached in ``self.ties`` under the key
    ``(int(length), self.mut_bucket(round(mutation, 3)))``.

    :param gene: The gene to find ties for
    :param length: Number of trailing positions compared (cast to ``int``)
    :param float mutation: Mutation value, rounded and then bucketed

    :returns: ``{gene}`` when ``gene`` is not in this collection, otherwise
        the cached tie set
    :rtype: set
    """
    length = int(length)
    mutation = round(mutation, 3)
    mutation = self.mut_bucket(mutation)
    key = (length, mutation)

    if key not in self.ties:
        self.ties[key] = {}

    if gene not in self:
        return set([gene])

    if gene not in self.ties[key]:
        s_1 = (self[gene].replace('-', '') if self.remove_gaps
               else self[gene])
        self.ties[key][gene] = set([gene])

        # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not
        # exist on Python 3 mappings.
        for name, v in sorted(self.items()):
            s_2 = v.replace('-', '') if self.remove_gaps else v
            K = dnautils.hamming(s_1[-length:], s_2[-length:])
            p = self._hypergeom(length, mutation, K)
            if p >= self.ties_prob_threshold:
                self.ties[key][gene].add(name)
        self.ties[key][gene] = self.all_alleles(self.ties[key][gene])

    return self.ties[key][gene]
def collapse_similar_cdr3s(session, buckets, difference_allowed):
    """Merge clones within each bucket whose CDR3s are nearly identical.

    For every bucket, clones with the bucket's ``subject_id`` and
    ``cdr3_num_nts`` are visited in descending ``overall_total_cnt`` order.
    A clone whose ``cdr3_nt`` is within ``difference_allowed`` (Hamming
    distance) of an already kept CDR3 is folded into that clone: its
    sequences are re-pointed and the clone row is deleted.

    :param session: Database session used for the queries and the commit
    :param buckets: Query of buckets; must support ``count()`` and iteration
    :param int difference_allowed: Maximum distance for two CDR3s to merge
    """
    # Count once; calling ``count()`` per iteration re-issues the query.
    num_buckets = buckets.count()
    logger.info('Collapsing similar clones in {} buckets'.format(num_buckets))

    for i, bucket in enumerate(buckets):
        # Materialize the rows once rather than counting twice and then
        # selecting.
        clones = session.query(
            Clone.id, Clone.cdr3_aa, Clone.cdr3_nt
        ).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
        ).order_by(
            Clone.overall_total_cnt.desc()
        ).all()

        if len(clones) < 2:
            continue

        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, num_buckets, len(clones)))

        # Maps the CDR3 of each kept clone to [kept_id, merged_id, ...]
        reduced = {}
        for c in clones:
            for larger_cdr3_nt, others in reduced.items():
                if (dnautils.hamming(larger_cdr3_nt, c.cdr3_nt) <=
                        difference_allowed):
                    others.append(c.id)
                    break
            else:
                # No kept CDR3 was close enough: this clone starts a group
                reduced[c.cdr3_nt] = [c.id]

        for collapse in reduced.values():
            rep_id, others = collapse[0], collapse[1:]
            if not others:
                # Nothing merged into this clone; skip the empty IN () queries
                continue
            session.query(Sequence).filter(
                Sequence.clone_id.in_(others)
            ).update({'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(
                Clone.id.in_(others)
            ).delete(synchronize_session=False)

    # NOTE(review): commit placement could not be recovered from the
    # collapsed source; a single commit after all buckets is assumed.
    session.commit()
def get_single_tie(self, gene, length, mutation):
    """Look up, computing on first use, the genes tied with ``gene``.

    The cache ``self.ties`` is keyed by the integer ``length`` and the
    bucketed, three-decimal ``mutation`` value.

    :param gene: Gene whose ties are requested
    :param length: Number of trailing positions compared (cast to ``int``)
    :param float mutation: Mutation value; rounded, then ``mut_bucket``-ed

    :returns: ``{gene}`` if ties are disabled or the gene is not in this
        collection, else the cached set of tied genes
    :rtype: set
    """
    # Used to disable gene ties for genotyping
    if self.no_ties:
        return {gene}

    length = int(length)
    mutation = self.mut_bucket(round(mutation, 3))
    per_key = self.ties.setdefault((length, mutation), {})

    if gene not in self:
        return {gene}
    if gene in per_key:
        return per_key[gene]

    reference = self[gene]
    if self.remove_gaps:
        reference = reference.replace('-', '')

    # The entry is stored before the scan, which adds candidates to it
    per_key[gene] = {gene}
    for name, candidate in sorted(self.items()):
        if self.remove_gaps:
            candidate = candidate.replace('-', '')
        K = dnautils.hamming(reference[-length:], candidate[-length:])
        p = self._hypergeom(length, mutation, K)
        if p >= self.TIES_PROB_THRESHOLD:
            per_key[gene].add(name)
    per_key[gene] = self.all_alleles(per_key[gene])
    return per_key[gene]
def pre_cdr3_match(self):
    """Return ``pre_cdr3_length`` minus the germline/sequence mismatches.

    The compared window runs from ``seq_start + num_gaps`` up to (but not
    including) ``cdr3_start``.
    """
    window = slice(self.seq_start + self.num_gaps, self.cdr3_start)
    mismatches = dnautils.hamming(self.germline[window],
                                  self.sequence[window])
    return self.pre_cdr3_length - mismatches
def v_match(self):
    """Return ``v_length`` minus the filled-germline/sequence mismatches.

    The compared window starts at ``seq_start`` and covers
    ``v_length + num_gaps`` positions.
    """
    first = self.seq_start
    window = slice(first, first + self.v_length + self.num_gaps)
    mismatches = dnautils.hamming(self.filled_germline[window],
                                  self.sequence[window])
    return self.v_length - mismatches
def get_single_tie(self, gene, length, mutation):
    """Return the genes whose anchor region matches that of ``gene``.

    Only the last ``self.anchor_len`` characters of each germline are
    compared.  ``length`` and ``mutation`` are accepted for interface
    compatibility but are not used here.

    :param gene: The gene to find ties for
    :param length: Unused
    :param mutation: Unused

    :returns: ``self.all_alleles({gene})`` extended with every gene whose
        anchor is equal to, or at Hamming distance 0 from, ``gene``'s
    :rtype: set
    """
    seq = self[gene][-self.anchor_len:]
    tied = self.all_alleles(set([gene]))

    # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not
    # exist on Python 3 mappings.
    for j, other_seq in sorted(self.items()):
        other_seq = other_seq[-self.anchor_len:][:len(seq)]
        if other_seq == seq:
            tied.add(j)
        elif dnautils.hamming(other_seq, seq) == 0:
            # Unequal strings can still have distance 0 here
            # (presumably ambiguous bases -- confirm against dnautils)
            tied.add(j)

    return tied
def _find_index(self, sequence, germline):
    """Locate the window of ``sequence`` closest to ``germline``.

    Every window of ``len(germline)`` positions is scored by Hamming
    distance, first on ``sequence`` and then on its reverse complement.
    A reverse-complement window wins only when strictly better.

    :param sequence: Sequence to scan; must provide ``reverse_complement()``
    :param germline: The germline to search for

    :returns: ``(position, distance, is_rc)`` where ``position`` is the best
        window start shifted by ``len(germline) - anchor_len``
    :rtype: tuple

    :raises TypeError: If ``sequence`` is shorter than ``germline``, since
        no window is scored and the position stays ``None``
    """
    window = len(germline)
    best_pos, best_hamming = None, None
    is_rc = False

    # ``+ 1`` so the final full-length window is scored as well
    for pos in range(len(sequence) - window + 1):
        hamming = dnautils.hamming(sequence[pos:pos + window], germline)
        if best_hamming is None or hamming < best_hamming:
            best_pos = pos
            best_hamming = hamming

    rc = sequence.reverse_complement()
    for pos in range(len(rc) - window + 1):
        hamming = dnautils.hamming(rc[pos:pos + window], germline)
        if best_hamming is None or hamming < best_hamming:
            best_pos = pos
            best_hamming = hamming
            is_rc = True

    best_pos += window - self.j_germlines.anchor_len
    return best_pos, best_hamming, is_rc
def _find_index(self, sequence, germline):
    """Locate the window of ``sequence`` closest to ``germline``.

    Every window of ``len(germline)`` positions is scored by Hamming
    distance divided by ``len(germline)``, first on ``sequence`` and then
    on its reverse complement.  A reverse-complement window wins only when
    strictly better.

    :param sequence: Sequence to scan; must provide ``reverse_complement()``
    :param germline: The germline to search for

    :returns: ``(position, normalized_distance, is_rc)`` where ``position``
        is the best window start shifted by ``len(germline) - anchor_len``
    :rtype: tuple

    :raises TypeError: If ``sequence`` is shorter than ``germline``, since
        no window is scored and the position stays ``None``
    """
    window = len(germline)
    best_pos, best_hamming = None, None
    is_rc = False

    # ``+ 1`` so the final full-length window is scored, matching the
    # reverse-complement scan below
    for pos in range(len(sequence) - window + 1):
        hamming = dnautils.hamming(
            sequence[pos:pos + window], germline) / len(germline)
        if best_hamming is None or hamming < best_hamming:
            best_pos = pos
            best_hamming = hamming

    rc = sequence.reverse_complement()
    for pos in range(len(rc) - window + 1):
        hamming = dnautils.hamming(
            rc[pos:pos + window], germline) / len(germline)
        if best_hamming is None or hamming < best_hamming:
            best_pos = pos
            best_hamming = hamming
            is_rc = True

    best_pos += window - self.j_germlines.anchor_len
    return best_pos, best_hamming, is_rc
def get_single_tie(self, gene, length, mutation):
    """Return the genes whose anchor region matches that of ``gene``.

    Only the trailing ``self.anchor_len`` characters of each germline are
    compared.  ``length`` and ``mutation`` are not used.

    :param gene: The gene to find ties for
    :param length: Unused
    :param mutation: Unused

    :returns: ``{gene}`` when ties are disabled, otherwise
        ``self.all_alleles({gene})`` plus every gene with a matching anchor
    :rtype: set
    """
    # Used to disable gene ties for genotyping
    if self.no_ties:
        return {gene}

    anchor = self[gene][-self.anchor_len:]
    tied = self.all_alleles({gene})
    for name, candidate in sorted(self.items()):
        candidate = candidate[-self.anchor_len:][:len(anchor)]
        # The distance is only computed when the strings are not equal
        if candidate == anchor or dnautils.hamming(candidate, anchor) == 0:
            tied.add(name)
    return tied
def has_possible_indel(self):
    """Report whether any pre-CDR3 window is mismatched enough for an indel.

    Windows of ``INDEL_WINDOW`` positions are slid over the region from the
    first A/T/C/G character of ``self.sequence.sequence`` up to
    ``cdr3_start``.

    :returns: ``True`` if some window has at least
        ``INDEL_MISMATCH_THRESHOLD * INDEL_WINDOW`` mismatches
    :rtype: bool
    """
    first_base = re.search('[ATCG]', self.sequence.sequence).start()
    germ = self.germline[first_base:self.cdr3_start]
    seq = self.sequence[first_base:self.cdr3_start]

    # No window is examined when the region is shorter than INDEL_WINDOW
    return any(
        dnautils.hamming(germ[i:i + self.INDEL_WINDOW],
                         seq[i:i + self.INDEL_WINDOW])
        >= self.INDEL_MISMATCH_THRESHOLD * self.INDEL_WINDOW
        for i in range(len(germ) - self.INDEL_WINDOW + 1)
    )
def has_possible_indel(self):
    """Report whether any pre-CDR3 window is mismatched enough for an indel.

    Windows of ``INDEL_WINDOW`` positions are slid over the region from the
    first A/T/C/G character of ``self.sequence`` up to ``cdr3_start``.

    :returns: ``True`` if some window has at least
        ``INDEL_MISMATCH_THRESHOLD * INDEL_WINDOW`` mismatches
    :rtype: bool
    """
    first_base = re.search('[ATCG]', self.sequence).start()
    region = slice(first_base, self.cdr3_start)
    germ = self.germline[region]
    seq = self.sequence[region]

    # No window is examined when the region is shorter than INDEL_WINDOW
    last_start = len(germ) - self.INDEL_WINDOW
    i = 0
    while i <= last_start:
        stop = i + self.INDEL_WINDOW
        dist = dnautils.hamming(germ[i:stop], seq[i:stop])
        if dist >= self.INDEL_MISMATCH_THRESHOLD * self.INDEL_WINDOW:
            return True
        i += 1
    return False
def similar_to_all(seq, rest, min_similarity):
    """Check that ``seq`` is similar enough to every entry of ``rest``.

    Similarity is ``1 - hamming / len(other.cdr3_aa)`` computed on the
    ``cdr3_aa`` attributes after replacing ``X`` with ``-``.

    :param seq: Object with a ``cdr3_aa`` string attribute
    :param list rest: Objects with ``cdr3_aa`` attributes to compare against
    :param float min_similarity: Minimum fraction to be considered similar

    :returns: ``True`` if no entry of ``rest`` falls below
        ``min_similarity`` (trivially ``True`` for an empty ``rest``)
    :rtype: bool

    """
    def similarity(other):
        mismatches = dnautils.hamming(other.cdr3_aa.replace('X', '-'),
                                      seq.cdr3_aa.replace('X', '-'))
        return 1 - mismatches / float(len(other.cdr3_aa))

    return all(not (similarity(other) < min_similarity) for other in rest)
def similar_to_all(seq, rest, field, min_similarity):
    """Check that ``seq`` is similar enough to every entry of ``rest``.

    The attribute ``cdr3_<field>`` of each object is compared after
    replacing ``X`` with ``-``.  Similarity is one minus the Hamming
    distance divided by the length of the compared string.

    :param seq: Object providing the ``cdr3_<field>`` attribute
    :param list rest: Objects to compare ``seq`` against
    :param str field: Suffix selecting the attribute, e.g. ``aa`` for
        ``cdr3_aa``
    :param float min_similarity: Minimum fraction to be considered similar

    :returns: ``True`` if no entry of ``rest`` falls below
        ``min_similarity`` (trivially ``True`` for an empty ``rest``)
    :rtype: bool

    """
    attr = 'cdr3_' + field
    for comp_seq in rest:
        comp_cdr3 = getattr(comp_seq, attr)
        dist = dnautils.hamming(
            comp_cdr3.replace('X', '-'),
            getattr(seq, attr).replace('X', '-')
        )
        # Normalize by the string that was compared; dividing by the
        # ``cdr3_aa`` length is wrong for any other field.
        sim_frac = 1 - dist / float(len(comp_cdr3))
        if sim_frac < min_similarity:
            return False
    return True
def _found_j(self, i, j_gene, match):
    """Record a J anchor hit and choose the closest germline J gene(s).

    Sets ``j_anchor_pos``, ``j_anchor_len``, ``_j``, ``_j_start`` and
    ``j_length`` on the instance.

    :param int i: Position of the anchor match in ``self.sequence``
    :param j_gene: Not used; the candidate genes come from
        ``self.j_germlines`` (restricted to ``self._force_js`` if set)
    :param match: The matched anchor; only its length is used

    :raises AlignmentException: If no candidate J gene exists, if the
        germline or sequence portion inside the CDR3 is empty, or if the
        trimmed germline J runs past the end of the sequence
    """
    # If a match is found, record its location and gene
    self.j_anchor_pos = i
    self.j_anchor_len = len(match)
    end_of_j = min(self.j_anchor_pos + self.j_germlines.anchor_len,
                   len(self.sequence))

    best_dist = None
    self._j = []
    if self._force_js:
        j_germs = {
            k: v for k, v in self.j_germlines.items()
            if k in self._force_js
        }
    else:
        j_germs = self.j_germlines

    # Keep every gene that ties for the smallest distance
    for candidate, j_seq in j_germs.items():
        seq_j = self.sequence[end_of_j - len(j_seq):end_of_j]
        dist = dnautils.hamming(seq_j, j_seq[:len(seq_j)])
        if best_dist is None or dist < best_dist:
            best_dist = dist
            self._j = set([candidate])
        elif dist == best_dist:
            self._j.add(candidate)

    # ``_j`` starts as an empty list, so test for emptiness; an
    # ``is None`` check could never fire.
    if not self._j:
        self._j = None
        raise AlignmentException('Could not find suitable J anchor')

    # Get the full germline J gene
    j_full = self.j_germlines[self.j_gene[0]]
    # Get the portion of the germline J in the CDR3
    germline_in_cdr3 = self.j_germlines.get_j_in_cdr3(self.j_gene[0])
    cdr3_end_pos = (self.j_anchor_pos + self.j_germlines.anchor_len -
                    self.j_germlines.upstream_of_cdr3)
    sequence_in_cdr3 = self.sequence[
        cdr3_end_pos - len(germline_in_cdr3):cdr3_end_pos]
    if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
        self._j = None
        raise AlignmentException('Could not find sequence or germline in '
                                 'CDR3')

    # Get the extent of the J in the CDR3
    streak = find_streak_position(reversed(germline_in_cdr3),
                                  reversed(sequence_in_cdr3),
                                  self.MISMATCH_THRESHOLD)

    # Trim the J gene based on the extent in the CDR3
    if streak is not None:
        j_full = j_full[len(germline_in_cdr3) - streak:]

    # Find where the full J starts
    self._j_start = self.j_anchor_pos + len(match) - len(j_full)

    # If the trimmed germline J extends past the end of the
    # sequence, there is a misalignment
    if len(j_full) != len(
            self.sequence[self._j_start:self._j_start + len(j_full)]):
        self._j = None
        self.j_anchor_pos = None
        raise AlignmentException('Germline extended past end of J')

    self.j_length = len(j_full)
def align_to_germline(self, avg_len=None, avg_mut=None, trim_to=None):
    """Build the germline for this sequence and compute match statistics.

    The sequence (and quality, if present) is padded or trimmed by
    ``_pad_len``, gapped wherever the V germline is gapped, and forced to
    the germline's length.  The method then sets ``germline``,
    ``_cdr3_len``, ``pre_cdr3_length``, ``pre_cdr3_match``,
    ``post_cdr3_length``, ``post_cdr3_match``, ``v_match`` and ``j_match``.

    :param avg_len: With ``avg_mut``, used to recompute V and J ties via
        ``get_ties``; ignored unless both are given
    :param avg_mut: See ``avg_len``
    :param int trim_to: If given, the first ``trim_to`` positions of the
        sequence are masked with ``N`` (gaps are kept)

    :raises AlignmentException: If the computed CDR3 length is below 3
    """
    if avg_len is not None and avg_mut is not None:
        self._v = self.v_germlines.get_ties(self.v_gene, avg_len, avg_mut)
        self._j = self.j_germlines.get_ties(self.j_gene, avg_len, avg_mut)

    # Set the germline to the V gene up to the CDR3
    self.germline = get_common_seq(
        [self.v_germlines[v] for v in self._v])[:CDR3_OFFSET]

    # If we need to pad the sequence, do so, otherwise trim the sequence to
    # the germline length
    if self._pad_len >= 0:
        self.sequence = 'N' * self._pad_len + str(self.sequence)
        if self.quality is not None:
            self.quality = (' ' * self._pad_len) + self.quality
    else:
        # ``_pad_len`` is negative here, so ``-_pad_len`` is the number of
        # leading positions removed
        self.removed_prefix = self.sequence[:-self._pad_len]
        self.sequence = str(self.sequence[-self._pad_len:])
        if self.quality is not None:
            self.removed_prefix_qual = self.quality[:-self._pad_len]
            self.quality = self.quality[-self._pad_len:]

    # Update the anchor positions after adding padding / trimming
    self.j_anchor_pos += self._pad_len

    # Add germline gaps to sequence before CDR3 and update anchor positions
    for i, c in enumerate(self.germline):
        if c == '-':
            self.sequence = self.sequence[:i] + '-' + self.sequence[i:]
            if self.quality is not None:
                self.quality = self.quality[:i] + ' ' + self.quality[i:]
            self.j_anchor_pos += 1

    # The common J sequence is computed on reversed germlines and then
    # reversed back
    j_germ = get_common_seq(
        map(reversed, [self.j_germlines[j] for j in self.j_gene]))
    j_germ = ''.join(reversed(j_germ))

    # Calculate the length of the CDR3
    self._cdr3_len = (self.j_anchor_pos + self.j_germlines.anchor_len -
                      self.j_germlines.upstream_of_cdr3 - self.cdr3_start)

    if self._cdr3_len < 3:
        raise AlignmentException('CDR3 has no AAs')

    self.j_anchor_pos += self._cdr3_len

    # Fill germline CDR3 with gaps
    self.germline += '-' * self._cdr3_len
    self.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]

    # If the sequence is longer than the germline, trim it
    if len(self.sequence) > len(self.germline):
        self.sequence = self.sequence[:len(self.germline)]
        if self.quality is not None:
            self.quality = self.quality[:len(self.germline)]
    elif len(self.sequence) < len(self.germline):
        self.sequence += 'N' * (len(self.germline) - len(self.sequence))
        if self.quality is not None:
            self.quality += ' ' * (len(self.germline) - len(self.quality))

    if trim_to is not None:
        old_padding = max(self._pad_len, 0)
        new_prefix = ''.join(
            [c if c == '-' else 'N' for c in self.sequence[:trim_to]])
        self.sequence = new_prefix + self.sequence[trim_to:]
        # Raw string: same pattern, without the invalid ``\-`` escape
        v_start = re.match(r'[N\-]*', self.sequence).span()[1]
        self._pad_len = self.sequence[:v_start].count('N')
        self.v_length -= self._pad_len - old_padding

    # Get the pre-CDR3 germline
    pre_cdr3_germ = self.germline[:self.cdr3_start]
    pre_cdr3_seq = self.sequence[:self.cdr3_start]

    # If there is padding, get rid of it in the sequence and align the
    # germline
    if self._pad_len > 0:
        pre_cdr3_germ = pre_cdr3_germ[self._pad_len:]
        pre_cdr3_seq = pre_cdr3_seq[self._pad_len:]

    # Calculate the pre-CDR3 length and distance
    self.pre_cdr3_length = len(pre_cdr3_seq)
    self.pre_cdr3_match = self.pre_cdr3_length - dnautils.hamming(
        str(pre_cdr3_seq), str(pre_cdr3_germ))

    # Get the length of J after the CDR3
    self.post_cdr3_length = self.j_germlines.upstream_of_cdr3
    # Get the sequence and germline sequences after CDR3
    post_j = j_germ[-self.post_cdr3_length:]
    post_s = self.sequence[-self.post_cdr3_length:]

    # Calculate their match count
    self.post_cdr3_match = self.post_cdr3_length - dnautils.hamming(
        post_j, post_s)

    self.v_match = self.v_length - dnautils.hamming(
        self.germline[:self.cdr3_start],
        self.sequence[:self.cdr3_start])

    self.j_match = self.j_length - dnautils.hamming(
        self.germline[-len(j_germ):],
        self.sequence[-len(j_germ):])
def j_match(self):
    """Return the number of matching positions over the last ``j_length``.

    Computed as ``j_length`` minus the Hamming distance between the
    trailing ``j_length`` characters of ``filled_germline`` and
    ``sequence``.

    :returns: The match count; ``0`` when ``j_length`` is ``0``
    """
    # ``x[-0:]`` is the whole string rather than an empty suffix, which
    # would make the result negative, so handle a zero length explicitly.
    if self.j_length == 0:
        return 0
    return self.j_length - dnautils.hamming(
        self.filled_germline[-self.j_length:],
        self.sequence[-self.j_length:]
    )
def process_j(self, alignment, i, match_len, limit_js):
    """Assign the closest J gene(s) to ``alignment`` for an anchor hit.

    Sets ``j_anchor_pos``, ``j_gene``, ``germline_cdr3``, ``j_length`` and
    ``post_cdr3_length`` on ``alignment``.

    :param alignment: The alignment object being filled in
    :param int i: Position of the J anchor in ``alignment.sequence``
    :param int match_len: Length of the matched anchor
    :param limit_js: If truthy, only genes whose ``name`` is in this
        collection are considered

    :raises AlignmentException: If no J gene is selected, if the germline
        or sequence portion inside the CDR3 is empty, or if the trimmed
        germline J runs past the end of the sequence
    """
    germlines = self.j_germlines
    sequence = alignment.sequence

    # If a match is found, record its location and gene
    alignment.j_anchor_pos = i
    end_of_j = min(alignment.j_anchor_pos + germlines.anchor_len,
                   len(sequence))

    if limit_js:
        candidates = {
            gene: germ for gene, germ in germlines.items()
            if gene.name in limit_js
        }
    else:
        candidates = germlines

    # Keep every gene that ties for the smallest distance
    best_dist = None
    for gene, germ in candidates.items():
        observed = sequence[end_of_j - len(germ):end_of_j]
        dist = dnautils.hamming(observed, germ[:len(observed)])
        if best_dist is None or dist < best_dist:
            best_dist = dist
            alignment.j_gene = {gene}
        elif dist == best_dist:
            alignment.j_gene.add(gene)

    if len(alignment.j_gene) == 0:
        raise AlignmentException('Could not find suitable J anchor')

    # Use the first gene in sorted order as the representative
    ex_j = sorted(alignment.j_gene)[0]
    j_full = germlines[ex_j]

    # Get the portion of the germline J in the CDR3
    germline_in_cdr3 = germlines.get_j_in_cdr3(ex_j)
    cdr3_end_pos = (alignment.j_anchor_pos + germlines.anchor_len -
                    germlines.upstream_of_cdr3)
    cdr3_begin_pos = cdr3_end_pos - len(germline_in_cdr3)
    sequence_in_cdr3 = sequence[cdr3_begin_pos:cdr3_end_pos]
    if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
        alignment.j_gene = set()
        raise AlignmentException('Could not find sequence or germline in '
                                 'CDR3')

    # Get the extent of the J in the CDR3, scanning from the CDR3 end
    streak = dnautils.find_streak_position(
        germline_in_cdr3[::-1], sequence_in_cdr3[::-1],
        self.MISMATCH_THRESHOLD)

    # Trim the J gene based on the extent in the CDR3
    if streak is None:
        alignment.germline_cdr3 = germline_in_cdr3
    else:
        j_full = j_full[len(germline_in_cdr3) - streak:]
        alignment.germline_cdr3 = germline_in_cdr3[-streak:]

    # Find where the full J starts
    j_start = alignment.j_anchor_pos + match_len - len(j_full)

    # If the trimmed germline J extends past the end of the
    # sequence, there is a misalignment
    covered = sequence[j_start:j_start + len(j_full)]
    if len(j_full) != len(covered):
        alignment.j_gene = set()
        raise AlignmentException('Germline extended past end of J')

    alignment.j_length = len(j_full)
    alignment.post_cdr3_length = germlines.upstream_of_cdr3
def process_j(self, alignment, i, match_len, limit_js):
    """Assign the closest J gene(s) to ``alignment`` for an anchor hit.

    Candidates are ranked by Hamming distance divided by the number of
    positions compared; candidates with nothing to compare are skipped.
    Sets ``j_anchor_pos``, ``j_gene``, ``germline_cdr3``, ``j_length`` and
    ``post_cdr3_length`` on ``alignment``.

    :param alignment: The alignment object being filled in
    :param int i: Position of the J anchor in ``alignment.sequence``
    :param int match_len: Length of the matched anchor
    :param limit_js: If truthy, only genes whose ``name`` is in this
        collection are considered

    :raises AlignmentException: If no J gene is selected, if the germline
        or sequence portion inside the CDR3 is empty, or if the trimmed
        germline J runs past the end of the sequence
    """
    germlines = self.j_germlines
    sequence = alignment.sequence

    # If a match is found, record its location and gene
    alignment.j_anchor_pos = i
    end_of_j = min(alignment.j_anchor_pos + germlines.anchor_len,
                   len(sequence))

    if limit_js:
        candidates = {
            gene: germ for gene, germ in germlines.items()
            if gene.name in limit_js
        }
    else:
        candidates = germlines

    # Keep every gene that ties for the smallest normalized distance
    best_dist = None
    for gene, germ in candidates.items():
        observed = sequence[end_of_j - len(germ):end_of_j]
        if not observed:
            # Nothing to compare against this germline
            continue
        dist = (dnautils.hamming(observed, germ[:len(observed)]) /
                len(observed))
        if best_dist is None or dist < best_dist:
            best_dist = dist
            alignment.j_gene = {gene}
        elif dist == best_dist:
            alignment.j_gene.add(gene)

    if len(alignment.j_gene) == 0:
        raise AlignmentException('Could not find suitable J anchor')

    # Use the first gene in sorted order as the representative
    ex_j = sorted(alignment.j_gene)[0]
    j_full = germlines[ex_j]

    # Get the portion of the germline J in the CDR3
    germline_in_cdr3 = germlines.get_j_in_cdr3(ex_j)
    cdr3_end_pos = (alignment.j_anchor_pos + germlines.anchor_len -
                    germlines.upstream_of_cdr3)
    cdr3_begin_pos = cdr3_end_pos - len(germline_in_cdr3)
    sequence_in_cdr3 = sequence[cdr3_begin_pos:cdr3_end_pos]
    if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
        alignment.j_gene = set()
        raise AlignmentException('Could not find sequence or germline in '
                                 'CDR3')

    # Get the extent of the J in the CDR3, scanning from the CDR3 end
    streak = dnautils.find_streak_position(
        germline_in_cdr3[::-1], sequence_in_cdr3[::-1],
        self.MISMATCH_THRESHOLD)

    # Trim the J gene based on the extent in the CDR3
    if streak is None:
        alignment.germline_cdr3 = germline_in_cdr3
    else:
        j_full = j_full[len(germline_in_cdr3) - streak:]
        alignment.germline_cdr3 = germline_in_cdr3[-streak:]

    # Find where the full J starts
    j_start = alignment.j_anchor_pos + match_len - len(j_full)

    # If the trimmed germline J extends past the end of the
    # sequence, there is a misalignment
    covered = sequence[j_start:j_start + len(j_full)]
    if len(j_full) != len(covered):
        alignment.j_gene = set()
        raise AlignmentException('Germline extended past end of J')

    alignment.j_length = len(j_full)
    alignment.post_cdr3_length = germlines.upstream_of_cdr3
def post_cdr3_match(self):
    """Return the number of matching positions after the CDR3.

    Computed as ``post_cdr3_length`` minus the Hamming distance between
    the trailing ``post_cdr3_length`` characters of ``germline`` and
    ``sequence``.

    :returns: The match count; ``0`` when ``post_cdr3_length`` is ``0``
    """
    # ``x[-0:]`` is the whole string rather than an empty suffix, which
    # would make the result negative, so handle a zero length explicitly.
    if self.post_cdr3_length == 0:
        return 0
    return self.post_cdr3_length - dnautils.hamming(
        self.germline[-self.post_cdr3_length:],
        self.sequence[-self.post_cdr3_length:])
def get_distances(self, seqs):
    """Build the symmetric matrix of normalized pairwise distances.

    Entry ``[i, j]`` (and ``[j, i]``) for ``j <= i`` is the Hamming
    distance between ``seqs[i]`` and ``seqs[j]`` divided by
    ``len(seqs[i])``.

    :param seqs: Sized, re-iterable collection of sequences

    :returns: A ``len(seqs)`` x ``len(seqs)`` array of distances
    :rtype: numpy.ndarray
    """
    dists = np.zeros((len(seqs), len(seqs)))
    for i, s1 in enumerate(seqs):
        # Each assignment fills both mirrored cells, so visiting the pairs
        # with j <= i is enough and gives the same final matrix as
        # visiting every (i, j).
        for j, s2 in zip(range(i + 1), seqs):
            dists[i, j] = dists[j, i] = dnautils.hamming(s1, s2) / len(s1)
    return dists
def j_match(self):
    """Return the number of matching positions over the last ``j_length``.

    Computed as ``j_length`` minus the Hamming distance between the
    trailing ``j_length`` characters of ``filled_germline`` and
    ``sequence``.

    :returns: The match count; ``0`` when ``j_length`` is ``0``
    """
    # ``x[-0:]`` is the whole string rather than an empty suffix, which
    # would make the result negative, so handle a zero length explicitly.
    if self.j_length == 0:
        return 0
    return self.j_length - dnautils.hamming(
        self.filled_germline[-self.j_length:],
        self.sequence[-self.j_length:])
def post_cdr3_match(self):
    """Return the number of matching positions after the CDR3.

    Computed as ``post_cdr3_length`` minus the Hamming distance between
    the trailing ``post_cdr3_length`` characters of ``germline`` and
    ``sequence``.

    :returns: The match count; ``0`` when ``post_cdr3_length`` is ``0``
    """
    # ``x[-0:]`` is the whole string rather than an empty suffix, which
    # would make the result negative, so handle a zero length explicitly.
    if self.post_cdr3_length == 0:
        return 0
    return self.post_cdr3_length - dnautils.hamming(
        self.germline[-self.post_cdr3_length:],
        self.sequence[-self.post_cdr3_length:]
    )