Пример #1
0
    def compare(self, other_v, max_extent, max_streak):
        '''Returns the Hamming distance to ``other_v`` over the shared V span.

        Both aligned sequences are truncated to ``max_extent``.  The compared
        region ends inside the CDR3 either where
        ``dnautils.find_streak_position`` reports a streak (minus
        ``max_streak``) or, if no streak is reported, at the end of the
        shorter CDR3.

        :returns: ``(distance, compared_length)``
        :raises AlignmentException: if either CDR3 is empty after alignment,
            or the trimmed sequences are empty or differ in length

        '''
        aligned = self.align(other_v)
        germ = aligned['base'][:max_extent]
        seq = aligned['seq'][:max_extent]
        cdr3_begin = aligned['cdr3_start']

        # Cut both CDR3s down to their common length
        germ_cdr3 = germ[cdr3_begin:]
        seq_cdr3 = seq[cdr3_begin:]
        shared = min(len(germ_cdr3), len(seq_cdr3))
        germ_cdr3, seq_cdr3 = germ_cdr3[:shared], seq_cdr3[:shared]
        if len(germ_cdr3) == 0 or len(seq_cdr3) == 0:
            raise AlignmentException('Empty CDR3 found after alignment')

        # Work out how far the V extends into the CDR3
        streak_pos = dnautils.find_streak_position(germ_cdr3, seq_cdr3,
                                                   max_streak)
        if streak_pos is None:
            # No streak reported: compare through the whole shared CDR3
            v_end = cdr3_begin + min(len(germ_cdr3), len(seq_cdr3))
        else:
            # Stop where the streak of mismatches begins
            v_end = cdr3_begin + (streak_pos - max_streak)

        germ = germ[:v_end]
        seq = seq[:v_end]
        if len(germ) == 0 or len(germ) != len(seq):
            raise AlignmentException('Unequal sequences after alignment')

        return dnautils.hamming(germ, seq), len(seq)
Пример #2
0
    def align_to_germline(self, alignment, avg_len=None, avg_mut=None):
        '''Builds the germline for ``alignment`` and fits the sequence to it.

        The alignment is modified in place: its germline, germline CDR3,
        CDR3 length, J anchor position and sequence offset are updated, and
        the sequence is padded, gapped and trimmed so that it ends up the same
        length as the germline.

        :param alignment: the alignment to update; must already have V and J
            gene calls
        :param avg_len: if given together with ``avg_mut``, the V and J calls
            are first replaced by ``get_ties(...)`` on the germline sets
        :param avg_mut: see ``avg_len``

        :raises AlignmentException: if the CDR3 is shorter than 3 nucleotides
            or ``alignment.cdr3`` does not have the calculated length

        '''
        if avg_len is not None and avg_mut is not None:
            alignment.v_gene = self.v_germlines.get_ties(
                alignment.v_gene, avg_len, avg_mut)
            alignment.j_gene = self.j_germlines.get_ties(
                alignment.j_gene, avg_len, avg_mut)
        # Set the germline to the V gene up to the CDR3
        germ = get_common_seq([self.v_germlines[v] for v in alignment.v_gene],
                              cutoff=False)
        alignment.germline = germ[:CDR3_OFFSET]
        # If we need to pad the sequence, do so, otherwise trim the sequence to
        # the germline length
        if alignment.seq_offset >= 0:
            alignment.sequence.pad(alignment.seq_offset)
        else:
            alignment.sequence.remove_prefix(-alignment.seq_offset)
        # The J anchor moves by the same amount as the sequence start
        alignment.j_anchor_pos += alignment.seq_offset

        # Add germline gaps to sequence before CDR3 and update anchor positions
        for i, c in enumerate(alignment.germline):
            if c == '-':
                alignment.sequence.add_gap(i)
                alignment.j_anchor_pos += 1
                # Gaps before the start of the sequence also shift its offset
                if i < alignment.seq_start:
                    alignment.seq_offset += 1

        j_germ = get_common_seq(
            [self.j_germlines[j] for j in alignment.j_gene], right=True)
        # Calculate the length of the CDR3
        alignment.cdr3_num_nts = (alignment.j_anchor_pos +
                                  self.j_germlines.anchor_len -
                                  self.j_germlines.upstream_of_cdr3 -
                                  alignment.cdr3_start)

        # Germline CDR3: the V portion past CDR3_OFFSET, then gaps, then the
        # germline CDR3 portion already stored on the alignment.  A negative
        # gap count yields an empty filler string.
        v_end = alignment.seq_start + alignment.num_gaps + alignment.v_length
        v_germ = germ[CDR3_OFFSET:v_end]
        alignment.germline_cdr3 = ''.join(
            (v_germ, '-' * (alignment.cdr3_num_nts - len(v_germ) -
                            len(alignment.germline_cdr3)),
             alignment.germline_cdr3))

        if alignment.cdr3_num_nts < 3:
            raise AlignmentException('CDR3 has no AAs')

        alignment.j_anchor_pos += alignment.cdr3_num_nts
        # Fill germline CDR3 with gaps
        alignment.germline += '-' * alignment.cdr3_num_nts
        alignment.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]
        # If the sequence is longer than the germline, trim it; if it is
        # shorter, pad it on the right
        if len(alignment.sequence) > len(alignment.germline):
            alignment.sequence.trim_right(len(alignment.germline))
        elif len(alignment.sequence) < len(alignment.germline):
            alignment.sequence.pad_right(
                len(alignment.germline) - len(alignment.sequence))
        if len(alignment.cdr3) != alignment.cdr3_num_nts:
            raise AlignmentException('Invalid CDR3 length')
Пример #3
0
 def __init__(self, gapped_sequence):
     '''Stores the upper-cased gapped sequence and locates its V anchor.

     :param gapped_sequence: the gapped germline sequence; converted with
         ``str()`` and upper-cased

     :raises AlignmentException: if there is a gap at or after
         ``CDR3_OFFSET``, or ``find_v_position`` yields no anchor for the
         ungapped sequence

     '''
     self._gapped_seq = str(gapped_sequence).upper()
     if self._gapped_seq[CDR3_OFFSET:].count('-') > 0:
         raise AlignmentException('Cannot have gaps after CDR3 start '
                                  '(position {})'.format(CDR3_OFFSET))
     try:
         # Use the builtin next() rather than the Python 2-only
         # generator.next() method so this works on Python 2.6+ and 3.
         self._ungapped_anchor_pos = next(
             find_v_position(self.sequence_ungapped))
     except StopIteration:
         raise AlignmentException('Unable to find anchor')
Пример #4
0
    def _find_j(self):
        '''Finds the location and type of J gene.

        Each anchor is tried, in order, as an exact match on the sequence, an
        exact match on its reverse complement, then via
        ``_check_j_with_missing`` on the sequence and on the reverse
        complement.  When the hit is on the reverse complement the sequence is
        replaced by it and the quality string (if any) is reversed.

        :returns: whatever ``_found_j`` returns for the first hit
        :raises AlignmentException: if no anchor is found

        '''
        # Iterate over every possible J anchor.  For each germline, try its
        # full sequence, then exclude the final 3 characters at a time until
        # there are only MIN_J_ANCHOR_LEN nucleotides remaining.
        #
        # For example, the order for one germline:
        # TGGTCACCGTCTCCTCAG
        # TGGTCACCGTCTCCT
        # TGGTCACCGTCT

        def _switch_to_reverse(rev_comp):
            # Adopt the reverse strand and keep the quality string aligned
            self.sequence = rev_comp
            if self.quality is not None:
                self.quality = self.quality[::-1]

        # The sequence only changes immediately before returning, so its
        # reverse complement is computed at most once instead of per anchor.
        rc = None
        for match, j_gene in self.j_germlines.get_all_anchors(self._force_js):
            i = self.sequence.rfind(match)
            if i >= 0:
                return self._found_j(i, j_gene, match)

            if rc is None:
                rc = str(Seq(self.sequence).reverse_complement())
            i = rc.rfind(match)
            if i >= 0:
                _switch_to_reverse(rc)
                return self._found_j(i, j_gene, match)

            i = self._check_j_with_missing(self.sequence, match)
            if i >= 0:
                return self._found_j(i, j_gene, match)

            i = self._check_j_with_missing(rc, match)
            if i >= 0:
                _switch_to_reverse(rc)
                return self._found_j(i, j_gene, match)
        raise AlignmentException('Could not find J anchor')
Пример #5
0
    def _find_v(self):
        '''Tries each candidate V anchor position until a V gene is set.

        ``_found_v`` is called for every position yielded by
        ``find_v_position``; the search stops as soon as ``self._v`` is set.

        :raises AlignmentException: if ``self._v`` is still ``None`` once all
            positions have been tried

        '''
        for candidate in find_v_position(self.sequence):
            self._found_v(candidate)
            if self._v is not None:
                return

        if self._v is None:
            raise AlignmentException('Could not find suitable V anchor')
Пример #6
0
    def find_v(self, alignment, limit_vs):
        '''Tries each candidate V anchor until the alignment has a V call.

        ``process_v`` is called for every position yielded by
        ``find_v_position``; the search stops once ``alignment.v_gene`` is
        non-empty.

        :raises AlignmentException: if ``alignment.v_gene`` is still empty
            once all positions have been tried

        '''
        for candidate in find_v_position(alignment.sequence.sequence):
            self.process_v(alignment, candidate, limit_vs)
            if len(alignment.v_gene) != 0:
                return

        if len(alignment.v_gene) == 0:
            raise AlignmentException('Could not find suitable V anchor')
Пример #7
0
 def _resolve_gene(line, gene, db):
     '''Looks up the gene named in ``line``, falling back to its family.

     :param line: mapping with ``<gene>GeneName`` and ``<gene>FamilyName``
         keys, where ``<gene>`` is ``gene`` lower-cased
     :param gene: gene type letter used to build the keys (e.g. ``'V'``)
     :param db: germline lookup indexed by ``GeneName``

     :returns: ``(name, germline)`` for the full gene name if it is in
         ``db``, otherwise for the family name
     :raises AlignmentException: if neither name is in ``db``

     '''
     full = _format_gene(line[gene.lower() + 'GeneName'])
     family = _format_gene(line[gene.lower() + 'FamilyName'])
     try:
         return full, db[GeneName(full)]
     except KeyError:
         pass
     # The fallback lookup needs its own handler: previously both branches
     # returned, the final raise was unreachable, and a missing family
     # escaped as a bare KeyError.
     try:
         return family, db[GeneName(family)]
     except KeyError:
         raise AlignmentException('Invalid {} gene: {} / {}'.format(
             gene.upper(), full, family))
Пример #8
0
    def __init__(self, name):
        '''Parses a gene name into base, prefix, family and allele.

        The name is searched for upper-case letters followed by digits, an
        optional run of non-``*`` characters and an optional ``*<digits>``
        allele suffix.  ``self.name`` is replaced by the matched text.

        :raises AlignmentException: if the pattern is not found in ``name``

        '''
        self.name = name
        found = re.search(r'((([A-Z]+)(\d+)([^\*]+)?)(\*(\d+))?)', self.name)
        if found is None:
            raise AlignmentException('Invalid gene name {}'.format(name))

        # Groups: whole match, base, prefix, family, <unused>, <unused>, allele
        (self.name, self.base, self.prefix, self.family,
         _, _, allele) = found.groups()
        self.allele = allele if allele else None
Пример #9
0
def extract_adaptive_sequence(idx, line, v_germlines, j_germlines):
    """Convert one Adaptive-format row into a dict of IMGT-style fields.

    :param idx: row index, used to build ``SEQUENCE_ID``
    :param line: mapping for the row; reads ``aminoAcid``, ``vGeneName``,
        ``vFamilyName``, ``jGeneName``, ``jFamilyName``, ``vIndex``,
        ``nucleotide``, ``cdr3Length`` and one of
        ``count (templates/reads)`` / ``count (templates)``
    :param v_germlines: V germline lookup indexed by ``GeneName``
    :param j_germlines: J germline lookup indexed by ``GeneName``; its
        ``upstream_of_cdr3`` sets the padded J length

    :returns: dict with ``SEQUENCE_ID``, ``SEQUENCE_IMGT``, ``V_CALL``,
        ``J_CALL``, ``JUNCTION_LENGTH``, ``V_SEQ_LENGTH`` and ``DUPCOUNT``
    :raises AlignmentException: if no amino acids are given or a gene cannot
        be resolved by full or family name

    """
    def _format_gene(g):
        # Normalise the row's gene naming (drop zero padding, TCRB -> TRB)
        # and default to allele *01 when none is given.
        g = g.replace('V0', 'V').replace('J0', 'J').replace('-0', '-')
        g = g.replace('TCRB', 'TRB')
        if '*' not in g:
            return g + '*01'
        return g

    def _resolve_gene(line, gene, db):
        # Try the full gene name first, then fall back to the family name.
        full = _format_gene(line[gene.lower() + 'GeneName'])
        family = _format_gene(line[gene.lower() + 'FamilyName'])
        try:
            return full, db[GeneName(full)]
        except KeyError:
            pass
        # The fallback needs its own handler: previously both branches
        # returned, the final raise was unreachable, and a missing family
        # escaped as a bare KeyError.
        try:
            return family, db[GeneName(family)]
        except KeyError:
            raise AlignmentException('Invalid {} gene: {} / {}'.format(
                gene.upper(), full, family))

    if not line['aminoAcid']:
        raise AlignmentException('No amino-acids provided')

    v_gene, v_germ = _resolve_gene(line, 'V', v_germlines)
    j_gene, j_germ = _resolve_gene(line, 'J', j_germlines)

    v_end = int(line['vIndex'])
    cdr3_len = int(line['cdr3Length'])
    nucleotides = line['nucleotide']

    # Left-pad the V region with N up to CDR3_OFFSET (no padding if the
    # count is negative), then overwrite with '-' wherever the germline is
    # gapped.
    v_region = list('N' * (CDR3_OFFSET - v_end) + nucleotides[:v_end])
    for i, germ_nt in enumerate(v_germ):
        if germ_nt == '-':
            v_region[i] = '-'
    v_region = ''.join(v_region)

    cdr3_region = nucleotides[v_end:v_end + cdr3_len]
    j_region = nucleotides[v_end + cdr3_len:]
    # Right-pad the J region with N up to the expected post-CDR3 length
    j_region = j_region + ('N' *
                           (j_germlines.upstream_of_cdr3 - len(j_region)))
    imgt_sequence = v_region + cdr3_region + j_region

    # The count column name differs between export versions
    try:
        counts = line['count (templates/reads)']
    except KeyError:
        counts = line['count (templates)']

    return {
        'SEQUENCE_ID': 'seq_{}'.format(idx),
        'SEQUENCE_IMGT': imgt_sequence,
        'V_CALL': v_gene,
        'J_CALL': j_gene,
        'JUNCTION_LENGTH': line['cdr3Length'],
        'V_SEQ_LENGTH': v_end,
        'DUPCOUNT': counts
    }
Пример #10
0
def create_alignment(seq, line, v_germlines, j_germlines):
    """Build a ``VDJAlignment`` for ``seq`` from pre-computed gene calls.

    :param seq: the gapped sequence to wrap
    :param line: mapping providing ``V_CALL`` and ``J_CALL`` (comma-separated
        gene names), ``JUNCTION_LENGTH`` and ``V_SEQ_LENGTH``
    :param v_germlines: V germline lookup indexed by ``GeneName``
    :param j_germlines: J germline lookup indexed by ``GeneName``; its
        ``upstream_of_cdr3`` gives the post-CDR3 J length

    :returns: the populated alignment
    :raises AlignmentException: if none of the called V genes or none of the
        called J genes is in the germline sets, or the germline and sequence
        still differ in length after padding

    """
    alignment = VDJAlignment(seq)
    alignment.v_gene = {GeneName(g) for g in line['V_CALL'].split(',')}
    alignment.j_gene = {GeneName(g) for g in line['J_CALL'].split(',')}
    alignment.cdr3_num_nts = int(line['JUNCTION_LENGTH'])
    alignment.v_length = int(line['V_SEQ_LENGTH'])
    # Offset = number of leading gap characters.  Raw string so that ``\-``
    # is not treated as an (invalid) string escape sequence.
    alignment.seq_offset = re.match(
        r'[\-]*', alignment.sequence.sequence).end()

    # TODO: Calculate these
    # NOTE: this overrides the V_SEQ_LENGTH value assigned above
    alignment.v_length = CDR3_OFFSET - seq[:CDR3_OFFSET].count('-')
    alignment.j_length = j_germlines.upstream_of_cdr3

    germ_v = [v_germlines[g] for g in alignment.v_gene if g in v_germlines]
    germ_j = [j_germlines[g] for g in alignment.j_gene if g in j_germlines]
    if len(germ_v) == 0 or len(germ_j) == 0:
        raise AlignmentException('Missing germlines: V={} J={}'.format(
            ','.join(str(v) for v in alignment.v_gene),
            ','.join(str(j) for j in alignment.j_gene)))

    germ_v = get_common_seq(germ_v)
    germ_j = get_common_seq(germ_j)

    # Germline CDR3: V, gap filler and J joined to the sequence length, then
    # sliced to the CDR3 window
    alignment.germline_cdr3 = ''.join(
        (germ_v, '-' * (len(alignment.sequence) - len(germ_v) - len(germ_j)),
         germ_j))[CDR3_OFFSET:CDR3_OFFSET + alignment.cdr3_num_nts]

    # Full germline: V up to the CDR3, a fully gapped CDR3, then the tail of J
    alignment.germline = ''.join([
        germ_v[:CDR3_OFFSET], '-' * alignment.cdr3_num_nts,
        germ_j[-j_germlines.upstream_of_cdr3:]
    ])

    alignment.sequence.pad_right(
        len(alignment.germline) - len(alignment.sequence.sequence))

    if len(alignment.germline) != len(alignment.sequence.sequence):
        raise AlignmentException('Sequence and germline differ in size')
    return alignment
Пример #11
0
 def validate(self, alignment):
     '''Runs every validity check on ``alignment`` in a fixed order.

     The checks are: minimum V similarity, number of V ties, padding,
     cross-family V calls, indels and CDR3 length.  The first failing check
     raises; later checks are not run.

     :raises AlignmentException: describing the first failed check

     '''
     if not self.valid_min_similarity(alignment):
         identity = alignment.v_match / float(alignment.v_length)
         message = 'V-identity too low {} < {}'.format(
             identity, self.min_similarity)
         raise AlignmentException(message)

     if not self.valid_v_ties(alignment):
         message = 'Too many V-ties {} > {}'.format(
             len(alignment.v_gene), self.max_v_ties)
         raise AlignmentException(message)

     if not self.valid_padding(alignment):
         message = 'Too much padding {} (max {})'.format(
             alignment.seq_start, self.max_padding)
         raise AlignmentException(message)

     if not self.valid_families(alignment):
         raise AlignmentException('Cross-family V-call')

     if not self.valid_indels(alignment):
         message = 'Too many indels insertions={} deletions={}'.format(
             alignment.insertions, alignment.deletions)
         raise AlignmentException(message)

     if not self.validate_cdr3(alignment):
         message = 'CDR3 too short {}'.format(alignment.cdr3_num_nts)
         raise AlignmentException(message)
Пример #12
0
def parse_airr(line, v_germlines, j_germlines):
    """Build an alignment proxy from one AIRR-format row.

    :param line: mapping for the row; reads (among others) ``sequence_id``,
        ``sequence_alignment``, ``germline_alignment``, ``rev_comp``,
        ``v_call``, ``j_call``, ``junction_aa``, ``cdr3``, ``cdr3_start``,
        ``v_sequence_start``, ``v_germline_start``, ``j_germline_end`` and
        the V/J identity and alignment bounds
    :param v_germlines: V germline set; ``get_ties`` is called with the list
        of V calls
    :param j_germlines: J germline set; ``get_ties`` is called with the list
        of J calls

    :returns: a ``funcs.ClassProxy`` wrapping the populated ``VDJAlignment``
    :raises AlignmentException: if the V call, J call or junction is missing,
        the V gene lookup raises ``KeyError``, or the computed CDR3 start is
        not ``CDR3_OFFSET`` plus the number of V insertions

    """
    seq = VDJSequence(
        seq_id=line['sequence_id'].replace('reversed|', ''),
        sequence=line['sequence_alignment'],
        rev_comp=line['rev_comp'] == 'T',
    )
    if not all([line['v_call'], line['j_call'], line['junction_aa']]):
        raise AlignmentException(seq, 'Missing v_gene, j_gene, or junction_aa')

    # Left-pad the sequence for the germline positions before the alignment
    # starts (v_germline_start is 1-based)
    seq.pad(int(line['v_germline_start']) - 1)
    try:
        v_germ_seq = v_germlines.get_ties(line['v_call'].split(','))
    except KeyError:
        raise AlignmentException(
            seq,
            'V-gene {} not in germline database'.format(line['v_call'])
        )

    # Prepend the ungapped germline prefix that the row's germline alignment
    # does not cover
    aligned_germ = ''.join([
        v_germ_seq.replace('-', '')[:int(line['v_germline_start']) - 1],
        line['germline_alignment']
    ])
    # Append the missing portion, if any, of the J to the germline
    j_germ_seq = j_germlines.get_ties(line['j_call'].split(','))
    append_j = len(j_germ_seq) - int(line['j_germline_end'])
    if append_j > 0:
        aligned_germ += j_germ_seq[-append_j:]
        seq.pad_right(append_j)

    # Apply the same IMGT gapping to both the sequence and the germline
    aligned_seq, gaps_added = add_imgt_gaps(v_germ_seq, seq)
    aligned_germ = add_imgt_gaps(
        v_germ_seq, VDJSequence('', aligned_germ)
    )[0].sequence
    cdr3_start = int(line['cdr3_start']) - int(line['v_sequence_start'])
    # Push the start of the CDR3 based on number of IMGT gaps added.  Then
    # move it back by 3 because IgBLAST's CDR3 excludes the preserved Cysteine
    cdr3_start += gaps_added - 3
    cdr3_start += aligned_seq.sequence[:cdr3_start].count('-')
    cdr3_start += int(line['v_germline_start']) - 1
    # 6 extra nucleotides beyond the row's CDR3 length
    # NOTE(review): presumably the two conserved junction codons - confirm
    cdr3_end = cdr3_start + len(line['cdr3']) + 6
    # If there is an insertion in the CDR3 but not junction, increase CDR3
    # length
    junction_insertions = aligned_germ[cdr3_end - 3:cdr3_end].count('-')
    cdr3_end += junction_insertions
    cdr3_seq = aligned_seq.sequence[cdr3_start:cdr3_end]

    germline_cdr3 = aligned_germ[cdr3_start:cdr3_end]
    # Mask the germline CDR3 with '.' so it can be told apart from real gaps
    # until both are turned into '-' below
    aligned_germ = ''.join([
        aligned_germ[:cdr3_start],
        '.' * (cdr3_end - cdr3_start),
        aligned_germ[cdr3_end:]
    ])
    # From here on aligned_seq is a plain string, not a VDJSequence
    aligned_seq = ''.join([
        aligned_seq.sequence[:cdr3_start],
        cdr3_seq,
        aligned_seq.sequence[cdr3_end:]
    ])

    # Sanity check: the CDR3 must start at the fixed offset, shifted by the
    # number of gaps in the V germline alignment
    total_insertions = line['v_germline_alignment'].count('-')
    correct_cdr3_start = CDR3_OFFSET + total_insertions
    if cdr3_start != correct_cdr3_start:
        raise AlignmentException(
            seq, 'CDR3 starts at {} instead of {} ({} insertions)'.format(
                cdr3_start, correct_cdr3_start, total_insertions))

    alignment = funcs.ClassProxy(VDJAlignment(
        VDJSequence(line['sequence_id'], aligned_seq.replace('.', '-'))
    ))
    alignment.germline = aligned_germ.replace('.', '-')
    alignment.v_gene = set([GeneName(c) for c in line['v_call'].split(',')])
    alignment.j_gene = set([GeneName(c) for c in line['j_call'].split(',')])
    alignment.cdr3_start = cdr3_start
    alignment.cdr3_num_nts = len(cdr3_seq)
    alignment.locally_aligned = True
    alignment.germline_cdr3 = germline_cdr3
    alignment.seq_offset = int(line['v_germline_start']) - 1
    alignment.v_length = int(line['v_alignment_end'])
    alignment.j_length = (int(line['j_alignment_end']) -
                          int(line['j_alignment_start']))
    # v_identity is a percentage; store the mutated fraction in [0, 1]
    alignment.v_mutation_fraction = (100 - float(line['v_identity'])) / 100
    # Skipping the germline_cdr3 field and instead populating its dependencies
    # via the proxy
    alignment.j_match = float(line['j_identity']) * alignment.j_length / 100
    alignment.post_cdr3_length = len(alignment.sequence.sequence) - cdr3_end
    # Gaps in the germline are recorded as insertions, gaps in the sequence
    # as deletions
    alignment.insertions = funcs.gap_positions(aligned_germ)
    alignment.deletions = funcs.gap_positions(aligned_seq)

    return alignment
Пример #13
0
    def analyze(self):
        '''Validates the sequence alphabet, then locates the J and V genes.

        :raises AlignmentException: if the sequence contains a character
            other than A, T, C, G or N

        '''
        only_valid_chars = all(nt in 'ATCGN' for nt in self.sequence)
        if not only_valid_chars:
            raise AlignmentException('Invalid characters in sequence.')

        # J is located first, then V
        self._find_j()
        self._find_v()
Пример #14
0
    def align_to_germline(self, avg_len=None, avg_mut=None, trim_to=None):
        '''Builds the germline and fits the sequence and quality to it.

        Updates, in place: the V/J calls (when ties are recomputed), the
        germline, the padded/gapped/trimmed sequence and quality, the J
        anchor position, the CDR3 length, and the pre-CDR3, post-CDR3, V and
        J match counts.

        :param avg_len: if given together with ``avg_mut``, the V and J calls
            are first replaced by ``get_ties(...)`` on the germline sets
        :param avg_mut: see ``avg_len``
        :param trim_to: if given, every non-gap character in the first
            ``trim_to`` positions of the sequence is replaced with ``N`` and
            the padding length and V length are adjusted

        :raises AlignmentException: if the CDR3 is shorter than 3 nucleotides

        '''
        if avg_len is not None and avg_mut is not None:
            self._v = self.v_germlines.get_ties(self.v_gene, avg_len, avg_mut)
            self._j = self.j_germlines.get_ties(self.j_gene, avg_len, avg_mut)
        # Set the germline to the V gene up to the CDR3
        self.germline = get_common_seq([self.v_germlines[v]
                                        for v in self._v])[:CDR3_OFFSET]
        # If we need to pad the sequence, do so, otherwise trim the sequence to
        # the germline length
        if self._pad_len >= 0:
            self.sequence = 'N' * self._pad_len + str(self.sequence)
            if self.quality is not None:
                self.quality = (' ' * self._pad_len) + self.quality
        else:
            # Negative padding: keep the removed prefix and drop it from the
            # sequence and quality
            self.removed_prefix = self.sequence[:-self._pad_len]
            self.sequence = str(self.sequence[-self._pad_len:])
            if self.quality is not None:
                self.removed_prefix_qual = self.quality[:-self._pad_len]
                self.quality = self.quality[-self._pad_len:]
        # Update the anchor positions after adding padding / trimming
        self.j_anchor_pos += self._pad_len

        # Add germline gaps to sequence before CDR3 and update anchor positions
        for i, c in enumerate(self.germline):
            if c == '-':
                self.sequence = self.sequence[:i] + '-' + self.sequence[i:]
                if self.quality is not None:
                    self.quality = self.quality[:i] + ' ' + self.quality[i:]
                self.j_anchor_pos += 1

        # Common J sequence computed on the reversed germlines, then
        # reversed back
        j_germ = get_common_seq(
            map(reversed, [self.j_germlines[j] for j in self.j_gene]))
        j_germ = ''.join(reversed(j_germ))
        # Calculate the length of the CDR3
        self._cdr3_len = (self.j_anchor_pos + self.j_germlines.anchor_len -
                          self.j_germlines.upstream_of_cdr3 - self.cdr3_start)

        if self._cdr3_len < 3:
            # The message has no placeholder, so the former no-op .format()
            # call was dropped; the text is unchanged.
            raise AlignmentException('CDR3 has no AAs')

        self.j_anchor_pos += self._cdr3_len
        # Fill germline CDR3 with gaps
        self.germline += '-' * self._cdr3_len
        self.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]
        # If the sequence is longer than the germline, trim it
        if len(self.sequence) > len(self.germline):
            self.sequence = self.sequence[:len(self.germline)]
            if self.quality is not None:
                self.quality = self.quality[:len(self.germline)]
        elif len(self.sequence) < len(self.germline):
            self.sequence += 'N' * (len(self.germline) - len(self.sequence))
            if self.quality is not None:
                self.quality += ' ' * (len(self.germline) - len(self.quality))

        if trim_to is not None:
            old_padding = max(self._pad_len, 0)
            # Mask the first trim_to positions with N, keeping gaps
            new_prefix = ''.join(
                [c if c == '-' else 'N' for c in self.sequence[:trim_to]])
            self.sequence = new_prefix + self.sequence[trim_to:]
            # Raw string so that ``\-`` is not treated as an (invalid) string
            # escape sequence; the pattern itself is unchanged.
            v_start = re.match(r'[N\-]*', self.sequence).span()[1]
            self._pad_len = self.sequence[:v_start].count('N')
            self.v_length -= self._pad_len - old_padding

        # Get the pre-CDR3 germline
        pre_cdr3_germ = self.germline[:self.cdr3_start]
        pre_cdr3_seq = self.sequence[:self.cdr3_start]

        # If there is padding, get rid of it in the sequence and align the
        # germline
        if self._pad_len > 0:
            pre_cdr3_germ = pre_cdr3_germ[self._pad_len:]
            pre_cdr3_seq = pre_cdr3_seq[self._pad_len:]

        # Calculate the pre-CDR3 length and distance
        self.pre_cdr3_length = len(pre_cdr3_seq)
        self.pre_cdr3_match = self.pre_cdr3_length - dnautils.hamming(
            str(pre_cdr3_seq), str(pre_cdr3_germ))

        # Get the length of J after the CDR3
        self.post_cdr3_length = self.j_germlines.upstream_of_cdr3
        # Get the sequence and germline sequences after CDR3
        post_j = j_germ[-self.post_cdr3_length:]
        post_s = self.sequence[-self.post_cdr3_length:]

        # Calculate their match count
        self.post_cdr3_match = self.post_cdr3_length - dnautils.hamming(
            post_j, post_s)

        self.v_match = self.v_length - dnautils.hamming(
            self.germline[:self.cdr3_start], self.sequence[:self.cdr3_start])

        self.j_match = self.j_length - dnautils.hamming(
            self.germline[-len(j_germ):], self.sequence[-len(j_germ):])
Пример #15
0
def read_file(session, handle, sample, v_germlines, j_germlines, columns,
              remaps):
    """Align every collapsed sequence in a tab-separated file.

    Sequences that cannot be aligned are stored through ``add_as_noresult``;
    the rest are de-duplicated on their aligned sequence and stored through
    ``add_uniques``.

    :param session: database session; committed every 1000 sequences and at
        the end
    :param handle: open file handle of the tab-separated input
    :param sample: sample the sequences belong to; receives the average
        mutation fraction and V length
    :param v_germlines: V germline set
    :param j_germlines: J germline set
    :param columns: column configuration; reads ``v_gene``, ``j_gene``,
        ``full_sequence``, ``ties``, ``trim_to`` and ``max_padding``
    :param remaps: optional mapping of J gene name prefix to replacement
        name, or ``None``

    """
    seqs = _collapse_seqs(session, sample,
                          csv.DictReader(handle, delimiter='\t'), columns)

    aligned_seqs = {}
    missed = 0
    # Number of sequences processed so far (a count, not the last index), so
    # the summary below is right for empty input too
    total = 0
    for idx, seq in enumerate(seqs):
        total = idx + 1
        if idx > 0 and idx % 1000 == 0:
            logger.info('Finished {}'.format(idx))
            session.commit()

        orig_v_genes = set(
            re.findall('IGHV[^ ,]+', seq['record'][columns.v_gene]))
        orig_j_genes = set(
            re.findall('IGHJ[^ ,]+', seq['record'][columns.j_gene]))
        if remaps is not None:
            remapped_j_genes = set()
            for j in orig_j_genes:
                # items() rather than the Python 2-only iteritems()
                for remap_from, remap_to in remaps.items():
                    if j.startswith(remap_from):
                        remapped_j_genes.add(remap_to)
                        break
                else:
                    # No remap prefix matched: keep the gene as it is
                    remapped_j_genes.add(j)
            orig_j_genes = remapped_j_genes

        # Real lists (not filter objects) because len() is taken below
        v_genes = [v for v in orig_v_genes if v in v_germlines]
        j_genes = [j for j in orig_j_genes if j in j_germlines]

        vdj = VDJSequence(seq['seq_ids'],
                          seq['record'][columns.full_sequence],
                          v_germlines,
                          j_germlines,
                          force_vs=v_genes,
                          force_js=j_genes)
        try:
            if len(v_genes) == 0:
                raise AlignmentException('No valid V germline for {}'.format(
                    ','.join(sorted(orig_v_genes))))
            if len(j_genes) == 0:
                raise AlignmentException('No valid J germline for {}'.format(
                    ','.join(sorted(orig_j_genes))))
            vdj.analyze()

            # Merge the ids of identical aligned sequences
            if vdj.sequence in aligned_seqs:
                aligned_seqs[vdj.sequence].ids += vdj.ids
            else:
                aligned_seqs[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
            missed += 1
    logger.info('Aligned {} / {} sequences'.format(total - missed, total))

    logger.info('Collapsing ambiguous character sequences')
    if len(aligned_seqs) > 0:
        avg_mut = sum([v.mutation_fraction for v in aligned_seqs.values()
                       ]) / float(len(aligned_seqs))
        avg_len = sum([v.v_length for v in aligned_seqs.values()]) / float(
            len(aligned_seqs))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        if columns.ties:
            add_uniques(session,
                        sample,
                        aligned_seqs.values(),
                        realign_mut=avg_mut,
                        realign_len=avg_len,
                        trim_to=columns.trim_to,
                        max_padding=columns.max_padding)
        else:
            add_uniques(session, sample, aligned_seqs.values())
    session.commit()
Пример #16
0
    def process_j(self, alignment, i, match_len, limit_js):
        '''Records the J anchor on ``alignment`` and picks its J gene(s).

        Every candidate J germline is compared, by Hamming distance, with the
        stretch of sequence ending at the J anchor; all genes tied for the
        lowest distance become the J call.  The germline portion of the J
        inside the CDR3 and the J length are then set on the alignment.

        :param alignment: alignment to update in place
        :param i: position of the J anchor in the sequence
        :param match_len: length of the anchor match
        :param limit_js: if truthy, only germlines whose ``name`` is in this
            collection are considered

        :raises AlignmentException: if no J gene is selected, the sequence or
            germline portion within the CDR3 is empty, or the trimmed
            germline J runs past the end of the sequence

        '''
        alignment.j_anchor_pos = i
        j_end = min(alignment.j_anchor_pos + self.j_germlines.anchor_len,
                    len(alignment.sequence))

        if limit_js:
            candidates = {
                gene: germ
                for gene, germ in self.j_germlines.items()
                if gene.name in limit_js
            }
        else:
            candidates = self.j_germlines

        # Keep every gene that ties for the lowest distance
        lowest = None
        for gene, germ in candidates.items():
            window = alignment.sequence[j_end - len(germ):j_end]
            mismatches = dnautils.hamming(window, germ[:len(window)])
            if lowest is None or mismatches < lowest:
                lowest = mismatches
                alignment.j_gene = {gene}
            elif mismatches == lowest:
                alignment.j_gene.add(gene)

        if len(alignment.j_gene) == 0:
            raise AlignmentException('Could not find suitable J anchor')

        # Use the first gene (in sorted order) as the representative J
        representative = sorted(alignment.j_gene)[0]
        j_full = self.j_germlines[representative]

        # Germline and sequence portions of the J that lie inside the CDR3
        germ_in_cdr3 = self.j_germlines.get_j_in_cdr3(representative)
        cdr3_end = (alignment.j_anchor_pos + self.j_germlines.anchor_len -
                    self.j_germlines.upstream_of_cdr3)
        seq_in_cdr3 = alignment.sequence[cdr3_end - len(germ_in_cdr3):cdr3_end]
        if len(germ_in_cdr3) == 0 or len(seq_in_cdr3) == 0:
            alignment.j_gene = set()
            raise AlignmentException('Could not find sequence or germline in '
                                     'CDR3')

        # Measure the extent of the J in the CDR3, scanning from the J end
        streak = dnautils.find_streak_position(germ_in_cdr3[::-1],
                                               seq_in_cdr3[::-1],
                                               self.MISMATCH_THRESHOLD)

        if streak is None:
            alignment.germline_cdr3 = germ_in_cdr3
        else:
            # Trim the J down to its extent inside the CDR3
            j_full = j_full[len(germ_in_cdr3) - streak:]
            alignment.germline_cdr3 = germ_in_cdr3[-streak:]

        # Where the (possibly trimmed) J starts in the sequence
        j_start = alignment.j_anchor_pos + match_len - len(j_full)

        # A shorter slice means the germline J runs past the sequence end
        covered = alignment.sequence[j_start:j_start + len(j_full)]
        if len(covered) != len(j_full):
            alignment.j_gene = set()
            raise AlignmentException('Germline extended past end of J')

        alignment.j_length = len(j_full)
        alignment.post_cdr3_length = self.j_germlines.upstream_of_cdr3
Пример #17
0
 def _verify_type(self, other_v):
     '''Ensures ``other_v`` has exactly the same type as this object.

     :raises AlignmentException: if the two types differ (subclasses count
         as different)

     '''
     mismatch = type(other_v) != type(self)
     if not mismatch:
         return
     expected = self.__class__.__name__
     raise AlignmentException(
         'Must compare to instance of {}'.format(expected))
Пример #18
0
    def _found_j(self, i, j_gene, match):
        '''Records a J anchor hit and selects the best-matching J gene(s).

        Every candidate J germline is compared, by Hamming distance, with the
        stretch of sequence ending at the J anchor; all genes tied for the
        lowest distance are kept in ``self._j``.

        :param i: position of the anchor in the sequence
        :param j_gene: gene the anchor came from (not used for the selection)
        :param match: the anchor text that matched

        :raises AlignmentException: if no J gene is selected, the sequence or
            germline portion within the CDR3 is empty, or the trimmed
            germline J runs past the end of the sequence

        '''
        # If a match is found, record its location and gene
        self.j_anchor_pos = i
        self.j_anchor_len = len(match)
        end_of_j = min(self.j_anchor_pos + self.j_germlines.anchor_len,
                       len(self.sequence))
        best_dist = None
        self._j = []
        if self._force_js:
            j_germs = {
                k: v
                for k, v in self.j_germlines.iteritems() if k in self._force_js
            }
        else:
            j_germs = self.j_germlines
        # Keep every gene that ties for the lowest distance
        for candidate, j_seq in j_germs.iteritems():
            seq_j = self.sequence[end_of_j - len(j_seq):end_of_j]
            dist = dnautils.hamming(seq_j, j_seq[:len(seq_j)])
            if best_dist is None or dist < best_dist:
                best_dist = dist
                self._j = set([candidate])
            elif dist == best_dist:
                self._j.add(candidate)

        # self._j starts as an empty list, so the previous ``is None`` test
        # could never fire; check for "no candidates" instead and reset the
        # call like the other failure paths do.
        if not self._j:
            self._j = None
            raise AlignmentException('Could not find suitable J anchor')

        # Get the full germline J gene
        j_full = self.j_germlines[self.j_gene[0]]

        # Get the portion of the germline J in the CDR3
        germline_in_cdr3 = self.j_germlines.get_j_in_cdr3(self.j_gene[0])
        cdr3_end_pos = (self.j_anchor_pos + self.j_germlines.anchor_len -
                        self.j_germlines.upstream_of_cdr3)
        sequence_in_cdr3 = self.sequence[cdr3_end_pos -
                                         len(germline_in_cdr3):cdr3_end_pos]
        if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
            self._j = None
            raise AlignmentException('Could not find sequence or germline in '
                                     'CDR3')

        # Get the extent of the J in the CDR3
        streak = find_streak_position(reversed(germline_in_cdr3),
                                      reversed(sequence_in_cdr3),
                                      self.MISMATCH_THRESHOLD)

        # Trim the J gene based on the extent in the CDR3
        if streak is not None:
            j_full = j_full[len(germline_in_cdr3) - streak:]

        # Find where the full J starts
        self._j_start = self.j_anchor_pos + len(match) - len(j_full)

        # If the trimmed germline J extends past the end of the
        # sequence, there is a misalignment
        if len(j_full) != len(
                self.sequence[self._j_start:self._j_start + len(j_full)]):
            self._j = None
            self.j_anchor_pos = None
            raise AlignmentException('Germline extended past end of J')

        self.j_length = len(j_full)