Exemplo n.º 1
0
    def align_to_germline(self, alignment, avg_len=None, avg_mut=None):
        """Rebuild ``alignment``'s germline and reshape its sequence to match.

        The alignment is modified in place.  Its germline becomes the
        pre-CDR3 part of the common V germline, followed by one gap per CDR3
        nucleotide, followed by the tail of the common J germline.  The
        sequence is padded or prefix-trimmed by ``seq_offset``, receives a gap
        wherever the pre-CDR3 germline has one, and is finally trimmed or
        right-padded according to the germline length.

        :param alignment: alignment object to update in place
        :param avg_len: forwarded to ``get_ties``; ties are only resolved
            when both ``avg_len`` and ``avg_mut`` are not ``None``
        :param avg_mut: forwarded to ``get_ties`` (see ``avg_len``)
        :raises AlignmentException: if the computed CDR3 length is below 3
        """
        resolve_ties = avg_len is not None and avg_mut is not None
        if resolve_ties:
            alignment.v_gene = self.v_germlines.get_ties(
                alignment.v_gene, avg_len, avg_mut)
            alignment.j_gene = self.j_germlines.get_ties(
                alignment.j_gene, avg_len, avg_mut)

        # The germline starts out as the common V sequence up to the CDR3.
        v_sequences = [self.v_germlines[v] for v in alignment.v_gene]
        v_common = get_common_seq(v_sequences, cutoff=False)
        alignment.germline = v_common[:CDR3_OFFSET]

        # A negative offset means leading bases must be removed; otherwise
        # the sequence is padded by the offset.
        if alignment.seq_offset < 0:
            alignment.sequence.remove_prefix(-alignment.seq_offset)
        else:
            alignment.sequence.pad(alignment.seq_offset)
        alignment.j_anchor_pos += alignment.seq_offset

        # Mirror every pre-CDR3 germline gap in the sequence.  Each inserted
        # gap moves the J anchor one position to the right, and gaps located
        # before the start of the sequence also increase its offset.
        for pos, base in enumerate(alignment.germline):
            if base != '-':
                continue
            alignment.sequence.add_gap(pos)
            alignment.j_anchor_pos += 1
            if pos < alignment.seq_start:
                alignment.seq_offset += 1

        j_sequences = [self.j_germlines[j] for j in alignment.j_gene]
        j_common = get_common_seq(j_sequences, right=True)

        # CDR3 length derived from the J anchor position and the CDR3 start.
        cdr3_end = (alignment.j_anchor_pos + self.j_germlines.anchor_len -
                    self.j_germlines.upstream_of_cdr3)
        alignment.cdr3_num_nts = cdr3_end - alignment.cdr3_start

        # New germline CDR3: V germline bases past CDR3_OFFSET, gap filler,
        # then whatever germline_cdr3 held before this call.
        v_end = alignment.seq_start + alignment.num_gaps + alignment.v_length
        v_in_cdr3 = v_common[CDR3_OFFSET:v_end]
        filler_len = (alignment.cdr3_num_nts - len(v_in_cdr3) -
                      len(alignment.germline_cdr3))
        alignment.germline_cdr3 = ''.join([
            v_in_cdr3,
            '-' * filler_len,
            alignment.germline_cdr3,
        ])

        if alignment.cdr3_num_nts < 3:
            raise AlignmentException('CDR3 has no AAs')

        alignment.j_anchor_pos += alignment.cdr3_num_nts
        # The CDR3 region of the germline consists solely of gaps; the J
        # tail follows it.
        alignment.germline += '-' * alignment.cdr3_num_nts
        alignment.germline += j_common[-self.j_germlines.upstream_of_cdr3:]

        # Reconcile the sequence length with the germline length.
        seq_len = len(alignment.sequence)
        germline_len = len(alignment.germline)
        if seq_len > germline_len:
            alignment.sequence.trim_right(germline_len)
        elif seq_len < germline_len:
            alignment.sequence.pad_right(germline_len - seq_len)
Exemplo n.º 2
0
    def align_to_germline(self, alignment, avg_len=None, avg_mut=None):
        """Align ``alignment`` to its V and J germlines, updating it in place.

        After the call the alignment's germline consists of the common V
        germline before the CDR3, a run of gaps spanning the CDR3 and the end
        of the common J germline.  The sequence is shifted by ``seq_offset``
        (padding or prefix removal), gapped wherever the pre-CDR3 germline is
        gapped, and then trimmed or right-padded based on the germline length.

        :param alignment: the alignment to modify
        :param avg_len: handed to ``get_ties``; tie lookup is skipped unless
            both ``avg_len`` and ``avg_mut`` are given
        :param avg_mut: handed to ``get_ties`` (see ``avg_len``)
        :raises AlignmentException: when the CDR3 is shorter than 3 nts
        """
        if not (avg_len is None or avg_mut is None):
            alignment.v_gene = self.v_germlines.get_ties(
                alignment.v_gene, avg_len, avg_mut)
            alignment.j_gene = self.j_germlines.get_ties(
                alignment.j_gene, avg_len, avg_mut)

        # Germline prefix: common V germline truncated at the CDR3 offset.
        v_consensus = get_common_seq(
            [self.v_germlines[gene] for gene in alignment.v_gene],
            cutoff=False,
        )
        alignment.germline = v_consensus[:CDR3_OFFSET]

        # Shift the sequence: strip a prefix for negative offsets, pad
        # otherwise.  The J anchor moves by the same amount.
        if alignment.seq_offset < 0:
            alignment.sequence.remove_prefix(-alignment.seq_offset)
        else:
            alignment.sequence.pad(alignment.seq_offset)
        alignment.j_anchor_pos += alignment.seq_offset

        # Copy germline gaps into the sequence, tracking the anchor shift.
        for index, symbol in enumerate(alignment.germline):
            if symbol != '-':
                continue
            alignment.sequence.add_gap(index)
            alignment.j_anchor_pos += 1
            if index < alignment.seq_start:
                alignment.seq_offset += 1

        j_consensus = get_common_seq(
            [self.j_germlines[gene] for gene in alignment.j_gene],
            right=True,
        )

        # Length of the CDR3 in nucleotides.
        anchor_end = (alignment.j_anchor_pos + self.j_germlines.anchor_len -
                      self.j_germlines.upstream_of_cdr3)
        alignment.cdr3_num_nts = anchor_end - alignment.cdr3_start

        # Germline CDR3 = V bases beyond CDR3_OFFSET + gaps + the previous
        # germline_cdr3 value.
        v_end = alignment.seq_start + alignment.num_gaps + alignment.v_length
        v_tail = v_consensus[CDR3_OFFSET:v_end]
        gap_count = (alignment.cdr3_num_nts - len(v_tail) -
                     len(alignment.germline_cdr3))
        alignment.germline_cdr3 = ''.join(
            [v_tail, '-' * gap_count, alignment.germline_cdr3]
        )

        if alignment.cdr3_num_nts < 3:
            raise AlignmentException('CDR3 has no AAs')

        alignment.j_anchor_pos += alignment.cdr3_num_nts
        # Gaps stand in for the CDR3 within the germline, then the J tail.
        alignment.germline += '-' * alignment.cdr3_num_nts
        alignment.germline += j_consensus[-self.j_germlines.upstream_of_cdr3:]

        # Trim an over-long sequence; right-pad one that is too short.
        target_len = len(alignment.germline)
        current_len = len(alignment.sequence)
        if current_len > target_len:
            alignment.sequence.trim_right(target_len)
        elif current_len < target_len:
            alignment.sequence.pad_right(target_len - current_len)
Exemplo n.º 3
0
 def all_ties(self, length, mutation, cutoff=True):
     """Return the common sequence for every distinct tie group.

     Each name yielded by iterating ``self`` is passed to ``get_ties``; the
     sorted tuple of the returned names identifies the group, so a group
     shared by several names is only resolved once.

     :param length: forwarded to ``get_ties``
     :param mutation: forwarded to ``get_ties``
     :param cutoff: forwarded to ``get_common_seq``
     :returns: dict mapping each sorted name tuple to the result of
         ``get_common_seq`` for that group's entries
     """
     common_by_group = {}
     for gene in self:
         group = tuple(sorted(self.get_ties([gene], length, mutation)))
         if group in common_by_group:
             # This group was already resolved through another member.
             continue
         members = [self[member] for member in group]
         common_by_group[group] = get_common_seq(members, cutoff=cutoff)
     return common_by_group
Exemplo n.º 4
0
 def all_ties(self, length, mutation, cutoff=True):
     """Collect one common sequence per group of tied names.

     For every name in ``self`` the tie group is obtained from ``get_ties``
     and keyed by the sorted tuple of its names; ``get_common_seq`` is
     called only the first time a given key is seen.

     :param length: passed through to ``get_ties``
     :param mutation: passed through to ``get_ties``
     :param cutoff: passed through to ``get_common_seq``
     :returns: dict of ``tuple(sorted names) -> common sequence``
     """
     resolved = {}
     for entry in self:
         key = tuple(sorted(self.get_ties([entry], length, mutation)))
         if key in resolved:
             continue
         tied_sequences = [self[tied] for tied in key]
         resolved[key] = get_common_seq(tied_sequences, cutoff=cutoff)
     return resolved
def create_alignment(seq, line, v_germlines, j_germlines):
    """Build a ``VDJAlignment`` for ``seq`` from one parsed input record.

    :param seq: sequence wrapped by the new ``VDJAlignment``; it is also
        sliced directly to count gaps before ``CDR3_OFFSET``
    :param line: mapping providing the ``V_CALL``, ``J_CALL``,
        ``JUNCTION_LENGTH`` and ``V_SEQ_LENGTH`` entries
    :param v_germlines: V germline collection (membership test and lookup
        by gene name)
    :param j_germlines: J germline collection; ``upstream_of_cdr3`` is also
        read from it
    :returns: the populated alignment
    :raises AlignmentException: if none of the called V genes or none of
        the called J genes is present in the germlines, or if the germline
        and sequence lengths still differ after right-padding
    """
    alignment = VDJAlignment(seq)
    alignment.v_gene = {GeneName(call) for call in line['V_CALL'].split(',')}
    alignment.j_gene = {GeneName(call) for call in line['J_CALL'].split(',')}
    alignment.cdr3_num_nts = int(line['JUNCTION_LENGTH'])
    alignment.v_length = int(line['V_SEQ_LENGTH'])
    # The offset is the number of leading '-' characters in the sequence.
    alignment.seq_offset = re.match(r'[-]*', alignment.sequence.sequence).end()

    # TODO: Calculate these
    alignment.v_length = CDR3_OFFSET - seq[:CDR3_OFFSET].count('-')
    alignment.j_length = j_germlines.upstream_of_cdr3

    # Only gene calls that exist in the germline collections are usable.
    v_known = [v_germlines[g] for g in alignment.v_gene if g in v_germlines]
    j_known = [j_germlines[g] for g in alignment.j_gene if g in j_germlines]
    if not (v_known and j_known):
        v_names = ','.join(str(v) for v in alignment.v_gene)
        j_names = ','.join(str(j) for j in alignment.j_gene)
        raise AlignmentException(
            'Missing germlines: V={} J={}'.format(v_names, j_names))

    v_common = get_common_seq(v_known)
    j_common = get_common_seq(j_known)

    # Lay V and J out with gaps in between so the total matches the sequence
    # length, then cut out the CDR3 window.
    spacer_len = len(alignment.sequence) - len(v_common) - len(j_common)
    laid_out = ''.join((v_common, '-' * spacer_len, j_common))
    cdr3_stop = CDR3_OFFSET + alignment.cdr3_num_nts
    alignment.germline_cdr3 = laid_out[CDR3_OFFSET:cdr3_stop]

    # Germline: V before the CDR3, gaps across the CDR3, then the J tail.
    germline_parts = [
        v_common[:CDR3_OFFSET],
        '-' * alignment.cdr3_num_nts,
        j_common[-j_germlines.upstream_of_cdr3:],
    ]
    alignment.germline = ''.join(germline_parts)

    shortfall = len(alignment.germline) - len(alignment.sequence.sequence)
    alignment.sequence.pad_right(shortfall)

    if len(alignment.germline) != len(alignment.sequence.sequence):
        raise AlignmentException('Sequence and germline differ in size')
    return alignment
Exemplo n.º 6
0
def create_alignment(seq, line, v_germlines, j_germlines):
    """Create a ``VDJAlignment`` for ``seq`` from one parsed input record.

    :param seq: sequence wrapped by the new ``VDJAlignment``; it is also
        sliced directly to count gaps before ``CDR3_OFFSET``
    :param line: mapping providing the ``V_CALL``, ``J_CALL``,
        ``JUNCTION_LENGTH`` and ``V_SEQ_LENGTH`` entries
    :param v_germlines: V germline collection (membership test and lookup
        by gene name)
    :param j_germlines: J germline collection; ``upstream_of_cdr3`` is also
        read from it
    :returns: the populated alignment
    :raises AlignmentException: if none of the called V genes or none of
        the called J genes is present in the germlines, or if the germline
        and sequence lengths still differ after right-padding
    """
    alignment = VDJAlignment(seq)
    alignment.v_gene = set([GeneName(g) for g in line['V_CALL'].split(',')])
    alignment.j_gene = set([GeneName(g) for g in line['J_CALL'].split(',')])
    alignment.cdr3_num_nts = int(line['JUNCTION_LENGTH'])
    alignment.v_length = int(line['V_SEQ_LENGTH'])
    # Number of leading '-' characters.  The pattern is a raw string because
    # '\-' inside a normal string literal is an invalid escape sequence.
    alignment.seq_offset = re.match(
        r'[\-]*', alignment.sequence.sequence).end()

    # TODO: Calculate these
    alignment.v_length = CDR3_OFFSET - seq[:CDR3_OFFSET].count('-')
    alignment.j_length = j_germlines.upstream_of_cdr3

    # Keep only the gene calls that are present in the germline collections.
    germ_v = [v_germlines[g] for g in alignment.v_gene if g in v_germlines]
    germ_j = [j_germlines[g] for g in alignment.j_gene if g in j_germlines]
    if len(germ_v) == 0 or len(germ_j) == 0:
        raise AlignmentException('Missing germlines: V={} J={}'.format(
            ','.join([str(v) for v in alignment.v_gene]),
            ','.join([str(j) for j in alignment.j_gene])))

    germ_v = get_common_seq(germ_v)
    germ_j = get_common_seq(germ_j)

    # Join V and J with enough gaps to reach the sequence length, then take
    # the CDR3 window out of that string.
    alignment.germline_cdr3 = ''.join(
        (germ_v, '-' * (len(alignment.sequence) - len(germ_v) - len(germ_j)),
         germ_j))[CDR3_OFFSET:CDR3_OFFSET + alignment.cdr3_num_nts]

    # Germline: V before the CDR3, gaps across the CDR3, then the J tail.
    alignment.germline = ''.join([
        germ_v[:CDR3_OFFSET], '-' * alignment.cdr3_num_nts,
        germ_j[-j_germlines.upstream_of_cdr3:]
    ])

    alignment.sequence.pad_right(
        len(alignment.germline) - len(alignment.sequence.sequence))

    if len(alignment.germline) != len(alignment.sequence.sequence):
        raise AlignmentException('Sequence and germline differ in size')
    return alignment
Exemplo n.º 7
0
    def align_to_germline(self, avg_len=None, avg_mut=None, trim_to=None):
        """Align this sequence to its germline and compute match statistics.

        The germline is rebuilt as the common V germline before the CDR3,
        gaps across the CDR3 and the tail of the common J germline.  The
        sequence (and the quality string, when present) is padded or
        prefix-trimmed, gapped wherever the pre-CDR3 germline is gapped, and
        then cut or extended with ``'N'`` to the germline length.

        Sets ``pre_cdr3_length``, ``pre_cdr3_match``, ``post_cdr3_length``,
        ``post_cdr3_match``, ``v_match`` and ``j_match``.

        :param avg_len: forwarded to ``get_ties``; ties are only resolved
            when both ``avg_len`` and ``avg_mut`` are not ``None``
        :param avg_mut: forwarded to ``get_ties`` (see ``avg_len``)
        :param trim_to: when not ``None``, every non-gap character in the
            first ``trim_to`` positions of the sequence is replaced by
            ``'N'`` and the padding length and V length are updated
        :raises AlignmentException: if the computed CDR3 length is below 3
        """
        if avg_len is not None and avg_mut is not None:
            self._v = self.v_germlines.get_ties(self.v_gene, avg_len, avg_mut)
            self._j = self.j_germlines.get_ties(self.j_gene, avg_len, avg_mut)
        # Set the germline to the V gene up to the CDR3
        self.germline = get_common_seq([self.v_germlines[v]
                                        for v in self._v])[:CDR3_OFFSET]
        # If we need to pad the sequence, do so, otherwise trim the sequence to
        # the germline length
        if self._pad_len >= 0:
            self.sequence = 'N' * self._pad_len + str(self.sequence)
            if self.quality is not None:
                self.quality = (' ' * self._pad_len) + self.quality
        else:
            # Negative padding: the first -_pad_len characters are cut off
            # and kept in removed_prefix / removed_prefix_qual.
            self.removed_prefix = self.sequence[:-self._pad_len]
            self.sequence = str(self.sequence[-self._pad_len:])
            if self.quality is not None:
                self.removed_prefix_qual = self.quality[:-self._pad_len]
                self.quality = self.quality[-self._pad_len:]
        # Update the anchor positions after adding padding / trimming
        self.j_anchor_pos += self._pad_len

        # Add germline gaps to sequence before CDR3 and update anchor positions
        for i, c in enumerate(self.germline):
            if c == '-':
                self.sequence = self.sequence[:i] + '-' + self.sequence[i:]
                if self.quality is not None:
                    self.quality = self.quality[:i] + ' ' + self.quality[i:]
                self.j_anchor_pos += 1

        # The J germlines are reversed before and after get_common_seq.
        # NOTE(review): this iterates self.j_gene while the V side iterates
        # self._v; presumably j_gene exposes _j -- confirm.
        j_germ = get_common_seq(
            map(reversed, [self.j_germlines[j] for j in self.j_gene]))
        j_germ = ''.join(reversed(j_germ))
        # Calculate the length of the CDR3
        self._cdr3_len = (self.j_anchor_pos + self.j_germlines.anchor_len -
                          self.j_germlines.upstream_of_cdr3 - self.cdr3_start)

        if self._cdr3_len < 3:
            # The message has no placeholder, so no .format() call is needed.
            raise AlignmentException('CDR3 has no AAs')

        self.j_anchor_pos += self._cdr3_len
        # Fill germline CDR3 with gaps
        self.germline += '-' * self._cdr3_len
        self.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]
        # If the sequence is longer than the germline, trim it
        if len(self.sequence) > len(self.germline):
            self.sequence = self.sequence[:len(self.germline)]
            if self.quality is not None:
                self.quality = self.quality[:len(self.germline)]
        elif len(self.sequence) < len(self.germline):
            self.sequence += 'N' * (len(self.germline) - len(self.sequence))
            if self.quality is not None:
                self.quality += ' ' * (len(self.germline) - len(self.quality))

        if trim_to is not None:
            old_padding = max(self._pad_len, 0)
            # Mask the first trim_to positions: gaps stay, the rest become N.
            new_prefix = ''.join(
                [c if c == '-' else 'N' for c in self.sequence[:trim_to]])
            self.sequence = new_prefix + self.sequence[trim_to:]
            # Raw string: '\-' in a normal literal is an invalid escape
            # sequence.  The match ends at the first char that is not N or '-'.
            v_start = re.match(r'[N\-]*', self.sequence).end()
            self._pad_len = self.sequence[:v_start].count('N')
            # The V length shrinks by the number of newly masked bases.
            self.v_length -= self._pad_len - old_padding

        # Get the pre-CDR3 germline
        pre_cdr3_germ = self.germline[:self.cdr3_start]
        pre_cdr3_seq = self.sequence[:self.cdr3_start]

        # If there is padding, get rid of it in the sequence and align the
        # germline
        if self._pad_len > 0:
            pre_cdr3_germ = pre_cdr3_germ[self._pad_len:]
            pre_cdr3_seq = pre_cdr3_seq[self._pad_len:]

        # Calculate the pre-CDR3 length and distance
        self.pre_cdr3_length = len(pre_cdr3_seq)
        self.pre_cdr3_match = self.pre_cdr3_length - dnautils.hamming(
            str(pre_cdr3_seq), str(pre_cdr3_germ))

        # Get the length of J after the CDR3
        self.post_cdr3_length = self.j_germlines.upstream_of_cdr3
        # Get the sequence and germline sequences after CDR3
        post_j = j_germ[-self.post_cdr3_length:]
        post_s = self.sequence[-self.post_cdr3_length:]

        # Calculate their match count
        self.post_cdr3_match = self.post_cdr3_length - dnautils.hamming(
            post_j, post_s)

        # V and J match counts: gene length minus the hamming distance over
        # the pre-CDR3 region and over the last len(j_germ) positions.
        self.v_match = self.v_length - dnautils.hamming(
            self.germline[:self.cdr3_start], self.sequence[:self.cdr3_start])

        self.j_match = self.j_length - dnautils.hamming(
            self.germline[-len(j_germ):], self.sequence[-len(j_germ):])