Example #1
class TestBiLM(unittest.TestCase):
    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer()

    def test_bilm(self):
        toks = torch.Tensor(self.tokenizer(b'ABC')).long().unsqueeze(0)
        res = self.embedding(toks)
        self.assertEqual(res.shape, (1, 3, 21))

    @unittest.skip('something is misbehaving here.')
    def test_bilm_batch(self):
        # The raw packed data tensor is fed to the model and the result is a
        # plain tensor, so pad_packed_sequence below cannot unpack it; that is
        # likely what misbehaves here.
        toks = torch.Tensor([[0, 20, 4, 3], [0, 20, 4, 0]]).long()
        lens = torch.Tensor([4, 3]).long()
        idx = pack_padded_sequence(toks, lens, batch_first=True)
        res = self.embedding(idx.data)
        x, xlen = pad_packed_sequence(res)
        tt.assert_allclose(xlen, lens)
        tt.assert_allclose(x, toks)
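The skipped test_bilm_batch case suggests that packed sequences do not round-trip cleanly through the language model. Below is a minimal sketch of the simpler padded-batch route instead; it assumes BiLM accepts a padded (B, L) LongTensor exactly as in test_bilm, and the import paths are assumptions about the package layout rather than something shown in the tests above.

import torch
from torch.nn.utils.rnn import pad_sequence

# Assumed import paths; adjust to wherever BiLM and the tokenizer live in this package.
from deepblast.language_model import BiLM, pretrained_language_models
from deepblast.dataset.alphabet import UniprotTokenizer

# Load the pretrained bidirectional language model, as in setUp above.
path = pretrained_language_models['bilstm']
embedding = BiLM()
embedding.load_state_dict(torch.load(path))
embedding.eval()

tokenizer = UniprotTokenizer()

# Tokenize each sequence and right-pad to a common length instead of packing.
toks = [torch.Tensor(tokenizer(s)).long() for s in (b'ABC', b'ABCDE')]
batch = pad_sequence(toks, batch_first=True, padding_value=0)

with torch.no_grad():
    res = embedding(batch)   # expected shape: (B, max_len, 21), as in test_bilm

print(res.shape)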
Example #2
class NeedlemanWunschAligner(nn.Module):
    def __init__(self,
                 n_alpha,
                 n_input,
                 n_units,
                 n_embed,
                 n_layers=2,
                 lm=None,
                 device='gpu'):
        """ NeedlemanWunsch Alignment model

        Parameters
        ----------
        n_alpha : int
           Size of the alphabet (typically 22)
        n_input : int
           Input dimensions.
        n_units : int
           Number of hidden units in RNN.
        n_embed : int
           Embedding dimension
        n_layers : int
           Number of RNN layers.
        lm : BiLM
           Pretrained language model (optional)
        device : str
           Device used by the Needleman-Wunsch decoder
           (default 'gpu'; only the CUDA decoder is currently wired up).
        """
        super(NeedlemanWunschAligner, self).__init__()
        if lm is None:
            path = pretrained_language_models['bilstm']
            self.lm = BiLM()
            self.lm.load_state_dict(torch.load(path))
            self.lm.eval()
        if n_layers > 1:
            self.match_embedding = StackedRNN(n_alpha,
                                              n_input,
                                              n_units,
                                              n_embed,
                                              n_layers,
                                              lm=lm)
            self.gap_embedding = StackedRNN(n_alpha,
                                            n_input,
                                            n_units,
                                            n_embed,
                                            n_layers,
                                            lm=lm)
        else:
            self.match_embedding = EmbedLinear(n_alpha,
                                               n_input,
                                               n_embed,
                                               lm=lm)
            self.gap_embedding = EmbedLinear(n_alpha, n_input, n_embed, lm=lm)

        # TODO: make cpu compatible version
        # if device == 'cpu':
        #     self.nw = NWDecoderCPU(operator='softmax')
        # else:
        self.nw = NWDecoderCUDA(operator='softmax')

    def forward(self, x, order):
        """ Generate alignment matrix.

        Parameters
        ----------
        x : PackedSequence
            Packed sequence object of proteins to align.
        order : np.array
            The original order of the sequences.

        Returns
        -------
        aln : torch.Tensor
            Alignment matrix (dim B x N x M).
        theta : torch.Tensor
            Match scores (dim B x N x M).
        A : torch.Tensor
            Gap scores (dim B x N x M).
        """
        with torch.enable_grad():
            zx, _, zy, _ = unpack_sequences(self.match_embedding(x), order)
            gx, _, gy, _ = unpack_sequences(self.gap_embedding(x), order)

            # Obtain theta through an inner product across latent dimensions
            theta = F.softplus(torch.einsum('bid,bjd->bij', zx, zy))
            A = F.logsigmoid(torch.einsum('bid,bjd->bij', gx, gy))
            aln = self.nw.decode(theta, A)
            return aln, theta, A

    def traceback(self, x, order):
        """ Decode each alignment in the batch and yield (traceback, aln)
        for every sequence pair, with aln cropped to the pair's true lengths.
        """
        # dim B x N x D
        with torch.enable_grad():
            zx, _, zy, _ = unpack_sequences(self.match_embedding(x), order)
            gx, xlen, gy, ylen = unpack_sequences(self.gap_embedding(x), order)
            match = F.softplus(torch.einsum('bid,bjd->bij', zx, zy))
            gap = F.logsigmoid(torch.einsum('bid,bjd->bij', gx, gy))
            B, _, _ = match.shape
            for b in range(B):
                aln = self.nw.decode(match[b, :xlen[b], :ylen[b]].unsqueeze(0),
                                     gap[b, :xlen[b], :ylen[b]].unsqueeze(0))
                decoded = self.nw.traceback(aln.squeeze())
                yield decoded, aln
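The match scores theta and gap scores A in forward are batched inner products over the latent dimension, squashed through softplus and logsigmoid respectively. The following sketch uses only random tensors and plain PyTorch ops (the sizes are made up for illustration) to show that the einsum contraction is the same thing as a batched matrix product.

import torch
import torch.nn.functional as F

B, N, M, D = 2, 5, 7, 16               # batch, lengths of the two proteins, latent dim (illustrative)
zx, zy = torch.randn(B, N, D), torch.randn(B, M, D)

# 'bid,bjd->bij' contracts the shared latent index d for every position pair (i, j).
theta = F.softplus(torch.einsum('bid,bjd->bij', zx, zy))

# Equivalent formulation as a batched matrix product.
theta_mm = F.softplus(zx @ zy.transpose(1, 2))
assert torch.allclose(theta, theta_mm)

print(theta.shape)                      # torch.Size([2, 5, 7]), i.e. B x N x M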
Example #3
class TestAlignmentModel(unittest.TestCase):
    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        nalpha, ninput, nunits, nembed = 22, 1024, 1024, 1024
        self.aligner = NeedlemanWunschAligner(nalpha, ninput, nunits, nembed)

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_alignment(self):
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        x = torch.Tensor(
            self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
        y = torch.Tensor(
            self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
        N, M = x.shape[0], y.shape[0]
        M = max(N, M)
        seq, order = pack_sequences([x], [y])
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (1, M, M))

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_batch_alignment(self):
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
        x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
        y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
        x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
        y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
        seq, order = pack_sequences(x, y)
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, length, length))
        self.assertEqual(theta.shape, (2, length, length))

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_collate_alignment(self):
        M = 5
        x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
        x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
        y1 = torch.Tensor(self.tokenizer(b'ND')).long()
        y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
        s1 = torch.Tensor([1, 1, 1, 0]).long()
        s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
        A1 = torch.ones((len(x1), len(y1))).long()
        A2 = torch.ones((len(x2), len(y2))).long()
        P1 = torch.ones((len(x1), len(y1))).long()
        P2 = torch.ones((len(x2), len(y2))).long()
        G1 = torch.ones((len(x1), len(y1))).long()
        G2 = torch.ones((len(x2), len(y2))).long()

        batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
        gene_codes, other_codes, states, dm, p, g = collate_f(batch)
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        seq, order = pack_sequences(gene_codes, other_codes)
        seq = seq.cuda()
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, M, M))
        self.assertEqual(theta.shape, (2, M, M))
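The tests above only exercise forward; the per-pair traceback from Example #2 is a generator and can be consumed as sketched below. This is a sketch under the same assumptions as the CUDA-only tests (a GPU plus the pretrained language model available on disk), it reuses NeedlemanWunschAligner, UniprotTokenizer and pack_sequences from the examples above, and the exact structure of decoded depends on what the decoder's traceback emits, which is not shown here.

# Reusing the aligner, tokenizer and pack_sequences helper from the examples above.
aligner = NeedlemanWunschAligner(22, 1024, 1024, 1024).cuda()
tokenizer = UniprotTokenizer(pad_ends=False)

x = [torch.Tensor(tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda(),
     torch.Tensor(tokenizer(b'ARNDCQEGHILKMFPSTWY')).long().cuda()]
y = [torch.Tensor(tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda(),
     torch.Tensor(tokenizer(b'ARNDCQEGHILKMFPSTWYV')).long().cuda()]

seq, order = pack_sequences(x, y)

# traceback() yields one (decoded, aln) pair per aligned sequence pair in the batch,
# with aln cropped to that pair's true lengths.
for decoded, aln in aligner.traceback(seq, order):
    print(aln.shape)
    print(decoded)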