def test_step(self, batch, batch_idx):
    genes, others, s, A, P, G, gene_names, other_names = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    x, xlen, y, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    # Obtain alignment statistics + visualizations
    gen = self.aligner.traceback(seq, order)
    # TODO: compare the traceback and the forward
    statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A,
                                       predA, theta, gap, batch_idx)
    assert len(statistics) > 0, (batch_idx, s)
    # Decode the token ids back into residue strings for reporting
    genes = list(map(
        lambda g: self.tokenizer.alphabet.decode(
            g.detach().cpu().numpy()).decode("utf-8"),
        genes))
    others = list(map(
        lambda o: self.tokenizer.alphabet.decode(
            o.detach().cpu().numpy()).decode("utf-8"),
        others))
    statistics = pd.DataFrame(statistics, columns=[
        'test_tp', 'test_fp', 'test_fn', 'test_perc_id',
        'test_ppv', 'test_fnr', 'test_fdr'
    ])
    statistics['query_name'] = gene_names
    statistics['key_name'] = other_names
    return statistics
def test_pack_sequences(self):
    X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
    Y = [
        torch.Tensor([21, 10, 12, 2, 4, 5]),
        torch.Tensor([1, 4, 11, 13, 14])
    ]
    res, order = pack_sequences(X, Y)
    npt.assert_allclose(order, np.array([2, 3, 1, 0]))
def align(self, x, y):
    x_code = torch.Tensor(self.tokenizer(str.encode(x))).long()
    y_code = torch.Tensor(self.tokenizer(str.encode(y))).long()
    x_code = x_code.to(self.device)
    y_code = y_code.to(self.device)
    seq, order = pack_sequences([x_code], [y_code])
    gen = self.aligner.traceback(seq, order)
    decoded, _ = next(gen)
    pred_x, pred_y, pred_states = zip(*decoded)
    # Convert the predicted state ids into an alignment state string
    s = ''.join(map(revstate_f, pred_states))
    return s
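A minimal usage sketch for `align` (assuming `model` is a trained `LightningAligner` with its tokenizer and device already set up; the sequences are placeholders, and `states2alignment` is the same helper used in `deepblast_align` below):

# Hypothetical usage: `model` is assumed to be a loaded LightningAligner.
x = 'ARNDCQEGHILKMFPSTWYV'         # placeholder query sequence
y = 'ARNDCQEGHILKARNDCQMFPSTWYV'   # placeholder target sequence
states = model.align(x, y)         # predicted alignment state string
x_aligned, y_aligned = states2alignment(states, x, y)
print(x_aligned)
print(states)
print(y_aligned)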
def test_alignment(self):
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    x = torch.Tensor(
        self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
    y = torch.Tensor(
        self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
    N, M = x.shape[0], y.shape[0]
    M = max(N, M)
    seq, order = pack_sequences([x], [y])
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (1, M, M))
def test_unpack_sequences(self):
    X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
    Y = [
        torch.Tensor([21, 10, 12, 2, 4, 5]),
        torch.Tensor([1, 4, 11, 13, 14])
    ]
    z, order = pack_sequences(X, Y)
    resX, xlen, resY, ylen = unpack_sequences(z, order)
    tt.assert_allclose(xlen, torch.Tensor([3, 4]).long())
    tt.assert_allclose(ylen, torch.Tensor([6, 5]).long())
    expX = torch.Tensor([[6, 4, 5, 0, 0, 0],
                         [1, 4, 5, 7, 0, 0]])
    expY = torch.Tensor([[21, 10, 12, 2, 4, 5],
                         [1, 4, 11, 13, 14, 0]])
    tt.assert_allclose(expX, resX)
    tt.assert_allclose(expY, resY)
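Read together, the two tests above pin down the packing round trip; a small CPU sketch of the same behavior (values taken directly from the tests):

# pack_sequences batches the X and Y lists together, sorted internally by
# descending length (the expected `order` of [2, 3, 1, 0] in the test);
# unpack_sequences restores zero-padded per-list tensors plus lengths.
X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
Y = [torch.Tensor([21, 10, 12, 2, 4, 5]), torch.Tensor([1, 4, 11, 13, 14])]
packed, order = pack_sequences(X, Y)
resX, xlen, resY, ylen = unpack_sequences(packed, order)
# xlen == [3, 4], ylen == [6, 5]; resX / resY are padded with zeros out to
# the longest sequence in the packed batch, so padding can be masked later.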
def training_step(self, batch, batch_idx):
    self.aligner.train()
    genes, others, s, A, P, G = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    _, xlen, _, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    if len(self.trainer.lr_schedulers) >= 1:
        current_lr = self.trainer.lr_schedulers[0]['scheduler']
        current_lr = current_lr.get_last_lr()[0]
    else:
        current_lr = self.hparams.learning_rate
    # log the learning rate
    tensorboard_logs = {'train_loss': loss, 'lr': current_lr}
    return {'loss': loss, 'log': tensorboard_logs}
def test_batch_alignment(self):
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
    x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
    x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
    y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
    y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
    x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
    y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
    seq, order = pack_sequences(x, y)
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (2, length, length))
    self.assertEqual(theta.shape, (2, length, length))
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned.

    The model on its own takes between 740MiB (Titan X, torch 1.5) and
    1284MiB (RTX 8000, torch 1.7) of GPU memory.

    Note that the batch size has much less of an impact for DeepBLAST
    than for the embedders.
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(map(revstate_f, pred_states))
            x_aligned, y_aligned = states2alignment(
                pred_alignment, query_by_id[query], target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
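A hypothetical invocation of `deepblast_align` (the checkpoint path, ids, and sequences are placeholders):

# Hypothetical example; 'deepblast.ckpt' stands in for a real checkpoint.
query_by_id = {'q1': 'ARNDCQEGHILKMFPSTWYV'}
target_by_id = {'t1': 'ARNDCQEGHILKARNDCQMFPSTWYV'}
results = deepblast_align(
    pairings=[('q1', 't1')],
    query_by_id=query_by_id,
    target_by_id=target_by_id,
    model_file='deepblast.ckpt',
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    batch_size=32,
)
for query, target, x_aligned, y_aligned in results:
    print(query, target)
    print(x_aligned)
    print(y_aligned)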
def validation_step(self, batch, batch_idx):
    genes, others, s, A, P, G = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    x, xlen, y, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    # Obtain alignment statistics + visualizations
    gen = self.aligner.traceback(seq, order)
    # TODO: compare the traceback and the forward
    statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A,
                                       predA, theta, gap, batch_idx)
    statistics = pd.DataFrame(statistics, columns=[
        'val_tp', 'val_fp', 'val_fn', 'val_perc_id',
        'val_ppv', 'val_fnr', 'val_fdr'
    ])
    # Average the per-pair statistics across the batch
    statistics = statistics.mean(axis=0).to_dict()
    tensorboard_logs = {'valid_loss': loss}
    tensorboard_logs = {**tensorboard_logs, **statistics}
    return {'validation_loss': loss, 'log': tensorboard_logs}
def test_collate_alignment(self):
    M = 5
    x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
    x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
    y1 = torch.Tensor(self.tokenizer(b'ND')).long()
    y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
    s1 = torch.Tensor([1, 1, 1, 0]).long()
    s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
    A1 = torch.ones((len(x1), len(y1))).long()
    A2 = torch.ones((len(x2), len(y2))).long()
    P1 = torch.ones((len(x1), len(y1))).long()
    P2 = torch.ones((len(x2), len(y2))).long()
    G1 = torch.ones((len(x1), len(y1))).long()
    G2 = torch.ones((len(x2), len(y2))).long()
    batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
    gene_codes, other_codes, states, dm, p, g = collate_f(batch)
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    seq, order = pack_sequences(gene_codes, other_codes)
    seq = seq.cuda()
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (2, M, M))
    self.assertEqual(theta.shape, (2, M, M))
def score(self, x_code, y_code):
    seq, order = pack_sequences(x_code, y_code)
    seq = seq.to(self.device)
    A = self.aligner.score(seq, order)
    return A
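A minimal sketch of calling `score`, mirroring the tokenization used in `align`; note that it takes lists of encoded sequences, since they are passed straight to `pack_sequences` (the `model` handle and sequences are assumed):

# Hypothetical usage: `model` is assumed to be a loaded LightningAligner.
x_code = torch.Tensor(model.tokenizer(str.encode('NDCQ'))).long()
y_code = torch.Tensor(model.tokenizer(str.encode('NDCQE'))).long()
A = model.score([x_code], [y_code])  # score(s) from the underlying aligner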
# Parse the input FASTA: headers start with '>'; other lines extend the
# current record (the enclosing loop over input lines is elided here).
    i = i.rstrip()
    if i[0] == '>':
        ID = i[1:]
        continue
    seqs[ID] = seqs.get(ID, '') + i
keys_list = list(seqs)
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]
pred_alignment = model.align(x, y)
x_aligned, y_aligned = states2alignment(pred_alignment, x, y)
# Write the aligned pair back out in FASTA format
with open(args.output, "w") as f:
    f.write(">%s\n%s\n>%s\n%s" % (keys_list[0], x_aligned,
                                  keys_list[1], y_aligned))
print(x_aligned)
print(pred_alignment)
print(y_aligned)
# Re-tokenize the raw sequences to compute the alignment score
x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()
seq, order = pack_sequences([x_], [y_])
score = model.aligner.score(seq, order).item()
print('Score', score)