def test_step(self, batch, batch_idx):
    genes, others, s, A, P, G, gene_names, other_names = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    x, xlen, y, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    # Obtain alignment statistics + visualizations
    gen = self.aligner.traceback(seq, order)
    # TODO: compare the traceback and the forward
    statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A,
                                       predA, theta, gap, batch_idx)
    assert len(statistics) > 0, (batch_idx, s)
    # Decode the token ids back into residue strings for reporting
    genes = list(map(
        lambda g: self.tokenizer.alphabet.decode(
            g.detach().cpu().numpy()).decode("utf-8"),
        genes))
    others = list(map(
        lambda o: self.tokenizer.alphabet.decode(
            o.detach().cpu().numpy()).decode("utf-8"),
        others))
    statistics = pd.DataFrame(statistics, columns=[
        'test_tp', 'test_fp', 'test_fn', 'test_perc_id',
        'test_ppv', 'test_fnr', 'test_fdr'
    ])
    statistics['query_name'] = gene_names
    statistics['key_name'] = other_names
    return statistics
def test_pack_sequences(self):
    X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
    Y = [
        torch.Tensor([21, 10, 12, 2, 4, 5]),
        torch.Tensor([1, 4, 11, 13, 14])
    ]
    res, order = pack_sequences(X, Y)
    npt.assert_allclose(order, np.array([2, 3, 1, 0]))
def align(self, x, y):
    x_code = torch.Tensor(self.tokenizer(str.encode(x))).long()
    y_code = torch.Tensor(self.tokenizer(str.encode(y))).long()
    x_code = x_code.to(self.device)
    y_code = y_code.to(self.device)
    seq, order = pack_sequences([x_code], [y_code])
    gen = self.aligner.traceback(seq, order)
    decoded, _ = next(gen)
    pred_x, pred_y, pred_states = zip(*decoded)
    # Convert the predicted state ids into an alignment state string
    s = ''.join(map(revstate_f, pred_states))
    return s
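A minimal usage sketch for `align` (assuming `model` is a trained `LightningAligner` with its tokenizer and device already set up; the sequences are placeholders, and `states2alignment` is the same helper used in `deepblast_align` below):

# Hypothetical usage: `model` is assumed to be a loaded LightningAligner.
x = 'ARNDCQEGHILKMFPSTWYV'         # placeholder query sequence
y = 'ARNDCQEGHILKARNDCQMFPSTWYV'   # placeholder target sequence
states = model.align(x, y)         # predicted alignment state string
x_aligned, y_aligned = states2alignment(states, x, y)
print(x_aligned)
print(states)
print(y_aligned)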
def test_alignment(self):
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    x = torch.Tensor(
        self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
    y = torch.Tensor(
        self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
    N, M = x.shape[0], y.shape[0]
    M = max(N, M)
    seq, order = pack_sequences([x], [y])
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (1, M, M))
def test_unpack_sequences(self):
    X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
    Y = [
        torch.Tensor([21, 10, 12, 2, 4, 5]),
        torch.Tensor([1, 4, 11, 13, 14])
    ]
    z, order = pack_sequences(X, Y)
    resX, xlen, resY, ylen = unpack_sequences(z, order)
    tt.assert_allclose(xlen, torch.Tensor([3, 4]).long())
    tt.assert_allclose(ylen, torch.Tensor([6, 5]).long())
    expX = torch.Tensor([[6, 4, 5, 0, 0, 0],
                         [1, 4, 5, 7, 0, 0]])
    expY = torch.Tensor([[21, 10, 12, 2, 4, 5],
                         [1, 4, 11, 13, 14, 0]])
    tt.assert_allclose(expX, resX)
    tt.assert_allclose(expY, resY)
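Read together, the two tests above pin down the packing round trip; a small CPU sketch of the same behavior (values taken directly from the tests):

# pack_sequences batches the X and Y lists together, sorted internally by
# descending length (the expected `order` of [2, 3, 1, 0] in the test);
# unpack_sequences restores zero-padded per-list tensors plus lengths.
X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
Y = [torch.Tensor([21, 10, 12, 2, 4, 5]), torch.Tensor([1, 4, 11, 13, 14])]
packed, order = pack_sequences(X, Y)
resX, xlen, resY, ylen = unpack_sequences(packed, order)
# xlen == [3, 4], ylen == [6, 5]; resX / resY are padded with zeros out to
# the longest sequence in the packed batch, so padding can be masked later.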
def training_step(self, batch, batch_idx):
    self.aligner.train()
    genes, others, s, A, P, G = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    _, xlen, _, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    if len(self.trainer.lr_schedulers) >= 1:
        current_lr = self.trainer.lr_schedulers[0]['scheduler']
        current_lr = current_lr.get_last_lr()[0]
    else:
        current_lr = self.hparams.learning_rate
    # log the learning rate
    tensorboard_logs = {'train_loss': loss, 'lr': current_lr}
    return {'loss': loss, 'log': tensorboard_logs}
def test_batch_alignment(self):
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
    x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
    x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
    y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
    y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
    x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
    y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
    seq, order = pack_sequences(x, y)
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (2, length, length))
    self.assertEqual(theta.shape, (2, length, length))
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned.

    The model on its own takes between 740MiB (Titan X, torch 1.5) and
    1284MiB (RTX 8000, torch 1.7) of GPU memory.

    Note that the batch size has much less of an impact for DeepBLAST
    than for the embedders.
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(map(revstate_f, pred_states))
            x_aligned, y_aligned = states2alignment(
                pred_alignment, query_by_id[query], target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
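A hypothetical invocation of `deepblast_align` (the checkpoint path, ids, and sequences are placeholders):

# Hypothetical example; 'deepblast.ckpt' stands in for a real checkpoint.
query_by_id = {'q1': 'ARNDCQEGHILKMFPSTWYV'}
target_by_id = {'t1': 'ARNDCQEGHILKARNDCQMFPSTWYV'}
results = deepblast_align(
    pairings=[('q1', 't1')],
    query_by_id=query_by_id,
    target_by_id=target_by_id,
    model_file='deepblast.ckpt',
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    batch_size=32,
)
for query, target, x_aligned, y_aligned in results:
    print(query, target)
    print(x_aligned)
    print(y_aligned)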
def validation_step(self, batch, batch_idx):
    genes, others, s, A, P, G = batch
    seq, order = pack_sequences(genes, others)
    predA, theta, gap = self.aligner(seq, order)
    x, xlen, y, ylen = unpack_sequences(seq, order)
    loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
    assert not torch.isnan(loss).item()
    # Obtain alignment statistics + visualizations
    gen = self.aligner.traceback(seq, order)
    # TODO: compare the traceback and the forward
    statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A,
                                       predA, theta, gap, batch_idx)
    statistics = pd.DataFrame(statistics, columns=[
        'val_tp', 'val_fp', 'val_fn', 'val_perc_id',
        'val_ppv', 'val_fnr', 'val_fdr'
    ])
    # Average the per-pair statistics across the batch
    statistics = statistics.mean(axis=0).to_dict()
    tensorboard_logs = {'valid_loss': loss}
    tensorboard_logs = {**tensorboard_logs, **statistics}
    return {'validation_loss': loss, 'log': tensorboard_logs}
def test_collate_alignment(self):
    M = 5
    x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
    x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
    y1 = torch.Tensor(self.tokenizer(b'ND')).long()
    y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
    s1 = torch.Tensor([1, 1, 1, 0]).long()
    s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
    A1 = torch.ones((len(x1), len(y1))).long()
    A2 = torch.ones((len(x2), len(y2))).long()
    P1 = torch.ones((len(x1), len(y1))).long()
    P2 = torch.ones((len(x2), len(y2))).long()
    G1 = torch.ones((len(x1), len(y1))).long()
    G2 = torch.ones((len(x2), len(y2))).long()
    batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
    gene_codes, other_codes, states, dm, p, g = collate_f(batch)
    self.embedding = self.embedding.cuda()
    self.aligner = self.aligner.cuda()
    seq, order = pack_sequences(gene_codes, other_codes)
    seq = seq.cuda()
    aln, theta, A = self.aligner(seq, order)
    self.assertEqual(aln.shape, (2, M, M))
    self.assertEqual(theta.shape, (2, M, M))
def score(self, x_code, y_code):
    seq, order = pack_sequences(x_code, y_code)
    seq = seq.to(self.device)
    A = self.aligner.score(seq, order)
    return A
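A minimal sketch of calling `score`, mirroring the tokenization used in `align`; note that it takes lists of encoded sequences, since they are passed straight to `pack_sequences` (the `model` handle and sequences are assumed):

# Hypothetical usage: `model` is assumed to be a loaded LightningAligner.
x_code = torch.Tensor(model.tokenizer(str.encode('NDCQ'))).long()
y_code = torch.Tensor(model.tokenizer(str.encode('NDCQE'))).long()
A = model.score([x_code], [y_code])  # score(s) from the underlying aligner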
# Parse the input FASTA: headers start with '>'; other lines extend the
# current record (the enclosing loop over input lines is elided here).
    i = i.rstrip()
    if i[0] == '>':
        ID = i[1:]
        continue
    seqs[ID] = seqs.get(ID, '') + i
keys_list = list(seqs)
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]
pred_alignment = model.align(x, y)
x_aligned, y_aligned = states2alignment(pred_alignment, x, y)
# Write the aligned pair back out in FASTA format
with open(args.output, "w") as f:
    f.write(">%s\n%s\n>%s\n%s" % (keys_list[0], x_aligned,
                                  keys_list[1], y_aligned))
print(x_aligned)
print(pred_alignment)
print(y_aligned)
# Re-tokenize the raw sequences to compute the alignment score
x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()
seq, order = pack_sequences([x_], [y_])
score = model.aligner.score(seq, order).item()
print('Score', score)