def alignment_text(x, y, pred, truth, stats):
    """Render the ground-truth and predicted alignments as a text report.

    Parameters
    ----------
    x : str
        Protein X
    y : str
        Protein Y
    pred : list of int
        Predicted states
    truth : list of int
        Ground truth states
    stats : list of float
        List of statistics from roc_edges. Values are rounded to two
        decimals and labelled positionally as tp, fp, fn, perc_id, ppv,
        fnr, fdr; ``zip`` drops whatever is left over on the longer side.

    Returns
    -------
    str
        Three newline-separated sections: the labelled statistics on one
        line, a ``# Ground truth`` block, and a ``# Prediction`` block.
        Each block shows the two gapped sequences on separate lines.
    """
    # TODO: we got the truth and prediction edges swapped somewhere earlier
    truth_pair = states2alignment(truth, x, y)
    pred_pair = states2alignment(pred, x, y)

    labels = ['tp', 'fp', 'fn', 'perc_id', 'ppv', 'fnr', 'fdr']
    # Round every statistic first so the whole list is validated even when
    # it is longer than the label list.
    rounded = [np.round(value, 2) for value in stats]
    stats_viz = ' '.join(
        [f'{label}: {value}' for label, value in zip(labels, rounded)]
    )

    truth_viz = '# Ground truth\n' + f' {truth_pair[0]}\n {truth_pair[1]}'
    pred_viz = '# Prediction\n' + f' {pred_pair[0]}\n {pred_pair[1]}'

    return '\n'.join([stats_viz, truth_viz, pred_viz])
def test_states2alignment_8(self):
    """Smoke test: states2alignment must accept this input without raising.

    No result is asserted; the test passes as long as the call completes.
    """
    seq_x = 'HECDDCSKQFSRNNHLAKHLRAH'
    seq_y = 'YRCHKVCPYTFVGKSDLDLHQFITAH'
    states = np.array([
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1
    ])
    # Note the argument order: y is passed as the first sequence here.
    states2alignment(states, seq_y, seq_x)
def test_states2alignment_10(self):
    """Smoke test: states2alignment must accept this input without raising.

    No result is asserted; the test passes as long as the call completes.
    """
    first_seq = 'YACSGGCGQNFRTMSEFNEHMIRLVH'
    second_seq = 'LICPKHTRDCGKVFKRNSSLRVHEH'
    states = np.array([
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1
    ])
    states2alignment(states, first_seq, second_seq)
def test_states2alignment_11(self):
    """Smoke test: states2alignment must accept this input without raising.

    The state path starts with a run of 2s, followed by two 1s and a run
    of 0s. No result is asserted; the test passes as long as the call
    completes.
    """
    gen = 'LNCKEIKKYCEMSFRNPDDIRKHRGAIH'
    oth = 'YTCSSCNESLRTAWCLNKHLR'
    # Fixture values are kept verbatim; the exact run lengths matter.
    pred = np.array([
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ])
    states2alignment(pred, gen, oth)
def test_decoding2(self):
    """Smoke test: trace back a stored matrix and convert the path.

    Loads ``dm.txt`` from the test data directory, runs the
    Needleman-Wunsch decoder's traceback on it, and feeds the resulting
    states to states2alignment. No result is asserted.
    """
    seq_x = 'HECDRKTCDESFSTKGNLRVHKLGH'
    seq_y = 'LKCSGCGKNFKSQYAYKRHEQTH'
    decoder = NeedlemanWunschDecoder(self.operator)
    matrix = torch.Tensor(np.loadtxt(get_data_path('dm.txt')))
    path = decoder.traceback(matrix)
    # Each decoded entry is a 3-tuple; only its last member (the state)
    # is needed here.
    _, _, states = zip(*path)
    states2alignment(np.array(states), seq_x, seq_y)
def test_states2alignment_3(self):
    """Smoke test: states2alignment must accept this input without raising.

    x is much longer than y and the state path is dominated by long runs
    of 0s at both ends. No result is asserted; the test passes as long as
    the call completes.
    """
    x = ('XSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMA'
         'SIYSEEGNIEHAFILYNKYITLFIEKLPKHRDYKSAVIPEKKDTVK'
         'KLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQ'
         'QELX')
    y = ('XIDVLRAKAAKERAERRLQSQQDDIDFKRAELALKRAMNRLSVAEMKX')
    # Fixture values are kept verbatim; the exact run lengths matter.
    s = np.array([
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
    ])
    states2alignment(s, x, y)
def test_states2alignment_7(self): x = ('XGSSGSSGFDENWGADEELLLIDACETLGLGNWADIADYVGNARTKEECRDHYLKTYIEX') y = ('XGEIRVGNRYQADITDLLKEGEEDGRDQSRLETQVWEAHNPLTDKQIDQFLVVARSVGTF' 'ARALDSLHMSAAAASRDITLFHAMDTLHKNIYDISKAISALVPQGGPVLCRDEMEEWSAS' 'EANLFEEALEKYGKDFTDIQQDFLPWKSLTSIIEYYYMWKTTX') s = np.array([ 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]) states2alignment(s, x, y)
def test_states2alignment_2(self):
    """Check the gapped output for the state string ``111:::111``.

    With X = "123456789" and Y = "abc", X must come back unchanged and Y
    must be padded with three gap characters on each side.
    """
    states = np.array([tmstate_f(symbol) for symbol in "111:::111"])
    seq_x, seq_y = "123456789", "abc"

    aligned_x, aligned_y = states2alignment(states, seq_x, seq_y)

    self.assertEqual(aligned_x, "123456789")
    self.assertEqual(aligned_y, "---abc---")
def test_states2alignment_1(self):
    """Check the gapped output for the state string ``111:::222``.

    With X = "123456" and Y = "abcdef", X must be padded with three
    trailing gap characters and Y with three leading ones.
    """
    states = np.array([tmstate_f(symbol) for symbol in "111:::222"])
    seq_x, seq_y = "123456", "abcdef"

    aligned_x, aligned_y = states2alignment(states, seq_x, seq_y)

    self.assertEqual(aligned_x, "123456---")
    self.assertEqual(aligned_y, "---abcdef")
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and
    1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than
    for the embedders

    :param pairings: (query id, target id) pairs to align
    :param query_by_id: Maps a query id to its sequence
    :param target_by_id: Maps a target id to its sequence
    :param model_file: Checkpoint that the LightningAligner is loaded from
    :param device: Device the model and the packed sequences are moved to
    :param batch_size: Upper bound on the number of pairings per batch
    :return: One (query id, target id, aligned query, aligned target) tuple
        per pairing, in input order; an empty list for empty `pairings`
    """
    # An empty input would request zero sections from numpy.array_split,
    # which raises ValueError. There is nothing to align, so return before
    # the checkpoint is even loaded.
    if len(pairings) == 0:
        return []

    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching: array_split yields ceil(n / batch_size) roughly
    # equal-sized chunks, none larger than batch_size
    batches = numpy.array_split(pairings, math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        # NOTE(review): zipping with `batch` assumes traceback yields its
        # results in the same order as the pairings - confirm
        for (decoded, _), (query, target) in zip(gen, batch):
            # Only the state of each decoded step is needed
            _, _, pred_states = zip(*decoded)
            pred_alignment = "".join(map(revstate_f, pred_states))
            x_aligned, y_aligned = states2alignment(
                pred_alignment, query_by_id[query], target_by_id[target]
            )
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
def test_states2alignment_9(self):
    """Smoke test: states2alignment must accept this input without raising.

    No result is asserted; the test passes as long as the call completes.
    """
    short_seq = 'HCH'
    long_seq = 'HCAH'
    states = np.array([1, 1, 0, 1])
    # Note the argument order: y ('HCAH') is passed as the first sequence.
    states2alignment(states, long_seq, short_seq)
# Parse the input FASTA into {header: sequence}. Multi-line records are
# concatenated; dicts preserve insertion order, so the first two records
# in the file are the pair that gets aligned.
seqs = {}
seq_id = None
with open(args.input) as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Blank lines carry no data; indexing line[0] on them would
            # raise IndexError.
            continue
        if line.startswith('>'):
            seq_id = line[1:]
            continue
        if seq_id is None:
            raise ValueError(
                "%s: sequence data found before the first '>' header"
                % args.input
            )
        seqs[seq_id] = seqs.get(seq_id, '') + line

keys_list = list(seqs)
if len(keys_list) < 2:
    raise ValueError(
        "%s: expected at least two sequences, found %d"
        % (args.input, len(keys_list))
    )
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]

pred_alignment = model.align(x, y)
x_aligned, y_aligned = states2alignment(pred_alignment, x, y)

# Write the gapped pair as a two-record FASTA; the context manager closes
# the handle even if the write fails.
with open(args.output, "w") as out:
    out.write(">%s\n%s\n>%s\n%s" % (keys_list[0], x_aligned,
                                    keys_list[1], y_aligned))

print(x_aligned)
print(pred_alignment)
print(y_aligned)

# Score the same pair: tokenize both sequences, pack them as a batch of
# one and take the scalar alignment score.
x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()
seq, order = pack_sequences([x_], [y_])
score = model.aligner.score(seq, order).item()