예제 #1
0
def alignment_text(x, y, pred, truth, stats):
    """ Render the ground-truth and predicted alignments as plain text

    Parameters
    ----------
    x : str
        Protein X
    y : str
        Protein Y
    pred : list of int
        Predicted states
    truth : list of int
        Ground truth states
    stats : list of float
        List of statistics from roc_edges

    Returns
    -------
    str
        One line of ``name: value`` statistics (rounded to 2 decimals),
        followed by a ``# Ground truth`` section and a ``# Prediction``
        section, each holding the two aligned rows indented by four spaces.
    """
    # TODO: we got the truth and prediction edges swapped somewhere earlier
    true_alignment = states2alignment(truth, x, y)
    pred_alignment = states2alignment(pred, x, y)

    # Statistics are labelled positionally; zip stops at the shorter of the
    # label list and the supplied values.
    labels = ['tp', 'fp', 'fn', 'perc_id', 'ppv', 'fnr', 'fdr']
    rounded = [np.round(value, 2) for value in stats]
    stats_viz = ' '.join(
        [f'{label}: {value}' for label, value in zip(labels, rounded)]
    )

    truth_viz = ('# Ground truth\n'
                 f'    {true_alignment[0]}\n    {true_alignment[1]}')
    pred_viz = ('# Prediction\n'
                f'    {pred_alignment[0]}\n    {pred_alignment[1]}')

    return '\n'.join([stats_viz, truth_viz, pred_viz])
예제 #2
0
 def test_states2alignment_8(self):
     """Smoke test: states2alignment must not raise on this state path."""
     seq_x = 'HECDDCSKQFSRNNHLAKHLRAH'
     seq_y = 'YRCHKVCPYTFVGKSDLDLHQFITAH'
     states = np.array([
         1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 1
     ])
     # The sequences are deliberately passed as (y, x) here.
     states2alignment(states, seq_y, seq_x)
예제 #3
0
 def test_states2alignment_10(self):
     """Smoke test: a path mixing all three state values must not raise."""
     first_seq = 'YACSGGCGQNFRTMSEFNEHMIRLVH'
     second_seq = 'LICPKHTRDCGKVFKRNSSLRVHEH'
     states = np.array([
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0, 1,
         2, 0, 1, 1, 1, 1
     ])
     # No assertion on the result; the call only has to succeed.
     states2alignment(states, first_seq, second_seq)
예제 #4
0
 def test_states2alignment_11(self):
     """Smoke test: a long run of 2s, two 1s, then a long run of 0s."""
     first_seq = 'LNCKEIKKYCEMSFRNPDDIRKHRGAIH'
     second_seq = 'YTCSSCNESLRTAWCLNKHLR'
     states = np.array([
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0
     ])
     # No assertion on the result; the call only has to succeed.
     states2alignment(states, first_seq, second_seq)
예제 #5
0
    def test_decoding2(self):
        """Smoke test: a traceback over dm.txt converts to an alignment."""
        # Decode the matrix stored in the test data file.
        matrix = torch.Tensor(np.loadtxt(get_data_path('dm.txt')))
        decoder = NeedlemanWunschDecoder(self.operator)
        path = decoder.traceback(matrix)

        # Each decoded element is a triple; only the state column is used.
        _, _, state_column = zip(*path)

        seq_x = 'HECDRKTCDESFSTKGNLRVHKLGH'
        seq_y = 'LKCSGCGKNFKSQYAYKRHEQTH'
        states2alignment(np.array(state_column), seq_x, seq_y)
예제 #6
0
 def test_states2alignment_3(self):
     """Smoke test: long X against a short Y, both wrapped in 'X' markers."""
     seq_x = ('XSDHGDVSLPPEDRVRALSQLGSAVEVNEDIPPRRYFRSGVEIIRMA'
              'SIYSEEGNIEHAFILYNKYITLFIEKLPKHRDYKSAVIPEKKDTVK'
              'KLKEIAFPKAEELKAELLKRYTKEYTEYNEEKKKEAEELARNMAIQ'
              'QELX')
     seq_y = 'XIDVLRAKAAKERAERRLQSQQDDIDFKRAELALKRAMNRLSVAEMKX'
     states = np.array([
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
     ])
     # No assertion on the result; the call only has to succeed.
     states2alignment(states, seq_x, seq_y)
예제 #7
0
 def test_states2alignment_7(self):
     """Smoke test: short X against a long Y, both wrapped in 'X' markers."""
     seq_x = 'XGSSGSSGFDENWGADEELLLIDACETLGLGNWADIADYVGNARTKEECRDHYLKTYIEX'
     seq_y = ('XGEIRVGNRYQADITDLLKEGEEDGRDQSRLETQVWEAHNPLTDKQIDQFLVVARSVGTF'
              'ARALDSLHMSAAAASRDITLFHAMDTLHKNIYDISKAISALVPQGGPVLCRDEMEEWSAS'
              'EANLFEEALEKYGKDFTDIQQDFLPWKSLTSIIEYYYMWKTTX')
     states = np.array([
         1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
     ])
     # No assertion on the result; the call only has to succeed.
     states2alignment(states, seq_x, seq_y)
예제 #8
0
 def test_states2alignment_2(self):
     """Y lands on the middle of X and is gap-padded on both sides."""
     # Translate the TM-style state string one character at a time.
     states = np.array([tmstate_f(symbol) for symbol in "111:::111"])
     seq_x = "123456789"
     seq_y = "abc"

     aligned_x, aligned_y = states2alignment(states, seq_x, seq_y)

     self.assertEqual(aligned_x, "123456789")
     self.assertEqual(aligned_y, "---abc---")
예제 #9
0
 def test_states2alignment_1(self):
     """X overhangs on the left and Y on the right; both get gap padding."""
     # Translate the TM-style state string one character at a time.
     states = np.array([tmstate_f(symbol) for symbol in "111:::222"])
     seq_x = "123456"
     seq_y = "abcdef"

     aligned_x, aligned_y = states2alignment(states, seq_x, seq_y)

     self.assertEqual(aligned_x, "123456---")
     self.assertEqual(aligned_y, "---abcdef")
예제 #10
0
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and 1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than for the embedders

    :param pairings: (query id, target id) pairs to align
    :param query_by_id: query sequences, looked up by the query ids in `pairings`
    :param target_by_id: target sequences, looked up by the target ids in `pairings`
    :param model_file: checkpoint passed to `LightningAligner.load_from_checkpoint`
    :param device: device the model and the packed sequences are moved to
    :param batch_size: upper bound on the number of pairs per batch; must be >= 1
    :raises ValueError: if `batch_size` is smaller than 1
    :return: one (query id, target id, query aligned, target aligned) tuple per
        pairing, in input order; an empty list when `pairings` is empty
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )
    if len(pairings) == 0:
        # Nothing to align. Returning here also avoids asking array_split for
        # zero sections (a ValueError) and loading the checkpoint needlessly.
        return []

    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching: array_split yields near-equal chunks, none of which is
    # larger than batch_size.
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            # Each decoded element is a triple; only the state column is used.
            _, _, pred_states = zip(*decoded)
            pred_alignment = "".join(map(revstate_f, pred_states))
            x_aligned, y_aligned = states2alignment(pred_alignment,
                                                    query_by_id[query],
                                                    target_by_id[target])
            # array_split turned the ids into numpy strings; hand plain str
            # back to the caller as the return annotation promises.
            alignments.append(
                (str(query), str(target), x_aligned, y_aligned)
            )
    return alignments
예제 #11
0
 def test_states2alignment_9(self):
     """Smoke test: minimal four-state path with a single 0 state."""
     seq_x = 'HCH'
     seq_y = 'HCAH'
     states = np.array([1, 1, 0, 1])
     # The sequences are deliberately passed as (y, x) here.
     states2alignment(states, seq_y, seq_x)
예제 #12
0
# Read the input FASTA into {record id: concatenated sequence lines}.
seqs = {}
seq_id = None
with open(args.input) as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Blank lines carry no data; indexing them used to raise
            # IndexError.
            continue
        if line.startswith('>'):
            seq_id = line[1:]
            continue
        if seq_id is None:
            raise ValueError(
                "sequence data found before the first '>' header in %s"
                % args.input
            )
        seqs[seq_id] = seqs.get(seq_id, '') + line

# Only the first two records (in file order) are aligned.
keys_list = list(seqs)
if len(keys_list) < 2:
    raise ValueError(
        "expected at least two sequences in %s, found %d"
        % (args.input, len(keys_list))
    )
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]
pred_alignment = model.align(x, y)

x_aligned, y_aligned = states2alignment(pred_alignment, x, y)

# Write the aligned pair as a two-record FASTA; the context manager closes
# the file even if the write fails.
with open(args.output, "w") as out:
    out.write(">%s\n%s\n>%s\n%s" % (keys_list[0], x_aligned, keys_list[1], y_aligned))

print(x_aligned)
print(pred_alignment)
print(y_aligned)

# Tokenize both sequences and pack them the way the aligner's scorer expects.
x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()

seq, order = pack_sequences([x_], [y_])

score = model.aligner.score(seq, order).item()