def setUp(self):
    # Load the pretrained BiLM language model used to embed sequences.
    path = pretrained_language_models['bilstm']
    self.embedding = BiLM()
    self.embedding.load_state_dict(torch.load(path))
    self.embedding.eval()
    self.tokenizer = UniprotTokenizer(pad_ends=False)
    # Alphabet size and the input/hidden/embedding dimensions of the aligner.
    nalpha, ninput, nunits, nembed = 22, 1024, 1024, 1024
    self.aligner = NeedlemanWunschAligner(nalpha, ninput, nunits, nembed)
def test_tokenizer_encode_no_padding(self):
    tokenizer = UniprotTokenizer(pad_ends=False)
    x = 'ARNDCQEGHILKMFPSTWYVXOUBZ'
    x = str.encode(x)
    res = tokenizer(x)
    exp = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                    11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                    11, 4, 20, 20])
    npt.assert_allclose(exp, res)
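# --- Illustrative sketch (not part of the library) ---------------------------
# The expected array in the test above implies that the tokenizer indexes the
# 21-letter alphabet 'ARNDCQEGHILKMFPSTWYVX' as 0..20 and folds the rare or
# ambiguous residues back onto it (O -> K, U -> C, B/Z -> X). The helper below
# is a hypothetical re-implementation used only to make that mapping explicit.
TOY_ALPHABET = 'ARNDCQEGHILKMFPSTWYVX'          # indices 0..20
TOY_FOLD_BACK = {'O': 'K', 'U': 'C', 'B': 'X', 'Z': 'X'}

def toy_encode(seq: bytes):
    return [TOY_ALPHABET.index(TOY_FOLD_BACK.get(c, c)) for c in seq.decode()]

assert toy_encode(b'ARNDCQEGHILKMFPSTWYVXOUBZ') == list(range(21)) + [11, 4, 20, 20]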
def __init__(self, pairs, tokenizer=UniprotTokenizer()):
    """ Read in pairs of proteins.

    Parameters
    ----------
    pairs : np.array of str
        Pairs of proteins that are aligned. This includes gaps and
        requires that the proteins have the same length.
    tokenizer : UniprotTokenizer
        Converts residues to one-hot encodings.
    """
    self.pairs = pairs
    self.tokenizer = tokenizer
def __init__(self, args):
    super(LightningAligner, self).__init__()
    self.tokenizer = UniprotTokenizer(pad_ends=False)
    self.hparams = args
    self.initialize_aligner()
    if self.hparams.loss == 'sse':
        self.loss_func = SoftAlignmentLoss()
    elif self.hparams.loss == 'cross_entropy':
        self.loss_func = MatrixCrossEntropy()
    elif self.hparams.loss == 'path':
        self.loss_func = SoftPathLoss()
    else:
        raise ValueError(f'`{args.loss}` is not implemented.')
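# --- Usage sketch (assumptions flagged inline) --------------------------------
# The constructor above only dispatches on `args.loss`; everything else it needs
# is read inside initialize_aligner() and is not shown in this snippet. An
# argparse-style namespace is assumed here purely for illustration.
from argparse import Namespace

args = Namespace(loss='cross_entropy')  # 'sse' and 'path' are the other accepted values
# LightningAligner(args) would additionally require the aligner hyperparameters
# consumed by initialize_aligner(), which are omitted here.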
def __init__(self, query_file, db_file, tokenizer=UniprotTokenizer()):
    """ Read in query and database protein sequences.

    Parameters
    ----------
    query_file : path
        Path to query protein sequences.
    db_file : path
        Path to database protein sequences.
    tokenizer : UniprotTokenizer
        Converts residues to one-hot encodings.
    """
    self.tokenizer = tokenizer
    self.query_file = query_file
    self.db_file = db_file
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST.

    Returns a list of (query id, target id, query aligned, target aligned).

    The model on its own takes between 740MiB (Titan X, torch 1.5) and
    1284MiB (RTX 8000, torch 1.7). Note that the batch size has much less
    of an impact for DeepBLAST than for the embedders.
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings, math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(list(map(revstate_f, pred_states)))
            x_aligned, y_aligned = states2alignment(
                pred_alignment, query_by_id[query], target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
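# --- Usage sketch for deepblast_align (paths and sequences are placeholders) --
# This only exercises the function defined above; the checkpoint path and the
# toy sequences are hypothetical, and batch_size/device follow the docstring's
# note that memory use is dominated by the model rather than the batch.
import torch

query_by_id = {'q1': 'MKTAYIAKQR'}
target_by_id = {'t1': 'MKTAYIAKQL'}
pairings = [('q1', 't1')]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results = deepblast_align(pairings, query_by_id, target_by_id,
                          model_file='checkpoints/deepblast.ckpt',  # hypothetical path
                          device=device, batch_size=16)
for query_id, target_id, query_aln, target_aln in results:
    print(query_id, target_id)
    print(query_aln)
    print(target_aln)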
def __init__(self, path, tokenizer=UniprotTokenizer(), tm_threshold=0.4,
             max_len=1024, pad_ends=False, clip_ends=True,
             mask_gaps=True, return_names=False, construct_paths=False):
    """ Read in pairs of proteins.

    This assumes that columns are labeled as
    | chain1_name | chain2_name | tmscore1 | tmscore2 | rmsd |
    | chain1 | chain2 | alignment |

    Parameters
    ----------
    path : path
        Data path to aligned protein pairs. This includes gaps and
        requires that the proteins have the same length.
    tokenizer : UniprotTokenizer
        Converts residues to one-hot encodings.
    tm_threshold : float
        Minimum threshold to investigate alignments.
    max_len : float
        Maximum sequence length to be aligned.
    pad_ends : bool
        Specifies if the ends of the sequences should be padded or not.
    clip_ends : bool
        Specifies if the ends of the alignments should be clipped or not.
    mask_gaps : bool
        Specifies if the mask for the gaps should be constructed.
    return_names : bool
        Specifies if the names of the proteins should be returned.
    construct_paths : bool
        Specifies if path distances should be calculated.

    Notes
    -----
    There are start/stop tokens that are incorporated into the alignment.
    The Needleman-Wunsch algorithm assumes this to be true.
    """
    self.tokenizer = tokenizer
    self.tm_threshold = tm_threshold
    self.max_len = max_len
    self.pairs = pd.read_table(path, header=None)
    self.construct_paths = construct_paths
    cols = [
        'chain1_name', 'chain2_name', 'tmscore1', 'tmscore2', 'rmsd',
        'chain1', 'chain2', 'alignment'
    ]
    self.pairs.columns = cols
    self.pairs['tm'] = np.maximum(self.pairs['tmscore1'],
                                  self.pairs['tmscore2'])
    self.pairs['length'] = self.pairs.apply(
        lambda x: max(len(x['chain1']), len(x['chain2'])), axis=1)
    idx = np.logical_and(self.pairs['tm'] > self.tm_threshold,
                         self.pairs['length'] < self.max_len)
    self.pairs = self.pairs.loc[idx]
    # TODO: pad_ends needs to be documented properly
    self.pad_ends = pad_ends
    self.clip_ends = clip_ends
    self.mask_gaps = mask_gaps
    self.return_names = return_names
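# --- Usage sketch (class name assumed; keyword values mirror the defaults) ----
# The constructor reads a headerless tab-separated table with the eight columns
# listed in `cols`. `TMAlignDataset` is the assumed name of the class this
# __init__ belongs to, and 'test_tm_align.tab' is the test fixture referenced
# in the setUp further down.
dataset = TMAlignDataset(get_data_path('test_tm_align.tab'),
                         tokenizer=UniprotTokenizer(pad_ends=False),
                         tm_threshold=0.4, max_len=1024,
                         pad_ends=False, clip_ends=True,
                         mask_gaps=True, return_names=False)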
def __init__(self, pairs, tokenizer=UniprotTokenizer()):
    self.tokenizer = tokenizer
    self.pairs = pairs
def setUp(self):
    path = pretrained_language_models['bilstm']
    self.embedding = BiLM()
    self.embedding.load_state_dict(torch.load(path))
    self.embedding.eval()
    self.tokenizer = UniprotTokenizer()
def setUp(self):
    self.data_path = get_data_path('test_tm_align.tab')
    self.tokenizer = UniprotTokenizer(pad_ends=False)
def test_tokenizer(self):
    tokenizer = UniprotTokenizer(pad_ends=True)
    res = tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
    # Need to account for padding and offset
    exp = np.array([20] + list(range(0, 21)) + [11, 4, 20, 20] + [20])
    npt.assert_allclose(res, exp)
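# --- Consistency sketch (assumes the two tokenizer tests above stay in sync) --
# With pad_ends=True the expected array is just the unpadded encoding bracketed
# by the index-20 token on both ends, so the two behaviours can be related
# directly:
unpadded = UniprotTokenizer(pad_ends=False)(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
padded = UniprotTokenizer(pad_ends=True)(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
npt.assert_allclose(padded, np.concatenate([[20], unpadded, [20]]))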