def __init__(self,
             data_path: Union[str, Path],
             split: str,
             label_scheme: str,
             label: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False):
    if label_scheme not in ('ss8', 'ss4'):
        raise NotImplementedError(f"Unsupported label scheme: {label_scheme}")
    if split not in ('train', 'valid', 'casp12', 'ts115', 'cb513'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'valid', 'casp12', 'ts115', 'cb513']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'secondary_structure/secondary_structure_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)

    # For the 8-state scheme the label is mapped to an integer index;
    # for the 4-state scheme it is stored as-is.
    if label_scheme == 'ss8':
        self.label = ss8_to_idx[label]
    else:  # 'ss4'
        self.label = label
    self.label_scheme = label_scheme
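# A usage sketch, assuming this __init__ belongs to a Dataset class named
# SecondaryStructureDataset (the class name is not shown here) and that
# ss8_to_idx maps DSSP 8-state codes such as 'H' to integer indices.
dataset = SecondaryStructureDataset(
    data_path='./data',
    split='valid',
    label_scheme='ss8',   # 'ss4' would store the label value unchanged
    label='H',
)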
def __init__(self,
             input_file,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             max_pep_len=30,
             in_memory: bool = False,
             instance_weight: bool = False,
             train: bool = True):
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer
    self.data = CSVDataset(input_file, max_pep_len=max_pep_len, train=train)
    self.instance_weight = instance_weight
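# For reference, this is what the 'iupac' tokenizer contributes here:
# TAPETokenizer.encode turns an amino-acid string into a numpy array of token
# ids with <cls>/<sep> added. The peptide string below is only an example value.
from tape import TAPETokenizer

tok = TAPETokenizer(vocab='iupac')
token_ids = tok.encode('SIINFEKL')
print(token_ids.shape)  # (len(peptide) + 2,) because of the added special tokens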
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'dna',
             in_memory: bool = False):
    if split not in ('train', 'valid', 'test'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'valid', 'test']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'small_{split}.fa'
    self.data = FastaDataset(data_path / data_file, in_memory=in_memory)
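# A hedged sketch of loading one of these FASTA splits directly. This assumes
# FastaDataset is tape.datasets.FastaDataset, whose records are dicts with
# 'id' and 'primary' keys; the 'dna' vocab above is presumably a custom
# addition, since stock TAPE ships 'iupac' and 'unirep'.
from tape.datasets import FastaDataset

fasta = FastaDataset('data/small_train.fa', in_memory=True)
record = fasta[0]
print(record['id'], record['primary'][:60])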
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False,
             max_seqlen: int = 512):
    allowed_splits = ('train', 'valid')
    if split not in allowed_splits:
        raise ValueError(f"Unrecognized split: {split}. "
                         f"Must be one of: {', '.join(allowed_splits)}")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'binding_sites/binding_site_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)
    self.max_seqlen = max_seqlen
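# max_seqlen is only stored here; a plausible (hypothetical) __getitem__ for
# this class would clip the sequence to that length before tokenizing. The
# 'primary' key is an assumption about the LMDB records, in line with the
# other TAPE-style datasets in this file.
def __getitem__(self, index: int):
    item = self.data[index]
    sequence = item['primary'][:self.max_seqlen]
    token_ids = self.tokenizer.encode(sequence)
    return token_ids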
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False,
             max_seq_len=None):
    if split not in ('train', 'train_unfiltered', 'valid', 'test'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'train_unfiltered', 'valid', 'test']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'proteinnet/proteinnet_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)
    self.max_seq_len = max_seq_len
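# For context: in TAPE's contact-prediction setup the proteinnet LMDB records
# carry 'primary', 'tertiary' (backbone coordinates), and 'valid_mask', and the
# target contact map is obtained by thresholding pairwise distances at 8
# angstroms. The helper below sketches that step; it is not taken from the
# class above.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def contact_map_from_tertiary(tertiary: np.ndarray, threshold: float = 8.0) -> np.ndarray:
    # tertiary: (seq_len, 3) coordinates -> binary (seq_len, seq_len) contact map
    return np.less(squareform(pdist(tertiary)), threshold).astype(np.int64)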
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False):
    if split not in ('train', 'test', 'valid'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'test', 'valid']")
    if isinstance(tokenizer, str):
        # If the tokenizer is passed in as a string, build an actual tokenizer from it
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    # Define the path to the data file. There are three helper datasets
    # that you can import from tape.datasets - a FastaDataset,
    # a JSONDataset, and an LMDBDataset. You can use these to load raw
    # data from your files (or, of course, you can do this manually).
    data_path = Path(data_path)
    data_file = f'deeploc/deeploc_{split}.lmdb'
    self.data = LMDBDataset(data_path / data_file, in_memory=in_memory)
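# A typical companion __getitem__/collate_fn for a TAPE-style dataset like the
# one above: tokenize the 'primary' sequence and return the localization label.
# The key names ('primary', 'localization') and the use of pad_sequences are
# assumptions in the spirit of TAPE's LMDB records, not taken from this file.
import numpy as np
import torch
from tape.datasets import pad_sequences

def __getitem__(self, index: int):
    item = self.data[index]
    token_ids = self.tokenizer.encode(item['primary'])
    input_mask = np.ones_like(token_ids)
    return token_ids, input_mask, item['localization']

def collate_fn(self, batch):
    input_ids, input_mask, labels = tuple(zip(*batch))
    input_ids = torch.from_numpy(pad_sequences(input_ids, 0))
    input_mask = torch.from_numpy(pad_sequences(input_mask, 0))
    targets = torch.LongTensor(labels)
    return {'input_ids': input_ids, 'input_mask': input_mask, 'targets': targets}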
def process_file(input_file, output_file, use_gpu, max_sequence_length, use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file with resizable datasets, grown one protein at a time
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='uint8')
    dset4 = file.create_dataset('pssm', (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21), dtype='float')
    dset5 = file.create_dataset('primary_token', (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length), dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])
        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)

        # grow the datasets by one row when the buffer is full
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # Masking and padding happen here so that every stored example has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)
            prim = torch.masked_select(torch.Tensor(primary_padded).type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer, next_protein['seq'], next_protein['mask']))
            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask).view(21, -1).transpose(0, 1)
            if use_gpu:
                pos = pos.cuda()

            # recompute backbone coordinates from the dihedral angles of the masked positions
            angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos, [len(prim)], use_gpu=use_gpu)
            tertiary, _ = get_backbone_positions_from_angular_prediction(angles, batch_sizes, use_gpu=use_gpu)
            tertiary = tertiary.squeeze(1)

            # re-pad the mask-filtered arrays (note the per-residue layouts expected by the datasets)
            primary_padded = np.zeros(max_sequence_length)
            tertiary_padded = np.zeros((max_sequence_length, 9))
            pssm_padded = np.zeros((max_sequence_length, 21))
            length_after_mask_removed = len(prim)
            primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
            primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
            tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
            pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
            mask_padded = np.zeros(max_sequence_length)
            mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    input_file_pointer.close()
    file.close()
    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
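# A minimal usage sketch. The module name `preprocessing` and the file names
# below are placeholders (not given above); note that process_file prepends
# "data/raw/" to input_file when opening it.
import h5py
from preprocessing import process_file  # hypothetical module name

process_file("casp12_testing", "data/preprocessed/casp12_testing.h5",
             use_gpu=False, max_sequence_length=500)

# Inspect the result; the dataset names match the ones created above.
with h5py.File("data/preprocessed/casp12_testing.h5", "r") as f:
    print(f["primary"].shape)        # (num_proteins, max_sequence_length)
    print(f["tertiary"].shape)       # (num_proteins, max_sequence_length, 9)
    print(f["primary_token"].shape)  # (num_proteins, 2 * max_sequence_length)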