Example #1
    def __init__(self,
                 data_path: Union[str, Path],
                 split: str,
                 label_scheme: str,
                 label: str,
                 tokenizer: Union[str, TAPETokenizer] = 'iupac',
                 in_memory: bool = False):

        if label_scheme not in ('ss8', 'ss4'):
            raise NotImplementedError(f"Unsupported label_scheme: {label_scheme}")

        if split not in ('train', 'valid', 'casp12', 'ts115', 'cb513'):
            raise ValueError(f"Unrecognized split: {split}. Must be one of "
                             f"['train', 'valid', 'casp12', "
                             f"'ts115', 'cb513']")

        if isinstance(tokenizer, str):
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        data_path = Path(data_path)
        data_file = f'secondary_structure/secondary_structure_{split}.lmdb'
        self.data = dataset_factory(data_path / data_file, in_memory)
        if label_scheme == 'ss8':
            self.label = ss8_to_idx[label]
        elif label_scheme == 'ss4':
            self.label = label
        else:
            raise NotImplementedError
        self.label_scheme = label_scheme
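
A minimal usage sketch for the constructor above, assuming it belongs to a class named SecondaryStructureDataset, that 'H' is a key of ss8_to_idx, and that the lmdb files sit under data_path (none of this is shown in the snippet):

# Hypothetical usage; 'SecondaryStructureDataset' is an assumed class name.
dataset = SecondaryStructureDataset(
    data_path='./data',        # directory containing secondary_structure/secondary_structure_{split}.lmdb
    split='train',             # one of: train, valid, casp12, ts115, cb513
    label_scheme='ss8',        # 8-state scheme: label is mapped through ss8_to_idx
    label='H',                 # with 'ss4' the label would be stored as-is instead
    tokenizer='iupac',         # a string is turned into TAPETokenizer(vocab='iupac')
    in_memory=False)
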
Example #2
    def __init__(self,
                 input_file,
                 tokenizer: Union[str, TAPETokenizer] = 'iupac',
                 max_pep_len=30,
                 in_memory: bool = False,
                 instance_weight: bool = False,
                 train: bool = True):
        if isinstance(tokenizer, str):
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer
        self.data = CSVDataset(input_file,
                               max_pep_len=max_pep_len,
                               train=train)
        self.instance_weight = instance_weight
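
A usage sketch under similar caveats: the enclosing class name (here PeptideCSVDataset) and the column layout expected by CSVDataset are assumptions, not part of the snippet.

# Hypothetical usage; 'PeptideCSVDataset' is an assumed class name.
train_set = PeptideCSVDataset(
    input_file='peptides_train.csv',   # parsed by CSVDataset
    tokenizer='iupac',
    max_pep_len=30,                    # forwarded to CSVDataset
    in_memory=False,
    instance_weight=True,              # only stored here; presumably consumed later (e.g. in the loss)
    train=True)
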
Example #3
    def __init__(self,
                 data_path: Union[str, Path],
                 split: str,
                 tokenizer: Union[str, TAPETokenizer] = 'dna',
                 in_memory: bool = False):

        if split not in ('train', 'valid', 'test'):
            raise ValueError(f"Unrecognized split: {split}. Must be one of "
                             f"['train', 'valid', 'test']")

        if isinstance(tokenizer, str):
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        data_path = Path(data_path)
        data_file = f'small_{split}.fa'
        self.data = FastaDataset(data_path / data_file, in_memory=in_memory)
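
A standalone sketch of how a record from this dataset might be turned into token ids. The 'primary' field name on FastaDataset items and the tokenizer.encode call are assumptions about the tape API rather than something shown above.

# Hypothetical helper; field names are assumed, not taken from the snippet.
from tape import TAPETokenizer
from tape.datasets import FastaDataset

def encode_record(data: FastaDataset, tokenizer: TAPETokenizer, index: int):
    item = data[index]                        # raw record read from the .fa file
    return tokenizer.encode(item['primary'])  # assumed: the sequence string is stored under 'primary'
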
Example #4
    def __init__(self,
                 data_path: Union[str, Path],
                 split: str,
                 tokenizer: Union[str, TAPETokenizer] = 'iupac',
                 in_memory: bool = False,
                 max_seqlen: int = 512):

        allowed_splits = ('train', 'valid')
        if split not in allowed_splits:
            raise ValueError(f"Unrecognized split: {split}. Must be one of: "
                             f"{', '.join(allowed_splits)}")

        if isinstance(tokenizer, str):
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        data_path = Path(data_path)
        data_file = f'binding_sites/binding_site_{split}.lmdb'
        self.data = dataset_factory(data_path / data_file, in_memory)
        self.max_seqlen = max_seqlen
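
A usage sketch, assuming the constructor belongs to a class named BindingSiteDataset (the class name is not shown in the snippet):

# Hypothetical usage; 'BindingSiteDataset' is an assumed class name.
dataset = BindingSiteDataset(
    data_path='./data',        # directory containing binding_sites/binding_site_{split}.lmdb
    split='valid',             # only 'train' and 'valid' are accepted
    tokenizer='iupac',
    in_memory=False,
    max_seqlen=512)            # stored on the instance, presumably for truncation downstream
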
Example #5
    def __init__(self,
                 data_path: Union[str, Path],
                 split: str,
                 tokenizer: Union[str, TAPETokenizer] = 'iupac',
                 in_memory: bool = False,
                 max_seq_len=None):

        if split not in ('train', 'train_unfiltered', 'valid', 'test'):
            raise ValueError(f"Unrecognized split: {split}. Must be one of "
                             f"['train', 'train_unfiltered', 'valid', 'test']")

        if isinstance(tokenizer, str):
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        data_path = Path(data_path)
        data_file = f'proteinnet/proteinnet_{split}.lmdb'
        self.data = dataset_factory(data_path / data_file, in_memory)
        self.max_seq_len = max_seq_len
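
A standalone sketch of how max_seq_len might be applied when reading an item; the 'primary' field name in the proteinnet LMDB records is an assumption.

# Hypothetical helper; not part of the original snippet.
def encode_truncated(data, tokenizer, index, max_seq_len=None):
    item = data[index]
    seq = item['primary']                # assumed field name for the amino-acid sequence
    if max_seq_len is not None:          # None is treated as "keep the full-length sequence"
        seq = seq[:max_seq_len]
    return tokenizer.encode(seq)
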
Example #6
    def __init__(self,
                 data_path: Union[str, Path],
                 split: str,
                 tokenizer: Union[str, TAPETokenizer] = 'iupac',
                 in_memory: bool = False):

        if split not in ('train', 'test', 'valid'):
            raise ValueError(f"Unrecognized split: {split}. Must be one of "
                             f"['train', 'test', 'valid']")

        if isinstance(tokenizer, str):
            # If you get tokenizer in as a string, create an actual tokenizer
            tokenizer = TAPETokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        # Define the path to the data file. There are three helper datasets
        # that you can import from tape.datasets - a FastaDataset,
        # a JSONDataset, and an LMDBDataset. You can use these to load raw
        # data from your files (or of course, you can do this manually).
        data_path = Path(data_path)
        data_file = f'deeploc/deeploc_{split}.lmdb'
        self.data = LMDBDataset(data_path / data_file, in_memory=in_memory)
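
As the comment above notes, the loader backend is interchangeable; a minimal sketch of the alternatives, assuming matching .json and .fasta files exist (the file names and the JSONDataset signature are assumptions):

# Hypothetical backend swap for the line above; file names are placeholders.
from pathlib import Path
from tape.datasets import FastaDataset, JSONDataset, LMDBDataset

data_path, split = Path('./data'), 'train'
data = LMDBDataset(data_path / f'deeploc/deeploc_{split}.lmdb', in_memory=False)
# data = JSONDataset(data_path / f'deeploc/deeploc_{split}.json')
# data = FastaDataset(data_path / f'deeploc/deeploc_{split}.fasta', in_memory=False)
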
Example #7
def process_file(input_file, output_file, use_gpu, max_sequence_length,
                 use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length),
                                dtype='uint8')
    dset4 = file.create_dataset('pssm', (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21), dtype='float')
    dset5 = file.create_dataset('primary_token', (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length), dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # loop while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # Masking and padding happen here so that every stored example has the same size.
        # The padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']

        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)

            prim = torch.masked_select(torch.Tensor(primary_padded)
                                       .type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer, next_protein['seq'], next_protein['mask']))

            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                      .view(9, -1).transpose(0, 1).unsqueeze(1) / 100

            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask).view(21, -1).transpose(0, 1)

            if use_gpu:
                pos = pos.cuda()

            angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos,
                                                                           [len(prim)],
                                                                           use_gpu=use_gpu)

            tertiary, _ = get_backbone_positions_from_angular_prediction(angles,
                                                                         batch_sizes,
                                                                         use_gpu=use_gpu)
            tertiary = tertiary.squeeze(1)

            primary_padded = np.zeros(max_sequence_length)
            tertiary_padded = np.zeros((max_sequence_length, 9))
            pssm_padded = np.zeros((max_sequence_length, 21))

            length_after_mask_removed = len(prim)

            primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
            primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
            tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
            pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
            mask_padded = np.zeros(max_sequence_length)
            mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    print("Wrote output to", current_buffer_allocation, "proteins to", output_file)