def __init__(self,
             data_path: Union[str, Path],
             split: str,
             label_scheme: str,
             label: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False):
    if label_scheme not in ('ss8', 'ss4'):
        raise NotImplementedError(f"Unsupported label scheme: {label_scheme}")
    if split not in ('train', 'valid', 'casp12', 'ts115', 'cb513'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'valid', 'casp12', 'ts115', 'cb513']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'secondary_structure/secondary_structure_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)

    # For the 8-state scheme the label is mapped to an integer index;
    # for the 4-state scheme it is stored as-is.
    if label_scheme == 'ss8':
        self.label = ss8_to_idx[label]
    else:  # 'ss4'
        self.label = label
    self.label_scheme = label_scheme
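# A usage sketch, assuming this __init__ belongs to a Dataset class named
# SecondaryStructureDataset (the class name is not shown here) and that
# ss8_to_idx maps DSSP 8-state codes such as 'H' to integer indices.
dataset = SecondaryStructureDataset(
    data_path='./data',
    split='valid',
    label_scheme='ss8',   # 'ss4' would store the label value unchanged
    label='H',
)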
def __init__(self,
             input_file,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             max_pep_len=30,
             in_memory: bool = False,
             instance_weight: bool = False,
             train: bool = True):
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer
    self.data = CSVDataset(input_file, max_pep_len=max_pep_len, train=train)
    self.instance_weight = instance_weight
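# For reference, this is what the 'iupac' tokenizer contributes here:
# TAPETokenizer.encode turns an amino-acid string into a numpy array of token
# ids with <cls>/<sep> added. The peptide string below is only an example value.
from tape import TAPETokenizer

tok = TAPETokenizer(vocab='iupac')
token_ids = tok.encode('SIINFEKL')
print(token_ids.shape)  # (len(peptide) + 2,) because of the added special tokens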
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'dna',
             in_memory: bool = False):
    if split not in ('train', 'valid', 'test'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'valid', 'test']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'small_{split}.fa'
    self.data = FastaDataset(data_path / data_file, in_memory=in_memory)
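# A hedged sketch of loading one of these FASTA splits directly. This assumes
# FastaDataset is tape.datasets.FastaDataset, whose records are dicts with
# 'id' and 'primary' keys; the 'dna' vocab above is presumably a custom
# addition, since stock TAPE ships 'iupac' and 'unirep'.
from tape.datasets import FastaDataset

fasta = FastaDataset('data/small_train.fa', in_memory=True)
record = fasta[0]
print(record['id'], record['primary'][:60])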
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False,
             max_seqlen: int = 512):
    allowed_splits = ('train', 'valid')
    if split not in allowed_splits:
        raise ValueError(f"Unrecognized split: {split}. "
                         f"Must be one of: {', '.join(allowed_splits)}")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'binding_sites/binding_site_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)
    self.max_seqlen = max_seqlen
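# max_seqlen is only stored here; a plausible (hypothetical) __getitem__ for
# this class would clip the sequence to that length before tokenizing. The
# 'primary' key is an assumption about the LMDB records, in line with the
# other TAPE-style datasets in this file.
def __getitem__(self, index: int):
    item = self.data[index]
    sequence = item['primary'][:self.max_seqlen]
    token_ids = self.tokenizer.encode(sequence)
    return token_ids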
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False,
             max_seq_len=None):
    if split not in ('train', 'train_unfiltered', 'valid', 'test'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'train_unfiltered', 'valid', 'test']")
    if isinstance(tokenizer, str):
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    data_path = Path(data_path)
    data_file = f'proteinnet/proteinnet_{split}.lmdb'
    self.data = dataset_factory(data_path / data_file, in_memory)
    self.max_seq_len = max_seq_len
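# For context: in TAPE's contact-prediction setup the proteinnet LMDB records
# carry 'primary', 'tertiary' (backbone coordinates), and 'valid_mask', and the
# target contact map is obtained by thresholding pairwise distances at 8
# angstroms. The helper below sketches that step; it is not taken from the
# class above.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def contact_map_from_tertiary(tertiary: np.ndarray, threshold: float = 8.0) -> np.ndarray:
    # tertiary: (seq_len, 3) coordinates -> binary (seq_len, seq_len) contact map
    return np.less(squareform(pdist(tertiary)), threshold).astype(np.int64)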
def __init__(self,
             data_path: Union[str, Path],
             split: str,
             tokenizer: Union[str, TAPETokenizer] = 'iupac',
             in_memory: bool = False):
    if split not in ('train', 'test', 'valid'):
        raise ValueError(f"Unrecognized split: {split}. Must be one of "
                         f"['train', 'test', 'valid']")
    if isinstance(tokenizer, str):
        # If the tokenizer is passed in as a string, build an actual tokenizer from it
        tokenizer = TAPETokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer

    # Define the path to the data file. There are three helper datasets
    # that you can import from tape.datasets - a FastaDataset,
    # a JSONDataset, and an LMDBDataset. You can use these to load raw
    # data from your files (or, of course, you can do this manually).
    data_path = Path(data_path)
    data_file = f'deeploc/deeploc_{split}.lmdb'
    self.data = LMDBDataset(data_path / data_file, in_memory=in_memory)
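# A typical companion __getitem__/collate_fn for a TAPE-style dataset like the
# one above: tokenize the 'primary' sequence and return the localization label.
# The key names ('primary', 'localization') and the use of pad_sequences are
# assumptions in the spirit of TAPE's LMDB records, not taken from this file.
import numpy as np
import torch
from tape.datasets import pad_sequences

def __getitem__(self, index: int):
    item = self.data[index]
    token_ids = self.tokenizer.encode(item['primary'])
    input_mask = np.ones_like(token_ids)
    return token_ids, input_mask, item['localization']

def collate_fn(self, batch):
    input_ids, input_mask, labels = tuple(zip(*batch))
    input_ids = torch.from_numpy(pad_sequences(input_ids, 0))
    input_mask = torch.from_numpy(pad_sequences(input_mask, 0))
    targets = torch.LongTensor(labels)
    return {'input_ids': input_ids, 'input_mask': input_mask, 'targets': targets}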
def process_file(input_file, output_file, use_gpu, max_sequence_length, use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file with resizable datasets, grown one protein at a time
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='uint8')
    dset4 = file.create_dataset('pssm', (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21), dtype='float')
    dset5 = file.create_dataset('primary_token', (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length), dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])
        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)

        # grow the datasets by one row when the buffer is full
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # Masking and padding happen here so that every stored example has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)
            prim = torch.masked_select(torch.Tensor(primary_padded).type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer, next_protein['seq'], next_protein['mask']))
            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask).view(21, -1).transpose(0, 1)
            if use_gpu:
                pos = pos.cuda()

            # recompute backbone coordinates from the dihedral angles of the masked positions
            angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos, [len(prim)], use_gpu=use_gpu)
            tertiary, _ = get_backbone_positions_from_angular_prediction(angles, batch_sizes, use_gpu=use_gpu)
            tertiary = tertiary.squeeze(1)

            # re-pad the mask-filtered arrays (note the per-residue layouts expected by the datasets)
            primary_padded = np.zeros(max_sequence_length)
            tertiary_padded = np.zeros((max_sequence_length, 9))
            pssm_padded = np.zeros((max_sequence_length, 21))
            length_after_mask_removed = len(prim)
            primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
            primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
            tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
            pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
            mask_padded = np.zeros(max_sequence_length)
            mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    input_file_pointer.close()
    file.close()
    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
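# A minimal usage sketch. The module name `preprocessing` and the file names
# below are placeholders (not given above); note that process_file prepends
# "data/raw/" to input_file when opening it.
import h5py
from preprocessing import process_file  # hypothetical module name

process_file("casp12_testing", "data/preprocessed/casp12_testing.h5",
             use_gpu=False, max_sequence_length=500)

# Inspect the result; the dataset names match the ones created above.
with h5py.File("data/preprocessed/casp12_testing.h5", "r") as f:
    print(f["primary"].shape)        # (num_proteins, max_sequence_length)
    print(f["tertiary"].shape)       # (num_proteins, max_sequence_length, 9)
    print(f["primary_token"].shape)  # (num_proteins, 2 * max_sequence_length)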