def compute_embeddings(
    self,
    sequences: Union[List[str], str],
    batch_size: int = 1,
    pool_mode: Tuple[str, ...] = ("cls", "mean"),
    tokens_list: Optional[List[str]] = None,
    silent: bool = False,
) -> Dict[str, np.ndarray]:
    """Compute embeddings of sequences.

    The model embedding has shape (num_sequences, num_tokens, embeddings_size),
    so each aggregation function listed in pool_mode is used to reduce the
    num_tokens dimension; 'mean', for instance, averages over that dimension.

    Args:
        sequences: List of sequences or path of fasta file
        batch_size: Batch size
        pool_mode: Modes of pooling ('cls', 'mean', 'min', 'max'); 'full' is
            also accepted and requires sequences of the same length
        tokens_list: List of tokens to consider
        silent: whether to display the progress bar

    Returns:
        Dict[str, np.ndarray]: dictionary mapping each pool mode to an array
            of shape [number_of_sequences, embeddings_size]
    """
    if "full" in pool_mode and not all(
            len(s) == len(sequences[0]) for s in sequences):
        raise ValueError(
            'Sequences must be of same length when pool_mode = ("full",)')

    if tokens_list is None:
        tokens_list = NATURAL_AAS_LIST

    if isinstance(sequences, str):
        sequences = load_fasta(sequences)

    _check_sequence(sequences, self.model_dir, 1024)
    _check_memory_embeddings(sequences, self.embeddings_size, pool_mode)

    inputs, _, tokens = self._process_sequences_and_tokens(
        sequences, tokens_list)
    embeddings_dict = dict(
        zip(pool_mode, [torch.Tensor()] * len(pool_mode)))

    for batch_inputs in tqdm(
            self._generate_chunks(inputs, batch_size),
            total=self._get_num_batch_iter(inputs, batch_size),
            disable=silent,
    ):
        _, batch_embeddings = self._model_pass(batch_inputs)
        batch_labels = batch_inputs["input_ids"]

        batch_embeddings_dict = self._filter_and_pool_embeddings(
            batch_embeddings, batch_labels, tokens, pool_mode)

        for key in pool_mode:
            embeddings_dict[key] = torch.cat(
                (embeddings_dict[key], batch_embeddings_dict[key]), dim=0)

    return {key: value.numpy() for key, value in embeddings_dict.items()}
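# A minimal usage sketch for compute_embeddings. The wrapper object below
# ("bio_trans") and the toy sequences are assumptions for illustration; only
# the method signature and return shape come from this file.
#
#     sequences = ["MKTVRQERLKSIVRILERSKEPVSGAQ", "KALTARQQEVFDLIRDHISQTGMPPTR"]
#     embeddings = bio_trans.compute_embeddings(
#         sequences, batch_size=2, pool_mode=("cls", "mean")
#     )
#     cls_emb = embeddings["cls"]    # np.ndarray of shape (2, embeddings_size)
#     mean_emb = embeddings["mean"]  # np.ndarray of shape (2, embeddings_size)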
def compute_logits(
    self,
    sequences: Union[List[str], str],
    batch_size: int = 1,
    tokens_list: Optional[List[str]] = None,
    pass_mode: str = "forward",
    silent: bool = False,
) -> List[np.ndarray]:
    """Compute the logits of sequences.

    Returns a list with one logits array per sequence; each array is
    restricted to the amino acids of interest.

    Args:
        sequences: List of sequences or path of fasta file
        batch_size: number of sequences to consider for the forward pass
        pass_mode: Mode of model evaluation ('forward' or 'masked')
        tokens_list: List of tokens to consider
        silent: whether to display the progress bar

    Returns:
        List[np.ndarray]: one array of logits per sequence
    """
    if tokens_list is None:
        tokens_list = NATURAL_AAS_LIST

    if isinstance(sequences, str):
        sequences = load_fasta(sequences)

    _check_sequence(sequences, self.model_dir, 1024)
    _check_memory_logits(sequences, self.vocab_size, pass_mode)

    inputs, labels, tokens = self._process_sequences_and_tokens(
        sequences, tokens_list)
    logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
    logits, labels = self._filter_logits(logits, labels, tokens)

    lengths = [len(sequence) for sequence in sequences]
    splitted_logits = torch.split(logits, lengths, dim=0)
    splitted_logits = [logits.numpy() for logits in splitted_logits]

    return splitted_logits
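# A minimal usage sketch for compute_logits, under the same assumed "bio_trans"
# wrapper object as above; only the method signature comes from this file.
#
#     logits_per_seq = bio_trans.compute_logits(
#         ["MKTVRQERLKSIVRILERSKEPVSGAQ"], batch_size=1, pass_mode="masked"
#     )
#     # logits_per_seq[0] has one row per residue and one column per token kept
#     # in tokens_list (the natural amino acids by default).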
def compute_calibration(
    self,
    sequences: Union[List[str], str],
    batch_size: int = 1,
    pass_mode: str = "forward",
    tokens_list: Optional[List[str]] = None,
    n_bins: int = 10,
    silent: bool = False,
) -> Dict[str, Any]:
    """Compute model calibration from the input sequences.

    Args:
        sequences: List of sequences or path of fasta file
        batch_size: Batch size. Defaults to 1.
        pass_mode: Mode of model evaluation ('forward' or 'masked').
            Defaults to "forward".
        tokens_list: List of tokens to consider. Defaults to None.
        n_bins: number of confidence bins used for calibration. Defaults to 10.
        silent: whether to display the progress bar

    Returns:
        Dict[str, Any]: calibration results computed over n_bins confidence bins
    """
    if tokens_list is None:
        tokens_list = NATURAL_AAS_LIST

    if isinstance(sequences, str):
        sequences = load_fasta(sequences)

    _check_sequence(sequences, self.model_dir, 1024)

    inputs, labels, tokens = self._process_sequences_and_tokens(
        sequences, tokens_list)
    logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
    logits, labels = self._filter_logits(logits, labels, tokens)
    calibration_dict = self._compute_calibration(logits, labels, n_bins)

    return calibration_dict
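# A minimal usage sketch for compute_calibration, under the same assumed
# "bio_trans" wrapper object; the fasta path is a placeholder and the exact
# keys of the returned dictionary depend on _compute_calibration, which is not
# shown in this file.
#
#     calibration = bio_trans.compute_calibration(
#         "proteins.fasta", batch_size=4, pass_mode="forward", n_bins=10
#     )
#     print(calibration)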
def compute_loglikelihood(
    self,
    sequences: Union[List[str], str],
    batch_size: int = 1,
    tokens_list: Optional[List[str]] = None,
    pass_mode: str = "forward",
    silent: bool = False,
) -> np.ndarray:
    """Compute the log-likelihoods of sequences.

    Args:
        sequences: List of sequences or path of fasta file
        batch_size: Batch size
        pass_mode: Mode of model evaluation ('forward' or 'masked')
        tokens_list: List of tokens to consider
        silent: whether to display the progress bar

    Returns:
        np.ndarray: array of log-likelihoods, one per sequence
    """
    if tokens_list is None:
        tokens_list = NATURAL_AAS_LIST

    if isinstance(sequences, str):
        sequences = load_fasta(sequences)

    _check_sequence(sequences, self.model_dir, 1024)
    _check_memory_logits(sequences, self.vocab_size, pass_mode)

    inputs, labels, tokens = self._process_sequences_and_tokens(
        sequences, tokens_list)
    logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
    loglikelihoods = self._filter_loglikelihoods(logits, labels, tokens)

    return loglikelihoods.numpy()
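# A minimal usage sketch for compute_loglikelihood, e.g. to rank variants of a
# protein by model log-likelihood. The "bio_trans" object and the variant list
# are assumptions for illustration.
#
#     variants = ["MKTVRQERLKSIVRILERSKEPVSGAQ", "MKTVRQERLKSIVAILERSKEPVSGAQ"]
#     loglik = bio_trans.compute_loglikelihood(variants, pass_mode="masked")
#     best_variant = variants[int(loglik.argmax())]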
def compute_accuracy(
    self,
    sequences: Union[List[str], str],
    batch_size: int = 1,
    pass_mode: str = "forward",
    tokens_list: Optional[List[str]] = None,
    silent: bool = False,
) -> float:
    """Compute model accuracy from the input sequences.

    Args:
        sequences: list of sequences or path of fasta file
        batch_size: Batch size. Defaults to 1.
        pass_mode: Mode of model evaluation ('forward' or 'masked').
            Defaults to "forward".
        tokens_list: List of tokens to consider. Defaults to None.
        silent: whether to display the progress bar

    Returns:
        float: model accuracy over the input sequences
    """
    if tokens_list is None:
        tokens_list = NATURAL_AAS_LIST

    if isinstance(sequences, str):
        sequences = load_fasta(sequences)

    _check_sequence(sequences, self.model_dir, 1024)

    inputs, labels, tokens = self._process_sequences_and_tokens(
        sequences, tokens_list)
    logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
    logits, labels = self._filter_logits(logits, labels, tokens)
    accuracy = self._compute_accuracy(logits, labels)

    return accuracy
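# A minimal usage sketch for compute_accuracy, under the same assumed
# "bio_trans" wrapper object; the fasta path is a placeholder.
#
#     acc = bio_trans.compute_accuracy(
#         "proteins.fasta", batch_size=4, pass_mode="masked"
#     )
#     print(f"masked-token accuracy: {acc:.3f}")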
def finetune(
    self,
    train_sequences: Union[List[str], str],
    validation_sequences: Union[List[str], str],
    num_data_workers: int = 4,
    lr: float = 1.0e-5,
    warmup_updates: int = 1024,
    warmup_init_lr: float = 1e-7,
    epochs: int = 10,
    acc_batch_size: int = 50,
    masking_ratio: float = 0.025,
    masking_prob: float = 0.8,
    random_token_prob: float = 0.15,
    toks_per_batch: int = 2048,
    crop_sizes: Tuple[int, int] = (512, 1024),
    accelerator: str = "ddp",
    amp_level: str = "O2",
    precision: int = 16,
    logs_save_dir: str = "logs",
    logs_name_exp: str = "finetune_masked",
    checkpoint: Optional[str] = None,
    save_last_checkpoint: bool = True,
):
    """Finetune a model on a specific dataset.

    This function finetunes the chosen model on a dataset of sequences with
    PyTorch Lightning. You can adjust the masking ratio of amino acids in the
    arguments to improve convergence. Be careful with the accelerator you use:
    the DDP accelerator launches multiple Python processes and should not be
    used in a notebook. More information on GPU/accelerator compatibility is
    available here:

    https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html

    The wisest choice is DDP for multi-GPU training.

    Args:
        train_sequences: list of sequences or path of a fasta file with
            multiple seqRecords
        validation_sequences: list of sequences or path of a fasta file with
            multiple seqRecords
        num_data_workers: number of CPU workers per GPU to load data
        lr: learning rate for the training phase. Defaults to 1.0e-5.
        warmup_updates: number of warmup updates, i.e. steps during which the
            learning rate is increased. Defaults to 1024.
        warmup_init_lr: initial lr for the warmup phase. Defaults to 1e-7.
        epochs: number of epochs for training. Defaults to 10.
        acc_batch_size: number of batches of toks_per_batch tokens to
            accumulate before applying gradients.
        masking_ratio: ratio of tokens to be masked. Defaults to 0.025.
        masking_prob: probability that a chosen token is replaced with a mask
            token. Defaults to 0.8.
        random_token_prob: probability that a chosen token is replaced with a
            random token. Defaults to 0.15.
        toks_per_batch: maximum number of tokens per batch; the number of
            sequences in a batch is computed dynamically from it.
            Defaults to 2048.
        crop_sizes: range of lengths to which sequences are dynamically
            cropped when sampled
        accelerator: type of accelerator for multi-GPU processing
            (DDP recommended)
        amp_level: mixed-precision optimization level. Defaults to 'O2'.
        precision: reducing precision decreases the GPU memory needed.
            Defaults to 16 (float16).
        logs_save_dir: default directory for logs.
        logs_name_exp: name of the experiment in the logs.
        checkpoint: path to a checkpoint file to restore a training session.
        save_last_checkpoint: save the last checkpoint and the 2 best models
            to restore the training session. Takes a large amount of time
            and memory.
""" if isinstance(train_sequences, str): train_sequences = load_fasta(train_sequences) if isinstance(validation_sequences, str): validation_sequences = load_fasta(validation_sequences) if self._language_model.is_msa: raise NotImplementedError("MSA finetunening not implemented.") fit_model = self._language_model.model # type: ignore alphabet = self._language_model.get_alphabet_dataloader() if alphabet.model_dir == "esm1b_t33_650M_UR50S": if crop_sizes[1] > 1024: raise ValueError( "ESM-1b model works only with sequences of maximum " "length 1024. Please, be sure to crop to maximum of " "1024 when using this model.") lightning_model = LightningModule( model=fit_model, alphabet=alphabet, lr=lr, warmup_updates=warmup_updates, warmup_init_lr=warmup_init_lr, warmup_end_lr=lr, ) lightning_data_module = BatchWithConstantNumberTokensDataModule( train_sequences=train_sequences, validation_sequences=validation_sequences, alphabet=alphabet, masking_ratio=masking_ratio, masking_prob=masking_prob, random_token_prob=random_token_prob, num_workers=num_data_workers, toks_per_batch=toks_per_batch, crop_sizes=crop_sizes, ) if self._num_gpus == 0: raise ValueError("You try to train a transformers without GPU.") logger = CSVLogger(logs_save_dir, name=logs_name_exp) checkpoint_callback = None if save_last_checkpoint: checkpoint_callback = [ ModelCheckpoint( save_last=True, save_top_k=2, mode="max", monitor="val_acc", every_n_val_epochs=3, ) ] trainer = Trainer( gpus=self._num_gpus, amp_level=amp_level, precision=precision, accumulate_grad_batches=acc_batch_size, max_epochs=epochs, logger=logger, accelerator=accelerator, replace_sampler_ddp=False, resume_from_checkpoint=checkpoint, callbacks=checkpoint_callback, ) # Launch training trainer.fit(lightning_model, lightning_data_module) # Load new model self._language_model.set_model(lightning_model.model) log.info("Training completed.")
def train_masked(
    self,
    train_sequences: Union[List[str], str],
    lr: float = 1.0e-5,
    warmup_updates: int = 1024,
    warmup_init_lr: float = 1e-7,
    epochs: int = 10,
    batch_size: int = 2,
    acc_batch_size: int = 256,
    masking_ratio: float = 0.025,
    masking_prob: float = 0.8,
    random_token_prob: float = 0.15,
    toks_per_batch: int = 2048,
    filter_len: int = 1024,
    accelerator: str = "ddp",
    amp_level: str = "O2",
    precision: int = 16,
    logs_save_dir: str = "logs",
    logs_name_exp: str = "finetune_masked",
    checkpoint: Optional[str] = None,
    save_last_checkpoint: bool = True,
):
    """Finetune a model on a specific dataset with masked language modeling.

    This function finetunes the chosen model on a dataset of sequences with
    PyTorch Lightning. You can adjust the masking ratio of amino acids in the
    arguments to improve convergence. Be careful with the accelerator you use:
    the DDP accelerator launches multiple Python processes and should not be
    used in a notebook. More information on GPU/accelerator compatibility is
    available here:

    https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html

    The wisest choice is DDP for multi-GPU training.

    Args:
        train_sequences: list of sequences or path of a fasta file with
            multiple seqRecords
        lr: learning rate for the training phase. Defaults to 1.0e-5.
        warmup_updates: number of warmup updates, i.e. steps during which the
            learning rate is increased. Defaults to 1024.
        warmup_init_lr: initial lr for the warmup phase. Defaults to 1e-7.
        epochs: number of epochs for training. Defaults to 10.
        batch_size: number of sequences to consider in a batch. Defaults to 2.
        acc_batch_size: accumulated batch size. Defaults to 256.
        masking_ratio: ratio of tokens to be masked. Defaults to 0.025.
        masking_prob: probability that a chosen token is replaced with a mask
            token. Defaults to 0.8.
        random_token_prob: probability that a chosen token is replaced with a
            random token. Defaults to 0.15.
        toks_per_batch: maximum number of tokens per batch; the number of
            sequences in a batch is computed dynamically from it.
            Defaults to 2048.
        filter_len: maximum sequence length used to filter sequences.
            Defaults to 1024. (NOT USED)
        accelerator: type of accelerator for multi-GPU processing
            (DDP recommended)
        amp_level: mixed-precision optimization level. Defaults to 'O2'.
        precision: reducing precision decreases the GPU memory needed.
            Defaults to 16 (float16).
        logs_save_dir: default directory for logs.
        logs_name_exp: name of the experiment in the logs.
        checkpoint: path to a checkpoint file to restore a training session.
        save_last_checkpoint: save the last checkpoint and the 2 best models
            to restore the training session. Takes a large amount of time
            and memory.
""" if isinstance(train_sequences, str): train_sequences = load_fasta(train_sequences) _check_sequence(train_sequences, self.model_dir, 1024) # noqa: ignore fit_model = self.model.module if self.multi_gpu else self.model # type: ignore alphabet = self._get_alphabet_dataloader() extra_toks_per_seq = int(alphabet.prepend_bos) + int( alphabet.append_eos) lightning_model = LightningModule( model=fit_model, alphabet=alphabet, lr=lr, warmup_updates=warmup_updates, warmup_init_lr=warmup_init_lr, warmup_end_lr=lr, ) data_module = BioDataModule( train_sequences, alphabet, filter_len, batch_size, masking_ratio, masking_prob, random_token_prob, toks_per_batch, extra_toks_per_seq, ) if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: log.warning("You try to train a transformers without GPU.") return logger = CSVLogger(logs_save_dir, name=logs_name_exp) checkpoint_callback = None if save_last_checkpoint: checkpoint_callback = [ ModelCheckpoint( save_last=True, save_top_k=2, mode="max", monitor="val_acc", every_n_val_epochs=3, ) ] trainer = Trainer( gpus=n_gpus, amp_level=amp_level, precision=precision, accumulate_grad_batches=acc_batch_size // batch_size, max_epochs=epochs, logger=logger, accelerator=accelerator, replace_sampler_ddp=False, resume_from_checkpoint=checkpoint, callbacks=checkpoint_callback, ) trainer.fit(lightning_model, data_module) save_path = str(Path(join(logs_save_dir, logs_name_exp)).resolve()) if accelerator == "ddp": rank = os.environ.get("LOCAL_RANK", None) rank = int(rank) if rank is not None else None # type: ignore if rank == 0: self.save_model(save_path, lightning_model) else: self.save_model(save_path, lightning_model) if self.multi_gpu: self.model = DataParallel(lightning_model.model).to(self._device) else: self.model = lightning_model.model.to(self._device) log.info("Training completed.")