Пример #1
0
def run(**kwargs):
    """
    Execute the embedding stage with the embedder selected by `protocol`.

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        protocol: Key into PROTOCOLS that selects the embedder class
        prefix: Required by this stage (validated here, not read directly)
        stage_name: Required by this stage (validated here, not read directly)
        remapped_sequences_file: Required by this stage (validated here, not read directly)
        mapping_file: Required by this stage (validated here, not read directly)
        use_cpu*: Must be absent or None when the selected embedder is UniRepEmbedder
        max_amino_acids*: Defaults to DEFAULT_MAX_AMINO_ACIDS[protocol] when absent
        half_precision*: Forwarded to embed_and_write_batched, defaults to False
        Every name listed in the embedder's `_necessary_files` and
        `_necessary_directories` is optional as well: missing or falsy
        entries are fetched through get_model_file /
        get_model_directories_from_zip.

    Returns
    -------
    Whatever embed_and_write_batched returns for this stage

    Raises
    ------
    InvalidParameterError
        If `protocol` is not a key of PROTOCOLS, or if `use_cpu` is set
        while the selected embedder is UniRepEmbedder.
    """
    required_keys = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_keys)

    protocol = kwargs["protocol"]
    if protocol not in PROTOCOLS:
        message = "Invalid protocol selection: {}. Valid protocols are: {}".format(
            protocol, ", ".join(PROTOCOLS.keys()))
        raise InvalidParameterError(message)

    embedder_class = PROTOCOLS[protocol]

    if embedder_class == UniRepEmbedder:
        if kwargs.get("use_cpu") is not None:
            raise InvalidParameterError(
                "UniRep does not support configuring `use_cpu`")

    # Work on a copy so the caller's arguments stay untouched
    result_kwargs = deepcopy(kwargs)

    # Fetch every model file the caller did not supply
    # noinspection PyProtectedMember
    for file_key in embedder_class._necessary_files:
        if result_kwargs.get(file_key):
            continue
        result_kwargs[file_key] = get_model_file(
            model=embedder_class.name, file=file_key)

    # Same for model directories, which are fetched as zip archives
    # noinspection PyProtectedMember
    for directory_key in embedder_class._necessary_directories:
        if result_kwargs.get(directory_key):
            continue
        result_kwargs[directory_key] = get_model_directories_from_zip(
            model=embedder_class.name, directory=directory_key)

    # The per-protocol default is looked up unconditionally, exactly like
    # the argument of a `setdefault` call would be
    default_max_amino_acids = DEFAULT_MAX_AMINO_ACIDS[protocol]
    if "max_amino_acids" not in result_kwargs:
        result_kwargs["max_amino_acids"] = default_max_amino_acids

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    half_precision = kwargs.get("half_precision", False)
    return embed_and_write_batched(embedder, file_manager, result_kwargs,
                                   half_precision)
Пример #2
0
    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        """
        Store the options and fetch every necessary model file or directory
        that was not passed in.

        `device` is resolved through get_device and kept in `self._device`;
        all remaining keyword arguments are kept in `self._options`. Each name
        in `self.necessary_files` / `self.necessary_directories` whose option
        is missing or falsy is filled in via get_model_file /
        get_model_directories_from_zip. A warning is logged when only some of
        the necessary entries were passed.
        """
        self._options = kwargs
        self._device = get_device(device)

        # Special case because SeqVec can currently be used with either a model directory or two files
        if self.__class__.__name__ == "SeqVecEmbedder":
            # No need to download weights_file/options_file if model_directory is given
            if "model_directory" in self._options:
                return

        # Counts the entries that had to be downloaded (i.e. were NOT passed)
        files_loaded = 0
        for file in self.necessary_files:
            if not self._options.get(file):
                self._options[file] = get_model_file(model=self.name, file=file)
                files_loaded += 1

        for directory in self.necessary_directories:
            if not self._options.get(directory):
                self._options[directory] = get_model_directories_from_zip(
                    model=self.name, directory=directory
                )

                files_loaded += 1

        total_necessary = len(self.necessary_files) + len(self.necessary_directories)
        if 0 < files_loaded < total_necessary:
            # The message talks about what the caller provided, which is the
            # complement of what had to be downloaded
            files_provided = total_necessary - files_loaded
            logger.warning(
                "You should pass either all necessary files or directories, or none, "
                f"while you provide {files_provided} of {total_necessary}"
            )
Пример #3
0
    def __init__(
        self,
        device: Union[None, str, torch.device] = None,
        model_directory: Optional[str] = None,
        half_precision_model: bool = False,
    ):
        """Set up the tokenizer and the masked-LM Bert model.

        The device is resolved through get_device. When `model_directory` is
        empty, the directory is obtained from get_model_directories_from_zip
        for ProtTransBertBFDEmbedder. With `half_precision_model` set, the
        model is converted with `.half()` before it is put into eval mode and
        moved to the device.
        """
        self.device = get_device(device)
        self._half_precision_model = half_precision_model

        # Fall back to the downloadable weights when no directory was passed
        weights_directory = model_directory or get_model_directories_from_zip(
            model=ProtTransBertBFDEmbedder.name,
            directory="model_directory",
        )

        self.tokenizer = BertTokenizer.from_pretrained(
            weights_directory, do_lower_case=False
        )

        bert_model = BertForMaskedLM.from_pretrained(weights_directory)
        if self._half_precision_model:
            # Half precision is a lot faster and needs only half the memory
            bert_model = bert_model.half()
        self.model = bert_model.eval().to(self.device)
Пример #4
0
def run(**kwargs):
    """
    Execute the embedding stage for the embedder chosen by prepare_kwargs.

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        All arguments are handed to prepare_kwargs, which yields the embedder
        class and the working copy of the arguments, and to get_file_manager.
        half_precision*: Forwarded to embed_and_write_batched, defaults to False
        Every name listed in the embedder's `necessary_files` and
        `necessary_directories` is optional: missing or falsy entries are
        fetched through get_model_file / get_model_directories_from_zip.

    Returns
    -------
    Whatever embed_and_write_batched returns for this stage
    """
    embedder_class, result_kwargs = prepare_kwargs(**kwargs)

    # Fetch every model file the caller did not supply
    for file_key in embedder_class.necessary_files:
        if result_kwargs.get(file_key):
            continue
        result_kwargs[file_key] = get_model_file(
            model=embedder_class.name, file=file_key)

    # Same for model directories, which are fetched as zip archives
    for directory_key in embedder_class.necessary_directories:
        if result_kwargs.get(directory_key):
            continue
        result_kwargs[directory_key] = get_model_directories_from_zip(
            model=embedder_class.name, directory=directory_key)

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    half_precision = kwargs.get("half_precision", False)
    return embed_and_write_batched(embedder, file_manager, result_kwargs,
                                   half_precision)
Пример #5
0
    def __init__(self, **kwargs):
        """
        Load the metal, nucleic acid and small molecule annotations.

        All keyword arguments are stored in `self._options`. Every entry of
        `self.necessary_directories` that is missing or falsy is fetched via
        get_model_directories_from_zip. The three annotation FASTA files are
        then read from `annotations_directory` and converted with
        `convert_annotations_to_list`.
        """

        self._options = kwargs

        # Fetch the annotation directories that were not passed in
        for directory_key in self.necessary_directories:
            if self._options.get(directory_key):
                continue
            self._options[directory_key] = get_model_directories_from_zip(
                model="bindembed21hbi", directory=directory_key
            )

        annotations_root = Path(self._options['annotations_directory'])

        def load_fasta(file_name):
            # Parse one FASTA file of the annotations directory into a dict
            return SeqIO.to_dict(SeqIO.parse(str(annotations_root / file_name), 'fasta'))

        metal_records = load_fasta('annotations_metal.fasta')
        nuc_records = load_fasta('annotations_nuc.fasta')
        small_records = load_fasta('annotations_small.fasta')

        self.metal_annotations = self.convert_annotations_to_list(metal_records)
        self.nuc_annotations = self.convert_annotations_to_list(nuc_records)
        self.small_annotations = self.convert_annotations_to_list(small_records)
Пример #6
0
def bindembed21(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_results_file" if possible, and from "embeddings_file", otherwise.

    For every sequence in "remapped_sequences_file" the best alignment hit
    (E-value < 1E-3, lowest E-value, then highest "fident") is handed to the
    HBI extractor. If that inference contains at least one binding residue it
    is written out; if it does not, or if the query has no usable hit at all,
    the DL extractor is run on the embedding stored under the sequence id.

    :param kwargs: must contain 'alignment_results_file', 'embeddings_file', 'mapping_file' and
        'remapped_sequences_file'; entries named in the extractors' necessary files/directories are
        downloaded when missing or falsy
    :return: a deep copy of kwargs extended with the downloaded resources and the output paths
        'metal_binding_predictions_file', 'nucleic_acid_binding_predictions_file' and
        'small_molecule_binding_predictions_file' ('binding_residue_predictions_file' is kept as a
        legacy alias of the small molecule file)
    """

    check_required(kwargs, ['alignment_results_file', 'embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    # for HBI
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi", directory=directory)
    # for DL
    for file in BindEmbed21DLAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model="bindembed21dl", file=file)

    hbi_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)
    dl_extractor = BindEmbed21DLAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later)
    prefix = result_kwargs.get('prefix')
    stage_name = result_kwargs.get('stage_name')

    metal_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                   'metal_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['metal_binding_predictions_file'] = metal_binding_predictions_file_path

    nuc_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                 'nucleic_acid_binding_predictions_file',
                                                                 extension='.fasta')
    # Each output gets its own key; previously this path was overwritten by the small molecule path
    result_kwargs['nucleic_acid_binding_predictions_file'] = nuc_binding_predictions_file_path

    small_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                   'small_molecule_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['small_molecule_binding_predictions_file'] = small_binding_predictions_file_path
    # Legacy key: it used to end up pointing at the small molecule file, keep that for existing consumers
    result_kwargs['binding_residue_predictions_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            # get HBI hit for this query; exact comparison, because a regex match would also
            # select every query id that merely starts with this id
            hits = alignment_results[alignment_results['query'] == str(protein_sequence.id)]

            # Empty strings contain no binding label, so they route the sequence to the DL method
            metal_inference = nuc_inference = small_inference = ''

            if not hits.empty:
                hits_min_eval = hits[hits['eval'] == hits['eval'].min()]
                hit_max_pide = hits_min_eval[hits_min_eval['fident'] == hits_min_eval['fident'].max()]

                # Can only be empty if 'fident' holds no comparable value; treat that like "no hit"
                if not hit_max_pide.empty:
                    hbi_annotations = hbi_extractor.get_binding_residues(hit_max_pide.iloc[0].to_dict())
                    metal_inference = convert_list_of_enum_to_string(hbi_annotations.metal_ion)
                    nuc_inference = convert_list_of_enum_to_string(hbi_annotations.nucleic_acids)
                    small_inference = convert_list_of_enum_to_string(hbi_annotations.small_molecules)

            # some part of the sequence was predicted using HBI --> save output and don't run DL method
            if 'M' in metal_inference or 'N' in nuc_inference or 'S' in small_inference:
                metal_string = metal_inference
                nuc_string = nuc_inference
                small_string = small_inference
            # no inference containing binding annotations was made --> run bindEmbed21DL
            else:
                embedding = np.array(embedding_file[protein_sequence.id])
                annotations = dl_extractor.get_binding_residues(embedding)
                metal_string = convert_list_of_enum_to_string(annotations.metal_ion)
                nuc_string = convert_list_of_enum_to_string(annotations.nucleic_acids)
                small_string = convert_list_of_enum_to_string(annotations.small_molecules)

            # One independent record copy per output file, carrying the annotation string as sequence
            for collected, annotation_string in ((metal_sequences, metal_string),
                                                 (nuc_sequences, nuc_string),
                                                 (small_sequences, small_string)):
                annotated_sequence = deepcopy(protein_sequence)
                annotated_sequence.seq = Seq(annotation_string)
                collected.append(annotated_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs
Пример #7
0
def bindembed21hbi(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_results_file".

    For every sequence in "remapped_sequences_file" the best alignment hit
    (E-value < 1E-3, lowest E-value, then highest "fident") is handed to the
    HBI extractor, and the resulting annotations are written to one FASTA
    file per ligand class.

    :param kwargs: must contain 'alignment_results_file', 'mapping_file' and 'remapped_sequences_file';
        entries named in the extractor's necessary directories are downloaded when missing or falsy
    :return: a deep copy of kwargs extended with the downloaded resources and the output paths
        'metal_binding_inference_file', 'nucleic_acid_binding_inference_file' and
        'small_molecule_binding_inference_file' ('binding_residue_inference_file' is kept as a
        legacy alias of the small molecule file)
    :raises ValueError: if a sequence has no alignment hit with an E-value below 1E-3
    """

    check_required(kwargs, ['alignment_results_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi", directory=directory)

    annotation_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later)
    prefix = result_kwargs.get('prefix')
    stage_name = result_kwargs.get('stage_name')

    metal_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                   'metal_binding_inference_file',
                                                                   extension='.fasta')
    result_kwargs['metal_binding_inference_file'] = metal_binding_predictions_file_path

    nuc_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                 'nucleic_acid_binding_inference_file',
                                                                 extension='.fasta')
    # Each output gets its own key; previously this path was overwritten by the small molecule path
    result_kwargs['nucleic_acid_binding_inference_file'] = nuc_binding_predictions_file_path

    small_binding_predictions_file_path = file_manager.create_file(prefix, stage_name,
                                                                   'small_molecule_binding_inference_file',
                                                                   extension='.fasta')
    result_kwargs['small_molecule_binding_inference_file'] = small_binding_predictions_file_path
    # Legacy key: it used to end up pointing at the small molecule file, keep that for existing consumers
    result_kwargs['binding_residue_inference_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
        query_id = str(protein_sequence.id)

        # get hits for this query; exact comparison, because a regex match would also
        # select every query id that merely starts with this id
        hits = alignment_results[alignment_results['query'] == query_id]
        if hits.empty:
            # Same exception type the empty min() used to raise, but with an actionable message
            raise ValueError(f"No alignment hit with E-value < 1E-3 found for query '{query_id}'")

        # get hits with minimal E-value
        hits_min_eval = hits[hits['eval'] == hits['eval'].min()]
        # get hit with maximal PIDE
        hit_max_pide = hits_min_eval[hits_min_eval['fident'] == hits_min_eval['fident'].max()]

        annotations = annotation_extractor.get_binding_residues(hit_max_pide.iloc[0].to_dict())

        # One independent record copy per output file, carrying the annotation string as sequence
        for collected, ligand_annotations in ((metal_sequences, annotations.metal_ion),
                                              (nuc_sequences, annotations.nucleic_acids),
                                              (small_sequences, annotations.small_molecules)):
            annotated_sequence = deepcopy(protein_sequence)
            annotated_sequence.seq = Seq(convert_list_of_enum_to_string(ligand_annotations))
            collected.append(annotated_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs