def run(**kwargs):
    """
    Run embedding protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        sequences_file: Where sequences live
        prefix: Output prefix for all generated files
        protocol: Which embedder to use
        mapping_file: the mapping file generated by the pipeline when remapping indexes
        stage_name: The stage name

    Returns
    -------
    Dictionary with results of stage
    """
    required_arguments = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_arguments)

    protocol = kwargs["protocol"]
    if protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                protocol, ", ".join(PROTOCOLS.keys())
            )
        )

    embedder_class = PROTOCOLS[protocol]

    # UniRep handles device placement on its own, so the flag must not be set.
    if embedder_class == UniRepEmbedder and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError(
            "UniRep does not support configuring `use_cpu`")

    result_kwargs = deepcopy(kwargs)

    # Fetch any model files/directories the caller did not supply.
    # noinspection PyProtectedMember
    for needed_file in embedder_class._necessary_files:
        if not result_kwargs.get(needed_file):
            result_kwargs[needed_file] = get_model_file(
                model=embedder_class.name, file=needed_file
            )
    # noinspection PyProtectedMember
    for needed_directory in embedder_class._necessary_directories:
        if not result_kwargs.get(needed_directory):
            result_kwargs[needed_directory] = get_model_directories_from_zip(
                model=embedder_class.name, directory=needed_directory
            )

    result_kwargs.setdefault("max_amino_acids", DEFAULT_MAX_AMINO_ACIDS[protocol])

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)
    return embed_and_write_batched(
        embedder, file_manager, result_kwargs, kwargs.get("half_precision", False)
    )
def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
    """
    Initializer accepts location of a pre-trained model and options
    """
    self._options = kwargs
    self._device = get_device(device)

    # SeqVec is special-cased: it can be built from a single model directory
    # instead of individual weight/option files, so when a directory was
    # passed there is nothing to download.
    if self.__class__.__name__ == "SeqVecEmbedder" and "model_directory" in self._options:
        return

    downloaded = 0
    for needed_file in self.necessary_files:
        if not self._options.get(needed_file):
            self._options[needed_file] = get_model_file(model=self.name, file=needed_file)
            downloaded += 1
    for needed_directory in self.necessary_directories:
        if not self._options.get(needed_directory):
            self._options[needed_directory] = get_model_directories_from_zip(
                model=self.name, directory=needed_directory
            )
            downloaded += 1

    expected = len(self.necessary_files) + len(self.necessary_directories)
    # A partial set of user-supplied assets is suspicious: warn so a mix of
    # downloaded and provided files does not go unnoticed.
    if 0 < downloaded < expected:
        logger.warning(
            f"You should pass either all necessary files or directories, or none, "
            f"while you provide {downloaded} of {expected}"
        )
def __init__(
    self,
    device: Union[None, str, torch.device] = None,
    model_directory: Optional[str] = None,
    half_precision_model: bool = False,
):
    """Loads the Bert Model for Masked LM"""
    self.device = get_device(device)
    self._half_precision_model = half_precision_model

    # Fall back to downloading the model when no directory was provided.
    directory = model_directory or get_model_directories_from_zip(
        model=ProtTransBertBFDEmbedder.name, directory="model_directory"
    )

    self.tokenizer = BertTokenizer.from_pretrained(directory, do_lower_case=False)
    model = BertForMaskedLM.from_pretrained(directory)
    # Compute in half precision, which is a lot faster and saves us half the memory
    if self._half_precision_model:
        model = model.half()
    self.model = model.eval().to(self.device)
def run(**kwargs):
    """
    Run embedding protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        sequences_file: Where sequences live
        prefix: Output prefix for all generated files
        protocol: Which embedder to use
        mapping_file: the mapping file generated by the pipeline when remapping indexes
        stage_name: The stage name

    Returns
    -------
    Dictionary with results of stage
    """
    embedder_class, result_kwargs = prepare_kwargs(**kwargs)

    # Fetch model assets that the caller did not supply.
    for required_file in embedder_class.necessary_files:
        if not result_kwargs.get(required_file):
            result_kwargs[required_file] = get_model_file(
                model=embedder_class.name, file=required_file
            )
    for required_directory in embedder_class.necessary_directories:
        if not result_kwargs.get(required_directory):
            result_kwargs[required_directory] = get_model_directories_from_zip(
                model=embedder_class.name, directory=required_directory
            )

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    half_precision = kwargs.get("half_precision", False)
    return embed_and_write_batched(embedder, file_manager, result_kwargs, half_precision)
def __init__(self, **kwargs):
    """
    Initialize annotation extractor. Must define non-positional arguments for paths of files.
    """
    self._options = kwargs

    # Download the annotation files if needed
    for directory in self.necessary_directories:
        if not self._options.get(directory):
            # Plain string literal (was an f-string without placeholders).
            self._options[directory] = get_model_directories_from_zip(
                model="bindembed21hbi", directory=directory
            )

    annotations_directory = Path(self._options['annotations_directory'])

    def _load_annotations(category: str):
        # Parse one per-category annotation FASTA and convert the records to
        # the internal list representation.
        file_path = annotations_directory / f'annotations_{category}.fasta'
        records = SeqIO.to_dict(SeqIO.parse(str(file_path), 'fasta'))
        return self.convert_annotations_to_list(records)

    self.metal_annotations = _load_annotations('metal')
    self.nuc_annotations = _load_annotations('nuc')
    self.small_annotations = _load_annotations('small')
def bindembed21(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_result_file" if possible,
    and from "embeddings_file", otherwise.

    :param kwargs: requires 'alignment_results_file', 'embeddings_file',
        'mapping_file' and 'remapped_sequences_file'
    :return: result_kwargs, extended with the paths of the three written
        prediction FASTA files
    """
    check_required(kwargs, ['alignment_results_file', 'embeddings_file', 'mapping_file',
                            'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    # for HBI
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi",
                                                                      directory=directory)
    # for DL
    for file in BindEmbed21DLAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model="bindembed21dl", file=file)

    hbi_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)
    dl_extractor = BindEmbed21DLAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later)
    metal_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'metal_binding_predictions_file', extension='.fasta')
    result_kwargs['metal_binding_predictions_file'] = metal_binding_predictions_file_path

    nuc_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'nucleic_acid_binding_predictions_file', extension='.fasta')
    # BUGFIX: each category gets its own result key; previously both the nucleic
    # acid and small molecule paths were stored under
    # 'binding_residue_predictions_file', so one overwrote the other.
    result_kwargs['nucleic_acid_binding_predictions_file'] = nuc_binding_predictions_file_path

    small_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'small_molecule_binding_predictions_file', extension='.fasta')
    result_kwargs['small_molecule_binding_predictions_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    # Only significant alignments may be used for homology-based inference.
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            metal_sequence = deepcopy(protein_sequence)
            nuc_sequence = deepcopy(protein_sequence)
            small_sequence = deepcopy(protein_sequence)

            # get HBI hit for this query
            hits = alignment_results[
                alignment_results['query'].str.match(str(protein_sequence.id))].copy()

            hbi_inference_used = False
            # BUGFIX: a query without any significant alignment hit used to crash
            # on min() of an empty sequence; it now falls through to the DL model.
            if not hits.empty:
                hits_min_eval = hits[hits['eval'] == min(hits['eval'])]
                hit_max_pide = hits_min_eval[
                    hits_min_eval['fident'] == max(hits_min_eval['fident'])]

                hbi_annotations = hbi_extractor.get_binding_residues(
                    hit_max_pide.iloc[0].to_dict())
                metal_inference = convert_list_of_enum_to_string(hbi_annotations.metal_ion)
                nuc_inference = convert_list_of_enum_to_string(hbi_annotations.nucleic_acids)
                small_inference = convert_list_of_enum_to_string(hbi_annotations.small_molecules)

                # some part of the sequence was predicted using HBI --> save output
                # and don't run DL method
                if 'M' in metal_inference or 'N' in nuc_inference or 'S' in small_inference:
                    metal_sequence.seq = Seq(metal_inference)
                    nuc_sequence.seq = Seq(nuc_inference)
                    small_sequence.seq = Seq(small_inference)
                    hbi_inference_used = True

            # no inference containing binding annotations was made --> run bindEmbed21DL
            if not hbi_inference_used:
                embedding = np.array(embedding_file[protein_sequence.id])
                annotations = dl_extractor.get_binding_residues(embedding)
                metal_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.metal_ion))
                nuc_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.nucleic_acids))
                small_sequence.seq = Seq(
                    convert_list_of_enum_to_string(annotations.small_molecules))

            metal_sequences.append(metal_sequence)
            nuc_sequences.append(nuc_sequence)
            small_sequences.append(small_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs
def bindembed21hbi(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_results_file".

    :param kwargs: requires 'alignment_results_file', 'mapping_file' and
        'remapped_sequences_file'
    :return: result_kwargs, extended with the paths of the three written
        inference FASTA files
    """
    check_required(kwargs, ['alignment_results_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi",
                                                                      directory=directory)

    annotation_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later)
    metal_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'metal_binding_inference_file', extension='.fasta')
    result_kwargs['metal_binding_inference_file'] = metal_binding_predictions_file_path

    nuc_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'nucleic_acid_binding_inference_file', extension='.fasta')
    # BUGFIX: each category gets its own result key; previously both the nucleic
    # acid and small molecule paths were stored under
    # 'binding_residue_inference_file', so one overwrote the other.
    result_kwargs['nucleic_acid_binding_inference_file'] = nuc_binding_predictions_file_path

    small_binding_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'), result_kwargs.get('stage_name'),
        'small_molecule_binding_inference_file', extension='.fasta')
    result_kwargs['small_molecule_binding_inference_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    # Only significant alignments may be used for homology-based inference.
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
        # get hits for this query
        hits = alignment_results[
            alignment_results['query'].str.match(str(protein_sequence.id))].copy()
        # NOTE(review): a query without any significant hit makes min() below raise
        # on an empty sequence — confirm whether callers guarantee at least one hit
        # per sequence, or whether hit-less queries should be skipped here.
        # get hits with minimal E-value
        hits_min_eval = hits[hits['eval'] == min(hits['eval'])]
        # get hit with maximal PIDE
        hit_max_pide = hits_min_eval[hits_min_eval['fident'] == max(hits_min_eval['fident'])]

        annotations = annotation_extractor.get_binding_residues(hit_max_pide.iloc[0].to_dict())

        metal_sequence = deepcopy(protein_sequence)
        nuc_sequence = deepcopy(protein_sequence)
        small_sequence = deepcopy(protein_sequence)
        metal_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.metal_ion))
        nuc_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.nucleic_acids))
        small_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.small_molecules))

        metal_sequences.append(metal_sequence)
        nuc_sequences.append(nuc_sequence)
        small_sequences.append(small_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs