def testSequenceToOneHotUnknownMappingError(self, seq):
    """Expects ``sequence_to_onehot`` to reject ``seq`` with a ValueError.

    The conversion is attempted against ``restype_order_with_x`` with
    ``map_unknown_to_x=True``; the test passes only if that call raises
    ``ValueError``.

    Args:
        seq: Sequence string under test. Presumably supplied by a
            parameterized-test decorator that is not visible here
            (TODO confirm).
    """
    self.assertRaises(
        ValueError,
        residue_constants.sequence_to_onehot,
        sequence=seq,
        mapping=residue_constants.restype_order_with_x,
        map_unknown_to_x=True,
    )
def testSequenceToOneHotHHBlits(self):
    """Checks the one-hot encoding of 'A'-'Z' plus '-' under HHBLITS_AA_TO_ID.

    The expected matrix has one row per input character and 22 columns.
    Several characters are expected to share a column (e.g. 'B' and 'D',
    'C' and 'U', 'E' and 'Z', and 'J'/'O'/'X'), while '-' gets the last one.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ-'
    one_hot = residue_constants.sequence_to_onehot(
        alphabet, residue_constants.HHBLITS_AA_TO_ID)

    # Expected hot column for each character of `alphabet`, in order.
    hot_columns = [
        0, 2, 1, 2, 3, 4, 5, 6, 7,  # A B C D E F G H I
        20, 8, 9, 10, 11, 20,       # J K L M N O
        12, 13, 14, 15, 16,         # P Q R S T
        1, 17, 18, 20, 19, 3,       # U V W X Y Z
        21,                         # -
    ]
    exp_one_hot = np.zeros((len(alphabet), 22), dtype=int)
    exp_one_hot[np.arange(len(alphabet)), hot_columns] = 1

    np.testing.assert_array_equal(one_hot, exp_one_hot)
def testSequenceToOneHotUnknownMapping(self):
    """Checks the one-hot encoding of 'A'-'Z' with ``map_unknown_to_x=True``.

    Each letter is expected to light up exactly one of 21 columns when
    encoded against ``restype_order_with_x``. The letters B, J, O, U, X and
    Z are all expected at column 20 (presumably the 'X' slot - TODO confirm
    against ``restype_order_with_x``).

    The comparison uses ``np.testing.assert_array_equal``, like the other
    one-hot tests here, so a failure reports shape and element mismatches
    instead of a bare ``False is not true``.
    """
    seq = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Expected hot column for each letter of `seq`, in order.
    expected_columns = [
        0, 20, 4, 3, 6, 13, 7, 8, 9, 20, 11, 10, 12, 2, 20, 14, 5, 1, 15, 16,
        20, 19, 17, 20, 18, 20
    ]
    expected_out = np.zeros([len(seq), 21])
    expected_out[np.arange(len(seq)), expected_columns] = 1

    aa_types = residue_constants.sequence_to_onehot(
        sequence=seq,
        mapping=residue_constants.restype_order_with_x,
        map_unknown_to_x=True)

    np.testing.assert_array_equal(aa_types, expected_out)
def make_sequence_features(sequence: str, description: str,
                           num_res: int) -> FeatureDict:
    """Builds the per-sequence entries of a feature dictionary.

    Args:
        sequence: Sequence string; one-hot encoded against
            ``restype_order_with_x`` with unknown residues mapped to X, and
            also stored UTF-8 encoded.
        description: Free-text description, stored UTF-8 encoded under
            ``'domain_name'``.
        num_res: Number of residues; sets the length of the per-residue
            arrays. It is not checked against ``len(sequence)`` here.

    Returns:
        A dict with the keys ``'aatype'``, ``'between_segment_residues'``
        (int32 zeros), ``'domain_name'``, ``'residue_index'`` (int32
        ``0..num_res-1``), ``'seq_length'`` (int32, ``num_res`` repeated
        ``num_res`` times) and ``'sequence'``.
    """
    one_hot_aatype = residue_constants.sequence_to_onehot(
        sequence=sequence,
        mapping=residue_constants.restype_order_with_x,
        map_unknown_to_x=True,
    )
    return {
        'aatype': one_hot_aatype,
        'between_segment_residues': np.zeros((num_res,), dtype=np.int32),
        'domain_name': np.array(
            [description.encode('utf-8')], dtype=np.object_),
        'residue_index': np.arange(num_res, dtype=np.int32),
        'seq_length': np.full(num_res, num_res, dtype=np.int32),
        'sequence': np.array([sequence.encode('utf-8')], dtype=np.object_),
    }
def _extract_template_features(
        mmcif_object: mmcif_parsing.MmcifObject,
        pdb_id: str,
        mapping: Mapping[int, int],
        template_sequence: str,
        query_sequence: str,
        template_chain_id: str,
        kalign_binary_path: str) -> Tuple[Dict[str, Any], Optional[str]]:
    """Parses atom positions in the target structure and aligns with the query.

    Atoms for each residue in the template structure are indexed to coincide
    with their corresponding residue in the query sequence, according to the
    alignment mapping provided.

    Args:
        mmcif_object: mmcif_parsing.MmcifObject representing the template.
        pdb_id: PDB code for the template.
        mapping: Dictionary mapping indices in the query sequence to indices
            in the template sequence.
        template_sequence: String describing the amino acid sequence for the
            template protein.
        query_sequence: String describing the amino acid sequence for the
            query protein.
        template_chain_id: String ID describing which chain in the structure
            proto should be used.
        kalign_binary_path: The path to a kalign executable used for template
            realignment.

    Returns:
        A tuple with:
        * A dictionary containing the extra features derived from the
          template protein structure.
        * A warning message if the hit was realigned to the actual mmCIF
          sequence. Otherwise None.

    Raises:
        NoChainsError: If the mmcif object doesn't contain any chains.
        SequenceNotInTemplateError: If the given chain id / sequence can't
            be found in the mmcif object.
        QueryToTemplateAlignError: If the actual template in the mmCIF file
            can't be aligned to the query.
        NoAtomDataInTemplateError: If the mmcif object doesn't contain atom
            positions.
        TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
            unmasked residues. This includes the case of an empty
            ``mapping``, where no template residue is copied at all.
    """
    if mmcif_object is None or not mmcif_object.chain_to_seqres:
        raise NoChainsError(
            'No chains in PDB: %s_%s' % (pdb_id, template_chain_id))

    warning = None
    try:
        seqres, chain_id, mapping_offset = _find_template_in_pdb(
            template_chain_id=template_chain_id,
            template_sequence=template_sequence,
            mmcif_object=mmcif_object)
    except SequenceNotInTemplateError:
        # If PDB70 contains a different version of the template, we use the
        # sequence from the mmcif_object.
        chain_id = template_chain_id
        warning = (
            f'The exact sequence {template_sequence} was not found in '
            f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.'
        )
        logger.warning(warning)
        # This throws an exception if it fails to realign the hit.
        seqres, mapping = _realign_pdb_template_to_query(
            old_template_sequence=template_sequence,
            template_chain_id=template_chain_id,
            mmcif_object=mmcif_object,
            old_mapping=mapping,
            kalign_binary_path=kalign_binary_path)
        logger.info('Sequence in %s_%s: %s successfully realigned to %s',
                    pdb_id, chain_id, template_sequence, seqres)
        # The template sequence changed.
        template_sequence = seqres
        # No mapping offset, the query is aligned to the actual sequence.
        mapping_offset = 0

    try:
        # Essentially set to infinity - we don't want to reject templates
        # unless they're really really bad.
        all_atom_positions, all_atom_mask = _get_atom_positions(
            mmcif_object, chain_id, max_ca_ca_distance=150.0)
    except (CaDistanceError, KeyError) as ex:
        raise NoAtomDataInTemplateError(
            'Could not get atom data (%s_%s): %s'
            % (pdb_id, chain_id, str(ex))) from ex

    # Split along the first axis so each template residue can be picked out
    # by index below; every piece keeps a leading axis of length 1.
    all_atom_positions = np.split(
        all_atom_positions, all_atom_positions.shape[0])
    all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])

    output_templates_sequence = []
    templates_all_atom_positions = []
    templates_all_atom_masks = []

    for _ in query_sequence:
        # Residues in the query_sequence that are not in the
        # template_sequence:
        templates_all_atom_positions.append(
            np.zeros((residue_constants.atom_type_num, 3)))
        templates_all_atom_masks.append(
            np.zeros(residue_constants.atom_type_num))
        output_templates_sequence.append('-')

    # Overwrite the placeholders for every query position that has an
    # aligned template residue.
    for k, v in mapping.items():
        template_index = v + mapping_offset
        templates_all_atom_positions[k] = all_atom_positions[template_index][0]
        templates_all_atom_masks[k] = all_atom_masks[template_index][0]
        output_templates_sequence[k] = template_sequence[v]

    # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N,
    # O).
    if np.sum(templates_all_atom_masks) < 5:
        # An empty mapping always lands here (the mask sum is 0). Calling
        # min()/max() on it would raise a bare ValueError and hide the
        # documented error type, so describe the range without them.
        if mapping:
            template_indices = [v + mapping_offset for v in mapping.values()]
            residue_range = '%d-%d' % (
                min(template_indices), max(template_indices))
        else:
            residue_range = 'empty'
        raise TemplateAtomMaskAllZerosError(
            'Template all atom mask was all zeros: %s_%s. Residue range: %s'
            % (pdb_id, chain_id, residue_range))

    output_templates_sequence = ''.join(output_templates_sequence)

    templates_aatype = residue_constants.sequence_to_onehot(
        output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID)

    return ({
        'template_all_atom_positions': np.array(templates_all_atom_positions),
        'template_all_atom_masks': np.array(templates_all_atom_masks),
        'template_sequence': output_templates_sequence.encode(),
        'template_aatype': np.array(templates_aatype),
        'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(),
    }, warning)
def testSequenceToOneHotStandard(self):
    """Checks that 'ARNDCQEGHILKMFPSTWYV' one-hot encodes to a 20x20 identity.

    The sequence is encoded against ``restype_order``; the i-th letter is
    expected to map to column i.
    """
    standard_residues = 'ARNDCQEGHILKMFPSTWYV'
    encoded = residue_constants.sequence_to_onehot(
        standard_residues, residue_constants.restype_order)
    expected = np.eye(len(standard_residues))
    np.testing.assert_array_equal(encoded, expected)