def load_gsd_topology(filename, frame=0):
    """Create an MDTraj.Topology from one frame of a GSD file.

    Parameters
    ----------
    filename : path-like
        Path of GSD trajectory file.
    frame : int, default=0
        Index of the GSD frame whose particles and bonds define the topology.

    Returns
    -------
    top : mdtraj.Topology
        A topology with a single chain holding a single residue named 'A'.
        Each particle becomes an atom named after its particle type (with the
        ``virtual_site`` element) and each entry of the frame's bond group
        becomes a bond.

    Notes
    -----
    GSD files support systems with variable topologies. For compatibility
    with MDTraj, only the topology of the single requested frame (frame 0 by
    default) is used to construct the MDTraj topology.
    """
    import gsd.hoomd

    with gsd.hoomd.open(filename, 'rb') as gsdfile:
        # Index the file once instead of once per attribute access.
        snapshot = gsdfile[frame]

        top = Topology()
        generic_chain = top.add_chain()
        generic_residue = top.add_residue('A', generic_chain)

        all_particle_types = snapshot.particles.types
        for particle_type_id in snapshot.particles.typeid:
            top.add_atom(all_particle_types[particle_type_id], virtual_site,
                         generic_residue)

        for bond in snapshot.bonds.group:
            atom1, atom2 = bond[0], bond[1]
            top.add_bond(top.atom(atom1), top.atom(atom2))

    return top
def __init__(self, topology):
    r"""Set up the C-alpha coarse-grained mapping.

    Every residue of the all-atom topology is represented by one bead named
    'CA' (carbon element). Consecutive beads are bonded when they belong to
    the same chain.

    Parameters
    ----------
    topology : mdtraj.Topology object
        All-atom topology. It must contain as many atoms named "CA" as it
        has residues; a copy is kept in ``self._ref_topology``.

    Notes
    -----
    Sets ``self.topology`` (the coarse-grained topology) and
    ``self._ca_idxs``, an array of ``[all-atom CA index, bead index]`` rows.
    """
    ca_count = sum(1 for atm in topology.atoms if atm.name == "CA")
    assert (
        ca_count == topology.n_residues
    ), " number of C-alpha is not equal to number of residues! check for missing or non-standard amino acids."

    self._ref_topology = topology.copy()

    # Assemble the coarse-grained topology bead by bead.
    cg_topology = Topology()
    last_bead = None
    index_pairs = []
    bead_idx = 0
    for src_chain in topology._chains:
        cg_chain = cg_topology.add_chain()
        for src_residue in src_chain._residues:
            res_seq = getattr(src_residue, 'resSeq', None) or src_residue.index
            cg_residue = cg_topology.add_residue(src_residue.name, cg_chain,
                                                 res_seq)

            bead = cg_topology.add_atom(
                'CA', md.core.element.get_by_symbol('C'), cg_residue,
                serial=bead_idx)

            # Pair the all-atom CA index with the index of its bead.
            ca_in_residue = [atm.index for atm in src_residue.atoms
                             if atm.name == "CA"]
            index_pairs.append([ca_in_residue[0], bead_idx])

            # Only bond beads that sit in the same chain.
            if (last_bead is not None and
                    last_bead.residue.chain.index == bead.residue.chain.index):
                cg_topology.add_bond(last_bead, bead)
            last_bead = bead
            bead_idx += 1

    self._ca_idxs = np.array(index_pairs)
    self.topology = cg_topology
def topology(self):
    """Rebuild the topology stored in the file.

    The 'topology' node under the file root holds a JSON document with
    'chains' (each with 'residues', each with 'atoms') and 'bonds' (pairs of
    atom indices). Chains, residues and atoms are added in order of their
    'index' entries.

    Returns
    -------
    topology : mdtraj.Topology or None
        The reconstructed topology, or None when the file has no 'topology'
        node.
    """
    by_index = operator.itemgetter('index')

    try:
        payload = self._get_node(self._handle.root, name='topology')[0]
        if not isinstance(payload, string_types):
            payload = payload.decode()
        parsed = json.loads(payload)
    except self.tables.NoSuchNodeError:
        return None

    result = Topology()

    for chain_entry in sorted(parsed['chains'], key=by_index):
        new_chain = result.add_chain()
        for residue_entry in sorted(chain_entry['residues'], key=by_index):
            if "resSeq" in residue_entry:
                res_seq = residue_entry["resSeq"]
            else:
                res_seq = None
                warnings.warn(
                    'No resSeq information found in HDF file, defaulting to zero-based indices'
                )
            # Files without segment information get an empty segment id.
            seg_id = residue_entry.get("segmentID", "")
            new_residue = result.add_residue(residue_entry['name'], new_chain,
                                             resSeq=res_seq,
                                             segment_id=seg_id)
            for atom_entry in sorted(residue_entry['atoms'], key=by_index):
                try:
                    element = elem.get_by_symbol(atom_entry['element'])
                except KeyError:
                    # Missing or unknown element symbol.
                    element = elem.virtual
                result.add_atom(atom_entry['name'], element, new_residue)

    atom_list = list(result.atoms)
    for first, second in parsed['bonds']:
        result.add_bond(atom_list[first], atom_list[second])

    return result
def create_water_topology_on_disc(n):
    """Write a PDB file with ``n`` water-like residues and return its path.

    Each residue ``r<i>`` holds an H, an O and a second H atom, with both
    hydrogens bonded to the oxygen. All coordinates are zero.

    Parameters
    ----------
    n : int
        Number of residues to create.

    Returns
    -------
    topfile : str
        Path of the temporary '.pdb' file. The file is not deleted here.
    """
    top = Topology()
    chain = top.add_chain()

    for i in range(n):
        res = top.add_residue('r%i' % i, chain)
        h1 = top.add_atom('H', hydrogen, res)
        o = top.add_atom('O', oxygen, res)
        h2 = top.add_atom('H', hydrogen, res)
        top.add_bond(h1, o)
        top.add_bond(h2, o)

    xyz = np.zeros((n * 3, 3))

    # tempfile.mktemp only hands out a name, which is deprecated and racy.
    # Reserve the file on disk and let save_pdb overwrite it
    # (assumes save_pdb overwrites existing files by default -- TODO confirm).
    with tempfile.NamedTemporaryFile(suffix='.pdb', delete=False) as handle:
        topfile = handle.name

    Trajectory(xyz, top).save_pdb(topfile)
    return topfile
def create_water_topology_on_disc(n):
    """Write a PDB file with ``n`` water-like residues and return its path.

    Each residue ``r<i>`` holds an H, an O and a second H atom, with both
    hydrogens bonded to the oxygen. All coordinates are zero.

    Parameters
    ----------
    n : int
        Number of residues to create.

    Returns
    -------
    topfile : str
        Path of the temporary '.pdb' file. The file is not deleted here.
    """
    top = Topology()
    chain = top.add_chain()

    # ``range`` works on both Python 2 and 3; ``xrange`` exists only on 2.
    for i in range(n):
        res = top.add_residue('r%i' % i, chain)
        h1 = top.add_atom('H', hydrogen, res)
        o = top.add_atom('O', oxygen, res)
        h2 = top.add_atom('H', hydrogen, res)
        top.add_bond(h1, o)
        top.add_bond(h2, o)

    xyz = np.zeros((n * 3, 3))

    # tempfile.mktemp only hands out a name, which is deprecated and racy.
    # Reserve the file on disk and let save_pdb overwrite it
    # (assumes save_pdb overwrites existing files by default -- TODO confirm).
    with tempfile.NamedTemporaryFile(suffix='.pdb', delete=False) as handle:
        topfile = handle.name

    Trajectory(xyz, top).save_pdb(topfile)
    return topfile
def topology(self):
    """Rebuild the topology stored in the file.

    The 'topology' node under '/' holds a JSON document with 'chains' (each
    with 'residues', each with 'atoms') and 'bonds' (pairs of atom indices).
    Chains, residues and atoms are added in order of their 'index' entries.

    Returns
    -------
    topology : mdtraj.Topology or None
        The reconstructed topology, or None when the file has no 'topology'
        node.
    """
    index_of = operator.itemgetter('index')

    try:
        stored = self._get_node('/', name='topology')[0]
        if not isinstance(stored, string_types):
            stored = stored.decode()
        description = json.loads(stored)
    except self.tables.NoSuchNodeError:
        return None

    built = Topology()

    for chain_info in sorted(description['chains'], key=index_of):
        built_chain = built.add_chain()
        for residue_info in sorted(chain_info['residues'], key=index_of):
            if "resSeq" in residue_info:
                res_seq = residue_info["resSeq"]
            else:
                res_seq = None
                warnings.warn('No resSeq information found in HDF file, defaulting to zero-based indices')
            # Files without segment information get an empty segment id.
            seg_id = residue_info.get("segmentID", "")
            built_residue = built.add_residue(residue_info['name'],
                                              built_chain, resSeq=res_seq,
                                              segment_id=seg_id)
            for atom_info in sorted(residue_info['atoms'], key=index_of):
                try:
                    element = elem.get_by_symbol(atom_info['element'])
                except KeyError:
                    # Missing or unknown element symbol.
                    element = elem.virtual
                built.add_atom(atom_info['name'], element, built_residue)

    all_atoms = list(built.atoms)
    for left, right in description['bonds']:
        built.add_bond(all_atoms[left], all_atoms[right])

    return built
def topology(self):
    """Rebuild the topology stored in the file.

    The "topology" node under "/" holds a JSON document with "chains" (each
    with "residues", each with "atoms") and "bonds" (pairs of atom indices).
    Chains, residues and atoms are added in order of their "index" entries.

    Returns
    -------
    topology : mdtraj.Topology or None
        The reconstructed topology, or None when the file has no "topology"
        node.
    """
    index_key = operator.itemgetter("index")

    try:
        text = self._get_node("/", name="topology")[0]
        if not isinstance(text, string_types):
            text = text.decode()
        spec = json.loads(text)
    except self.tables.NoSuchNodeError:
        return None

    rebuilt = Topology()

    for chain_spec in sorted(spec["chains"], key=index_key):
        rebuilt_chain = rebuilt.add_chain()
        for residue_spec in sorted(chain_spec["residues"], key=index_key):
            if "resSeq" in residue_spec:
                res_seq = residue_spec["resSeq"]
            else:
                res_seq = None
                warnings.warn("No resSeq information found in HDF file, defaulting to zero-based indices")
            rebuilt_residue = rebuilt.add_residue(residue_spec["name"],
                                                  rebuilt_chain,
                                                  resSeq=res_seq)
            for atom_spec in sorted(residue_spec["atoms"], key=index_key):
                try:
                    element = elem.get_by_symbol(atom_spec["element"])
                except KeyError:
                    # Missing or unknown element symbol: leave it unset.
                    element = None
                rebuilt.add_atom(atom_spec["name"], element, rebuilt_residue)

    atom_table = list(rebuilt.atoms)
    for idx_a, idx_b in spec["bonds"]:
        rebuilt.add_bond(atom_table[idx_a], atom_table[idx_b])

    return rebuilt
def mutate(self, mut_res_idx, mut_new_resname):
    """Replace one residue of ``self.topology`` by another residue type.

    A new topology is assembled: every residue except the mutated one is
    copied atom by atom, the mutated one is built by
    ``self._add_mutated_residue``. Bonds are then re-created between the
    atoms that have the same indices as in the current topology. The
    previous topology is kept in ``self._prev_topology``.

    Parameters
    ----------
    mut_res_idx : int
        Index of residue to mutate.
    mut_new_resname : str
        Three-letter code of residue to mutate to.
    """
    assert (self.topology.residue(mut_res_idx).name != mut_new_resname), "mutating the residue to itself!"

    mutated = Topology()
    for src_chain in self.topology.chains:
        dst_chain = mutated.add_chain()
        for src_residue in src_chain._residues:
            position = src_residue.index
            if position != mut_res_idx:
                # Untouched residue: duplicate its atoms one to one.
                dst_residue = mutated.add_residue(src_residue.name, dst_chain,
                                                  position)
                for atom in src_residue.atoms:
                    element = md.core.element.get_by_symbol(
                        atom.element.symbol)
                    mutated.add_atom(atom.name, element, dst_residue,
                                     serial=atom.index)
            else:
                # The residue being mutated is built by the helper.
                self._add_mutated_residue(mut_new_resname, mutated, dst_chain,
                                          position, src_residue)

    # Connectivity is carried over by atom index.
    for bonded_pair in self.topology._bonds:
        atm1, atm2 = bonded_pair
        first = mutated.atom(atm1.index)
        second = mutated.atom(atm2.index)
        mutated.add_bond(first, second)

    self._prev_topology = self.topology.copy()
    self.topology = mutated
class PDBTrajectoryFile(object):
    """Interface for reading and writing Protein Data Bank (PDB) files

    Parameters
    ----------
    filename : str
        The filename to open. A path to a file on disk.
    mode : {'r', 'w'}
        The mode in which to open the file, either 'r' for read or 'w' for
        write.
    force_overwrite : bool
        If opened in write mode, and a file by the name of `filename` already
        exists on disk, should we overwrite it?

    Attributes
    ----------
    positions : np.ndarray, shape=(n_frames, n_atoms, 3)
    topology : mdtraj.Topology
    closed : bool

    Notes
    -----
    When writing pdb files, mdtraj follows the PDB3.0 standard as closely as
    possible. During *reading* however, we try to be more lenient. For
    instance, we will parse common nonstandard atom names during reading, and
    convert them into the standard names. The replacement table used by
    mdtraj is at {mdtraj_source}/formats/pdb/data/pdbNames.xml.

    See Also
    --------
    mdtraj.load_pdb : High-level wrapper that returns a ``md.Trajectory``
    """
    distance_unit = 'angstroms'
    _residueNameReplacements = {}
    _atomNameReplacements = {}
    _chain_names = [chr(ord('A') + i) for i in range(26)]

    def __init__(self, filename, mode='r', force_overwrite=True):
        self._open = False
        self._file = None
        self._topology = None
        self._positions = None
        self._mode = mode
        self._last_topology = None
        # Defined up front so the unitcell properties also work in write mode.
        self._unitcell_lengths = None
        self._unitcell_angles = None

        if mode == 'r':
            PDBTrajectoryFile._loadNameReplacementTables()

            if _is_url(filename):
                self._file = urlopen(filename)
                if filename.lower().endswith('.gz'):
                    if six.PY3:
                        self._file = gzip.GzipFile(fileobj=self._file)
                    else:
                        self._file = gzip.GzipFile(fileobj=six.StringIO(
                            self._file.read()))
                if six.PY3:
                    self._file = six.StringIO(self._file.read().decode('utf-8'))
            else:
                self._file = open_maybe_zipped(filename, 'r')

            self._read_models()
        elif mode == 'w':
            self._header_written = False
            self._footer_written = False
            self._file = open_maybe_zipped(filename, 'w', force_overwrite)
        else:
            raise ValueError("invalid mode: %s" % mode)

        self._open = True

    def write(self, positions, topology, modelIndex=None, unitcell_lengths=None,
              unitcell_angles=None, bfactors=None):
        """Write a PDB file to disk

        Parameters
        ----------
        positions : array_like
            The list of atomic positions to write.
        topology : mdtraj.Topology
            The Topology defining the model to write.
        modelIndex : {int, None}
            If not None, the model will be surrounded by MODEL/ENDMDL records
            with this index
        unitcell_lengths : {tuple, None}
            Lengths of the three unit cell vectors, or None for a non-periodic
            system
        unitcell_angles : {tuple, None}
            Angles between the three unit cell vectors, or None for a
            non-periodic system
        bfactors : array_like, default=None, shape=(n_atoms,)
            Save bfactors with pdb file. Should contain a single number for
            each atom in the topology

        Raises
        ------
        ValueError
            If the file is not open for writing, the number of positions does
            not match the number of atoms, a position is NaN or infinite, or
            a bfactor lies outside (-10, 100).
        """
        if not self._mode == 'w':
            raise ValueError('file not opened for writing')
        if not self._header_written:
            self._write_header(unitcell_lengths, unitcell_angles)
            self._header_written = True

        if ilen(topology.atoms) != len(positions):
            raise ValueError('The number of positions must match the number of atoms')
        if np.any(np.isnan(positions)):
            raise ValueError('Particle position is NaN')
        if np.any(np.isinf(positions)):
            raise ValueError('Particle position is infinite')

        # Hack to save the topology of the last frame written, allows us to
        # output CONECT entries in write_footer()
        self._last_topology = topology

        if bfactors is None:
            bfactors = ['{0:5.2f}'.format(0.0)] * len(positions)
        else:
            if (np.max(bfactors) >= 100) or (np.min(bfactors) <= -10):
                raise ValueError("bfactors must be in (-10, 100)")

            bfactors = ['{0:5.2f}'.format(b) for b in bfactors]

        atomIndex = 1
        posIndex = 0
        if modelIndex is not None:
            # Model serial number occupies columns 11-14.
            print("MODEL     %4d" % modelIndex, file=self._file)
        for (chainIndex, chain) in enumerate(topology.chains):
            chainName = self._chain_names[chainIndex % len(self._chain_names)]
            residues = list(chain.residues)
            for (resIndex, res) in enumerate(residues):
                # Residue names are limited to three columns.
                resName = res.name[:3]
                for atom in res.atoms:
                    if len(atom.name) < 4 and atom.name[:1].isalpha() and (atom.element is None or len(atom.element.symbol) < 2):
                        # Pad short names of one-letter elements with a space.
                        atomName = ' '+atom.name
                    else:
                        atomName = atom.name[:4]
                    coords = positions[posIndex]
                    if atom.element is not None:
                        symbol = atom.element.symbol
                    else:
                        symbol = ' '
                    # Fixed-width ATOM record; indices wrap to fit their fields.
                    line = "ATOM  %5d %-4s %3s %s%4d    %s%s%s  1.00 %s          %2s  " % (
                        atomIndex % 100000, atomName, resName, chainName,
                        (res.resSeq) % 10000, _format_83(coords[0]),
                        _format_83(coords[1]), _format_83(coords[2]),
                        bfactors[posIndex], symbol)
                    assert len(line) == 80, 'Fixed width overflow detected'
                    print(line, file=self._file)
                    posIndex += 1
                    atomIndex += 1
                if resIndex == len(residues)-1:
                    # The TER record consumes a serial number of its own.
                    print("TER   %5d      %3s %s%4d" % (atomIndex % 100000, resName, chainName, res.resSeq % 10000), file=self._file)
                    atomIndex += 1

        if modelIndex is not None:
            print("ENDMDL", file=self._file)

    def _write_header(self, unitcell_lengths, unitcell_angles, write_metadata=True):
        """Write out the header for a PDB file.

        Nothing is written when both arguments are None.

        Parameters
        ----------
        unitcell_lengths : {tuple, None}
            The lengths of the three unitcell vectors, ``a``, ``b``, ``c``
        unitcell_angles : {tuple, None}
            The angles between the three unitcell vectors, ``alpha``,
            ``beta``, ``gamma``
        write_metadata : bool, default=True
            Whether to precede the CRYST1 record with a REMARK record naming
            the MDTraj version and the current date.

        Raises
        ------
        ValueError
            If the file is not open for writing, only one of the two
            arguments is given, or either does not have length 3.
        """
        if not self._mode == 'w':
            raise ValueError('file not opened for writing')

        if unitcell_lengths is None and unitcell_angles is None:
            return
        if unitcell_lengths is not None and unitcell_angles is not None:
            if not len(unitcell_lengths) == 3:
                raise ValueError('unitcell_lengths must be length 3')
            if not len(unitcell_angles) == 3:
                raise ValueError('unitcell_angles must be length 3')
        else:
            raise ValueError('either unitcell_lengths and unitcell_angles '
                             'should both be specified, or neither')

        box = list(unitcell_lengths) + list(unitcell_angles)
        assert len(box) == 6

        if write_metadata:
            print("REMARK   1 CREATED WITH MDTraj %s, %s" % (version.version, str(date.today())), file=self._file)
        # Space group in columns 56-66, Z value right-justified in 67-70.
        print("CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1           1 " % tuple(box), file=self._file)

    def _write_footer(self):
        """Write the CONECT records of the last written topology, then END.

        Raises
        ------
        ValueError
            If the file is not open for writing.
        """
        if not self._mode == 'w':
            raise ValueError('file not opened for writing')

        # Identify bonds that should be listed as CONECT records.
        standardResidues = ['ALA', 'ASN', 'CYS', 'GLU', 'HIS', 'LEU', 'MET', 'PRO', 'THR', 'TYR',
                            'ARG', 'ASP', 'GLN', 'GLY', 'ILE', 'LYS', 'PHE', 'SER', 'TRP', 'VAL',
                            'A', 'G', 'C', 'U', 'I', 'DA', 'DG', 'DC', 'DT', 'DI', 'HOH']
        conectBonds = []
        if self._last_topology is not None:
            for atom1, atom2 in self._last_topology.bonds:
                if atom1.residue.name not in standardResidues or atom2.residue.name not in standardResidues:
                    conectBonds.append((atom1, atom2))
                elif atom1.name == 'SG' and atom2.name == 'SG' and atom1.residue.name == 'CYS' and atom2.residue.name == 'CYS':
                    conectBonds.append((atom1, atom2))
        if len(conectBonds) > 0:

            # Work out the index used in the PDB file for each atom.
            atomIndex = {}
            nextAtomIndex = 0
            prevChain = None
            for chain in self._last_topology.chains:
                for atom in chain.atoms:
                    if atom.residue.chain != prevChain:
                        # Skip the serial consumed by the previous chain's TER
                        # record (and start counting at 1 for the first chain).
                        nextAtomIndex += 1
                        prevChain = atom.residue.chain
                    atomIndex[atom] = nextAtomIndex
                    nextAtomIndex += 1

            # Record which other atoms each atom is bonded to.
            atomBonds = {}
            for atom1, atom2 in conectBonds:
                index1 = atomIndex[atom1]
                index2 = atomIndex[atom2]
                if index1 not in atomBonds:
                    atomBonds[index1] = []
                if index2 not in atomBonds:
                    atomBonds[index2] = []
                atomBonds[index1].append(index2)
                atomBonds[index2].append(index1)

            # Write the CONECT records.
            for index1 in sorted(atomBonds):
                bonded = atomBonds[index1]
                while len(bonded) > 4:
                    # A CONECT record holds up to four partners; write all
                    # four that are removed so no bond is silently dropped.
                    print("CONECT%5d%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2], bonded[3]), file=self._file)
                    del bonded[:4]
                line = "CONECT%5d" % index1
                for index2 in bonded:
                    line = "%s%5d" % (line, index2)
                print(line, file=self._file)
        print("END", file=self._file)
        self._footer_written = True

    @classmethod
    def set_chain_names(cls, values):
        """Set the cycle of chain names used when writing PDB files

        When writing PDB files, PDBTrajectoryFile translates each chain's
        index into a name -- the name is what's written in the file. By
        default, chains are named with the letters A-Z.

        Parameters
        ----------
        values : list
            A list of characters (strings of length 1) that the PDB writer
            will cycle through to choose chain names.

        Raises
        ------
        TypeError
            If any item is not a string of length 1.
        """
        for item in values:
            # Reject anything that is not both a string and one character long.
            if not (isinstance(item, six.string_types) and len(item) == 1):
                raise TypeError('Names must be a single character string')
        cls._chain_names = values

    @property
    def positions(self):
        """The cartesian coordinates of all of the atoms in each frame. Available when a file is opened in mode='r'
        """
        return self._positions

    @property
    def topology(self):
        """The topology from this PDB file. Available when a file is opened in mode='r'
        """
        return self._topology

    @property
    def unitcell_lengths(self):
        "The unitcell lengths (3-tuple) in this PDB file. May be None"
        return self._unitcell_lengths

    @property
    def unitcell_angles(self):
        "The unitcell angles (3-tuple) in this PDB file. May be None"
        return self._unitcell_angles

    @property
    def closed(self):
        "Whether the file is closed"
        return not self._open

    def close(self):
        "Close the PDB file"
        # Only finish files that were actually opened; otherwise ``_file`` is
        # None and print() would send the footer to stdout.
        if self._open and self._mode == 'w' and not self._footer_written:
            self._write_footer()
        if self._open:
            self._file.close()
        self._open = False

    def _read_models(self):
        """Parse the open file into a topology, positions and unit cell.

        Raises
        ------
        ValueError
            If the file is not open for reading or the MODELs differ in
            their number of atoms.
        """
        if not self._mode == 'r':
            raise ValueError('file not opened for reading')

        self._topology = Topology()

        pdb = PdbStructure(self._file, load_all_models=True)

        atomByNumber = {}
        for chain in pdb.iter_chains():
            c = self._topology.add_chain()
            for residue in chain.iter_residues():
                resName = residue.get_name()
                if resName in PDBTrajectoryFile._residueNameReplacements:
                    resName = PDBTrajectoryFile._residueNameReplacements[resName]
                r = self._topology.add_residue(resName, c, residue.number)
                r.segment_id = residue.segment_id
                if resName in PDBTrajectoryFile._atomNameReplacements:
                    atomReplacements = PDBTrajectoryFile._atomNameReplacements[resName]
                else:
                    atomReplacements = {}
                for atom in residue.atoms:
                    atomName = atom.get_name()
                    if atomName in atomReplacements:
                        atomName = atomReplacements[atomName]
                    atomName = atomName.strip()
                    element = atom.element
                    if element is None:
                        element = self._guess_element(atomName, residue)

                    newAtom = self._topology.add_atom(atomName, element, r, serial=atom.serial_number)
                    atomByNumber[atom.serial_number] = newAtom

        # load all of the positions (from every model)
        _positions = []
        for model in pdb.iter_models(use_all_models=True):
            coords = []
            for chain in model.iter_chains():
                for residue in chain.iter_residues():
                    for atom in residue.atoms:
                        coords.append(atom.get_position())
            _positions.append(coords)

        if not all(len(f) == len(_positions[0]) for f in _positions):
            raise ValueError('PDB Error: All MODELs must contain the same number of ATOMs')

        # The atom positions read from the PDB file
        self._positions = np.array(_positions)
        self._unitcell_lengths = pdb.get_unit_cell_lengths()
        self._unitcell_angles = pdb.get_unit_cell_angles()
        self._topology.create_standard_bonds()
        self._topology.create_disulfide_bonds(self.positions[0])

        # Add bonds based on CONECT records.
        connectBonds = []
        for connect in pdb.models[-1].connects:
            i = connect[0]
            for j in connect[1:]:
                if i in atomByNumber and j in atomByNumber:
                    connectBonds.append((atomByNumber[i], atomByNumber[j]))
        if len(connectBonds) > 0:
            # Only add bonds that don't already exist.
            existingBonds = set(self._topology.bonds)
            for bond in connectBonds:
                if bond not in existingBonds and (bond[1], bond[0]) not in existingBonds:
                    self._topology.add_bond(bond[0], bond[1])
                    existingBonds.add(bond)

    @staticmethod
    def _loadNameReplacementTables():
        """Load the list of atom and residue name replacements."""
        if len(PDBTrajectoryFile._residueNameReplacements) == 0:
            tree = etree.parse(os.path.join(os.path.dirname(__file__), 'data', 'pdbNames.xml'))
            allResidues = {}
            proteinResidues = {}
            nucleicAcidResidues = {}
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                if name == 'All':
                    PDBTrajectoryFile._parseResidueAtoms(residue, allResidues)
                elif name == 'Protein':
                    PDBTrajectoryFile._parseResidueAtoms(residue, proteinResidues)
                elif name == 'Nucleic':
                    PDBTrajectoryFile._parseResidueAtoms(residue, nucleicAcidResidues)
            # Replacements valid for all residues also apply to both families.
            for atom in allResidues:
                proteinResidues[atom] = allResidues[atom]
                nucleicAcidResidues[atom] = allResidues[atom]
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                for attr_key in residue.attrib:
                    if attr_key == 'name' or attr_key.startswith('alt'):
                        PDBTrajectoryFile._residueNameReplacements[residue.attrib[attr_key]] = name
                if 'type' not in residue.attrib:
                    atoms = copy(allResidues)
                elif residue.attrib['type'] == 'Protein':
                    atoms = copy(proteinResidues)
                elif residue.attrib['type'] == 'Nucleic':
                    atoms = copy(nucleicAcidResidues)
                else:
                    atoms = copy(allResidues)
                PDBTrajectoryFile._parseResidueAtoms(residue, atoms)
                PDBTrajectoryFile._atomNameReplacements[name] = atoms

    def _guess_element(self, atom_name, residue):
        "Try to guess the element name"

        upper = atom_name.upper()
        if upper.startswith('CL'):
            element = elem.chlorine
        elif upper.startswith('NA'):
            element = elem.sodium
        elif upper.startswith('MG'):
            element = elem.magnesium
        elif upper.startswith('BE'):
            element = elem.beryllium
        elif upper.startswith('LI'):
            element = elem.lithium
        elif upper.startswith('K'):
            element = elem.potassium
        elif upper.startswith('ZN'):
            element = elem.zinc
        elif len(residue) == 1 and upper.startswith('CA'):
            element = elem.calcium

        # TJL has edited this. There are a few issues here. First,
        # parsing for the element is non-trivial, so I do my best
        # below. Second, there is additional parsing code in
        # pdbstructure.py, and I am unsure why it doesn't get used
        # here...
        elif len(residue) > 1 and upper.startswith('CE'):
            element = elem.carbon  # (probably) not Celenium...
        elif len(residue) > 1 and upper.startswith('CD'):
            element = elem.carbon  # (probably) not Cadmium...
        elif residue.name in ['TRP', 'ARG', 'GLN', 'HIS'] and upper.startswith('NE'):
            element = elem.nitrogen  # (probably) not Neon...
        elif residue.name in ['ASN'] and upper.startswith('ND'):
            element = elem.nitrogen  # (probably) not ND...
        elif residue.name == 'CYS' and upper.startswith('SG'):
            element = elem.sulfur  # (probably) not SG...
        else:
            # Try the first letter, then a cleaned-up two-letter prefix.
            try:
                element = elem.get_by_symbol(atom_name[0])
            except KeyError:
                try:
                    symbol = atom_name[0:2].strip().rstrip("AB0123456789").lstrip("0123456789")
                    element = elem.get_by_symbol(symbol)
                except KeyError:
                    element = None

        return element

    @staticmethod
    def _parseResidueAtoms(residue, map):
        """Map every attribute value of each 'Atom' child to the atom's name."""
        for atom in residue.findall('Atom'):
            name = atom.attrib['name']
            for attr_key in atom.attrib:
                map[atom.attrib[attr_key]] = name

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def __len__(self):
        "Number of frames in the file"
        if str(self._mode) != 'r':
            raise NotImplementedError('len() only available in mode="r" currently')
        if not self._open:
            raise ValueError('I/O operation on closed file')
        return len(self._positions)
def _to_topology(self, atom_list, chain_types=None, residue_types=None):
    """Create a mdtraj.Topology from a Compound.

    Each atom is placed in the chain/residue of its first ancestor matching
    ``chain_types``/``residue_types``; atoms without such an ancestor go to a
    default chain or a default 'RES' residue. Chains and residues left
    without atoms are removed, and only bonds whose two atoms are both in
    ``atom_list`` are added.

    Parameters
    ----------
    atom_list :
        Atoms to convert; each must provide ``ancestors()``, ``name`` and
        ``charge``.
    chain_types :
        Type, or list/set/tuple of types, whose instances start a chain.
    residue_types :
        Type, or list/set/tuple of types, whose instances start a residue.

    Returns
    -------
    top : mtraj.Topology
    """
    def as_type_tuple(types):
        # Same normalisation for chain and residue types.
        if isinstance(types, Compound):
            types = [Compound]
        if isinstance(types, (list, set)):
            types = tuple(types)
        return types

    chain_types = as_type_tuple(chain_types)
    residue_types = as_type_tuple(residue_types)

    top = Topology()
    atom_mapping = {}

    default_chain = top.add_chain()
    default_residue = top.add_residue('RES', default_chain)

    last_residue_compound = None
    last_chain_compound = None
    last_residue = None
    last_chain = None

    no_match = object()

    for atom in atom_list:
        # Chains: first ancestor that is an instance of a chain type.
        chain_parent = next(
            (parent for parent in atom.ancestors()
             if chain_types and isinstance(parent, chain_types)),
            no_match)
        if chain_parent is no_match:
            last_chain = default_chain
            last_chain.compound = last_chain_compound
        elif chain_parent != last_chain_compound:
            last_chain_compound = chain_parent
            last_chain = top.add_chain()
            last_chain_default_residue = top.add_residue('RES', last_chain)
            last_chain.compound = last_chain_compound

        # Residues: first ancestor that is an instance of a residue type.
        residue_parent = next(
            (parent for parent in atom.ancestors()
             if residue_types and isinstance(parent, residue_types)),
            no_match)
        if residue_parent is no_match:
            if last_chain != default_chain:
                last_residue = last_chain_default_residue
            else:
                last_residue = default_residue
            last_residue.compound = last_residue_compound
        elif residue_parent != last_residue_compound:
            last_residue_compound = residue_parent
            last_residue = top.add_residue(residue_parent.__class__.__name__,
                                           last_chain)
            last_residue.compound = last_residue_compound

        # Add the actual atoms; unknown symbols fall back to "VS".
        try:
            elem = get_by_symbol(atom.name)
        except KeyError:
            elem = get_by_symbol("VS")
        at = top.add_atom(atom.name, elem, last_residue)
        at.charge = atom.charge
        atom_mapping[atom] = at

    # Remove empty default residues.
    empty_chains = [chain for chain in top.chains if chain.n_atoms == 0]
    empty_residues = [res for res in top.residues if res.n_atoms == 0]
    for chain in empty_chains:
        top._chains.remove(chain)
    for res in empty_residues:
        for chain in top.chains:
            if res in chain._residues:
                chain._residues.remove(res)

    for atom1, atom2 in self.bonds():
        # Ensure that both atoms are part of the compound. This becomes an
        # issue if you try to convert a sub-compound to a topology which is
        # bonded to a different subcompound.
        if atom1 in atom_mapping and atom2 in atom_mapping:
            top.add_bond(atom_mapping[atom1], atom_mapping[atom2])
    return top
def load_hoomdxml(filename, top=None):
    """Load a single conformation from an HOOMD-Blue XML file.

    For more information on this file format, see:
    http://codeblue.umich.edu/hoomd-blue/doc/page_xml_file_format.html
    Notably, all node names and attributes are in all lower case.
    HOOMD-Blue does not contain residue and chain information explicitly.
    For this reason, chains will be found by looping over all the bonds and
    finding what is bonded to what.
    Each chain consists of exactly one residue.

    Parameters
    ----------
    filename : string
        The path on disk to the XML file
    top : None
        This argument is ignored

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object, with
        corresponding Topology.

    Notes
    -----
    This function requires the NetworkX python package (presumably through
    the chain-finding helper; verify).
    """
    from mdtraj.core.trajectory import Trajectory
    from mdtraj.core.topology import Topology

    topology = Topology()
    root = cElementTree.parse(filename).getroot()
    config = root.find('configuration')
    position = config.find('position')
    bond = config.find('bond')
    atom_type = config.find('type')  # MDTraj calls this "name"

    box = config.find('box')
    # be generous for case of box attributes
    box.attrib = {key.lower(): val for key, val in box.attrib.items()}
    lx = float(box.attrib['lx'])
    ly = float(box.attrib['ly'])
    lz = float(box.attrib['lz'])

    # Tilt factors default to zero when any of them is absent or malformed.
    try:
        xy = float(box.attrib['xy'])
        xz = float(box.attrib['xz'])
        yz = float(box.attrib['yz'])
    except (ValueError, KeyError):
        xy = xz = yz = 0.0

    unitcell_vectors = np.array([[[lx, xy * ly, xz * lz],
                                  [0.0, ly, yz * lz],
                                  [0.0, 0.0, lz]]])

    # The first line of each text node is skipped.
    positions = []
    for row in position.text.splitlines()[1:]:
        fields = row.split()
        positions.append((float(fields[0]), float(fields[1]),
                          float(fields[2])))

    types = {}
    for idx, row in enumerate(atom_type.text.splitlines()[1:]):
        types[idx] = str(row.split()[0])

    if len(types) != len(positions):
        raise ValueError('Different number of types and positions in xml file')

    # ignore the bond type
    if hasattr(bond, 'text'):
        bonds = []
        for row in bond.text.splitlines()[1:]:
            bonds.append((int(row.split()[1]), int(row.split()[2])))
        chains = _find_chains(bonds)
    else:
        chains = []
        bonds = []

    # Relate the first index in the bonded-group to mdtraj.Residue
    bonded_to_residue = {}
    for atom_idx in range(len(types)):
        bonded_group = _in_chain(chains, atom_idx)
        if bonded_group is None:
            # Unbonded particle: its own chain and residue.
            t_chain = topology.add_chain()
            t_residue = topology.add_residue('A', t_chain)
            topology.add_atom(types[atom_idx], virtual_site, t_residue)
            continue
        group_key = bonded_group[0]
        if group_key not in bonded_to_residue:
            t_chain = topology.add_chain()
            t_residue = topology.add_residue('A', t_chain)
            bonded_to_residue[group_key] = t_residue
        topology.add_atom(types[atom_idx], virtual_site,
                          bonded_to_residue[group_key])

    for first, second in ((pair[0], pair[1]) for pair in bonds):
        topology.add_bond(topology.atom(first), topology.atom(second))

    traj = Trajectory(xyz=np.array(positions), topology=topology)
    traj.unitcell_vectors = unitcell_vectors

    return traj
def __init__(self, topology, use_chains=None):
    """Set up a C-alpha / C-beta coarse-grained mapping.

    Every residue of the selected chains gets a 'CA' bead; every residue
    other than GLY additionally gets a 'CB<code>' bead bonded to its CA.
    Consecutive CA beads are bonded when they belong to the same chain.
    Residues are numbered consecutively starting at 1.

    Parameters
    ----------
    topology : mdtraj.Topology
        All-atom reference topology; a copy is kept in
        ``self._ref_topology``.
    use_chains : container of int, optional
        Positions in ``topology._chains`` of the chains to map. By default
        all chains are used.

    Raises
    ------
    IndexError
        If a residue of a selected chain has no atom named "CA".

    Notes
    -----
    Sets ``self.topology``, ``self._ca_idxs`` (rows of
    ``[all-atom CA index, bead index]``), ``self._sidechain_idxs``,
    ``self._sidechain_mass`` (one entry per residue, empty for GLY) and
    ``self._chain_indices`` (source chain position of every bead).
    """
    if use_chains is None:
        use_chains = range(len(topology._chains))

    self._ref_topology = topology.copy()

    # Build new topology
    newTopology = Topology()
    new_atm_idx = 0
    res_idx = 1
    prev_ca = None
    ca_idxs = []
    self._sidechain_idxs = []
    self._sidechain_mass = []
    self._chain_indices = []
    for chain_count, chain in enumerate(topology._chains):
        if chain_count not in use_chains:
            continue
        newChain = newTopology.add_chain()
        for residue in chain._residues:
            newResidue = newTopology.add_residue(residue.name, newChain,
                                                 res_idx)

            # Fail with a useful message instead of printing debug output
            # and re-raising a bare "list index out of range".
            ca_atoms = [atm.index for atm in residue.atoms
                        if atm.name == "CA"]
            if not ca_atoms:
                raise IndexError("residue %s in chain %s has no CA atom"
                                 % (residue, chain))

            # map CA
            new_ca = newTopology.add_atom(
                'CA', md.core.element.get_by_symbol('C'), newResidue,
                serial=new_atm_idx)
            self._chain_indices.append(chain_count)

            # only bond atoms in the same chain.
            if (prev_ca is not None and
                    new_ca.residue.chain.index == prev_ca.residue.chain.index):
                newTopology.add_bond(prev_ca, new_ca)
            prev_ca = new_ca

            ca_idxs.append([ca_atoms[0], new_atm_idx])
            new_atm_idx += 1

            if residue.name == 'GLY':
                # Glycine has no side-chain bead.
                self._sidechain_idxs.append([])
                self._sidechain_mass.append([])
            else:
                # map CB
                cb_name = "CB%s" % atom_types.residue_code[residue.name]
                new_cb = newTopology.add_atom(
                    cb_name, md.core.element.get_by_symbol('C'), newResidue,
                    serial=new_atm_idx)
                self._chain_indices.append(chain_count)
                newTopology.add_bond(new_cb, new_ca)

                # Non-hydrogen side-chain atoms represented by this bead.
                heavy_sidechain = [atm for atm in residue.atoms
                                   if atm.is_sidechain
                                   and atm.element.symbol != "H"]
                self._sidechain_idxs.append(
                    [[atm.index for atm in heavy_sidechain], new_atm_idx])
                self._sidechain_mass.append(
                    np.array([atm.element.mass for atm in heavy_sidechain]))
                new_atm_idx += 1
            res_idx += 1

    self._ca_idxs = np.array(ca_idxs)
    self.topology = newTopology
    assert self.topology.n_atoms == len(self._chain_indices)
def extract(item, atom_indices='all', copy_if_all=True, check=True):
    """Extract a sub-topology holding only the selected atoms.

    Parameters
    ----------
    item : mdtraj.Topology
        Source topology.
    atom_indices : 'all' or iterable of int, default='all'
        Indices of the atoms to keep. With 'all' the whole item is returned.
    copy_if_all : bool, default=True
        When every atom is selected, return a deep copy of ``item`` (True)
        or ``item`` itself (False).
    check : bool, default=True
        Whether to run ``digest_item`` and ``digest_atom_indices`` on the
        arguments first.

    Returns
    -------
    tmp_item : mdtraj.Topology
        The copy, the item itself, or a new topology in which groups and
        chains left without atoms have been removed. Bonds are kept only
        when both of their atoms were selected.
    """
    if check:

        digest_item(item, 'mdtraj.Topology')
        atom_indices = digest_atom_indices(atom_indices)

    # Compare by value: ``is 'all'`` depends on string interning, and a plain
    # ``==`` would be evaluated elementwise if an array is passed.
    if isinstance(atom_indices, str) and atom_indices == 'all':

        if copy_if_all:
            from copy import deepcopy
            tmp_item = deepcopy(item)
        else:
            tmp_item = item

    else:

        from mdtraj.core.topology import Topology
        from mdtraj.utils import ilen

        atom_indices_to_be_kept = set(atom_indices)
        newTopology = Topology()
        old_atom_to_new_atom = {}

        for chain in item._chains:
            newChain = newTopology.add_chain()
            for group in chain._groups:
                resSeq = getattr(group, 'resSeq', None) or group.index
                newResidue = newTopology.add_group(group.name, newChain,
                                                   resSeq, group.segment_id)
                for atom in group._atoms:
                    if atom.index in atom_indices_to_be_kept:
                        try:  # OpenMM Topology objects don't have serial attributes, so we have to check first.
                            serial = atom.serial
                        except AttributeError:
                            serial = None
                        newAtom = newTopology.add_atom(atom.name, atom.element,
                                                       newResidue, serial=serial)
                        old_atom_to_new_atom[atom] = newAtom

        # ``bonds`` may be an iterable attribute or a method returning one.
        bondsiter = item.bonds
        if not hasattr(bondsiter, '__iter__'):
            bondsiter = bondsiter()

        for bond in bondsiter:
            try:
                atom1, atom2 = bond
                newTopology.add_bond(old_atom_to_new_atom[atom1],
                                     old_atom_to_new_atom[atom2],
                                     type=bond.type,
                                     order=bond.order)
            except KeyError:
                pass
                # we only put bonds into the new topology if both of their partners
                # were indexed and thus HAVE a new atom

        # Delete empty groups
        newTopology._groups = [
            r for r in newTopology._groups if len(r._atoms) > 0
        ]
        for chain in newTopology._chains:
            chain._groups = [r for r in chain._groups if len(r._atoms) > 0]

        # Delete empty chains
        newTopology._chains = [
            c for c in newTopology._chains if len(c._groups) > 0
        ]
        # Re-set the numAtoms and numResidues
        newTopology._numAtoms = ilen(newTopology.atoms)
        newTopology._numResidues = ilen(newTopology.groups)

        tmp_item = newTopology

    return tmp_item
def _to_topology(self, atom_list, chain_types=None, residue_types=None):
    """Build an mdtraj.Topology for the given atoms of this Compound.

    Parameters
    ----------
    atom_list : iterable
        The particles to add as atoms, in output order.  Each one is
        queried for ``ancestors()``, ``name`` and ``charge``.
    chain_types : type, or list/set/tuple of types, optional
        An ancestor that is an instance of one of these starts a chain.
        Atoms without such an ancestor go into a shared default chain.
    residue_types : type, or list/set/tuple of types, optional
        An ancestor that is an instance of one of these starts a residue
        named after the ancestor's class.  Atoms without such an ancestor
        go into the 'RES' residue of their chain.

    Returns
    -------
    top : mdtraj.Topology
        Chains and residues that ended up without atoms are removed; bonds
        are added only when both partners are in ``atom_list``.
    """
    from mdtraj.core.element import get_by_symbol
    from mdtraj.core.topology import Topology

    def _as_isinstance_arg(types):
        # isinstance() needs a type or a tuple of types.
        if isinstance(types, Compound):
            types = [Compound]
        if isinstance(types, (list, set)):
            types = tuple(types)
        return types

    chain_types = _as_isinstance_arg(chain_types)
    residue_types = _as_isinstance_arg(residue_types)

    top = Topology()
    atom_mapping = {}

    default_chain = top.add_chain()
    default_residue = top.add_residue('RES', default_chain)

    current_chain_compound = None
    current_residue_compound = None
    current_chain = None
    current_residue = None

    # Marks "no matching ancestor found" without colliding with real values.
    not_found = object()

    for particle in atom_list:
        # Chains: first ancestor that is one of the chain types, if any.
        chain_parent = next(
            (anc for anc in particle.ancestors()
             if chain_types and isinstance(anc, chain_types)),
            not_found)
        if chain_parent is not_found:
            current_chain = default_chain
            current_chain.compound = current_chain_compound
        elif chain_parent != current_chain_compound:
            current_chain_compound = chain_parent
            current_chain = top.add_chain()
            # Fallback residue for atoms of this chain that have no
            # residue-type ancestor.
            chain_fallback_residue = top.add_residue('RES', current_chain)
            current_chain.compound = current_chain_compound

        # Residues: first ancestor that is one of the residue types, if any.
        residue_parent = next(
            (anc for anc in particle.ancestors()
             if residue_types and isinstance(anc, residue_types)),
            not_found)
        if residue_parent is not_found:
            if current_chain != default_chain:
                current_residue = chain_fallback_residue
            else:
                current_residue = default_residue
            current_residue.compound = current_residue_compound
        elif residue_parent != current_residue_compound:
            current_residue_compound = residue_parent
            current_residue = top.add_residue(
                residue_parent.__class__.__name__, current_chain)
            current_residue.compound = current_residue_compound

        # Add the actual atom; names that are not element symbols fall back
        # to the "VS" element.
        try:
            element = get_by_symbol(particle.name)
        except KeyError:
            element = get_by_symbol("VS")
        new_atom = top.add_atom(particle.name, element, current_residue)
        new_atom.charge = particle.charge
        atom_mapping[particle] = new_atom

    # Remove empty default residues (and chains).
    empty_chains = [chain for chain in top.chains if chain.n_atoms == 0]
    empty_residues = [res for res in top.residues if res.n_atoms == 0]
    for empty_chain in empty_chains:
        top._chains.remove(empty_chain)
    for empty_residue in empty_residues:
        for chain in top.chains:
            try:
                chain._residues.remove(empty_residue)
            except ValueError:
                # This chain does not hold the residue.
                pass

    # A bond is kept only when both of its atoms were converted; this
    # matters when a sub-compound is bonded to something outside atom_list.
    for first, second in self.bonds():
        if first in atom_mapping and second in atom_mapping:
            top.add_bond(atom_mapping[first], atom_mapping[second])

    return top
class PDBTrajectoryFile(object):
    """Interface for reading and writing Protein Data Bank (PDB) files

    Parameters
    ----------
    filename : str
        The filename to open. A path to a file on disk.
    mode : {'r', 'w'}
        The mode in which to open the file, either 'r' for read or 'w' for write.
    force_overwrite : bool
        If opened in write mode, and a file by the name of `filename` already
        exists on disk, should we overwrite it?

    Attributes
    ----------
    positions : np.ndarray, shape=(n_frames, n_atoms, 3)
    topology : mdtraj.Topology
    closed : bool

    Notes
    -----
    When writing pdb files, mdtraj follows the PDB3.0 standard as closely as
    possible. During *reading* however, we try to be more lenient. For instance,
    we will parse common nonstandard atom names during reading, and convert them
    into the standard names. The replacement table used by mdtraj is at
    {mdtraj_source}/formats/pdb/data/pdbNames.xml.

    See Also
    --------
    mdtraj.load_pdb : High-level wrapper that returns a ``md.Trajectory``
    """
    distance_unit = 'angstroms'
    # Class-level caches filled once by _loadNameReplacementTables().
    _residueNameReplacements = {}
    _atomNameReplacements = {}
    # Chain identifiers cycled through when writing; see set_chain_names().
    _chain_names = [chr(ord('A') + i) for i in range(26)]

    def __init__(self, filename, mode='r', force_overwrite=True):
        self._open = False
        self._file = None
        self._topology = None
        self._positions = None
        self._mode = mode
        self._last_topology = None
        # Defined up front so the unitcell properties return None (rather
        # than raising AttributeError) on a file opened for writing.
        self._unitcell_lengths = None
        self._unitcell_angles = None

        if mode == 'r':
            PDBTrajectoryFile._loadNameReplacementTables()

            if _is_url(filename):
                self._file = urlopen(filename)
                if filename.lower().endswith('.gz'):
                    if six.PY3:
                        self._file = gzip.GzipFile(fileobj=self._file)
                    else:
                        self._file = gzip.GzipFile(fileobj=six.StringIO(
                            self._file.read()))
                if six.PY3:
                    self._file = six.StringIO(self._file.read().decode('utf-8'))
            else:
                if filename.lower().endswith('.gz'):
                    # Read everything into memory and release the gzip
                    # handle immediately instead of leaking it.
                    with gzip.open(filename, 'r') as gz_file:
                        self._file = six.StringIO(gz_file.read().decode('utf-8'))
                else:
                    self._file = open(filename, 'r')

            self._read_models()
        elif mode == 'w':
            self._header_written = False
            self._footer_written = False
            if os.path.exists(filename) and not force_overwrite:
                raise IOError('"%s" already exists' % filename)
            self._file = open(filename, 'w')
        else:
            raise ValueError("invalid mode: %s" % mode)

        self._open = True

    def write(self, positions, topology, modelIndex=None, unitcell_lengths=None,
              unitcell_angles=None, bfactors=None):
        """Write a PDB file to disk

        Parameters
        ----------
        positions : array_like
            The list of atomic positions to write.
        topology : mdtraj.Topology
            The Topology defining the model to write.
        modelIndex : {int, None}
            If not None, the model will be surrounded by MODEL/ENDMDL records
            with this index
        unitcell_lengths : {tuple, None}
            Lengths of the three unit cell vectors, or None for a non-periodic system
        unitcell_angles : {tuple, None}
            Angles between the three unit cell vectors, or None for a non-periodic system
        bfactors : array_like, default=None, shape=(n_atoms,)
            Save bfactors with pdb file. Should contain a single number for
            each atom in the topology

        Raises
        ------
        ValueError
            If the file is not open for writing, the number of positions
            does not match the number of atoms, a position is NaN or
            infinite, or a bfactor lies outside (-10, 100).
        """
        if self._mode != 'w':
            raise ValueError('file not opened for writing')
        if not self._header_written:
            self._write_header(unitcell_lengths, unitcell_angles)
            self._header_written = True

        if ilen(topology.atoms) != len(positions):
            raise ValueError('The number of positions must match the number of atoms')
        if np.any(np.isnan(positions)):
            raise ValueError('Particle position is NaN')
        if np.any(np.isinf(positions)):
            raise ValueError('Particle position is infinite')

        # Hack to save the topology of the last frame written, allows us to
        # output CONECT entries in write_footer()
        self._last_topology = topology

        if bfactors is None:
            bfactors = ['{0:5.2f}'.format(0.0)] * len(positions)
        else:
            # The B-factor field is five characters wide.
            if (np.max(bfactors) >= 100) or (np.min(bfactors) <= -10):
                raise ValueError("bfactors must be in (-10, 100)")
            bfactors = ['{0:5.2f}'.format(b) for b in bfactors]

        atomIndex = 1
        posIndex = 0
        if modelIndex is not None:
            print("MODEL     %4d" % modelIndex, file=self._file)
        for (chainIndex, chain) in enumerate(topology.chains):
            chainName = self._chain_names[chainIndex % len(self._chain_names)]
            residues = list(chain.residues)
            for (resIndex, res) in enumerate(residues):
                # Residue names are truncated to the three-column field.
                resName = res.name[:3] if len(res.name) > 3 else res.name
                for atom in res.atoms:
                    if (len(atom.name) < 4 and atom.name[:1].isalpha() and
                            (atom.element is None or len(atom.element.symbol) < 2)):
                        # Short names of one-letter elements start in the
                        # second column of the atom-name field.
                        atomName = ' ' + atom.name
                    elif len(atom.name) > 4:
                        atomName = atom.name[:4]
                    else:
                        atomName = atom.name
                    coords = positions[posIndex]
                    if atom.element is not None:
                        symbol = atom.element.symbol
                    else:
                        symbol = ' '
                    # Fixed-column ATOM record; serial and resSeq wrap so
                    # that they never overflow their fields.
                    line = ("ATOM  %5d %-4s %3s %s%4d    "  # columns 1-30
                            "%s%s%s"                          # columns 31-54: x, y, z
                            "  1.00 %s"                       # columns 55-66: occupancy, B-factor
                            "          %2s  "                 # columns 67-80: element symbol
                            ) % (
                        atomIndex % 100000, atomName, resName, chainName,
                        (res.resSeq) % 10000, _format_83(coords[0]),
                        _format_83(coords[1]), _format_83(coords[2]),
                        bfactors[posIndex], symbol)
                    assert len(line) == 80, 'Fixed width overflow detected'
                    print(line, file=self._file)
                    posIndex += 1
                    atomIndex += 1
                if resIndex == len(residues) - 1:
                    # The TER record consumes a serial number of its own;
                    # wrap the numbers exactly like the ATOM records do.
                    print("TER   %5d      %3s %s%4d" % (
                        atomIndex % 100000, resName, chainName,
                        res.resSeq % 10000), file=self._file)
                    atomIndex += 1

        if modelIndex is not None:
            print("ENDMDL", file=self._file)

    def _write_header(self, unitcell_lengths, unitcell_angles, write_metadata=True):
        """Write out the header for a PDB file.

        Nothing is written when both unit cell arguments are None.

        Parameters
        ----------
        unitcell_lengths : {tuple, None}
            The lengths of the three unitcell vectors, ``a``, ``b``, ``c``
        unitcell_angles : {tuple, None}
            The angles between the three unitcell vectors, ``alpha``,
            ``beta``, ``gamma``
        write_metadata : bool
            If True, a REMARK line with the MDTraj version and today's date
            precedes the CRYST1 record.

        Raises
        ------
        ValueError
            If the file is not open for writing, only one of the two unit
            cell arguments is given, or either does not have length 3.
        """
        if self._mode != 'w':
            raise ValueError('file not opened for writing')

        if unitcell_lengths is None and unitcell_angles is None:
            return
        if unitcell_lengths is not None and unitcell_angles is not None:
            if len(unitcell_lengths) != 3:
                raise ValueError('unitcell_lengths must be length 3')
            if len(unitcell_angles) != 3:
                raise ValueError('unitcell_angles must be length 3')
        else:
            raise ValueError('either unitcell_lengths and unitcell_angles '
                             'should both be specified, or neither')

        box = list(unitcell_lengths) + list(unitcell_angles)
        assert len(box) == 6

        if write_metadata:
            print("REMARK   1 CREATED WITH MDTraj %s, %s" % (
                version.version, str(date.today())), file=self._file)
        # Space group (left-justified, 11 columns) and Z value (4 columns)
        # are written through width specifiers to keep the columns exact.
        print("CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f %-11s%4d " % (
            tuple(box) + ("P 1", 1)), file=self._file)

    def _write_footer(self):
        """Write the CONECT records for the last written topology and END.

        CONECT records are emitted for bonds that involve a non-standard
        residue and for CYS SG-SG (disulfide) bonds.
        """
        if self._mode != 'w':
            raise ValueError('file not opened for writing')

        # Identify bonds that should be listed as CONECT records.
        standardResidues = ['ALA', 'ASN', 'CYS', 'GLU', 'HIS', 'LEU', 'MET', 'PRO', 'THR', 'TYR',
                            'ARG', 'ASP', 'GLN', 'GLY', 'ILE', 'LYS', 'PHE', 'SER', 'TRP', 'VAL',
                            'A', 'G', 'C', 'U', 'I', 'DA', 'DG', 'DC', 'DT', 'DI', 'HOH']
        conectBonds = []
        if self._last_topology is not None:
            for atom1, atom2 in self._last_topology.bonds:
                if atom1.residue.name not in standardResidues or atom2.residue.name not in standardResidues:
                    conectBonds.append((atom1, atom2))
                elif atom1.name == 'SG' and atom2.name == 'SG' and atom1.residue.name == 'CYS' and atom2.residue.name == 'CYS':
                    conectBonds.append((atom1, atom2))
        if len(conectBonds) > 0:
            # Work out the index used in the PDB file for each atom.  The
            # counter skips one number at every chain change, mirroring the
            # serial consumed by the TER record that write() emits.
            atomIndex = {}
            nextAtomIndex = 0
            prevChain = None
            for chain in self._last_topology.chains:
                for atom in chain.atoms:
                    if atom.residue.chain != prevChain:
                        nextAtomIndex += 1
                        prevChain = atom.residue.chain
                    atomIndex[atom] = nextAtomIndex
                    nextAtomIndex += 1

            # Record which other atoms each atom is bonded to.
            atomBonds = {}
            for atom1, atom2 in conectBonds:
                index1 = atomIndex[atom1]
                index2 = atomIndex[atom2]
                atomBonds.setdefault(index1, []).append(index2)
                atomBonds.setdefault(index2, []).append(index1)

            # Write the CONECT records, at most four bonded atoms per
            # record.  All four atoms removed from the pending list are
            # written, so no bond is dropped.
            for index1 in sorted(atomBonds):
                bonded = atomBonds[index1]
                while len(bonded) > 4:
                    print("CONECT%5d%5d%5d%5d%5d" % (
                        index1, bonded[0], bonded[1], bonded[2], bonded[3]),
                        file=self._file)
                    del bonded[:4]
                line = "CONECT%5d" % index1
                for index2 in bonded:
                    line = "%s%5d" % (line, index2)
                print(line, file=self._file)
        print("END", file=self._file)
        self._footer_written = True

    @classmethod
    def set_chain_names(cls, values):
        """Set the cycle of chain names used when writing PDB files

        When writing PDB files, PDBTrajectoryFile translates each chain's
        index into a name -- the name is what's written in the file. By
        default, chains are named with the letters A-Z.

        Parameters
        ----------
        values : list
            A list of characters (strings of length 1) that the PDB writer
            will cycle through to choose chain names.

        Raises
        ------
        TypeError
            If any entry is not a string of length one.
        """
        for item in values:
            # Parenthesised on purpose: reject anything that is not
            # (a string AND exactly one character long).
            if not (isinstance(item, six.string_types) and len(item) == 1):
                raise TypeError('Names must be a single character string')
        cls._chain_names = values

    @property
    def positions(self):
        """The cartesian coordinates of all of the atoms in each frame. Available when a file is opened in mode='r'
        """
        return self._positions

    @property
    def topology(self):
        """The topology from this PDB file. Available when a file is opened in mode='r'
        """
        return self._topology

    @property
    def unitcell_lengths(self):
        "The unitcell lengths (3-tuple) in this PDB file. May be None"
        return self._unitcell_lengths

    @property
    def unitcell_angles(self):
        "The unitcell angles (3-tuple) in this PDB file. May be None"
        return self._unitcell_angles

    @property
    def closed(self):
        "Whether the file is closed"
        return not self._open

    def close(self):
        "Close the PDB file"
        # Only write the footer to a file that was actually opened; this
        # method also runs from __del__ after a failed __init__, when
        # self._file is still None.
        if self._mode == 'w' and self._open and not self._footer_written:
            self._write_footer()
        if self._open:
            self._file.close()
        self._open = False

    def _read_models(self):
        """Parse the open file into ``_topology``, ``_positions`` and the unit cell."""
        if self._mode != 'r':
            raise ValueError('file not opened for reading')

        self._topology = Topology()

        pdb = PdbStructure(self._file, load_all_models=True)

        # Maps PDB serial numbers to topology atoms, for CONECT records.
        atomByNumber = {}
        for chain in pdb.iter_chains():
            c = self._topology.add_chain()
            for residue in chain.iter_residues():
                resName = residue.get_name()
                resName = PDBTrajectoryFile._residueNameReplacements.get(
                    resName, resName)
                r = self._topology.add_residue(resName, c, residue.number)
                atomReplacements = PDBTrajectoryFile._atomNameReplacements.get(
                    resName, {})
                for atom in residue.atoms:
                    atomName = atom.get_name()
                    atomName = atomReplacements.get(atomName, atomName)
                    atomName = atomName.strip()
                    element = atom.element
                    if element is None:
                        element = self._guess_element(atomName, residue)

                    newAtom = self._topology.add_atom(
                        atomName, element, r, serial=atom.serial_number)
                    atomByNumber[atom.serial_number] = newAtom

        # load all of the positions (from every model)
        _positions = []
        for model in pdb.iter_models(use_all_models=True):
            coords = []
            for chain in model.iter_chains():
                for residue in chain.iter_residues():
                    for atom in residue.atoms:
                        coords.append(atom.get_position())
            _positions.append(coords)

        if not all(len(f) == len(_positions[0]) for f in _positions):
            raise ValueError('PDB Error: All MODELs must contain the same number of ATOMs')

        # The atom positions read from the PDB file
        self._positions = np.array(_positions)

        self._unitcell_lengths = pdb.get_unit_cell_lengths()
        self._unitcell_angles = pdb.get_unit_cell_angles()
        self._topology.create_standard_bonds()
        self._topology.create_disulfide_bonds(self.positions[0])

        # Add bonds based on CONECT records.
        connectBonds = []
        for connect in pdb.models[0].connects:
            i = connect[0]
            for j in connect[1:]:
                if i in atomByNumber and j in atomByNumber:
                    connectBonds.append((atomByNumber[i], atomByNumber[j]))
        if len(connectBonds) > 0:
            # Only add bonds that don't already exist.
            existingBonds = set(self._topology.bonds)
            for bond in connectBonds:
                if bond not in existingBonds and (bond[1], bond[0]) not in existingBonds:
                    self._topology.add_bond(bond[0], bond[1])
                    existingBonds.add(bond)

    @staticmethod
    def _loadNameReplacementTables():
        """Load the list of atom and residue name replacements."""
        # The tables are class-level; parse the XML only the first time.
        if len(PDBTrajectoryFile._residueNameReplacements) == 0:
            tree = etree.parse(os.path.join(os.path.dirname(__file__), 'data', 'pdbNames.xml'))
            allResidues = {}
            proteinResidues = {}
            nucleicAcidResidues = {}
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                if name == 'All':
                    PDBTrajectoryFile._parseResidueAtoms(residue, allResidues)
                elif name == 'Protein':
                    PDBTrajectoryFile._parseResidueAtoms(residue, proteinResidues)
                elif name == 'Nucleic':
                    PDBTrajectoryFile._parseResidueAtoms(residue, nucleicAcidResidues)
            # Replacements valid for all residues are copied into (and
            # override) the protein and nucleic-acid tables.
            for atom in allResidues:
                proteinResidues[atom] = allResidues[atom]
                nucleicAcidResidues[atom] = allResidues[atom]
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                for key in residue.attrib:
                    if key == 'name' or key.startswith('alt'):
                        PDBTrajectoryFile._residueNameReplacements[residue.attrib[key]] = name
                if 'type' not in residue.attrib:
                    atoms = copy(allResidues)
                elif residue.attrib['type'] == 'Protein':
                    atoms = copy(proteinResidues)
                elif residue.attrib['type'] == 'Nucleic':
                    atoms = copy(nucleicAcidResidues)
                else:
                    atoms = copy(allResidues)
                PDBTrajectoryFile._parseResidueAtoms(residue, atoms)
                PDBTrajectoryFile._atomNameReplacements[name] = atoms

    def _guess_element(self, atom_name, residue):
        "Try to guess the element name"

        upper = atom_name.upper()
        if upper.startswith('CL'):
            element = elem.chlorine
        elif upper.startswith('NA'):
            element = elem.sodium
        elif upper.startswith('MG'):
            element = elem.magnesium
        elif upper.startswith('BE'):
            element = elem.beryllium
        elif upper.startswith('LI'):
            element = elem.lithium
        elif upper.startswith('K'):
            element = elem.potassium
        elif upper.startswith('ZN'):
            element = elem.zinc
        elif len(residue) == 1 and upper.startswith('CA'):
            element = elem.calcium

        # TJL has edited this. There are a few issues here. First,
        # parsing for the element is non-trivial, so I do my best
        # below. Second, there is additional parsing code in
        # pdbstructure.py, and I am unsure why it doesn't get used
        # here...
        elif len(residue) > 1 and upper.startswith('CE'):
            element = elem.carbon  # (probably) not Cerium...
        elif len(residue) > 1 and upper.startswith('CD'):
            element = elem.carbon  # (probably) not Cadmium...
        elif residue.name in ['TRP', 'ARG', 'GLN', 'HIS'] and upper.startswith('NE'):
            element = elem.nitrogen  # (probably) not Neon...
        elif residue.name in ['ASN'] and upper.startswith('ND'):
            element = elem.nitrogen  # (probably) not ND...
        elif residue.name == 'CYS' and upper.startswith('SG'):
            element = elem.sulfur  # (probably) not SG...
        else:
            # Try the first character, then the first two characters with
            # position labels (digits, trailing A/B) stripped off.
            try:
                element = elem.get_by_symbol(atom_name[0])
            except KeyError:
                try:
                    symbol = atom_name[0:2].strip().rstrip("AB0123456789").lstrip("0123456789")
                    element = elem.get_by_symbol(symbol)
                except KeyError:
                    element = None

        return element

    @staticmethod
    def _parseResidueAtoms(residue, map):
        """Add every attribute value of each ``Atom`` child of `residue` to
        `map`, keyed to that atom's ``name`` attribute."""
        for atom in residue.findall('Atom'):
            name = atom.attrib['name']
            for key in atom.attrib:
                map[atom.attrib[key]] = name

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def __len__(self):
        "Number of frames in the file"
        if str(self._mode) != 'r':
            raise NotImplementedError('len() only available in mode="r" currently')
        if not self._open:
            raise ValueError('I/O operation on closed file')
        return len(self._positions)
def load_hoomdxml(filename, top=None):
    """Load a single conformation from an HOOMD-Blue XML file.

    For more information on this file format, see:
    http://codeblue.umich.edu/hoomd-blue/doc/page_xml_file_format.html
    Notably, all node names and attributes are in all lower case.
    HOOMD-Blue does not contain residue and chain information explicitly.
    For this reason, chains will be found by looping over all the bonds and
    finding what is bonded to what.
    Each chain consists of exactly one residue.  Every atom that is not
    part of a bonded chain is placed in a chain and residue of its own.

    Parameters
    ----------
    filename : string
        The path on disk to the XML file
    top : None
        Not used by this function; accepted only so that the signature is
        unchanged for existing callers.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object, with
        corresponding Topology.

    Raises
    ------
    ValueError
        If the file lists a different number of types and positions.

    Notes
    -----
    This function requires the NetworkX python package.
    """
    from mdtraj.core.trajectory import Trajectory
    from mdtraj.core.topology import Topology

    topology = Topology()
    tree = cElementTree.parse(filename)
    config = tree.getroot().find('configuration')
    position = config.find('position')
    bond = config.find('bond')
    atom_type = config.find('type')  # MDTraj calls this "name"

    box = config.find('box')
    # be generous for case of box attributes
    box.attrib = dict((key.lower(), val) for key, val in box.attrib.items())
    lx = float(box.attrib['lx'])
    ly = float(box.attrib['ly'])
    lz = float(box.attrib['lz'])

    # Tilt factors are optional; fall back to an orthorhombic box when they
    # are absent or not parseable.
    try:
        xy = float(box.attrib['xy'])
        xz = float(box.attrib['xz'])
        yz = float(box.attrib['yz'])
    except (KeyError, ValueError):
        xy = 0.0
        xz = 0.0
        yz = 0.0

    unitcell_vectors = np.array([[[lx,  xy*ly, xz*lz],
                                  [0.0, ly,    yz*lz],
                                  [0.0, 0.0,   lz]]])

    # The first line of each text node is skipped (it follows the opening
    # tag), the remaining lines hold one record each.
    positions, types = [], {}
    for pos in position.text.splitlines()[1:]:
        fields = pos.split()
        positions.append(
            (float(fields[0]), float(fields[1]), float(fields[2])))

    for idx, atom_name in enumerate(atom_type.text.splitlines()[1:]):
        types[idx] = str(atom_name.split()[0])
    if len(types) != len(positions):
        raise ValueError('Different number of types and positions in xml file')

    # A file without a <bond> node (or with an empty one) has no bonds.
    bond_text = bond.text if (bond is not None and bond.text) else ''
    # ignore the bond type
    bonds = []
    for b in bond_text.splitlines()[1:]:
        fields = b.split()
        bonds.append((int(fields[1]), int(fields[2])))

    chains = _find_chains(bonds)
    ions = [i for i in range(len(types)) if not _in_chain(chains, i)]

    # add chains, bonds and ions (each chain = 1 residue)
    # NOTE(review): atoms are added chain by chain and then ion by ion,
    # while positions and bond indices follow file order -- this assumes
    # _find_chains yields atoms in file order; TODO confirm.
    for chain in chains:
        t_chain = topology.add_chain()
        t_residue = topology.add_residue('A', t_chain)
        for atom in chain:
            topology.add_atom(types[atom], 'U', t_residue)
    for ion in ions:
        t_chain = topology.add_chain()
        t_residue = topology.add_residue('A', t_chain)
        # Use the ion's own type, not the last atom of the chain loop.
        topology.add_atom(types[ion], 'U', t_residue)
    for atom1, atom2 in bonds:
        topology.add_bond(topology.atom(atom1), topology.atom(atom2))

    traj = Trajectory(xyz=np.array(positions), topology=topology)
    traj.unitcell_vectors = unitcell_vectors

    return traj
def __init__(self, topology, use_chains=None):
    """Build a C-alpha / C-beta coarse-grained topology from an all-atom one.

    Every residue of the selected chains contributes one ``CA`` bead.
    Every residue whose name is not ``'GLY'`` additionally contributes one
    ``CB<code>`` bead bonded to its ``CA``.  Consecutive ``CA`` beads that
    belong to the same new chain are bonded to each other.

    Parameters
    ----------
    topology : mdtraj.Topology
        All-atom reference topology.  A copy is stored in
        ``self._ref_topology``.
    use_chains : container of int, optional
        Positions (in ``topology._chains``) of the chains to map.
        Defaults to every chain.

    Raises
    ------
    IndexError
        If a residue of a selected chain has no atom named ``"CA"``.

    Notes
    -----
    Sets ``self.topology`` (the new topology), ``self._ca_idxs`` (pairs of
    all-atom CA index and new bead index), ``self._sidechain_idxs``,
    ``self._sidechain_mass`` (one entry per residue) and
    ``self._chain_indices`` (source chain position of every new bead).
    """
    if use_chains is None:
        use_chains = range(len(topology._chains))

    self._ref_topology = topology.copy()

    # Both bead kinds are created as carbon atoms.
    carbon = md.core.element.get_by_symbol('C')

    # Build new topology
    newTopology = Topology()
    new_atm_idx = 0
    # Residues are renumbered sequentially from 1 instead of reusing the
    # reference resSeq values.
    res_idx = 1
    prev_ca = None
    ca_idxs = []
    self._sidechain_idxs = []
    self._sidechain_mass = []
    self._chain_indices = []

    for chain_count, chain in enumerate(topology._chains):
        if chain_count not in use_chains:
            continue

        newChain = newTopology.add_chain()
        for residue in chain._residues:
            newResidue = newTopology.add_residue(
                residue.name, newChain, res_idx)

            # map CA
            new_ca = newTopology.add_atom(
                'CA', carbon, newResidue, serial=new_atm_idx)
            self._chain_indices.append(chain_count)

            # Only bond consecutive CA beads that sit in the same chain.
            if (prev_ca is not None and
                    new_ca.residue.chain.index == prev_ca.residue.chain.index):
                newTopology.add_bond(prev_ca, new_ca)
            prev_ca = new_ca

            ca_atoms = [atm.index for atm in residue.atoms
                        if atm.name == "CA"]
            if not ca_atoms:
                # Same exception type the bare list lookup used to raise,
                # but with enough context to locate the offending residue.
                raise IndexError(
                    "residue %s in chain %s has no atom named 'CA' "
                    "(atoms present: %s)" % (
                        residue, chain,
                        ", ".join(str(atm.name) for atm in residue.atoms)))
            ca_idxs.append([ca_atoms[0], new_atm_idx])
            new_atm_idx += 1

            if residue.name == 'GLY':
                # No CB bead for glycine; keep the per-residue lists aligned.
                self._sidechain_idxs.append([])
                self._sidechain_mass.append([])
            else:
                # map CB
                # presumably residue_code maps residue names to a short
                # code used as bead-name suffix -- TODO confirm
                cb_name = "CB%s" % atom_types.residue_code[residue.name]
                new_cb = newTopology.add_atom(
                    cb_name, carbon, newResidue, serial=new_atm_idx)
                self._chain_indices.append(chain_count)
                newTopology.add_bond(new_cb, new_ca)

                # Non-hydrogen side-chain atoms, collected once and used for
                # both the index list and the mass array.
                heavy_sidechain = [
                    atm for atm in residue.atoms
                    if atm.is_sidechain and atm.element.symbol != "H"]
                self._sidechain_idxs.append(
                    [[atm.index for atm in heavy_sidechain], new_atm_idx])
                self._sidechain_mass.append(
                    np.array([atm.element.mass for atm in heavy_sidechain]))
                new_atm_idx += 1

            res_idx += 1

    self._ca_idxs = np.array(ca_idxs)
    self.topology = newTopology
    # Internal invariant: one chain index recorded per bead created.
    assert self.topology.n_atoms == len(self._chain_indices)