def __init__(self, filename, mode='r', force_overwrite=True):
    """Open a NetCDF trajectory file through ``scipy.io.netcdf_file``.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w'}, default='r'
        Open the file for reading ('r') or writing ('w').
    force_overwrite : bool, default=True
        In mode 'w', when False, refuse to open a path that already exists.

    Raises
    ------
    ImportError
        If the installed scipy is older than 0.12.0.
    ValueError
        If `mode` is neither 'r' nor 'w'.
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    """
    # Until the handle has been created the object counts as closed.
    self._closed = True
    self._mode = mode

    scipy_version = import_('scipy.version').short_version
    if StrictVersion(scipy_version) < StrictVersion('0.12.0'):
        raise ImportError('MDTraj NetCDF support requires scipy>=0.12.0. '
                          'You have %s' % scipy_version)
    open_netcdf = import_('scipy.io').netcdf_file

    if mode not in ['r', 'w']:
        raise ValueError("mode must be one of ['r', 'w']")
    refuse_clobber = mode == 'w' and not force_overwrite
    if refuse_clobber and os.path.exists(filename):
        raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format with 64 bit encodings, which
    # scipy.io.netcdf_file selects with "version=2".
    self._handle = open_netcdf(filename, mode=mode, version=2)
    self._closed = False

    # _frame_index is the frame we are currently at in the file.
    # _needs_initialization tells whether the global properties of the
    # file still have to be set, which is required before the first write
    # operation on a new file.
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        self._frame_index = 0
        self._needs_initialization = True
    else:
        raise RuntimeError()
def __init__(self, filename, mode='r', force_overwrite=False):
    """Open a NetCDF trajectory file through ``scipy.io.netcdf_file``.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w'}, default='r'
        Open the file for reading ('r') or writing ('w').
    force_overwrite : bool, default=False
        In mode 'w', when False, refuse to open a path that already exists.

    Raises
    ------
    ImportError
        If the installed scipy is older than 0.12.0.
    ValueError
        If `mode` is neither 'r' nor 'w'.
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    """
    # Until the handle has been created the object counts as closed.
    self._closed = True
    self._mode = mode

    scipy_version = import_('scipy.version').short_version
    if StrictVersion(scipy_version) < StrictVersion('0.12.0'):
        raise ImportError('MDTraj NetCDF support requires scipy>=0.12.0. '
                          'You have %s' % scipy_version)
    open_netcdf = import_('scipy.io').netcdf_file

    if mode not in ('r', 'w'):
        raise ValueError("mode must be one of ['r', 'w']")
    refuse_clobber = mode == 'w' and not force_overwrite
    if refuse_clobber and os.path.exists(filename):
        raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format with 64 bit encodings, which
    # scipy.io.netcdf_file selects with "version=2".
    self._handle = open_netcdf(filename, mode=mode, version=2)
    self._closed = False

    # Only a file opened for writing still needs its header set up.
    if mode == 'r':
        self._needs_initialization = False
    elif mode == 'w':
        self._needs_initialization = True
    else:
        raise RuntimeError()
def __init__(self, filename, mode='r', force_overwrite=True):
    """Open a NetCDF trajectory file through ``scipy.io.netcdf_file``.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w'}, default='r'
        Open the file for reading ('r') or writing ('w').
    force_overwrite : bool, default=True
        In mode 'w', when False, refuse to open a path that already exists.

    Raises
    ------
    ImportError
        If the installed scipy is older than 0.12.0.
    ValueError
        If `mode` is neither 'r' nor 'w'.
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    """
    # Until the handle has been created the object counts as closed.
    self._closed = True
    self._mode = mode

    installed = import_('scipy.version').short_version
    if StrictVersion(installed) < StrictVersion('0.12.0'):
        raise ImportError('MDTraj NetCDF support requires scipy>=0.12.0. '
                          'You have %s' % installed)
    netcdf_file = import_('scipy.io').netcdf_file

    if mode not in ['r', 'w']:
        raise ValueError("mode must be one of ['r', 'w']")
    if mode == 'w' and not force_overwrite:
        if os.path.exists(filename):
            raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format with 64 bit encodings, which
    # scipy.io.netcdf_file selects with "version=2".
    self._handle = netcdf_file(filename, mode=mode, version=2)
    self._closed = False

    # _frame_index is the frame we are currently at in the file.
    # _needs_initialization tells whether the global properties of the
    # file still have to be set, which is required before the first write
    # operation on a new file.
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        self._frame_index = 0
        self._needs_initialization = True
    else:
        raise RuntimeError()
def get_dihedral_connectivity(ibonds):
    """Given the bonds, get the indices of the atoms defining all the dihedral
    angles

    Parameters
    ----------
    ibonds : np.ndarray, shape=[n_bonds, 2], dtype=int
        n_bonds x 2 array of indices, where each row is the index of two
        atom who participate in a bond.

    Returns
    -------
    idihedrals : np.ndarray, shape[n_dihedrals, 4], dtype=int
        All sets of 4 atoms A,B,C,D such that A is bonded to B, B is bonded
        to C, and C is bonded to D. When no dihedral exists an empty
        (0, 4) integer array is returned.
    """
    nx = import_('networkx')
    graph = nx.from_edgelist(ibonds)
    idihedrals = []

    # TODO: CHECK FOR DIHEDRAL ANGLES THAT ARE 180 and recover
    # conf : msmbuilder.Trajectory
    #    An msmbuilder trajectory, only the first frame will be used. This
    #    is used purely to make the check for angle(ABC) != 180.

    # Iterate over the atoms that are actually present in the bond graph
    # instead of range(n_atoms): the latter silently assumes the atom
    # indices are exactly 0..n_atoms-1 and fails for any other labelling.
    # Sorting keeps the historical ordering for contiguous indices.
    for a in sorted(graph.nodes()):
        for b in graph.neighbors(a):
            for c in graph.neighbors(b):
                if c in (a, b):
                    continue
                for d in graph.neighbors(c):
                    if d not in (a, b, c):
                        idihedrals.append((a, b, c, d))

    if not idihedrals:
        # np.array([]) would have shape (0,) and float dtype.
        return np.empty((0, 4), dtype=int)
    return np.array(idihedrals)
def get_angle_connectivity(ibonds):
    """Given the bonds, get the indices of the atoms defining all the bond
    angles

    Parameters
    ----------
    ibonds : np.ndarray, shape=[n_bonds, 2], dtype=int
        n_bonds x 2 array of indices, where each row is the index of two
        atom who participate in a bond.

    Returns
    -------
    iangles : np.ndarray, shape[n_angles, 3], dtype=int
        n_angles x 3 array of indices, where each row is the index of three
        atoms m,n,o such that n is bonded to both m and o. When no angle
        exists an empty (0, 3) integer array is returned.
    """
    nx = import_('networkx')
    graph = nx.from_edgelist(ibonds)

    # Iterate over the atoms that are actually present in the bond graph
    # instead of range(n_atoms): the latter silently assumes the atom
    # indices are exactly 0..n_atoms-1 and fails for any other labelling.
    # Sorting keeps the historical ordering for contiguous indices.
    iangles = []
    for i in sorted(graph.nodes()):
        # every pair (m, n) of neighbors of i forms the bond angle m-i-n
        for m, n in combinations(graph.neighbors(i), 2):
            iangles.append((m, i, n))

    if not iangles:
        # np.array([]) would have shape (0,) and float dtype.
        return np.empty((0, 3), dtype=int)
    return np.array(iangles)
def entry_point():
    """Register every msmbuilder script as a subcommand and dispatch to it.

    Each name in ``scripts.__all__`` is imported as
    ``msmbuilder.scripts.<name>`` and added to the module-level ``parser``
    as a subparser whose help text is the first sentence of the script's
    own parser description. Only the first command-line argument is parsed
    here (to select the subcommand); ``sys.argv`` is then shifted by one and
    the selected script's ``entry_point()`` is called. With no arguments,
    ``-h`` is assumed.
    """
    subparsers = parser.add_subparsers(dest="subparser_name")

    argv = sys.argv[:]
    if len(argv) == 1:
        argv.append('-h')

    for scriptname in scripts.__all__:
        # get the name and first sentence of the description from each of
        # the msmbuilder commands
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            script = import_('msmbuilder.scripts.%s' % scriptname)
            scriptparser = getattr(script, 'parser', None)

        # Some scripts expose a wrapper object holding the real parser in
        # its `.parser` attribute; only a missing attribute should trigger
        # that fallback, not arbitrary errors (or KeyboardInterrupt).
        try:
            description = scriptparser.description
        except AttributeError:
            description = scriptparser.parser.description

        # http://stackoverflow.com/a/17124446/1079728
        # Take everything up to the first '.', ':' or ';' that is followed
        # by whitespace, then collapse internal runs of whitespace.
        leading = re.split(r'(?<=[.:;])\s', description)[0]
        first_sentence = ' '.join(leading.split())
        subparsers.add_parser(scriptname, help=first_sentence)

    args = parser.parse_args(argv[1:2])
    sys.argv = argv[1:]
    getattr(scripts, args.subparser_name).entry_point()
def _find_chains(bond_list):
    """Given a set of bonds, find unique molecules, with the assumption that
    there are no bonds between separate chains (i.e., only INTRAmolecular
    bonds), which also implies that each atom can be in exactly one chain.

    Parameters
    ----------
    bond_list : list of (int, int)
        The list of bonds

    Returns
    -------
    chains : list of list of int
        List of atoms in each chain; the atoms of every chain are sorted.

    Notes
    -----
    This function requires the NetworkX python package.
    """
    nx = import_('networkx')
    bond_list = np.asarray(bond_list)

    # One node per atom that appears in any bond, one edge per bond; each
    # connected component of that graph is one chain.
    molecules = nx.Graph()
    molecules.add_nodes_from(set(bond_list.flatten()))
    molecules.add_edges_from(bond_list)
    return [sorted(component)
            for component in nx.connected_components(molecules)]
def _init_from_handle(self, handle):
    """Initialize this object from an existing file handle.

    Parameters
    ----------
    handle : object
        A handle exposing ``isopen``, ``mode`` and (for append mode)
        ``root.coordinates``. Presumably a PyTables file object, since
        the ``tables`` module is imported here -- confirm against callers.

    Raises
    ------
    ValueError
        If the handle's mode is not one of 'r', 'w', 'a'.
    """
    self._handle = handle
    self._open = handle.isopen != 0

    # the mode in which the file was opened
    mode = handle.mode
    self.mode = mode
    if mode not in ['r', 'w', 'a']:
        raise ValueError("mode must be one of ['r', 'w', 'a']")

    self.tables = import_('tables')

    # _frame_index: what frame are we currently reading or writing at?
    # _needs_initialization: do we need to write the header information?
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        self._frame_index = 0
        self._needs_initialization = True
    elif mode == 'a':
        try:
            n_existing = len(self._handle.root.coordinates)
        except self.tables.NoSuchNodeError:
            # No coordinates node yet: treat the file as brand new.
            self._frame_index = 0
            self._needs_initialization = True
        else:
            self._frame_index = n_existing
            self._needs_initialization = False
def from_openmm(cls, value):
    """Create a mdtraj topology from an OpenMM topology

    Parameters
    ----------
    value : simtk.openmm.app.Topology
        An OpenMM topology that you wish to convert to a
        mdtraj topology.

    Returns
    -------
    topology : cls
        A new instance holding the same chains, residues, atoms and bonds.

    Raises
    ------
    TypeError
        If `value` is not a ``simtk.openmm.app.Topology``.
    """
    app = import_('simtk.openmm.app')
    if not isinstance(value, app.Topology):
        raise TypeError('value must be an OpenMM Topology. '
                        'You supplied a %s' % type(value))

    topology = cls()
    # OpenMM atom -> newly created atom, needed to translate the bonds.
    converted = {}

    for mm_chain in value.chains():
        new_chain = topology.add_chain()
        for mm_residue in mm_chain.residues():
            new_residue = topology.add_residue(
                mm_residue.name, new_chain, mm_residue.segment_id)
            for mm_atom in mm_residue.atoms():
                # Atoms without an element are mapped to the virtual element.
                element = (elem.virtual if mm_atom.element is None
                           else elem.get_by_symbol(mm_atom.element.symbol))
                converted[mm_atom] = topology.add_atom(
                    mm_atom.name, element, new_residue)

    for first, second in value.bonds():
        topology.add_bond(converted[first], converted[second])

    return topology
def to_dataframe(self):
    """Convert this topology into a pandas dataframe

    Returns
    -------
    atoms : pandas.DataFrame
        The atoms in the topology, represented as a data frame with one
        row per atom.
    bonds : np.ndarray
        The bonds in this topology, represented as an n_bonds x 2 array
        of the indices of the atoms involved in each bond.
    """
    pd = import_('pandas')

    column_names = ["serial", "name", "element", "resSeq", "resName",
                    "chainID", "segmentID"]
    rows = []
    for atom in self.atoms:
        rows.append((atom.serial, atom.name, atom.element.symbol,
                     atom.residue.resSeq, atom.residue.name,
                     atom.residue.chain.index, atom.segment_id))
    atoms = pd.DataFrame(rows, columns=column_names)

    bond_pairs = [(first.index, second.index)
                  for (first, second) in self.bonds]
    return atoms, np.array(bond_pairs)
def from_openmm(cls, value):
    """Create a mdtraj topology from an OpenMM topology

    Parameters
    ----------
    value : simtk.openmm.app.Topology
        An OpenMM topology that you wish to convert to a
        mdtraj topology.

    Returns
    -------
    topology : cls
        A new instance holding the same chains, residues, atoms and bonds.

    Raises
    ------
    TypeError
        If `value` is not a ``simtk.openmm.app.Topology``.
    """
    app = import_('simtk.openmm.app')
    if not isinstance(value, app.Topology):
        raise TypeError('value must be an OpenMM Topology. '
                        'You supplied a %s' % type(value))

    topology = cls()
    # OpenMM atom -> newly created atom, needed to translate the bonds.
    converted = {}

    for mm_chain in value.chains():
        new_chain = topology.add_chain()
        for mm_residue in mm_chain.residues():
            new_residue = topology.add_residue(mm_residue.name, new_chain)
            for mm_atom in mm_residue.atoms():
                # Atoms without an element are mapped to the virtual element.
                element = (elem.virtual if mm_atom.element is None
                           else elem.get_by_symbol(mm_atom.element.symbol))
                converted[mm_atom] = topology.add_atom(
                    mm_atom.name, element, new_residue)

    for first, second in value.bonds():
        topology.add_bond(converted[first], converted[second])

    return topology
def to_openmm(self):
    """Convert this topology into OpenMM topology

    Returns
    -------
    topology : simtk.openmm.app.Topology
        This topology, as an OpenMM topology
    """
    app = import_('simtk.openmm.app')

    topology = app.Topology()
    # atom of this topology -> corresponding OpenMM atom, used for the bonds
    converted = {}

    for chain in self.chains:
        mm_chain = topology.addChain()
        for residue in chain.residues:
            mm_residue = topology.addResidue(residue.name, mm_chain)
            for atom in residue.atoms:
                mm_element = app.Element.getBySymbol(atom.element.symbol)
                converted[atom] = topology.addAtom(
                    atom.name, mm_element, mm_residue)

    for first, second in self.bonds:
        topology.addBond(converted[first], converted[second])

    return topology
def __init__(self, filename, mode='r', force_overwrite=True):
    """Open an LH5 trajectory file.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w'}, default='r'
        Open the file for reading ('r') or writing ('w').
    force_overwrite : bool, default=True
        In mode 'w', when False, refuse to open a path that already exists.

    Raises
    ------
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    ValueError
        If `mode` is neither 'r' nor 'w'.
    """
    self._open = False
    self.filename = filename
    self.mode = mode

    if mode == 'w' and not force_overwrite:
        if os.path.exists(filename):
            raise IOError('"%s" already exists' % filename)

    self.tables = import_('tables')

    # _frame_index: what frame are we currently reading or writing at?
    # _needs_initialization: do we need to write the header information?
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        print("Warning: The LH5 trajectory format is deprecated.",
              file=sys.stderr)
        self._frame_index = 0
        self._needs_initialization = True
        if not filename.endswith('.lh5'):
            warnings.warn('The .lh5 extension is recommended.')
    else:
        raise ValueError("mode must be one of ['r', 'w']")

    # Compression style of legacy MSMBuilder2 lh5 trajectory format
    lh5_filters = self.tables.Filters(
        complib='blosc', shuffle=True, complevel=1)
    self._handle = self._open_file(
        filename, mode=mode, filters=lh5_filters)
    self._open = True
def chemical_shifts_ppm(trj):
    """Predict chemical shifts of a trajectory using ppm.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.

    Returns
    -------
    results : pandas.DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the PPM executable cannot be found on PATH.
    IOError
        If the PPM command exits with a non-zero status.

    Notes
    -----
    You must have ppm available on your path; see
    (http://spin.ccic.ohio-state.edu/index.php/download/index).

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Li, DW, and Bruschweiler, R. "PPM: a side-chain and backbone
       chemical shift predictor for the assessment of protein
       conformational ensembles." J Biomol NMR. 2012 Nov;54(3):257-65.
    """
    pd = import_('pandas')
    binary = find_executable(PPM)

    first_resSeq = trj.top.residue(0).resSeq

    if binary is None:
        raise OSError(
            ('External command not found. Looked for %s in PATH. '
             '`chemical_shifts_ppm` requires the external program PPM, '
             'available at '
             'http://spin.ccic.ohio-state.edu/index.php/download/index')
            % ', '.join(PPM))

    with enter_temp_directory():
        trj.save("./trj.pdb")
        cmd = "%s -pdb trj.pdb -mode detail" % binary

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "PPM installation or your input trajectory." % cmd)

        # The output is whitespace-delimited with no header row. It is
        # parsed exactly once; columns 0 and 4 are dropped and columns
        # 1, 2, 3 are named below.
        d = pd.read_csv("./bb_details.dat", sep=r"\s+",
                        header=None).drop([0, 4], axis=1)

    d = d.rename(columns={1: "resSeq", 2: "resName", 3: "name"})
    d["resSeq"] += first_resSeq - 1  # Fix bug in PPM that reindexes to 1
    d = d.drop("resName", axis=1)
    d = d.set_index(["resSeq", "name"])
    d.columns = np.arange(trj.n_frames)
    d.columns.name = "frame"

    return d
def _str_to_unit(unit_string):
    """eval() based transformer that extracts a simtk.unit object
    from a string description.

    Parameters
    ----------
    unit_string : str
        string description of a unit. this may contain expressions with
        multiplication, division, powers, etc.

    Returns
    -------
    output : object
        Whatever the transformed expression evaluates to; per the examples
        below this is a ``simtk.unit`` unit object.

    Examples
    --------
    >>> type(_str_to_unit('nanometers**2/meters*gigajoules'))
    <class 'simtk.unit.unit.Unit'>
    >>> str(_str_to_unit('nanometers**2/meters*gigajoules'))
    'nanometer**2*gigajoule/meter'
    """
    # NOTE: `units` looks unused but must stay a local with exactly this
    # name: the argument-less eval() below evaluates in this function's
    # namespace, and the rewritten expression presumably refers to it.
    units = import_('simtk.unit')
    # parse the string with the ast, and then run out unit context
    # visitor on it, which will basically change bare names like
    # "nanometers" into "unit.nanometers" and simultaneously check that
    # there's no nefarious stuff in the expression.

    node = _unit_context.visit(ast.parse(unit_string, mode='eval'))
    fixed_node = ast.fix_missing_locations(node)
    # NOTE(review): eval() on caller-provided text; safety rests entirely
    # on the checks performed by the `_unit_context` visitor above.
    output = eval(compile(fixed_node, '<string>', mode='eval'))

    return output
def _find_chains(bond_list):
    """Given a set of bonds, find unique molecules, with the assumption that
    there are no bonds between separate chains (i.e., only INTRAmolecular
    bonds), which also implies that each atom can be in exactly one chain.

    Parameters
    ----------
    bond_list : list of (int, int)
        The list of bonds

    Returns
    -------
    chains : list of list of int
        List of atoms in each chain; the atoms of every chain are sorted.

    Notes
    -----
    This function requires the NetworkX python package.
    """
    nx = import_('networkx')
    bond_list = np.asarray(bond_list)

    # One node per atom that appears in any bond, one edge per bond; each
    # connected component of that graph is one chain.
    molecules = nx.Graph()
    molecules.add_nodes_from(set(bond_list.flatten()))
    molecules.add_edges_from(bond_list)

    # connected_components may yield sets; convert each one to a sorted
    # list so the result matches the documented "list of list of int" and
    # has a deterministic atom order within each chain.
    return [sorted(component)
            for component in nx.connected_components(molecules)]
def in_units_of(quantity, units_out, units_in=None):
    """Convert a quantity between unit systems

    Parameters
    ----------
    quantity : number, np.ndarray, or simtk.unit.Quantity
        quantity can either be a unitted quantity -- i.e. instance of
        simtk.unit.Quantity, or just a bare number or numpy array
    units_out : str
        A string description of the units you want out. This should look
        like "nanometers/picosecond" or "nanometers**3" or whatever
    units_in : str
        If you supply a quantity that's not a simtk.unit.Quantity, you should
        tell me what units it is in. If you don't, i'm just going to echo you
        back your quantity without doing any unit checking.

    Returns
    -------
    converted : number, np.ndarray, or None
        The value of `quantity` expressed in `units_out`; `quantity` itself
        when it is None, or when it is unitless and `units_in` is None.

    Examples
    --------
    >>> in_units_of(1*units.meter**2/units.second, 'nanometers**2/picosecond')  # doctest: +SKIP
    1000000.0
    """
    units = import_('simtk.unit')

    if quantity is None:
        return quantity

    if not isinstance(quantity, units.Quantity):
        if units_in is None:
            # Nothing is known about the input units: echo it back.
            return quantity
        # Attach the declared input units, then convert like any Quantity.
        quantity = units.Quantity(quantity, _str_to_unit(units_in))

    return quantity.value_in_unit(_str_to_unit(units_out))
def __init__(self, filename, mode='r', force_overwrite=True):
    """Open an LH5 trajectory file.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w'}, default='r'
        Open the file for reading ('r') or writing ('w').
    force_overwrite : bool, default=True
        In mode 'w', when False, refuse to open a path that already exists.

    Raises
    ------
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    ValueError
        If `mode` is neither 'r' nor 'w'.
    """
    self._open = False
    self.filename = filename
    self.mode = mode

    would_clobber = mode == 'w' and not force_overwrite
    if would_clobber and os.path.exists(filename):
        raise IOError('"%s" already exists' % filename)

    self.tables = import_('tables')

    # _frame_index: what frame are we currently reading or writing at?
    # _needs_initialization: do we need to write the header information?
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        print("Warning: The LH5 trajectory format is deprecated.",
              file=sys.stderr)
        self._frame_index = 0
        self._needs_initialization = True
        if not filename.endswith('.lh5'):
            warnings.warn('The .lh5 extension is recommended.')
    else:
        raise ValueError("mode must be one of ['r', 'w']")

    # Compression style of legacy MSMBuilder2 lh5 trajectory format
    blosc_filters = self.tables.Filters(complib='blosc', shuffle=True,
                                        complevel=1)
    self._handle = self._open_file(filename, mode=mode,
                                   filters=blosc_filters)
    self._open = True
def to_dataframe(self):
    """Convert this topology into a pandas dataframe

    Returns
    -------
    atoms : pandas.DataFrame
        The atoms in the topology, represented as a data frame with one
        row per atom. Atoms whose element is None get an empty string in
        the "element" column.
    bonds : np.ndarray
        The bonds in this topology, represented as an n_bonds x 2 array
        of the indices of the atoms involved in each bond.
    """
    pd = import_('pandas')

    column_names = ["serial", "name", "element", "resSeq", "resName",
                    "chainID"]
    rows = [
        (atom.serial,
         atom.name,
         "" if atom.element is None else atom.element.symbol,
         atom.residue.resSeq,
         atom.residue.name,
         atom.residue.chain.index)
        for atom in self.atoms
    ]
    atoms = pd.DataFrame(rows, columns=column_names)

    bond_pairs = [(first.index, second.index)
                  for (first, second) in self.bonds]
    return atoms, np.array(bond_pairs)
def entry_point():
    """Register every msmbuilder script as a subcommand and dispatch to it.

    Each name in ``scripts.__all__`` is imported as
    ``msmbuilder.scripts.<name>`` and added to the module-level ``parser``
    as a subparser whose help text is the first sentence of the script's
    own parser description. Only the first command-line argument is parsed
    here (to select the subcommand); ``sys.argv`` is then shifted by one and
    the selected script's ``entry_point()`` is called. With no arguments,
    ``-h`` is assumed.
    """
    subparsers = parser.add_subparsers(dest="subparser_name")

    argv = sys.argv[:]
    if len(argv) == 1:
        argv.append('-h')

    for scriptname in scripts.__all__:
        # get the name and first sentence of the description from each of
        # the msmbuilder commands
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            script = import_('msmbuilder.scripts.%s' % scriptname)
            scriptparser = getattr(script, 'parser', None)

        # Some scripts expose a wrapper object holding the real parser in
        # its `.parser` attribute; only a missing attribute should trigger
        # that fallback, not arbitrary errors (or KeyboardInterrupt).
        try:
            description = scriptparser.description
        except AttributeError:
            description = scriptparser.parser.description

        # http://stackoverflow.com/a/17124446/1079728
        # Take everything up to the first '.', ':' or ';' that is followed
        # by whitespace, then collapse internal runs of whitespace.
        leading = re.split(r'(?<=[.:;])\s', description)[0]
        first_sentence = ' '.join(leading.split())
        subparsers.add_parser(scriptname, help=first_sentence)

    args = parser.parse_args(argv[1:2])
    sys.argv = argv[1:]
    getattr(scripts, args.subparser_name).entry_point()
def _str_to_unit(unit_string, simtk=False): """eval() based transformer that extracts a simtk.unit object from a string description. Parameters ---------- unit_string : str string description of a unit. this may contain expressions with multiplication, division, powers, etc. Examples -------- >>> type(_str_to_unit('nanometers**2/meters*gigajoules')) <class 'simtk.unit.unit.Unit'> >>> str(_str_to_unit('nanometers**2/meters*gigajoules')) 'nanometer**2*gigajoule/meter' """ # parse the string with the ast, and then run out unit context # visitor on it, which will basically change bare names like # "nanometers" into "unit.nanometers" and simulataniously check that # there's no nefarious stuff in the expression. assert isinstance(unit_string, six.string_types) unit_definitions = UNIT_DEFINITIONS if simtk: unit_definitions = import_('simtk.unit').unit_definitions parsed = ast.parse(unit_string, mode='eval') node = _unit_context.visit(parsed) fixed_node = ast.fix_missing_locations(node) output = eval(compile(fixed_node, '<string>', mode='eval'), {}, locals()) return output
def to_openmm(self):
    """Convert this topology into OpenMM topology

    Returns
    -------
    topology : simtk.openmm.app.Topology
        This topology, as an OpenMM topology
    """
    app = import_('simtk.openmm.app')

    mm_topology = app.Topology()
    # atom of this topology -> corresponding OpenMM atom, used for the bonds
    mm_atoms = {}

    for chain in self.chains:
        mm_chain = mm_topology.addChain()
        for residue in chain.residues:
            mm_residue = mm_topology.addResidue(residue.name, mm_chain)
            for atom in residue.atoms:
                mm_element = app.Element.getBySymbol(atom.element.symbol)
                mm_atoms[atom] = mm_topology.addAtom(
                    atom.name, mm_element, mm_residue)

    for first, second in self.bonds:
        mm_topology.addBond(mm_atoms[first], mm_atoms[second])

    return mm_topology
def to_openmm(self, traj=None):
    """Convert this topology into OpenMM topology

    Parameters
    ----------
    traj : MDTraj.Trajectory, optional, default=None
        If specified, use the first frame from this trajectory to
        set the unitcell information in the openmm topology.

    Returns
    -------
    topology : simtk.openmm.app.Topology
        This topology, as an OpenMM topology

    Raises
    ------
    ValueError
        If `traj` is given and the unitcell angles of its first frame
        deviate from 90.0.
    """
    app = import_('simtk.openmm.app')
    mm = import_('simtk.openmm')
    u = import_('simtk.unit')

    topology = app.Topology()
    # atom of this topology -> corresponding OpenMM atom, used for the bonds
    converted = {}

    for chain in self.chains:
        mm_chain = topology.addChain()
        for residue in chain.residues:
            mm_residue = topology.addResidue(residue.name, mm_chain)
            for atom in residue.atoms:
                # Virtual sites carry no element on the OpenMM side.
                mm_element = (
                    None if atom.element is elem.virtual
                    else app.Element.getBySymbol(atom.element.symbol))
                converted[atom] = topology.addAtom(
                    atom.name, mm_element, mm_residue)

    for first, second in self.bonds:
        topology.addBond(converted[first], converted[second])

    if traj is None:
        return topology

    # Only the first frame is consulted for the periodic box.
    angles = traj.unitcell_angles[0]
    if np.linalg.norm(angles - 90.0) > 1E-4:
        raise ValueError("Unitcell angles must be 90.0 to use "
                         "in OpenMM topology.")
    box_vectors = mm.Vec3(*traj.unitcell_lengths[0]) * u.nanometer
    topology.setUnitCellDimensions(box_vectors)

    return topology
def to_openmm(self, traj=None):
    """Convert this topology into OpenMM topology

    Parameters
    ----------
    traj : MDTraj.Trajectory, optional, default=None
        If specified, use the first frame from this trajectory to
        set the unitcell information in the openmm topology.

    Returns
    -------
    topology : simtk.openmm.app.Topology
        This topology, as an OpenMM topology

    Raises
    ------
    ValueError
        If `traj` is given and the unitcell angles of its first frame
        deviate from 90.0.
    """
    app = import_('simtk.openmm.app')
    mm = import_('simtk.openmm')
    u = import_('simtk.unit')

    mm_topology = app.Topology()
    # atom of this topology -> corresponding OpenMM atom, used for the bonds
    mm_atoms = {}

    for chain in self.chains:
        mm_chain = mm_topology.addChain()
        for residue in chain.residues:
            mm_residue = mm_topology.addResidue(residue.name, mm_chain)
            for atom in residue.atoms:
                # Virtual sites carry no element on the OpenMM side.
                if atom.element is elem.virtual:
                    mm_element = None
                else:
                    mm_element = app.Element.getBySymbol(
                        atom.element.symbol)
                mm_atoms[atom] = mm_topology.addAtom(
                    atom.name, mm_element, mm_residue)

    for first, second in self.bonds:
        mm_topology.addBond(mm_atoms[first], mm_atoms[second])

    if traj is not None:
        # Only the first frame is consulted for the periodic box.
        first_frame_angles = traj.unitcell_angles[0]
        deviation = np.linalg.norm(first_frame_angles - 90.0)
        if deviation > 1E-4:
            raise ValueError("Unitcell angles must be 90.0 to use "
                             "in OpenMM topology.")
        box_vectors = mm.Vec3(*traj.unitcell_lengths[0]) * u.nanometer
        mm_topology.setUnitCellDimensions(box_vectors)

    return mm_topology
def chemical_shifts_shiftx2(trj, pH=5.0, temperature=298.00):
    """Predict chemical shifts of a trajectory using ShiftX2.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.
    pH : float, optional, default=5.0
        pH value which gets passed to the ShiftX2 predictor.
    temperature : float, optional, default=298.00
        Temperature which gets passed to the ShiftX2 predictor.

    Returns
    -------
    results : pandas DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the ShiftX2 executable cannot be found on PATH.
    IOError
        If the ShiftX2 command exits with a non-zero status.

    Notes
    -----
    You must have ShiftX2 available on your path; see (http://www.shiftx2.ca/).

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Beomsoo Han, Yifeng Liu, Simon Ginzinger, and David Wishart.
       "SHIFTX2: significantly improved protein chemical shift
       prediction." J. Biomol. NMR, 50, 1 43-57 (2011)
    """
    pd = import_('pandas')
    binary = find_executable(SHIFTX2)
    if binary is None:
        raise OSError(
            ('External command not found. Looked for %s in PATH. '
             '`chemical_shifts_shiftx2` requires the external program '
             'SHIFTX2, available at http://www.shiftx2.ca/')
            % ', '.join(SHIFTX2))

    results = []
    with enter_temp_directory():
        # One PDB file per frame; ShiftX2 then processes them as a batch.
        for i in range(trj.n_frames):
            trj[i].save("./trj%d.pdb" % i)
        cmd = "%s -b 'trj*.pdb' -p %.1f -t %.2f" % (binary, pH, temperature)

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "ShiftX2 installation or your input trajectory." % cmd)

        for i in range(trj.n_frames):
            d = pd.read_csv("./trj%d.pdb.cs" % i)
            d.rename(columns={"NUM": "resSeq", "RES": "resName",
                              "ATOMNAME": "name"}, inplace=True)
            d["frame"] = i
            results.append(d)

    results = pd.concat(results)
    # `index=` / `columns=` are the current pivot_table keywords; the
    # former `rows=` / `cols=` spellings are rejected by modern pandas.
    results = results.pivot_table(index=["resSeq", "name"], columns="frame",
                                  values="SHIFT")
    return results
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame or None
        DataFrame containing bond information, or None when the bond
        section holds no bond lines.

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology. This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_('pandas')
    with open(filename) as f:
        # Group consecutive lines by the section they belong to; the first
        # line of every group is skipped below ([1:]).
        data = dict((key, list(grp)) for key, grp in
                    itertools.groupby(f, _parse_mol2_sections))

    # Mol2 can have "status bits" at the end of the bond lines. We don't care
    # about these, but they interfere with using pd_read_table because it
    # looks like one line has too many columns. So we just regex out the
    # offending text.
    status_bit_regex = r"BACKBONE|DICT|INTERRES|\|"
    data["@<TRIPOS>BOND\n"] = [re.sub(status_bit_regex, "", s)
                               for s in data["@<TRIPOS>BOND\n"]]

    # The column separator must be r"\s+": the former "\s*" also matches
    # the empty string, and on Python >= 3.7 re.split() then splits between
    # every single character instead of on runs of whitespace.
    if len(data["@<TRIPOS>BOND\n"]) > 1:
        csv = StringIO()
        csv.writelines(data["@<TRIPOS>BOND\n"][1:])
        csv.seek(0)
        bonds_frame = pd.read_table(
            csv, names=["bond_id", "id0", "id1", "bond_type"],
            index_col=0, header=None, sep=r"\s+", engine='python')
    else:
        bonds_frame = None

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(csv, sep=r"\s+", engine='python', header=None)

    # Atom lines may carry fewer trailing columns; name only those present.
    ncols = atoms_frame.shape[1]
    names = ["serial", "name", "x", "y", "z", "atype", "code", "resName",
             "charge", "status"]
    atoms_frame.columns = names[:ncols]

    return atoms_frame, bonds_frame
def __init__(self, filename, mode='r', force_overwrite=True,
             compression='zlib', root_uep='/'):
    """Open an HDF5 trajectory file.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w', 'a'}, default='r'
        Open the file for reading, writing or appending.
    force_overwrite : bool, default=True
        In mode 'w', when False, refuse to open a path that already exists.
    compression : {'zlib', None}, default='zlib'
        Compression filter applied when opening the file.
    root_uep : str, default='/'
        Passed through to the underlying open call as ``root_uep``.

    Raises
    ------
    ValueError
        If `mode` or `compression` is not one of the accepted values, or
        if opening fails with ``HDF5ExtError`` (reported as the group
        `root_uep` not being found).
    IOError
        If `mode` is 'w', `force_overwrite` is False and `filename` exists.
    """
    self._open = False  # is the file handle currently open?
    self.mode = mode  # the mode in which the file was opened?

    if mode not in ['r', 'w', 'a']:
        raise ValueError("mode must be one of ['r', 'w', 'a']")

    if mode == 'w' and not force_overwrite:
        if os.path.exists(filename):
            raise IOError('"%s" already exists' % filename)

    self.tables = import_('tables')

    if compression == 'zlib':
        filters = self.tables.Filters(complib='zlib', shuffle=True,
                                      complevel=1)
    elif compression is None:
        filters = None
    else:
        raise ValueError('compression must be either "zlib" or None')

    try:
        self._handle = self._open_file(filename, mode=mode,
                                       filters=filters, root_uep=root_uep)
    except self.tables.exceptions.HDF5ExtError:
        # Cannot find group
        raise ValueError(
            'The group {root_uep} was not found in {filename}. '
            'Create the group first.'.format(root_uep=root_uep,
                                             filename=filename))
    self._open = True

    # _frame_index: what frame are we currently reading or writing at?
    # _needs_initialization: do we need to write the header information?
    if mode == 'r':
        self._frame_index = 0
        self._needs_initialization = False
    elif mode == 'w':
        self._frame_index = 0
        self._needs_initialization = True
        if not filename.endswith('.h5'):
            warnings.warn('The .h5 extension is recommended.')
    elif mode == 'a':
        try:
            n_existing = len(self._handle.root.coordinates)
        except self.tables.NoSuchNodeError:
            # No coordinates node yet: treat the file as brand new.
            self._frame_index = 0
            self._needs_initialization = True
        else:
            self._frame_index = n_existing
            self._needs_initialization = False
def chemical_shifts_ppm(trj):
    """Predict chemical shifts of a trajectory using ppm.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.

    Returns
    -------
    results : pandas.DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the PPM executable cannot be found on PATH.
    IOError
        If the PPM command exits with a non-zero status.

    Notes
    -----
    You must have ppm available on your path; see
    (http://spin.ccic.ohio-state.edu/index.php/download/index).

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Li, DW, and Bruschweiler, R. "PPM: a side-chain and backbone
       chemical shift predictor for the assessment of protein
       conformational ensembles." J Biomol NMR. 2012 Nov;54(3):257-65.
    """
    pd = import_('pandas')
    binary = find_executable(PPM)

    first_resSeq = trj.top.residue(0).resSeq

    if binary is None:
        raise OSError(
            ('External command not found. Looked for %s in PATH. '
             '`chemical_shifts_ppm` requires the external program PPM, '
             'available at '
             'http://spin.ccic.ohio-state.edu/index.php/download/index')
            % ', '.join(PPM))

    with enter_temp_directory():
        trj.save("./trj.pdb")
        cmd = "%s -pdb trj.pdb -mode detail" % binary

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "PPM installation or your input trajectory." % cmd)

        # The separator must be r"\s+": the former "\s*" also matches the
        # empty string, and on Python >= 3.7 re.split() then splits between
        # every single character. engine='python' keeps the parser that the
        # old regex separator implicitly selected.
        d = pd.read_table("./bb_details.dat", index_col=False, header=None,
                          sep=r"\s+", engine='python').drop([3], axis=1)

    d = d.rename(columns={0: "resSeq", 1: "resName", 2: "name"})
    d["resSeq"] += first_resSeq - 1  # Fix bug in PPM that reindexes to 1
    d = d.drop("resName", axis=1)
    d = d.set_index(["resSeq", "name"])
    d.columns = np.arange(trj.n_frames)
    d.columns.name = "frame"

    return d
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology. This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_('pandas')
    with open(filename) as f:
        # Group consecutive lines by the section they belong to; the first
        # line of every group is skipped below ([1:]).
        data = dict((key, list(grp)) for key, grp in
                    itertools.groupby(f, _parse_mol2_sections))

    # The column separator must be r"\s+": the former "\s*" also matches
    # the empty string, and on Python >= 3.7 re.split() then splits between
    # every single character instead of on runs of whitespace.
    csv = StringIO()
    csv.writelines(data["@<TRIPOS>BOND\n"][1:])
    csv.seek(0)
    bonds_frame = pd.read_table(
        csv, names=["bond_id", "id0", "id1", "bond_type"],
        index_col=0, header=None, sep=r"\s+", engine='python')

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    atoms_frame = pd.read_csv(
        csv, sep=r"\s+", engine='python', header=None,
        names=["serial", "name", "x", "y", "z", "atype", "code",
               "resName", "charge"])

    return atoms_frame, bonds_frame
def chemical_shifts_shiftx2(trj):
    """Predict chemical shifts of a trajectory using ShiftX2.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.

    Returns
    -------
    results : pandas DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the ShiftX2 executable cannot be found on PATH.
    IOError
        If the ShiftX2 command exits with a non-zero status.

    Notes
    -----
    You must have ShiftX2 available on your path; see (http://www.shiftx2.ca/).

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Beomsoo Han, Yifeng Liu, Simon Ginzinger, and David Wishart.
       "SHIFTX2: significantly improved protein chemical shift
       prediction." J. Biomol. NMR, 50, 1 43-57 (2011)
    """
    pd = import_('pandas')
    binary = find_executable(SHIFTX2)
    if binary is None:
        raise OSError(
            ('External command not found. Looked for %s in PATH. '
             '`chemical_shifts_shiftx2` requires the external program '
             'SHIFTX2, available at http://www.shiftx2.ca/')
            % ', '.join(SHIFTX2))

    results = []
    with enter_temp_directory():
        # One PDB file per frame; ShiftX2 then processes them as a batch.
        for i in range(trj.n_frames):
            trj[i].save("./trj%d.pdb" % i)
        cmd = "%s -b 'trj*.pdb'" % binary

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "ShiftX2 installation or your input trajectory." % cmd)

        for i in range(trj.n_frames):
            d = pd.read_csv("./trj%d.pdb.cs" % i)
            d.rename(columns={"NUM": "resSeq", "RES": "resName",
                              "ATOMNAME": "name"}, inplace=True)
            d["frame"] = i
            results.append(d)

    results = pd.concat(results)
    # `index=` / `columns=` are the current pivot_table keywords; the
    # former `rows=` / `cols=` spellings are rejected by modern pandas.
    results = results.pivot_table(index=["resSeq", "name"], columns="frame",
                                  values="SHIFT")
    return results
def visit_Name(self, node):
    """Rewrite a bare name node into an attribute lookup on ``units``.

    A name such as ``nanometers`` becomes ``units.nanometers`` so that the
    contents of simtk.unit never have to be star-imported into this module.

    Raises
    ------
    ValueError
        If ``node.id`` is neither ``'units'`` nor an attribute of
        ``simtk.unit``.
    """
    units = import_('simtk.unit')

    # Take the opportunity to verify that the name really is something
    # simtk.unit provides before emitting a reference to it.
    recognised = node.id == 'units' or hasattr(units, node.id)
    if not recognised:
        raise ValueError('%s is not a valid unit' % node.id)

    namespace = ast.Name(id='units', ctx=ast.Load())
    return ast.Attribute(value=namespace, attr=node.id, ctx=ast.Load())
def __init__(self, filename, mode='r', force_overwrite=False):
    """Open a NetCDF file through ``scipy.io.netcdf_file``.

    Parameters
    ----------
    filename : str
        Path handed to ``scipy.io.netcdf_file``.
    mode : {'r', 'w'}
        Mode in which the file is opened.
    force_overwrite : bool
        In write mode, an existing ``filename`` is only accepted when this
        is true.

    Raises
    ------
    ImportError
        If the installed scipy is older than 0.12.0.
    ValueError
        If ``mode`` is not 'r' or 'w'.
    IOError
        If writing to an existing file without ``force_overwrite``.
    """
    self._closed = True
    self._mode = mode

    installed = import_('scipy.version').short_version
    if StrictVersion(installed) < StrictVersion('0.12.0'):
        raise ImportError('MDTraj NetCDF support requires scipy>=0.12.0. '
                          'You have %s' % installed)
    open_netcdf = import_('scipy.io').netcdf_file

    if mode not in ('r', 'w'):
        raise ValueError("mode must be one of ['r', 'w']")
    writing = mode == 'w'
    if writing and not force_overwrite and os.path.exists(filename):
        raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format, with 64 bit encodings, which for
    # scipy.io.netcdf_file is "version=2"
    self._handle = open_netcdf(filename, mode=mode, version=2)
    self._closed = False

    # Only 'r' and 'w' can reach this point, so the flag is simply
    # "are we writing?".
    self._needs_initialization = writing
def __init__(self, filename, mode='r', force_overwrite=True):
    """Open a NetCDF file through ``netCDF4.Dataset``.

    Parameters
    ----------
    filename : str
        Path handed to ``netCDF4.Dataset``.
    mode : {'r', 'w', 'a', 'ws', 'as'}
        'r' indicates read, 'w' indicates write and 'a' indicates append.
        'w' and 'a' may be suffixed with 's', which turns off buffering.
    force_overwrite : bool
        In a write mode, an existing ``filename`` is only accepted when
        this is true. The value is also forwarded as ``clobber``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the accepted values.
    IOError
        If writing to an existing file without ``force_overwrite``.
    """
    self._closed = True   # is the file currently closed?
    self._mode = mode     # what mode were we opened in
    netcdf = import_('netCDF4')

    if mode not in ['r', 'w', 'a', 'ws', 'as']:
        raise ValueError(
            ("mode must be one of ['r', 'w', 'a', 'ws', 'as']"
             " 'r' indicates read, 'w' indicates write, and 'a' indicates"
             " append. 'a' and 'w' can be appended with 's', which turns "
             " off buffering"))

    if mode in ['w', 'ws'] and not force_overwrite and os.path.exists(filename):
        # The original message never interpolated the file name, so users
        # saw a literal '"%s" already exists'.
        raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format, with 64 bit encodings
    self._handle = netcdf.Dataset(filename, mode=mode, format='NETCDF3_64BIT',
                                  clobber=force_overwrite)
    self._closed = False

    # self._frame_index is the current frame that we're at in the
    #     file
    # self._needs_initialization indicates whether we need to set the
    #     global properties of the file. This is required before the first
    #     write operation on a new file
    # self._n_atoms is the number of atoms in the file
    if mode in ['a', 'as']:
        self._frame_index = len(self._handle.dimensions['frame'])
        self._n_atoms = len(self._handle.dimensions['atom'])
        self._needs_initialization = False
    elif mode in ['w', 'ws']:
        self._frame_index = 0
        # self._n_atoms will be set during _initialize_headers call
        self._n_atoms = None
        self._needs_initialization = True
    elif mode == 'r':
        self._frame_index = 0
        self._n_atoms = len(self._handle.dimensions['atom'])
        self._needs_initialization = False
    else:
        # Defensive guard; the mode validation above should prevent this.
        raise RuntimeError()
def mol2_to_dataframes(filename):
    """Convert a GAFF (or sybyl) mol2 file to a pair of pandas dataframes.

    Parameters
    ----------
    filename : str
        Name of mol2 filename

    Returns
    -------
    atoms_frame : pd.DataFrame
        DataFrame containing atom information
    bonds_frame : pd.DataFrame
        DataFrame containing bond information

    Raises
    ------
    KeyError
        If the file has no ``@<TRIPOS>BOND`` or ``@<TRIPOS>ATOM`` section.

    Notes
    -----
    These dataframes may contain force field information as well as the
    information necessary for constructing the coordinates and molecular
    topology. This function has been tested for GAFF and sybyl-style
    mol2 files but has been primarily tested on GAFF mol2 files.
    This function does NOT accept multi-structure MOL2 files!!!

    See Also
    --------
    If you just need the coordinates and bonds, use load_mol2(filename)
    to get a Trajectory object.
    """
    pd = import_("pandas")

    with open(filename) as f:
        # Consecutive lines that _parse_mol2_sections maps to the same key
        # end up in the same list.
        data = {key: list(grp)
                for key, grp in itertools.groupby(f, _parse_mol2_sections)}

    # The first line of each section is dropped before parsing
    # (presumably the "@<TRIPOS>..." header line itself -- confirm against
    # _parse_mol2_sections).
    csv = StringIO()
    csv.writelines(data["@<TRIPOS>BOND\n"][1:])
    csv.seek(0)
    # r"\s+" rather than "\s*": the latter also matches the empty string and
    # is not a plain whitespace delimiter; the non-raw literal is also an
    # invalid escape sequence on recent Pythons.
    bonds_frame = pd.read_table(
        csv,
        names=["bond_id", "id0", "id1", "bond_type"],
        index_col=0,
        header=None,
        sep=r"\s+",
    )

    csv = StringIO()
    csv.writelines(data["@<TRIPOS>ATOM\n"][1:])
    csv.seek(0)
    # usecols is deliberately not used: it is not available in pandas 0.11
    atoms_frame = pd.read_csv(
        csv,
        sep=r"\s+",
        names=["serial", "name", "x", "y", "z",
               "atype", "code", "resName", "charge"],
        header=None,
    )
    return atoms_frame, bonds_frame
def __init__(self, filename, mode="r", force_overwrite=True):
    """Open a NetCDF file through ``netCDF4.Dataset``.

    Parameters
    ----------
    filename : str
        Path handed to ``netCDF4.Dataset``.
    mode : {'r', 'w', 'a', 'ws', 'as'}
        'r' indicates read, 'w' indicates write and 'a' indicates append.
        'w' and 'a' may be suffixed with 's', which turns off buffering.
    force_overwrite : bool
        In a write mode, an existing ``filename`` is only accepted when
        this is true. The value is also forwarded as ``clobber``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the accepted values.
    IOError
        If writing to an existing file without ``force_overwrite``.
    """
    self._closed = True  # is the file currently closed?
    self._mode = mode  # what mode were we opened in
    netcdf = import_("netCDF4")

    if mode not in ["r", "w", "a", "ws", "as"]:
        raise ValueError(
            (
                "mode must be one of ['r', 'w', 'a', 'ws', 'as']"
                " 'r' indicates read, 'w' indicates write, and 'a' indicates"
                " append. 'a' and 'w' can be appended with 's', which turns "
                " off buffering"
            )
        )

    if mode in ["w", "ws"] and not force_overwrite and os.path.exists(filename):
        # The original message never interpolated the file name, so users
        # saw a literal '"%s" already exists'.
        raise IOError('"%s" already exists' % filename)

    # AMBER uses the NetCDF3 format, with 64 bit encodings
    self._handle = netcdf.Dataset(
        filename, mode=mode, format="NETCDF3_64BIT", clobber=force_overwrite
    )
    self._closed = False

    # self._frame_index is the current frame that we're at in the
    #     file
    # self._needs_initialization indicates whether we need to set the
    #     global properties of the file. This is required before the first
    #     write operation on a new file
    # self._n_atoms is the number of atoms in the file
    if mode in ["a", "as"]:
        self._frame_index = len(self._handle.dimensions["frame"])
        self._n_atoms = len(self._handle.dimensions["atom"])
        self._needs_initialization = False
    elif mode in ["w", "ws"]:
        self._frame_index = 0
        # self._n_atoms will be set during _initialize_headers call
        self._n_atoms = None
        self._needs_initialization = True
    elif mode == "r":
        self._frame_index = 0
        self._n_atoms = len(self._handle.dimensions["atom"])
        self._needs_initialization = False
    else:
        # Defensive guard; the mode validation above should prevent this.
        raise RuntimeError()
def __init__(self, filename, mode="r", force_overwrite=True, compression="zlib"):
    """Open an HDF5 file via PyTables and set up the frame bookkeeping.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : {'r', 'w', 'a'}
        Mode in which the file is opened.
    force_overwrite : bool
        In write mode, an existing ``filename`` is only accepted when this
        is true.
    compression : {'zlib', None}
        'zlib' builds a ``tables.Filters`` object (zlib, shuffle, level 1)
        that is passed on as ``filters``; None passes no filters.

    Raises
    ------
    ValueError
        If ``mode`` or ``compression`` is not one of the accepted values.
    IOError
        If writing to an existing file without ``force_overwrite``.
    """
    self._open = False  # flipped to True once the handle exists
    self.mode = mode  # the mode that was requested

    if mode not in ["r", "w", "a"]:
        raise ValueError("mode must be one of ['r', 'w', 'a']")
    if mode == "w" and not force_overwrite and os.path.exists(filename):
        raise IOError('"%s" already exists' % filename)

    # PyTables is imported lazily and kept on the instance.
    self.tables = import_("tables")

    if compression == "zlib":
        filters = self.tables.Filters(complib="zlib", shuffle=True, complevel=1)
    elif compression is None:
        filters = None
    else:
        raise ValueError('compression must be either "zlib" or None')

    self._handle = self._open_file(filename, mode=mode, filters=filters)
    self._open = True

    if mode == "w":
        # A fresh file: start at frame zero, header still to be written.
        self._frame_index = 0
        self._needs_initialization = True
        if not filename.endswith(".h5"):
            warnings.warn("The .h5 extension is recommended.")
    elif mode == "a":
        try:
            n_existing = len(self._handle.root.coordinates)
        except self.tables.NoSuchNodeError:
            # No coordinates node yet, so treat the file like a new one.
            self._frame_index = 0
            self._needs_initialization = True
        else:
            self._frame_index = n_existing
            self._needs_initialization = False
    elif mode == "r":
        self._frame_index = 0
        self._needs_initialization = False
def to_bondgraph(self):
    """Build a NetworkX graph out of this topology's atoms and bonds.

    Returns
    -------
    g : nx.Graph
        Graph with one node per entry of ``self.atoms`` and one edge per
        entry of ``self.bonds``.

    See Also
    --------
    atoms
    bonds

    Notes
    -----
    The NetworkX python package must be installed to use this method.
    """
    networkx = import_('networkx')

    graph = networkx.Graph()
    graph.add_nodes_from(self.atoms)
    graph.add_edges_from(self.bonds)
    return graph
def test_delay_import():
    """Smoke test: ``import_`` can be called on 'scipy.sparse'.

    Nothing is asserted; the test passes as long as the call does not raise.
    """
    module_name = 'scipy.sparse'
    import_(module_name)
def test_delay_import_fail_1():
    """Call ``import_`` with a nonsense module name.

    NOTE(review): nothing inside this function asserts that the call
    fails; presumably the expected failure is declared outside the body
    (e.g. by a decorator) -- confirm.
    """
    bogus_module = 'sdfsdfsfsfdsdf'
    import_(bogus_module)
>>> io.saveh('file.hdf5', x=x) # doctest: +SKIP >>> np.all(x == io.loadh('file.hdf5')['x']) # doctest: +SKIP True Functions --------- """ from __future__ import print_function, division import os import warnings import numpy as np from mdtraj.utils import import_ from mdtraj.utils.six import PY3, iteritems if PY3: basestring = str tables = import_('tables') __all__ = ['saveh', 'loadh'] try: COMPRESSION = tables.Filters(complevel=9, complib='blosc', shuffle=True) except Exception: #type? warnings.warn("Missing BLOSC; no compression will used.") COMPRESSION = tables.Filters() def saveh(file, *args, **kwargs): """Save several numpy arrays into a single file in compressed ``.hdf`` format. If arguments are passed in with no keywords, the corresponding variable names, in the ``.hdf`` file, are 'arr_0', 'arr_1', etc. If keyword arguments are given, the corresponding variable names, in the ``.hdf`` file will
#------------------------------------------------------------------------------ from __future__ import print_function import os import sys import warnings import functools import operator from argparse import ArgumentParser import numpy as np import mdtraj as md from mdtraj.utils import import_, ilen from mdtraj.geometry.internal import COVALENT_RADII spatial = import_('scipy.spatial') #------------------------------------------------------------------------------ # Code #------------------------------------------------------------------------------ class NoTopologyError(Exception): def __init__(self): super(NoTopologyError, self).__init__("One more more of the " "trajectory files should contain topology information (i.e. " "either HDF5 or PDB)") def parse_args(): parser = ArgumentParser(description=__doc__) parser.add_argument('files', nargs='+', help='''Input trajectory file(s),
def test_delay_import_fail_1():
    """``import_`` must raise ImportError for a nonsense module name."""
    bogus_module = 'sdfsdfsfsfdsdf'
    with pytest.raises(ImportError):
        import_(bogus_module)
# ------------------------------------------------------------------------------ from __future__ import print_function import os import sys import warnings import functools import operator from argparse import ArgumentParser import numpy as np import mdtraj as md from mdtraj.utils import import_, ilen from mdtraj.geometry.internal import COVALENT_RADII spatial = import_("scipy.spatial") # ------------------------------------------------------------------------------ # Code # ------------------------------------------------------------------------------ class NoTopologyError(Exception): def __init__(self): super(NoTopologyError, self).__init__( "One more more of the " "trajectory files should contain topology information (i.e. " "either HDF5 or PDB)" ) def parse_args(): parser = ArgumentParser(description=__doc__)
def chemical_shifts_spartaplus(trj, rename_HN=True):
    """Predict chemical shifts of a trajectory using SPARTA+.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.
    rename_HN : bool, optional, default=True
        SPARTA+ calls the amide proton "HN" instead of the standard "H".
        When True, this option renames the output as "H" to match the PDB
        and BMRB nomenclature.

    Returns
    -------
    results : pandas.DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the SPARTA+ executable cannot be located.
    IOError
        If the SPARTA+ command exits with a non-zero status.

    Notes
    -----
    You must have SPARTA+ available on your path; see
    (http://spin.niddk.nih.gov/bax/software/SPARTA+/). Also, the SPARTAP_DIR
    environment variable must be set so that SPARTA+ knows where to find
    its database files.

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Shen, Y., and Bax, Ad. "SPARTA+: a modest improvement in empirical
       NMR chemical shift prediction by means of an artificial neural
       network." J. Biomol. NMR, 48, 13-22 (2010)
    """
    pd = import_('pandas')
    binary = find_executable(SPARTA_PLUS)
    if binary is None:
        raise OSError(
            'External command not found. Looked for %s in PATH. '
            '`chemical_shifts_spartaplus` requires the external program '
            'SPARTA+, available at '
            'http://spin.niddk.nih.gov/bax/software/SPARTA+/'
            % ', '.join(SPARTA_PLUS))

    names = ["resSeq", "resName", "name", "SS_SHIFT", "SHIFT", "RC_SHIFT",
             "HM_SHIFT", "EF_SHIFT", "SIGMA"]

    with enter_temp_directory():
        # One PDB file per frame, all handed to a single SPARTA+ invocation.
        for i in range(trj.n_frames):
            trj[i].save("./trj%d.pdb" % i)
        cmd = "%s -in %s" % (
            binary,
            ' '.join("trj%d.pdb" % i for i in range(trj.n_frames)))

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "SPARTA+ installation or your input trajectory." % cmd)

        # Computed once from the first output file and reused for every
        # frame's table.
        lines_to_skip = _get_lines_to_skip("trj0_pred.tab")

        results = []
        for i in range(trj.n_frames):
            # r"\s+" rather than "\s*": the latter also matches the empty
            # string and is an invalid escape sequence as a non-raw literal.
            d = pd.read_table("./trj%d_pred.tab" % i, names=names,
                              header=None, sep=r"\s+",
                              skiprows=lines_to_skip)
            d["frame"] = i
            results.append(d)

    results = pd.concat(results)

    if rename_HN:
        # Assign through .loc on the frame itself; the chained form
        # ``results.name[mask] = ...`` may write to a temporary copy. The
        # mask is passed as a plain array because the concatenated index
        # contains repeated labels.
        results.loc[(results["name"] == "HN").values, "name"] = "H"

    try:
        results = results.pivot_table(index=["resSeq", "name"],
                                      columns="frame", values="SHIFT")
    except TypeError:
        # Very old pandas only understands the legacy rows/cols keywords.
        results = results.pivot_table(rows=["resSeq", "name"],
                                      cols="frame", values="SHIFT")
    return results
True Functions --------- """ from __future__ import print_function, division, absolute_import import io import os import warnings import numpy as np from mdtraj.utils import import_ from mdtraj.utils.six import PY2, PY3, iteritems, StringIO if PY3: basestring = str tables = import_('tables') TABLES2 = tables.__version__ < '3.0.0' __all__ = ['saveh', 'loadh'] try: COMPRESSION = tables.Filters(complevel=9, complib='zlib', shuffle=True) except Exception: #type? warnings.warn("Missing Zlib; no compression will used.") COMPRESSION = tables.Filters() # Note to developers: This module is pseudo-deprecated. It provides (loadh, saveh) # which are useful functions (and we want to maintain them), but aren't really # within the scope of MDTraj as we now understand it. # # With that said, many people use these functions and no good would come from getting
############################################################################## # imports ############################################################################## import os import tempfile import shutil import numpy as np import mdtraj as md from mdtraj import element from mdtraj.utils import import_ from mdtraj.testing import skipif, get_fn, eq, slow try: scripttest = import_('scripttest') HAVE_SCRIPTTEST = True except SystemExit: HAVE_SCRIPTTEST = False ############################################################################## # globals ############################################################################## # if you switch DEBUG_MODE to True, none of the files will deleted # at the end of the execution of this suite, so that you can debug the # problem by running mdconvert manually. DEBUG_MODE = False # DEBUG_MODE = False staging_dir = tempfile.mkdtemp()
def from_dataframe(cls, atoms, bonds=None):
    """Create a mdtraj topology from a pandas data frame

    Parameters
    ----------
    atoms : pandas.DataFrame
        The atoms in the topology, represented as a data frame. This data
        frame should have columns "serial" (atom index), "name" (atom name),
        "element" (atom's element), "resSeq" (index of the residue)
        "resName" (name of the residue), "chainID" (index of the chain),
        and optionally "segmentID", following the same conventions
        as wwPDB 3.0 format. The frame passed in is not modified.
    bonds : np.ndarray, shape=(n_bonds, 2), dtype=int, optional
        The bonds in the topology, represented as an n_bonds x 2 array
        of the indices of the atoms involved in each bond. Specifiying
        bonds here is optional. To create standard protein bonds, you can
        use `create_standard_bonds` to "fill in" the bonds on your newly
        created Topology object

    Returns
    -------
    out : cls
        A new instance populated with the chains, residues, atoms and bonds
        described by the inputs.

    Raises
    ------
    TypeError
        If `atoms` is not a pandas.DataFrame or `bonds` is not a
        numpy.ndarray.
    ValueError
        If a required column is missing, if the index of `atoms` is not
        0, 1, ..., n-1, or if the atoms of one residue carry different
        residue names.

    See Also
    --------
    create_standard_bonds
    """
    pd = import_('pandas')

    # Check the type before touching ``atoms.columns`` so that a wrong
    # argument produces the intended TypeError, not an AttributeError.
    if not isinstance(atoms, pd.DataFrame):
        raise TypeError('atoms must be an instance of pandas.DataFrame. '
                        'You supplied a %s' % type(atoms))

    if bonds is None:
        bonds = np.zeros((0, 2))

    for col in ["name", "element", "resSeq",
                "resName", "chainID", "serial"]:
        if col not in atoms.columns:
            raise ValueError('dataframe must have column %s' % col)

    if "segmentID" not in atoms.columns:
        # Add the default column to a copy so the caller's frame is left
        # untouched.
        atoms = atoms.copy()
        atoms["segmentID"] = ""

    if not isinstance(bonds, np.ndarray):
        raise TypeError('bonds must be an instance of numpy.ndarray. '
                        'You supplied a %s' % type(bonds))

    if not np.all(np.arange(len(atoms)) == atoms.index):
        raise ValueError('atoms must be uniquely numbered '
                         'starting from zero.')

    out = cls()
    # Pre-size the atom list: atoms are visited chain by chain below, but
    # each is stored at the position given by its frame index.
    out._atoms = [None for i in range(len(atoms))]

    for ci in np.unique(atoms['chainID']):
        chain_atoms = atoms[atoms['chainID'] == ci]
        c = out.add_chain()

        for ri in np.unique(chain_atoms['resSeq']):
            residue_atoms = chain_atoms[chain_atoms['resSeq'] == ri]
            rnames = residue_atoms['resName']
            residue_name = np.array(rnames)[0]
            segids = residue_atoms['segmentID']
            # Only the first atom's segment id is used for the residue.
            segment_id = np.array(segids)[0]
            if not np.all(rnames == residue_name):
                raise ValueError('All of the atoms with residue index %d '
                                 'do not share the same residue name' % ri)
            r = out.add_residue(residue_name, c, ri, segment_id)

            for atom_index, atom in residue_atoms.iterrows():
                # Fixes bizarre hashing issue on Py3K. See #545
                atom_index = int(atom_index)
                a = Atom(atom['name'], elem.get_by_symbol(atom['element']),
                         atom_index, r, serial=atom['serial'])
                out._atoms[atom_index] = a
                r._atoms.append(a)

    for ai1, ai2 in bonds:
        out.add_bond(out.atom(ai1), out.atom(ai2))

    out._numAtoms = out.n_atoms
    return out
def chemical_shifts_spartaplus(trj):
    """Predict chemical shifts of a trajectory using SPARTA+.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.

    Returns
    -------
    results : pandas.DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the SPARTA+ executable cannot be located.
    IOError
        If the SPARTA+ command exits with a non-zero status.

    Notes
    -----
    You must have SPARTA+ available on your path; see
    (http://spin.niddk.nih.gov/bax/software/SPARTA+/). Also, the SPARTAP_DIR
    environment variable must be set so that SPARTA+ knows where to find
    its database files.

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Shen, Y., and Bax, Ad. "SPARTA+: a modest improvement in empirical
       NMR chemical shift prediction by means of an artificial neural
       network." J. Biomol. NMR, 48, 13-22 (2010)
    """
    pd = import_('pandas')
    binary = find_executable(SPARTA_PLUS)
    if binary is None:
        raise OSError(
            'External command not found. Looked for %s in PATH. '
            '`chemical_shifts_spartaplus` requires the external program '
            'SPARTA+, available at '
            'http://spin.niddk.nih.gov/bax/software/SPARTA+/'
            % ', '.join(SPARTA_PLUS))

    names = ["VARS", "resSeq", "resName", "name", "SS_SHIFT", "SHIFT",
             "RC_SHIFT", "HM_SHIFT", "EF_SHIFT", "SIGMA"]

    with enter_temp_directory():
        # One PDB file per frame, all handed to a single SPARTA+ invocation.
        for i in range(trj.n_frames):
            trj[i].save("./trj%d.pdb" % i)
        cmd = "%s -in %s" % (
            binary,
            ' '.join("trj%d.pdb" % i for i in range(trj.n_frames)))

        return_flag = os.system(cmd)
        if return_flag != 0:
            raise IOError(
                "Could not successfully execute command '%s', check your "
                "SPARTA+ installation or your input trajectory." % cmd)

        # Computed once from the first output file and reused for every
        # frame's table.
        lines_to_skip = _get_lines_to_skip("trj0_pred.tab")

        results = []
        for i in range(trj.n_frames):
            # sep=r"\s+" is the documented replacement for the deprecated
            # delim_whitespace=True.
            d = pd.read_csv("./trj%d_pred.tab" % i, skiprows=lines_to_skip,
                            sep=r"\s+", header=None, names=names)
            d["frame"] = i
            results.append(d)

    results = pd.concat(results)
    try:
        results = results.pivot_table(index=["resSeq", "name"],
                                      columns="frame", values="SHIFT")
    except TypeError:
        # Very old pandas only understands the legacy rows/cols keywords.
        results = results.pivot_table(rows=["resSeq", "name"],
                                      cols="frame", values="SHIFT")
    return results
def from_dataframe(cls, atoms, bonds=None):
    """Create a mdtraj topology from a pandas data frame

    Parameters
    ----------
    atoms : pandas.DataFrame
        The atoms in the topology, represented as a data frame. This data
        frame should have columns "serial" (atom index), "name" (atom name),
        "element" (atom's element), "resSeq" (index of the residue)
        "resName" (name of the residue), "chainID" (index of the chain),
        and optionally "segmentID", following the same conventions
        as wwPDB 3.0 format. The frame passed in is not modified.
    bonds : np.ndarray, shape=(n_bonds, 2), dtype=int, optional
        The bonds in the topology, represented as an n_bonds x 2 array
        of the indices of the atoms involved in each bond. Specifiying
        bonds here is optional. To create standard protein bonds, you can
        use `create_standard_bonds` to "fill in" the bonds on your newly
        created Topology object

    Returns
    -------
    out : cls
        A new instance populated with the chains, residues, atoms and bonds
        described by the inputs.

    Raises
    ------
    TypeError
        If `atoms` is not a pandas.DataFrame or `bonds` is not a
        numpy.ndarray.
    ValueError
        If a required column is missing, if the index of `atoms` is not
        0, 1, ..., n-1, or if the atoms of one residue carry different
        residue names.

    See Also
    --------
    create_standard_bonds
    """
    pd = import_('pandas')

    # Check the type before touching ``atoms.columns`` so that a wrong
    # argument produces the intended TypeError, not an AttributeError.
    if not isinstance(atoms, pd.DataFrame):
        raise TypeError('atoms must be an instance of pandas.DataFrame. '
                        'You supplied a %s' % type(atoms))

    if bonds is None:
        bonds = np.zeros((0, 2))

    required_columns = [
        "name", "element", "resSeq", "resName", "chainID", "serial"
    ]
    for col in required_columns:
        if col not in atoms.columns:
            raise ValueError('dataframe must have column %s' % col)

    if "segmentID" not in atoms.columns:
        # Add the default column to a copy so the caller's frame is left
        # untouched.
        atoms = atoms.copy()
        atoms["segmentID"] = ""

    if not isinstance(bonds, np.ndarray):
        raise TypeError('bonds must be an instance of numpy.ndarray. '
                        'You supplied a %s' % type(bonds))

    if not np.all(np.arange(len(atoms)) == atoms.index):
        raise ValueError('atoms must be uniquely numbered '
                         'starting from zero.')

    out = cls()
    # Pre-size the atom list: atoms are visited chain by chain below, but
    # each is stored at the position given by its frame index.
    out._atoms = [None for i in range(len(atoms))]

    for ci in np.unique(atoms['chainID']):
        chain_atoms = atoms[atoms['chainID'] == ci]
        c = out.add_chain()

        for ri in np.unique(chain_atoms['resSeq']):
            residue_atoms = chain_atoms[chain_atoms['resSeq'] == ri]
            rnames = residue_atoms['resName']
            residue_name = np.array(rnames)[0]
            segids = residue_atoms['segmentID']
            # Only the first atom's segment id is used for the residue.
            segment_id = np.array(segids)[0]
            if not np.all(rnames == residue_name):
                raise ValueError('All of the atoms with residue index %d '
                                 'do not share the same residue name' % ri)
            r = out.add_residue(residue_name, c, ri, segment_id)

            for atom_index, atom in residue_atoms.iterrows():
                # Fixes bizarre hashing issue on Py3K. See #545
                atom_index = int(atom_index)
                a = Atom(atom['name'], elem.get_by_symbol(atom['element']),
                         atom_index, r, serial=atom['serial'])
                out._atoms[atom_index] = a
                r._atoms.append(a)

    for ai1, ai2 in bonds:
        out.add_bond(out.atom(ai1), out.atom(ai2))

    out._numAtoms = out.n_atoms
    return out
def test_topology_openmm_boxes():
    """Smoke test: an OpenMM topology built with ``traj=`` exposes a box.

    Nothing is asserted; the test passes as long as loading the PDB,
    converting the topology and dividing the unit cell dimensions by
    nanometers all complete without raising.
    """
    u = import_('simtk.unit')
    pdb_path = get_fn('1vii_sustiva_water.pdb')
    traj = md.load(pdb_path)
    mmtop = traj.topology.to_openmm(traj=traj)
    dimensions = mmtop.getUnitCellDimensions()
    box = dimensions / u.nanometer
def chemical_shifts_spartaplus(trj, rename_HN=True):
    """Predict chemical shifts of a trajectory using SPARTA+.

    Parameters
    ----------
    trj : Trajectory
        Trajectory to predict shifts for.
    rename_HN : bool, optional, default=True
        SPARTA+ calls the amide proton "HN" instead of the standard "H".
        When True, this option renames the output as "H" to match the PDB
        and BMRB nomenclature.

    Returns
    -------
    results : pandas.DataFrame
        Dataframe containing results, with index consisting of
        (resSeq, atom_name) pairs and columns for each frame in trj.

    Raises
    ------
    OSError
        If the SPARTA+ executable cannot be located.
    subprocess.CalledProcessError
        If the SPARTA+ command exits with a non-zero status.

    Notes
    -----
    You must have SPARTA+ available on your path; see
    (http://spin.niddk.nih.gov/bax/software/SPARTA+/). Also, the SPARTAP_DIR
    environment variable must be set so that SPARTA+ knows where to find
    its database files.

    Chemical shift prediction is for PROTEIN atoms; trajectory objects
    with ligands, solvent, ions, or other non-protein components may give
    UNKNOWN RESULTS.

    Please cite the appropriate reference below.

    References
    ----------
    .. [1] Shen, Y., and Bax, Ad. "SPARTA+: a modest improvement in empirical
       NMR chemical shift prediction by means of an artificial neural
       network." J. Biomol. NMR, 48, 13-22 (2010)
    """
    pd = import_('pandas')
    binary = find_executable(SPARTA_PLUS)
    if binary is None:
        raise OSError(
            'External command not found. Looked for %s in PATH. '
            '`chemical_shifts_spartaplus` requires the external program '
            'SPARTA+, available at '
            'http://spin.niddk.nih.gov/bax/software/SPARTA+/'
            % ', '.join(SPARTA_PLUS))

    names = [
        "resSeq", "resName", "name", "SS_SHIFT", "SHIFT", "RC_SHIFT",
        "HM_SHIFT", "EF_SHIFT", "SIGMA"
    ]

    with enter_temp_directory():
        # One PDB file per frame, all handed to a single SPARTA+ invocation.
        for i in range(trj.n_frames):
            trj[i].save("./trj%d.pdb" % i)

        # Argument list (no shell), so the executable path needs no quoting.
        subprocess.check_call(
            [binary, '-in']
            + ["trj{}.pdb".format(i) for i in range(trj.n_frames)]
            + ['-out', 'trj0_pred.tab'])

        # Computed once from the first output file and reused for every
        # frame's table.
        lines_to_skip = _get_lines_to_skip("trj0_pred.tab")

        results = []
        for i in range(trj.n_frames):
            # Raw string: "\s" in a normal literal is an invalid escape.
            d = pd.read_table("./trj%d_pred.tab" % i, names=names,
                              header=None, sep=r"\s+",
                              skiprows=lines_to_skip)
            d["frame"] = i
            results.append(d)

    results = pd.concat(results)

    if rename_HN:
        # Assign through .loc on the frame itself; the chained form
        # ``results.name[mask] = ...`` may write to a temporary copy. The
        # mask is passed as a plain array because the concatenated index
        # contains repeated labels.
        results.loc[(results["name"] == "HN").values, "name"] = "H"

    # pivot_table renamed rows/cols to index/columns in pandas 0.14.
    if LooseVersion(pd.__version__) < LooseVersion('0.14.0'):
        results = results.pivot_table(rows=["resSeq", "name"],
                                      cols="frame", values="SHIFT")
    else:
        results = results.pivot_table(index=["resSeq", "name"],
                                      columns="frame", values="SHIFT")
    return results