def _parse_residue(fileobj, name):
    """
    Parses the residue information out of the OFF file assuming the file is
    pointed at the first line of an atoms table section of the OFF file

    Parameters
    ----------
    fileobj : file-like
        Assumed to be open for read and positioned on the first entry of the
        atoms table of the unit. It is read up to and including the last
        line of the velocities table of that unit
    name : str
        The name of the residue being processed right now

    Returns
    -------
    ResidueTemplate or ResidueTemplateContainer
        A single template if the atoms table describes one residue, otherwise
        a container holding one template per residue

    Raises
    ------
    RuntimeError
        If an expected table header is missing or names a different unit, if
        the boundbox or connect entries are not numeric, if the residue
        connect table disagrees with the head/tail atoms, or if the file ends
        in the middle of a table
    ValueError
        May propagate from table entries that are converted or unpacked
        without an explicit check (e.g. a malformed atoms table line)
    """
    def next_line(skip_blank=False):
        """ Reads one line (optionally skipping blank ones), refusing EOF """
        text = fileobj.readline()
        while skip_blank and text and not text.strip():
            text = fileobj.readline()
        if not text:
            # Must be tested before any text[0] lookup, which would otherwise
            # fail with an IndexError on a truncated file
            raise RuntimeError('Unexpected EOF in Amber OFF library')
        return text

    def expect_section(regex, header, description):
        """ Makes sure header opens the expected table of this unit """
        rematch = regex.match(header)
        if not rematch:
            raise RuntimeError('Expected %s not found' % description)
        found = rematch.groups()[0]
        if found != name:
            raise RuntimeError('Found residue %s while processing residue %s' %
                               (found, name))

    container = ResidueTemplateContainer(name)
    nres = 1
    templ = ResidueTemplate(name)

    # Atoms table: one line per atom, until the next section header ('!')
    line = next_line()
    while line[0] != '!':
        nam, typ, typx, resx, flags, seq, elmnt, chg = line.split()
        nam = _strip_enveloping_quotes(nam)
        typ = _strip_enveloping_quotes(typ)
        # typx, flags and seq are not used, but they are still converted so
        # that non-integer entries are rejected
        int(typx)
        resx = int(resx)
        int(flags)
        int(seq)
        atom = Atom(atomic_number=int(elmnt), type=typ, name=nam,
                    charge=float(chg))
        if resx == nres + 1:
            # This atom starts the next residue of a multi-residue unit
            container.append(templ)
            nres += 1
            templ = ResidueTemplate(name)
        templ.add_atom(atom)
        line = next_line(skip_blank=True)
    container.append(templ)

    # Index (0-based) of the first atom of every residue; only consulted when
    # bonds of a multi-residue unit are assigned to their residue
    start_atoms = []
    runsum = 0
    for res in container:
        start_atoms.append(runsum)
        runsum += len(res)

    # Pertinfo table -- not used, just skip
    # TODO sanity check
    expect_section(AmberOFFLibrary._sec2re, line, 'pertinfo table')
    line = next_line()
    while line[0] != '!':
        line = next_line()

    # Boundbox table -- only 5 lines
    expect_section(AmberOFFLibrary._sec3re, line, 'boundbox table')
    try:
        hasbox = float(fileobj.readline().strip())
        angle = float(fileobj.readline().strip())
        a = float(fileobj.readline().strip())
        b = float(fileobj.readline().strip())
        c = float(fileobj.readline().strip())
    except ValueError:
        raise RuntimeError('Error processing boundbox table entries')
    if hasbox > 0:
        if angle < 3.15:
            # No box is this acute -- must be in radians
            angle *= RAD_TO_DEG
        container.box = [a, b, c, angle, angle, angle]

    # Get the child sequence entry
    expect_section(AmberOFFLibrary._sec4re, fileobj.readline(),
                   'childsequence table')
    n = int(fileobj.readline().strip())
    if nres + 1 != n:
        warnings.warn('Unexpected childsequence (%d); expected %d for '
                      'residue %s' % (n, nres+1, name), AmberWarning)
    elif not isinstance(templ, ResidueTemplate) and n != len(templ) + 1:
        raise RuntimeError('child sequence must be 1 greater than the '
                           'number of residues in the unit')

    # Get the CONNECT array to set head and tail
    expect_section(AmberOFFLibrary._sec5re, fileobj.readline(),
                   'connect array')
    try:
        head = int(fileobj.readline().strip())
        tail = int(fileobj.readline().strip())
    except ValueError:
        raise RuntimeError('Error processing connect table entries')
    natom_total = sum(len(r) for r in container)
    if head > 0:
        if nres == 1:
            templ.head = templ[head-1]
        elif head < natom_total:
            raise RuntimeError('HEAD on multi-residue unit not supported')
    if tail > 0:
        if nres == 1:
            templ.tail = templ[tail-1]
        elif tail < natom_total:
            warnings.warn('TAIL on multi-residue unit not supported (%s). '
                          'Ignored...' % name, AmberWarning)

    # Get the connectivity array to set bonds. The table is only looked for
    # when the (last) template has more than one atom; otherwise the line just
    # read must already be the hierarchy header
    line = fileobj.readline()
    if len(templ.atoms) > 1:
        expect_section(AmberOFFLibrary._sec6re, line, 'connectivity table')
        line = next_line()
        while line[0] != '!':
            at1, at2, _flag = line.split()
            at1 = int(at1) - 1
            at2 = int(at2) - 1
            if nres > 1:
                # Find which residue we belong in
                for ires, first in enumerate(start_atoms):
                    if first > at1:
                        ires -= 1
                        break
                first = start_atoms[ires]
                container[ires].add_bond(at1-first, at2-first)
            else:
                templ.add_bond(at1, at2)
            line = next_line()

    # Hierarchy table -- not used, just skip
    # TODO turn this into a sanity check
    expect_section(AmberOFFLibrary._sec7re, line, 'hierarchy table')
    line = next_line()
    while line[0] != '!':
        line = next_line()

    # Get the unit name
    expect_section(AmberOFFLibrary._sec8re, line, 'unit name string')
    fileobj.readline() # Skip this... not used

    # Get the atomic positions
    expect_section(AmberOFFLibrary._sec9re, fileobj.readline(),
                   'unit positions table')
    for res in container:
        for atom in res:
            x, y, z = fileobj.readline().split()
            atom.xx, atom.xy, atom.xz = float(x), float(y), float(z)

    # Get the residueconnect table
    expect_section(AmberOFFLibrary._sec10re, fileobj.readline(),
                   'unit residueconnect table')
    for _ in range(nres):
        c1, c2, c3, c4, c5, c6 = (int(x) for x in fileobj.readline().split())
        if (c1 > 0 and templ.head is not None and
                templ.head is not templ[c1-1]):
            raise RuntimeError('HEAD atom is not connect0')
        if (c2 > 0 and templ.tail is not None and
                templ.tail is not templ[c2-1]):
            raise RuntimeError('TAIL atom is not connect1')
        for conn in (c3, c4, c5, c6):
            if conn == 0:
                continue
            templ.connections.append(templ[conn-1])

    # Get the residues table
    expect_section(AmberOFFLibrary._sec11re, fileobj.readline(),
                   'unit residues table')
    residue_types = {'p': PROTEIN, 'n': NUCLEIC, 'w': SOLVENT}
    for ires in range(nres):
        resname, resid, nxt, start, typ, img = fileobj.readline().split()
        resname = _strip_enveloping_quotes(resname)
        int(resid)  # not used; converted only to reject bad entries
        start = int(start)
        nxt = int(nxt)
        typ = _strip_enveloping_quotes(typ)
        int(img)  # not used; converted only to reject bad entries
        res = container[ires]
        if nxt - start != len(res):
            warnings.warn('residue table predicted %d, not %d atoms for '
                          'residue %s' % (nxt-start, len(res), name),
                          AmberWarning)
        if typ in residue_types:
            res.type = residue_types[typ]
        elif typ != '?':
            warnings.warn('Unknown residue type "%s"' % typ, AmberWarning)
        if nres > 1:
            res.name = resname

    # Get the residues sequence table -- one line per residue, not used
    # TODO sanity check
    expect_section(AmberOFFLibrary._sec12re, fileobj.readline(),
                   'residue sequence number')
    for _ in range(nres):
        fileobj.readline()

    # Get the solventcap array and ignore its 5 entries
    expect_section(AmberOFFLibrary._sec13re, fileobj.readline(),
                   'unit solventcap array')
    for _ in range(5):
        fileobj.readline()

    # Velocities
    expect_section(AmberOFFLibrary._sec14re, fileobj.readline(),
                   'unit velocities table')
    for res in container:
        for atom in res:
            vx, vy, vz = (float(x) for x in fileobj.readline().split())
            atom.vx, atom.vy, atom.vz = vx, vy, vz

    if nres > 1:
        return container
    return templ
def _parse_residue(fileobj, name):
    """
    Parses the residue information out of the OFF file assuming the file is
    pointed at the first line of an atoms table section of the OFF file

    Parameters
    ----------
    fileobj : file-like
        Assumed to be open for read and positioned on the first entry of the
        atoms table of the unit. It is read up to and including the last
        line of the velocities table of that unit
    name : str
        The name of the residue being processed right now

    Returns
    -------
    ResidueTemplate or ResidueTemplateContainer
        A single template if the atoms table describes one residue, otherwise
        a container holding one template per residue

    Raises
    ------
    RuntimeError
        If an expected table header is missing or names a different unit, if
        the boundbox or connect entries are not numeric, or if the file ends
        in the middle of a table
    ValueError
        May propagate from table entries that are converted or unpacked
        without an explicit check (e.g. a malformed atoms table line)
    """
    eof_message = 'Unexpected EOF in Amber OFF library'

    def check_header(regex, header, description):
        """ Makes sure header opens the expected table of this unit """
        rematch = regex.match(header)
        if not rematch:
            raise RuntimeError('Expected %s not found' % description)
        found = rematch.groups()[0]
        if found != name:
            raise RuntimeError('Found residue %s while processing residue %s' %
                               (found, name))

    def skip_to_next_header():
        """ Discards the lines of an unused table, returns the next header """
        text = fileobj.readline()
        while text and text[0] != '!':
            text = fileobj.readline()
        if not text:
            raise RuntimeError(eof_message)
        return text

    container = ResidueTemplateContainer(name)
    nres = 1
    templ = ResidueTemplate(name)

    # Atoms table: one line per atom, until the next section header ('!')
    line = fileobj.readline()
    while True:
        if not line:
            # Emptiness has to be tested before line[0] is looked at
            raise RuntimeError(eof_message)
        if line[0] == '!':
            break
        nam, typ, typx, resx, flags, seq, elmnt, chg = line.split()
        nam = _strip_enveloping_quotes(nam)
        typ = _strip_enveloping_quotes(typ)
        # typx, flags and seq are not used, but they are still converted so
        # that non-integer entries are rejected
        int(typx)
        resx = int(resx)
        int(flags)
        int(seq)
        atom = Atom(atomic_number=int(elmnt), type=typ, name=nam,
                    charge=float(chg))
        if resx == nres + 1:
            # This atom starts the next residue of a multi-residue unit
            container.append(templ)
            nres += 1
            templ = ResidueTemplate(name)
        templ.add_atom(atom)
        line = fileobj.readline()
        # Skip blank lines
        while line and not line.strip():
            line = fileobj.readline()
    container.append(templ)

    # Index (0-based) of the first atom of every residue; only consulted when
    # bonds of a multi-residue unit are assigned to their residue
    start_atoms = []
    runsum = 0
    for res in container:
        start_atoms.append(runsum)
        runsum += len(res)

    # Pertinfo table -- not used, just skip
    # TODO sanity check
    check_header(AmberOFFLibrary._sec2re, line, 'pertinfo table')
    line = skip_to_next_header()

    # Boundbox table -- only 5 lines
    check_header(AmberOFFLibrary._sec3re, line, 'boundbox table')
    try:
        hasbox = float(fileobj.readline().strip())
        angle = float(fileobj.readline().strip())
        a = float(fileobj.readline().strip())
        b = float(fileobj.readline().strip())
        c = float(fileobj.readline().strip())
    except ValueError:
        raise RuntimeError('Error processing boundbox table entries')
    if hasbox > 0:
        if angle < 3.15:
            # No box is this acute -- must be in radians. Larger values are
            # taken to be degrees already and are left alone
            angle *= RAD_TO_DEG
        container.box = [a, b, c, angle, angle, angle]

    # Get the child sequence entry
    check_header(AmberOFFLibrary._sec4re, fileobj.readline(),
                 'childsequence table')
    n = int(fileobj.readline().strip())
    if nres + 1 != n:
        warnings.warn('Unexpected childsequence (%d); expected %d for '
                      'residue %s' % (n, nres+1, name), AmberWarning)
    elif not isinstance(templ, ResidueTemplate) and n != len(templ) + 1:
        raise RuntimeError('child sequence must be 1 greater than the '
                           'number of residues in the unit')

    # Get the CONNECT array to set head and tail
    check_header(AmberOFFLibrary._sec5re, fileobj.readline(), 'connect array')
    try:
        head = int(fileobj.readline().strip())
        tail = int(fileobj.readline().strip())
    except ValueError:
        raise RuntimeError('Error processing connect table entries')
    natom_total = sum([len(r) for r in container])
    if head > 0:
        if nres == 1:
            templ.head = templ[head-1]
        elif head < natom_total:
            raise RuntimeError('HEAD on multi-residue unit not supported')
    if tail > 0:
        if nres == 1:
            templ.tail = templ[tail-1]
        elif tail < natom_total:
            warnings.warn('TAIL on multi-residue unit not supported (%s). '
                          'Ignored...' % name, AmberWarning)

    # Get the connectivity array to set bonds. The table is only looked for
    # when the (last) template has more than one atom; otherwise the line just
    # read must already be the hierarchy header
    line = fileobj.readline()
    if len(templ.atoms) > 1:
        check_header(AmberOFFLibrary._sec6re, line, 'connectivity table')
        line = fileobj.readline()
        while True:
            if not line:
                raise RuntimeError(eof_message)
            if line[0] == '!':
                break
            at1, at2, _flag = line.split()
            at1 = int(at1) - 1
            at2 = int(at2) - 1
            if nres > 1:
                # Find which residue we belong in
                for ires, first in enumerate(start_atoms):
                    if first > at1:
                        ires -= 1
                        break
                first = start_atoms[ires]
                container[ires].add_bond(at1-first, at2-first)
            else:
                templ.add_bond(at1, at2)
            line = fileobj.readline()

    # Hierarchy table -- not used, just skip
    # TODO turn this into a sanity check
    check_header(AmberOFFLibrary._sec7re, line, 'hierarchy table')
    line = skip_to_next_header()

    # Get the unit name
    check_header(AmberOFFLibrary._sec8re, line, 'unit name string')
    fileobj.readline() # Skip this... not used

    # Get the atomic positions
    check_header(AmberOFFLibrary._sec9re, fileobj.readline(),
                 'unit positions table')
    for res in container:
        for atom in res:
            x, y, z = fileobj.readline().split()
            atom.xx, atom.xy, atom.xz = float(x), float(y), float(z)

    # Get the residueconnect table
    check_header(AmberOFFLibrary._sec10re, fileobj.readline(),
                 'unit residueconnect table')
    for _ in range(nres):
        c1, c2, c3, c4, c5, c6 = [int(x) for x in fileobj.readline().split()]
        # An entry of 0 means "no atom"; without the > 0 test it would index
        # templ[-1] and could warn about a mismatch that does not exist
        if (c1 > 0 and templ.head is not None and
                templ.head is not templ[c1-1]):
            warnings.warn('HEAD atom is not connect0')
        if (c2 > 0 and templ.tail is not None and
                templ.tail is not templ[c2-1]):
            warnings.warn('TAIL atom is not connect1')
        for conn in (c3, c4, c5, c6):
            if conn == 0:
                continue
            templ.connections.append(templ[conn-1])

    # Get the residues table
    check_header(AmberOFFLibrary._sec11re, fileobj.readline(),
                 'unit residues table')
    for ires in range(nres):
        resname, resid, nxt, start, typ, img = fileobj.readline().split()
        resname = _strip_enveloping_quotes(resname)
        int(resid)  # not used; converted only to reject bad entries
        start = int(start)
        nxt = int(nxt)
        typ = _strip_enveloping_quotes(typ)
        int(img)  # not used; converted only to reject bad entries
        res = container[ires]
        if nxt - start != len(res):
            warnings.warn('residue table predicted %d, not %d atoms for '
                          'residue %s' % (nxt-start, len(res), name),
                          AmberWarning)
        if typ == 'p':
            res.type = PROTEIN
        elif typ == 'n':
            res.type = NUCLEIC
        elif typ == 'w':
            res.type = SOLVENT
        elif typ != '?':
            warnings.warn('Unknown residue type "%s"' % typ, AmberWarning)
        if nres > 1:
            res.name = resname

    # Get the residues sequence table -- one line per residue, not used
    # TODO sanity check
    check_header(AmberOFFLibrary._sec12re, fileobj.readline(),
                 'residue sequence number')
    for _ in range(nres):
        fileobj.readline()

    # Get the solventcap array and ignore its 5 entries
    check_header(AmberOFFLibrary._sec13re, fileobj.readline(),
                 'unit solventcap array')
    for _ in range(5):
        fileobj.readline()

    # Velocities
    check_header(AmberOFFLibrary._sec14re, fileobj.readline(),
                 'unit velocities table')
    for res in container:
        for atom in res:
            vx, vy, vz = [float(x) for x in fileobj.readline().split()]
            atom.vx, atom.vy, atom.vz = vx, vy, vz

    if nres > 1:
        return container
    return templ
def parse(filename, structure=False):
    """
    Parses a mol2 file (or mol3) file

    Parameters
    ----------
    filename : str or file-like
        Name of the file to parse or file-like object to parse from. A file
        opened here from a name is closed before returning; a file-like
        object passed in is left open
    structure : bool, optional
        If True, the return value is a :class:`Structure` instance. If
        False, it is either a :class:`ResidueTemplate` or
        :class:`ResidueTemplateContainer` instance, depending on whether
        there is one or more than one residue defined in it. Default is
        False

    Returns
    -------
    molecule : :class:`Structure`, :class:`ResidueTemplate`, or
               :class:`ResidueTemplateContainer`
        The molecule defined by this mol2 file

    Raises
    ------
    Mol2Error
        If the file format is not recognized or non-numeric values are
        present where integers or floating point numbers are expected. Also
        raises Mol2Error if you try to parse a mol2 file that has multiple
        @<MOLECULE> entries with ``structure=True``, if a bond refers to a
        residue that has no template, or if a HEADTAIL/RESIDUECONNECT record
        refers to a residue or atom that does not exist.
    """
    if isinstance(filename, string_types):
        f = genopen(filename, 'r')
        own_handle = True
    else:
        f = filename
        own_handle = False
    rescont = ResidueTemplateContainer()
    struct = Structure()
    restemp = ResidueTemplate()
    mol_info = []
    multires_structure = False
    try:
        section = None
        last_residue = None
        headtail = 'head'
        for line in f:
            if line.startswith('#'):
                continue
            if not line.strip() and section is None:
                continue
            if line.startswith('@<TRIPOS>'):
                section = line[9:].strip()
                if section == 'MOLECULE' and (restemp.atoms or rescont):
                    if structure:
                        raise Mol2Error('Cannot convert MOL2 with multiple '
                                        '@<MOLECULE>s to a Structure')
                    # Set the residue name from the MOL2 title if the
                    # molecule had only 1 residue and it was given a name in
                    # the title
                    if not multires_structure and mol_info and mol_info[0]:
                        restemp.name = mol_info[0]
                    multires_structure = False
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    struct = Structure()
                    last_residue = None
                    mol_info = []
                continue
            if section is None:
                raise Mol2Error('Bad mol2 file format')
            if section == 'MOLECULE':
                # Section formatted as follows:
                #   mol_name
                #   num_atoms [num_bonds [num_substr [num_feat [num_sets]]]]
                #   mol_type
                #   charge_type
                #   [status_bits]
                #   [mol_comment]
                # TODO: Do something with the name.
                if len(mol_info) == 1:
                    mol_info.append([int(x) for x in line.split()])
                elif len(mol_info) < 4:
                    # Name, mol_type and charge_type are kept as text
                    mol_info.append(line.strip())
                # Ignore the rest
                continue
            # Every section handled below is a table with one record per
            # line; a blank line carries no record. Without this, a blank
            # line would fail with IndexError on words[0] in the ATOM, BOND
            # and RESIDUECONNECT sections
            if not line.strip():
                continue
            if section == 'ATOM':
                # Section formatted as follows:
                #   atom_id -- serial number of atom
                #   atom_name -- name of the atom
                #   x -- X-coordinate of the atom
                #   y -- Y-coordinate of the atom
                #   z -- Z-coordinate of the atom
                #   atom_type -- type of the atom
                #   subst_id -- Residue serial number
                #   subst_name -- Residue name
                #   charge -- partial atomic charge
                #   status_bit -- ignored
                words = line.split()
                atom_id = int(words[0])
                atname = words[1]
                x = float(words[2])
                y = float(words[3])
                z = float(words[4])
                typ = words[5]
                # The remaining columns are optional
                resid = int(words[6]) if len(words) > 6 else 0
                resname = words[7] if len(words) > 7 else 'UNK'
                if 'NO_CHARGES' not in mol_info and len(words) > 8:
                    charge = float(words[8])
                else:
                    charge = 0
                if last_residue is None:
                    last_residue = (resid, resname)
                    restemp.name = resname
                atom = Atom(name=atname, type=typ, number=atom_id,
                            charge=charge)
                atom.xx, atom.xy, atom.xz = x, y, z
                struct.add_atom(atom, resname, resid)
                if last_residue != (resid, resname):
                    # This atom starts a new residue: store the finished
                    # template and open a new one
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    restemp.name = resname
                    last_residue = (resid, resname)
                    multires_structure = True
                try:
                    restemp.add_atom(copy.copy(atom))
                except ValueError:
                    # Allow mol2 files being parsed as a Structure to have
                    # duplicate atom names
                    if not structure:
                        raise
                continue
            if section == 'BOND':
                # Section formatted as follows:
                #   bond_id -- serial number of bond (ignored)
                #   origin_atom_id -- serial number of first atom in bond
                #   target_atom_id -- serial number of other atom in bond
                #   bond_type -- string describing bond type (ignored)
                #   status_bits -- ignored
                words = line.split()
                int(words[0]) # Bond serial number... redundant and ignored
                a1 = int(words[1])
                a2 = int(words[2])
                atom1 = struct.atoms.find_original_index(a1)
                atom2 = struct.atoms.find_original_index(a2)
                struct.bonds.append(Bond(atom1, atom2))
                # Now add it to our residue container
                # See if it's a head/tail connection
                if atom1.residue is not atom2.residue:
                    nstored = len(rescont)
                    templates = []
                    for bonded in (atom1, atom2):
                        # Residue number nstored is the template still being
                        # built; lower numbers are already in the container
                        ridx = bonded.residue.idx
                        if ridx == nstored:
                            templates.append(restemp)
                        elif ridx < nstored:
                            templates.append(rescont[ridx])
                        else:
                            raise Mol2Error('Bad bond!')
                    res1, res2 = templates
                    if res1 is res2:
                        raise Mol2Error('BAD identical residues')
                    idx1 = atom1.idx - atom1.residue[0].idx
                    idx2 = atom2.idx - atom2.residue[0].idx
                    if atom1.residue.idx < atom2.residue.idx:
                        res1.tail = res1[idx1]
                        res2.head = res2[idx2]
                    else:
                        res1.head = res1[idx1]
                        res2.tail = res2[idx2]
                elif not multires_structure:
                    if not structure:
                        restemp.add_bond(a1-1, a2-1)
                else:
                    # Same residue, add the bond
                    offset = atom1.residue[0].idx
                    if atom1.residue.idx == len(rescont):
                        res = restemp
                    else:
                        res = rescont[atom1.residue.idx]
                    res.add_bond(atom1.idx-offset, atom2.idx-offset)
                continue
            if section == 'CRYSIN':
                # Section formatted as follows:
                #   a -- length of first unit cell vector
                #   b -- length of second unit cell vector
                #   c -- length of third unit cell vector
                #   alpha -- angle b/w b and c
                #   beta -- angle b/w a and c
                #   gamma -- angle b/w a and b
                #   space group -- number of space group (ignored)
                #   space group setting -- ignored
                box = [float(w) for w in line.split()[:6]]
                if len(box) != 6:
                    # Turned into a Mol2Error by the handler at the bottom
                    raise ValueError('%d box dimensions found; needed 6' %
                                     len(box))
                struct.box = copy.copy(box)
                rescont.box = copy.copy(box)
                continue
            if section == 'SUBSTRUCTURE':
                # Section formatted as follows:
                #   subst_id -- residue number
                #   subst_name -- residue name
                #   root_atom -- first atom of residue
                #   subst_type -- ignored (usually 'RESIDUE')
                #   dict_type -- type of substructure (ignored)
                #   chain -- chain ID of residue
                #   sub_type -- type of the chain
                #   inter_bonds -- # of inter-substructure bonds
                #   status -- ignored
                #   comment -- ignored
                words = line.split()
                subst_id = int(words[0])
                resname = words[1]
                chain = words[5] if len(words) > 5 else ''
                # Set the chain ID
                for res in struct.residues:
                    if res.number == subst_id and res.name == resname:
                        res.chain = chain
                continue
            # MOL3 sections
            if section == 'HEADTAIL':
                atname, residx = line.split()
                residx = int(residx)
                if residx in (0, 1) or residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx-1]
                else:
                    raise Mol2Error('Residue out of range in head/tail')
                for tatom in res:
                    if tatom.name == atname:
                        if headtail == 'head':
                            res.head = tatom
                        else:
                            res.tail = tatom
                        break
                # Records alternate between head and tail whether or not
                # the named atom was found
                headtail = 'tail' if headtail == 'head' else 'head'
                continue
            if section == 'RESIDUECONNECT':
                words = line.split()
                residx = int(words[0])
                if residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx-1]
                else:
                    raise Mol2Error('Residue out of range in '
                                    'residueconnect')
                for conn_name in words[3:]:
                    if conn_name == '0':
                        continue
                    for tatom in res:
                        if tatom.name == conn_name:
                            res.connections.append(tatom)
                            break
                    else:
                        raise Mol2Error('Residue connection atom %s not '
                                        'found in residue %d' %
                                        (conn_name, residx))
        if structure:
            return struct
        elif len(rescont) > 0:
            if not multires_structure and mol_info and mol_info[0]:
                restemp.name = mol_info[0]
            rescont.append(restemp)
            return rescont
        else:
            return restemp
    except ValueError as e:
        raise Mol2Error('String conversion trouble: %s' % e)
    finally:
        if own_handle:
            f.close()