def parse(filename, structure=False):
    """ Parses a mol2 file (or mol3) file

    Parameters
    ----------
    filename : str or file-like
        Name of the file to parse or file-like object to parse from
    structure : bool, optional
        If True, the return value is a :class:`Structure` instance. If
        False, it is either a :class:`ResidueTemplate` or
        :class:`ResidueTemplateContainer` instance, depending on whether
        there is one or more than one residue defined in it. Default is
        False

    Returns
    -------
    molecule : :class:`Structure`, :class:`ResidueTemplate`, or
               :class:`ResidueTemplateContainer`
        The molecule defined by this mol2 file

    Raises
    ------
    Mol2Error
        If the file format is not recognized or non-numeric values are
        present where integers or floating point numbers are expected. Also
        raised when an ATOM, BOND or SUBSTRUCTURE record has too few
        fields, when a bond refers to a residue that has not been read, or
        if you try to parse a mol2 file that has multiple @<MOLECULE>
        entries with ``structure=True``.

    Notes
    -----
    Lines starting with ``#`` are skipped everywhere. Blank lines are
    skipped everywhere except inside the MOLECULE section, where each line
    is positional (an empty line there is stored as an empty field).
    """
    if isinstance(filename, string_types):
        f = genopen(filename, 'r')
        own_handle = True
    else:
        f = filename
        own_handle = False
    rescont = ResidueTemplateContainer()
    struct = Structure()
    restemp = ResidueTemplate()
    mol_info = []
    multires_structure = False

    def _assign_atomic_numbers(atoms, residue):
        # Guess each element from the atom name first and fall back on the
        # atom type when the name gives no answer (a guess of 0)
        for atm in atoms:
            anum = _guess_atomic_number(atm.name, residue)
            if anum == 0:
                anum = _guess_atomic_number(atm.type, residue)
            atm.atomic_number = anum

    try:
        section = None
        last_residue = None
        headtail = 'head'
        for line in f:
            if line.startswith('#'):
                continue
            if not line.strip() and section is None:
                continue
            if line.startswith('@<TRIPOS>'):
                section = line[9:].strip()
                if section == 'MOLECULE' and (restemp.atoms or rescont):
                    if structure:
                        raise Mol2Error('Cannot convert MOL2 with multiple '
                                        '@<MOLECULE>s to a Structure')
                    # Set the residue name from the MOL2 title if the
                    # molecule had only 1 residue and it was given a name in
                    # the title. mol_info is empty if the previous molecule
                    # had no MOLECULE header lines, so guard the lookup
                    if not multires_structure and mol_info and mol_info[0]:
                        restemp.name = mol_info[0]
                    multires_structure = False
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    struct = Structure()
                    last_residue = None
                    mol_info = []
                continue
            if section is None:
                raise Mol2Error('Bad mol2 file format')
            if section == 'MOLECULE':
                # Section formatted as follows:
                #   mol_name
                #   num_atoms [num_bonds [num_substr [num_feat [num_sets]]]]
                #   mol_type
                #   charge_type
                #   [status_bits]
                #   [mol_comment]
                # TODO: Do something with the name.
                if len(mol_info) == 0:
                    mol_info.append(line.strip())
                elif len(mol_info) == 1:
                    mol_info.append([int(x) for x in line.split()])
                elif len(mol_info) == 2:
                    mol_info.append(line.strip())
                elif len(mol_info) == 3:
                    mol_info.append(line.strip())
                # Ignore the rest
                continue
            if not line.strip():
                # Outside of MOLECULE a blank line carries no record; skip
                # it instead of failing on the first missing field
                continue
            if section == 'ATOM':
                # Section formatted as follows:
                #   atom_id -- serial number of atom
                #   atom_name -- name of the atom
                #   x -- X-coordinate of the atom
                #   y -- Y-coordinate of the atom
                #   z -- Z-coordinate of the atom
                #   atom_type -- type of the atom
                #   subst_id -- Residue serial number
                #   subst_name -- Residue name
                #   charge -- partial atomic charge
                #   status_bit -- ignored
                words = line.split()
                if len(words) < 6:
                    raise Mol2Error('Too few fields in ATOM record: %s' %
                                    line.strip())
                atom_id = int(words[0])
                name = words[1]
                x = float(words[2])
                y = float(words[3])
                z = float(words[4])
                typ = words[5]
                # The remaining fields are optional
                try:
                    resid = int(words[6])
                except IndexError:
                    resid = 0
                try:
                    resname = words[7]
                except IndexError:
                    resname = 'UNK'
                if 'NO_CHARGES' not in mol_info:
                    try:
                        charge = float(words[8])
                    except IndexError:
                        charge = 0
                else:
                    charge = 0
                if last_residue is None:
                    last_residue = (resid, resname)
                    restemp.name = resname
                atom = Atom(name=name, type=typ, number=atom_id,
                            charge=charge)
                atom.xx, atom.xy, atom.xz = x, y, z
                struct.add_atom(atom, resname, resid)
                if last_residue != (resid, resname):
                    # A new residue starts here: bank the finished template
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    restemp.name = resname
                    last_residue = (resid, resname)
                    multires_structure = True
                try:
                    restemp.add_atom(copy.copy(atom))
                except ValueError:
                    # Allow mol2 files being parsed as a Structure to have
                    # duplicate atom names
                    if not structure:
                        raise
                continue
            if section == 'BOND':
                # Section formatted as follows:
                #   bond_id -- serial number of bond (ignored)
                #   origin_atom_id -- serial number of first atom in bond
                #   target_atom_id -- serial number of other atom in bond
                #   bond_type -- string describing bond type
                #   status_bits -- ignored
                words = line.split()
                if len(words) < 3:
                    raise Mol2Error('Too few fields in BOND record: %s' %
                                    line.strip())
                # Bond serial number... redundant and ignored, but it must
                # still be an integer
                int(words[0])
                a1 = int(words[1])
                a2 = int(words[2])
                try:
                    order = words[3]
                except IndexError:
                    order = 1.0
                if order in Mol2File.BOND_ORDER_MAP:
                    order = Mol2File.BOND_ORDER_MAP[order]
                else:
                    try:
                        order = float(order)
                    except ValueError:
                        warnings.warn('Mol2 bond order not recognized: %s' %
                                      order, ParameterWarning)
                        order = 1.0
                atom1 = struct.atoms.find_original_index(a1)
                atom2 = struct.atoms.find_original_index(a2)
                struct.bonds.append(Bond(atom1, atom2, order=order))
                # Now add it to our residue container
                # See if it's a head/tail connection
                if atom1.residue is not atom2.residue:
                    # A residue index equal to len(rescont) means the
                    # template still being built; a smaller one is already
                    # stored in rescont. Anything larger has not been read.
                    nres = len(rescont)
                    if atom1.residue.idx == nres:
                        res1 = restemp
                    elif atom1.residue.idx < nres:
                        res1 = rescont[atom1.residue.idx]
                    else:
                        raise Mol2Error('Bad bond!')
                    if atom2.residue.idx == nres:
                        res2 = restemp
                    elif atom2.residue.idx < nres:
                        res2 = rescont[atom2.residue.idx]
                    else:
                        raise Mol2Error('Bad bond!')
                    if res1 is res2:
                        raise Mol2Error('BAD identical residues')
                    # Position of each atom within its own residue
                    idx1 = atom1.idx - atom1.residue[0].idx
                    idx2 = atom2.idx - atom2.residue[0].idx
                    if atom1.residue.idx < atom2.residue.idx:
                        res1.tail = res1[idx1]
                        res2.head = res2[idx2]
                    else:
                        res1.head = res1[idx1]
                        res2.tail = res2[idx2]
                elif not multires_structure:
                    if not structure:
                        restemp.add_bond(a1 - 1, a2 - 1, order)
                else:
                    # Same residue, add the bond
                    offset = atom1.residue[0].idx
                    if atom1.residue.idx == len(rescont):
                        res = restemp
                    else:
                        res = rescont[atom1.residue.idx]
                    res.add_bond(atom1.idx - offset, atom2.idx - offset,
                                 order)
                continue
            if section == 'CRYSIN':
                # Section formatted as follows:
                #   a -- length of first unit cell vector
                #   b -- length of second unit cell vector
                #   c -- length of third unit cell vector
                #   alpha -- angle b/w b and c
                #   beta -- angle b/w a and c
                #   gamma -- angle b/w a and b
                #   space group -- number of space group (ignored)
                #   space group setting -- ignored
                words = line.split()
                box = [float(w) for w in words[:6]]
                if len(box) != 6:
                    # Turned into a Mol2Error by the handler at the bottom
                    raise ValueError('%d box dimensions found; needed 6' %
                                     len(box))
                struct.box = copy.copy(box)
                rescont.box = copy.copy(box)
                continue
            if section == 'SUBSTRUCTURE':
                # Section formatted as follows:
                #   subst_id -- residue number
                #   subst_name -- residue name
                #   root_atom -- first atom of residue
                #   subst_type -- ignored (usually 'RESIDUE')
                #   dict_type -- type of substructure (ignored)
                #   chain -- chain ID of residue
                #   sub_type -- type of the chain
                #   inter_bonds -- # of inter-substructure bonds
                #   status -- ignored
                #   comment -- ignored
                words = line.split()
                if len(words) < 2:
                    raise Mol2Error('Too few fields in SUBSTRUCTURE record: '
                                    '%s' % line.strip())
                subst_id = int(words[0])
                resname = words[1]
                try:
                    chain = words[5]
                except IndexError:
                    chain = ''
                # Set the chain ID
                for res in struct.residues:
                    if res.number == subst_id and res.name == resname:
                        res.chain = chain
                continue
            # MOL3 sections
            if section == 'HEADTAIL':
                # Records alternate: the first names the head atom, the next
                # the tail atom, and so on
                atname, residx = line.split()
                residx = int(residx)
                if residx in (0, 1) or residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx - 1]
                else:
                    raise Mol2Error('Residue out of range in head/tail')
                for tmpl_atom in res:
                    if tmpl_atom.name == atname:
                        if headtail == 'head':
                            res.head = tmpl_atom
                            headtail = 'tail'
                        else:
                            res.tail = tmpl_atom
                            headtail = 'head'
                        break
                else:
                    # Atom not found: nothing is assigned, but the head/tail
                    # alternation still advances
                    if headtail == 'head':
                        headtail = 'tail'
                    else:
                        headtail = 'head'
                continue
            if section == 'RESIDUECONNECT':
                words = line.split()
                residx = int(words[0])
                if residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx - 1]
                else:
                    raise Mol2Error('Residue out of range in '
                                    'residueconnect')
                # Fields from the fourth onward name connection atoms; '0'
                # marks an unused slot
                for a in words[3:]:
                    if a == '0':
                        continue
                    for tmpl_atom in res:
                        if tmpl_atom.name == a:
                            res.connections.append(tmpl_atom)
                            break
                    else:
                        raise Mol2Error('Residue connection atom %s not '
                                        'found in residue %d' % (a, residx))
        # NOTE(review): the residue handed to _guess_atomic_number is always
        # the last template, even for atoms of earlier residues -- kept
        # as-is; confirm whether that is intended
        if structure:
            _assign_atomic_numbers(struct.atoms, restemp)
            return struct
        elif len(rescont) > 0:
            if not multires_structure and mol_info and mol_info[0]:
                restemp.name = mol_info[0]
            rescont.append(restemp)
            for res in rescont:
                _assign_atomic_numbers(res.atoms, restemp)
            return rescont
        else:
            _assign_atomic_numbers(restemp.atoms, restemp)
            return restemp
    except ValueError as e:
        raise Mol2Error('String conversion trouble: %s' % e)
    finally:
        if own_handle:
            f.close()
def __init__(self, fname, seq=None):
    """ Reads atoms, coordinates and bonds from an XYZ file

    Parameters
    ----------
    fname : str or file-like
        Name of the XYZ file, or an already-open file-like object. A file
        opened here is always closed again, even when parsing fails; a
        file-like object passed in is left open for the caller.
    seq : optional
        If not None, it is passed to ``load_file`` and the residues of the
        loaded structure are assigned to the atoms read from the XYZ file.
        If None, every atom is put in a single residue named 'SYS'.

    Raises
    ------
    TinkerError
        If the first line does not start with an integer atom count
    ValueError
        If ``seq`` is given and its number of atoms differs from the atom
        count on the first line of the XYZ file
    """
    super(XyzFile, self).__init__()
    if isinstance(fname, string_types):
        fxyz = genopen(fname, 'r')
        own_handle_xyz = True
    else:
        fxyz = fname
        own_handle_xyz = False
    try:
        if seq is not None:
            seqstruct = load_file(seq)
        # Now parse the file
        try:
            natom = int(fxyz.readline().split()[0])
        except (ValueError, IndexError):
            raise TinkerError('Bad XYZ file format; first line')
        if seq is not None and natom != len(seqstruct.atoms):
            raise ValueError('Sequence file %s # of atoms does not match '
                             'the # of atoms in the XYZ file' % seq)
        words = fxyz.readline().split()
        # A 6-field line that is not an atom record holds the box; the
        # first atom record then follows on the next line
        if len(words) == 6 and not XyzFile._check_atom_record(words):
            self.box = [float(w) for w in words]
            words = fxyz.readline().split()
        if seq is not None:
            residue = seqstruct.residues[0]
        else:
            residue = Residue('SYS')
            residue.number = 1
            residue._idx = 0
        # bond_ids[k] holds the 1-based partner indices listed on the
        # record of the k-th atom
        bond_ids = []

        def add_record(fields, res):
            # Build one atom from a split atom record and add it to res
            if seq is not None:
                atomic_number = _guess_atomic_number(fields[1], res)
            else:
                atomic_number = AtomicNum[element_by_name(fields[1])]
            atom = Atom(atomic_number=atomic_number, name=fields[1],
                        type=fields[5])
            atom.xx, atom.xy, atom.xz = [float(w) for w in fields[2:5]]
            self.add_atom(atom, res.name, res.number, res.chain,
                          res.insertion_code, res.segid)
            bond_ids.append([int(w) for w in fields[6:]])

        add_record(words, residue)
        for line in fxyz:
            words = line.split()
            if not words:
                # Blank (e.g. trailing) lines hold no atom record
                continue
            if seq is not None:
                # len(bond_ids) is the 0-based index of the atom about to
                # be added, so blank lines cannot shift the lookup
                residue = seqstruct.atoms[len(bond_ids)].residue
            add_record(words, residue)
        # All of the bonds are stored now -- go ahead and make them now
        for atom, bonds in zip(self.atoms, bond_ids):
            i = atom.idx + 1
            for idx in bonds:
                # Each bond is listed by both partners; only create it from
                # the lower-numbered atom so it is added once
                if idx > i:
                    self.bonds.append(Bond(atom, self.atoms[idx - 1]))
        if seq is None:
            # Try to improve atomic number prediction for monoatomic
            # species (like ions) if no sequence was loaded
            for atom in self.atoms:
                if len(atom.bonds) == 0:  # not bonded to anybody else
                    atom.atomic_number = _guess_atomic_number(atom.name)
    finally:
        if own_handle_xyz:
            fxyz.close()
def parse(filename, structure=False):
    """ Parses a mol2 file (or mol3) file

    Parameters
    ----------
    filename : str or file-like
        Name of the file to parse or file-like object to parse from
    structure : bool, optional
        If True, the return value is a :class:`Structure` instance. If
        False, it is either a :class:`ResidueTemplate` or
        :class:`ResidueTemplateContainer` instance, depending on whether
        there is one or more than one residue defined in it. Default is
        False

    Returns
    -------
    molecule : :class:`Structure`, :class:`ResidueTemplate`, or
               :class:`ResidueTemplateContainer`
        The molecule defined by this mol2 file

    Raises
    ------
    Mol2Error
        If the file format is not recognized or non-numeric values are
        present where integers or floating point numbers are expected. Also
        raised when an ATOM, BOND or SUBSTRUCTURE record has too few
        fields, when a bond refers to a residue that has not been read, or
        if you try to parse a mol2 file that has multiple @<MOLECULE>
        entries with ``structure=True``.

    Notes
    -----
    Lines starting with ``#`` are skipped everywhere. Blank lines are
    skipped everywhere except inside the MOLECULE section, where each line
    is positional (an empty line there is stored as an empty field).
    """
    if isinstance(filename, string_types):
        f = genopen(filename, 'r')
        own_handle = True
    else:
        f = filename
        own_handle = False
    rescont = ResidueTemplateContainer()
    struct = Structure()
    restemp = ResidueTemplate()
    mol_info = []
    multires_structure = False

    def _assign_atomic_numbers(atoms, residue):
        # Guess each element from the atom name first and fall back on the
        # atom type when the name gives no answer (a guess of 0)
        for atm in atoms:
            anum = _guess_atomic_number(atm.name, residue)
            if anum == 0:
                anum = _guess_atomic_number(atm.type, residue)
            atm.atomic_number = anum

    try:
        section = None
        last_residue = None
        headtail = 'head'
        for line in f:
            if line.startswith('#'):
                continue
            if not line.strip() and section is None:
                continue
            if line.startswith('@<TRIPOS>'):
                section = line[9:].strip()
                if section == 'MOLECULE' and (restemp.atoms or rescont):
                    if structure:
                        raise Mol2Error('Cannot convert MOL2 with multiple '
                                        '@<MOLECULE>s to a Structure')
                    # Set the residue name from the MOL2 title if the
                    # molecule had only 1 residue and it was given a name in
                    # the title. mol_info is empty if the previous molecule
                    # had no MOLECULE header lines, so guard the lookup
                    if not multires_structure and mol_info and mol_info[0]:
                        restemp.name = mol_info[0]
                    multires_structure = False
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    struct = Structure()
                    last_residue = None
                    mol_info = []
                continue
            if section is None:
                raise Mol2Error('Bad mol2 file format')
            if section == 'MOLECULE':
                # Section formatted as follows:
                #   mol_name
                #   num_atoms [num_bonds [num_substr [num_feat [num_sets]]]]
                #   mol_type
                #   charge_type
                #   [status_bits]
                #   [mol_comment]
                # TODO: Do something with the name.
                if len(mol_info) == 0:
                    mol_info.append(line.strip())
                elif len(mol_info) == 1:
                    mol_info.append([int(x) for x in line.split()])
                elif len(mol_info) == 2:
                    mol_info.append(line.strip())
                elif len(mol_info) == 3:
                    mol_info.append(line.strip())
                # Ignore the rest
                continue
            if not line.strip():
                # Outside of MOLECULE a blank line carries no record; skip
                # it instead of failing on the first missing field
                continue
            if section == 'ATOM':
                # Section formatted as follows:
                #   atom_id -- serial number of atom
                #   atom_name -- name of the atom
                #   x -- X-coordinate of the atom
                #   y -- Y-coordinate of the atom
                #   z -- Z-coordinate of the atom
                #   atom_type -- type of the atom
                #   subst_id -- Residue serial number
                #   subst_name -- Residue name
                #   charge -- partial atomic charge
                #   status_bit -- ignored
                words = line.split()
                if len(words) < 6:
                    raise Mol2Error('Too few fields in ATOM record: %s' %
                                    line.strip())
                atom_id = int(words[0])
                name = words[1]
                x = float(words[2])
                y = float(words[3])
                z = float(words[4])
                typ = words[5]
                # The remaining fields are optional
                try:
                    resid = int(words[6])
                except IndexError:
                    resid = 0
                try:
                    resname = words[7]
                except IndexError:
                    resname = 'UNK'
                if 'NO_CHARGES' not in mol_info:
                    try:
                        charge = float(words[8])
                    except IndexError:
                        charge = 0
                else:
                    charge = 0
                if last_residue is None:
                    last_residue = (resid, resname)
                    restemp.name = resname
                atom = Atom(name=name, type=typ, number=atom_id,
                            charge=charge)
                atom.xx, atom.xy, atom.xz = x, y, z
                struct.add_atom(atom, resname, resid)
                if last_residue != (resid, resname):
                    # A new residue starts here: bank the finished template
                    rescont.append(restemp)
                    restemp = ResidueTemplate()
                    restemp.name = resname
                    last_residue = (resid, resname)
                    multires_structure = True
                try:
                    restemp.add_atom(copy.copy(atom))
                except ValueError:
                    # Allow mol2 files being parsed as a Structure to have
                    # duplicate atom names
                    if not structure:
                        raise
                continue
            if section == 'BOND':
                # Section formatted as follows:
                #   bond_id -- serial number of bond (ignored)
                #   origin_atom_id -- serial number of first atom in bond
                #   target_atom_id -- serial number of other atom in bond
                #   bond_type -- string describing bond type
                #   status_bits -- ignored
                words = line.split()
                if len(words) < 3:
                    raise Mol2Error('Too few fields in BOND record: %s' %
                                    line.strip())
                # Bond serial number... redundant and ignored, but it must
                # still be an integer
                int(words[0])
                a1 = int(words[1])
                a2 = int(words[2])
                try:
                    order = words[3]
                except IndexError:
                    order = 1.0
                if order in Mol2File.BOND_ORDER_MAP:
                    order = Mol2File.BOND_ORDER_MAP[order]
                else:
                    try:
                        order = float(order)
                    except ValueError:
                        warnings.warn('Mol2 bond order not recognized: %s' %
                                      order, ParameterWarning)
                        order = 1.0
                atom1 = struct.atoms.find_original_index(a1)
                atom2 = struct.atoms.find_original_index(a2)
                struct.bonds.append(Bond(atom1, atom2, order=order))
                # Now add it to our residue container
                # See if it's a head/tail connection
                if atom1.residue is not atom2.residue:
                    # A residue index equal to len(rescont) means the
                    # template still being built; a smaller one is already
                    # stored in rescont. Anything larger has not been read.
                    nres = len(rescont)
                    if atom1.residue.idx == nres:
                        res1 = restemp
                    elif atom1.residue.idx < nres:
                        res1 = rescont[atom1.residue.idx]
                    else:
                        raise Mol2Error('Bad bond!')
                    if atom2.residue.idx == nres:
                        res2 = restemp
                    elif atom2.residue.idx < nres:
                        res2 = rescont[atom2.residue.idx]
                    else:
                        raise Mol2Error('Bad bond!')
                    if res1 is res2:
                        raise Mol2Error('BAD identical residues')
                    # Position of each atom within its own residue
                    idx1 = atom1.idx - atom1.residue[0].idx
                    idx2 = atom2.idx - atom2.residue[0].idx
                    if atom1.residue.idx < atom2.residue.idx:
                        res1.tail = res1[idx1]
                        res2.head = res2[idx2]
                    else:
                        res1.head = res1[idx1]
                        res2.tail = res2[idx2]
                elif not multires_structure:
                    if not structure:
                        restemp.add_bond(a1 - 1, a2 - 1, order)
                else:
                    # Same residue, add the bond
                    offset = atom1.residue[0].idx
                    if atom1.residue.idx == len(rescont):
                        res = restemp
                    else:
                        res = rescont[atom1.residue.idx]
                    res.add_bond(atom1.idx - offset, atom2.idx - offset,
                                 order)
                continue
            if section == 'CRYSIN':
                # Section formatted as follows:
                #   a -- length of first unit cell vector
                #   b -- length of second unit cell vector
                #   c -- length of third unit cell vector
                #   alpha -- angle b/w b and c
                #   beta -- angle b/w a and c
                #   gamma -- angle b/w a and b
                #   space group -- number of space group (ignored)
                #   space group setting -- ignored
                words = line.split()
                box = [float(w) for w in words[:6]]
                if len(box) != 6:
                    # Turned into a Mol2Error by the handler at the bottom
                    raise ValueError('%d box dimensions found; needed 6' %
                                     len(box))
                struct.box = copy.copy(box)
                rescont.box = copy.copy(box)
                continue
            if section == 'SUBSTRUCTURE':
                # Section formatted as follows:
                #   subst_id -- residue number
                #   subst_name -- residue name
                #   root_atom -- first atom of residue
                #   subst_type -- ignored (usually 'RESIDUE')
                #   dict_type -- type of substructure (ignored)
                #   chain -- chain ID of residue
                #   sub_type -- type of the chain
                #   inter_bonds -- # of inter-substructure bonds
                #   status -- ignored
                #   comment -- ignored
                words = line.split()
                if len(words) < 2:
                    raise Mol2Error('Too few fields in SUBSTRUCTURE record: '
                                    '%s' % line.strip())
                subst_id = int(words[0])
                resname = words[1]
                try:
                    chain = words[5]
                except IndexError:
                    chain = ''
                # Set the chain ID
                for res in struct.residues:
                    if res.number == subst_id and res.name == resname:
                        res.chain = chain
                continue
            # MOL3 sections
            if section == 'HEADTAIL':
                # Records alternate: the first names the head atom, the next
                # the tail atom, and so on
                atname, residx = line.split()
                residx = int(residx)
                if residx in (0, 1) or residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx - 1]
                else:
                    raise Mol2Error('Residue out of range in head/tail')
                for tmpl_atom in res:
                    if tmpl_atom.name == atname:
                        if headtail == 'head':
                            res.head = tmpl_atom
                            headtail = 'tail'
                        else:
                            res.tail = tmpl_atom
                            headtail = 'head'
                        break
                else:
                    # Atom not found: nothing is assigned, but the head/tail
                    # alternation still advances
                    if headtail == 'head':
                        headtail = 'tail'
                    else:
                        headtail = 'head'
                continue
            if section == 'RESIDUECONNECT':
                words = line.split()
                residx = int(words[0])
                if residx - 1 == len(rescont):
                    res = restemp
                elif residx - 1 < len(rescont):
                    res = rescont[residx - 1]
                else:
                    raise Mol2Error('Residue out of range in '
                                    'residueconnect')
                # Fields from the fourth onward name connection atoms; '0'
                # marks an unused slot
                for a in words[3:]:
                    if a == '0':
                        continue
                    for tmpl_atom in res:
                        if tmpl_atom.name == a:
                            res.connections.append(tmpl_atom)
                            break
                    else:
                        raise Mol2Error('Residue connection atom %s not '
                                        'found in residue %d' % (a, residx))
        # NOTE(review): the residue handed to _guess_atomic_number is always
        # the last template, even for atoms of earlier residues -- kept
        # as-is; confirm whether that is intended
        if structure:
            _assign_atomic_numbers(struct.atoms, restemp)
            return struct
        elif len(rescont) > 0:
            if not multires_structure and mol_info and mol_info[0]:
                restemp.name = mol_info[0]
            rescont.append(restemp)
            for res in rescont:
                _assign_atomic_numbers(res.atoms, restemp)
            return rescont
        else:
            _assign_atomic_numbers(restemp.atoms, restemp)
            return restemp
    except ValueError as e:
        raise Mol2Error('String conversion trouble: %s' % e)
    finally:
        if own_handle:
            f.close()