def conect(self):
    """Work out which coarse grained beads should be CONECTed.

    The backbone ("BB") beads are located in the bead list returned by
    ``self.cg()`` and consecutive backbone beads lying close enough together
    are collected as pairs of one-based bead numbers.

    NOTE(review): the routine looks unfinished -- the side chain loop only
    computes a bead count and nothing is returned, so the result is None.
    """
    beads = self.cg()

    # One-based numbers of the backbone beads
    backbone = [number + 1 for number, bead in enumerate(beads) if bead[0] == "BB"]

    # Pair each backbone bead with the next one; the last backbone bead is
    # paired with the number of backbone beads
    backbonePairs = list(zip(backbone, backbone[1:] + [len(backbone)]))

    # Backbone CONECTs: keep a pair only if the squared distance between the
    # two beads is consistent with binding (below 14)
    conect = []
    for first, second in backbonePairs[:-1]:
        if FUNC.distance2(beads[first - 1][4:7], beads[second - 1][4:7]) < 14:
            conect.append((first, second))

    # Side chains: so far only the number of beads in between two
    # backbone beads is determined
    for first, second in backbonePairs:
        nsc = second - first - 1
def bbGetBond(self, r, ca, ss):
    """Retrieve the backbone bond parameters for a residue.

    r  -- residue description; only r[0] (the residue name) is inspected
    ca -- for DNA/RNA the key into the bond table, for anything else a
          pair of coordinates
    ss -- not used here

    For DNA and RNA residues the entry for `ca` from the matching table is
    returned, or None if there is no entry (or the entry is false).
    For other residues a tuple (distance/10, None) is returned.
    """
    residueName = r[0]

    if residueName in MAP.dnares3:
        table = self.dnaBbBondDictC
    elif residueName in MAP.rnares3:
        # RNA is not implemented properly yet
        table = self.rnaBbBondDictC
    else:
        # Protein: take the bond length from the structure
        import FUNC
        import math
        # The 150000 forceconstant gave an error message, turning to constraints would be better.
        length = math.sqrt(FUNC.distance2(ca[0], ca[1])) / 10.
        return (length, None)

    # Nucleic acids: parameters come from the tables defined above
    if ca in table.keys():
        return table[ca] or None
    return None
def rubberBands(atomList, lowerBound, upperBound, decayFactor, decayPower, forceConstant, minimumForce):
    """Generate the elastic network (rubber band) bonds for a list of atoms.

    atomList is a list of (identifier, coordinates) tuples. It is consumed
    from the front while running, until three entries are left.
    Each atom is combined with the atoms at least three positions further
    down the list. A bond is listed if the distance is below upperBound and
    the scaled force constant exceeds minimumForce.

    Returns a list of dictionaries with the keys "atoms" and "parameters".
    """
    bonds = []
    cutoff2 = upperBound**2

    while len(atomList) > 3:
        firstId, firstPos = atomList.pop(0)
        # Skip the two atoms directly following the current one
        for secondId, secondPos in atomList[2:]:
            # Mind the nm/A conversion -- This has to be standardized! Global use of nm?
            dist2 = FUNC.distance2(firstPos, secondPos)/100
            if not dist2 < cutoff2:
                continue
            dist = math.sqrt(dist2)
            scale = decayFunction(dist, lowerBound, decayFactor, decayPower)
            if not scale*forceConstant > minimumForce:
                continue
            bonds.append({
                "atoms": (firstId, secondId),
                "parameters": (dist, "RUBBER_FC*%f" % scale),
            })

    return bonds
def rubberBands(atomList, lowerBound, upperBound, decayFactor, decayPower, forceConstant, minimumForce):
    """Generate the elastic network (rubber band) bonds for a list of atoms.

    atomList is a list of (identifier, coordinates) tuples. It is consumed
    from the front while running, until three entries are left.
    Each atom is combined with every atom further down the list. A bond is
    listed if the distance is below upperBound and the scaled force constant
    exceeds minimumForce.

    Returns a list of dictionaries with the keys "atoms" and "parameters".

    NOTE(review): since the loop stops with three atoms left, the last three
    atoms are never combined with each other -- probably a leftover from the
    variant that skipped the first two neighbours (atomList[2:]). Confirm
    before changing, as this alters the generated network.
    """
    bonds = []
    cutoff2 = upperBound**2

    while len(atomList) > 3:
        firstId, firstPos = atomList.pop(0)
        # All remaining atoms are considered, including direct neighbours
        for secondId, secondPos in atomList:
            # Mind the nm/A conversion -- This has to be standardized! Global use of nm?
            dist2 = FUNC.distance2(firstPos, secondPos)/100
            if not dist2 < cutoff2:
                continue
            dist = math.sqrt(dist2)
            scale = decayFunction(dist, lowerBound, decayFactor, decayPower)
            if not scale*forceConstant > minimumForce:
                continue
            bonds.append({
                "atoms": (firstId, secondId),
                "parameters": (dist, "RUBBER_FC*%f" % scale),
            })

    return bonds
def rubberBands(atomList, lowerBound, upperBound, decayFactor, decayPower, forceConstant, minimumForce):
    """Set up the rubber bands of an elastic network.

    atomList holds (identifier, coordinates) tuples and is emptied from the
    front in the process, down to its last three entries. Every atom is
    tested against the atoms three or more positions after it; a pair gives
    a bond when it is closer than upperBound and its scaled force constant
    is larger than minimumForce.

    The result is a list of dictionaries, each with the bonded "atoms" and
    the bond "parameters" (distance and force constant expression).
    """
    result = []
    upper2 = upperBound**2

    while len(atomList) > 3:
        idA, posA = atomList.pop(0)
        # The next two atoms in the list are left out
        for idB, posB in atomList[2:]:
            # Mind the nm/A conversion -- This has to be standardized! Global use of nm?
            separation2 = FUNC.distance2(posA, posB) / 100
            if separation2 < upper2:
                separation = math.sqrt(separation2)
                factor = decayFunction(separation, lowerBound, decayFactor, decayPower)
                strongEnough = factor * forceConstant > minimumForce
                if strongEnough:
                    bond = {}
                    bond["atoms"] = (idA, idB)
                    bond["parameters"] = (separation, "RUBBER_FC*%f" % factor)
                    result.append(bond)

    return result
def bbGetBond(self, r, ca, ss):
    """Return the backbone bond parameters as (length, force constant).

    The length is the distance between the coordinates ca[0] and ca[1],
    divided by 10; the force constant is fixed at 150000.
    The arguments r and ss are not used.
    """
    import FUNC
    import math
    # The 150000 forceconstant gave an error message, turning to constraints would be better.
    squared = FUNC.distance2(ca[0], ca[1])
    length = math.sqrt(squared) / 10.
    return (length, 150000)
def bbGetBond(self, r, ca, ss):
    """Return the backbone bond parameters as (length, None).

    The length is the distance between the coordinates ca[0] and ca[1],
    divided by 10. No force constant is given (None).
    The arguments r and ss are not used.
    """
    import FUNC
    import math
    # The 150000 forceconstant gave an error message, turning to constraints would be better.
    squared = FUNC.distance2(ca[0], ca[1])
    length = math.sqrt(squared) / 10.
    return (length, None)
def check_merge(chains, m_list=[], l_list=[], ss_cutoff=0):
    """Determine which chains are to be merged into one moleculetype.

    chains    -- list of chains; each needs an `id`, and for the link and
                 cystine checks has to support `x in chain` and chain["CYS"]
    m_list    -- explicit merges: a list of groups of chain identifiers or
                 one-based chain numbers (as strings), or containing 'all'
    l_list    -- links given as pairs (a, b); chains holding the two ends
                 of a link are merged
    ss_cutoff -- squared distance below which two SG atoms of different
                 chains cause a merge; a false value disables the check

    Returns (order, merges): `merges` is a list of lists of zero-based chain
    indices, merged groups first and followed by the unmerged chains;
    `order` is the same indices as a flat list.

    The default lists are only read, never modified.
    """
    # A real list is needed for slicing (and stays valid if range() is lazy)
    chainIndex = list(range(len(chains)))

    if 'all' in m_list:
        logging.info("All chains will be merged in a single moleculetype.")
        return chainIndex, [chainIndex]

    chainID = [chain.id for chain in chains]

    # Mark the combinations of chains that need to be merged
    merges = []
    if m_list:
        # Build a dictionary of chain IDs versus index
        # To give higher priority to top chains the lists are reversed
        # before building the dictionary
        dct = dict(zip(chainID[::-1], chainIndex[::-1]))

        # Convert chains in the merge_list to numeric, if necessary
        # NOTE The internal numbering is zero-based, while the
        # command line chain indexing is one-based.
        for group in m_list:
            members = []
            for token in group:
                if token.isdigit() and int(token):
                    members.append(int(token) - 1)
                else:
                    # Not a (non-zero) number: look it up as chain identifier
                    members.append(dct[token])
            members.sort()
            merges.append(members)

    # Rearrange merge list to a list of pairs
    pairs = [(group[j], group[k])
             for group in merges
             for j in range(len(group) - 1)
             for k in range(j + 1, len(group))]

    # Check each combination of chains for connections based on
    # ss-bridges, links and distance restraints
    for i in chainIndex[:-1]:
        for j in chainIndex[i + 1:]:
            if (i, j) in pairs:
                continue

            # Check whether any link links these two groups
            linked = False
            for a, b in l_list:
                if ((a in chains[i] and b in chains[j]) or
                        (a in chains[j] and b in chains[i])):
                    logging.info("Merging chains %d and %d to allow link %s" % (i+1, j+1, str((a, b))))
                    pairs.append((i, j))
                    linked = True
                    break
            if linked:
                continue

            # Check for cystine bridges based on distance
            if not ss_cutoff:
                continue

            # Get SG atoms from cysteines from either chain
            # Check this pair of chains
            bridged = False
            for cysA in chains[i]["CYS"]:
                for cysB in chains[j]["CYS"]:
                    sgA, sgB = cysA["SG"], cysB["SG"]
                    # A cysteine may lack its SG atom (main() filters out
                    # such false entries as well); skip it instead of failing
                    if not sgA or not sgB:
                        continue
                    d2 = FUNC.distance2(sgA[4:7], sgB[4:7])
                    if d2 <= ss_cutoff:
                        logging.info("Found SS contact linking chains %d and %d (%f nm)" % (i+1, j+1, math.sqrt(d2)/10))
                        pairs.append((i, j))
                        bridged = True
                        break
                if bridged:
                    break

    # Combine the pairs into groups of connected chains.
    # Every pair is joined with all groups it touches, so a chain can only
    # end up in one group, also if the connection between two groups is
    # encountered after both groups have been started.
    groups = []
    for a, b in sorted(pairs):
        joined = set([a, b])
        for group in [g for g in groups if a in g or b in g]:
            joined |= group
            groups.remove(group)
        groups.append(joined)

    # Sorted groups, ordered by their lowest chain index
    merges = [sorted(group) for group in groups]
    merges.sort()
    order = [j for group in merges for j in group]

    if merges:
        logging.warning("Merging chains.")
        logging.warning("This may change the order of atoms and will change the number of topology files.")
        logging.info("Merges: " + ", ".join([str([j+1 for j in i]) for i in merges]))

    if len(merges) == 1 and len(merges[0]) > 1 and set(merges[0]) == set(chainIndex):
        logging.info("All chains will be merged in a single moleculetype")

    # Determine the order for writing; merged chains go first
    unmerged = [j for j in chainIndex if j not in order]
    merges.extend([[j] for j in unmerged])
    order.extend(unmerged)

    return order, merges
def contacts(atoms, cutoff=5):
    """List the pairs of atoms that are in contact.

    For every atom the fields from index 4 onwards are taken as coordinates.
    A pair (i, j), with i < j, is listed if FUNC.distance2 for the two atoms
    is below `cutoff`.

    NOTE(review): the cutoff is compared against the value of distance2
    directly -- presumably it is meant as a squared distance; confirm.
    """
    count = len(atoms)
    coords = [atom[4:] for atom in atoms]

    found = []
    for first in range(count - 1):
        for second in range(first + 1, count):
            if FUNC.distance2(coords[first], coords[second]) < cutoff:
                found.append((first, second))
    return found
def residueDistance2(r1, r2):
    """Return the smallest FUNC.distance2 value over all combinations of an
    element from r1 with an element from r2.

    Fails with a ValueError if there are no combinations.
    """
    values = []
    for first in r1:
        for second in r2:
            values.append(FUNC.distance2(first, second))
    return min(values)
def main(options):
    """Run the coarse graining for the structure given in the options.

    The frames of the input structure (options["-f"] or stdin) are read one
    by one. For each frame the chains are set up, cleaned and given a
    secondary structure; if requested the coarse grained structure (-x) is
    written. Afterwards the index files (-n, -nmap) and the topology files
    (-o, one .itp per moleculetype and a master topology) are written, if
    requested.

    options -- the option dictionary; entries are read throughout and
               options['linkListCG'] is extended with detected cystine bridges
    """
    # Check whether to read from a gro/pdb file or from stdin
    # We use an iterator to wrap around the stream to allow
    # inferring the file type, without consuming lines already
    inStream = IO.streamTag(options["-f"] and options["-f"].value or sys.stdin)

    # The streamTag iterator first yields the file type, which
    # is used to specify the function for reading frames
    fileType = inStream.next()
    if fileType == "GRO":
        frameIterator = IO.groFrameIterator
    else:
        frameIterator = IO.pdbFrameIterator

    # ITERATE OVER FRAMES IN STRUCTURE FILE #

    # Now iterate over the frames in the stream
    # This should become a StructureFile class with a nice .next method
    model = 1
    cgOutPDB = None
    ssTotal = []
    cysteines = []
    for title, atoms, box in frameIterator(inStream):

        if fileType == "PDB":
            # The PDB file can have chains, in which case we list and process them specifically
            # TER statements are also interpreted as chain separators
            # A chain may have breaks in which case the breaking residues are flagged
            chains = [IO.Chain(options, [i for i in IO.residues(chain)])
                      for chain in IO.pdbChains(atoms)]
        else:
            # The GRO file does not define chains. Here breaks in the backbone are
            # interpreted as chain separators.
            residuelist = [residue for residue in IO.residues(atoms)]
            # The breaks are indices to residues
            broken = IO.breaks(residuelist)
            # Reorder, such that each chain is specified with (i,j,k)
            # where i and j are the start and end of the chain, and
            # k is a chain identifier
            chains = zip([0]+broken, broken+[len(residuelist)], range(len(broken)+1))
            chains = [IO.Chain(options, residuelist[i:j], name=chr(65+k)) for i, j, k in chains]

        for chain in chains:
            chain.multiscale = "all" in options['multi'] or chain.id in options['multi']

        # Check the chain identifiers
        if model == 1 and len(chains) != len(set([i.id for i in chains])):
            # Ending down here means that non-consecutive blocks of atoms in the
            # PDB file have the same chain ID. The warning pertains to PDB files only,
            # since chains from GRO files get a unique chain identifier assigned.
            logging.warning("Several chains have identical chain identifiers in the PDB file.")

        # Check if chains are of mixed type. If so, split them.
        # Note that in some cases HETATM residues are part of a
        # chain. This will get problematic. But we cannot cover
        # all, probably.
        if not options['MixedChains']:
            demixedChains = []
            for chain in chains:
                demixedChains.extend(chain.split())
            chains = demixedChains

        n = 1
        logging.info("Found %d chains:" % len(chains))
        for chain in chains:
            logging.info(" %2d: %s (%s), %d atoms in %d residues." % (n, chain.id, chain._type, chain.natoms, len(chain)))
            n += 1

        # Check all chains
        keep = []
        for chain in chains:
            if chain.type() == "Water":
                logging.info("Removing %d water molecules (chain %s)." % (len(chain), chain.id))
            elif chain.type() in ("Protein", "Nucleic"):
                keep.append(chain)
            # This is currently not active:
            elif options['RetainHETATM']:
                keep.append(chain)
            else:
                logging.info("Removing HETATM chain %s consisting of %d residues." % (chain.id, len(chain)))
        chains = keep

        # Here we interactively check the charge state of resides
        # Can be easily expanded to residues other than HIS
        for chain in chains:
            for i, resname in enumerate(chain.sequence):
                if resname == 'HIS' and options['chHIS']:
                    choices = {0: 'HIH', 1: 'HIS'}
                    choice = IO.getChargeType(resname, i, choices)
                    chain.sequence[i] = choice

        # Check which chains need merging
        # (only for the first frame; the result is reused for later frames)
        if model == 1:
            order, merge = IO.check_merge(chains,
                                          options['mergeList'],
                                          options['linkList'],
                                          options['CystineCheckBonds'] and options['CystineMaxDist2'])

        # Get the total length of the sequence
        seqlength = sum([len(chain) for chain in chains])
        logging.info('Total size of the system: %s residues.' % seqlength)

        ## SECONDARY STRUCTURE
        ss = ''
        if options['Collagen']:
            for chain in chains:
                chain.set_ss("F")
                ss += chain.ss
        elif options["-ss"]:
            # XXX We need error-catching here,
            # in case the file doesn't excist, or the string contains bogus.
            # If the string given for the sequence consists strictly of upper case letters
            # and does not appear to be a file, assume it is the secondary structure
            # (the copy with '~' and ' ' replaced is only used for this test)
            ss = options["-ss"].value.replace('~', 'L').replace(' ', 'L')
            if ss.isalnum() and ss.isupper() and not os.path.exists(options["-ss"].value):
                ss = options["-ss"].value
                logging.info('Secondary structure read from command-line:\n'+ss)
            else:
                # There ought to be a file with the name specified
                # Read it once and make sure it is closed again
                ssHandle = open(options["-ss"].value)
                try:
                    ssRaw = ssHandle.readlines()
                finally:
                    ssHandle.close()
                ssfile = [i.strip() for i in ssRaw]

                # Try to read the file as a Gromacs Secondary Structure Dump
                # Those have an integer as first line
                if ssfile[0].isdigit():
                    logging.info('Will read secondary structure from file (assuming Gromacs ssdump).')
                    ss = "".join([i for i in ssfile[1:]])
                else:
                    # Get the secondary structure type from DSSP output
                    # The unstripped lines are needed here, as the type is
                    # taken from a fixed column
                    logging.info('Will read secondary structure from file (assuming DSSP output).')
                    pss = re.compile(r"^([ 0-9]{4}[0-9]){2}")
                    ss = "".join([i[16] for i in ssRaw if re.match(pss, i)])

            # Now set the secondary structure for each of the chains
            # Each chain takes its part from the front of what is left;
            # the remainder is passed on to the next chain
            sstmp = ss
            for chain in chains:
                ln = min(len(sstmp), len(chain))
                chain.set_ss(sstmp[:ln])
                sstmp = sstmp[ln:]
        else:
            if options["-dssp"]:
                method, executable = "dssp", options["-dssp"].value
            #elif options["-pymol"]:
            #    method, executable = "pymol", options["-pymol"].value
            else:
                logging.warning("No secondary structure or determination method speficied. Protein chains will be set to 'COIL'.")
                method, executable = None, None

            for chain in chains:
                ss += chain.dss(method, executable)

            # Used to be: if method in ("dssp","pymol"): but pymol is not supported
            if method in ["dssp"]:
                logging.debug('%s determined secondary structure:\n' % method.upper()+ss)

        # Collect the secondary structure classifications for different frames
        ssTotal.append(ss)

        # Write the coarse grained structure if requested
        if options["-x"].value:
            logging.info("Writing coarse grained structure.")
            if cgOutPDB is None:
                cgOutPDB = open(options["-x"].value, "w")
            cgOutPDB.write("MODEL %8d\n" % model)
            cgOutPDB.write(title)
            cgOutPDB.write(IO.pdbBoxString(box))
            atid = 1
            for i in order:
                ci = chains[i]
                if ci.multiscale:
                    for r in ci.residues:
                        for name, resn, resi, chain, x, y, z in r:
                            cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z), i=atid))
                            atid += 1
                coarseGrained = ci.cg(com=True)
                if coarseGrained:
                    for name, resn, resi, chain, x, y, z, ssid in coarseGrained:
                        if ci.multiscale:
                            name = "v"+name
                        cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z), i=atid, ssid=ssid))
                        atid += 1
                    cgOutPDB.write("TER\n")
                else:
                    logging.warning("No mapping for coarse graining chain %s (%s); chain is skipped." % (ci.id, ci.type()))
            cgOutPDB.write("ENDMDL\n")

        # Gather cysteine sulphur coordinates
        cyslist = [cys["SG"] for chain in chains for cys in chain["CYS"]]
        cysteines.append([cys for cys in cyslist if cys])

        model += 1

    # All frames are written; close the coarse grained structure file
    if cgOutPDB is not None:
        cgOutPDB.close()

    # Write the index file if requested.
    # Mainly of interest for multiscaling.
    # Could be improved by adding separte groups for BB, SC, etc.
    if options["-n"].value:
        logging.info("Writing index file.")
        # Lists for All-atom, Virtual sites and Coarse Grain.
        NAA, NVZ, NCG = [], [], []
        atid = 1
        for i in order:
            ci = chains[i]
            coarseGrained = ci.cg(force=True)
            if ci.multiscale:
                NAA.extend([" %5d" % (a+atid) for a in range(ci.natoms)])
                atid += ci.natoms
            if coarseGrained:
                if ci.multiscale:
                    NVZ.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
                else:
                    NCG.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
                atid += len(coarseGrained)
        outNDX = open(options["-n"].value, "w")
        outNDX.write("\n[ AA ]\n"+"\n".join([" ".join(NAA[i:i+15]) for i in range(0, len(NAA), 15)]))
        outNDX.write("\n[ VZ ]\n"+"\n".join([" ".join(NVZ[i:i+15]) for i in range(0, len(NVZ), 15)]))
        outNDX.write("\n[ CG ]\n"+"\n".join([" ".join(NCG[i:i+15]) for i in range(0, len(NCG), 15)]))
        outNDX.close()

    # Write the index file for mapping AA trajectory if requested
    if options["-nmap"].value:
        logging.info("Writing trajectory index file.")
        atid = 1
        outNDX = open(options["-nmap"].value, "w")
        # Get all AA atoms as lists of atoms in residues
        # First we skip hetatoms and unknowns then iterate over beads
        # In DNA the O3' atom is mapped together with atoms from the next residue
        # This stores it until we get to the next residue
        o3_shift = ''
        for i_count, i in enumerate(IO.residues(atoms)):
            if i[0][1] in ("SOL", "HOH", "TIP"):
                continue
            if not i[0][1] in MAP.CoarseGrained.mapping.keys():
                continue
            nra = 0
            names = [j[0] for j in i]
            # This gives out a list of atoms in residue, each tuple has other
            # stuff in it that's needed elsewhere so we just take the last
            # element which is the atom index (in that residue)
            for j_count, j in enumerate(MAP.mapIndex(i)):
                outNDX.write('[ Bead %i of residue %i ]\n' % (j_count+1, i_count+1))
                line = ''
                for k in j:
                    if names[k[2]] == "O3'":
                        line += '%s ' % (str(o3_shift))
                        o3_shift = k[2]+atid
                    else:
                        line += '%i ' % (k[2]+atid)
                line += '\n'
                nra += len(j)
                outNDX.write(line)
            atid += nra
        outNDX.close()

    # Evertything below here we only need, if we need to write a Topology
    if options['-o']:

        # Collect the secondary structure stuff and decide what to do with it
        # First rearrange by the residue
        ssTotal = zip(*ssTotal)
        ssAver = []
        for i in ssTotal:
            si = list(set(i))
            if len(si) == 1:
                # Only one type -- consensus
                ssAver.append(si[0])
            else:
                # Transitions between secondary structure types
                i = list(i)
                si = [(1.0*i.count(j)/len(i), j) for j in si]
                si.sort()
                if si[-1][0] > options["-ssc"].value:
                    ssAver.append(si[-1][1])
                else:
                    ssAver.append(" ")

        ssAver = "".join(ssAver)
        logging.info('(Average) Secondary structure has been determined (see head of .itp-file).')

        # Divide the secondary structure according to the division in chains
        # This will set the secondary structure types to be used for the
        # topology.
        for chain in chains:
            chain.set_ss(ssAver[:len(chain)])
            ssAver = ssAver[len(chain):]

        # Now the chains are complete, each consisting of a residuelist,
        # and a secondary structure designation if the chain is of type 'Protein'.
        # There may be mixed chains, there may be HETATM things.
        # Water has been discarded. Maybe this has to be changed at some point.
        # The order in the coarse grained files matches the order in the set of chains.
        #
        # If there are no merges to be done, i.e. no global Elnedyn network, no
        # disulphide bridges, no links, no distance restraints and no explicit merges,
        # then we can write out the topology, which will match the coarse grained file.
        #
        # If there are merges to be done, the order of things may be changed, in which
        # case the coarse grained structure will not match with the topology...

        # CYSTINE BRIDGES #

        # Extract the cysteine coordinates (for all frames) and the cysteine identifiers
        if options['CystineCheckBonds']:
            logging.info("Checking for cystine bridges, based on sulphur (SG) atoms lying closer than %.4f nm" % math.sqrt(options['CystineMaxDist2'] / 100))

            cyscoord = zip(*[[j[4:7] for j in i] for i in cysteines])
            cysteines = [i[:4] for i in cysteines[0]]

            bl, kb = options['ForceField'].special[(("SC1", "CYS"), ("SC1", "CYS"))]

            # Check the distances and add the cysteines to the link list if the
            # SG atoms have a distance smaller than the cutoff.
            rlc = range(len(cysteines))
            for i in rlc[:-1]:
                for j in rlc[i+1:]:
                    # Checking the minimum distance over all frames
                    # But we could also take the maximum, or the mean
                    d2 = min([FUNC.distance2(a, b) for a, b in zip(cyscoord[i], cyscoord[j])])
                    if d2 <= options['CystineMaxDist2']:
                        a, b = cysteines[i], cysteines[j]
                        options['linkListCG'].append((("SC1", "CYS", a[2], a[3]), ("SC1", "CYS", b[2], b[3]), bl, kb))
                        # For the log message an offset of 32<<20 is taken off the
                        # residue number -- presumably an encoding applied when the
                        # residues were read; TODO confirm
                        a, b = (a[0], a[1], a[2]-(32 << 20), a[3]), (b[0], b[1], b[2]-(32 << 20), b[3])
                        logging.info("Detected SS bridge between %s and %s (%f nm)" % (a, b, math.sqrt(d2)/10))

        # REAL ITP STUFF #

        # Check whether we have identical chains, in which case we
        # only write the ITP for one...
        # This means making a distinction between chains and
        # moleculetypes.

        molecules = [tuple([chains[i] for i in j]) for j in merge]

        # At this point we should have a list or dictionary of chains
        # Each chain should be given a unique name, based on the value
        # of options["-o"] combined with the chain identifier and possibly
        # a number if there are chains with identical identifiers.
        # For each chain we then write an ITP file using the name for
        # moleculetype and name + ".itp" for the topology include file.
        # In addition we write a master topology file, using the value of
        # options["-o"], with an added extension ".top" if not given.

        # XXX *NOTE*: This should probably be gathered in a 'Universe' class
        itp = 0
        moleculeTypes = {}
        for mi in range(len(molecules)):
            mol = molecules[mi]
            # Check if the moleculetype is already listed
            # If not, generate the topology from the chain definition
            if mol not in moleculeTypes or options['SeparateTop']:
                # Name of the moleculetype
                # XXX: The naming should be changed; now it becomes Protein_X+Protein_Y+...
                name = "+".join([chain.getname(options['-name'].value) for chain in mol])
                moleculeTypes[mol] = name

                # Write the molecule type topology
                top = TOP.Topology(mol[0], options=options, name=name)
                for m in mol[1:]:
                    top += TOP.Topology(m, options=options)

                # Have to add the connections, like the connecting network
                # Gather coordinates
                mcg, coords = zip(*[(j[:4], j[4:7]) for m in mol for j in m.cg(force=True)])
                mcg = list(mcg)

                # Run through the link list and add connections (links = cys bridges or hand specified links)
                for atomA, atomB, bondlength, forceconst in options['linkListCG']:
                    if bondlength == -1 and forceconst == -1:
                        bondlength, forceconst = options['ForceField'].special[(atomA[:2], atomB[:2])]
                    # Check whether this link applies to this group
                    # (one-based atom number, or False if the atom is not in here)
                    atomA = atomA in mcg and mcg.index(atomA)+1
                    atomB = atomB in mcg and mcg.index(atomB)+1
                    if atomA and atomB:
                        cat = (forceconst is None) and "Constraint" or "Link"
                        top.bonds.append(TOP.Bond((atomA, atomB),
                                                  options=options,
                                                  type=1,
                                                  parameters=(bondlength, forceconst),
                                                  category=cat,
                                                  comments="Cys-bonds/special link"))

                # Elastic Network
                # The elastic network is added after the topology is constructed, since that
                # is where the correct atom list with numbering and the full set of
                # coordinates for the merged chains are available.
                if options['ElasticNetwork']:
                    rubberType = options['ForceField'].EBondType
                    rubberList = ELN.rubberBands(
                        [(i[0], j) for i, j in zip(top.atoms, coords) if i[4] in options['ElasticBeads']],
                        options['ElasticLowerBound'],
                        options['ElasticUpperBound'],
                        options['ElasticDecayFactor'],
                        options['ElasticDecayPower'],
                        options['ElasticMaximumForce'],
                        options['ElasticMinimumForce'])
                    top.bonds.extend([TOP.Bond(i, options=options, type=rubberType, category="Rubber band")
                                      for i in rubberList])

                # Write out the MoleculeType topology
                destination = options["-o"] and open(moleculeTypes[mol]+".itp", 'w') or sys.stdout
                destination.write(str(top))
                # Close the file, but leave stdout alone
                if destination is not sys.stdout:
                    destination.close()

                itp += 1

            # Check whether other chains are equal to this one
            # Skip this step if we are to write all chains to separate moleculetypes
            if not options['SeparateTop']:
                for j in range(mi+1, len(molecules)):
                    if not molecules[j] in moleculeTypes and mol == molecules[j]:
                        # Molecule j is equal to a molecule mi
                        # Set the name of the moleculetype to the one of that molecule
                        moleculeTypes[molecules[j]] = moleculeTypes[mol]

        logging.info('Written %d ITP file%s' % (itp, itp > 1 and "s" or ""))

        # WRITING THE MASTER TOPOLOGY

        # Output stream
        top = options["-o"] and open(options['-o'].value, 'w') or sys.stdout

        # ITP file listing
        itps = '\n'.join(['#include "%s.itp"' % molecule for molecule in set(moleculeTypes.values())])

        # Molecule listing
        logging.info("Output contains %d molecules:" % len(molecules))
        n = 1
        for molecule in molecules:
            chainInfo = (n, moleculeTypes[molecule], len(molecule) > 1 and "s" or " ", " ".join([i.id for i in molecule]))
            logging.info(" %2d-> %s (chain%s %s)" % chainInfo)
            n += 1
        molecules = '\n'.join(['%s \t 1' % moleculeTypes[molecule] for molecule in molecules])

        # Set a define if we are to use rubber bands
        useRubber = options['ElasticNetwork'] and "#define RUBBER_BANDS" or ""

        # XXX Specify a better, version specific base-itp name.
        # Do not set a define for position restrains here, as people are more used to do it in mdp file?
        # The topology is line based: every include, define, directive and
        # entry has to be on a line of its own, so the template is built
        # with explicit line breaks.
        topTemplate = "\n".join([
            '#include "martini.itp"',
            '',
            '%s',
            '',
            '%s',
            '',
            '[ system ]',
            '; name',
            'Martini system from %s',
            '',
            '[ molecules ]',
            '; name        number',
            '%s'])
        top.write(topTemplate % (useRubber, itps, options["-f"] and options["-f"].value or "stdin", molecules))
        # Close the file, but leave stdout alone
        if top is not sys.stdout:
            top.close()

        logging.info('Written topology files')

    # Maybe there are forcefield specific log messages?
    options['ForceField'].messages()

    # The following lines are always printed (if no errors occur).
    # Each print is given a single string, laid out as the print statement
    # did with separate items (a space after the quote and after the source).
    print("\n\tThere you are. One MARTINI. Shaken, not stirred.\n")
    Q = DOC.martiniq.pop(random.randint(0, len(DOC.martiniq) - 1))
    print("\n%s \n%80s \n" % (Q[1], "--" + Q[0]))
def main(options): # Check whether to read from a gro/pdb file or from stdin # We use an iterator to wrap around the stream to allow # inferring the file type, without consuming lines already inStream = IO.streamTag(options["-f"] and options["-f"].value or sys.stdin) # The streamTag iterator first yields the file type, which # is used to specify the function for reading frames fileType = inStream.next() if fileType == "GRO": frameIterator = IO.groFrameIterator else: frameIterator = IO.pdbFrameIterator # ITERATE OVER FRAMES IN STRUCTURE FILE # # Now iterate over the frames in the stream # This should become a StructureFile class with a nice .next method model = 1 cgOutPDB = None ssTotal = [] cysteines = [] for title, atoms, box in frameIterator(inStream): if fileType == "PDB": # The PDB file can have chains, in which case we list and process them specifically # TER statements are also interpreted as chain separators # A chain may have breaks in which case the breaking residues are flagged chains = [IO.Chain(options, [i for i in IO.residues(chain)]) for chain in IO.pdbChains(atoms)] else: # The GRO file does not define chains. Here breaks in the backbone are # interpreted as chain separators. residuelist = [residue for residue in IO.residues(atoms)] # The breaks are indices to residues broken = IO.breaks(residuelist) # Reorder, such that each chain is specified with (i,j,k) # where i and j are the start and end of the chain, and # k is a chain identifier chains = zip([0]+broken, broken+[len(residuelist)], range(len(broken)+1)) chains = [IO.Chain(options, residuelist[i:j], name=chr(65+k)) for i, j, k in chains] for chain in chains: chain.multiscale = "all" in options['multi'] or chain.id in options['multi'] # Check the chain identifiers if model == 1 and len(chains) != len(set([i.id for i in chains])): # Ending down here means that non-consecutive blocks of atoms in the # PDB file have the same chain ID. 
The warning pertains to PDB files only, # since chains from GRO files get a unique chain identifier assigned. logging.warning("Several chains have identical chain identifiers in the PDB file.") # Check if chains are of mixed type. If so, split them. # Note that in some cases HETATM residues are part of a # chain. This will get problematic. But we cannot cover # all, probably. if not options['MixedChains']: demixedChains = [] for chain in chains: demixedChains.extend(chain.split()) chains = demixedChains n = 1 logging.info("Found %d chains:" % len(chains)) for chain in chains: logging.info(" %2d: %s (%s), %d atoms in %d residues." % (n, chain.id, chain._type, chain.natoms, len(chain))) n += 1 # Check all chains keep = [] for chain in chains: if chain.type() == "Water": logging.info("Removing %d water molecules (chain %s)." % (len(chain), chain.id)) elif chain.type() in ("Protein", "Nucleic"): keep.append(chain) # This is currently not active: elif options['RetainHETATM']: keep.append(chain) else: logging.info("Removing HETATM chain %s consisting of %d residues." % (chain.id, len(chain))) chains = keep # Here we interactively check the charge state of resides # Can be easily expanded to residues other than HIS for chain in chains: for i, resname in enumerate(chain.sequence): if resname == 'HIS' and options['chHIS']: choices = {0: 'HIH', 1: 'HIS'} choice = IO.getChargeType(resname, i, choices) chain.sequence[i] = choice # Check which chains need merging if model == 1: order, merge = IO.check_merge(chains, options['mergeList'], options['linkList'], options['CystineCheckBonds'] and options['CystineMaxDist2']) # Get the total length of the sequence seqlength = sum([len(chain) for chain in chains]) logging.info('Total size of the system: %s residues.' 
% seqlength) ## SECONDARY STRUCTURE ss = '' if options['Collagen']: for chain in chains: chain.set_ss("F") ss += chain.ss elif options["-ss"]: # XXX We need error-catching here, # in case the file doesn't excist, or the string contains bogus. # If the string given for the sequence consists strictly of upper case letters # and does not appear to be a file, assume it is the secondary structure ss = options["-ss"].value.replace('~', 'L').replace(' ', 'L') if ss.isalnum() and ss.isupper() and not os.path.exists(options["-ss"].value): ss = options["-ss"].value logging.info('Secondary structure read from command-line:\n'+ss) else: # There ought to be a file with the name specified ssfile = [i.strip() for i in open(options["-ss"].value)] # Try to read the file as a Gromacs Secondary Structure Dump # Those have an integer as first line if ssfile[0].isdigit(): logging.info('Will read secondary structure from file (assuming Gromacs ssdump).') ss = "".join([i for i in ssfile[1:]]) else: # Get the secondary structure type from DSSP output logging.info('Will read secondary structure from file (assuming DSSP output).') pss = re.compile(r"^([ 0-9]{4}[0-9]){2}") ss = "".join([i[16] for i in open(options["-ss"].value) if re.match(pss, i)]) # Now set the secondary structure for each of the chains sstmp = ss for chain in chains: ln = min(len(sstmp), len(chain)) chain.set_ss(sstmp[:ln]) sstmp = ss[:ln] else: if options["-dssp"]: method, executable = "dssp", options["-dssp"].value #elif options["-pymol"]: # method, executable = "pymol", options["-pymol"].value else: logging.warning("No secondary structure or determination method speficied. 
Protein chains will be set to 'COIL'.") method, executable = None, None for chain in chains: ss += chain.dss(method, executable) # Used to be: if method in ("dssp","pymol"): but pymol is not supported if method in ["dssp"]: logging.debug('%s determined secondary structure:\n' % method.upper()+ss) # Collect the secondary structure classifications for different frames ssTotal.append(ss) # Write the coarse grained structure if requested if options["-x"].value: logging.info("Writing coarse grained structure.") if cgOutPDB is None: cgOutPDB = open(options["-x"].value, "w") cgOutPDB.write("MODEL %8d\n" % model) cgOutPDB.write(title) cgOutPDB.write(IO.pdbBoxString(box)) atid = 1 for i in order: ci = chains[i] if ci.multiscale: for r in ci.residues: for name, resn, resi, chain, x, y, z in r: cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid)) atid += 1 coarseGrained = ci.cg(com=True) if coarseGrained: for name, resn, resi, chain, x, y, z, ssid in coarseGrained: if ci.multiscale: name = "v"+name cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid,ssid=ssid)) atid += 1 cgOutPDB.write("TER\n") else: logging.warning("No mapping for coarse graining chain %s (%s); chain is skipped." % (ci.id, ci.type())) cgOutPDB.write("ENDMDL\n") # Gather cysteine sulphur coordinates cyslist = [cys["SG"] for chain in chains for cys in chain["CYS"]] cysteines.append([cys for cys in cyslist if cys]) model += 1 # Write the index file if requested. # Mainly of interest for multiscaling. # Could be improved by adding separte groups for BB, SC, etc. if options["-n"].value: logging.info("Writing index file.") # Lists for All-atom, Virtual sites and Coarse Grain. 
NAA, NVZ, NCG = [], [], [] atid = 1 for i in order: ci = chains[i] coarseGrained = ci.cg(force=True) if ci.multiscale: NAA.extend([" %5d" % (a+atid) for a in range(ci.natoms)]) atid += ci.natoms if coarseGrained: if ci.multiscale: NVZ.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))]) else: NCG.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))]) atid += len(coarseGrained) outNDX = open(options["-n"].value, "w") outNDX.write("\n[ AA ]\n"+"\n".join([" ".join(NAA[i:i+15]) for i in range(0, len(NAA), 15)])) outNDX.write("\n[ VZ ]\n"+"\n".join([" ".join(NVZ[i:i+15]) for i in range(0, len(NVZ), 15)])) outNDX.write("\n[ CG ]\n"+"\n".join([" ".join(NCG[i:i+15]) for i in range(0, len(NCG), 15)])) outNDX.close() # Write the index file for mapping AA trajectory if requested if options["-nmap"].value: logging.info("Writing trajectory index file.") atid = 1 outNDX = open(options["-nmap"].value, "w") # Get all AA atoms as lists of atoms in residues # First we skip hetatoms and unknowns then iterate over beads # In DNA the O3' atom is mapped together with atoms from the next residue # This stores it until we get to the next residue o3_shift = '' for i_count, i in enumerate(IO.residues(atoms)): if i[0][1] in ("SOL", "HOH", "TIP"): continue if not i[0][1] in MAP.CoarseGrained.mapping.keys(): continue nra = 0 names = [j[0] for j in i] # This gives out a list of atoms in residue, each tuple has other # stuff in it that's needed elsewhere so we just take the last # element which is the atom index (in that residue) for j_count, j in enumerate(MAP.mapIndex(i)): outNDX.write('[ Bead %i of residue %i ]\n' % (j_count+1, i_count+1)) line = '' for k in j: if names[k[2]] == "O3'": line += '%s ' % (str(o3_shift)) o3_shift = k[2]+atid else: line += '%i ' % (k[2]+atid) line += '\n' nra += len(j) outNDX.write(line) atid += nra # Evertything below here we only need, if we need to write a Topology if options['-o']: # Collect the secondary structure stuff and decide what to do 
with it # First rearrange by the residue ssTotal = zip(*ssTotal) ssAver = [] for i in ssTotal: si = list(set(i)) if len(si) == 1: # Only one type -- consensus ssAver.append(si[0]) else: # Transitions between secondary structure types i = list(i) si = [(1.0*i.count(j)/len(i), j) for j in si] si.sort() if si[-1][0] > options["-ssc"].value: ssAver.append(si[-1][1]) else: ssAver.append(" ") ssAver = "".join(ssAver) logging.info('(Average) Secondary structure has been determined (see head of .itp-file).') # Divide the secondary structure according to the division in chains # This will set the secondary structure types to be used for the # topology. for chain in chains: chain.set_ss(ssAver[:len(chain)]) ssAver = ssAver[len(chain):] # Now the chains are complete, each consisting of a residuelist, # and a secondary structure designation if the chain is of type 'Protein'. # There may be mixed chains, there may be HETATM things. # Water has been discarded. Maybe this has to be changed at some point. # The order in the coarse grained files matches the order in the set of chains. # # If there are no merges to be done, i.e. no global Elnedyn network, no # disulphide bridges, no links, no distance restraints and no explicit merges, # then we can write out the topology, which will match the coarse grained file. # # If there are merges to be done, the order of things may be changed, in which # case the coarse grained structure will not match with the topology... 
# CYSTINE BRIDGES # # Extract the cysteine coordinates (for all frames) and the cysteine identifiers if options['CystineCheckBonds']: logging.info("Checking for cystine bridges, based on sulphur (SG) atoms lying closer than %.4f nm" % math.sqrt(options['CystineMaxDist2']/100)) cyscoord = zip(*[[j[4:7] for j in i] for i in cysteines]) cysteines = [i[:4] for i in cysteines[0]] bl, kb = options['ForceField'].special[(("SC1", "CYS"), ("SC1", "CYS"))] # Check the distances and add the cysteines to the link list if the # SG atoms have a distance smaller than the cutoff. rlc = range(len(cysteines)) for i in rlc[:-1]: for j in rlc[i+1:]: # Checking the minimum distance over all frames # But we could also take the maximum, or the mean d2 = min([FUNC.distance2(a, b) for a, b in zip(cyscoord[i], cyscoord[j])]) if d2 <= options['CystineMaxDist2']: a, b = cysteines[i], cysteines[j] options['linkListCG'].append((("SC1", "CYS", a[2], a[3]), ("SC1", "CYS", b[2], b[3]), bl, kb)) a, b = (a[0], a[1], a[2]-(32 << 20), a[3]), (b[0], b[1], b[2]-(32 << 20), b[3]) logging.info("Detected SS bridge between %s and %s (%f nm)" % (a, b, math.sqrt(d2)/10)) # REAL ITP STUFF # # Check whether we have identical chains, in which case we # only write the ITP for one... # This means making a distinction between chains and # moleculetypes. molecules = [tuple([chains[i] for i in j]) for j in merge] # At this point we should have a list or dictionary of chains # Each chain should be given a unique name, based on the value # of options["-o"] combined with the chain identifier and possibly # a number if there are chains with identical identifiers. # For each chain we then write an ITP file using the name for # moleculetype and name + ".itp" for the topology include file. # In addition we write a master topology file, using the value of # options["-o"], with an added extension ".top" if not given. 
# XXX *NOTE*: This should probably be gathered in a 'Universe' class itp = 0 moleculeTypes = {} for mi in range(len(molecules)): mol = molecules[mi] # Check if the moleculetype is already listed # If not, generate the topology from the chain definition if mol not in moleculeTypes or options['SeparateTop']: # Name of the moleculetype # XXX: The naming should be changed; now it becomes Protein_X+Protein_Y+... name = "+".join([chain.getname(options['-name'].value) for chain in mol]) moleculeTypes[mol] = name # Write the molecule type topology top = TOP.Topology(mol[0], options=options, name=name) for m in mol[1:]: top += TOP.Topology(m, options=options) # Have to add the connections, like the connecting network # Gather coordinates mcg, coords = zip(*[(j[:4], j[4:7]) for m in mol for j in m.cg(force=True)]) mcg = list(mcg) # Run through the link list and add connections (links = cys bridges or hand specified links) for atomA, atomB, bondlength, forceconst in options['linkListCG']: if bondlength == -1 and forceconst == -1: bondlength, forceconst = options['ForceField'].special[(atomA[:2], atomB[:2])] # Check whether this link applies to this group atomA = atomA in mcg and mcg.index(atomA)+1 atomB = atomB in mcg and mcg.index(atomB)+1 if atomA and atomB: cat = (forceconst is None) and "Constraint" or "Link" top.bonds.append(TOP.Bond( (atomA, atomB), options = options, type = 1, parameters = (bondlength, forceconst), category = cat, comments = "Cys-bonds/special link")) # Elastic Network # The elastic network is added after the topology is constructed, since that # is where the correct atom list with numbering and the full set of # coordinates for the merged chains are available. 
if options['ElasticNetwork']: rubberType = options['ForceField'].EBondType rubberList = ELN.rubberBands( [(i[0], j) for i, j in zip(top.atoms, coords) if i[4] in options['ElasticBeads']], options['ElasticLowerBound'], options['ElasticUpperBound'], options['ElasticDecayFactor'], options['ElasticDecayPower'], options['ElasticMaximumForce'], options['ElasticMinimumForce']) top.bonds.extend([TOP.Bond(i, options=options, type=rubberType, category="Rubber band") for i in rubberList]) # Write out the MoleculeType topology destination = options["-o"] and open(moleculeTypes[mol]+".itp", 'w') or sys.stdout destination.write(str(top)) itp += 1 # Check whether other chains are equal to this one # Skip this step if we are to write all chains to separate moleculetypes if not options['SeparateTop']: for j in range(mi+1, len(molecules)): if not molecules[j] in moleculeTypes and mol == molecules[j]: # Molecule j is equal to a molecule mi # Set the name of the moleculetype to the one of that molecule moleculeTypes[molecules[j]] = moleculeTypes[mol] logging.info('Written %d ITP file%s' % (itp, itp > 1 and "s" or "")) # WRITING THE MASTER TOPOLOGY # Output stream top = options["-o"] and open(options['-o'].value, 'w') or sys.stdout # ITP file listing itps = '\n'.join(['#include "%s.itp"' % molecule for molecule in set(moleculeTypes.values())]) # Molecule listing logging.info("Output contains %d molecules:" % len(molecules)) n = 1 for molecule in molecules: chainInfo = (n, moleculeTypes[molecule], len(molecule) > 1 and "s" or " ", " ".join([i.id for i in molecule])) logging.info(" %2d-> %s (chain%s %s)" % chainInfo) n += 1 molecules = '\n'.join(['%s \t 1' % moleculeTypes[molecule] for molecule in molecules]) # Set a define if we are to use rubber bands useRubber = options['ElasticNetwork'] and "#define RUBBER_BANDS" or "" # XXX Specify a better, version specific base-itp name. # Do not set a define for position restrains here, as people are more used to do it in mdp file? 
top.write( '''#include "martini.itp" %s %s [ system ] ; name Martini system from %s [ molecules ] ; name number %s''' % (useRubber, itps, options["-f"] and options["-f"].value or "stdin", molecules)) logging.info('Written topology files') # Maybe there are forcefield specific log messages? options['ForceField'].messages() # The following lines are always printed (if no errors occur). print "\n\tThere you are. One MARTINI. Shaken, not stirred.\n" Q = DOC.martiniq.pop(random.randint(0, len(DOC.martiniq)-1)) print "\n", Q[1], "\n%80s" % ("--"+Q[0]), "\n"