def audit(debug=False,source='calcs/auditor.py'):
    """
    Command-line entry point for the calculation auditor.
    Loads the auditor code found at `source` and instantiates its `CalcsAuditor` with the `debug` flag.
    Raises an Exception when `source` is not a file.
    """
    #---guard clause: there is nothing to run without the auditor source
    if not os.path.isfile(source):
        raise Exception('requires source code at %s'%source)
    #---deferred import which is only needed once the source is known to exist
    from makeface import import_remote
    remote_namespace = import_remote(source)
    auditor_class = remote_namespace['CalcsAuditor']
    auditor_class(debug=debug)
def interpret_docker_instructions(config, mods=None):
    """
    Load a docker configuration file and return the instructions it generates.
    The file at `config` is imported remotely and its `interpreter` function is called with `mods`.
    Raises an Exception if the file is named "config.py" or cannot be found.
    """
    forbidden_name = 'config.py'
    if os.path.basename(config) == forbidden_name:
        raise Exception(
            'you cannot call the config file "config.py" or we have an import failure')
    # import_remote wraps exec and discards builtins
    from makeface import import_remote
    config_exists = os.path.isfile(config)
    if not config_exists:
        raise Exception('cannot find %s' % config)
    config_path = os.path.join('./', config)
    namespace = import_remote(config_path)
    interpreter = namespace['interpreter']
    # validators go here
    return interpreter(mods=mods)
def get_lipid_resnames():
    """
    Brief utility for getting the lipid names from automacs.
    Imports the automacs code at `amx/amx`, builds a Landscape rooted at `amx/`, and returns whatever
    `objects_by_category('lipid')` provides.
    Raises an Exception if the automacs code cannot be imported.
    """
    import makeface
    #---get an automacs landscape
    #---! DEV. needs a clone and make to work
    try:
        mod = makeface.import_remote('amx/amx')
    #---catch ordinary failures only so that KeyboardInterrupt and SystemExit still propagate
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`')
    mod['state'].force_field = 'charmm'
    land = mod['Landscape'](cwd='amx/')
    return land.objects_by_category('lipid')
def _contacts_heavy_selection(resnames):
    """Build a selection string matching the given residue names while excluding atoms named H*."""
    return '(%s) and not name H*'%' or '.join(['resname %s'%i for i in resnames])

def contacts(grofile,trajfile,**kwargs):
    """
    Identify, catalog, and count contacts in a simulation.
    Note that this code was developed to mimic the data structures used by hydrogen bonding and salt
    bridging codes, and stores the various contacts up to a very high level of specificity (i.e. the
    specific residue and atom names).

    Arguments:
        grofile, trajfile : structure and trajectory passed to MDAnalysis.Universe
    Keyword arguments:
        sn, workspace, calc : required (a KeyError is raised if any is missing)
        debug : run a single frame, drop into ipdb, then exit (default False)
        run_parallel : use joblib for the frame loop (default True)
    Reads `cutoff` (required), `subject` and `object` from calc['specs'] and requires `force_field`
    in the workspace variables.
    Returns the tuple (result,attrs) where result is a dictionary of arrays and attrs is empty.
    Side effects: sets the module-level names vecs, coords_subj, coords_targ, bonds, obs and bonds_red.
    """
    #---module-level names which are set by this function
    #---NOTE(review): presumably these are read by the framewise and counting functions. confirm
    global vecs,coords_subj,coords_targ,bonds,obs,bonds_red

    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug',False)
    run_parallel = kwargs.get('run_parallel',True)

    #---settings
    lenscale = 10.0
    #---distance cutoff stays in angstroms until the compute function
    distance_cutoff = calc['specs']['cutoff']
    subject_selection = calc['specs'].get('subject','protein')
    object_flag = calc['specs'].get('object','lipid')

    #---prepare universe
    uni = MDAnalysis.Universe(grofile,trajfile)
    nframes = len(uni.trajectory)
    start_job_time = time.time()

    #---save topology for later: residue names in order of first appearance, with their counts
    _,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    resnames_master = np.array(resnames)
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    try:
        mod = makeface.import_remote('amx/amx')
    #---catch ordinary failures only so that KeyboardInterrupt and SystemExit still propagate
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.')
    ff_name = work.vars.get('force_field',None)
    if not ff_name:
        raise Exception('we must be very careful with the residue naming. '
            'you must add `force_field` to the `variables` dictionary in your metadata to continue.')
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/',ff=ff_name)

    #---get the subject of the calculation, the thing we wish to study the contacts of
    #---...typically the protein
    #---! need to add resid redundancy checks possibly
    if subject_selection=='lipids':
        #---typically the subject selection goes right to MDAnalysis but this routine allows for
        #---...lipid-lipid contacts which can be filtered for salt bridges. note that this feature
        #---...is solely for ptdins
        subject_selection = _contacts_heavy_selection(land.objects_by_category('lipid'))
    subject = uni.select_atoms(subject_selection)

    #---get the objects. hydrogen contacts are explicitly ignored in both cases
    if object_flag=='lipid':
        #---objects from the landscape returns resnames
        target_resnames = land.objects_by_category('lipid')
    elif object_flag=='protein':
        #---NOTE(review): this takes every residue name found on the universe, not only protein
        #---...residues, and relies on `uni.resnames` being available. confirm that this is intended
        target_resnames = np.unique(uni.resnames)
    else:
        raise Exception('not set up for object %s'%object_flag)
    targets = uni.select_atoms(_contacts_heavy_selection(target_resnames))

    #---prepare coordinates for each frame
    st = time.time()
    vecs,coords_subj,coords_targ,times = [],[],[],[]
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)
        uni.trajectory[fr]
        times.append(uni.trajectory.time)
        vecs.append(uni.dimensions[:3]/lenscale)
        coords_subj.append(subject.positions/lenscale)
        coords_targ.append(targets.positions/lenscale)
    status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

    #---debug a single frame and then stop
    compute_function = contacts_framewise
    if debug:
        fr = 50
        incoming = compute_function(fr,distance_cutoff=distance_cutoff,debug=True)
        import ipdb;ipdb.set_trace()
        #---sys.quit does not exist so we use sys.exit to stop after debugging
        sys.exit()

    #---compute loop
    start = time.time()
    out_args = {'distance_cutoff':distance_cutoff}
    if run_parallel:
        incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
            delayed(compute_function,has_shareable_memory)(fr,**out_args)
            for fr in framelooper(nframes,start=start))
    else:
        incoming = [compute_function(fr,**out_args) for fr in framelooper(nframes)]

    #---chompdown
    #---get valid frames, namely those with at least one observation
    valid_frames = np.where([len(i['subjects'])>0 for i in incoming])[0]
    obs_by_frames = np.array([len(incoming[i]['subjects']) for i in valid_frames]).astype(int)
    #---concatenate the subject/target indices across all frames
    subject_cat = np.concatenate([incoming[i]['subjects'] for i in valid_frames]).astype(int)
    target_cat = np.concatenate([incoming[i]['targets'] for i in valid_frames]).astype(int)
    start_time = time.time()
    #---tabulate each bond observation
    tabulation = np.transpose((
        subject.resnames[subject_cat],subject.resids[subject_cat],subject.names[subject_cat],
        targets.resnames[target_cat],targets.resids[target_cat],targets.names[target_cat],))
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    idx,counts = uniquify(tabulation.astype(str))
    bonds_catalog = tabulation[idx]

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames),len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = dict([(tuple(b),bb) for bb,b in enumerate(bonds_catalog)])
    #---the observations for each valid frame occupy a contiguous slice of the tabulation
    frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
    for fr,(lower,upper) in enumerate(zip(frame_lims[:-1],frame_lims[1:])):
        status('counting observations per frame',i=fr,looplen=len(valid_frames),
            tag='compute',start=start_time)
        obs_this = tabulation[lower:upper]
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs_this])] += 1
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    status('done heavy lifting',tag='compute')
    #---note the size of the outgoing data. we could shrink this by discarding atom names
    status('observation array for cutoff %.1f is %.1fMB'%(
        distance_cutoff,sys.getsizeof(counts_per_frame)/10**6.),tag='note')

    #---package the dataset
    result,attrs = {},{}
    #---everything is indexed by idx
    result['bonds'] = bonds_catalog
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['subject_residues_resnames'] = subject.residues.resnames
    result['targets_residues_resnames'] = targets.residues.resnames
    result['subject_residues_resids'] = subject.residues.resids
    result['nmols'] = rescounts
    result['times'] = np.array(times)

    #---some basic post-processing common to many of the plots
    bonds,obs = bonds_catalog,counts_per_frame
    #---post: generate timewise trajectories for the number of contacts between protein residues
    #---...and lipids
    #---methodology note: in a basic version of this calculation we simply count all of the bonds
    #---...between any lipid-protein residue pair. this means that being more close to a lipid might
    #---...result in more contacts and hence generates a higher score. hence we have two versions of
    #---...the calculation. one counts the total number of contacts, and the other discards atom
    #---...information and scores contacts with a maximum of one per protein residue-lipid pair.
    #---...this calculation does both
    #---! need to check for atom-name resolution otherwise this is moot
    resids = result['subject_residues_resids']
    lipid_resnames = np.unique(bonds[:,rowspec.index('target_resname')])
    resname_combos = [(r,np.array([r])) for r in lipid_resnames]+[
        ('all lipids',np.array(lipid_resnames))]
    #---compute loop over every combination of subject resid and set of lipid resnames
    looper = [{'resid':resid,'resname_set':resname_set}
        for resid in resids for resname_name,resname_set in resname_combos]
    incoming = basic_compute_loop(count_reduced_contact,looper,run_parallel=run_parallel)
    #---package this as a list of resid/resname pairs and the counts for them
    result['pairs_resid_resname'] = np.array([(resid,resname_name)
        for resid in resids for resname_name,resname_set in resname_combos]).astype(str)
    result['counts_resid_resname'] = np.array(incoming)
    #---reduce the data for the modified count described above by dropping the atom name columns
    bonds_red = bonds[:,np.array([0,1,3,4])]
    incoming = basic_compute_loop(count_reduced_contact_reduced,looper,run_parallel=run_parallel)
    result['counts_resid_resname_singleton'] = np.array(incoming)

    status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
    return result,attrs
def hydrogen_bonding(grofile, trajfile, **kwargs):
    """
    Generic hydrogen bonding code.
    Revamped on 2017.4.28 to generate a more uniform data structure.

    Arguments:
        grofile, trajfile : structure and trajectory passed to MDAnalysis.Universe
    Keyword arguments:
        sn, workspace, calc : required (a KeyError is raised if any is missing)
        debug : compute a single frame and then exit (default False)
        run_parallel : use joblib for the frame loop (default True)
    Reads `distance_cutoff` and `angle_cutoff` (required) and `protein` (optional) from calc['specs'].
    Returns the tuple (result, attrs) where result is a dictionary of arrays and attrs is empty.
    Side effects: exports the cached coordinates and the bond reference to `codes.hbonds_framewise`.
    """
    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug', False)
    run_parallel = kwargs.get('run_parallel', True)

    #---prototyping an external module import to replace the sometimes tedious addition of
    #---... many metadata to the meta dictionary in your YAML files.
    protein_itp_loader = work.vars.get('protein_itp_loader', None)
    #---default ITP paths are set for each simulation in the metadata
    if protein_itp_loader is None:
        def protein_itp_loader(sn, **kwargs):
            itp = work.meta.get(sn, {}).get('protein_itp', None)
            if not itp:
                raise Exception(
                    'cannot find protein_itp in meta dictionary for %s ' % sn +
                    'note that you can also use the protein_itp_loader functionality to get the ITP file')
            #---the loader must hand the path back to the caller
            return itp
    #---custom ITP loader specified as an alternate_module
    else:
        protein_itp_loader = alternate_module(**protein_itp_loader)

    #---settings
    distance_cutoff, angle_cutoff = [calc['specs'][i] for i in ['distance_cutoff', 'angle_cutoff']]
    #---cutoff for inferring hydrogens from a one-time distance search
    distance_h_cutoff = distance_cutoff

    #---prepare universe
    uni = MDAnalysis.Universe(grofile, trajfile)
    nframes = len(uni.trajectory)
    lenscale = 10.
    start_job_time = time.time()

    #---save topology for later: residue names in order of first appearance, with their counts
    _, idx, counts = np.unique(uni.residues.resnames, return_index=True, return_counts=True)
    resnames_master = np.array(uni.residues.resnames[np.sort(idx)])
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    try:
        mod = makeface.import_remote('amx/amx')
    #---catch ordinary failures only so that KeyboardInterrupt and SystemExit still propagate
    except Exception:
        raise Exception(
            'please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.')
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/')
    #---use the landscape to get hydrogen bond donors and acceptors for lipids
    hydrogen_bond_ref = {}
    #---this section relies on correct definitions from the Landscape
    targets = land.objects_by_category('lipid')
    #---METHODOLOGY NOTE: we catalog all hydrogen bonding opportunities ONLY BY NAME
    #---loop over lipid targets and scan them for hydrogen bond opportunities
    for resname in targets:
        #---each lipid ITP has an identical molecule with the same (residue) name
        itp = mod['GMXTopology'](land.objects[resname]['fn'])
        #---donor names come from a double-regex match over bonds
        donor_names = itp.get_bonds_by_regex(molname=resname, patterns=['^H', '^(N|O|S)'])
        #---acceptor names have a single regex
        #---!!! check that this is the correct definition
        acceptor_names = [i['atom'] for i in itp.molecules[resname]['atoms']
            if re.match('^(N|O|S)', i['atom'])]
        hydrogen_bond_ref[resname] = {'acceptors': acceptor_names, 'donors': donor_names}

    #---include any proteins as participants in the bonding
    if calc['specs'].get('protein', False):
        #---get the protein ITP from metadata and only consult the loader when it is absent there
        itp_fn = work.meta[sn].get('protein_itp', None)
        if not itp_fn:
            itp_fn = protein_itp_loader(sn, work=work)
        if not itp_fn:
            raise Exception('add protein_itp to the meta for %s' % sn)
        #---the ITP entry is a path and it is never evaluated as python code
        if os.path.isfile(itp_fn):
            itp_fn_abs = itp_fn
        #---if the path does not resolve directly then we consult the spots
        else:
            rootdir = work.raw.spots[(work.raw.spotname_lookup(sn), 'structure')]['rootdir']
            sn_dir = os.path.join(rootdir, sn)
            #---user supplies step folder and path to the reference structure
            itp_fn_abs = os.path.join(sn_dir, itp_fn)
        protein_itp = mod['GMXTopology'](itp_fn_abs)
        for molname in protein_itp.molecules:
            #---mimic the procedure above for lipids
            #---donor names come from a double-regex match over bonds
            donor_resnames_names = protein_itp.get_bonds_by_regex(
                molname=molname, patterns=['^H', '^(N|O|S)'], include_resname=True)
            #---organize hydrogen bonds by residue name
            #---note that zip is wrapped in list because it cannot be subscripted in python 3
            resnames_all = list(set([i for j in list(zip(*donor_resnames_names))[0] for i in j]))
            for resname_focus in resnames_all:
                donor_list = []
                #---loop over resnames within the protein
                for pair_resnames, pair_names in donor_resnames_names:
                    if pair_resnames[0] != pair_resnames[1]:
                        raise Exception('invalid hydrogen bond spec %s,%s' % (pair_resnames, pair_names))
                    elif pair_resnames[0] == resname_focus:
                        donor_list.append(pair_names)
                #---acceptor names have a single regex
                #---!!! check that this is the correct definition
                acceptor_names = list(set([
                    i['atom'] for i in protein_itp.molecules[molname]['atoms']
                    if re.match('^(N|O|S)', i['atom']) and i['resname'] == resname_focus]))
                hydrogen_bond_ref[(molname, resname_focus)] = {
                    'acceptors': acceptor_names, 'donors': donor_list}

    #---developing a new method for selecting the atoms correctly
    #---we need to get all possible donors into a big selection after which case the
    #---...hbonds.hbonder_framewise does the rest and the plotting codes are decent at picking out
    #---...inter-residue bonds and identities
    #---consider the customer: hbonds_framewise needs a list of donors, hydrogens, and acceptors
    #---since the donors and hydrogens must be related by bonds there is some redundancy in the
    #---...donor list which redundancy hbonds_framewise accounts for
    #---construct the donors list: start with the list of all hydrogens (somewhat counterintuitive),
    #---...consult the hydrogen_bond_ref and loop over all residues and then grow a list of hydrogen
    #---...indices. for each hydrogen find the associated heavy atom and add both indices to
    #---...separate lists. the net result is two lists of indices over the hydrogens and donors
    #---...which constitute a bond

    def resname_from_key(refkey):
        """Lipid keys are the resname itself while protein keys are (molname, resname) pairs."""
        return refkey if type(refkey) in str_types else refkey[1]

    def donor_selection(column):
        """Selection string over one column of the donor pairs for every residue with donors."""
        return ' or '.join([
            '(resname %s and (%s))' % (resname_from_key(refkey),
                ' or '.join(['name %s' % pair[column] for pair in spec['donors']]))
            for refkey, spec in hydrogen_bond_ref.items() if spec['donors']])

    #---get both sides of the donor pairs, one selection per column of the pairs
    donors_heavy, donors_h = [uni.select_atoms(donor_selection(w)) for w in range(2)]
    acceptors_heavy = uni.select_atoms(' or '.join([
        '(resname %s and (%s))' % (resname_from_key(refkey),
            ' or '.join(['name %s' % i for i in spec['acceptors']]))
        for refkey, spec in hydrogen_bond_ref.items() if spec['acceptors']]))

    #---check for non-redundant residues
    if not len(donors_heavy.residues) == len(np.unique(donors_heavy.resids)):
        raise Exception('residue redundancy in the donor heavy list')
    if not len(donors_h.residues) == len(np.unique(donors_h.resids)):
        raise Exception('residue redundancy in the donor hydrogen list')

    #---constructing the donors side selection to preserve the bond relation
    donors_reindex = []
    for refkey, details in hydrogen_bond_ref.items():
        #---protein residues have the protein molecule name alongside
        resname = resname_from_key(refkey)
        for heavy, light in details['donors']:
            #---masks over each selection for this residue name and pair of atom names
            mask_heavy = np.all(
                (donors_heavy.resnames == resname, donors_heavy.names == heavy), axis=0)
            mask_light = np.all(
                (donors_h.resnames == resname, donors_h.names == light), axis=0)
            inds_heavy = np.where(mask_heavy)[0]
            inds_light = np.where(mask_light)[0]
            #---loop over resids and for each resid that has them, we add the indices to the list
            #---! these descending loops are clumsy but they should be fast and they make
            #---! ...definitions precise
            for resid in np.unique(np.concatenate(
                (donors_heavy[inds_heavy].resids, donors_h[inds_light].resids))):
                hits_heavy = np.where(
                    np.all((mask_heavy, donors_heavy.resnums == resid), axis=0))[0]
                hits_light = np.where(
                    np.all((mask_light, donors_h.resnums == resid), axis=0))[0]
                if len(hits_heavy) > 1 or len(hits_light) > 1:
                    raise Exception(
                        'serious error! one unique hydrogen bond in a single residue')
                if len(hits_heavy) == 1 and len(hits_light) == 1:
                    donors_reindex.append((hits_heavy[0], hits_light[0]))
    #---the reindexed donors preserve the bond relation and cover all possible unique hydrogen bonds
    donors_reindex = np.array(donors_reindex)

    #---prepare coordinates for each frame
    st = time.time()
    vecs, all_donor_coords, all_acceptor_coords, all_h_coords = [], [], [], []
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates', tag='compute', i=fr, looplen=nframes, start=st)
        uni.trajectory[fr]
        vecs.append(uni.dimensions[:3] / lenscale)
        all_donor_coords.append(donors_heavy.positions[donors_reindex[:, 0]] / lenscale)
        all_h_coords.append(donors_h.positions[donors_reindex[:, 1]] / lenscale)
        all_acceptor_coords.append(acceptors_heavy.positions / lenscale)
    status('completed caching in %.1f minutes' % ((time.time() - st) / 60.), tag='status')

    #---export variables to the framewise module
    from codes import hbonds_framewise
    hbonds_framewise.hydrogen_bond_ref = hydrogen_bond_ref
    hbonds_framewise.all_donor_coords = all_donor_coords
    hbonds_framewise.all_acceptor_coords = all_acceptor_coords
    hbonds_framewise.all_h_coords = all_h_coords
    hbonds_framewise.vecs = vecs

    #---debug a single frame and then stop
    if debug:
        #---careful debugging at this frame
        fr = 686
        incoming = hbonds_framewise.hbonder_framewise(
            fr, distance_cutoff=distance_cutoff, angle_cutoff=angle_cutoff)
        #---sys.quit does not exist so we use sys.exit to stop after debugging
        sys.exit()

    #---compute loop
    start = time.time()
    out_args = {'distance_cutoff': distance_cutoff, 'angle_cutoff': angle_cutoff}
    if run_parallel:
        incoming = Parallel(n_jobs=8, verbose=10 if debug else 0)(
            delayed(hbonds_framewise.hbonder_framewise, has_shareable_memory)(fr, **out_args)
            for fr in framelooper(nframes, start=start))
    else:
        incoming = [hbonds_framewise.hbonder_framewise(fr, **out_args)
            for fr in framelooper(nframes)]

    #---get valid frames, namely those with at least one observation
    valid_frames = np.where([len(i['donors']) > 0 for i in incoming])[0]
    #---concatenate the donor/acceptor indices across all frames
    donor_cat, donor_cat_h = [
        np.concatenate(
            [donors_reindex.T[j][incoming[i]['donors']] for i in valid_frames]).astype(int)
        for j in range(2)]
    acceptor_cat = np.concatenate(
        [incoming[i]['acceptors'] for i in valid_frames]).astype(int)
    obs_by_frames = np.array(
        [len(incoming[i]['acceptors']) for i in valid_frames]).astype(int)

    start_time = time.time()
    #---tabulate each bond observation
    status('sluggish sequence because there are {:,} bond observations'.format(len(donor_cat)),
        tag='warning')
    status('tabulating all distinct hydrogen bonds', tag='compute')
    tabulation = np.transpose((
        donors_heavy.resnames[donor_cat],
        donors_heavy.resids[donor_cat],
        donors_heavy.names[donor_cat],
        acceptors_heavy.resnames[acceptor_cat],
        acceptors_heavy.resids[acceptor_cat],
        acceptors_heavy.names[acceptor_cat],
        #---include the hydrogen identity here in the tabulation (note this might make things
        #---...larger?). also note that the hydrogen atom name should be enough because we
        #---...already have the donor resid
        donors_h.names[donor_cat_h],))
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')

    #---reduce tabulation by discarding all SOL-SOL bonds
    #---...note that this is necessary because we have 33M observations and almost all of them are
    #---..."unique" precisely because so many of them involve water
    #---actually, instead of discarding, let us change all waters to a single residue
    tabulation = np.array(tabulation)
    #---columns 0 and 3 hold the resnames and the resid sits in the column after each of them
    for p in [0, 3]:
        sols = np.where(tabulation[:, p] == 'SOL')[0]
        tabulation[sols, p + 1] = '1'

    start_time = time.time()
    status('unique-ifying the tabulated bonds (estimated %ds)' % (len(donor_cat) * 1.3 * 10**-6),
        tag='compute')
    status('note: with 32GB memory, 33M observations works fine, but 46M hits the swap',
        tag='warning')
    #---note that unique is getting "axis" in np 1.13 but at some point on or before 1.12 they added
    #---...some kind of a safety check on the following trick for unique rows, which check returns
    #---...an error message: "TypeError: Cannot change data-type for object array." which is solved
    #---...by forcing the object to a string type. note that this method requires void and not a
    #---...blank string, which some examples will use. this change must have happened in the <1
    #---...week since we wrote the hydrogen bonds code and tested it again on the factory
    #---uniquify the enormous list of all possible hydrogen bonds by viewing each row as one item
    tabulation_reform = tabulation.astype(str)
    tabulation_unique = np.ascontiguousarray(tabulation_reform).view(
        np.dtype((np.void, tabulation_reform.dtype.itemsize * tabulation_reform.shape[1])))
    _, idx = np.unique(tabulation_unique, return_index=True)
    bonds = tabulation[idx]
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames), len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = dict([(tuple(b), bb) for bb, b in enumerate(bonds)])
    #---the observations for each valid frame occupy a contiguous slice of the tabulation
    frame_lims = np.concatenate(([0], np.cumsum(obs_by_frames)))
    for fr, (lower, upper) in enumerate(zip(frame_lims[:-1], frame_lims[1:])):
        status('counting observations', i=fr, looplen=len(valid_frames),
            tag='compute', start=start_time)
        obs = tabulation[lower:upper]
        #---NOTE(review): an in-place add with an index array increments a repeated index only
        #---...once, so a bond which repeats within a frame (possible after the waters are merged
        #---...to resid 1) adds one and not its multiplicity. confirm whether np.add.at is wanted
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs])] += 1
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')
    status('done heavy lifting', tag='compute')

    #---package the dataset
    result, attrs = {}, {}
    #---everything is indexed by idx
    result['bonds'] = bonds
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['nmols'] = rescounts
    status('compute job lasted %.1fmin' % ((time.time() - start_job_time) / 60.), tag='time')
    return result, attrs
def salt_bridges(grofile, trajfile, **kwargs):
    """
    Identify salt bridges.
    Mimics the beginning of the hydrogen bond calculation but uses the same N/O/S heavy atoms on both
    sides of a bridge and requires no intervening hydrogen.

    Arguments:
        grofile, trajfile : structure and trajectory passed to MDAnalysis.Universe
    Keyword arguments:
        sn, workspace, calc : required (a KeyError is raised if any is missing)
        debug : compute a single frame, drop into ipdb, then exit (default False)
        run_parallel : use joblib for the frame loop (default True)
    Reads `distance_cutoff` from calc['specs'] and `cations` (or `cation`) from the meta dictionary
    for this simulation, and raises an Exception if no cation is named.
    Returns the tuple (result, attrs) where result is a dictionary of arrays and attrs is empty.
    Side effects: exports the cached coordinates and residue identities to `codes.hbonds`.
    """
    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug', False)
    run_parallel = kwargs.get('run_parallel', True)

    #---settings. distance cutoff is larger for salt bridges than hydrogen bonds
    distance_cutoff = calc['specs']['distance_cutoff']

    #---prepare universe
    uni = MDAnalysis.Universe(grofile, trajfile)
    nframes = len(uni.trajectory)
    lenscale = 10.
    start_job_time = time.time()

    #---save topology for later: residue names in order of first appearance, with their counts
    _, idx, counts = np.unique(uni.residues.resnames, return_index=True, return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape
    #---! DEV. needs a clone and make to work
    try:
        mod = makeface.import_remote('amx/amx')
    #---catch ordinary failures only so that KeyboardInterrupt and SystemExit still propagate
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`')
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/')

    #---use the landscape to get the heavy atoms which can take part in a bridge for each lipid
    hydrogen_bond_ref = {}
    targets = land.objects_by_category('lipid')
    for resname in targets:
        mol = land.itps[land.objects[resname]['fn']][resname]
        #---collect all N/O/S heavy atoms
        #---! forbid water here for salt bridge calculation. the comparison is made on the atom
        #---! ...name because comparing the whole atom entry to a string never excludes anything
        #! the following assumes that both donors and acceptors are NOS.
        #! NOTE THAT THE POINT OF THIS CALCULATION IS THAT WE HAVE CATIONS BRIDGING TWO NEGATIVE
        #! HEAVY ATOMS AND THERE ARE VERY FEW POSITIVE HEAVY CHARGES IN LIPIDS ANYWAY
        #! ONLY EXCEPTION IS DOPE ETHANOLAMINE
        heavy_names = [i['atom'] for i in mol['atoms']
            if re.match('^(N|O|S)', i['atom']) and i['atom'] != 'OW']
        hydrogen_bond_ref[resname] = {'acceptors': heavy_names, 'donors': list(heavy_names)}

    #---assemble the sorted, unique acceptor names over all residues
    acceptors_names = sorted(set([
        name for spec in hydrogen_bond_ref.values() for name in spec['acceptors']]))
    selection_heavy = ' or '.join(['name %s' % i for i in acceptors_names])
    #---! use acceptor names for both donor and acceptor pairs for this calculation. that is, we
    #---! ...do not require an intervening hydrogen
    donors_side = uni.select_atoms(selection_heavy)
    acceptors_side = uni.select_atoms(selection_heavy)

    #---extend to include salt bridges
    #---some systems have two types of cations
    cation_names = work.meta[sn].get('cations', work.meta[sn].get('cation', None))
    if not cation_names:
        raise Exception('add "cations" to the meta dictionary for %s' % sn)
    if not isinstance(cation_names, list):
        cation_names = [cation_names]
    cations_side = uni.select_atoms(' or '.join(['name %s' % i for i in cation_names]))

    #---prepare coordinates for each frame
    st = time.time()
    vecs, all_donor_coords, all_acceptor_coords, all_h_coords, all_cation_coords = [], [], [], [], []
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates', tag='compute', i=fr, looplen=nframes, start=st)
        uni.trajectory[fr]
        vecs.append(uni.dimensions[:3] / lenscale)
        all_donor_coords.append(donors_side.positions / lenscale)
        #---there are no hydrogens in this calculation so the donor coordinates stand in for them
        all_h_coords.append(donors_side.positions / lenscale)
        all_acceptor_coords.append(acceptors_side.positions / lenscale)
        all_cation_coords.append(cations_side.positions / lenscale)
    status('completed caching in %.1f minutes' % ((time.time() - st) / 60.), tag='status')

    #---export variables to the framewise module
    from codes import hbonds
    hbonds.hydrogen_bond_ref = hydrogen_bond_ref
    hbonds.all_donor_coords = all_donor_coords
    hbonds.all_acceptor_coords = all_acceptor_coords
    hbonds.all_h_coords = all_h_coords
    hbonds.all_cation_coords = all_cation_coords
    hbonds.vecs = vecs
    #---extra exports for development
    hbonds.acceptors_resids = acceptors_side.resids
    hbonds.acceptors_resnames = acceptors_side.resnames
    hbonds.donors_resids = donors_side.resids
    hbonds.donors_resnames = donors_side.resnames

    #---debug a single frame and then stop
    if debug:
        fr = 36
        incoming_salt = hbonds.salt_bridges_framewise(fr, distance_cutoff=distance_cutoff)
        import ipdb
        ipdb.set_trace()
        #---sys.quit does not exist so we use sys.exit to stop after debugging
        sys.exit()

    #---compute loop
    start = time.time()
    out_args = {'distance_cutoff': distance_cutoff}
    if run_parallel:
        #! use require='sharedmem' instead of delayed(func,has_shareable_memory) for late era joblib
        incoming_salt = Parallel(n_jobs=4, verbose=10 if debug else 0, require='sharedmem')(
            delayed(hbonds.salt_bridges_framewise)(fr, **out_args)
            for fr in framelooper(nframes, start=start))
    else:
        incoming_salt = [hbonds.salt_bridges_framewise(fr, **out_args)
            for fr in framelooper(nframes)]

    #---extension to salt bridges. tabulate each salt
    #---valid frames are those with at least one observation
    valid_frames_salt = np.array([ii for ii, i in enumerate(incoming_salt) if len(i) > 0])
    obs_by_frames_salt = np.array([len(i) for i in incoming_salt if len(i) > 0]).astype(int)
    #---some simulations have no salt bridges
    if len(valid_frames_salt) == 0:
        bonds_salt, counts_per_frame_salt = np.array([]), np.array([])
    else:
        salt_cat = np.concatenate([incoming_salt[i] for i in valid_frames_salt])
        status('tabulating all distinct salt bridges', tag='compute')
        status('sluggish sequence because there are {:,} bond observations'.format(len(salt_cat)),
            tag='warning')
        #---column 0 indexes the acceptors side and column 2 indexes the donors side
        #---!!! development note. data are too big so the cation, column 1, is left out of the
        #---!!! ...tabulation
        acceptor_inds, donor_inds = salt_cat[:, 0], salt_cat[:, 2]
        tabulation_salt = np.transpose((
            acceptors_side.resnames[acceptor_inds],
            acceptors_side.resids[acceptor_inds],
            acceptors_side.names[acceptor_inds],
            donors_side.resnames[donor_inds],
            donors_side.resids[donor_inds],
            donors_side.names[donor_inds],))
        #---send the observations to the tabulator
        bonds_salt, counts_per_frame_salt = tabulator(
            tabulation_salt, valid_frames_salt, obs_by_frames_salt)

    #---package the dataset
    result, attrs = {}, {}
    #---everything is indexed by idx
    result['bonds'] = bonds_salt
    result['observations'] = counts_per_frame_salt
    result['bonds_salt'] = bonds_salt
    result['valid_frames'] = valid_frames_salt
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames
    result['nmols'] = rescounts
    status('compute job lasted %.1fmin' % ((time.time() - start_job_time) / 60.), tag='time')
    return result, attrs
rotation=0,ha="center",va="center",color='k', fontsize=fsbase-4) tb.set_path_effects([path_effects.Stroke(linewidth=4,foreground='w'), path_effects.Normal()]) counter += 1 #---saving the snapshot tag here so we can keep track of the fix to exemplar_lipid above, fixed on v4 picturesave('fig.head_angle_detail',work.plotdir,backup=False,version=True, meta={'tag':tag_head_angle,'exemplar_rank':exemplar_rank},extras=patches) plt.close() ###---CODE REORGY import makeface #---get an automacs landscape with a little help from the user try: mod = makeface.import_remote('amx/amx') except: raise Exception('please clone a copy of automacs next to omni in `amx`. ' 'you must also run `make setup all` from that directory to get force field files.') mod['state'].force_field = 'charmm' GMXStructure = mod['GMXStructure'] def mapback(seq): """Hash a list of numbers back to their indices.""" #---this function is a candidate for omni/base/tools.py return dict([(v,k) for k,v in zip(np.arange(len(seq)),seq)]) #---custom water coloring custom_water_coloring = """ set vdw_thick 0.4 set inner_cutoff 2.0 set outer_cutoff 5.0
def contacts(grofile,trajfile,**kwargs):
    """
    Identify, catalog, and count contacts in a simulation.
    Note that this code was developed to mimic the data structures used by hydrogen bonding and salt
    bridging codes, and stores the various contacts up to a very high level of specificity (i.e. the
    specific residue and atom names).

    Parameters
    ----------
    grofile : structure file passed to MDAnalysis.Universe
    trajfile : trajectory file passed to MDAnalysis.Universe
    kwargs : must contain `sn`, `workspace` and `calc`. The keys `cutoff` (required), `subject`
        (default 'protein') and `object` (default 'lipid', alternately 'protein') are read from
        calc['specs']. Optional flags are `debug` (default False) and `run_parallel` (default True).

    Returns
    -------
    (result,attrs) : `result` is a dictionary holding the bond catalog (`bonds`), the per-frame counts
        for each catalog entry (`observations`), the frames with at least one contact (`valid_frames`),
        topology summaries, frame times, and the per-residue reductions (`pairs_resid_resname`,
        `counts_resid_resname`, `counts_resid_resname_singleton`). `attrs` is an empty dictionary.

    Raises
    ------
    Exception : when automacs cannot be imported from `amx`, when `force_field` is missing from the
        workspace variables, or when the `object` flag is not 'lipid' or 'protein'.
    ValueError : when no frame contains any contact inside the cutoff.

    Side effects
    ------------
    Assigns the module-level names vecs, coords_subj, coords_targ, bonds, obs and bonds_red.
    NOTE(review): presumably these are read by contacts_framewise and the count_reduced_contact
    functions which are defined elsewhere -- confirm before changing them.
    """
    #---module-level names assigned below, declared once at the top of the function
    global vecs,coords_subj,coords_targ
    global bonds,obs,bonds_red

    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug',False)
    run_parallel = kwargs.get('run_parallel',True)

    #---settings
    lenscale = 10.0
    #---distance cutoff stays in angstroms until the compute function
    distance_cutoff = calc['specs']['cutoff']
    subject_selection = calc['specs'].get('subject','protein')
    object_flag = calc['specs'].get('object','lipid')

    #---prepare universe
    uni = MDAnalysis.Universe(grofile,trajfile)
    nframes = len(uni.trajectory)
    start_job_time = time.time()

    #---save topology for later
    #---sorting by first occurrence keeps the residue names in the order they appear in the structure
    _,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    resnames_master = np.array(resnames)
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    #---catch Exception rather than everything so that a keyboard interrupt is not relabelled
    try: mod = makeface.import_remote('amx/amx')
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.')
    ff_name = work.vars.get('force_field',None)
    if not ff_name:
        raise Exception('we must be very careful with the residue naming. '
            'you must add `force_field` to the `variables` dictionary in your metadata to continue.')
    #---NOTE(review): the state is always set to charmm while the landscape receives ff_name -- confirm
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/',ff=ff_name)

    #---get the subject of the calculation, the thing we wish to study the contacts of
    #---...typically the protein
    #---! need to add resid redundancy checks possibly
    subject = uni.select_atoms(subject_selection)

    #---get the objects
    if object_flag=='lipid':
        #---objects from the landscape returns resnames
        target_resnames = land.objects_by_category('lipid')
    elif object_flag=='protein':
        target_resnames = np.unique(uni.residues.resnames)
    else: raise Exception('not set up for object %s'%object_flag)
    #---explicitly ignore hydrogen contacts here
    targets = uni.select_atoms('(%s) and not name H*'%
        ' or '.join(['resname %s'%i for i in target_resnames]))

    #---prepare coordinates for each frame
    st = time.time()
    vecs,coords_subj,coords_targ,times = [],[],[],[]
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)
        #---indexing the trajectory moves the universe to this frame
        uni.trajectory[fr]
        times.append(uni.trajectory.time)
        vecs.append(uni.dimensions[:3]/lenscale)
        coords_subj.append(subject.positions/lenscale)
        coords_targ.append(targets.positions/lenscale)
    status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

    #---debug
    compute_function = contacts_framewise
    if debug:
        #---inspect a single frame, clamped so that short trajectories can also be debugged
        fr = min(50,nframes-1)
        incoming = compute_function(fr,distance_cutoff=distance_cutoff)
        import ipdb;ipdb.set_trace()
        #---sys.quit does not exist so we leave with sys.exit
        sys.exit()

    #---compute loop
    start = time.time()
    out_args = {'distance_cutoff':distance_cutoff}
    if run_parallel:
        #---NOTE(review): the salt bridge code in this source passes require='sharedmem' to Parallel
        #---...instead of has_shareable_memory for late era joblib -- confirm the installed version
        incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
            delayed(compute_function,has_shareable_memory)(fr,**out_args)
            for fr in framelooper(nframes,start=start))
    else:
        incoming = []
        for fr in framelooper(nframes):
            incoming.append(compute_function(fr,**out_args))

    #---chompdown
    #---get valid frames
    valid_frames = np.where([len(i['subjects'])>0 for i in incoming])[0]
    if len(valid_frames)==0:
        #---concatenating an empty list fails with an opaque message so we explain the problem here
        raise ValueError('no contacts were observed in any frame with cutoff %s'%distance_cutoff)
    obs_by_frames = np.array([len(incoming[i]['subjects']) for i in valid_frames]).astype(int)
    #---concatenate the donor/acceptor indices across all frames
    subject_cat = np.concatenate([incoming[i]['subjects'] for i in valid_frames]).astype(int)
    target_cat = np.concatenate([incoming[i]['targets'] for i in valid_frames]).astype(int)
    start_time = time.time()
    #---tabulate each bond observation
    #---columns are subject resname, resid, name followed by target resname, resid, name
    tabulation = np.transpose((
        subject.resnames[subject_cat],subject.resids[subject_cat],subject.names[subject_cat],
        targets.resnames[target_cat],targets.resids[target_cat],targets.names[target_cat],))
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    idx,counts = uniquify(tabulation.astype(str))
    bonds_catalog = tabulation[idx]

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames),len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = {tuple(b):bb for bb,b in enumerate(bonds_catalog)}
    #---observations for valid frame number fr occupy rows frame_lims[fr] to frame_lims[fr+1]
    frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
    for fr in range(len(valid_frames)):
        status('counting observations per frame',i=fr,looplen=len(valid_frames),
            tag='compute',start=start_time)
        obs_this = tabulation[frame_lims[fr]:frame_lims[fr+1]]
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs_this])] += 1
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    status('done heavy lifting',tag='compute')
    #---note the size of the outgoing data. we could shrink this by discarding atom names
    status('observation array for cutoff %.1f is %.1fMB'%(
        distance_cutoff,sys.getsizeof(counts_per_frame)/10**6.),tag='note')

    #---package the dataset
    result,attrs = {},{}
    #---everything is indexed by idx
    result['bonds'] = bonds_catalog
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['subject_residues_resnames'] = subject.residues.resnames
    result['targets_residues_resnames'] = targets.residues.resnames
    result['subject_residues_resids'] = subject.residues.resids
    result['nmols'] = rescounts
    result['times'] = np.array(times)

    #---some basic post-processing common to many of the plots
    bonds,obs = bonds_catalog,counts_per_frame

    #---post: generate timewise trajectories for the number of contacts between protein residues and lipids
    #---methodology note: in a basic version of this calculation we simply count all of the bonds between
    #---...any lipid-protein residue pair. this means that being more close to a lipid might result in more
    #---...contacts and hence generates a higher score. hence we have two versions of the calculation. one
    #---...counts the total number of contacts, and the other discards atom information and scores contacts
    #---...with a maximum of one per protein residue-lipid pair. this calculation does both
    #---! need to check for atom-name resolution otherwise this is moot
    resids = result['subject_residues_resids']
    lipid_resnames = np.unique(bonds[:,rowspec.index('target_resname')])
    #---one entry per target residue name plus a final entry that pools all of them
    resname_combos = [(r,np.array([r])) for r in lipid_resnames]+[('all lipids',np.array(lipid_resnames))]
    #---compute loop
    looper = [{'resid':resid,'resname_set':resname_set}
        for resid in resids for resname_name,resname_set in resname_combos]
    compute_function = count_reduced_contact
    incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
    #---package this as a list of resid/resname pairs and the counts for them
    #---the pairs are generated in the same order as the looper so they line up with the counts
    result['pairs_resid_resname'] = np.array([(resid,resname_name)
        for resid in resids for resname_name,resname_set in resname_combos]).astype(str)
    result['counts_resid_resname'] = np.array(incoming)
    #---reduce the data for the modified count described above
    #---columns 0,1,3,4 keep the resname and resid on both sides and drop the atom names
    bonds_red = bonds[:,np.array([0,1,3,4])]
    compute_function = count_reduced_contact_reduced
    incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
    result['counts_resid_resname_singleton'] = np.array(incoming)

    #---debugging the explicit method used in the function above
    #---the two blocks below never run and are retained as development notes
    if False:
        resid,resname_set = 2,['POP2']
        which = np.where(np.all((bonds[:,rowspec.index('subject_resid')].astype(int)==resid,
            np.in1d(bonds[:,rowspec.index('target_resname')],resname_set)),axis=0))
        obs.T[which].sum(axis=0)
        bonds[which]
    #---debugging the reduced method used in the function above
    if False:
        resid,resname_set = 2,['POP2']
        which = np.where(np.all((bonds_red[:,rowspec_red.index('subject_resid')].astype(int)==resid,
            np.in1d(bonds_red[:,rowspec_red.index('target_resname')],resname_set)),axis=0))
        (obs.T[which].sum(axis=0)>0)*1
        bonds_red[which]
        #---! added this after discovering a contradiction in the results
        idx,counts = uniquify(bonds_red[which].astype(str))
        bonds_red[which][idx]
        obs.T[which][idx].sum(axis=0)

    status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
    return result,attrs
def hydrogen_bonding(grofile,trajfile,**kwargs):
    """
    Generic hydrogen bonding code.
    Revamped on 2017.4.28 to generate a more uniform data structure.

    Parameters
    ----------
    grofile : structure file passed to MDAnalysis.Universe
    trajfile : trajectory file passed to MDAnalysis.Universe
    kwargs : must contain `sn`, `workspace` and `calc`. The keys `distance_cutoff` and `angle_cutoff`
        (both required) and `protein` (optional flag that adds the protein ITP to the donor and acceptor
        definitions) are read from calc['specs']. Optional flags are `debug` (default False) and
        `run_parallel` (default True).

    Returns
    -------
    (result,attrs) : `result` is a dictionary holding the catalog of distinct bonds (`bonds`), the
        per-frame counts for each catalog entry (`observations`), the frames with at least one bond
        (`valid_frames`), `nframes`, and the topology summaries `resnames` and `nmols`. `attrs` is an
        empty dictionary.

    Raises
    ------
    Exception : when automacs cannot be imported from `amx`, when the protein ITP cannot be located,
        when a donor bond spans two residue names, or when the donor selections are redundant.
    ValueError : when no frame contains any hydrogen bond.

    Side effects
    ------------
    Assigns hydrogen_bond_ref, the cached coordinates and vecs as attributes of the module
    codes.hbonds_framewise before calling its hbonder_framewise function.
    """
    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug',False)
    run_parallel = kwargs.get('run_parallel',True)

    #---prototyping an external module import to replace the sometimes tedious addition of
    #---... many metadata to the meta dictionary in your YAML files.
    protein_itp_loader = work.vars.get('protein_itp_loader',None)
    #---default ITP paths are set for each simulation in the metadata
    if protein_itp_loader is None:
        def protein_itp_loader(sn,**kwargs):
            itp = work.meta.get(sn,{}).get('protein_itp',None)
            if not itp:
                raise Exception('cannot find protein_itp in meta dictionary for %s '%sn+
                    'note that you can also use the protein_itp_loader functionality to get the ITP file')
            #---the loader must hand the path back to the caller
            return itp
    #---custom ITP loader specified as an alternate_module
    else: protein_itp_loader = alternate_module(**protein_itp_loader)

    #---settings
    distance_cutoff,angle_cutoff = [calc['specs'][i] for i in ['distance_cutoff','angle_cutoff']]

    #---prepare universe
    uni = MDAnalysis.Universe(grofile,trajfile)
    nframes = len(uni.trajectory)
    lenscale = 10.
    start_job_time = time.time()

    #---save topology for later
    #---sorting by first occurrence keeps the residue names in the order they appear in the structure
    _,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    resnames_master = np.array(resnames)
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    #---catch Exception rather than everything so that a keyboard interrupt is not relabelled
    try: mod = makeface.import_remote('amx/amx')
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.')
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/')
    #---use the landscape to get hydrogen bond donors and acceptors for lipids
    hydrogen_bond_ref = {}
    #---this section relies on correct definitions from the Landscape
    targets = land.objects_by_category('lipid')
    #---METHODOLOGY NOTE: we catalog all hydrogen bonding opportunities ONLY BY NAME
    #---loop over lipid targets and scan them for hydrogen bond opportunities
    for resname in targets:
        #---each lipid ITP has an identical molecule with the same (residue) name
        itp = mod['GMXTopology'](land.objects[resname]['fn'])
        #---donor names come from a double-regex match over bonds
        donor_names = itp.get_bonds_by_regex(molname=resname,patterns=['^H','^(N|O|S)'])
        #---acceptor names have a single regex
        #---!!! check that this is the correct definition
        acceptor_names = [i['atom'] for i in itp.molecules[resname]['atoms']
            if re.match('^(N|O|S)',i['atom'])]
        hydrogen_bond_ref[resname] = {'acceptors':acceptor_names,'donors':donor_names}

    #---include any proteins as participants in the bonding
    if calc['specs'].get('protein',False):
        #---get the protein ITP from metadata and only consult the loader when the metadata lacks it
        itp_fn = work.meta[sn].get('protein_itp',None)
        if not itp_fn: itp_fn = protein_itp_loader(sn,work=work)
        if not itp_fn: raise Exception('add protein_itp to the meta for %s'%sn)
        #---NOTE(review): the metadata entry is run through eval, which executes arbitrary code from
        #---...the metadata. this is retained for entries written as python expressions, but should be
        #---...replaced if the metadata can ever come from an untrusted source
        #---a plain path is not a valid expression, hence we fall back to the literal string
        try: itp_fn_eval = eval(itp_fn)
        except Exception: itp_fn_eval = itp_fn
        #---get the sims spot path systematically
        if os.path.isfile(itp_fn_eval): itp_fn_abs = itp_fn_eval
        #---if path is relative then we consult the spots
        else:
            rootdir = work.raw.spots[(work.raw.spotname_lookup(sn),'structure')]['rootdir']
            sn_dir = os.path.join(rootdir,sn)
            #---user supplies step folder and path to the reference structure
            itp_fn_abs = os.path.join(sn_dir,itp_fn)
        protein_itp = mod['GMXTopology'](itp_fn_abs)
        for molname in protein_itp.molecules:
            #---mimic the procedure above for lipids
            #---donor names come from a double-regex match over bonds
            donor_resnames_names = protein_itp.get_bonds_by_regex(molname=molname,
                patterns=['^H','^(N|O|S)'],include_resname=True)
            #---organize hydrogen bonds by residue name
            #---zip is wrapped in a list because it cannot be indexed directly in python 3
            resnames_all = list(set([i for j in list(zip(*donor_resnames_names))[0] for i in j]))
            for resname_focus in resnames_all:
                donor_list = []
                #---loop over resnames within the protein
                #---the loop variable is named pair_resnames so it cannot clobber the topology resnames
                for pair_resnames,names in donor_resnames_names:
                    if pair_resnames[0]!=pair_resnames[1]:
                        raise Exception('invalid hydrogen bond spec %s,%s'%(pair_resnames,names))
                    elif pair_resnames[0]==resname_focus: donor_list.append(names)
                #---acceptor names have a single regex
                #---!!! check that this is the correct definition
                acceptor_names = list(set([i['atom'] for i in protein_itp.molecules[molname]['atoms']
                    if re.match('^(N|O|S)',i['atom']) and i['resname']==resname_focus]))
                hydrogen_bond_ref[(molname,resname_focus)] = {
                    'acceptors':acceptor_names,'donors':donor_list}

    #---developing a new method for selecting the atoms correctly
    #---we need to get all possible donors into a big selection
    #---after which case the hbonds.hbonder_framewise does the rest
    #---and the plotting codes are decent at picking out inter-residue bonds and identities
    #---consider the customer: hbonds_framewise needs a list of donors, hydrogens, and acceptors
    #---since the donors and hydrogens must be related by bonds there is some redundancy in the donor list
    #---which redundancy hbonds_framewise accounts for
    #---construct the donors list
    #---... start with the list of all hydrogens (somewhat counterintuitive)
    #---... consult the hydrogen_bond_ref and loop over all residues and then grow a list of hydrogen indices
    #---... for each hydrogen find the associated heavy atom and add both indices to separate lists
    #---net result is two lists of indices over the hydrogens and donors which constitute a bond

    def refkey_resname(refkey):
        """Lipid keys are residue names while protein keys are (molname,resname) tuples."""
        return refkey if type(refkey) in str_types else refkey[1]

    #---get the heavy atom side of the donors
    #---each donor entry is indexed by w: position 0 goes to the heavy selection and 1 to the hydrogens
    donors_heavy,donors_h = [uni.select_atoms(' or '.join(['(resname %s and (%s))'%(
        refkey_resname(refkey),' or '.join(['name %s'%i[w] for i in v['donors']]))
        for refkey,v in hydrogen_bond_ref.items() if v['donors']])) for w in range(2)]
    acceptors_heavy = uni.select_atoms(' or '.join(['(resname %s and (%s))'%(
        refkey_resname(refkey),' or '.join(['name %s'%i for i in v['acceptors']]))
        for refkey,v in hydrogen_bond_ref.items() if v['acceptors']]))

    #---check non-redundant residues
    if not len(donors_heavy.residues)==len(np.unique(donors_heavy.resids)):
        raise Exception('residue redundancy in the donor heavy list')
    if not len(donors_h.residues)==len(np.unique(donors_h.resids)):
        raise Exception('residue redundancy in the donor hydrogen list')

    #---constructing the donors side selection to preserve the bond relation
    donors_reindex = []
    for refkey,details in hydrogen_bond_ref.items():
        #---protein residues have the protein molecule name alongside
        resname = refkey_resname(refkey)
        for heavy,light in details['donors']:
            inds_heavy = np.where(np.all((
                donors_heavy.resnames==resname,donors_heavy.names==heavy),axis=0))[0]
            inds_light = np.where(np.all((
                donors_h.resnames==resname,donors_h.names==light),axis=0))[0]
            #---loop over resids and for each resid that has them, we add the indices to the list
            #---! these descending loops are clumsy but they should be fast and they make definitions precise
            for resid in np.unique(np.concatenate((donors_heavy[inds_heavy].resids,
                donors_h[inds_light].resids))):
                inds_heavy = np.where(np.all((
                    donors_heavy.resnames==resname,donors_heavy.names==heavy,
                    donors_heavy.resnums==resid),axis=0))[0]
                inds_light = np.where(np.all((
                    donors_h.resnames==resname,donors_h.names==light,
                    donors_h.resnums==resid),axis=0))[0]
                if len(inds_heavy)>1 or len(inds_light)>1:
                    raise Exception('serious error! one unique hydrogen bond in a single residue')
                if len(inds_heavy)==1 and len(inds_light)==1:
                    donors_reindex.append((inds_heavy[0],inds_light[0]))
    #---the reindexed donors preserved the bond relation and covers all possible unique hydrogen bonds
    donors_reindex = np.array(donors_reindex)

    #---prepare coordinates for each frame
    st = time.time()
    vecs,all_donor_coords,all_acceptor_coords,all_h_coords = [],[],[],[]
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)
        #---indexing the trajectory moves the universe to this frame
        uni.trajectory[fr]
        vecs.append(uni.dimensions[:3]/lenscale)
        all_donor_coords.append(donors_heavy.positions[donors_reindex[:,0]]/lenscale)
        all_h_coords.append(donors_h.positions[donors_reindex[:,1]]/lenscale)
        all_acceptor_coords.append(acceptors_heavy.positions/lenscale)
    status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

    #---export variables
    from codes import hbonds_framewise
    hbonds_framewise.hydrogen_bond_ref = hydrogen_bond_ref
    hbonds_framewise.all_donor_coords = all_donor_coords
    hbonds_framewise.all_acceptor_coords = all_acceptor_coords
    hbonds_framewise.all_h_coords = all_h_coords
    hbonds_framewise.vecs = vecs

    #---debug
    if debug:
        #---exports of donors_side, donors_inds and acceptors_side to a module called hbonds were
        #---...removed because none of these names exist in this function
        #---careful debugging at this frame, clamped so that short trajectories can also be debugged
        fr = min(686,nframes-1)
        incoming = hbonds_framewise.hbonder_framewise(fr,
            distance_cutoff=distance_cutoff,angle_cutoff=angle_cutoff)
        #---sys.quit does not exist so we leave with sys.exit
        sys.exit()

    start = time.time()
    out_args = {'distance_cutoff':distance_cutoff,'angle_cutoff':angle_cutoff}
    if run_parallel:
        #---NOTE(review): the salt bridge code in this source passes require='sharedmem' to Parallel
        #---...instead of has_shareable_memory for late era joblib -- confirm the installed version
        incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
            delayed(hbonds_framewise.hbonder_framewise,has_shareable_memory)(fr,**out_args)
            for fr in framelooper(nframes,start=start))
    else:
        incoming = []
        for fr in framelooper(nframes):
            incoming.append(hbonds_framewise.hbonder_framewise(fr,**out_args))

    #---get valid frames
    valid_frames = np.where([len(i['donors'])>0 for i in incoming])[0]
    if len(valid_frames)==0:
        #---concatenating an empty list fails with an opaque message so we explain the problem here
        raise ValueError('no hydrogen bonds were observed in any frame')
    #---concatenate the donor/acceptor indices across all frames
    #---the framewise donor index points into donors_reindex, whose columns are heavy atom and hydrogen
    donor_cat,donor_cat_h = [np.concatenate([donors_reindex.T[j][incoming[i]['donors']]
        for i in valid_frames]).astype(int) for j in range(2)]
    acceptor_cat = np.concatenate([incoming[i]['acceptors'] for i in valid_frames]).astype(int)
    obs_by_frames = np.array([len(incoming[i]['acceptors']) for i in valid_frames]).astype(int)

    start_time = time.time()
    #---tabulate each bond observation
    status('sluggish sequence because there are {:,} bond observations'.format(len(donor_cat)),tag='warning')
    status('tabulating all distinct hydrogen bonds',tag='compute')
    tabulation = np.transpose((donors_heavy.resnames[donor_cat],donors_heavy.resids[donor_cat],
        donors_heavy.names[donor_cat],acceptors_heavy.resnames[acceptor_cat],
        acceptors_heavy.resids[acceptor_cat],acceptors_heavy.names[acceptor_cat],
        #---include the hydrogen identity here in the tabulation (note this might make things larger?)
        #---also note that the hydrogen atom name should be enough because we already have the donor resid
        donors_h.names[donor_cat_h],))
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

    #---reduce tabulation by discarding all SOL-SOL bonds
    #---...note that this is necessary because we have 33M observations and almost all of them are "unique"
    #---...precisely because so many of them involve water
    #---actually, instead of discarding, let us change all waters to a single residue
    tabulation_explicit = tabulation
    tabulation = np.array(tabulation_explicit)
    #---columns 0 and 3 hold the residue names and the resid sits in the column after each
    for p in [0,3]:
        sols = np.where(tabulation[:,p]=='SOL')[0]
        tabulation[sols,p+1] = '1'

    start_time = time.time()
    status('unique-ifying the tabulated bonds (estimated %ds)'%(len(donor_cat)*1.3*10**-6),tag='compute')
    status('note: with 32GB memory, 33M observations works fine, but 46M hits the swap',tag='warning')
    #---note that unique is getting "axis" in np 1.13 but at some point on or before 1.12 they added some
    #---...kind of a safety check on the following trick for unique rows, which check returns an error
    #---...message: "TypeError: Cannot change data-type for object array." which is solved by forcing
    #---...the object to a string type. note that this method requires void and not a blank string, which
    #---...some examples will use. this change must have happened in the <1 week since we wrote
    #---...the hydrogen bonds code and tested it again on the factory
    #---uniquify the enormous list of all possible hydrogen bonds
    tabulation_reform = tabulation.astype(str)
    #---view each row as a single opaque item so that np.unique compares whole rows
    tabulation_unique = np.ascontiguousarray(tabulation_reform).view(
        np.dtype((np.void,tabulation_reform.dtype.itemsize*tabulation_reform.shape[1])))
    tabulation_view_unique,idx,counts = np.unique(tabulation_unique,return_index=True,return_counts=True)
    bonds = tabulation[idx]
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames),len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = {tuple(b):bb for bb,b in enumerate(bonds)}
    #---observations for valid frame number fr occupy rows frame_lims[fr] to frame_lims[fr+1]
    frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
    for fr in range(len(valid_frames)):
        status('counting observations',i=fr,looplen=len(valid_frames),
            tag='compute',start=start_time)
        obs_this = tabulation[frame_lims[fr]:frame_lims[fr+1]]
        #---NOTE(review): an indexed += adds one per distinct column even when a bond repeats within
        #---...the frame, which looks possible after the waters are merged above. np.add.at would
        #---...accumulate the repeats instead -- confirm which count is intended
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs_this])] += 1
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    status('done heavy lifting',tag='compute')

    #---package the dataset
    result,attrs = {},{}
    #---everything is indexed by idx
    result['bonds'] = bonds
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['nmols'] = rescounts
    status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
    return result,attrs