def find_atoms_around_alternate_conformers(hierarchy, altlocs=None, dist_cutoff=4.2):
    """Return atom pairs between alternate-conformer atoms and nearby atoms.

    For every non-blank altloc in the structure (or only those listed in
    `altlocs`, if given), each atom of that altloc is paired with every atom
    of the same altloc, or of the blank altloc, whose squared distance is
    below `dist_cutoff` squared. Hydrogens are removed (via `non_h`) before
    searching.

    Because the searched set contains the altloc's own atoms, each atom is
    also paired with itself (distance 0.0) whenever `dist_cutoff` is positive.

    Args:
        hierarchy: structure hierarchy to search.
        altlocs: optional collection of altloc identifiers to restrict the
            search to. Blank ('') entries and altlocs not present in the
            structure are skipped. If empty or None, all altlocs are used.
        dist_cutoff: distance cutoff for a pair to be reported.

    Returns:
        A list of `(labels_1, labels_2, distance)` tuples, where the labels
        come from `atom.fetch_labels()` and the distance is rounded to three
        decimal places.
    """
    # Remove hydrogens and extract atoms
    hierarchy = non_h(hierarchy)
    h_atoms = hierarchy.atoms()

    # Get all the altlocs in the structure
    all_altlocs = list(hierarchy.altloc_indices())
    if not altlocs:
        altlocs = all_altlocs

    # Group atom indices (positions in h_atoms) by the altloc of the parent
    # atom group. This does not require a blank altloc to be present, and does
    # not depend on how conformer indices are numbered.
    indices_by_altloc = {}
    for i_atom, atom in enumerate(h_atoms):
        indices_by_altloc.setdefault(atom.parent().altloc, []).append(i_atom)
    blank_indices = indices_by_altloc.get('', [])

    # Output list and squared distance cutoff
    atom_pairs = []
    dist_cut_sq = dist_cutoff ** 2

    for alt in altlocs:
        # Only real (non-blank) altlocs that exist in the structure
        if alt == '' or alt not in all_altlocs:
            continue

        # Atoms of this altloc, and those atoms combined with the blank altloc
        alt_indices = indices_by_altloc.get(alt, [])
        alt_ats = h_atoms.select(flex.size_t(alt_indices))
        comb_ats = h_atoms.select(
            flex.size_t(sorted(alt_indices + blank_indices)))

        # Coordinates of the searched set are the same for every atom below
        comb_xyz = comb_ats.extract_xyz()

        for atom in alt_ats:
            # Find all atoms within dist_cutoff
            at_dists_sq = (comb_xyz - atom.xyz).dot()
            at_dists_sel = (at_dists_sq < dist_cut_sq).iselection()
            for atom_2 in comb_ats.select(at_dists_sel):
                atom_pairs.append((
                    atom.fetch_labels(),
                    atom_2.fetch_labels(),
                    round(float(atom.distance(atom_2)), 3),
                ))

    return atom_pairs
def calculate_residue_mean_normalised_b_factors(self):
    """Fill the per-residue mean normalised B-factor columns for each structure.

    For every structure the B-factors are normalised with
    `normalise_b_factors_to_z_scores(method='protein')`. Then, for each of
    three atom subsets (non-hydrogen, backbone, sidechain), the value of
    `flex.mean_weighted(b, occ)` over each conformer's atoms is written to
    `self.tables.residue_observations` in the columns 'mean-bz-all',
    'mean-bz-backbone' and 'mean-bz-sidechain' respectively.
    """
    # Output column paired with the name of the s_select function that
    # chooses the atoms contributing to it (processed in this order)
    column_selectors = (
        ('mean-bz-all', 'non_h'),
        ('mean-bz-backbone', 'backbone'),
        ('mean-bz-sidechain', 'sidechains'),
    )

    # Reset all output columns before filling them
    for column, _ in column_selectors:
        self.tables.residue_observations.loc[:, :, column] = numpy.nan

    print('------------------------------------>')
    for structure_label, structure in zip(self.structures.labels,
                                          self.structures.hierarchies):
        print('Calculating Local Normalised Mean B-Factors: {}'.format(
            structure_label))

        # Normalise the b-factors of the structure
        z_hierarchy = normalise_b_factors_to_z_scores(
            pdb_hierarchy=structure, method='protein')
        cache = z_hierarchy.atom_selection_cache()

        for column, selector_name in column_selectors:
            select_atoms = getattr(s_select, selector_name)
            subset = select_atoms(hierarchy=z_hierarchy, cache=cache)
            for conformer in conformers_via_residue_groups(subset):
                residue_label = make_label(conformer)
                conformer_atoms = conformer.atoms()
                mean_bz = flex.mean_weighted(conformer_atoms.extract_b(),
                                             conformer_atoms.extract_occ())
                self.tables.residue_observations.set_value(
                    residue_label, structure_label, column, mean_bz)
def from_pdb(cls, pdb_input=None, pdb_hierarchy=None):
    """Calculate the b-factor statistics of a model.

    Exactly one of `pdb_input` and `pdb_hierarchy` must be given. When
    `pdb_input` is given, the hierarchy is built from it with
    `pdb_input.construct_hierarchy()`.

    Args:
        pdb_input: object providing `construct_hierarchy()`, or None.
        pdb_hierarchy: hierarchy providing `atom_selection_cache()`, or None.

    Returns:
        An instance of `cls` built from `basic_statistics` of the B-factors
        of four atom selections (`non_h`, `protein`, `backbone` and
        `sidechains`), passed as `all`, `protein`, `backbone`, `sidechain`.

    Raises:
        AssertionError: if both or neither of the inputs are provided.
    """
    assert [pdb_input, pdb_hierarchy].count(None) == 1, \
        'Provide pdb_input OR pdb_hierarchy'

    # Test against None (not truthiness) so that this agrees with the
    # validation above even when pdb_input evaluates as false
    if pdb_input is not None:
        pdb_hierarchy = pdb_input.construct_hierarchy()

    cache = pdb_hierarchy.atom_selection_cache()

    all_b = non_h(
        hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    protein_b = protein(
        hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    backbone_b = backbone(
        hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    sidechain_b = sidechains(
        hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()

    return cls(all=basic_statistics(all_b),
               protein=basic_statistics(protein_b),
               backbone=basic_statistics(backbone_b),
               sidechain=basic_statistics(sidechain_b))
def score_model(params, pdb1, mtz1, pdb2=None, mtz2=None, label_prefix='', verbose=False):
    """
    Score residues against density, and generate other model quality indicators.

    Identified residues in pdb1 are scored against mtz1 (and mtz2, if provided) using edstats.
    Identified residues in pdb1 are compared to the equivalent residues in pdb2, if provided.
    B-factors ratios of identified residues to surrounding sidechains are calculated.

    Args:
        params: parameter object; `params.selection.res_names_list` (residue
            names to analyse, all residues if empty) and
            `params.input.f_label` (passed to Edstats) are read.
        pdb1: path of the model whose residues are scored.
        mtz1: reflection file to score pdb1 against, or None to skip.
        pdb2: optional second model; the residue with the same label is
            looked up in it for occupancy and RMSD comparison.
        mtz2: optional second reflection file to score pdb1 against.
        label_prefix: optional prefix for row labels ('-' is appended).
        verbose: currently unused.

    Returns:
        The table created by `prepare_table()`, with one row per residue.

    Raises:
        Exception: if no residues are selected, or a residue group has more
            than one residue name.
        AssertionError: if a residue has no, or more than one, equivalent
            residue in pdb2.
    """
    # NOTE: every print below uses a single argument so that the output is
    # the same whether print is a statement (Python 2) or a function.

    if label_prefix:
        label_prefix = label_prefix + '-'

    # Extract the residues to look for
    res_names = params.selection.res_names_list

    print('Reading input structure: {}'.format(pdb1))

    # Extract Structure
    h1_all = non_h(strip_pdb_to_input(pdb1, remove_ter=True, remove_end=True).hierarchy)
    # Normalise hierarchy (standardise atomic naming, etc...)
    sanitise_hierarchy(h1_all)
    # Sidechain atoms are the reference for the b-factor ratios
    h1_sch = sidechains(h1_all)

    # Pull out residues to analyse (always as a list, so that it can be
    # counted and tested for emptiness)
    if res_names:
        rg_for_analysis = [
            rg for rg in h1_all.residue_groups()
            if any(n in res_names for n in rg.unique_resnames())
        ]
        print('Selecting residues named {}: {} residue(s)'.format(
            ' or '.join(res_names), len(rg_for_analysis)))
    else:
        rg_for_analysis = list(h1_all.residue_groups())
        print('Analysing all residues ({} residues)'.format(len(rg_for_analysis)))

    # Check residues to analyse or skip
    if not rg_for_analysis:
        raise Exception('There are no residues called {} in {}'.format(
            ' or '.join(res_names or []), pdb1))

    # Extract PDB2 and index its residue groups by label, so that the
    # equivalent residue can be looked up without rescanning the structure
    rg_by_label_2 = {}
    if pdb2 is not None:
        print('Reading input structure: {}'.format(pdb2))
        h2_all = non_h(strip_pdb_to_input(pdb2, remove_ter=True, remove_end=True).hierarchy)
        sanitise_hierarchy(h2_all)
        for rg in h2_all.residue_groups():
            rg_by_label_2.setdefault(
                ShortLabeller.format(rg).replace(' ', ''), []).append(rg)

    # Score MTZ1
    if mtz1 is not None:
        print('Scoring model against mtz file')
        print('Scoring {} >>> {}'.format(pdb1, mtz1))
        mtz1_edstats_scores = Edstats(mtz_file=mtz1, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz1_edstats_scores = None

    # Score MTZ2
    if mtz2 is not None:
        print('Scoring model against mtz file')
        print('Scoring {} >>> {}'.format(pdb1, mtz2))
        mtz2_edstats_scores = Edstats(mtz_file=mtz2, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz2_edstats_scores = None

    # Prepare output table
    data_table = prepare_table()

    for rg_sel in rg_for_analysis:

        # Create label for the output table
        rg_label = ShortLabeller.format(rg_sel).replace(' ', '')
        tab_label = label_prefix + rg_label

        if len(rg_sel.unique_resnames()) != 1:
            raise Exception(tab_label + ': More than one residue name associated with residue group -- cannot process')

        # Append empty row to output table
        data_table.loc[tab_label] = None

        data_table.set_value(index=tab_label, col='PDB', value=pdb1)
        data_table.set_value(index=tab_label, col='Occupancy',
                             value=calculate_residue_group_occupancy(residue_group=rg_sel))

        data_table = calculate_residue_group_bfactor_ratio(residue_group=rg_sel,
                                                           hierarchy=h1_sch,
                                                           data_table=data_table,
                                                           rg_label=tab_label)

        if pdb2 is not None:
            data_table.set_value(index=tab_label, col='PDB-2', value=pdb2)

            # Extract the equivalent residue in pdb2
            rg_sel_2 = rg_by_label_2.get(rg_label, [])
            assert rg_sel_2, 'Residue is not present in pdb file: {} not in {}'.format(rg_label, pdb2)
            assert len(rg_sel_2) == 1, 'More than one residue has been selected for {} in {}'.format(rg_label, pdb2)

            # Extract occupancy
            data_table.set_value(index=tab_label, col='Occupancy-2',
                                 value=calculate_residue_group_occupancy(residue_group=rg_sel_2[0]))

            # Calculate the RMSD between the models (errors propagate to the caller)
            _, _, rmsds = zip(*calculate_paired_conformer_rmsds(
                conformers_1=rg_sel.conformers(),
                conformers_2=rg_sel_2[0].conformers()))
            data_table.set_value(index=tab_label, col='Model RMSD', value=min(rmsds))

        # Extract Density Scores - MTZ 1
        if mtz1 is not None:
            data_table.set_value(index=tab_label, col='MTZ', value=mtz1)
        if mtz1_edstats_scores is not None:
            data_table = mtz1_edstats_scores.extract_residue_group_scores(residue_group=rg_sel,
                                                                          data_table=data_table,
                                                                          rg_label=tab_label)

        # Extract Density Scores - MTZ 2
        if mtz2 is not None:
            data_table.set_value(index=tab_label, col='MTZ-2', value=mtz2)
        if mtz2_edstats_scores is not None:
            data_table = mtz2_edstats_scores.extract_residue_group_scores(residue_group=rg_sel,
                                                                          data_table=data_table,
                                                                          rg_label=tab_label,
                                                                          column_suffix='-2')

    # Normalise the RSZO by the Occupancy of the ligand. This is a
    # whole-column operation, so it is done once, after all rows are filled.
    if mtz1_edstats_scores is not None:
        data_table['RSZO/OCC'] = data_table['RSZO'] / data_table['Occupancy']
    if mtz2_edstats_scores is not None:
        data_table['RSZO/OCC-2'] = data_table['RSZO-2'] / data_table['Occupancy-2']

    return data_table
def overlapping_occupancy_groups(hierarchy, resnames, group_dist, overlap_dist, complete_groups=True, exclude_altlocs=None, verbose=False):
    """Build occupancy groups from overlapping alternate-conformer clusters.

    Atom groups of each (non-blank, non-excluded) altloc are clustered with
    `cluster_atom_groups(cutoff=group_dist)`. Each atom group whose resname
    is in `resnames` then seeds a search: its cluster is paired with every
    cluster of another altloc that `filter_by_distance` reports within
    `overlap_dist` of the seed cluster's atoms.

    Args:
        hierarchy: structure hierarchy (hydrogens are removed via `non_h`).
        resnames: residue names of the atom groups used as seeds.
        group_dist: cutoff passed to `cluster_atom_groups`.
        overlap_dist: cutoff passed to `filter_by_distance`.
        complete_groups: if True, all clusters overlapping a seed cluster are
            merged into one constraint group with the seed cluster; if False,
            one (seed cluster, other cluster) pair is made per overlap.
        exclude_altlocs: altlocs to ignore; None, [] and [''] all mean
            "exclude nothing".
        verbose: print progress information.

    Returns:
        A list of occupancy groups. Each group is a list with one entry per
        altloc (ordered by altloc); each entry is a list of
        `GenericSelection.to_dict(ag)` for the atom groups in that altloc's
        cluster(s).
    """
    # NOTE: every print below uses a single argument so that the output is
    # the same whether print is a statement (Python 2) or a function.

    if (exclude_altlocs is None) or (exclude_altlocs == ['']):
        exclude_altlocs = []

    # Remove hydrogens to prevent ridiculous amounts of restraints
    hierarchy = non_h(hierarchy)

    # Extract all altlocs and ags with altlocs
    sel_altlocs = [a for a in hierarchy.altloc_indices()
                   if (a != '') and (a not in exclude_altlocs)]
    sel_alt_ags = [ag for ag in hierarchy.atom_groups()
                   if ag.altloc in sel_altlocs]

    # Record for each altloc
    # - atom groups for each altloc
    # - assignment of each ag to a cluster of ags
    cluster_dict = {}

    if verbose:
        print('-------------------------------------->')
        print('')
        print('Generating groups of nearby alternate conformers (cutoff {}A)'.format(group_dist))
        if exclude_altlocs:
            print('Excluding conformer(s): {}'.format(','.join(exclude_altlocs)))
        print('')

    for altloc in sel_altlocs:
        # Select atom groups with this altloc
        altloc_ags = filter_by_altloc(sel_alt_ags, altloc)
        # Cluster the atom groups
        altloc_clusters = cluster_atom_groups(altloc_ags, cutoff=group_dist)
        # Dictionary mapping altlocs to ags to clusters
        cluster_dict[altloc] = (altloc_ags, altloc_clusters)
        if verbose:
            print('- altloc {}: {} residues clustered into {} groups'.format(
                altloc, len(altloc_ags), len(set(altloc_clusters))))

    if verbose:
        print('')

    # Find atom_groups with the selected resnames
    seed_ags = [ag for ag in sel_alt_ags if ag.resname in resnames]
    # List of tuples of (altloc, cluster) pairs that are constrained together
    constrain_groups = []

    # Loop until all atom groups have been used
    while seed_ags:

        # Pick the first residue to focus on
        focus_ag = seed_ags.pop(0)
        # Find which cluster this ag is in
        focus_ags, focus_clusters = cluster_dict[focus_ag.altloc]
        focus_clust = focus_clusters[focus_ags.index(focus_ag)]
        # Extract all ags in this cluster
        group_ags = [ag for ag, clust in zip(focus_ags, focus_clusters)
                     if clust == focus_clust]
        group_xyz = group_ags[0].atoms().extract_xyz()
        for ag in group_ags[1:]:
            group_xyz = group_xyz.concatenate(ag.atoms().extract_xyz())

        if verbose:
            print('-------------------------------------->')
            print('')
            print('Creating occupancy group based on: {}'.format(GenericSelection.to_str(focus_ag)))
            print('- this residue is part of alternate conformer {}'.format(focus_ag.altloc))
            print('- there are {} atom_groups in this group'.format(len(group_ags)))
            print('')
            print('Looking for overlapping groups of residues with different alternate conformers:')
            print('')

        tmp_constrain_groups = []

        for altloc in sel_altlocs:
            # Skip blank altloc or the selected altloc
            if altloc == '' or altloc == focus_ag.altloc:
                continue
            # Find all ags for this altloc that overlap with the selected cluster
            other_ags, other_clusters = cluster_dict[altloc]
            overlap_ags = filter_by_distance(atom_groups=other_ags,
                                             xyz=group_xyz,
                                             cutoff=overlap_dist)
            overlap_clusts = sorted(set(
                [other_clusters[other_ags.index(ag)] for ag in overlap_ags]))

            if verbose:
                print('- altloc {}: overlaps with {} group(s) of residues'.format(
                    altloc, len(overlap_clusts)))

            for cluster in overlap_clusts:
                tmp_constrain_groups.append(
                    ((focus_ag.altloc, focus_clust), (altloc, cluster)))

            # Remove any used seed groups in the overlapping group
            for ag in overlap_ags:
                if ag in seed_ags:
                    seed_ags.remove(ag)

        # Remove any used seed groups in the seed group
        for ag in group_ags:
            if ag in seed_ags:
                seed_ags.remove(ag)

        if verbose:
            print('')
            print('Occupancy groups for this residue')
            print('- {} overlapping group(s) found'.format(len(tmp_constrain_groups)))

        # Add to the complete list
        if tmp_constrain_groups:
            if complete_groups:
                if verbose:
                    print('- complete_groups=={}: concatenating occupancy groups'.format(complete_groups))
                # Merge the seed cluster and all overlapping clusters into one group
                tmp_constrain_groups = [
                    tuple([(focus_ag.altloc, focus_clust)] +
                          [t[1] for t in tmp_constrain_groups])
                ]
            if verbose:
                print('- creating {} occupancy group constraint(s)'.format(len(tmp_constrain_groups)))
            constrain_groups.extend(tmp_constrain_groups)
        else:
            if verbose:
                print('...no overlapping groups found.')
                print('- not creating any occupancy groups for this residue')

        if verbose:
            print('')

    # Filter duplicated restraint groups (sorting makes member order irrelevant)
    unique_groups = []
    for g in map(sorted, constrain_groups):
        if g not in unique_groups:
            unique_groups.append(g)
    constrain_groups = unique_groups

    # Format to generic residue selections
    occupancy_groups = []
    for g in constrain_groups:
        ag_groups = {}
        for altloc, cluster in g:
            ag_groups.setdefault(altloc, []).extend([
                GenericSelection.to_dict(ag)
                for ag, c in zip(*cluster_dict[altloc]) if c == cluster
            ])
        occupancy_groups.append([ag_groups[a] for a in sorted(ag_groups.keys())])

    return occupancy_groups