def run_RMSD(self, file1, file2):
    ''' Calculates 4 RMSD values between two GPCR pdb files. It compares the two files using sequence numbers.
        1. overall all atoms RMSD
        2. overall backbone atoms RMSD
        3. 7TM all atoms RMSD
        4. 7TM backbone atoms RMSD

        @param file1: first pdb file, handed to Bio.PDB.PDBParser.get_structure \n
        @param file2: second pdb file, handed to Bio.PDB.PDBParser.get_structure \n
        @return: tuple (rmsd1, rmsd2, rmsd3, rmsd4), in the order listed above
    '''
    parser = PDB.PDBParser(QUIET=True)
    pdb1 = parser.get_structure('struct1', file1)[0]
    pdb2 = parser.get_structure('struct2', file2)[0]
    pdb_array1, pdb_array2 = OrderedDict(), OrderedDict()  # all standard residues
    pdb_array3, pdb_array4 = OrderedDict(), OrderedDict()  # residues passing the CA B-factor filter
    assign_gn1 = as_gn.GenericNumbering(structure=pdb1)
    pdb1 = assign_gn1.assign_generic_numbers()
    assign_gn2 = as_gn.GenericNumbering(structure=pdb2)
    pdb2 = assign_gn2.assign_generic_numbers()

    # Both structures are indexed the same way, so handle them in one loop.
    for model, all_residues, tm_residues in ((pdb1, pdb_array1, pdb_array3),
                                             (pdb2, pdb_array2, pdb_array4)):
        for chain in model:
            for residue in chain:
                # A non-blank hetero flag marks HETATM/water records; skip them.
                if residue.get_full_id()[3][0] != ' ':
                    continue
                seq_num = int(residue.get_id()[1])
                all_residues[seq_num] = residue
                try:
                    ca_bfactor = residue['CA'].get_bfactor()
                except KeyError:
                    # Residue has no CA atom, so it cannot be classified.
                    continue
                # NOTE(review): the CA B-factor presumably carries the generic
                # number written by assign_generic_numbers(); values inside
                # (-8.1, 8.1) are treated as the 7TM set - confirm.
                if -8.1 < ca_bfactor < 8.1:
                    tm_residues[seq_num] = residue

    overall_all1, overall_all2, overall_backbone1, overall_backbone2, o_a, o_b = self.create_lists(
        pdb_array1, pdb_array2)
    TM_all1, TM_all2, TM_backbone1, TM_backbone2, t_a, t_b = self.create_lists(
        pdb_array3, pdb_array4)
    rmsd1 = self.calc_RMSD(overall_all1, overall_all2, o_a)
    rmsd2 = self.calc_RMSD(overall_backbone1, overall_backbone2, o_b)
    rmsd3 = self.calc_RMSD(TM_all1, TM_all2, t_a)
    rmsd4 = self.calc_RMSD(TM_backbone1, TM_backbone2, t_b)
    return rmsd1, rmsd2, rmsd3, rmsd4
def run(self):
    ''' Prepares the signalling protein (G protein) part of a complex model.

        Reads the SignprotComplex of self.main_structure, aligns the target signalling protein
        to the one in the structure with GProteinAlignment, re-aligns loop segments, and writes
        the results into self.a (reference_dict, template_dict, alignment_dict),
        self.main_pdb_array, self.template_source and self.trimmed_residues. Residues of the
        beta and gamma chains are appended as segments 'Beta' and 'Gamma'. Returns nothing.

        Placeholder values used in the arrays: 'x' and '-' stand for positions without atoms,
        and a '?' in a key replaces '_' or '.' for positions without a backbone template.
    '''
    parse = GPCRDBParsingPDB()
    self.signprot_complex = SignprotComplex.objects.get(
        structure=self.main_structure)
    structure_signprot = self.signprot_complex.protein
    # self.signprot is either False (model the protein found in the structure)
    # or the entry name of the target signalling protein.
    if self.signprot != False:
        self.target_signprot = Protein.objects.get(
            entry_name=self.signprot)
    else:
        self.target_signprot = self.signprot_complex.protein
    self.signprot_protconf = ProteinConformation.objects.get(
        protein=self.target_signprot)
    sign_a = GProteinAlignment()
    sign_a.run_alignment(self.target_signprot, structure_signprot)
    io = StringIO(self.main_structure.pdb_data.pdb)
    assign_cgn = as_gn.GenericNumbering(
        pdb_file=io,
        pdb_code=self.main_structure.pdb_code.index,
        sequence_parser=True,
        signprot=structure_signprot)
    signprot_pdb_array = assign_cgn.assign_cgn_with_sequence_parser(
        self.signprot_complex.alpha)

    # Alignment exception in HN for 6OIJ, shifting alignment by 6 residues
    if self.main_structure.pdb_code.index == '6OIJ':
        keys = list(signprot_pdb_array['HN'].keys())
        new_HN = OrderedDict()
        for i, k in enumerate(signprot_pdb_array['HN']):
            if i < 8:
                new_HN[k] = 'x'
            else:
                new_HN[k] = signprot_pdb_array['HN'][keys[i - 6]]
        signprot_pdb_array['HN'] = new_HN

    new_array = OrderedDict()

    # Initiate complex part of template source
    source_resis = Residue.objects.filter(
        protein_conformation__protein=self.target_signprot)
    for res in source_resis:
        if res.protein_segment.slug not in self.template_source:
            self.template_source[res.protein_segment.slug] = OrderedDict()
        # Loop residues are keyed by sequence number and start without a template;
        # all other residues are keyed by generic number and start on the main structure.
        if res.protein_segment.category == 'loop':
            self.template_source[res.protein_segment.slug][str(
                res.sequence_number)] = [None, None]
        else:
            self.template_source[res.protein_segment.slug][
                res.display_generic_number.label] = [
                    self.main_structure, self.main_structure
                ]

    # Superimpose missing regions H1 - hfs2
    alt_complex_struct = None
    segs_for_alt_complex_struct = []
    alt_templates_H_domain = self.get_full_alpha_templates()
    if self.main_structure.id not in alt_templates_H_domain[1]:
        segs_for_alt_complex_struct = [
            'H1', 'h1ha', 'HA', 'hahb', 'HB', 'hbhc', 'HC', 'hchd', 'HD',
            'hdhe', 'HE', 'hehf', 'HF', 'hfs2'
        ]
        alt_complex_struct = self.find_h_domain_template(
            self.target_signprot, alt_templates_H_domain[0]
        )  # previously hard-coded: Structure.objects.get(pdb_code__index='3SN6')
        io = StringIO(alt_complex_struct.pdb_data.pdb)
        alt_signprot_complex = SignprotComplex.objects.get(
            structure=alt_complex_struct)
        alt_assign_cgn = as_gn.GenericNumbering(
            pdb_file=io,
            pdb_code=alt_complex_struct.pdb_code.index,
            sequence_parser=True,
            signprot=alt_signprot_complex.protein)
        alt_signprot_pdb_array = alt_assign_cgn.assign_cgn_with_sequence_parser(
            alt_signprot_complex.alpha)
        # Four anchor residues before (HN) and after (H5) the inserted region,
        # fetched from both the main and the alternative structure.
        before_cgns = ['G.HN.50', 'G.HN.51', 'G.HN.52', 'G.HN.53']
        after_cgns = ['G.H5.03', 'G.H5.04', 'G.H5.05', 'G.H5.06']
        orig_residues1 = parse.fetch_residues_from_array(
            signprot_pdb_array['HN'], before_cgns)
        orig_residues2 = parse.fetch_residues_from_array(
            signprot_pdb_array['H5'], after_cgns)
        orig_residues = parse.add_two_ordereddict(orig_residues1,
                                                  orig_residues2)

        alt_residues1 = parse.fetch_residues_from_array(
            alt_signprot_pdb_array['HN'], before_cgns)
        alt_residues2 = parse.fetch_residues_from_array(
            alt_signprot_pdb_array['H5'], after_cgns)

        alt_middle = OrderedDict()
        for s in segs_for_alt_complex_struct:
            alt_middle = parse.add_two_ordereddict(
                alt_middle, alt_signprot_pdb_array[s])
            self.template_source = update_template_source(
                self.template_source,
                list(self.template_source[s].keys()), alt_complex_struct,
                s)

        alt_residues = parse.add_two_ordereddict(
            parse.add_two_ordereddict(alt_residues1, alt_middle),
            alt_residues2)
        # Positions marked 'x' in the alternative structure are removed
        # before the residues are handed to LoopSuperpose.
        del_list = []
        for r, t in alt_middle.items():
            if t == 'x':
                del_list.append(r)
        for r in del_list:
            del alt_residues[r]

        superpose = sp.LoopSuperpose(orig_residues, alt_residues)
        new_residues = superpose.run()
        # Drop the first and last four entries (the anchor positions) and
        # copy the remaining residues into the main array.
        key_list = list(new_residues.keys())[4:-4]
        for key in key_list:
            seg = key.split('.')[1]
            signprot_pdb_array[seg][key] = new_residues[key]

        # alt local loop alignment
        alt_sign_a = GProteinAlignment()
        alt_sign_a.run_alignment(self.target_signprot,
                                 alt_signprot_complex.protein,
                                 segments=segs_for_alt_complex_struct)
        for alt_seg in segs_for_alt_complex_struct:
            sign_a.reference_dict[alt_seg] = alt_sign_a.reference_dict[
                alt_seg]
            sign_a.template_dict[alt_seg] = alt_sign_a.template_dict[
                alt_seg]
            sign_a.alignment_dict[alt_seg] = alt_sign_a.alignment_dict[
                alt_seg]

        # fix h1ha and hahb and hbhc
        if self.target_signprot.entry_name != 'gnas2_human':
            h1ha = Residue.objects.filter(
                protein_conformation__protein=alt_signprot_complex.protein,
                protein_segment__slug='h1ha')
            h1ha_dict, hahb_dict = OrderedDict(), OrderedDict()
            for h in h1ha:
                h1ha_dict[h.generic_number.label] = 'x'
            signprot_pdb_array['h1ha'] = h1ha_dict
            # Re-insert hahb entries in sorted key order.
            right_order = sorted(list(signprot_pdb_array['hahb'].keys()),
                                 key=lambda x: (x))
            for r in right_order:
                hahb_dict[r] = signprot_pdb_array['hahb'][r]
            signprot_pdb_array['hahb'] = hahb_dict

        # Let Modeller model buffer regions
        self.trimmed_residues.append('s1h1_6')
        self.trimmed_residues.append('hfs2_1')
        self.trimmed_residues.append('hfs2_2')
        self.trimmed_residues.append('hfs2_3')
        self.trimmed_residues.append('hfs2_4')
        self.trimmed_residues.append('hfs2_5')
        self.trimmed_residues.append('hfs2_6')
        self.trimmed_residues.append('hfs2_7')
        self.trimmed_residues.append('G.S2.01')
        self.trimmed_residues.append('G.S2.02')
        self.trimmed_residues.append('s4h3_4')
        self.trimmed_residues.append('s4h3_5')

    # New loop alignments for signprot. If length differs between ref and temp,
    # buffer is created in the middle of the loop
    loops = [
        i.slug for i in ProteinSegment.objects.filter(proteinfamily='Alpha',
                                                      category='loop')
    ]
    loops_to_model = []
    for r_seg, t_seg, a_seg in zip(sign_a.reference_dict,
                                   sign_a.template_dict,
                                   sign_a.alignment_dict):
        if r_seg in loops:
            loop_length = len(sign_a.reference_dict[r_seg])
            # Residue letters and keys without the 'x' / '-' placeholders.
            ref_loop = [
                i for i in list(sign_a.reference_dict[r_seg].values())
                if i not in ['x', '-']
            ]
            ref_keys = [
                i for i in list(sign_a.reference_dict[r_seg].keys())
                if i not in ['x', '-']
            ]
            ref_loop_residues = Residue.objects.filter(
                protein_conformation__protein=self.target_signprot,
                protein_segment__slug=r_seg)
            temp_loop = [
                i for i in list(sign_a.template_dict[t_seg].values())
                if i not in ['x', '-']
            ]
            temp_keys = [
                i for i in list(sign_a.template_dict[t_seg].keys())
                if i not in ['x', '-']
            ]
            # Template residues come from the alternative structure for the
            # segments that were superposed from it, else from the main one.
            if alt_complex_struct and r_seg in segs_for_alt_complex_struct:
                temp_loop_residues = Residue.objects.filter(
                    protein_conformation__protein=alt_signprot_complex.
                    protein,
                    protein_segment__slug=r_seg)
            else:
                temp_loop_residues = Residue.objects.filter(
                    protein_conformation__protein=structure_signprot,
                    protein_segment__slug=r_seg)
            ref_out, temp_out, align_out = OrderedDict(), OrderedDict(
            ), OrderedDict()
            # ref is longer
            if len(ref_loop) > len(temp_loop):
                mid_temp = math.ceil(len(temp_loop) / 2)
                j = 0
                for i in range(0, loop_length):
                    key = r_seg + '_' + str(i + 1)
                    # First half of the template is placed at the start.
                    if i + 1 <= mid_temp:
                        temp_out[key] = temp_loop[i]
                        self.template_source = compare_and_update_template_source(
                            self.template_source, r_seg,
                            signprot_pdb_array, i, ref_loop_residues[i].
                            display_generic_number.label,
                            ref_loop_residues[i].sequence_number,
                            segs_for_alt_complex_struct,
                            alt_complex_struct, self.main_structure)
                    # Middle of the loop: gap positions ('-', key with '?').
                    elif mid_temp < i + 1 <= loop_length - mid_temp + 1:
                        if i + 1 == loop_length - mid_temp + 1 and len(
                                temp_loop) % 2 == 0:
                            temp_out[key] = temp_loop[mid_temp + j]
                            self.template_source = compare_and_update_template_source(
                                self.template_source, r_seg,
                                signprot_pdb_array, mid_temp + j,
                                ref_loop_residues[i].
                                display_generic_number.label,
                                ref_loop_residues[i].sequence_number,
                                segs_for_alt_complex_struct,
                                alt_complex_struct, self.main_structure)
                            j += 1
                        else:
                            temp_out[key.replace('_', '?')] = '-'
                            self.template_source = compare_and_update_template_source(
                                self.template_source, r_seg,
                                signprot_pdb_array, mid_temp + j,
                                ref_loop_residues[i].
                                display_generic_number.label,
                                ref_loop_residues[i].sequence_number,
                                segs_for_alt_complex_struct,
                                alt_complex_struct, self.main_structure)
                    # Remaining template residues are placed at the end.
                    else:
                        temp_out[key] = temp_loop[mid_temp + j]
                        self.template_source = compare_and_update_template_source(
                            self.template_source, r_seg,
                            signprot_pdb_array, mid_temp + j,
                            ref_loop_residues[i].display_generic_number.
                            label, ref_loop_residues[i].sequence_number,
                            segs_for_alt_complex_struct,
                            alt_complex_struct, self.main_structure)
                        j += 1
                # Mirror the key naming of temp_out into ref_out: positions
                # that are gaps in temp_out get the '?' key in ref_out as well.
                for i, j in enumerate(
                        list(sign_a.reference_dict[r_seg].values())):
                    key = r_seg + '_' + str(i + 1)
                    try:
                        temp_out[key]
                        ref_out[key] = j
                    except:
                        ref_out[key.replace('_', '?')] = j
                    i += 1
            # temp is longer
            elif len(ref_loop) < len(temp_loop):
                mid_ref = math.ceil(len(ref_loop) / 2)
                j = 0
                for i in range(0, loop_length):
                    key = r_seg + '_' + str(i + 1)
                    if i + 1 <= mid_ref:
                        ref_out[key] = ref_loop[i]
                        self.template_source = compare_and_update_template_source(
                            self.template_source, r_seg,
                            signprot_pdb_array, i, temp_loop_residues[i].
                            display_generic_number.label,
                            ref_loop_residues[i].sequence_number,
                            segs_for_alt_complex_struct,
                            alt_complex_struct, self.main_structure)
                    elif mid_ref < i + 1 <= loop_length - mid_ref + 1:
                        if i + 1 == loop_length - mid_ref + 1 and len(
                                ref_loop) % 2 == 0:
                            ref_out[key] = ref_loop[mid_ref + j]
                            self.template_source = compare_and_update_template_source(
                                self.template_source, r_seg,
                                signprot_pdb_array, mid_ref + j,
                                temp_loop_residues[i].
                                display_generic_number.label,
                                ref_loop_residues[mid_ref +
                                                  j].sequence_number,
                                segs_for_alt_complex_struct,
                                alt_complex_struct, self.main_structure)
                            j += 1
                        else:
                            ref_out[key.replace('_', '?')] = '-'
                            self.template_source = compare_and_update_template_source(
                                self.template_source, r_seg,
                                signprot_pdb_array, mid_ref + j,
                                temp_loop_residues[i].
                                display_generic_number.label,
                                ref_loop_residues[mid_ref +
                                                  j].sequence_number,
                                segs_for_alt_complex_struct,
                                alt_complex_struct, self.main_structure)
                    else:
                        ref_out[key] = ref_loop[mid_ref + j]
                        self.template_source = compare_and_update_template_source(
                            self.template_source, r_seg,
                            signprot_pdb_array, mid_ref + j,
                            temp_loop_residues[i].display_generic_number.
                            label, ref_loop_residues[mid_ref +
                                                     j].sequence_number,
                            segs_for_alt_complex_struct,
                            alt_complex_struct, self.main_structure)
                        j += 1
                for i, j in enumerate(
                        list(sign_a.template_dict[t_seg].values())):
                    key = r_seg + '_' + str(i + 1)
                    try:
                        ref_out[key]
                        temp_out[key] = j
                    except:
                        temp_out[key.replace('_', '?')] = j
                    i += 1
                # Only the "template longer" case is queued in loops_to_model.
                loops_to_model.append(r_seg)
            # ref and temp length equal
            else:
                cr, ct = 1, 1
                for i, j in zip(
                        list(sign_a.reference_dict[r_seg].values()),
                        list(sign_a.template_dict[t_seg].values())):
                    ref_out[r_seg + '_' + str(cr)] = i
                    temp_out[r_seg + '_' + str(ct)] = j
                    self.template_source = compare_and_update_template_source(
                        self.template_source, r_seg, signprot_pdb_array,
                        ct - 1,
                        temp_loop_residues[ct -
                                           1].display_generic_number.label,
                        ref_loop_residues[cr - 1].sequence_number,
                        segs_for_alt_complex_struct, alt_complex_struct,
                        self.main_structure)
                    if i != '-':
                        cr += 1
                    if j != '-':
                        ct += 1

            c = 1

            # update alignment dict
            # '-' for a gap on either side, '.' for a mismatch, the residue
            # letter itself for a match.
            for i, j in zip(list(ref_out.values()), list(temp_out.values())):
                key = r_seg + '_' + str(c)
                if i == '-' or j == '-':
                    align_out[key.replace('_', '?')] = '-'
                elif i != j:
                    align_out[key] = '.'
                elif i == j:
                    align_out[key] = i
                c += 1
            # update pdb array
            new_pdb_array = OrderedDict()
            atoms_list = list(signprot_pdb_array[t_seg].values())
            j = 0
            for t_c, t in temp_out.items():
                jplus1 = False
                if t != '-':
                    # NOTE(review): the loop variable i is not used inside
                    # the body; only atoms_list[j] is inspected - confirm.
                    for i in range(j, len(atoms_list)):
                        if atoms_list[j] != '-':
                            new_pdb_array[t_c] = atoms_list[j]
                            jplus1 = True
                            break
                    if jplus1:
                        j += 1
                else:
                    new_pdb_array[t_c] = 'x'

            # update dictionary keys with '?' if no backbone template
            ref_out_final, temp_out_final, align_out_final, new_pdb_array_final = OrderedDict(
            ), OrderedDict(), OrderedDict(), OrderedDict()
            for i, j in new_pdb_array.items():
                if '?' not in i and j == 'x':
                    ref_out_final[i.replace('_',
                                            '?').replace('.',
                                                         '?')] = ref_out[i]
                    temp_out_final[i.replace('_', '?').replace(
                        '.', '?')] = temp_out[i]
                    align_out_final[i.replace('_', '?').replace(
                        '.', '?')] = align_out[i]
                    new_pdb_array_final[i.replace('_', '?').replace(
                        '.', '?')] = new_pdb_array[i]
                else:
                    ref_out_final[i] = ref_out[i]
                    temp_out_final[i] = temp_out[i]
                    align_out_final[i] = align_out[i]
                    new_pdb_array_final[i] = new_pdb_array[i]
            sign_a.reference_dict[r_seg] = ref_out_final
            sign_a.template_dict[t_seg] = temp_out_final
            sign_a.alignment_dict[a_seg] = align_out_final
            signprot_pdb_array[r_seg] = new_pdb_array_final

            align_loop = list(sign_a.alignment_dict[a_seg].values())

    # Work on copies of the alignment dictionaries held in self.a.
    self.a.reference_dict = deepcopy(self.a.reference_dict)
    self.a.template_dict = deepcopy(self.a.template_dict)
    self.a.alignment_dict = deepcopy(self.a.alignment_dict)

    # Build new_array keyed like the reference alignment; positions without
    # atoms are marked and cleared in template_source.
    for seg, values in sign_a.reference_dict.items():
        new_array[seg] = OrderedDict()
        final_values = deepcopy(values)
        for key, res in values.items():
            try:
                if signprot_pdb_array[seg][key] == 'x':
                    new_array[seg][key] = 'x'
                    self.template_source = update_template_source(
                        self.template_source, [key], None, seg)
                else:
                    new_array[seg][key] = signprot_pdb_array[seg][key]
            except:
                # Reached when the lookup above fails (e.g. key missing).
                if res != '-':
                    new_array[seg][key] = '-'
                    self.template_source = update_template_source(
                        self.template_source, [key], None, seg)
        self.a.reference_dict[seg] = final_values
    # Make the template letters agree with the atoms actually present.
    for seg, values in sign_a.template_dict.items():
        for key, res in values.items():
            if new_array[seg][key] == 'x':
                sign_a.template_dict[seg][key] = 'x'
            else:
                if new_array[seg][key] == '-':
                    sign_a.template_dict[seg][key] = '-'
                else:
                    pdb_res = PDB.Polypeptide.three_to_one(
                        new_array[seg][key][0].get_parent().get_resname())
                    if pdb_res != sign_a.template_dict[seg][key]:
                        sign_a.template_dict[seg][key] = pdb_res
        self.a.template_dict[seg] = sign_a.template_dict[seg]

    for seg, values in sign_a.alignment_dict.items():
        for key, res in values.items():
            if new_array[seg][key] == 'x':
                values[key] = 'x'
        self.a.alignment_dict[seg] = values
    signprot_pdb_array = new_array

    for seg, values in signprot_pdb_array.items():
        self.main_pdb_array[seg] = values

    # Remove every HN position that precedes 'G.HN.30'.
    delete_HN_begin = []
    for i in self.a.reference_dict['HN']:
        if i == 'G.HN.30':
            break
        delete_HN_begin.append(i)

    for d in delete_HN_begin:
        del self.a.reference_dict['HN'][d]
        try:
            del self.a.template_dict['HN'][d]
        except:
            pass
        try:
            del self.a.alignment_dict['HN'][d]
        except:
            pass
        del self.main_pdb_array['HN'][d]
        try:
            del self.template_source['HN'][d]
        except:
            pass

    # add residues to model to self.trimmed_residues
    gprot_segments = [
        i.slug for i in ProteinSegment.objects.filter(proteinfamily='Alpha')
    ]
    for i, j in self.a.reference_dict.items():
        if i in gprot_segments:
            for k, l in j.items():
                if '?' in k or self.main_pdb_array[i][k] in ['-', 'x']:
                    self.trimmed_residues.append(k)
                if i in loops_to_model:
                    self.trimmed_residues.append(k)

    # custom mods
    long_HG_prots = Protein.objects.filter(family__name='Gs')
    if structure_signprot in long_HG_prots and self.target_signprot not in long_HG_prots:
        self.trimmed_residues.append('G.HG.08')
        self.trimmed_residues.append('G.HG.09')
        self.trimmed_residues.append('G.HG.12')
        self.trimmed_residues.append('G.HG.13')
        self.trimmed_residues.append('G.HG.14')
        self.trimmed_residues.append('G.HG.16')
        self.trimmed_residues.append('G.HG.17')
    # NOTE(review): alt_signprot_complex is only assigned inside the
    # "main_structure.id not in alt_templates_H_domain[1]" branch above. If
    # that branch was skipped and the first operand is False, the second
    # operand looks like it would raise NameError - confirm.
    if structure_signprot != self.target_signprot or alt_signprot_complex.protein not in [
            None, self.target_signprot
    ]:
        # hbhc
        hbhc_keys = list(self.a.reference_dict['hbhc'].keys())
        self.trimmed_residues.append(hbhc_keys[2])
        self.trimmed_residues.append(hbhc_keys[3])
        self.trimmed_residues.append(hbhc_keys[-3])
        self.trimmed_residues.append(hbhc_keys[-2])
        # H1
        self.trimmed_residues.append('G.H1.07')
        self.trimmed_residues.append('G.H1.08')
    if 'hgh4' in loops_to_model:
        self.trimmed_residues.append('G.H4.01')
        self.trimmed_residues.append('G.H4.02')
        self.trimmed_residues.append('G.H4.03')

    # Add mismatching residues to trimmed residues for modeling
    for seg, val in self.a.alignment_dict.items():
        if seg in gprotein_segment_slugs:
            for key, res in val.items():
                if res == '.':
                    self.trimmed_residues.append(key)
    # Add residues with missing atom coordinates to trimmed residues for modeling
    for seg, val in self.main_pdb_array.items():
        if seg in gprotein_segment_slugs:
            for key, atoms in val.items():
                if atoms not in ['-', 'x']:
                    # atom_num_dict gives the expected atom count per residue type.
                    if atom_num_dict[PDB.Polypeptide.three_to_one(
                            atoms[0].get_parent().get_resname())] > len(
                                atoms):
                        self.trimmed_residues.append(key)

    # Add Beta and Gamma chains
    p = PDB.PDBParser(QUIET=True).get_structure(
        'structure', StringIO(self.main_structure.pdb_data.pdb))[0]
    beta = p[self.signprot_complex.beta_chain]
    gamma = p[self.signprot_complex.gamma_chain]
    self.a.reference_dict['Beta'] = OrderedDict()
    self.a.template_dict['Beta'] = OrderedDict()
    self.a.alignment_dict['Beta'] = OrderedDict()
    self.main_pdb_array['Beta'] = OrderedDict()
    self.template_source['Beta'] = OrderedDict()
    self.a.reference_dict['Gamma'] = OrderedDict()
    self.a.template_dict['Gamma'] = OrderedDict()
    self.a.alignment_dict['Gamma'] = OrderedDict()
    self.main_pdb_array['Gamma'] = OrderedDict()
    self.template_source['Gamma'] = OrderedDict()
    # Beta/Gamma residues are keyed by their pdb sequence number; reference,
    # template and alignment all get the residue found in the structure.
    for b_res in beta:
        key = str(b_res.get_id()[1])
        self.a.reference_dict['Beta'][key] = PDB.Polypeptide.three_to_one(
            b_res.get_resname())
        self.a.template_dict['Beta'][key] = PDB.Polypeptide.three_to_one(
            b_res.get_resname())
        self.a.alignment_dict['Beta'][key] = PDB.Polypeptide.three_to_one(
            b_res.get_resname())
        atoms = [atom for atom in b_res]
        self.main_pdb_array['Beta'][key] = atoms
        self.template_source['Beta'][key] = [
            self.main_structure, self.main_structure
        ]
    for g_res in gamma:
        key = str(g_res.get_id()[1])
        self.a.reference_dict['Gamma'][key] = PDB.Polypeptide.three_to_one(
            g_res.get_resname())
        self.a.template_dict['Gamma'][key] = PDB.Polypeptide.three_to_one(
            g_res.get_resname())
        self.a.alignment_dict['Gamma'][key] = PDB.Polypeptide.three_to_one(
            g_res.get_resname())
        atoms = [atom for atom in g_res]
        self.main_pdb_array['Gamma'][key] = atoms
        self.template_source['Gamma'][key] = [
            self.main_structure, self.main_structure
        ]
def run_RMSD_list(self, files, receptor, seq_nums=None, force_chain=None, sp_7TM=False, only_backbone=False):
    """Calculates 3 RMSD values between a list of GPCR pdb files. It compares the files using sequence and generic numbers.
    First file in the list has to be the reference file.

    For every non-reference file three values are printed: a custom RMSD, the 7TM all atoms RMSD and the
    7TM backbone (N, CA, C) RMSD. Nothing is returned.

    Params:
        @files: pdb files, the reference file first, list
        @receptor: UniProt entry name of GPCR, str
        @seq_nums: Specified list of sequence residue numbers for the Custom calculation, list
        @force_chain: Specify one letter chain name to use in the pdb files, str
        @sp_7TM: Superimpose only on 7TM backbone atoms (N, CA, C), boolean
        @only_backbone: Calculate RMSD for only the backbone atoms, boolean
    """
    parser = PDB.PDBParser(QUIET=True)
    pdbs = []
    for count, f in enumerate(files, start=1):
        # NOTE(review): the parsed structure is discarded; the call is kept
        # because it was in the original flow (it fails early on a file
        # Bio.PDB cannot read) - confirm whether it is still wanted.
        parser.get_structure('struct{}'.format(count), f)[0]
        assign_gn = as_gn.GenericNumbering(pdb_file=f, sequence_parser=True)
        pdbs.append(assign_gn.assign_generic_numbers_with_sequence_parser())

    # Chain ids per structure, and the chains of the other structures that
    # also exist in the reference.
    chains = [[c.get_id() for c in p.get_chains()] for p in pdbs]
    usable_chains = [c for m in chains[1:] for c in m if c in chains[0]]

    ### Creating full arrays
    arrays = []
    # The position in the list identifies the reference model; a
    # pdbs.index(p) lookup would depend on how model objects compare equal.
    for model_index, p in enumerate(pdbs):
        try:
            if model_index == 0 and not usable_chains:
                chain = chains[0][0]
            else:
                chain = p[usable_chains[0]].get_id()
        except (IndexError, KeyError):
            # No shared chain, or it is missing here: fall back to the
            # blank chain id and then to chain 'A'.
            try:
                chain = p[' '].get_id()
            except KeyError:
                chain = p['A'].get_id()
        if force_chain and model_index == 0:
            chain = force_chain
        residues_by_num = OrderedDict()
        for residue in p[chain]:
            # A non-blank hetero flag marks HETATM/water records; skip them.
            if residue.get_full_id()[3][0] != ' ':
                continue
            residues_by_num[int(residue.get_id()[1])] = residue
        arrays.append(residues_by_num)

    target, models = arrays[0], arrays[1:]

    ### Residues of the reference that are present in every other structure
    ### (empty when only the reference was supplied)
    all_keep = [res for res in target if models and all(res in m for m in models)]
    keep_set = set(all_keep)
    print('Residue sequence numbers present in all structures: {}'.format(len(all_keep)))
    print(all_keep)

    ### Checking available atoms in target and models
    atoms_to_keep, atoms_to_delete = OrderedDict(), OrderedDict()
    for target_resnum, target_res in target.items():
        keep, delete = [], []
        atoms_to_keep[target_resnum] = keep
        atoms_to_delete[target_resnum] = delete
        for model in models:
            model_res = model.get(target_resnum)
            if model_res is None:
                continue
            # An atom name found on one side only ends up in `delete`.
            for atom in model_res:
                if atom.id in target_res:
                    if atom.id not in keep:
                        keep.append(atom.id)
                elif atom.id not in delete:
                    delete.append(atom.id)
            for t_atom in target_res:
                if t_atom.id in model_res:
                    if t_atom.id not in keep:
                        keep.append(t_atom.id)
                elif t_atom.id not in delete:
                    delete.append(t_atom.id)

    ### Creating atom lists of structures
    atom_lists = []
    for m in arrays:
        atom_list = []
        for num, res in m.items():
            if num not in keep_set:
                continue
            for atom in res:
                if atom.id in atoms_to_keep[num] and atom.id not in atoms_to_delete[num]:
                    atom_list.append(atom)
        atom_lists.append(atom_list)

    ### Fetching TM data from GPCRdb
    TM_nums = list(Residue.objects.filter(protein_conformation__protein__entry_name=receptor, protein_segment__slug__in=['TM1', 'TM2', 'TM3', 'TM4', 'TM5', 'TM6', 'TM7']).values_list('sequence_number', flat=True))
    tm_num_set = set(TM_nums)
    TM_target_atom_list = [i for i in atom_lists[0] if i.get_parent().id[1] in tm_num_set]
    TM_target_backbone_atom_list = [i for i in TM_target_atom_list if i.id in ['N','CA','C']]
    TM_atom_num = len(TM_target_atom_list)
    print('TM_atom_num:',TM_atom_num)
    print('TM_backbone_atom_num:',len(TM_target_backbone_atom_list))

    # Residue numbers for the custom calculation: caller-supplied ones
    # (coerced to int) or every residue shared by all structures.
    if seq_nums:
        seq_nums = [int(s) for s in seq_nums]
    else:
        seq_nums = all_keep

    ### Running superposition and RMSD calculation
    for model_number, m in enumerate(atom_lists[1:], start=1):
        print('########################################')
        print('Model {}'.format(model_number))
        ### Custom calculation
        if sp_7TM:
            TM_superposed, atoms_used_sp = self.superpose(atom_lists[0], m, list(TM_nums))
            superposed = self.fetch_atoms_with_seqnum(TM_superposed, seq_nums, only_backbone)
            target_atoms = self.fetch_atoms_with_seqnum(atom_lists[0], seq_nums, only_backbone)
            rmsd = self.calc_RMSD(target_atoms, superposed)
        else:
            superposed, atoms_used_sp = self.superpose(atom_lists[0], m)
            target_atoms = atom_lists[0]
            rmsd = self.calc_RMSD(target_atoms, superposed)
        print('Num atoms sent for superposition: ', len(atom_lists[0]), len(m))
        print('Num atoms used for superposition: ', atoms_used_sp)
        print('Num atoms used for RMSD: ', len(target_atoms), len(superposed))
        print('Custom RMSD:', rmsd)

        ### 7TM all atoms calculation
        TM_model_atom_list = [i for i in m if i.get_parent().id[1] in tm_num_set]
        superposed2, atoms_used_sp = self.superpose(TM_target_atom_list, TM_model_atom_list, list(TM_nums))
        rmsd = self.calc_RMSD(TM_target_atom_list, superposed2)
        print('Num atoms sent for superposition: ', len(TM_target_atom_list), len(TM_model_atom_list))
        print('Num atoms used for superposition: ', atoms_used_sp)
        print('Num atoms used for RMSD: ', len(TM_target_atom_list), len(superposed2))
        print('7TM all RMSD:', rmsd)

        ### 7TM only backbone (N, CA, C) calculation
        superposed, atoms_used_sp = self.superpose(TM_target_atom_list, TM_model_atom_list, list(TM_nums))
        superposed3 = self.fetch_atoms_with_seqnum(superposed, list(TM_nums), True)
        rmsd = self.calc_RMSD(TM_target_backbone_atom_list, superposed3)
        print('Num atoms sent for superposition: ', len(TM_target_atom_list), len(TM_model_atom_list))
        print('Num atoms used for superposition: ', atoms_used_sp)
        print('Num atoms used for RMSD: ', len(TM_target_backbone_atom_list), len(superposed3))
        print('7TM backbone RMSD:', rmsd)
def run_RMSD_list_archived(self, files, seq_nums=None, force_chain=None):
    """Calculates 4 RMSD values between a list of GPCR pdb files. It compares the files using sequence and generic numbers.
    First file in the list has to be the reference file.
        1. overall all atoms RMSD
        2. overall backbone atoms RMSD
        3. 7TM all atoms RMSD
        4. 7TM backbone atoms RMSD

    Results are stored on the instance instead of being returned: self.rmsds,
    self.number_of_residues_superposed and self.number_of_atoms_superposed each get one
    OrderedDict per file, under 'reference' for the first file and 'file<N>' for the N-th
    file, with the entries of self.four_scores as keys. The reference RMSDs are None.

    Params:
        @files: pdb files, the reference file first, list
        @seq_nums: sequence residue numbers (compared as strings) to restrict the overall calculation to, list
        @force_chain: one letter chain name to use for the reference file, str
    """
    # Create the per-file result containers.
    c = 0
    for f in files:
        c+=1
        if c==1:
            self.number_of_residues_superposed['reference'] = OrderedDict()
            self.number_of_atoms_superposed['reference'] = OrderedDict()
            self.rmsds['reference'] = OrderedDict()
        else:
            self.number_of_residues_superposed['file{}'.format(str(c))] = OrderedDict()
            self.number_of_atoms_superposed['file{}'.format(str(c))] = OrderedDict()
            self.rmsds['file{}'.format(str(c))] = OrderedDict()
    parser = PDB.PDBParser(QUIET=True)
    count = 0
    pdbs = []
    for f in files:
        count+=1
        # The Bio.PDB parse result is overwritten by the generic-numbered structure below.
        pdb = parser.get_structure('struct{}'.format(count), f)[0]
        assign_gn = as_gn.GenericNumbering(pdb_file=f, sequence_parser=True)
        pdb = assign_gn.assign_generic_numbers_with_sequence_parser()
        pdbs.append(pdb)
    # Chain ids per structure, and those of the other files that also occur in the reference.
    chains = []
    for p in pdbs:
        this = []
        for c in p.get_chains():
            this.append(c.get_id())
        chains.append(this)
    usable_chains = []
    for m in chains[1:]:
        for c in m:
            if c in chains[0]:
                usable_chains.append(c)
    if force_chain:
        chains[0] = [force_chain]
    arrays = []
    model_counter = 0
    for p in pdbs:
        # Chain selection: first chain of the reference when nothing is shared, else the first
        # shared chain; on any failure fall back to chain ' ' and then to chain 'A'.
        try:
            if pdbs.index(p)==0 and len(usable_chains)==0:
                chain = [c.get_id() for c in pdbs[0].get_chains()][0]
            else:
                chain = p[usable_chains[0]].get_id()
        except:
            try:
                chain = p[' '].get_id()
            except:
                chain = p['A'].get_id()
        if force_chain and model_counter==0:
            chain = force_chain
        # pdb_array1: residues for the overall scores; pdb_array2: residues for the 7TM scores.
        pdb_array1, pdb_array2 = OrderedDict(), OrderedDict()
        for residue in p[chain]:
            # A non-blank hetero flag marks HETATM/water records; skip them.
            if residue.get_full_id()[3][0]!=' ':
                continue
            if seq_nums!=None and str(residue.get_id()[1]) in seq_nums:
                pdb_array1[int(residue.get_id()[1])] = residue
            elif seq_nums==None:
                pdb_array1[int(residue.get_id()[1])] = residue
            # NOTE(review): the CA B-factor presumably holds the generic number; values inside
            # (-8.1, 8.1) are treated as 7TM - confirm. Residues without CA are skipped silently.
            try:
                if -8.1 < residue['CA'].get_bfactor() < 8.1:
                    pdb_array2[int(residue.get_id()[1])] = residue
            except:
                pass
        arrays.append([pdb_array1,pdb_array2])
        model_counter+=1
    # For both residue sets (0 = overall, 1 = 7TM), record which reference residue numbers are
    # missing from / present in each of the other files.
    all_deletes, TM_deletes = [], []
    all_keep, TM_keep = [], []
    for i in range(0,2):
        for res in arrays[0][i]:
            for m in arrays[1:]:
                if res not in m[i]:
                    if i==0:
                        all_deletes.append(res)
                    else:
                        TM_deletes.append(res)
                else:
                    if i==0:
                        all_keep.append(res)
                    else:
                        TM_keep.append(res)
    deletes = [all_deletes, TM_deletes]
    keeps = [all_keep, TM_keep]
    # num_atoms[i][res] ends up holding the shortest atom list seen for a residue across files.
    num_atoms1, num_atoms2 = OrderedDict(), OrderedDict()
    num_atoms = [num_atoms1, num_atoms2]
    mismatches = []
    resis_to_delete = []
    for m_i, m in enumerate(arrays):
        for i in range(0,2):
            for res in m[i]:
                if res in deletes[i] or res not in keeps[i]:
                    resis_to_delete.append([m_i,i,res])
                else:
                    # The except branch is entered on purpose in two cases: the residue has no
                    # entry in num_atoms yet (KeyError) or the residue names match (explicit
                    # raise). A residue-name mismatch removes the residue from num_atoms instead.
                    try:
                        if m[i][res].get_resname()!=num_atoms[i][res][0].get_parent().get_resname():
                            del num_atoms[i][res]
                            mismatches.append(res)
                        else:
                            raise Exception()
                    except:
                        if res not in mismatches:
                            atoms = []
                            for atom in m[i][res]:
                                atoms.append(atom)
                            if res not in num_atoms[i]:
                                num_atoms[i][res] = atoms
                            else:
                                if len(atoms)<len(num_atoms[i][res]):
                                    num_atoms[i][res] = atoms
    # Deletion is deferred so the dictionaries are not modified while being iterated.
    for i in resis_to_delete:
        del arrays[i[0]][i[1]][i[2]]
    # Per file: [overall all, overall backbone, 7TM all, 7TM backbone] atom lists, restricted to
    # atom names present in the num_atoms entry of the residue.
    atom_lists = []
    for m in arrays:
        this_model = []
        for i in range(0,2):
            this_list_all = []
            this_list_bb = []
            for res in m[i]:
                if res in num_atoms[i]:
                    atoms = [a.get_id() for a in m[i][res].get_list()]
                    ref_atoms = [at.get_id() for at in num_atoms[i][res]]
                    for atom in sorted(atoms):
                        if atom in ref_atoms:
                            this_list_all.append(m[i][res][atom])
                            if atom in ['N','CA','C']:
                                this_list_bb.append(m[i][res][atom])
            this_model.append(this_list_all)
            this_model.append(this_list_bb)
        atom_lists.append(this_model)
    TM_keys = list(num_atoms[1].keys())
    c = 0
    for m in atom_lists:
        c+=1
        for i in range(0,4):
            # Scores 0-1 use the overall residue set, scores 2-3 the 7TM set.
            if i<2:
                j=0
            else:
                j=1
            if c>1:
                self.number_of_residues_superposed['file{}'.format(str(c))][self.four_scores[i]] = len(num_atoms[j])
                self.number_of_atoms_superposed['file{}'.format(str(c))][self.four_scores[i]] = len(m[i])
                rmsd = self.calc_RMSD(atom_lists[0][i], m[i])#, TM_keys)
                self.rmsds['file{}'.format(str(c))][self.four_scores[i]] = rmsd
            else:
                self.number_of_residues_superposed['reference'][self.four_scores[i]] = len(num_atoms[j])
                self.number_of_atoms_superposed['reference'][self.four_scores[i]] = len(m[i])
                self.rmsds['reference'][self.four_scores[i]] = None
def pdb_array_creator(self, structure=None, filename=None):
    ''' Creates an OrderedDict() from the pdb of a Structure object where residue numbers/generic numbers are
        keys for the residues, and atom names are keys for the Bio.PDB.Residue objects.

        The result is keyed by segment label first; each segment maps generic numbers
        (written with '.' instead of 'x') or sequence numbers to a list of atoms.

        @param structure: Structure, Structure object of protein. When using structure, leave filename=None. \n
        @param filename: str, filename of pdb to be parsed. When using filename, leave structure=None). \n
        @return: OrderedDict of OrderedDicts as described above

        NOTE(review): structure is dereferenced unconditionally below (protein_conformation,
        pdb_code, preferred_chain), so calling with structure=None looks like it would fail -
        confirm whether the filename-only mode is still supported.
    '''
    # seq_nums_overwrite_cutoff_dict = {'4PHU':2000, '4LDL':1000, '4LDO':1000, '4QKX':1000, '5JQH':1000, '5TZY':2000, '5KW2':2000}
    if structure != None and filename == None:
        io = StringIO(structure.pdb_data.pdb)
    else:
        io = filename
    gn_array = []
    residue_array = []
    # pdb_struct = PDB.PDBParser(QUIET=True).get_structure(structure.pdb_code.index, io)[0]

    residues = Residue.objects.filter(
        protein_conformation=structure.protein_conformation)
    # Generic numbers known for this structure; residues for which the label
    # cannot be built are skipped.
    gn_list = []
    for i in residues:
        try:
            gn_list.append(
                ggn(i.display_generic_number.label).replace('x', '.'))
        except:
            pass

    ssno = StructureSeqNumOverwrite(structure)
    ssno.seq_num_overwrite('pdb')
    # Branch 1: a pdb/wild-type numbering table exists, so the array is built
    # from the stored rotamers of the TM1-TM7 and H8 residues.
    if len(ssno.pdb_wt_table) > 0:
        residues = residues.filter(protein_segment__slug__in=[
            'TM1', 'TM2', 'TM3', 'TM4', 'TM5', 'TM6', 'TM7', 'H8'
        ]).order_by('sequence_number')
        output = OrderedDict()
        for r in residues:
            if r.protein_segment.slug == None:
                continue
            if r.protein_segment.slug not in output:
                output[r.protein_segment.slug] = OrderedDict()
            rotamer = Rotamer.objects.filter(residue=r)
            rotamer = self.right_rotamer_select(rotamer)
            rota_io = StringIO(rotamer.pdbdata.pdb)
            p = PDB.PDBParser()
            parsed_rota = p.get_structure('rota', rota_io)
            for chain in parsed_rota[0]:
                for res in chain:
                    atom_list = []
                    for atom in res:
                        # Skip hydrogens
                        if atom.get_id().startswith('H'):
                            continue
                        # The two halves of the generic number label (split at 'x') are
                        # stored in the B-factor fields of N and CA.
                        if atom.get_id() == 'N':
                            bw, gn = r.display_generic_number.label.split(
                                'x')
                            atom.set_bfactor(bw)
                        elif atom.get_id() == 'CA':
                            bw, gn = r.display_generic_number.label.split(
                                'x')
                            gn = "{}.{}".format(bw.split('.')[0], gn)
                            # A three-digit position is written as a negative number
                            # with its last digit dropped.
                            if len(gn.split('.')[1]) == 3:
                                gn = '-' + gn[:-1]
                            atom.set_bfactor(gn)
                        atom_list.append(atom)
                    output[r.protein_segment.slug][ggn(
                        r.display_generic_number.label).replace(
                            'x', '.')] = atom_list
        # NOTE(review): looks like a leftover debug print - confirm before removing.
        pprint.pprint(output)
        return output
    # Branch 2: assign generic numbers by sequence parsing and read the
    # residues of the preferred chain.
    else:
        assign_gn = as_gn.GenericNumbering(
            pdb_file=io,
            pdb_code=structure.pdb_code.index,
            sequence_parser=True)
        pdb_struct = assign_gn.assign_generic_numbers_with_sequence_parser(
        )
        pref_chain = structure.preferred_chain
        parent_prot_conf = ProteinConformation.objects.get(
            protein=structure.protein_conformation.protein.parent)
        parent_residues = Residue.objects.filter(
            protein_conformation=parent_prot_conf)
        last_res = list(parent_residues)[-1].sequence_number
        # Only the first character of preferred_chain is used.
        if len(pref_chain) > 1:
            pref_chain = pref_chain[0]
        for residue in pdb_struct[pref_chain]:
            if 'CA' in residue and -9.1 < residue['CA'].get_bfactor(
            ) < 9.1:
                use_resid = False
                # Rebuild the generic number string from the CA B-factor:
                # pad to two decimals; a negative value gets its sign
                # removed and '1' appended.
                gn = str(residue['CA'].get_bfactor())
                if len(gn.split('.')[1]) == 1:
                    gn = gn + '0'
                if gn[0] == '-':
                    gn = gn[1:] + '1'
                # Exceptions
                if structure.pdb_code.index == '3PBL' and residue.get_id(
                )[1] == 331:
                    use_resid = True
                elif structure.pdb_code.index == '6QZH' and residue.get_id(
                )[1] == 1434:
                    use_resid = True
                elif structure.pdb_code.index == '7M3E':
                    use_resid = True
                #################################################
                elif gn in gn_list:
                    gn_array.append(gn)
                    residue_array.append(residue.get_list())
                else:
                    use_resid = True
                # Residues without a usable generic number are keyed by
                # their pdb sequence number instead.
                if use_resid:
                    gn_array.append(str(residue.get_id()[1]))
                    residue_array.append(residue.get_list())
        output = OrderedDict()
        for num, label in self.segment_coding.items():
            output[label] = OrderedDict()
        if len(gn_array) != len(residue_array):
            raise AssertionError()
        for gn, res in zip(gn_array, residue_array):
            if '.' in gn:
                # Key is a generic number: file it under its segment.
                seg_num = int(gn.split('.')[0])
                seg_label = self.segment_coding[seg_num]
                # Segment 8 residues are skipped while TM7 is still empty.
                if seg_num == 8 and len(output['TM7']) == 0:
                    continue
                else:
                    output[seg_label][gn] = res
            else:
                # Key is a sequence number: look the residue up in the
                # database to recover its generic number.
                try:
                    found_res, found_gn = None, None
                    try:
                        found_res = Residue.objects.get(
                            protein_conformation=structure.
                            protein_conformation,
                            sequence_number=gn)
                    except:
                        # Exception for res 317 in 5VEX, 5VEW
                        if structure.pdb_code.index in [
                                '5VEX', '5VEW'
                        ] and gn == '317' and res[0].get_parent(
                        ).get_resname() == 'CYS':
                            found_res = Residue.objects.get(
                                protein_conformation=parent_prot_conf,
                                sequence_number=gn)
                        #####################################
                    found_gn = str(
                        ggn(found_res.display_generic_number.label)
                    ).replace('x', '.')

                    # Exception for res 318 in 5VEX, 5VEW
                    if structure.pdb_code.index in [
                            '5VEX', '5VEW'
                    ] and gn == '318' and res[0].get_parent().get_resname(
                    ) == 'ILE' and found_gn == '5.47':
                        found_gn = '5.48'
                    #####################################
                    if -9.1 < float(found_gn) < 9.1:
                        # Skip single-atom residues and numbers beyond the
                        # last residue of the parent protein.
                        if len(res) == 1:
                            continue
                        if int(gn) > last_res:
                            continue
                        seg_label = self.segment_coding[int(
                            found_gn.split('.')[0])]
                        output[seg_label][found_gn] = res
                except:
                    # Fallback for residues named YCM or CSD: look them up
                    # on the parent protein; keep only segments whose slug
                    # starts with 'T' or 'H'.
                    if res[0].get_parent().get_resname() == 'YCM' or res[
                            0].get_parent().get_resname() == 'CSD':
                        try:
                            found_res = Residue.objects.get(
                                protein_conformation=parent_prot_conf,
                                sequence_number=gn)
                        except:
                            continue
                        if found_res.protein_segment.slug[0] not in [
                                'T', 'H'
                        ]:
                            continue
                        try:
                            found_gn = str(
                                ggn(found_res.display_generic_number.label)
                            ).replace('x', '.')
                        except:
                            found_gn = str(gn)
                        output[
                            found_res.protein_segment.slug][found_gn] = res
        return output