def match_renumber(self, reference_pdb):
    """Match the chains and renumber the structures according to a reference PDB.

    Every structure in ``self.structure_dic`` is aligned chain-by-chain
    against the reference with ClustalO. For each reference chain the best
    matching target chain is renamed to the reference chain ID and its
    residues are renumbered to follow the reference numbering.

    Args:
        reference_pdb: path of the reference PDB, or the string ``'lowest'``
            to use the structure returned by ``self.fetch_lowest()``.

    Returns:
        ``False`` when the ClustalO executable configured under
        ``third_party/clustalo_exe`` cannot be found, ``True`` once all
        structures have been processed.

    Note:
        Target structures are modified through ``PDB.renumber(...,
        overwrite=True)``. Per the original author's warning, residues
        present in the target but absent from the reference are deleted.
    """
    clustalo_exe = ini.get('third_party', 'clustalo_exe')
    if not shutil.which(clustalo_exe):
        print('\n+ ClustalO not correctly configured in haddock3.ini')
        print('+ WARNING: matching not possible!')
        return False

    print('\n+ Running automated chain matching and renumbering')
    print('+ WARNING: Use with caution, some residues could be deleted')

    if reference_pdb == 'lowest':
        # fetch_lowest() also returns a score, which is not needed here
        reference_pdb, _ = self.fetch_lowest()

    if ' ' not in PDB.identify_chains(reference_pdb):
        reference_pdb = PDB.fix_id(reference_pdb, priority='chain')

    if ' ' not in PDB.identify_chainseg(reference_pdb):
        reference_pdb = PDB.fix_id(reference_pdb, priority='seg')

    reference_seq_dic = PDB.load_seq(reference_pdb)
    reference_chains = PDB.identify_chains(reference_pdb)
    reference_chains.sort()

    # The executable was validated with shutil.which above, so it is passed
    # as a single argument instead of being re-split on whitespace.
    clustalo_cmd = [
        clustalo_exe,
        '-i', 'seq.fasta',
        '--outfmt=clu',
        '--resno',
        '--wrap=9000',
        '--force',
    ]

    for pdb in sorted(self.structure_dic):

        # match the chains with sequence alignment
        target_seq_dic = PDB.load_seq(pdb)
        pdb = PDB.fix_id(pdb, priority='seg')

        # Get what chains are present in the target
        target_chains = PDB.identify_chains(pdb)
        target_chains.sort()

        # Do a combinatorial alignment to check which chains match better
        identity_dic = {}
        for ref_chain, target_chain in itertools.product(reference_chains, target_chains):
            ref_resnums = list(reference_seq_dic[ref_chain])
            target_resnums = list(target_seq_dic[target_chain])
            ref_seq = ''.join(reference_seq_dic[ref_chain].values())
            target_seq = ''.join(target_seq_dic[target_chain].values())

            # Close the handle before ClustalO reads the file, and always
            # remove the temporary file, even if the call fails.
            with open('seq.fasta', 'w') as fasta_fh:
                fasta_fh.write(f'>ref\n{ref_seq}\n>target\n{target_seq}\n')
            try:
                proc = subprocess.run(clustalo_cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
            finally:
                os.remove('seq.fasta')

            aln_output = proc.stdout.decode('utf-8')
            aln_data = aln_output.split()
            # Positions of the two aligned sequences in the whitespace-split
            # clustal output (header tokens, then name/sequence/resno).
            ref_aln = aln_data[6]
            target_aln = aln_data[9]

            counter_a = 0
            counter_b = 0
            numbering_dic = {}
            for ref_char, target_char in zip(ref_aln, target_aln):
                # Either sequence may be exhausted while the other still has
                # trailing residues; use '-' as placeholder in that case.
                if counter_a < len(ref_resnums):
                    ref_resnum = ref_resnums[counter_a]
                else:
                    ref_resnum = '-'
                if counter_b < len(target_resnums):
                    target_resnum = target_resnums[counter_b]
                else:
                    target_resnum = '-'

                ref_is_residue = ref_char != '-'
                target_is_residue = target_char != '-'

                if ref_is_residue:
                    counter_a += 1
                if target_is_residue:
                    counter_b += 1

                # only aligned (non-gap) positions define a renumbering
                if ref_is_residue and target_is_residue:
                    numbering_dic[target_resnum] = ref_resnum

            # '*' marks identical columns in the clustal consensus line
            identity = aln_output.count('*') / float(len(ref_seq))
            coverage = len(numbering_dic) / len(ref_aln)

            identity_dic.setdefault(ref_chain, []).append(
                (target_chain, identity, coverage, numbering_dic)
            )

        # do the renumbering
        for i, ref_c in enumerate(reference_chains):
            candidates = identity_dic[ref_c]

            # best coverage first, ties broken by the highest identity
            ranked = sorted(candidates, key=lambda c: (-c[2], -c[1]))

            # create a catalog with possible numbering references
            numbering_dic_catalog = {c[0]: c[3] for c in candidates}

            if len({c[1] for c in candidates}) == 1:
                # all targets are equally identical (homo-multimer), so
                # chains are matched sequentially
                selected_chain = target_chains[i]
            else:
                # get the highest identity/coverage
                selected_chain = ranked[0][0]

            # select the correct numbering dictionary
            selected_numbering_dic = numbering_dic_catalog[selected_chain]

            # just for readability:
            old_chain = selected_chain
            new_chain = ref_c

            # replace the target chain (old) with the same observed in the reference (new)
            chain_matched_pdb = PDB.replace_chain(pdb, old_chain, new_chain)

            # renumber!
            # Note, if residue is present in target and not in reference it
            # will be DELETED, use with caution
            PDB.renumber(chain_matched_pdb, selected_numbering_dic, new_chain, overwrite=True)

    return True
def dockq(ref, pdb_f, dockq_exec):
    """Run DockQ for every pair of chains of a model against a reference.

    Args:
        ref: path of the reference structure.
        pdb_f: path of the model to be evaluated.
        dockq_exec: command used to launch DockQ.

    Returns:
        A dictionary keyed by ``'<interface>_irms'``, ``'<interface>_lrms'``,
        ``'<interface>_fnat'``, ``'<interface>_capri'``,
        ``'<interface>_dockq'`` and ``'<interface>_order'``, where
        ``<interface>`` is the concatenation of the two chain identifiers.
        When the chain/segment identifiers of model and reference differ,
        a single set of empty results with an empty interface name is
        returned instead and a warning is printed.
    """
    ref = PDB.fix_id(ref)
    pdb_f = PDB.fix_id(pdb_f)

    reference_chains = PDB.identify_chainseg(ref)
    pdb_chains = PDB.identify_chainseg(pdb_f)

    result_dic = {}
    if reference_chains != pdb_chains:
        print(
            f'+ WARNING: Skipping {pdb_f}, number of chains do not match. Expected {len(reference_chains)} found {len(pdb_chains)}'
        )
        interface_name = ''
        result_dic[f'{interface_name}_irms'] = .0
        result_dic[f'{interface_name}_lrms'] = .0
        result_dic[f'{interface_name}_fnat'] = .0
        result_dic[f'{interface_name}_capri'] = ''
        result_dic[f'{interface_name}_dockq'] = .0
        result_dic[f'{interface_name}_order'] = ''
        return result_dic

    for comb in itertools.combinations(pdb_chains, 2):
        interface_name = ''.join(comb)

        # Reset the metrics for every interface so that a DockQ run with
        # missing output lines cannot inherit values from the previous pair.
        irms = .0
        lrms = .0
        fnat = .0
        dockq_score = .0
        capri = None
        order = None

        # The launcher is split as before (it may hold several tokens), but
        # the paths and chain IDs are passed as single arguments so that
        # whitespace inside them is preserved.
        cmd = [
            *dockq_exec.split(),
            str(pdb_f),
            str(ref),
            '-native_chain1',
            str(comb[0]),
            str(comb[1]),
            '-perm1',
        ]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE)

        for line in proc.stdout.decode('utf-8').splitlines():
            if line.startswith('Best score'):
                fields = line.split()
                order = f'{fields[-4]}->{fields[-1]}'
            elif line.startswith('Fnat'):
                fnat = float(line.split()[1])
            elif line.startswith('iRMS'):
                irms = float(line.split()[1])
            elif line.startswith('LRMS'):
                lrms = float(line.split()[1])
            elif line.startswith('DockQ_CAPRI'):
                # must be tested before the plain 'DockQ' prefix
                capri = line.split()[1]
            elif line.startswith('DockQ'):
                dockq_score = float(line.split()[1])

        result_dic[f'{interface_name}_irms'] = irms
        result_dic[f'{interface_name}_lrms'] = lrms
        result_dic[f'{interface_name}_fnat'] = fnat
        result_dic[f'{interface_name}_capri'] = capri
        result_dic[f'{interface_name}_dockq'] = dockq_score
        result_dic[f'{interface_name}_order'] = order

    return result_dic
class TestPDB(unittest.TestCase):
    """Tests for the ``PDB`` helper class, driven by fixture files in ``data_path``.

    Files produced by a test are registered for removal with ``addCleanup``
    so they are deleted even when an assertion fails, keeping the shared
    data directory clean for the other tests.
    """

    def setUp(self):
        self.PDB = PDB()

    @staticmethod
    def _remove_if_present(path):
        """Delete ``path``; a missing file is fine (the test may have failed early)."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _remove_after_test(self, *paths):
        """Register ``paths`` to be deleted once the current test finishes."""
        for path in paths:
            self.addCleanup(self._remove_if_present, path)

    def test_treat_ensemble(self):
        self._remove_after_test(f'{data_path}/temp_1.pdb',
                                f'{data_path}/temp_2.pdb',
                                f'{data_path}/temp_ens.pdb')
        copyfile(f'{data_path}/mini_ens.pdb', f'{data_path}/temp_ens.pdb')
        input_pdb_dic = {'mol1': f'{data_path}/temp_ens.pdb'}

        treated_dic = self.PDB.treat_ensemble(input_pdb_dic)

        expected_treated_dic = {'mol1': [f'{data_path}/temp_1.pdb', f'{data_path}/temp_2.pdb']}
        self.assertEqual(treated_dic, expected_treated_dic)
        self.assertTrue(filecmp.cmp(f'{data_path}/temp_1.pdb', f'{data_path}/mini_ens1.pdb'))
        self.assertTrue(filecmp.cmp(f'{data_path}/temp_2.pdb', f'{data_path}/mini_ens2.pdb'))

    def test_load_structure(self):
        pdb_f = f'{data_path}/miniA.pdb'

        pdb_dic = self.PDB.load_structure(pdb_f)

        expected_pdb_dic = {
            'A': [
                'ATOM 2 CA MET A 1 16.967 12.784 4.338 1.00 10.80 A C \n',
                'ATOM 9 CA ARG A 2 13.856 11.469 6.066 1.00 8.31 A C \n',
                'ATOM 16 CA CYS A 3 13.660 10.707 9.787 1.00 5.39 A C \n',
            ]
        }
        self.assertEqual(pdb_dic, expected_pdb_dic)

    def test_identify_chains(self):
        pdb_f = f'{data_path}/mini.pdb'

        chain_l = self.PDB.identify_chains(pdb_f)

        expected_chain_l = ['A', 'B', 'C']
        self.assertEqual(chain_l, expected_chain_l)

    def test_identify_segids(self):
        pdb_f = f'{data_path}/miniA.pdb'

        segid_l = self.PDB.identify_segids(pdb_f)

        expected_segid_l = ['A']
        self.assertEqual(segid_l, expected_segid_l)

    def test_split_models(self):
        ensamble_f = f'{data_path}/mini_ens.pdb'

        model_list = self.PDB.split_models(ensamble_f)
        # the files to clean up are whatever split_models reports it wrote
        self._remove_after_test(*model_list)

        expected_list = [f'{data_path}/mini_1.pdb', f'{data_path}/mini_2.pdb']
        self.assertEqual(model_list, expected_list, 'Name of list elements differ')
        self.assertTrue(filecmp.cmp(f'{data_path}/mini_1.pdb', f'{data_path}/mini_1.gold'))
        self.assertTrue(filecmp.cmp(f'{data_path}/mini_2.pdb', f'{data_path}/mini_2.gold'))

    def test_fix_id(self):
        self._remove_after_test(f'{data_path}/mini.pdb_',
                                f'{data_path}/mini_nochain.pdb_')
        nosegid_pdb_f = f'{data_path}/mini.pdb'
        nochain_pdb_f = f'{data_path}/mini_nochain.pdb'

        segid_pdb = self.PDB.fix_id(nosegid_pdb_f, priority='chain', overwrite=False)
        chain_pdb = self.PDB.fix_id(nochain_pdb_f, priority='seg', overwrite=False)

        self.assertTrue(filecmp.cmp(segid_pdb, f'{data_path}/mini_segid.pdb'))
        self.assertTrue(filecmp.cmp(chain_pdb, f'{data_path}/mini_segid.pdb'))

    def test_add_chainseg(self):
        self._remove_after_test(f'{data_path}/temp.pdb')
        copyfile(f'{data_path}/mini.pdb', f'{data_path}/temp.pdb')

        check = self.PDB.add_chainseg(f'{data_path}/temp.pdb', 'A')

        self.assertTrue(check)
        self.assertTrue(filecmp.cmp(f'{data_path}/temp.pdb', f'{data_path}/miniA.pdb'))

    def test_identify_chainseg(self):
        pdbf = f'{data_path}/miniA.pdb'

        chainseg = self.PDB.identify_chainseg(pdbf)

        self.assertEqual(chainseg, ['A'])

    def test_fix_chainseg(self):
        self._remove_after_test(f'{data_path}/mol1.pdb', f'{data_path}/mol2.pdb')
        copyfile(f'{data_path}/mini_1.gold', f'{data_path}/mol1.pdb')
        copyfile(f'{data_path}/mini_2.gold', f'{data_path}/mol2.pdb')
        input_pdb_dic = {'mol1': f'{data_path}/mol1.pdb',
                         'segid1': 'X',
                         'mol2': f'{data_path}/mol2.pdb'}

        return_pdb_dic = self.PDB.fix_chainseg(input_pdb_dic)

        expected_return_dic = {'mol1': f'{data_path}/mol1.pdb',
                               'mol2': f'{data_path}/mol2.pdb'}
        self.assertEqual(return_pdb_dic, expected_return_dic)
        self.assertTrue(filecmp.cmp(f'{data_path}/mol1.pdb', f'{data_path}/miniX.pdb'))

    def test_sanitize(self):
        self._remove_after_test(f'{data_path}/temp.pdb')
        copyfile(f'{data_path}/mini.dirty.pdb', f'{data_path}/temp.pdb')
        input_pdb_dic = {'mol1': [f'{data_path}/temp.pdb']}

        model_list = self.PDB.sanitize(input_pdb_dic)

        expected_model_list = [f'{data_path}/temp.pdb']
        self.assertEqual(model_list, expected_model_list)
        self.assertTrue(filecmp.cmp(f'{data_path}/temp.pdb', f'{data_path}/mini.clean.pdb'))

    def test_count_atoms(self):
        pdb_f = f'{data_path}/mini.pdb'

        atom_count = self.PDB.count_atoms(pdb_f)

        self.assertEqual(atom_count, 3)

    def test_organize_chains(self):
        # Placeholder: report as skipped rather than as a passing test.
        self.skipTest('test for organize_chains not implemented yet')

    def test_replace_chain(self):
        self._remove_after_test(f'{data_path}/mini.pdb_')
        pdb_f = f'{data_path}/mini.pdb'

        newchain_pdb = self.PDB.replace_chain(pdb_f, 'A', 'X', overwrite=False)

        self.assertTrue(filecmp.cmp(newchain_pdb, f'{data_path}/mini_A-X.pdb'))

    def test_renumber(self):
        # Placeholder: report as skipped rather than as a passing test.
        self.skipTest('test for renumber not implemented yet')

    def test_load_seq(self):
        # Placeholder: report as skipped rather than as a passing test.
        self.skipTest('test for load_seq not implemented yet')
def prepare_input(pdb_input, psf_input=None):
    """Write the input section of a CNS recipe.

    This section is written for any recipe; even if some CNS variables are
    not used by a given recipe, defining them should not be an issue.

    Args:
        pdb_input: a single PDB path (``str``) or a ``list``/``tuple`` of
            PDB paths to be loaded as coordinates.
        psf_input: optional PSF path (``str``) or ``list`` of PSF paths
            to be loaded inside a ``structure ... end`` block.

    Returns:
        The CNS input header as a string. It contains the structure and
        coordinate statements, ``$ncomponents``, one ``$prot_segid_molN``
        per chain/segment reported by ``PDB.identify_chainseg``, the
        restraint file variables (empty when the file is not found under
        ``data/``) and a random ``$seed`` between 100 and 999.
    """
    parts = ['\n! Input structure\n']

    if psf_input:
        if isinstance(psf_input, str):
            parts.append('structure\n')
            parts.append(f' @@{psf_input}\n')
            parts.append('end\n')
        if isinstance(psf_input, list):
            parts.append('structure\n')
            parts.extend(f' @@{psf}\n' for psf in psf_input)
            parts.append('end\n')

    if isinstance(pdb_input, str):
        if psf_input:
            parts.append(f'coor @@{pdb_input}\n')
        # $file variable is still used by some CNS recipes, need refactoring!
        parts.append(f'eval ($file="{pdb_input}")\n')

    if isinstance(pdb_input, (list, tuple)):
        parts.extend(f'coor @@{pdb}\n' for pdb in pdb_input)

    chainsegs = PDB.identify_chainseg(pdb_input)
    parts.append(f'eval ($ncomponents={len(chainsegs)})\n')
    for mol_index, segid in enumerate(chainsegs, start=1):
        parts.append(f'eval ($prot_segid_mol{mol_index}="{segid}")\n')

    # (restraint file, CNS variable that receives its path); a missing file
    # sets the SAME variable to an empty string.
    restraint_tables = (
        ('data/ambig.tbl', 'ambig_fname'),
        ('data/unambig.tbl', 'unambig_fname'),
        ('data/hbond.tbl', 'hbond_fname'),
        ('data/dihe.tbl', 'dihe_fname'),
        ('data/tensor.tbl', 'tensor_tbl'),
    )
    for tbl_path, cns_var in restraint_tables:
        matches = glob.glob(tbl_path)
        tbl_fname = matches[0] if matches else ''
        parts.append(f'eval (${cns_var}="{tbl_fname}")\n')

    seed = random.randint(100, 999)
    parts.append(f'eval ($seed={seed})\n')

    return ''.join(parts)