示例#1
0
	def match_renumber(self, reference_pdb):
		"""Match the chains and renumber the structures according to a reference PDB.

		Every structure listed in ``self.structure_dic`` is compared chain by chain
		against the reference with pairwise ClustalO alignments. For each reference
		chain one target chain is selected, passed to ``PDB.replace_chain`` to take
		the reference chain ID and then to ``PDB.renumber`` with the residue mapping
		derived from the alignment.

		:param reference_pdb: path of the reference PDB, or the string ``'lowest'``
			to use the structure returned by ``self.fetch_lowest()``.
		:return: ``False`` when the ClustalO executable cannot be found, ``True``
			once every structure has been processed.
		"""
		clustalo_exe = ini.get('third_party', 'clustalo_exe')

		if not shutil.which(clustalo_exe):
			print('\n+ ClustalO not correctly configured in haddock3.ini')
			print('+ WARNING: matching not possible!')
			return False

		print('\n+ Running automated chain matching and renumbering')
		print('+ WARNING: Use with caution, some residues could be deleted')

		if reference_pdb == 'lowest':
			# fetch_lowest() returns a (pdb, score) pair; only the structure is needed here
			reference_pdb, _ = self.fetch_lowest()

		if ' ' not in PDB.identify_chains(reference_pdb):
			reference_pdb = PDB.fix_id(reference_pdb, priority='chain')
		if ' ' not in PDB.identify_chainseg(reference_pdb):
			reference_pdb = PDB.fix_id(reference_pdb, priority='seg')

		reference_seq_dic = PDB.load_seq(reference_pdb)
		reference_chains = sorted(PDB.identify_chains(reference_pdb))

		for pdb in sorted(self.structure_dic):
			# match the chains with sequence alignment
			target_seq_dic = PDB.load_seq(pdb)
			pdb = PDB.fix_id(pdb, priority='seg')

			# Get what chains are present in the target
			target_chains = sorted(PDB.identify_chains(pdb))

			# Do a combinatorial alignment to check which chains match better
			identity_dic = {}
			for ref_chain, target_chain in itertools.product(reference_chains, target_chains):
				# residue identifiers, in sequence order (built once per pair, not per column)
				ref_resnums = list(reference_seq_dic[ref_chain])
				target_resnums = list(target_seq_dic[target_chain])
				ref_seq = ''.join(reference_seq_dic[ref_chain].values())
				target_seq = ''.join(target_seq_dic[target_chain].values())

				# Close the file before ClustalO reads it so the content is flushed to disk
				with open('seq.fasta', 'w') as fasta_fh:
					fasta_fh.write(f'>ref\n{ref_seq}\n>target\n{target_seq}\n')

				# Argument list instead of a split string: survives spaces in the executable path
				cmd = [clustalo_exe, '-i', 'seq.fasta', '--outfmt=clu', '--resno', '--wrap=9000', '--force']
				try:
					p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
					out = p.communicate()
				finally:
					os.remove('seq.fasta')

				aln_text = out[0].decode('utf-8')
				aln_data = aln_text.split()
				# NOTE(review): assumes the clustal output is a 5-token header followed by
				#  "name sequence resno" per row, hence tokens 6 and 9 - confirm against ClustalO
				ref_aln = aln_data[6]
				target_aln = aln_data[9]

				counter_a = 0
				counter_b = 0
				numbering_dic = {}
				for ref_char, target_char in zip(ref_aln, target_aln):
					# Either sequence may already be exhausted while the other still has
					#  trailing columns; use '-' as placeholder instead of raising IndexError
					ref_resnum = ref_resnums[counter_a] if counter_a < len(ref_resnums) else '-'
					target_resnum = target_resnums[counter_b] if counter_b < len(target_resnums) else '-'

					ref_is_residue = '-' not in ref_char
					target_is_residue = '-' not in target_char
					if ref_is_residue:
						counter_a += 1
					if target_is_residue:
						counter_b += 1
					if ref_is_residue and target_is_residue:
						numbering_dic[target_resnum] = ref_resnum

				# '*' marks identical columns in the clustal consensus line
				identity = aln_text.count('*') / float(len(ref_seq))
				coverage = len(numbering_dic) / len(ref_aln)
				identity_dic.setdefault(ref_chain, []).append((target_chain, identity, coverage, numbering_dic))

			# do the renumbering
			for i, ref_c in enumerate(reference_chains):
				candidates = identity_dic[ref_c]
				# best candidate first: highest coverage, ties broken by highest identity
				ranked = sorted(candidates, key=lambda c: (-c[2], -c[1]))
				# create a catalog with possible numbering references
				numbering_dic_catalog = {c[0]: c[3] for c in candidates}

				if len({c[1] for c in ranked}) == 1:
					# all identities are equal (homo-oligomer), match is sequential
					selected_chain = target_chains[i]
				else:
					# get the highest coverage/identity
					selected_chain = ranked[0][0]

				# select the correct numbering dictionary
				selected_numbering_dic = numbering_dic_catalog[selected_chain]
				# just for readability:
				old_chain = selected_chain
				new_chain = ref_c

				# replace the target chain (old) with the same observed in the reference (new)
				chain_matched_pdb = PDB.replace_chain(pdb, old_chain, new_chain)

				# renumber!
				#  Note, if residue is present in target and not in reference it will be DELETED, use with caution
				PDB.renumber(chain_matched_pdb, selected_numbering_dic, new_chain, overwrite=True)

		return True
示例#2
0
def dockq(ref, pdb_f, dockq_exec):
    """Run DockQ for every pair of chains of a model against a reference.

    Both files are first passed through ``PDB.fix_id``. When the chain/segment
    identifiers of the two structures differ, the model is skipped and a single
    set of zero/empty values is returned under keys with an empty interface
    name (``'_irms'``, ``'_lrms'``, ...).

    :param ref: path of the reference PDB.
    :param pdb_f: path of the model PDB to evaluate.
    :param dockq_exec: DockQ command; it is split on whitespace together with
        its arguments, so it may itself contain arguments.
    :return: dict with, per chain pair ``XY``, the keys ``XY_irms``, ``XY_lrms``,
        ``XY_fnat``, ``XY_capri``, ``XY_dockq`` and ``XY_order``.
    """
    ref = PDB.fix_id(ref)
    pdb_f = PDB.fix_id(pdb_f)

    reference_chains = PDB.identify_chainseg(ref)
    pdb_chains = PDB.identify_chainseg(pdb_f)

    result_dic = {}

    if reference_chains != pdb_chains:
        print(
            f'+ WARNING: Skipping {pdb_f}, number of chains do not match. Expected {len(reference_chains)} found {len(pdb_chains)}'
        )
        interface_name = ''
        result_dic[f'{interface_name}_irms'] = .0
        result_dic[f'{interface_name}_lrms'] = .0
        result_dic[f'{interface_name}_fnat'] = .0
        result_dic[f'{interface_name}_capri'] = ''
        result_dic[f'{interface_name}_dockq'] = .0
        result_dic[f'{interface_name}_order'] = ''
        return result_dic

    for comb in itertools.combinations(pdb_chains, 2):

        interface_name = ''.join(comb)

        # Reset for every pair: otherwise a pair whose DockQ output lacks a line
        # would silently report the value parsed for the previous pair.
        irms = .0
        lrms = .0
        fnat = .0
        dockq_score = .0
        capri = None
        order = None

        cmd = f'{dockq_exec} {pdb_f} {ref} -native_chain1 {comb[0]} {comb[1]} -perm1'

        p = subprocess.run(cmd.split(), stdout=subprocess.PIPE)
        out = p.stdout.decode('utf-8').split('\n')

        for line in out:
            if line.startswith('Best score'):
                fields = line.split()
                order = f'{fields[-4]}->{fields[-1]}'
            elif line.startswith('Fnat'):
                fnat = float(line.split()[1])
            elif line.startswith('iRMS'):
                irms = float(line.split()[1])
            elif line.startswith('LRMS'):
                lrms = float(line.split()[1])
            elif line.startswith('DockQ_CAPRI'):
                # must be tested before the plain 'DockQ' prefix below
                capri = line.split()[1]
            elif line.startswith('DockQ'):
                dockq_score = float(line.split()[1])

        result_dic[f'{interface_name}_irms'] = irms
        result_dic[f'{interface_name}_lrms'] = lrms
        result_dic[f'{interface_name}_fnat'] = fnat
        result_dic[f'{interface_name}_capri'] = capri
        result_dic[f'{interface_name}_dockq'] = dockq_score
        result_dic[f'{interface_name}_order'] = order

    return result_dic
示例#3
0
class TestPDB(unittest.TestCase):
	"""Unit tests for the PDB helper class, run against the files in ``data_path``.

	Files produced by a test are removed through ``addCleanup`` so they are
	deleted even when an assertion fails and cannot leak into later tests.
	"""

	def setUp(self):
		self.PDB = PDB()

	@staticmethod
	def _remove_if_present(path):
		# tolerate a missing file: the test may have failed before creating it
		if os.path.exists(path):
			os.remove(path)

	def _remove_after_test(self, *paths):
		# schedule removal of each path for when the test ends, pass or fail
		for path in paths:
			self.addCleanup(self._remove_if_present, path)

	def test_treat_ensemble(self):
		temp_ens = f'{data_path}/temp_ens.pdb'
		temp_models = [f'{data_path}/temp_1.pdb', f'{data_path}/temp_2.pdb']
		copyfile(f'{data_path}/mini_ens.pdb', temp_ens)
		self._remove_after_test(temp_ens, *temp_models)

		input_pdb_dic = {'mol1': temp_ens}

		treated_dic = self.PDB.treat_ensemble(input_pdb_dic)
		expected_treated_dic = {'mol1': temp_models}

		self.assertEqual(treated_dic, expected_treated_dic)
		self.assertTrue(filecmp.cmp(f'{data_path}/temp_1.pdb', f'{data_path}/mini_ens1.pdb'))
		self.assertTrue(filecmp.cmp(f'{data_path}/temp_2.pdb', f'{data_path}/mini_ens2.pdb'))

	def test_load_structure(self):
		pdb_f = f'{data_path}/miniA.pdb'
		pdb_dic = self.PDB.load_structure(pdb_f)
		expected_pdb_dic = {'A': ['ATOM      2  CA  MET A   1      16.967  12.784   4.338  1.00 10.80      A    C  \n',
							  'ATOM      9  CA  ARG A   2      13.856  11.469   6.066  1.00  8.31      A    C  \n',
							  'ATOM     16  CA  CYS A   3      13.660  10.707   9.787  1.00  5.39      A    C  \n']}
		self.assertEqual(pdb_dic, expected_pdb_dic)

	def test_identify_chains(self):
		pdb_f = f'{data_path}/mini.pdb'
		chain_l = self.PDB.identify_chains(pdb_f)
		expected_chain_l = ['A', 'B', 'C']
		self.assertEqual(chain_l, expected_chain_l)

	def test_identify_segids(self):
		pdb_f = f'{data_path}/miniA.pdb'
		segid_l = self.PDB.identify_segids(pdb_f)
		expected_segid_l = ['A']
		self.assertEqual(segid_l, expected_segid_l)

	def test_split_models(self):
		ensamble_f = f'{data_path}/mini_ens.pdb'
		expected_list = [f'{data_path}/mini_1.pdb', f'{data_path}/mini_2.pdb']
		# only the known output names are removed, never whatever the call returns
		self._remove_after_test(*expected_list)

		model_list = self.PDB.split_models(ensamble_f)

		self.assertEqual(model_list, expected_list, 'Name of list elements differ')
		self.assertTrue(filecmp.cmp(f'{data_path}/mini_1.pdb', f'{data_path}/mini_1.gold'))
		self.assertTrue(filecmp.cmp(f'{data_path}/mini_2.pdb', f'{data_path}/mini_2.gold'))

	def test_fix_id(self):
		nosegid_pdb_f = f'{data_path}/mini.pdb'
		nochain_pdb_f = f'{data_path}/mini_nochain.pdb'
		self._remove_after_test(f'{data_path}/mini.pdb_', f'{data_path}/mini_nochain.pdb_')

		segid_pdb = self.PDB.fix_id(nosegid_pdb_f, priority='chain', overwrite=False)
		chain_pdb = self.PDB.fix_id(nochain_pdb_f, priority='seg', overwrite=False)

		self.assertTrue(filecmp.cmp(segid_pdb, f'{data_path}/mini_segid.pdb'))
		self.assertTrue(filecmp.cmp(chain_pdb, f'{data_path}/mini_segid.pdb'))

	def test_add_chainseg(self):
		temp_pdb = f'{data_path}/temp.pdb'
		copyfile(f'{data_path}/mini.pdb', temp_pdb)
		self._remove_after_test(temp_pdb)

		check = self.PDB.add_chainseg(temp_pdb, 'A')

		self.assertTrue(check)
		self.assertTrue(filecmp.cmp(temp_pdb, f'{data_path}/miniA.pdb'))

	def test_identify_chainseg(self):
		pdbf = f'{data_path}/miniA.pdb'

		chainseg = self.PDB.identify_chainseg(pdbf)

		self.assertEqual(chainseg, ['A'])

	def test_fix_chainseg(self):
		mol1 = f'{data_path}/mol1.pdb'
		mol2 = f'{data_path}/mol2.pdb'
		copyfile(f'{data_path}/mini_1.gold', mol1)
		copyfile(f'{data_path}/mini_2.gold', mol2)
		self._remove_after_test(mol1, mol2)

		input_pdb_dic = {'mol1': mol1, 'segid1': 'X', 'mol2': mol2}

		return_pdb_dic = self.PDB.fix_chainseg(input_pdb_dic)
		expected_return_dic = {'mol1': mol1, 'mol2': mol2}

		self.assertEqual(return_pdb_dic, expected_return_dic)
		self.assertTrue(filecmp.cmp(mol1, f'{data_path}/miniX.pdb'))

	def test_sanitize(self):
		temp_pdb = f'{data_path}/temp.pdb'
		copyfile(f'{data_path}/mini.dirty.pdb', temp_pdb)
		self._remove_after_test(temp_pdb)

		input_pdb_dic = {'mol1': [temp_pdb]}

		model_list = self.PDB.sanitize(input_pdb_dic)

		expected_model_list = [temp_pdb]

		self.assertEqual(model_list, expected_model_list)
		self.assertTrue(filecmp.cmp(temp_pdb, f'{data_path}/mini.clean.pdb'))

	def test_count_atoms(self):
		pdb_f = f'{data_path}/mini.pdb'
		atom_count = self.PDB.count_atoms(pdb_f)
		self.assertEqual(atom_count, 3)

	def test_organize_chains(self):
		# TODO: not implemented yet, currently passes without checking anything
		pass

	def test_replace_chain(self):
		pdb_f = f'{data_path}/mini.pdb'
		self._remove_after_test(f'{data_path}/mini.pdb_')

		newchain_pdb = self.PDB.replace_chain(pdb_f, 'A', 'X', overwrite=False)

		self.assertTrue(filecmp.cmp(newchain_pdb, f'{data_path}/mini_A-X.pdb'))

	def test_renumber(self):
		# TODO: not implemented yet, currently passes without checking anything
		pass

	def test_load_seq(self):
		# TODO: not implemented yet, currently passes without checking anything
		pass

	def tearDown(self):
		# nothing to do here: file removal is handled by the cleanups registered in each test
		pass
示例#4
0
文件: input.py 项目: SSchott/haddock3
	def prepare_input(pdb_input, psf_input=None):
		"""Write the input section of a CNS recipe.

		This section is written for any recipe; CNS variables that a given recipe
		does not use are emitted anyway.

		:param pdb_input: path of a single PDB (str) or a list/tuple of PDB paths.
		:param psf_input: optional PSF path (str) or list/tuple of PSF paths; when
			given, a ``structure ... end`` block is emitted first.
		:return: the generated CNS input text. It holds the structure/coordinate
			statements, ``$ncomponents``, one ``$prot_segid_molN`` per chain/segment
			reported by ``PDB.identify_chainseg``, the restraint file variables
			(empty string when the file is absent under ``data/``) and a random
			three-digit ``$seed``.
		"""
		lines = ['\n! Input structure\n']

		if psf_input:
			if isinstance(psf_input, str):
				psf_files = [psf_input]
			elif isinstance(psf_input, (list, tuple)):
				psf_files = list(psf_input)
			else:
				# unsupported type: no structure block, as before
				psf_files = None

			if psf_files is not None:
				lines.append('structure\n')
				lines.extend(f'  @@{psf}\n' for psf in psf_files)
				lines.append('end\n')

		if isinstance(pdb_input, str):
			# a single PDB gets its coordinates read only when a PSF was given
			if psf_input:
				lines.append(f'coor @@{pdb_input}\n')

			# $file variable is still used by some CNS recipes, need refactoring!
			lines.append(f'eval ($file="{pdb_input}")\n')
		elif isinstance(pdb_input, (list, tuple)):
			lines.extend(f'coor @@{pdb}\n' for pdb in pdb_input)

		chainsegs = PDB.identify_chainseg(pdb_input)
		lines.append(f'eval ($ncomponents={len(chainsegs)})\n')
		for mol_number, segid in enumerate(chainsegs, start=1):
			lines.append(f'eval ($prot_segid_mol{mol_number}="{segid}")\n')

		# (file looked up, CNS variable receiving its name or "" when absent)
		restraint_files = (
			('data/ambig.tbl', 'ambig_fname'),
			('data/unambig.tbl', 'unambig_fname'),
			('data/hbond.tbl', 'hbond_fname'),
			('data/dihe.tbl', 'dihe_fname'),
			# a missing tensor file must clear $tensor_tbl, not $dihe_fname
			('data/tensor.tbl', 'tensor_tbl'),
		)
		for pattern, cns_var in restraint_files:
			matches = glob.glob(pattern)
			fname = matches[0] if matches else ''
			lines.append(f'eval (${cns_var}="{fname}")\n')

		seed = random.randint(100, 999)
		lines.append(f'eval ($seed={seed})\n')

		return ''.join(lines)