def test_internal_vina():
    """Compare internal vs original Vina partial scores"""
    ligands_file = os.path.join(test_data_dir,
                                'data/dude/xiap/actives_docked.sdf')
    mols = list(oddt.toolkit.readfile('sdf', ligands_file))
    for mol in mols:
        mol.addh()

    receptor_file = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    rec = next(oddt.toolkit.readfile('pdb', receptor_file))
    rec.protein = True
    rec.addh()

    # Delete molecule which has differences in Acceptor-Donor def in RDK and OB
    del mols[65]

    vina_scores = [
        'vina_gauss1',
        'vina_gauss2',
        'vina_repulsion',
        'vina_hydrophobic',
        'vina_hydrogen',
    ]

    # Reference values are read from a CSV file shipped with the test data.
    reference_file = os.path.join(
        test_data_dir, 'data/results/xiap/autodock_vina_scores.csv')
    autodock_vina_results = np.loadtxt(reference_file,
                                       delimiter=',',
                                       dtype=np.float64)

    generator = oddt_vina_descriptor(protein=rec, vina_scores=vina_scores)
    oddt_vina_results = generator.build(mols)

    assert_array_almost_equal(oddt_vina_results,
                              autodock_vina_results,
                              decimal=4)
def __init__(self, protein=None):
    """Descriptor built from the binana script (as used in NNScore 2.0).

    Creates the Vina partial-score generator and two close-contact
    generators (cutoffs 4 and 2.5) over fixed tables of AutoDock4 atom
    type pairs.

    Parameters
    ----------
    protein: oddt.toolkit.Molecule object (default=None)
        Protein object to be used while generating descriptors.
    """
    self.protein = protein
    self.vina = oddt_vina_descriptor(protein,
                                     vina_scores=['vina_affinity',
                                                  'vina_gauss1',
                                                  'vina_gauss2',
                                                  'vina_repulsion',
                                                  'vina_hydrophobic',
                                                  'vina_hydrogen'])

    # Close contacts descriptor generators
    cc_4_types = (
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
        ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'),
        ('BR', 'N'), ('BR', 'SA'),
        ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'),
        ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'),
        ('CU', 'HD'), ('CU', 'N'),
        ('FE', 'NA'), ('FE', 'SA'),
        ('I', 'N'), ('I', 'OA'),
        ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'),
        ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'),
        ('N', 'P'), ('N', 'S'),
        ('NA', 'P'), ('NA', 'S'),
        ('OA', 'P'), ('OA', 'S'),
        ('P', 'S'), ('P', 'SA'), ('P', 'ZN'),
        ('S', 'SA'), ('SA', 'SA'),
        ('A', 'CU'), ('C', 'CD'),
    )
    # Split the pair table into parallel receptor / ligand type sequences.
    cc_4_rec_types, cc_4_lig_types = zip(*cc_4_types)
    self.cc_4 = close_contacts(protein,
                               cutoff=4,
                               protein_types=cc_4_rec_types,
                               ligand_types=cc_4_lig_types,
                               mode='atom_types_ad4',
                               aligned_pairs=True)

    cc_25_types = [
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CD', 'OA'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
    ]
    cc_25_rec_types, cc_25_lig_types = zip(*cc_25_types)
    self.cc_25 = close_contacts(protein,
                                cutoff=2.5,
                                protein_types=cc_25_rec_types,
                                ligand_types=cc_25_lig_types,
                                mode='atom_types_ad4',
                                aligned_pairs=True)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Set up the descriptor generator and random forest model for RF-Score.

    Parameters
    ----------
    protein : oddt.toolkit.Molecule object (default=None)
        Receptor handed to the descriptor generators.

    n_jobs : int (default=-1)
        Forwarded to the random forest model as ``n_jobs``.

    version : int (default=1)
        RF-Score variant. Versions 1 and 2 use close contacts only (a
        single cutoff of 12 for v1, binned cutoffs 0..12 for v2); version 3
        combines Vina partial scores with the v1 close contacts.

    spr : int (default=0)
        Stored on the instance as ``self.spr``; not used in this method.

    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.
    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    if version == 1:
        cutoff = 12
        mtry = 6
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 3:
        cutoff = 12
        mtry = 6
        cc = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
        vina_scores = [
            'vina_gauss1',
            'vina_gauss2',
            'vina_repulsion',
            'vina_hydrophobic',
            'vina_hydrogen',
            'vina_num_rotors',
        ]
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        descriptors = ensemble_descriptor((vina, cc))
    else:
        # Without this branch `mtry` and `descriptors` stay unbound and the
        # failure only shows up below as an UnboundLocalError.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def test_ensemble_descriptor():
    """Check that an ensemble descriptor concatenates its components.

    Verifies the ensemble length, that ``set_protein`` reaches both
    component generators, and that the built array equals the horizontally
    stacked outputs of the components.
    """
    mols = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    for mol in mols:
        mol.addh()

    rec = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    rec.protein = True
    rec.addh()

    desc1 = rfscore(version=1).descriptor_generator
    desc2 = oddt_vina_descriptor()
    ensemble = ensemble_descriptor((desc1, desc2))
    ensemble.set_protein(rec)

    assert len(ensemble) == len(desc1) + len(desc2)

    # set protein
    assert desc1.protein == rec
    assert desc2.protein == rec

    ensemble_scores = ensemble.build(mols)
    scores1 = desc1.build(mols)
    scores2 = desc2.build(mols)
    expected_scores = np.hstack((scores1, scores2))
    assert_array_almost_equal(ensemble_scores, expected_scores)
def __init__(self, protein=None):
    """Descriptor built from the binana script (as used in NNScore 2.0).

    Besides creating the Vina and close-contact generators, this fills
    ``self.titles`` with one name per descriptor column and checks that
    the number of titles equals ``len(self)``.

    Parameters
    ----------
    protein: oddt.toolkit.Molecule object (default=None)
        Protein object to be used while generating descriptors.
    """
    self.protein = protein
    self.titles = []

    vina_scores = [
        'vina_gauss1',
        'vina_gauss2',
        'vina_repulsion',
        'vina_hydrophobic',
        'vina_hydrogen',
    ]
    self.vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
    self.titles.extend(self.vina.titles)

    # Close contacts descriptor generators
    cc_4_types = (
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
        ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'),
        ('BR', 'N'), ('BR', 'SA'),
        ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'),
        ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'),
        ('CU', 'HD'), ('CU', 'N'),
        ('FE', 'NA'), ('FE', 'SA'),
        ('I', 'N'), ('I', 'OA'),
        ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'),
        ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'),
        ('N', 'P'), ('N', 'S'),
        ('NA', 'P'), ('NA', 'S'),
        ('OA', 'P'), ('OA', 'S'),
        ('P', 'S'), ('P', 'SA'), ('P', 'ZN'),
        ('S', 'SA'), ('SA', 'SA'),
        ('A', 'CU'), ('C', 'CD'),
    )
    # Split the pair table into parallel receptor / ligand type sequences.
    cc_4_rec_types, cc_4_lig_types = zip(*cc_4_types)
    self.titles.extend('cc_%s.%s_4' % pair for pair in cc_4_types)
    self.cc_4 = close_contacts_descriptor(protein,
                                          cutoff=4,
                                          protein_types=cc_4_rec_types,
                                          ligand_types=cc_4_lig_types,
                                          mode='atom_types_ad4',
                                          aligned_pairs=True)

    self.ele_types = (
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
        ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'),
        ('BR', 'N'), ('BR', 'SA'),
        ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'),
        ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'),
        ('CU', 'HD'), ('CU', 'N'),
        ('FE', 'NA'), ('FE', 'SA'),
        ('I', 'N'), ('I', 'OA'),
        ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'),
        ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'),
        ('N', 'P'), ('N', 'S'),
        ('NA', 'P'), ('NA', 'S'),
        ('OA', 'P'), ('OA', 'S'),
        ('P', 'S'), ('P', 'SA'), ('P', 'ZN'),
        ('S', 'SA'), ('SA', 'SA'),
    )
    self.titles.extend('ele_%s.%s_4' % pair for pair in self.ele_types)

    self.ligand_atom_types = [
        'A', 'BR', 'C', 'CL', 'F', 'HD', 'I', 'N', 'NA', 'OA', 'P', 'S',
        'SA',
    ]
    self.titles.extend('lig_%s' % atom_type
                       for atom_type in self.ligand_atom_types)

    cc_25_types = [
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CD', 'OA'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
    ]
    cc_25_rec_types, cc_25_lig_types = zip(*cc_25_types)
    self.cc_25 = close_contacts_descriptor(protein,
                                           cutoff=2.5,
                                           protein_types=cc_25_rec_types,
                                           ligand_types=cc_25_lig_types,
                                           mode='atom_types_ad4',
                                           aligned_pairs=True)
    self.titles.extend('cc_%s.%s_2.5' % pair for pair in cc_25_types)

    # H-Bonds (<4A)
    self.titles.extend([
        'hb_4_mol_backbone_alpha',
        'hb_4_mol_backbone_beta',
        'hb_4_mol_backbone_other',
        'hb_4_mol_sidechain_alpha',
        'hb_4_mol_sidechain_beta',
        'hb_4_mol_sidechain_other',
        'hb_4_rec_backbone_alpha',
        'hb_4_rec_backbone_beta',
        'hb_4_rec_backbone_other',
        'hb_4_rec_sidechain_alpha',
        'hb_4_rec_sidechain_beta',
        'hb_4_rec_sidechain_other',
    ])

    # Hydrophobic Contact <4A
    self.titles.extend([
        'hyd_4_backbone_alpha',
        'hyd_4_backbone_beta',
        'hyd_4_backbone_other',
        'hyd_4_sidechain_alpha',
        'hyd_4_sidechain_beta',
        'hyd_4_sidechain_other',
        'hyd_4_all',
    ])

    # Pi-stacking (<7.5A)
    self.titles.extend([
        'pi_stack_7.5_alpha',
        'pi_stack_7.5_beta',
        'pi_stack_7.5_other',
    ])

    # T-shaped Pi-Pi interaction
    self.titles.extend([
        'pi_t_7.5_alpha',
        'pi_t_7.5_beta',
        'pi_t_7.5_other',
    ])

    # Pi-cation (<6A)
    self.titles.extend([
        'pi_cat_mol_6_alpha',
        'pi_cat_mol_6_beta',
        'pi_cat_mol_6_other',
        'pi_cat_rec_6_alpha',
        'pi_cat_rec_6_beta',
        'pi_cat_rec_6_other',
    ])

    # Active site flexibility (<4A)
    self.titles.extend([
        'as_flex_backbone_alpha',
        'as_flex_backbone_beta',
        'as_flex_backbone_other',
        'as_flex_sidechain_alpha',
        'as_flex_sidechain_beta',
        'as_flex_sidechain_other',
        'as_flex_all',
    ])

    # Salt bridges (<5.5)
    self.titles.extend([
        'salt_bridge_5.5_alpha',
        'salt_bridge_5.5_beta',
        'salt_bridge_5.5_other',
        'salt_bridge_5.5_all',
    ])

    # Rotatable bonds
    self.titles.append('num_rotors')

    # Internal consistency check: one title per descriptor column.
    assert len(self.titles) == len(self)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Scoring function implementing RF-Score variants. It predicts the
    binding affinity (pKi/d) of ligand in a complex utilizing simple
    descriptors (close contacts of atoms <12A) with sophisticated
    machine-learning model (random forest). The third variant supplements
    those contacts with Vina partial scores.

    For further details see RF-Score publications v1[1]_, v2[2]_, v3[3]_.

    Parameters
    ----------
    protein : oddt.toolkit.Molecule object
        Receptor for the scored ligands

    n_jobs: int (default=-1)
        Number of cores to use for scoring and training. By default (-1)
        all cores are allocated.

    version: int (default=1)
        Scoring function variant. The default is the simplest one (v1).
        Supported values are 1, 2 and 3.

    spr: int (default=0)
        The minimum number of contacts in each pair of atom types in
        the training set for the column to be included in training.
        This is a way of removal of not frequent and empty contacts.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.

    References
    ----------
    .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
        predicting protein-ligand binding affinity with applications to
        molecular docking. Bioinformatics. 2010;26: 1169-1175.
        doi:10.1093/bioinformatics/btq112

    .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
        chemical description of protein-ligand complexes lead to more
        accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
        944-955. doi:10.1021/ci500091r

    .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
        Using Random Forest: The Growing Accuracy of Binding Affinity
        Prediction by the Effective Exploitation of Larger Data Sets.
        Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
        doi:10.1002/minf.201400132

    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    if version == 1:
        cutoff = 12
        mtry = 6
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 3:
        cutoff = 12
        mtry = 6
        cc = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
        vina_scores = ['vina_gauss1',
                       'vina_gauss2',
                       'vina_repulsion',
                       'vina_hydrophobic',
                       'vina_hydrogen',
                       'vina_num_rotors']
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        descriptors = ensemble_descriptor((vina, cc))
    else:
        # Fail here with a clear message; otherwise `mtry`/`descriptors`
        # are unbound and the model construction below raises
        # UnboundLocalError instead.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Scoring function implementing RF-Score variants. It predicts the
    binding affinity (pKi/d) of ligand in a complex utilizing simple
    descriptors (close contacts of atoms <12A) with sophisticated
    machine-learning model (random forest). The third variant supplements
    those contacts with Vina partial scores.

    For further details see RF-Score publications v1[1]_, v2[2]_, v3[3]_.

    Parameters
    ----------
    protein : oddt.toolkit.Molecule object
        Receptor for the scored ligands

    n_jobs: int (default=-1)
        Number of cores to use for scoring and training. By default (-1)
        all cores are allocated.

    version: int (default=1)
        Scoring function variant. The default is the simplest one (v1).
        Supported values are 1, 2 and 3.

    spr: int (default=0)
        The minimum number of contacts in each pair of atom types in
        the training set for the column to be included in training.
        This is a way of removal of not frequent and empty contacts.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.

    References
    ----------
    .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
        predicting protein-ligand binding affinity with applications to
        molecular docking. Bioinformatics. 2010;26: 1169-1175.
        doi:10.1093/bioinformatics/btq112

    .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
        chemical description of protein-ligand complexes lead to more
        accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
        944-955. doi:10.1021/ci500091r

    .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
        Using Random Forest: The Growing Accuracy of Binding Affinity
        Prediction by the Effective Exploitation of Larger Data Sets.
        Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
        doi:10.1002/minf.201400132

    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    if version == 1:
        cutoff = 12
        mtry = 6
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
    elif version == 3:
        cutoff = 12
        mtry = 6
        cc = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
        vina_scores = [
            'vina_gauss1',
            'vina_gauss2',
            'vina_repulsion',
            'vina_hydrophobic',
            'vina_hydrogen',
            'vina_num_rotors',
        ]
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        descriptors = ensemble_descriptor((vina, cc))
    else:
        # NOTE: a disabled "version == 5" branch (same binned cutoffs and
        # mtry as version 2) used to sit here as commented-out code; it
        # was never active, so version 5 is rejected like any other
        # unsupported value. Without this branch `mtry`/`descriptors`
        # would be unbound and the failure would surface below as an
        # UnboundLocalError.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def __init__(self, protein=None):
    """Descriptor built from the binana script (as used in NNScore 2.0).

    Creates the Vina and close-contact generators and assembles
    ``self.titles``, one name per descriptor column, then checks that the
    number of titles equals ``len(self)``.

    Parameters
    ----------
    protein: oddt.toolkit.Molecule object (default=None)
        Protein object to be used while generating descriptors.
    """
    # ---- atom type pair tables ----
    cc_4_types = (
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
        ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'),
        ('BR', 'N'), ('BR', 'SA'),
        ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'),
        ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'),
        ('CU', 'HD'), ('CU', 'N'),
        ('FE', 'NA'), ('FE', 'SA'),
        ('I', 'N'), ('I', 'OA'),
        ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'),
        ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'),
        ('N', 'P'), ('N', 'S'),
        ('NA', 'P'), ('NA', 'S'),
        ('OA', 'P'), ('OA', 'S'),
        ('P', 'S'), ('P', 'SA'), ('P', 'ZN'),
        ('S', 'SA'), ('SA', 'SA'),
        ('A', 'CU'), ('C', 'CD'),
    )
    ele_types = (
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
        ('A', 'BR'), ('A', 'I'), ('A', 'P'), ('A', 'S'),
        ('BR', 'N'), ('BR', 'SA'),
        ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'),
        ('CL', 'MN'), ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'),
        ('CU', 'HD'), ('CU', 'N'),
        ('FE', 'NA'), ('FE', 'SA'),
        ('I', 'N'), ('I', 'OA'),
        ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'),
        ('MN', 'NA'), ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'),
        ('N', 'P'), ('N', 'S'),
        ('NA', 'P'), ('NA', 'S'),
        ('OA', 'P'), ('OA', 'S'),
        ('P', 'S'), ('P', 'SA'), ('P', 'ZN'),
        ('S', 'SA'), ('SA', 'SA'),
    )
    ligand_atom_types = ['A', 'BR', 'C', 'CL', 'F', 'HD', 'I', 'N', 'NA',
                         'OA', 'P', 'S', 'SA']
    cc_25_types = [
        ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
        ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
        ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'),
        ('BR', 'C'), ('BR', 'HD'), ('BR', 'OA'),
        ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'), ('C', 'MG'),
        ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'), ('C', 'SA'),
        ('C', 'ZN'),
        ('CD', 'OA'),
        ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'),
        ('CL', 'ZN'),
        ('F', 'HD'), ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'),
        ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
        ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
        ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
        ('HD', 'ZN'),
        ('MG', 'NA'), ('MG', 'OA'),
        ('MN', 'N'), ('MN', 'OA'),
        ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
        ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
        ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'),
        ('S', 'ZN'), ('SA', 'ZN'),
    ]

    # ---- instance state and descriptor generators ----
    self.protein = protein
    self.titles = []

    self.vina = oddt_vina_descriptor(protein,
                                     vina_scores=['vina_gauss1',
                                                  'vina_gauss2',
                                                  'vina_repulsion',
                                                  'vina_hydrophobic',
                                                  'vina_hydrogen'])
    self.titles += self.vina.titles

    # Close contacts descriptor generators
    cc_4_rec_types, cc_4_lig_types = zip(*cc_4_types)
    self.titles += ['cc_%s.%s_4' % (rec_type, lig_type)
                    for rec_type, lig_type in cc_4_types]
    self.cc_4 = close_contacts_descriptor(protein,
                                          cutoff=4,
                                          protein_types=cc_4_rec_types,
                                          ligand_types=cc_4_lig_types,
                                          mode='atom_types_ad4',
                                          aligned_pairs=True)

    self.ele_types = ele_types
    self.titles += ['ele_%s.%s_4' % (rec_type, lig_type)
                    for rec_type, lig_type in self.ele_types]

    self.ligand_atom_types = ligand_atom_types
    self.titles += ['lig_%s' % lig_type
                    for lig_type in self.ligand_atom_types]

    cc_25_rec_types, cc_25_lig_types = zip(*cc_25_types)
    self.cc_25 = close_contacts_descriptor(protein,
                                           cutoff=2.5,
                                           protein_types=cc_25_rec_types,
                                           ligand_types=cc_25_lig_types,
                                           mode='atom_types_ad4',
                                           aligned_pairs=True)
    self.titles += ['cc_%s.%s_2.5' % (rec_type, lig_type)
                    for rec_type, lig_type in cc_25_types]

    # ---- titles of the remaining interaction descriptors ----
    self.titles += [
        # H-Bonds (<4A)
        'hb_4_mol_backbone_alpha',
        'hb_4_mol_backbone_beta',
        'hb_4_mol_backbone_other',
        'hb_4_mol_sidechain_alpha',
        'hb_4_mol_sidechain_beta',
        'hb_4_mol_sidechain_other',
        'hb_4_rec_backbone_alpha',
        'hb_4_rec_backbone_beta',
        'hb_4_rec_backbone_other',
        'hb_4_rec_sidechain_alpha',
        'hb_4_rec_sidechain_beta',
        'hb_4_rec_sidechain_other',
        # Hydrophobic Contact <4A
        'hyd_4_backbone_alpha',
        'hyd_4_backbone_beta',
        'hyd_4_backbone_other',
        'hyd_4_sidechain_alpha',
        'hyd_4_sidechain_beta',
        'hyd_4_sidechain_other',
        'hyd_4_all',
        # Pi-stacking (<7.5A)
        'pi_stack_7.5_alpha',
        'pi_stack_7.5_beta',
        'pi_stack_7.5_other',
        # T-shaped Pi-Pi interaction
        'pi_t_7.5_alpha',
        'pi_t_7.5_beta',
        'pi_t_7.5_other',
        # Pi-cation (<6A)
        'pi_cat_mol_6_alpha',
        'pi_cat_mol_6_beta',
        'pi_cat_mol_6_other',
        'pi_cat_rec_6_alpha',
        'pi_cat_rec_6_beta',
        'pi_cat_rec_6_other',
        # Active site flexibility (<4A)
        'as_flex_backbone_alpha',
        'as_flex_backbone_beta',
        'as_flex_backbone_other',
        'as_flex_sidechain_alpha',
        'as_flex_sidechain_beta',
        'as_flex_sidechain_other',
        'as_flex_all',
        # Salt bridges (<5.5)
        'salt_bridge_5.5_alpha',
        'salt_bridge_5.5_beta',
        'salt_bridge_5.5_other',
        'salt_bridge_5.5_all',
        # Rotatable bonds
        'num_rotors',
    ]

    # Internal consistency check: one title per descriptor column.
    assert len(self.titles) == len(self)