def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train an ensemble of neural networks and save the scoring function.

    Parameters
    ----------
    home_dir : str, optional
        Directory containing ``nnscore_descs.csv``. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : str, optional
        Target passed to ``self.save``. When ``None`` the name
        ``NNScore_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int
        PDBbind release forwarded to ``_load_pdbbind_desc`` and embedded
        in the default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    desc_path = path_join(home_dir, 'nnscore_descs.csv')

    # Forward the requested release. A hard-coded 2016 here made the loaded
    # descriptors disagree with the version written into the default pickle
    # name below.
    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; the original implementation did 1000
    # (100 reportedly gives results that are good enough).
    # TODO: allow user to specify number of nets?
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for i in range(n)))

    # get 20 best: the sort is ascending by test-set score, so the best
    # networks end up at the tail of the list
    trained_nets.sort(
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    # report the same three metrics for the test set, then the train set
    for name, descs, target in (
            ('Test', self.test_descs, self.test_target),
            ('Train', self.train_descs, self.train_target)):
        error = rmse(self.model.predict(descs), target)
        r2 = self.model.score(descs, target)
        r = np.sqrt(r2)
        print('%s set:' % name,
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Fit the RFScore model, report its metrics and save it.

    Parameters
    ----------
    home_dir : str, optional
        Directory containing ``rfscore_descs_v<version>.csv``. When falsy,
        the ``RFScore`` directory next to this module is used.
    sf_pickle : str, optional
        Target passed to ``self.save``. When ``None`` the name
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/RFScore'

    csv_name = 'rfscore_descs_v%i.csv' % self.version
    super(rfscore, self)._load_pdbbind_desc(
        path_join(home_dir, csv_name), pdbbind_version=pdbbind_version)

    # remove sparse dimensions: keep a column only if at least one training
    # row exceeds the threshold; skip the filtering if nothing would remain
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs, self.test_descs = (
                self.train_descs[:, self.mask],
                self.test_descs[:, self.mask],
            )

    # seed the RNG before fitting so that the result is reproducible
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print('Training RFScore v%i on PDBBind v%i'
          % (self.version, pdbbind_version),
          file=sys.stderr)

    # all predictions are gathered before anything is printed
    test_pred = self.model.predict(self.test_descs)
    train_pred = self.model.predict(self.train_descs)
    oob_pred = self.model.oob_prediction_
    report = (
        ('Test', test_pred, self.test_target),
        ('Train', train_pred, self.train_target),
        ('OOB', oob_pred, self.train_target),
    )
    for label, predicted, observed in report:
        fields = [
            '%s set:' % label,
            'R2_score: %.4f' % r2_score(observed, predicted),
            'Rp: %.4f' % pearsonr(observed, predicted)[0],
            'RMSE: %.4f' % rmse(observed, predicted),
            'SD: %.4f' % standard_deviation_error(observed, predicted),
        ]
        print(*fields, sep='\t', file=sys.stderr)

    # compile trees (only when sklearn-compiledtrees was importable)
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            # best effort: keep the uncompiled forest
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.', file=sys.stderr)

    if sf_pickle is None:
        sf_pickle = ('RFScore_v%i_pdbbind%i.pickle'
                     % (self.version, pdbbind_version))
    return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train RFScore on PDBbind descriptors, print metrics, then save.

    Parameters
    ----------
    home_dir : str, optional
        Directory with ``rfscore_descs_v<version>.csv``; a falsy value
        selects the ``RFScore`` directory beside this module.
    sf_pickle : str, optional
        Target handed to ``self.save``; ``None`` selects
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle``.
    pdbbind_version : int
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'

    descriptors_csv = path_join(home_dir,
                                'rfscore_descs_v%i.csv' % self.version)
    super(rfscore, self)._load_pdbbind_desc(
        descriptors_csv, pdbbind_version=pdbbind_version)

    # remove sparse dimensions
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        # leave the descriptors alone when no column passes the threshold
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # fixed seed so that repeated training gives the same model
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    banner = 'Training RFScore v%i on PDBBind v%i' % (self.version,
                                                      pdbbind_version)
    print(banner, file=sys.stderr)

    def describe(label, predicted, observed):
        # one tab-separated line of quality metrics on stderr
        print('%s set:' % label,
              'R2_score: %.4f' % r2_score(observed, predicted),
              'Rp: %.4f' % pearsonr(observed, predicted)[0],
              'RMSE: %.4f' % rmse(observed, predicted),
              'SD: %.4f' % standard_deviation_error(observed, predicted),
              sep='\t',
              file=sys.stderr)

    # evaluate every prediction first, then print them in order
    evaluations = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target),
        ('OOB', self.model.oob_prediction_, self.train_target),
    ]
    for entry in evaluations:
        describe(*entry)

    # compile trees
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            # compilation is optional; fall back to the plain forest
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.', file=sys.stderr)

    if sf_pickle is not None:
        return self.save(sf_pickle)
    return self.save('RFScore_v%i_pdbbind%i.pickle'
                     % (self.version, pdbbind_version))
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the NNScore network ensemble, print metrics and save it.

    Parameters
    ----------
    home_dir : str, optional
        Directory with ``nnscore_descs.csv``; a falsy value selects the
        ``NNScore`` directory beside this module.
    sf_pickle : str, optional
        Target handed to ``self.save``; ``None`` selects
        ``NNScore_pdbbind<pdbbind_version>.pickle``.
    pdbbind_version : int
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/NNScore'

    super(nnscore, self)._load_pdbbind_desc(
        path_join(home_dir, 'nnscore_descs.csv'),
        pdbbind_version=pdbbind_version)

    # number of networks to sample; original implementation did 1000, but
    # 100 give results good enough.
    # TODO: allow user to specify number of nets?
    n = 1000

    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    def fresh_net(seed):
        # one untrained network per seed
        return neuralnetwork((5,),
                             random_state=seed,
                             activation='logistic',
                             solver='lbfgs',
                             max_iter=10000)

    run_parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=10,
                            pre_dispatch='all')
    trained_nets = run_parallel(
        delayed(method_caller)(fresh_net(seed),
                               'fit',
                               self.train_descs,
                               self.train_target)
        for seed in seeds)

    def test_score(net):
        return net.score(self.test_descs, self.test_target.flatten())

    # get 20 best: ascending sort leaves the top scorers at the end
    trained_nets.sort(key=test_score)
    self.model = ensemble_model(trained_nets[-20:])

    # predictions are computed up front, before any line is printed
    test_pred = self.model.predict(self.test_descs)
    train_pred = self.model.predict(self.train_descs)
    for label, predicted, observed in (
            ('Test', test_pred, self.test_target),
            ('Train', train_pred, self.train_target)):
        if len(observed) >= 3:
            print('%s set:' % label,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed,
                                                        predicted),
                  sep='\t',
                  file=sys.stderr)
        else:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)

    if sf_pickle is None:
        sf_pickle = 'NNScore_pdbbind%i.pickle' % (pdbbind_version)
    return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train an ensemble of neural networks and save the scoring function.

    Parameters
    ----------
    home_dir : str, optional
        Directory containing ``nnscore_descs.csv``. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : str, optional
        Target passed to ``self.save``. When ``None`` the name
        ``NNScore_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int
        PDBbind release forwarded to ``_load_pdbbind_desc`` and embedded
        in the default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    desc_path = path_join(home_dir, 'nnscore_descs.csv')

    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; original implementation did 1000, but
    # 100 give results good enough.
    # TODO: allow user to specify number of nets?
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for i in range(n)))

    # get 20 best: the sort is ascending by test-set score, so the best
    # networks end up at the tail of the list
    trained_nets.sort(
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target)]

    for name, pred, target in sets:
        # Correlation-based metrics are undefined or meaningless on fewer
        # than 3 points, so skip the report instead of failing after the
        # (expensive) training has already finished.
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t',
              file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
    """Train the NNScore network ensemble from a descriptor CSV and save it.

    Parameters
    ----------
    home_dir : str, optional
        Directory with ``nnscore_descs.csv``; a falsy value selects the
        ``NNScore`` directory beside this module.
    sf_pickle : str, optional
        Target handed to ``self.save``; a falsy value selects
        ``NNScore_pdbbind<pdbbind_version>.pickle``.
    pdbbind_version : int
        Used to pick the ``<version>_refined`` / ``<version>_core``
        columns of the CSV and in the default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/NNScore'

    # load precomputed descriptors and target values
    df = pd.read_csv(home_dir + '/nnscore_descs.csv', index_col='pdbid')

    descriptor_cols = [
        str(idx) for idx in range(len(self.descriptor_generator))]
    train_flag = '%i_%s' % (pdbbind_version, 'refined')
    test_flag = '%i_%s' % (pdbbind_version, 'core')

    # train on the refined set minus everything that is also in the core
    # set; test on the core set
    is_refined = df[train_flag]
    is_core = df[test_flag]
    train_rows = df[is_refined & ~is_core]
    test_rows = df[is_core]

    self.train_descs = train_rows[descriptor_cols].values
    self.train_target = train_rows['act'].values
    self.test_descs = test_rows[descriptor_cols].values
    self.test_target = test_rows['act'].values

    # number of networks to sample; original implementation did 1000, but
    # 100 give results good enough.
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    def fresh_net(seed):
        # one untrained network per seed
        return neuralnetwork((5, ),
                             random_state=seed,
                             activation='logistic',
                             solver='lbfgs',
                             max_iter=10000)

    run_parallel = Parallel(n_jobs=self.n_jobs, verbose=10)
    trained_nets = run_parallel(
        delayed(_parallel_helper)(fresh_net(seed),
                                  'fit',
                                  self.train_descs,
                                  self.train_target)
        for seed in seeds)

    # get 20 best: indices of the nets ordered by descending test score
    test_scores = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets])
    best_idx = test_scores.argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    for label, descs, target in (
            ('Test', self.test_descs, self.test_target),
            ('Train', self.train_descs, self.train_target)):
        error = rmse(self.model.predict(descs), target)
        r2 = self.model.score(descs, target)
        r = np.sqrt(r2)
        print('%s set:' % label,
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

    destination = sf_pickle or ('NNScore_pdbbind%i.pickle'
                                % (pdbbind_version))
    return self.save(destination)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
    """Train RFScore from per-set CSV files, print metrics and save it.

    Parameters
    ----------
    home_dir : str, optional
        Directory with the ``train_*``/``test_*`` CSV files; a falsy value
        selects the ``RFScore`` directory beside this module.
    sf_pickle : str, optional
        Target handed to ``self.save``; a falsy value selects
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle``.
    pdbbind_version : int
        Embedded in the CSV file names and in the default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/RFScore'

    def read_table(file_name):
        # every input is a comma-delimited table parsed as floats
        return np.loadtxt(home_dir + file_name, delimiter=',', dtype=float)

    # load precomputed descriptors and target values
    self.train_descs = read_table(
        '/train_descs_v%i_pdbbind%i.csv' % (self.version, pdbbind_version))
    self.train_target = read_table(
        '/train_target_pdbbind%i.csv' % (pdbbind_version))
    self.test_descs = read_table(
        '/test_descs_v%i_pdbbind%i.csv' % (self.version, pdbbind_version))
    self.test_target = read_table(
        '/test_target_pdbbind%i.csv' % (pdbbind_version))

    # remove sparse dimensions
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        # leave the descriptors alone when no column passes the threshold
        if self.mask.sum() > 0:
            self.train_descs, self.test_descs = (
                self.train_descs[:, self.mask],
                self.test_descs[:, self.mask],
            )

    # fixed seed so that repeated training gives the same model
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print("Training RFScore v%i on PDBBind v%i"
          % (self.version, pdbbind_version),
          file=sys.stderr)

    for label, descs, target in (
            ('Test', self.test_descs, self.test_target),
            ('Train', self.train_descs, self.train_target)):
        error = rmse(self.model.predict(descs), target)
        r2 = self.model.score(descs, target)
        r = np.sqrt(r2)
        print('%s set: R**2:' % label, r2, ' R:', r, 'RMSE:', error,
              file=sys.stderr)

    # compile trees
    if compiledtrees is not None:
        try:
            print("Compiling Random Forest using sklearn-compiledtrees",
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            # compilation is optional; fall back to the plain forest
            print("Failed to compile Random Forest with exception: %s" % e,
                  file=sys.stderr)
            print("Continuing without compiled RF.", file=sys.stderr)

    destination = sf_pickle or ('RFScore_v%i_pdbbind%i.pickle'
                                % (self.version, pdbbind_version))
    return self.save(destination)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
          ignore_json=False):
    """Load pretrained PLECscore parameters or train the model, then save.

    For the ``'linear'`` version, if a matching JSON file exists and
    ``ignore_json`` is false, its entries are set as attributes on
    ``self.model``. Otherwise the descriptors are loaded (the CSV is
    downloaded from the ODDT GitHub repository when missing locally), the
    model is fitted and quality metrics are printed to stderr.

    Parameters
    ----------
    home_dir : str, optional
        Directory with the descriptor CSV and JSON files. When falsy, the
        ``PLECscore`` directory next to this module is used.
    sf_pickle : str, optional
        Target passed to ``self.save``. When ``None`` a name built from the
        version, depths, PDBbind release and size is used.
    pdbbind_version : int
        PDBbind release used in file names and forwarded to
        ``_load_pdbbind_desc``.
    ignore_json : bool
        When true, never load pretrained parameters from JSON.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'PLECscore')

    desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                          (self.depth_protein, self.depth_ligand))

    json_path = path_join(
        home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
        (self.version, self.depth_protein,
         self.depth_ligand, self.size, pdbbind_version))

    if (self.version in ['linear'] and  # TODO: support other models
            isfile(json_path) and
            not ignore_json):
        print('Loading pretrained PLECscore %s with depths P%i L%i on '
              'PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version),
              file=sys.stderr)
        with open(json_path) as json_f:
            json_data = json.load(json_f)
        for k, v in json_data.items():
            if isinstance(v, list):
                # `v and` guards the v[0] lookup: an empty list used to
                # raise IndexError; it now becomes an empty array.
                if v and isinstance(v[0], list):
                    v = [np.array(x) for x in v]
                else:
                    v = np.array(v)
            setattr(self.model, k, v)
    else:
        # blacklist core set 2013 and astex
        pdbids_blacklist = [
            '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
            '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
            '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
            '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
            '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
            '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
            '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
            '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
            '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
            '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
            '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
            '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
            '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
            '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
            '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
            '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
            '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
            '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
            '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
            '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
            '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
            '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
            '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
            '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
            '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
            '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
            '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
            '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
            '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
            '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
            '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
            '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
            '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
            '1ywr', '1z95', '2bm2', '2br1', '2bsm']

        # use remote csv if it's not present
        if not isfile(desc_path):
            branch = 'master'  # define branch/commit
            desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                        '/oddt/scoring/functions/PLECscore/'
                        'plecscore_descs_p%i_l%i.csv.gz' %
                        (branch, self.depth_protein, self.depth_ligand))
            warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                          'get it from ODDT GitHub.' %
                          (self.depth_protein, self.depth_ligand))
            # download and save CSV
            pd.read_csv(desc_url, index_col='pdbid').to_csv(
                desc_path, compression='gzip')

        # set PLEC size to unfolded
        super(PLECscore, self)._load_pdbbind_desc(
            desc_path,
            train_set=('general', 'refined'),
            pdbbind_version=pdbbind_version,
            train_blacklist=pdbids_blacklist,
            fold_size=self.size,
        )

        print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version),
              file=sys.stderr)

        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs),
             self.train_target)]

        for name, pred, target in sets:
            # too few points for meaningful correlation metrics
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t',
                  file=sys.stderr)

    if sf_pickle is None:
        return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                         % (self.version, self.depth_protein,
                            self.depth_ligand, pdbbind_version, self.size))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
    """Train RFScore from a descriptor CSV, print metrics and save it.

    Parameters
    ----------
    home_dir : str, optional
        Directory with ``rfscore_descs_v<version>.csv``; a falsy value
        selects the ``RFScore`` directory beside this module.
    sf_pickle : str, optional
        Target handed to ``self.save``; a falsy value selects
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle``.
    pdbbind_version : int
        Used to pick the ``<version>_refined`` / ``<version>_core``
        columns of the CSV and in the default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/RFScore'

    # load precomputed descriptors and target values
    df = pd.read_csv(home_dir + '/rfscore_descs_v%i.csv' % self.version,
                     index_col='pdbid')

    train_flag = '%i_%s' % (pdbbind_version, 'refined')
    test_flag = '%i_%s' % (pdbbind_version, 'core')

    # train on the refined set minus everything that is also in the core
    # set; test on the core set
    is_refined = df[train_flag]
    is_core = df[test_flag]
    train_rows = df[is_refined & ~is_core]
    test_rows = df[is_core]
    descriptor_cols = [
        str(idx) for idx in range(len(self.descriptor_generator))]

    self.train_descs = train_rows[descriptor_cols].values
    self.train_target = train_rows['act'].values
    self.test_descs = test_rows[descriptor_cols].values
    self.test_target = test_rows['act'].values

    # remove sparse dimensions
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        # leave the descriptors alone when no column passes the threshold
        if self.mask.sum() > 0:
            self.train_descs, self.test_descs = (
                self.train_descs[:, self.mask],
                self.test_descs[:, self.mask],
            )

    # fixed seed so that repeated training gives the same model
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print("Training RFScore v%i on PDBBind v%i"
          % (self.version, pdbbind_version),
          file=sys.stderr)

    test_error = rmse(self.model.predict(self.test_descs), self.test_target)
    test_r2 = self.model.score(self.test_descs, self.test_target)
    test_r = np.sqrt(test_r2)
    test_report = ['Test set:',
                   'R**2: %.4f' % test_r2,
                   'R: %.4f' % test_r,
                   'RMSE: %.4f' % test_error]
    print(*test_report, sep='\t', file=sys.stderr)

    # the train-set line additionally carries the out-of-bag RMSE
    train_error = rmse(self.model.predict(self.train_descs),
                       self.train_target)
    oob_error = rmse(self.model.oob_prediction_, self.train_target)
    train_r2 = self.model.score(self.train_descs, self.train_target)
    train_r = np.sqrt(train_r2)
    train_report = ['Train set:',
                    'R**2: %.4f' % train_r2,
                    'R: %.4f' % train_r,
                    'RMSE: %.4f' % train_error,
                    'OOB RMSE: %.4f' % oob_error]
    print(*train_report, sep='\t', file=sys.stderr)

    # compile trees
    if compiledtrees is not None:
        try:
            print("Compiling Random Forest using sklearn-compiledtrees",
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            # compilation is optional; fall back to the plain forest
            print("Failed to compile Random Forest with exception: %s" % e,
                  file=sys.stderr)
            print("Continuing without compiled RF.", file=sys.stderr)

    destination = sf_pickle or ('RFScore_v%i_pdbbind%i.pickle'
                                % (self.version, pdbbind_version))
    return self.save(destination)
def test_rmse():
    """Check rmse is large for the poor fit and small for the good fit."""
    poor_error = rmse(values, poor_values)
    assert poor_error >= 30

    good_error = rmse(values, good_values)
    assert good_error <= 1
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
          ignore_json=False):
    """Load pretrained PLECscore parameters or train the model, then save.

    For the ``'linear'`` version, if a matching JSON file exists and
    ``ignore_json`` is false, its entries are set as attributes on
    ``self.model``. Otherwise the descriptors are loaded (the CSV is
    downloaded from the ODDT GitHub repository when missing locally), the
    model is fitted and quality metrics are printed to stderr; the ``'rf'``
    version additionally reports its out-of-bag predictions.

    Parameters
    ----------
    home_dir : str, optional
        Directory with the descriptor CSV and JSON files. When falsy, the
        ``PLECscore`` directory next to this module is used.
    sf_pickle : str, optional
        Target passed to ``self.save``. When ``None`` a name built from the
        version, depths, PDBbind release and size is used.
    pdbbind_version : int
        PDBbind release used in file names and forwarded to
        ``_load_pdbbind_desc``.
    ignore_json : bool
        When true, never load pretrained parameters from JSON.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'PLECscore')

    desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                          (self.depth_protein, self.depth_ligand))

    json_path = path_join(
        home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
        (self.version, self.depth_protein,
         self.depth_ligand, self.size, pdbbind_version))

    if (self.version in ['linear'] and  # TODO: support other models
            isfile(json_path) and
            not ignore_json):
        print('Loading pretrained PLECscore %s with depths P%i L%i on '
              'PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version),
              file=sys.stderr)
        with open(json_path) as json_f:
            json_data = json.load(json_f)
        for k, v in json_data.items():
            if isinstance(v, list):
                # `v and` guards the v[0] lookup: an empty list used to
                # raise IndexError; it now becomes an empty array.
                if v and isinstance(v[0], list):
                    v = [np.array(x) for x in v]
                else:
                    v = np.array(v)
            setattr(self.model, k, v)
    else:
        # blacklist core set 2013 and astex
        pdbids_blacklist = [
            '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
            '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
            '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
            '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
            '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
            '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
            '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
            '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
            '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
            '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
            '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
            '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
            '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
            '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
            '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
            '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
            '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
            '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
            '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
            '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
            '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
            '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
            '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
            '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
            '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
            '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
            '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
            '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
            '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
            '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
            '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
            '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
            '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
            '1ywr', '1z95', '2bm2', '2br1', '2bsm']

        # use remote csv if it's not present
        if not isfile(desc_path):
            branch = 'master'  # define branch/commit
            desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                        '/oddt/scoring/functions/PLECscore/'
                        'plecscore_descs_p%i_l%i.csv.gz' %
                        (branch, self.depth_protein, self.depth_ligand))
            warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                          'get it from ODDT GitHub.' %
                          (self.depth_protein, self.depth_ligand))
            # download and save CSV
            pd.read_csv(desc_url, index_col='pdbid').to_csv(
                desc_path, compression='gzip')

        # set PLEC size to unfolded
        super(PLECscore, self)._load_pdbbind_desc(
            desc_path,
            train_set=('general', 'refined'),
            pdbbind_version=pdbbind_version,
            train_blacklist=pdbids_blacklist,
            fold_size=self.size,
        )

        print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version),
              file=sys.stderr)

        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs),
             self.train_target)]
        if self.version == 'rf':
            sets.append(('OOB', self.model.oob_prediction_,
                         self.train_target))

        for name, pred, target in sets:
            # Correlation-based metrics are undefined or meaningless on
            # fewer than 3 points, so skip the report instead of failing
            # after the model has already been fitted.
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t',
                  file=sys.stderr)

    if sf_pickle is None:
        return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                         % (self.version, self.depth_protein,
                            self.depth_ligand, pdbbind_version, self.size))
    else:
        return self.save(sf_pickle)
def test_rmse():
    """Check rmse is large for the poor fit and small for the good fit."""
    poor_error = rmse(values, poor_values)
    assert_greater_equal(poor_error, 30)

    good_error = rmse(values, good_values)
    assert_less_equal(good_error, 1)