def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train RFScore on PDBbind descriptors, report metrics and pickle it.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory containing ``rfscore_descs_v<version>.csv``. If falsy,
        the ``RFScore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, the name
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int (default=2016)
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'RFScore')
    desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)

    super(rfscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # remove sparse dimensions: keep only the columns in which at least one
    # training sample exceeds the ``spr`` threshold
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        # an all-False mask would drop every column, so it is ignored
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # announce the training before it starts, not after it has finished
    print('Training RFScore v%i on PDBBind v%i'
          % (self.version, pdbbind_version),
          file=sys.stderr)

    # fixed seed to make the forest reproducible
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target),
        ('OOB', self.model.oob_prediction_, self.train_target)]

    for name, pred, target in sets:
        # correlation metrics are not meaningful on tiny sets; skip them
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    # compile trees; this is best-effort, so any failure falls back to the
    # plain (uncompiled) model
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.',
                  file=sys.stderr)

    if sf_pickle is None:
        return self.save('RFScore_v%i_pdbbind%i.pickle'
                         % (self.version, pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Fit the RFScore random forest, print its metrics and save it.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory containing ``rfscore_descs_v<version>.csv``. If falsy,
        the ``RFScore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, the name
        ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int (default=2016)
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'RFScore')
    desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)

    super(rfscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # remove sparse dimensions: a column survives only if some training
    # sample has a value above ``spr`` in it
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        # skip the filtering when the mask would select no columns at all
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # report what is about to be trained before the fit starts
    print('Training RFScore v%i on PDBBind v%i'
          % (self.version, pdbbind_version),
          file=sys.stderr)

    # fixed seed to make the forest reproducible
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target),
        ('OOB', self.model.oob_prediction_, self.train_target)]

    for name, pred, target in sets:
        # too few points for the correlation statistics below
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    # compile trees; deliberately tolerant, since a failed compilation just
    # leaves the regular model in place
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.',
                  file=sys.stderr)

    if sf_pickle is None:
        return self.save('RFScore_v%i_pdbbind%i.pickle'
                         % (self.version, pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train many small neural nets, keep the 20 best and pickle the ensemble.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory containing ``nnscore_descs.csv``. If falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, the name
        ``NNScore_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int (default=2016)
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    home_dir = home_dir or dirname(__file__) + '/NNScore'

    super(nnscore, self)._load_pdbbind_desc(
        path_join(home_dir, 'nnscore_descs.csv'),
        pdbbind_version=pdbbind_version)

    # how many networks get sampled before the best ones are picked
    # TODO: allow user to specify number of nets?
    n_nets = 1000

    # fixed seed so that the per-network seeds are reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n_nets)

    def _fit_job(seed):
        # one delayed ``fit`` call on a fresh single-hidden-layer network
        net = neuralnetwork((5,),
                            random_state=seed,
                            activation='logistic',
                            solver='lbfgs',
                            max_iter=10000)
        return delayed(method_caller)(net,
                                      'fit',
                                      self.train_descs,
                                      self.train_target)

    runner = Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')
    trained_nets = runner(_fit_job(seeds[i]) for i in range(n_nets))

    # rank by test-set score (ascending) and keep the last 20
    ranked_nets = sorted(
        trained_nets,
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(ranked_nets[-20:])

    test_pred = self.model.predict(self.test_descs)
    train_pred = self.model.predict(self.train_descs)
    sets = [('Test', test_pred, self.test_target),
            ('Train', train_pred, self.train_target)]

    for name, pred, target in sets:
        if len(target) >= 3:
            report = ('%s set:' % name,
                      'R2_score: %.4f' % r2_score(target, pred),
                      'Rp: %.4f' % pearsonr(target, pred)[0],
                      'RMSE: %.4f' % rmse(target, pred),
                      'SD: %.4f' % standard_deviation_error(target, pred))
            print(*report, sep='\t', file=sys.stderr)
        else:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)

    if sf_pickle is not None:
        return self.save(sf_pickle)
    return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train NNScore: fit many small networks and ensemble the 20 best.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory containing ``nnscore_descs.csv``. If falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, the name
        ``NNScore_pdbbind<pdbbind_version>.pickle`` is used.
    pdbbind_version : int (default=2016)
        PDBbind release forwarded to ``_load_pdbbind_desc``.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'NNScore')
    desc_path = path_join(home_dir, 'nnscore_descs.csv')

    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; the original implementation did 1000,
    # which is also what is trained here (an earlier note says 100 already
    # gives good enough results).
    # TODO: allow user to specify number of nets?
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for i in range(n)))

    # get 20 best: ascending sort by test-set score, then take the tail.
    # The lambda argument is named ``net`` so it does not shadow ``n``.
    trained_nets.sort(key=lambda net: net.score(self.test_descs,
                                                self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target)]

    for name, pred, target in sets:
        # correlation metrics are not meaningful on tiny sets; skip them
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
          ignore_json=False):
    """Load pretrained PLECscore parameters or train the model, then save it.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory with the descriptor CSV and the optional pretrained JSON.
        If falsy, the ``PLECscore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, a name built from the
        version, depths, PDBbind version and size is used.
    pdbbind_version : int (default=2016)
        PDBbind release; used in the JSON file name and forwarded to
        ``_load_pdbbind_desc``.
    ignore_json : bool (default=False)
        If True, the pretrained JSON file is not used even when it exists
        and the model is trained from the descriptors.

    Returns
    -------
    The value returned by ``self.save``.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; the final save is assumed to run after
    # both the JSON-loading and the training branch -- confirm.
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'PLECscore')
    desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                          (self.depth_protein, self.depth_ligand))

    json_path = path_join(
        home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
        (self.version, self.depth_protein,
         self.depth_ligand, self.size, pdbbind_version))

    if (self.version in ['linear'] and  # TODO: support other models
            isfile(json_path) and
            not ignore_json):
        print('Loading pretrained PLECscore %s with depths P%i L%i on '
              'PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version), file=sys.stderr)
        with open(json_path) as json_f:
            json_data = json.load(json_f)
        # copy every stored attribute onto the model, turning JSON lists
        # back into numpy arrays (nested lists become a list of arrays)
        for k, v in json_data.items():
            if isinstance(v, list):
                # ``v and`` guards the ``v[0]`` lookup against empty lists,
                # which are converted to an empty array instead of raising
                if v and isinstance(v[0], list):
                    v = [np.array(x) for x in v]
                else:
                    v = np.array(v)
            setattr(self.model, k, v)
    else:
        # blacklist core set 2013 and astex
        pdbids_blacklist = [
            '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
            '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
            '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
            '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
            '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
            '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
            '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
            '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
            '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
            '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
            '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
            '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
            '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
            '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
            '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
            '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
            '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
            '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
            '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
            '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
            '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
            '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
            '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
            '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
            '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
            '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
            '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
            '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
            '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
            '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
            '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
            '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
            '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
            '1ywr', '1z95', '2bm2', '2br1', '2bsm']

        # use remote csv if it's not present
        if not isfile(desc_path):
            branch = 'master'  # define branch/commit
            desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                        '/oddt/scoring/functions/PLECscore/'
                        'plecscore_descs_p%i_l%i.csv.gz' %
                        (branch, self.depth_protein, self.depth_ligand))
            warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                          'get it from ODDT GitHub.'
                          % (self.depth_protein, self.depth_ligand))
            # download and save CSV
            pd.read_csv(desc_url, index_col='pdbid').to_csv(
                desc_path, compression='gzip')

        # set PLEC size to unfolded
        super(PLECscore, self)._load_pdbbind_desc(
            desc_path,
            train_set=('general', 'refined'),
            pdbbind_version=pdbbind_version,
            train_blacklist=pdbids_blacklist,
            fold_size=self.size,
        )

        print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version), file=sys.stderr)

        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs),
             self.train_target)]

        for name, pred, target in sets:
            # sets with fewer than 3 targets are not evaluated
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t', file=sys.stderr)

    if sf_pickle is None:
        return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                         % (self.version, self.depth_protein,
                            self.depth_ligand, pdbbind_version, self.size))
    else:
        return self.save(sf_pickle)
def test_standard_deviation_error():
    """SD error stays below 1.1 for good values and above 2e4 for poor ones."""
    sd_good = standard_deviation_error(values, good_values)
    assert sd_good < 1.1

    sd_poor = standard_deviation_error(values, poor_values)
    assert sd_poor > 2e4
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
          ignore_json=False):
    """Load pretrained PLECscore parameters or train the model, then save it.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory with the descriptor CSV and the optional pretrained JSON.
        If falsy, the ``PLECscore`` directory next to this module is used.
    sf_pickle : str or None (default=None)
        Value handed to ``self.save``. If None, a name built from the
        version, depths, PDBbind version and size is used.
    pdbbind_version : int (default=2016)
        PDBbind release; used in the JSON file name and forwarded to
        ``_load_pdbbind_desc``.
    ignore_json : bool (default=False)
        If True, the pretrained JSON file is not used even when it exists
        and the model is trained from the descriptors.

    Returns
    -------
    The value returned by ``self.save``.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; the final save is assumed to run after
    # both the JSON-loading and the training branch -- confirm.
    if not home_dir:
        home_dir = path_join(dirname(__file__), 'PLECscore')
    desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                          (self.depth_protein, self.depth_ligand))

    json_path = path_join(
        home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
        (self.version, self.depth_protein,
         self.depth_ligand, self.size, pdbbind_version))

    if (self.version in ['linear'] and  # TODO: support other models
            isfile(json_path) and
            not ignore_json):
        print('Loading pretrained PLECscore %s with depths P%i L%i on '
              'PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version), file=sys.stderr)
        with open(json_path) as json_f:
            json_data = json.load(json_f)
        # restore each stored attribute on the model; JSON lists become
        # numpy arrays and nested lists become a list of arrays
        for k, v in json_data.items():
            if isinstance(v, list):
                # an empty list has no ``v[0]``; it becomes an empty array
                if v and isinstance(v[0], list):
                    v = [np.array(x) for x in v]
                else:
                    v = np.array(v)
            setattr(self.model, k, v)
    else:
        # blacklist core set 2013 and astex
        pdbids_blacklist = [
            '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
            '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
            '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
            '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
            '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
            '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
            '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
            '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
            '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
            '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
            '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
            '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
            '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
            '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
            '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
            '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
            '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
            '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
            '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
            '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
            '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
            '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
            '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
            '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
            '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
            '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
            '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
            '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
            '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
            '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
            '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
            '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
            '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
            '1ywr', '1z95', '2bm2', '2br1', '2bsm']

        # use remote csv if it's not present
        if not isfile(desc_path):
            branch = 'master'  # define branch/commit
            desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                        '/oddt/scoring/functions/PLECscore/'
                        'plecscore_descs_p%i_l%i.csv.gz' %
                        (branch, self.depth_protein, self.depth_ligand))
            warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                          'get it from ODDT GitHub.'
                          % (self.depth_protein, self.depth_ligand))
            # download and save CSV
            pd.read_csv(desc_url, index_col='pdbid').to_csv(
                desc_path, compression='gzip')

        # set PLEC size to unfolded
        super(PLECscore, self)._load_pdbbind_desc(
            desc_path,
            train_set=('general', 'refined'),
            pdbbind_version=pdbbind_version,
            train_blacklist=pdbids_blacklist,
            fold_size=self.size,
        )

        print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
              % (self.version, self.depth_protein, self.depth_ligand,
                 pdbbind_version), file=sys.stderr)

        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs),
             self.train_target)]
        # out-of-bag predictions are reported only for the 'rf' version
        if self.version == 'rf':
            sets.append(('OOB', self.model.oob_prediction_,
                         self.train_target))

        for name, pred, target in sets:
            # sets with fewer than 3 targets are not evaluated
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t', file=sys.stderr)

    if sf_pickle is None:
        return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                         % (self.version, self.depth_protein,
                            self.depth_ligand, pdbbind_version, self.size))
    else:
        return self.save(sf_pickle)
def test_standard_deviation_error():
    """SD error stays below 1.1 for good values and above 5e4 for poor ones."""
    sd_good = standard_deviation_error(values, good_values)
    assert_less(sd_good, 1.1)

    sd_poor = standard_deviation_error(values, poor_values)
    assert_greater(sd_poor, 5e4)