def test_rdkit_descriptors(self):
    """Check that the loaded RDKit descriptor table has ``HeavyAtomCount``."""
    # Dump the data set's ``rdkit`` column first so it appears in the
    # captured test output.
    print(self.ds.rdkit.tolist())
    rdkit_X = PubChemDataSetDescriptors(self.ds).load_rdkit()
    assert 'HeavyAtomCount' in rdkit_X.columns
def setup(self):
    """Load data set 1 and keep only rows with fully finite descriptors.

    Populates ``self.ds``, ``self.X`` (RDKit descriptors) and ``self.y``
    (the ``Activity`` column), with ``X`` and ``y`` filtered in lockstep.
    """
    self.ds = PubChemDataSet(1).clean_load()
    self.y = self.ds.Activity
    self.X = PubChemDataSetDescriptors(self.ds).load_rdkit()

    # Keep rows in which no descriptor is null.
    complete_rows = self.X.notnull().all(axis=1)
    self.y = self.y[complete_rows]
    self.X = self.X[complete_rows]

    # Keep rows in which no descriptor is infinite; the mask is computed
    # on the already null-filtered table so it lines up with ``self.y``.
    finite_rows = ~np.isinf(self.X.values).any(axis=1)
    self.y = self.y[finite_rows]
    self.X = self.X[finite_rows]
class TestFull:
    """End-to-end test: RDKit descriptors -> preprocessing -> grid search."""

    def setup(self):
        """Load data set 1 and keep only rows with fully finite descriptors."""
        self.ds = PubChemDataSet(1).clean_load()
        self.y = self.ds.Activity
        self.X = PubChemDataSetDescriptors(self.ds).load_rdkit()

        # Drop rows containing a null descriptor.
        complete_rows = self.X.notnull().all(1)
        self.y = self.y[complete_rows]
        self.X = self.X[complete_rows]

        # Drop rows containing an infinite descriptor (mask computed on the
        # null-filtered table so it stays aligned with ``self.y``).
        finite_rows = ~np.isinf(self.X.values).any(1)
        self.y = self.y[finite_rows]
        self.X = self.X[finite_rows]

    def teardown(self):
        """Nothing to clean up."""
        pass

    def test_full_pipeline(self):
        """Grid-search the first configured classifier with 5-fold CV."""
        classifier = SKLearnModels.CLASSIFIERS[0]
        # Build the complete step list up front instead of appending to
        # ``pipe.steps`` after the pipeline has been constructed.
        pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [classifier])
        parameters = SKLearnModels.PARAMETERS[classifier[0]]

        cv_search = GridSearchCV(pipe,
                                 parameters,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=-1,
                                 verbose=0)
        cv_search.fit(self.X.values, self.y.values)
        print(cv_search.best_params_, cv_search.best_score_)

        # Previously ``assert False`` made this test fail unconditionally.
        # Accuracy is a fraction, so a completed search must score in [0, 1].
        assert 0.0 <= cv_search.best_score_ <= 1.0
def build_model(aid, model):
    """Grid-search one named classifier on an assay's RDKit descriptors.

    Loads the cleaned data set for ``aid``, drops rows whose descriptors
    contain a null or infinite value, then runs a 5-fold ``GridSearchCV``
    (accuracy scoring) over ``SKLearnModels.PARAMETERS[model]`` and prints
    the best parameters and score.

    Args:
        aid: Identifier passed to ``PubChemDataSet``.
        model: Classifier name; selects the matching entry of
            ``SKLearnModels.CLASSIFIERS`` and ``SKLearnModels.PARAMETERS``.

    Raises:
        Exception: If loading or cleaning the data fails. The underlying
            error is chained as ``__cause__``.
    """
    try:
        ds = PubChemDataSet(aid).clean_load()
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_rdkit()

        # TODO: put this into a cleaner step
        # remove rows with null values
        complete_rows = X.notnull().all(1)
        y = y[complete_rows]
        X = X[complete_rows]

        # TODO: put this into a cleaner step
        # remove rows with infinite values
        finite_rows = ~np.isinf(X.values).any(1)
        y = y[finite_rows]
        X = X[finite_rows]

        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))
    except Exception as err:
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate;
        # chaining keeps the real failure visible in the traceback.
        raise Exception("error on aid {0}".format(aid)) from err

    # Preprocessing steps followed by the classifier whose name matches.
    pipe = Pipeline(
        list(SKLearnModels.PREPROCESS) +
        [(name, clf) for name, clf in SKLearnModels.CLASSIFIERS
         if name == model])

    print("=======5-fold CV on {0}=======".format(model))
    parameters = SKLearnModels.PARAMETERS[model]

    cv_search = GridSearchCV(pipe,
                             parameters,
                             cv=5,
                             scoring='accuracy',
                             n_jobs=-1,
                             verbose=0)
    cv_search.fit(X.values, y.values)

    print("================================")
    print("The best parameters for {0} are :".format(model))
    for param, val in cv_search.best_params_.items():
        # Print the part after the first "__" (the "<step>__" prefix is
        # dropped).
        print("{}: {}".format(param.split('__')[1], val))
    print("The best accuracy score is {0:.2f}%".format(
        cv_search.best_score_ * 100))

    # TODO: save cv_search.best_estimator_ to pickle
def build_models(aid, sub_directory):
    """Grid-search every configured classifier for one assay and save results.

    Creates ``<sub_directory>/<aid>/`` and writes into it:
    ``training.csv`` (the cleaned data set), one ``<name>.pkl`` per
    classifier (the best estimator, via ``joblib.dump``) and
    ``results.csv`` (``name,best_score`` lines).

    Args:
        aid: Identifier passed to ``PubChemDataSet``; also the directory name.
        sub_directory: Existing parent directory for the per-assay folder.

    Returns:
        None. Returns early if the per-assay directory already exists or if
        loading the data fails.
    """
    aid_sub_directory = os.path.join(sub_directory, str(aid))

    # if it has already been modeled, move on
    if os.path.exists(aid_sub_directory):
        return
    os.mkdir(aid_sub_directory)

    try:
        ds = PubChemDataSet(aid).clean_load()
        ds.to_csv(os.path.join(aid_sub_directory, 'training.csv'))
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_ECFP6()
        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))
    except Exception as err:
        # Best-effort: report and skip this assay, but let
        # KeyboardInterrupt/SystemExit through and show the actual cause.
        # NOTE(review): the directory created above is left in place, so a
        # failed aid is skipped on later runs -- confirm this is intended.
        print("error on aid {0}: {1!r}".format(aid, err))
        return

    best_scores = {}
    for name, clf in SKLearnModels.CLASSIFIERS:
        pipe = Pipeline([(name, clf)])

        print("=======5-fold CV on {0}=======".format(name))
        parameters = SKLearnModels.PARAMETERS[name]

        cv_search = GridSearchCV(pipe,
                                 parameters,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=1,
                                 verbose=0)
        cv_search.fit(X.values, y.values)

        print("================================")
        print("The best parameters for {0} are :\n{1}".format(
            name, cv_search.best_params_))
        print("The best score is {0}".format(cv_search.best_score_))
        best_scores[name] = cv_search.best_score_

        joblib.dump(cv_search.best_estimator_,
                    os.path.join(aid_sub_directory, '{}.pkl'.format(name)))

    results_path = os.path.join(aid_sub_directory, 'results.csv')
    with open(results_path, 'w') as results_file:
        for model, score in best_scores.items():
            results_file.write(model + ',' + str(score) + '\n')
def main():
    """Model each cluster-0 assay and predict on the REACH curated set.

    For every assay id, loads the cleaned data set and its RDKit
    descriptors, drops rows with null or infinite descriptors, and runs a
    5-fold accuracy grid search for every classifier in
    ``SKLearnModels.CLASSIFIERS``. Each fitted search predicts on
    ``reach_curated.csv``; all predictions are written to
    ``reach_predictions_cluster_0.csv`` and the collected estimators are
    pickled to ``models.csv``. All files live under the path prefix given
    by the ``QSAR_DATA`` environment variable.
    """
    import os

    # assays from cluster 0
    aids = [
        119, 103, 99, 133, 71, 145, 5, 33, 113, 43, 139, 115, 55, 31, 67, 81,
        143, 87, 109, 129, 39, 49, 137, 65, 79, 93, 57, 15, 107, 59, 37, 101,
        123, 41, 45, 7, 83, 91, 53, 13, 21, 95, 105, 9, 131, 125, 97, 29, 121,
        3, 25, 141, 23, 77, 19, 1, 47, 73, 35, 89, 85,
    ]

    # The prefix is concatenated as-is, so it must end with a path separator.
    data_dir = os.getenv('QSAR_DATA')

    ds_test = pd.read_csv(data_dir + 'reach_curated.csv')
    ds_test.index = ds_test.ECNumber

    # ``ds_test`` is never modified below, so its descriptors are computed
    # and cleaned once rather than for every aid/classifier pair.
    # NOTE(review): assumes load_rdkit() returns the same table for the same
    # data set on every call -- confirm.
    X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()
    # remove null and inf values
    X_test = X_test[X_test.notnull().all(1)]
    X_test = X_test[~np.isinf(X_test.values).any(1)]

    best_models = {}
    predictions = []

    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove rows with null values
            complete_rows = X.notnull().all(1)
            y = y[complete_rows]
            X = X[complete_rows]

            # TODO: put this into a cleaner step
            # remove rows with infinite values
            finite_rows = ~np.isinf(X.values).any(1)
            y = y[finite_rows]
            X = X[finite_rows]

            print("=======building model for aid {0}=======".format(aid))
            print(
                "======={0} compounds: {1} active, {2} inactive=======".format(
                    y.shape[0], (y == 1).sum(), (y == 0).sum()))
        except Exception as err:
            # Best-effort: skip assays that fail to load, but let
            # KeyboardInterrupt/SystemExit through and show the cause.
            print("error on aid {0}: {1!r}".format(aid, err))
            continue

        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])

            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]

            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)

            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # NOTE(review): keyed by aid only, so each classifier overwrites
            # the previous one and the last classifier's estimator is kept.
            best_models[aid] = cv_search.best_estimator_

            print("Making predictions on {0} compounds".format(
                X_test.shape[0]))
            preds = pd.DataFrame(cv_search.predict(X_test),
                                 index=X_test.index,
                                 columns=[aid])
            predictions.append(preds)

    all_predictions = pd.concat(predictions, axis=1)
    print(all_predictions)

    filename = data_dir + 'reach_predictions_cluster_0.csv'
    all_predictions.to_csv(filename)

    m = pd.DataFrame(best_models, index=aids)
    m.to_pickle(data_dir + 'models.csv')
def main():
    """Model selected assays and predict their missing profile entries.

    For every assay id, loads the cleaned data set and its RDKit
    descriptors, drops rows with null or infinite descriptors, and runs a
    5-fold accuracy grid search for every classifier in
    ``SKLearnModels.CLASSIFIERS``. Each fitted search predicts on the data
    set obtained from ``profile.get_subprofile([aid]).get_nulls().as_ds()``;
    all predictions are written to ``missing_data_predictions.csv`` under
    the path prefix given by the ``QSAR_DATA`` environment variable.
    """
    import os

    aids = [119, 79, 83, 7, 37, 99, 129, 59, 41]

    profile = DS.profile_3.load()

    best_models = {}
    predictions = []

    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove rows with null values
            complete_rows = X.notnull().all(1)
            y = y[complete_rows]
            X = X[complete_rows]

            # TODO: put this into a cleaner step
            # remove rows with infinite values
            finite_rows = ~np.isinf(X.values).any(1)
            y = y[finite_rows]
            X = X[finite_rows]

            print("=======building model for aid {0}=======".format(aid))
            print(
                "======={0} compounds: {1} active, {2} inactive=======".format(
                    y.shape[0], (y == 1).sum(), (y == 0).sum()))
        except Exception as err:
            # Best-effort: skip assays that fail to load, but let
            # KeyboardInterrupt/SystemExit through and show the cause.
            print("error on aid {0}: {1!r}".format(aid, err))
            continue

        # The test set depends only on the aid, so it is built once per
        # assay rather than once per classifier.
        # NOTE(review): assumes these loaders return the same table for the
        # same aid on every call -- confirm.
        ds_test = profile.get_subprofile([aid]).get_nulls().as_ds()
        X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()
        # remove null and inf values
        X_test = X_test[X_test.notnull().all(1)]
        X_test = X_test[~np.isinf(X_test.values).any(1)]

        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])

            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]

            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)

            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # NOTE(review): keyed by aid only, so each classifier overwrites
            # the previous one and the last classifier's estimator is kept.
            best_models[aid] = cv_search.best_estimator_

            print("Making predictions on {0} compounds".format(
                X_test.shape[0]))
            preds = pd.DataFrame(cv_search.predict(X_test),
                                 index=X_test.index,
                                 columns=[aid])
            predictions.append(preds)

    all_predictions = pd.concat(predictions, axis=1)
    print(all_predictions)

    # The prefix is concatenated as-is, so it must end with a path separator.
    filename = os.getenv('QSAR_DATA') + 'missing_data_predictions.csv'
    all_predictions.to_csv(filename)
def test_rdkit_descriptors_on_test_set(self):
    """Check the RDKit descriptor load on the ``profile_3`` data set."""
    test_ds = DS.profile_3.load().as_ds()
    rdkit_X = PubChemDataSetDescriptors(test_ds).load_rdkit()
    assert 'HeavyAtomCount' in rdkit_X.columns