def test_sklearn_reload(self):
    """Fit and save a random-forest classifier, then check a reloaded copy scores well."""
    tasks = ["task0"]
    task_types = dict.fromkeys(tasks, "classification")
    num_rows, num_cols = 10, 3
    label_shape = (num_rows, len(tasks))

    # Seed before drawing so the synthetic features/labels are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(num_rows)
    features = np.random.rand(num_rows, num_cols)
    labels = np.random.randint(2, size=label_shape)
    weights = np.ones(label_shape)
    dataset = Dataset.from_numpy(
        self.train_dir, features, labels, weights, sample_ids, tasks)

    model_params = {
        "batch_size": None,
        "data_shape": dataset.get_data_shape(),
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    # Train the original model and write it to self.model_dir.
    trained = SklearnModel(
        tasks, task_types, model_params, self.model_dir,
        mode="classification",
        model_instance=RandomForestClassifier())
    trained.fit(dataset)
    trained.save()

    # Build a second wrapper with no estimator and restore it from disk.
    reloaded_model = SklearnModel(
        tasks, task_types, model_params, self.model_dir,
        mode="classification")
    reloaded_model.reload()

    # Score the reloaded model on the training data (no transformers).
    evaluator = Evaluator(reloaded_model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
def test_sklearn_classification_overfit(self):
    """Check that a wrapped random forest overfits a tiny binary dataset."""
    num_rows, num_cols, num_tasks = 10, 3, 1
    label_shape = (num_rows, num_tasks)

    # Seed before drawing so the synthetic features/labels are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(num_rows)
    features = np.random.rand(num_rows, num_cols)
    labels = np.random.randint(2, size=label_shape)
    weights = np.ones(label_shape)
    dataset = NumpyDataset(features, labels, weights, sample_ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(RandomForestClassifier(), self.model_dir)

    # Train and persist the model.
    model.fit(dataset)
    model.save()

    # Score on the very data the model was fitted on (no transformers).
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a SklearnModel wrapping RandomForestRegressor(n_estimators=500).

    All positional arguments and ``verbosity`` are forwarded unchanged to
    the SklearnModel constructor.
    """
    estimator = RandomForestRegressor(n_estimators=500)
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        model_instance=estimator,
        verbosity=verbosity,
    )
def test_sklearn_regression(self):
    """Test that sklearn models can learn on simple regression datasets.

    Splits the sklearn diabetes dataset 70/30 (first rows train, remaining
    rows test), fits a LinearRegression wrapped in SklearnModel, prints the
    train and test scores, and asserts the test r2 score exceeds 0.5.
    """
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    # Slice bounds must be integers: frac_train * n_samples is a float and
    # cannot be used directly as an index.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "regression" for task in tasks}

    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=LinearRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    print("scores")
    print(scores)

    assert scores[regression_metric.name] > .5
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a SklearnModel wrapping a class-balanced random forest.

    The estimator is RandomForestClassifier(class_weight="balanced",
    n_estimators=500); the remaining arguments and ``verbosity`` are
    forwarded unchanged to the SklearnModel constructor.
    """
    estimator = RandomForestClassifier(
        class_weight="balanced", n_estimators=500)
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        model_instance=estimator,
        verbosity=verbosity,
    )
def model_builder(tasks, task_types, model_params, model_builder,
                  verbosity=None):
    """Return a SklearnModel wrapping a default LogisticRegression.

    The fourth parameter is named ``model_builder`` (shadowing this
    function's own name inside the body) and is passed as the fourth
    positional argument of SklearnModel.

    NOTE(review): ``verbosity`` is accepted but never forwarded to
    SklearnModel -- confirm whether that is intentional.
    """
    estimator = LogisticRegression()
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_builder,
        model_instance=estimator,
    )
def test_singletask_sklearn_rf_ECFP_regression_API(self):
    """Test of singletask RF ECFP regression API.

    Featurizes ``example.csv`` with 1024-bit circular fingerprints, splits
    it with ScaffoldSplitter, fits a RandomForestRegressor wrapped in
    SklearnModel, and runs the evaluator on both splits. The scores are
    not asserted on; the test only checks that the pipeline runs.
    """
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)
    ]
    transformers = input_transformers + output_transformers

    # The data shape is only known once the training split exists.
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [
        Metric(metrics.r2_score),
        Metric(metrics.mean_squared_error),
        Metric(metrics.mean_absolute_error)
    ]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a regression-mode SklearnModel wrapping LinearRegression().

    The positional arguments and ``verbosity`` are forwarded unchanged to
    the SklearnModel constructor.
    """
    estimator = LinearRegression()
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        mode="regression",
        model_instance=estimator,
        verbosity=verbosity,
    )
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a classification-mode SklearnModel wrapping RandomForestClassifier().

    The positional arguments and ``verbosity`` are forwarded unchanged to
    the SklearnModel constructor.
    """
    estimator = RandomForestClassifier()
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        mode="classification",
        model_instance=estimator,
        verbosity=verbosity,
    )
def rf_model_builder(tasks, task_types, params_dict, model_dir,
                     verbosity=None):
    """Build a regression-mode SklearnModel around a random forest.

    The forest's ``n_estimators`` and ``max_features`` are read from
    ``params_dict`` (a missing key raises KeyError); ``params_dict`` is
    also passed through to SklearnModel as its parameter dictionary.

    NOTE(review): ``verbosity`` is accepted but not forwarded to
    SklearnModel -- confirm whether that is intentional.
    """
    forest = RandomForestRegressor(
        n_estimators=params_dict["n_estimators"],
        max_features=params_dict["max_features"],
    )
    return SklearnModel(
        tasks,
        task_types,
        params_dict,
        model_dir,
        mode="regression",
        model_instance=forest,
    )
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a classification-mode SklearnModel wrapping LogisticRegression().

    The positional arguments and ``verbosity`` are forwarded unchanged to
    the SklearnModel constructor.
    """
    estimator = LogisticRegression()
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        mode="classification",
        model_instance=estimator,
        verbosity=verbosity,
    )
def model_builder(tasks, task_types, params_dict, model_dir,
                  verbosity=verbosity):
    """Build a SklearnModel around ``model_class`` configured from ``params_dict``.

    ``model_class`` and the default for ``verbosity`` both come from the
    enclosing scope. ``n_estimators`` and ``max_features`` are read from
    ``params_dict`` (a missing key raises KeyError), which is also passed
    through to SklearnModel.

    NOTE(review): ``verbosity`` is accepted but not forwarded to
    SklearnModel -- confirm whether that is intentional.
    """
    estimator = model_class(
        n_estimators=params_dict["n_estimators"],
        max_features=params_dict["max_features"],
    )
    return SklearnModel(
        tasks,
        task_types,
        params_dict,
        model_dir,
        model_instance=estimator,
    )
def test_sklearn_transformed_regression(self):
    """Check linear regression learns the diabetes data after X/y transforms."""
    np.random.seed(123)
    diabetes = sklearn.datasets.load_diabetes()
    features, targets = diabetes.data, diabetes.target

    # First 70% of the rows train the model, the rest are held out.
    frac_train = .7
    n_train = int(frac_train * len(features))
    train_dataset = DiskDataset.from_numpy(
        self.train_dir, features[:n_train], targets[:n_train])
    test_dataset = DiskDataset.from_numpy(
        self.test_dir, features[n_train:], targets[n_train:])

    # Every transformer is built from the training split, then applied in
    # list order to the training split and afterwards to the test split.
    transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset),
        NormalizationTransformer(transform_y=True, dataset=train_dataset),
    ]
    for split in (train_dataset, test_dataset):
        for step in transformers:
            step.transform(split)

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(LinearRegression(), self.model_dir)

    # Train and persist the model.
    model.fit(train_dataset)
    model.save()

    # Score on the training split.
    train_evaluator = Evaluator(
        model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    assert train_scores[regression_metric.name] > .5

    # Score on the held-out split.
    test_evaluator = Evaluator(
        model, test_dataset, transformers, verbosity=verbosity)
    test_scores = test_evaluator.compute_model_performance(
        [regression_metric])
    assert test_scores[regression_metric.name] > .5
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API.

    Featurizes ``example.csv`` with RDKitDescriptors, splits it with
    ScaffoldSplitter, applies the input/output transformers to both
    splits, fits a RandomForestRegressor wrapped in SklearnModel, and runs
    the evaluator on both splits. The scores are not asserted on; the test
    only checks that the pipeline runs.
    """
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    # All transformers are constructed from the training split.
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    # Use a dedicated loop name so the featurized ``dataset`` above is not
    # silently rebound to the test split.
    for split in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(split)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def test_sklearn_regression_overfit(self):
    """Check that a wrapped random-forest regressor overfits a tiny dataset."""
    tasks = ["task0"]
    task_types = dict.fromkeys(tasks, "regression")
    num_rows, num_cols = 10, 3
    num_tasks = len(tasks)

    # Seed before drawing so the synthetic features/targets are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(num_rows)
    features = np.random.rand(num_rows, num_cols)
    targets = np.random.rand(num_rows, num_tasks)
    weights = np.ones((num_rows, num_tasks))
    dataset = Dataset.from_numpy(
        self.train_dir, features, targets, weights, sample_ids, tasks)

    model_params = {
        "batch_size": None,
        "data_shape": dataset.get_data_shape(),
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(
        tasks, task_types, model_params, self.model_dir,
        mode="regression",
        model_instance=RandomForestRegressor())

    # Train and persist the model.
    model.fit(dataset)
    model.save()

    # Score on the very data the model was fitted on (no transformers).
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
def test_sklearn_classification(self):
    """Check logistic regression learns the two-class digits dataset."""
    np.random.seed(123)
    digits = sklearn.datasets.load_digits(n_class=2)
    features, labels = digits.data, digits.target

    # First 70% of the rows train the model, the rest are held out.
    frac_train = .7
    n_train = int(frac_train * len(features))
    train_dataset = DiskDataset.from_numpy(
        self.train_dir, features[:n_train], labels[:n_train])
    test_dataset = DiskDataset.from_numpy(
        self.test_dir, features[n_train:], labels[n_train:])

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(LogisticRegression(), self.model_dir)

    # Train and persist the model.
    model.fit(train_dataset)
    model.save()

    # Evaluate on the training split; the result is computed but not
    # asserted on.
    train_evaluator = Evaluator(
        model, train_dataset, [], verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])

    # Evaluate on the held-out split; this is the score under test.
    test_evaluator = Evaluator(model, test_dataset, [], verbosity=verbosity)
    scores = test_evaluator.compute_model_performance(
        [classification_metric])

    assert scores[classification_metric.name] > .5
def __init__(self, pad=5):
    """Create the pocket finder and load the pretrained pocket model.

    Downloads and unpacks the ``pocket_random_refined_RF`` archive in the
    current working directory, moves the unpacked directory into a fresh
    temporary directory, and reloads a SklearnModel from it. Requires the
    ``wget``, ``tar`` and ``mv`` executables and network access.

    Parameters
    ----------
    pad:
        Stored as ``self.pad`` and passed to ConvexHullPocketFinder.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    # Commands are passed as explicit argument lists rather than built by
    # splitting a formatted string, so a temp dir path containing
    # whitespace stays a single argument.
    call([
        "wget",
        "-c",
        "http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz",
    ])
    call(["tar", "-zxvf", "pocket_random_refined_RF.tar.gz"])
    call(["mv", "pocket_random_refined_RF", self.base_dir])
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Reload the pretrained model from the unpacked directory
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)
pd.DataFrame(train_dataset.y, columns=['prediction']).to_csv(modeldir + "train_original.csv") pd.DataFrame(valid_dataset.y, columns=['prediction']).to_csv(modeldir + "valid_original.csv") for estimator in n_estimators: print('n_estimators = {0}'.format(estimator)) #Create model sklmodel = RandomForestRegressor(n_estimators=estimator, criterion="mse", max_features=max_features, bootstrap=True, oob_score=False, n_jobs=int(cpus / 2)) model = SklearnModel(sklmodel, modeldir) model.fit(train_dataset) #Append trains cores and results train_scores = model.evaluate( train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)]) train_results = np.concatenate( (train_results, list(train_scores.values()))) valid_scores = model.evaluate( valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)]) test_results = np.concatenate((test_results, list(valid_scores.values()))) #Append trains cores and results predict_train = pd.DataFrame(
def model_builder(model_dir):
    """Return a SklearnModel wrapping a default LogisticRegression, stored at ``model_dir``."""
    return SklearnModel(LogisticRegression(), model_dir)
pdbbind_task_types = {task: "regression" for task in pdbbind_tasks} classification_metric = Metric(metrics.r2_score, verbosity=verbosity, mode="regression") params_dict = { "batch_size": None, "data_shape": train_dataset.get_data_shape(), } if os.path.exists(model_dir): shutil.rmtree(model_dir) os.makedirs(model_dir) model = SklearnModel(pdbbind_tasks, pdbbind_task_types, params_dict, model_dir, model_instance=RandomForestRegressor(n_estimators=500), verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance( [classification_metric]) print("Train scores")
# Get supports on test-set support_generator = SupportGenerator( test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg, n_trials, replace) # Compute accuracies task_scores = { task: [] for task in range(len(test_dataset.get_task_names())) } for (task, support) in support_generator: # Train model on support sklearn_model = RandomForestClassifier(class_weight="balanced", n_estimators=50) model = SklearnModel(sklearn_model, model_dir) model.fit(support) # Test model task_dataset = get_task_dataset_minus_support(test_dataset, support, task) y_pred = model.predict_proba(task_dataset) score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w) #print("Score on task %s is %s" % (str(task), str(score))) task_scores[task].append(score) # Join information for all tasks. mean_task_scores = {} for task in range(len(test_dataset.get_task_names())): mean_task_scores[task] = np.mean(np.array(task_scores[task])) print("Fold %s" % str(fold))
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a SklearnModel wrapping LogisticRegression(class_weight="balanced").

    The positional arguments and ``verbosity`` are forwarded unchanged to
    the SklearnModel constructor.
    """
    estimator = LogisticRegression(class_weight="balanced")
    return SklearnModel(
        tasks,
        task_types,
        model_params,
        model_dir,
        model_instance=estimator,
        verbosity=verbosity,
    )
def generate_rf_model():
    """Return a SklearnModel wrapping RandomForestRegressor(n_estimators=500).

    The model directory is the current directory (``"."``).
    """
    forest = RandomForestRegressor(n_estimators=500)
    return SklearnModel(forest, ".")
def model_builder(model_dir):
    """Return a SklearnModel wrapping a default RandomForestRegressor, stored at ``model_dir``."""
    return SklearnModel(RandomForestRegressor(), model_dir)
def model_builder(model_dir):
    """Return a SklearnModel wrapping a default RandomForestClassifier, stored at ``model_dir``."""
    return SklearnModel(RandomForestClassifier(), model_dir)
def model_builder(model_dir):
    """Return a SklearnModel wrapping RandomForestRegressor(n_estimators=500), stored at ``model_dir``."""
    return SklearnModel(RandomForestRegressor(n_estimators=500), model_dir)
def model_builder(model_dir):
    """Return a SklearnModel wrapping a class-balanced random forest.

    The estimator is RandomForestClassifier(class_weight="balanced",
    n_estimators=500) and the model is stored at ``model_dir``.
    """
    forest_kwargs = {"class_weight": "balanced", "n_estimators": 500}
    return SklearnModel(RandomForestClassifier(**forest_kwargs), model_dir)
def rf_model_builder(model_params, model_dir):
    """Return a SklearnModel wrapping a RandomForestClassifier.

    ``model_params`` is unpacked as keyword arguments to the
    RandomForestClassifier constructor; the model is stored at
    ``model_dir``.
    """
    return SklearnModel(RandomForestClassifier(**model_params), model_dir)
featurizer = deepchem.feat.WeaveFeaturizer(), transformers = 2, modelname = MPNNModel, model_file = model_dir + "mpnn_model", dataset_file = data_dir + 'To_predict.csv', fname = 'PredictedMPNN.csv', parentdir = data_dir, newdir = newdir) flag_predicted = False; if len(models) == 0 or "RandomForest" in models: print("-Evaluating Random Forest Model", flush = True) predictchem.predict_csv_from_model( featurizer = deepchem.feat.CircularFingerprint(size=1024), transformers = 2, modelname = SklearnModel(model_dir = model_dir + "random_forest"), model_file = "", #No need for model_file dataset_file = data_dir + 'To_predict.csv', fname = 'PredictedForest.csv', parentdir = data_dir, newdir = newdir, modeltype = "sklearn") flag_predicted = False; if len(models) == 0 or "KRR" in models: print("-Evaluating Kernel Ridge Regression", flush = True) predictchem.predict_csv_from_model( featurizer = deepchem.feat.CircularFingerprint(size=1024), transformers = 2, modelname = SklearnModel(model_dir = model_dir + "krr_model"), model_file = "", #No need for model_file