model.fit(train_dataset) #Append trains cores and results train_scores = model.evaluate( train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)]) train_results = np.concatenate( (train_results, list(train_scores.values()))) valid_scores = model.evaluate( valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)]) test_results = np.concatenate((test_results, list(valid_scores.values()))) #Append trains cores and results predict_train = pd.DataFrame( model.predict(train_dataset), columns=['prediction']).to_csv(modeldir + "predict_train_" + str(estimator) + '.csv') predict_valid = pd.DataFrame( model.predict(valid_dataset), columns=['prediction']).to_csv(modeldir + "predict_validation_" + str(estimator) + '.csv') #Append measures estimators = np.concatenate((estimators, [estimator, estimator])) metrics = np.concatenate((metrics, ["r2", "mse"])) # Fit trained model model.save() print("Dataframe for n_estimator selection") df = pd.DataFrame({
# Train and evaluate an RBF kernel ridge regression model on the prepared
# train/validation splits, write per-sample predictions to CSV, and save a
# summary table of r2/mse-style scores for later analysis.

# Median Pearson R^2 across tasks is the primary metric; MAE is reported
# alongside it by each evaluate() call.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.median)

# Wrap the sklearn estimator so deepchem can drive fit/evaluate/save.
sklmodel = KernelRidge(kernel="rbf", alpha=1e-5, gamma=0.05)
model = SklearnModel(sklmodel, modeldir)

# Fit trained model
print("Fitting model")
model.fit(train_dataset)
model.save()

print("Evaluating model")
train_scores = model.evaluate(
    train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])
# BUG FIX: the original read "train_scores = model.evaluate(...) =
# model.evaluate(valid_dataset, ...)" — the "valid_scores" assignment target
# was missing (a SyntaxError, flagged by the old "#Error kernel" comment),
# even though valid_scores is consumed below.
valid_scores = model.evaluate(
    valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])

# Persist per-sample predictions; to_csv() returns None, so there is no
# point binding its result to a variable as the original did.
pd.DataFrame(model.predict(train_dataset),
             columns=['prediction']).to_csv(modeldir + "predict_train.csv")
pd.DataFrame(model.predict(valid_dataset),
             columns=['prediction']).to_csv(modeldir + "predict_validation.csv")

print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)

# Metric labels paired positionally with the two score values from each
# evaluate() call. (The original concatenated with an empty array for no
# effect; a plain array is equivalent.)
metrics = np.array(["r2", "mse"])
print("Dataframe for n_estimator selection")
df = pd.DataFrame({'metrics': metrics,
                   'train_scores': list(train_scores.values()),
                   'result_scores': list(valid_scores.values())})
df.to_csv(modeldir + "KRRAnalysis.csv", index=False)
class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

  Downloads a pre-trained random-forest binding-pocket classifier at
  construction time, generates candidate pockets with a convex-hull
  finder, and keeps only the candidates the classifier scores as active.
  """

  def __init__(self, pad=5):
    """Download the trained RF model and set up featurizers.

    Parameters
    ----------
    pad: int, optional (default 5)
      Padding passed through to ConvexHullPocketFinder.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Fetch and unpack the pre-trained model into a scratch directory.
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Restore the serialized sklearn model from disk.
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Featurizers for pocket descriptors and the ligand fingerprint.
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Find predicted-active binding pockets for a protein/ligand complex.

    Returns a tuple (active_pockets, active_pocket_atoms_map,
    active_pocket_coords), or (None, None, None) if RDKit cannot parse
    the converted ligand.

    TODO(rbharath): This has a lot of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to
    refactor to avoid code duplication.
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

    # Convert the ligand to mol2 via openbabel so RDKit can read it.
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))

    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      # BUG FIX: the success path returns three values, but the original
      # returned only (None, None) here, so callers unpacking three
      # values raised ValueError on parse failure.
      return None, None, None
    # Default feature size for CircularFingerprint.
    n_ligand_features = 1024
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize the candidate pockets proposed by the convex-hull finder.
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    n_pocket_features = BindingPocketFeaturizer.n_features
    features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # The single ligand fingerprint row broadcasts across all pockets.
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

    # Keep pockets whose predicted probability of class 1 ("active")
    # clears the threshold.
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind in range(len(pockets)):
      # TODO(rbharath): For now, using a weak cutoff. Fix later.
      if pocket_pred_proba[pocket_ind][1] > .15:
        pocket = pockets[pocket_ind]
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords
class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

  A convex-hull finder proposes candidate pockets; a downloaded,
  pre-trained random-forest classifier then filters them to the pockets
  predicted to be active binding sites.
  """

  def __init__(self, pad=5):
    """Download the trained RF model and prepare featurizers.

    Parameters
    ----------
    pad: int, optional (default 5)
      Padding passed through to ConvexHullPocketFinder.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Download and unpack the pre-trained model into a temp directory.
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call((
        "wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Reload the serialized sklearn model from the unpacked directory.
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Featurizers: pocket descriptors plus a 1024-bit ligand fingerprint.
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Find predicted-active binding pockets for a protein/ligand complex.

    Returns (active_pockets, active_pocket_atoms_map,
    active_pocket_coords), or (None, None, None) when RDKit fails to
    parse the converted ligand.

    TODO(rbharath): This has a lot of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to
    refactor to avoid code duplication.
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

    # openbabel sdf -> mol2 conversion so RDKit can read the ligand.
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))

    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      # BUG FIX: the success path returns a 3-tuple; the original
      # returned (None, None) here, breaking callers that unpack three
      # values.
      return None, None, None
    # Default feature size for CircularFingerprint.
    n_ligand_features = 1024
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize every candidate pocket from the convex-hull finder.
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    n_pocket_features = BindingPocketFeaturizer.n_features
    features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # One ligand fingerprint row broadcast across all pocket rows.
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

    # Retain pockets whose class-1 ("active") probability exceeds the
    # cutoff.
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind in range(len(pockets)):
      # TODO(rbharath): For now, using a weak cutoff. Fix later.
      if pocket_pred_proba[pocket_ind][1] > .15:
        pocket = pockets[pocket_ind]
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords