class RFConvexHullPocketFinder(BindingPocketFinder):
    """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

    Candidate pockets are proposed by a ConvexHullPocketFinder; each candidate
    is featurized together with the ligand and scored by a pre-trained model
    loaded through SklearnModel. Candidates whose predicted probability
    exceeds ACTIVE_PROBA_CUTOFF are reported as active.
    """

    # Fingerprint length; must agree with the CircularFingerprint size used
    # below and with the width of the ligand slice of the feature matrix.
    N_LIGAND_FEATURES = 1024

    # TODO(rbharath): For now, using a weak cutoff. Fix later.
    ACTIVE_PROBA_CUTOFF = .15

    def __init__(self, pad=5):
        """Download the pre-trained model and set up the featurizers.

        Parameters
        ----------
        pad: int, optional
            Forwarded unchanged to ConvexHullPocketFinder.

        Notes
        -----
        Shells out to ``wget``, ``tar`` and ``mv``. The archive is downloaded
        and unpacked in the current working directory, then moved into a
        fresh temporary directory stored as ``self.base_dir``.
        """
        self.pad = pad
        self.convex_finder = ConvexHullPocketFinder(pad)

        # Load binding pocket model
        self.base_dir = tempfile.mkdtemp()
        print("About to download trained model.")
        # TODO(rbharath): Shift refined to full once trained.
        # Arguments are passed as explicit lists (not a formatted string that
        # is split on whitespace) so a temp dir containing spaces still works.
        call([
            "wget", "-c",
            "http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
        ])
        call(["tar", "-zxvf", "pocket_random_refined_RF.tar.gz"])
        call(["mv", "pocket_random_refined_RF", self.base_dir])
        self.model_dir = os.path.join(self.base_dir,
                                      "pocket_random_refined_RF")

        # Reload the already-trained model from disk (no fitting happens here).
        self.model = SklearnModel(model_dir=self.model_dir)
        self.model.reload()

        # Create featurizers
        self.pocket_featurizer = BindingPocketFeaturizer()
        self.ligand_featurizer = CircularFingerprint(
            size=self.N_LIGAND_FEATURES)

    def find_pockets(self, protein_file, ligand_file):
        """Find candidate pockets of a complex that the model scores as active.

        TODO(rbharath): This has a lot of code overlap with
        compute_binding_pocket_features in
        examples/binding_pockets/binding_pocket_datasets.py. Find way to
        refactor to avoid code duplication.

        Parameters
        ----------
        protein_file: str
            Protein structure file, passed to the convex hull finder and to
            the pocket featurizer.
        ligand_file: str
            Ligand file; must end in ".sdf".

        Returns
        -------
        tuple
            ``(active_pockets, active_pocket_atoms_map, active_pocket_coords)``
            restricted to the pockets that pass the probability cutoff. If the
            ligand cannot be parsed, ``(None, None, None)`` is returned. If no
            candidate pockets are found, ``([], {}, [])`` is returned.

        Raises
        ------
        ValueError
            If ``ligand_file`` does not end in ".sdf".
        """
        if not ligand_file.endswith(".sdf"):
            raise ValueError("Only .sdf ligand files can be featurized.")
        ligand_basename = os.path.basename(ligand_file).split(".")[0]
        ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

        # Write mol2 file for ligand
        obConversion = ob.OBConversion()
        obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
        ob_mol = ob.OBMol()
        obConversion.ReadFile(ob_mol, str(ligand_file))
        obConversion.WriteFile(ob_mol, str(ligand_mol2))

        # Featurize ligand
        mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
        if mol is None:
            # Same arity as the success path so callers can always unpack
            # three values.
            return None, None, None
        n_ligand_features = self.N_LIGAND_FEATURES
        ligand_features = self.ligand_featurizer.featurize([mol])

        # Featurize pocket
        pockets, pocket_atoms_map, pocket_coords = (
            self.convex_finder.find_pockets(protein_file, ligand_file))
        n_pockets = len(pockets)
        if n_pockets == 0:
            # Nothing to score; avoid handing an empty batch to the model.
            return [], {}, []
        n_pocket_features = BindingPocketFeaturizer.n_features

        features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
        pocket_features = self.pocket_featurizer.featurize(
            protein_file, pockets, pocket_atoms_map, pocket_coords)
        # Note broadcast operation: the single ligand row is repeated for
        # every pocket.
        features[:, :n_pocket_features] = pocket_features
        features[:, n_pocket_features:] = ligand_features

        dataset = NumpyDataset(X=features)
        # np.squeeze also drops the sample axis when there is exactly one
        # pocket; np.atleast_2d restores it so row indexing below stays valid.
        # NOTE(review): assumes one probability row per pocket after the
        # squeeze, with index 1 being the "active" class - confirm against
        # SklearnModel.predict_proba.
        pocket_pred_proba = np.atleast_2d(
            np.squeeze(self.model.predict_proba(dataset)))

        # Find pockets which are active
        active_pockets = []
        active_pocket_atoms_map = {}
        active_pocket_coords = []
        for pocket_ind, pocket in enumerate(pockets):
            if pocket_pred_proba[pocket_ind][1] > self.ACTIVE_PROBA_CUTOFF:
                active_pockets.append(pocket)
                active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
                active_pocket_coords.append(pocket_coords[pocket_ind])
        return active_pockets, active_pocket_atoms_map, active_pocket_coords
# Compute accuracies: one list of scores per task index for this fold.
task_scores = dict(
    (task, []) for task in range(len(test_dataset.get_task_names())))
for task, support in support_generator:
    # Train a fresh class-balanced random forest on this support set.
    model = SklearnModel(
        RandomForestClassifier(class_weight="balanced", n_estimators=50),
        model_dir)
    model.fit(support)

    # Test model on the dataset returned by get_task_dataset_minus_support
    # (presumably the task's datapoints with the support removed).
    task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
    y_pred = model.predict_proba(task_dataset)
    score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
    task_scores[task].append(score)

# Join information for all tasks: average the collected scores per task.
mean_task_scores = {}
for task in range(len(test_dataset.get_task_names())):
    mean_task_scores[task] = np.mean(task_scores[task])
print("Fold " + str(fold))
print(mean_task_scores)

# Record this fold's means under the fold's own task identifiers.
for fold_task, task in zip(fold_tasks,
                           range(len(test_dataset.get_task_names()))):
    all_scores[fold_task] = mean_task_scores[task]
class RFConvexHullPocketFinder(BindingPocketFinder):
    """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

    Candidate pockets are proposed by a ConvexHullPocketFinder; each candidate
    is featurized together with the ligand and scored by a pre-trained model
    loaded through SklearnModel. Candidates whose predicted probability
    exceeds ACTIVE_PROBA_CUTOFF are reported as active.
    """

    # Fingerprint length; must agree with the CircularFingerprint size used
    # below and with the width of the ligand slice of the feature matrix.
    N_LIGAND_FEATURES = 1024

    # TODO(rbharath): For now, using a weak cutoff. Fix later.
    ACTIVE_PROBA_CUTOFF = .15

    def __init__(self, pad=5):
        """Download the pre-trained model and set up the featurizers.

        Parameters
        ----------
        pad: int, optional
            Forwarded unchanged to ConvexHullPocketFinder.

        Notes
        -----
        Shells out to ``wget``, ``tar`` and ``mv``. The archive is downloaded
        and unpacked in the current working directory, then moved into a
        fresh temporary directory stored as ``self.base_dir``.
        """
        self.pad = pad
        self.convex_finder = ConvexHullPocketFinder(pad)

        # Load binding pocket model
        self.base_dir = tempfile.mkdtemp()
        print("About to download trained model.")
        # TODO(rbharath): Shift refined to full once trained.
        # Arguments are passed as explicit lists (not a formatted string that
        # is split on whitespace) so a temp dir containing spaces still works.
        call([
            "wget", "-c",
            "http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
        ])
        call(["tar", "-zxvf", "pocket_random_refined_RF.tar.gz"])
        call(["mv", "pocket_random_refined_RF", self.base_dir])
        self.model_dir = os.path.join(self.base_dir,
                                      "pocket_random_refined_RF")

        # Reload the already-trained model from disk (no fitting happens here).
        self.model = SklearnModel(model_dir=self.model_dir)
        self.model.reload()

        # Create featurizers
        self.pocket_featurizer = BindingPocketFeaturizer()
        self.ligand_featurizer = CircularFingerprint(
            size=self.N_LIGAND_FEATURES)

    def find_pockets(self, protein_file, ligand_file):
        """Find candidate pockets of a complex that the model scores as active.

        TODO(rbharath): This has a lot of code overlap with
        compute_binding_pocket_features in
        examples/binding_pockets/binding_pocket_datasets.py. Find way to
        refactor to avoid code duplication.

        Parameters
        ----------
        protein_file: str
            Protein structure file, passed to the convex hull finder and to
            the pocket featurizer.
        ligand_file: str
            Ligand file; must end in ".sdf".

        Returns
        -------
        tuple
            ``(active_pockets, active_pocket_atoms_map, active_pocket_coords)``
            restricted to the pockets that pass the probability cutoff. If the
            ligand cannot be parsed, ``(None, None, None)`` is returned. If no
            candidate pockets are found, ``([], {}, [])`` is returned.

        Raises
        ------
        ValueError
            If ``ligand_file`` does not end in ".sdf".
        """
        if not ligand_file.endswith(".sdf"):
            raise ValueError("Only .sdf ligand files can be featurized.")
        ligand_basename = os.path.basename(ligand_file).split(".")[0]
        ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

        # Write mol2 file for ligand
        obConversion = ob.OBConversion()
        obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
        ob_mol = ob.OBMol()
        obConversion.ReadFile(ob_mol, str(ligand_file))
        obConversion.WriteFile(ob_mol, str(ligand_mol2))

        # Featurize ligand
        mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
        if mol is None:
            # Same arity as the success path so callers can always unpack
            # three values.
            return None, None, None
        n_ligand_features = self.N_LIGAND_FEATURES
        ligand_features = self.ligand_featurizer.featurize([mol])

        # Featurize pocket
        pockets, pocket_atoms_map, pocket_coords = (
            self.convex_finder.find_pockets(protein_file, ligand_file))
        n_pockets = len(pockets)
        if n_pockets == 0:
            # Nothing to score; avoid handing an empty batch to the model.
            return [], {}, []
        n_pocket_features = BindingPocketFeaturizer.n_features

        features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
        pocket_features = self.pocket_featurizer.featurize(
            protein_file, pockets, pocket_atoms_map, pocket_coords)
        # Note broadcast operation: the single ligand row is repeated for
        # every pocket.
        features[:, :n_pocket_features] = pocket_features
        features[:, n_pocket_features:] = ligand_features

        dataset = NumpyDataset(X=features)
        # np.squeeze also drops the sample axis when there is exactly one
        # pocket; np.atleast_2d restores it so row indexing below stays valid.
        # NOTE(review): assumes one probability row per pocket after the
        # squeeze, with index 1 being the "active" class - confirm against
        # SklearnModel.predict_proba.
        pocket_pred_proba = np.atleast_2d(
            np.squeeze(self.model.predict_proba(dataset)))

        # Find pockets which are active
        active_pockets = []
        active_pocket_atoms_map = {}
        active_pocket_coords = []
        for pocket_ind, pocket in enumerate(pockets):
            if pocket_pred_proba[pocket_ind][1] > self.ACTIVE_PROBA_CUTOFF:
                active_pockets.append(pocket)
                active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
                active_pocket_coords.append(pocket_coords[pocket_ind])
        return active_pockets, active_pocket_atoms_map, active_pocket_coords
# Build the generator of (task, support) pairs drawn from the test set.
support_generator = SupportGenerator(
    test_dataset,
    range(len(test_dataset.get_task_names())),
    n_pos,
    n_neg,
    n_trials,
    replace)

# Compute accuracies: one list of scores per task index for this fold.
task_scores = dict(
    (task, []) for task in range(len(test_dataset.get_task_names())))
for task, support in support_generator:
    # Train a fresh class-balanced random forest on this support set.
    model = SklearnModel(
        RandomForestClassifier(class_weight="balanced", n_estimators=50),
        model_dir)
    model.fit(support)

    # Test model on the dataset returned by get_task_dataset_minus_support
    # (presumably the task's datapoints with the support removed).
    task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
    y_pred = model.predict_proba(task_dataset)
    score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
    task_scores[task].append(score)

# Join information for all tasks: average the collected scores per task.
mean_task_scores = {}
for task in range(len(test_dataset.get_task_names())):
    mean_task_scores[task] = np.mean(task_scores[task])
print("Fold " + str(fold))
print(mean_task_scores)

# Record this fold's means under the fold's own task identifiers.
for fold_task, task in zip(fold_tasks,
                           range(len(test_dataset.get_task_names()))):
    all_scores[fold_task] = mean_task_scores[task]