def test_combined(self):
    ecfp_power = 5
    splif_power = 5
    box_width = 75.0
    voxel_width = 1.0
    voxels_per_edge = int(box_width / voxel_width)

    # test voxel features
    featurizer = RdkitGridFeaturizer(
        voxel_width=voxel_width,
        box_width=box_width,
        feature_types=['voxel_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=False,
        sanitize=True)
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
    voxel_total_len = (
        2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']) + 5)
    self.assertEqual(
        feature_tensor.shape,
        (1, voxels_per_edge, voxels_per_edge, voxels_per_edge, voxel_total_len))

    # test flat features
    featurizer = RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=75.0,
        feature_types=['flat_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        sanitize=True)
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
    flat_total_len = (
        3 * 2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']))
    self.assertEqual(feature_tensor.shape, (1, flat_total_len))

    # check that aromatic features are ignored when sanitize=False
    featurizer = RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=75.0,
        feature_types=['all_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=True,
        sanitize=False)
    self.assertTrue('pi_stack' not in featurizer.feature_types)
    self.assertTrue('cation_pi' not in featurizer.feature_types)
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 56109538))
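# Worked length arithmetic for the checks above, assuming three SPLIF contact
# bins and three hbond distance bins in the default cutoffs (an inference, but
# one consistent with the 40- and 51-length shapes asserted in test_failures
# below): with ecfp_power = splif_power = 5,
#   voxel_total_len = 2**5 + 3 * 2**5 + 3 + 5 = 136
#   flat_total_len = 3 * 2**5 + 3 * 2**5 + 3 = 195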
def test_failures(self):
    # test flattened voxel features
    featurizer = RdkitGridFeaturizer(
        nb_rotations=0,
        box_width=75.,
        voxel_width=1.,
        feature_types=['voxel_combined'],
        flatten=True,
        sanitize=True)
    features = featurizer.featurize([(self.ligand_file, self.protein_file),
                                     ('nan', 'nan')])
    self.assertEqual(features.shape, (2, 16875000))

    # test voxel features
    featurizer = RdkitGridFeaturizer(
        nb_rotations=0,
        box_width=75.,
        voxel_width=1.,
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    features = featurizer.featurize([(self.ligand_file, self.protein_file),
                                     ('nan', 'nan')])
    self.assertEqual(features.shape, (2, 75, 75, 75, 40))

    # test flat features
    featurizer = RdkitGridFeaturizer(
        nb_rotations=0,
        box_width=75.,
        voxel_width=1.,
        feature_types=['flat_combined'],
        flatten=True,
        sanitize=True)
    features = featurizer.featurize([(self.ligand_file, self.protein_file),
                                     ('nan', 'nan')])
    self.assertEqual(features.shape, (2, 51))

    # test rotations
    featurizer = RdkitGridFeaturizer(
        nb_rotations=5,
        box_width=75.,
        voxel_width=1.,
        feature_types=['flat_combined'],
        flatten=True,
        sanitize=True)
    features = featurizer.featurize([(self.ligand_file, self.protein_file),
                                     ('nan', 'nan')])
    self.assertEqual(features.shape, (2, 306))
def test_default_featurizer(self):
    # test that the default parameters work
    featurizer = RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, RdkitGridFeaturizer)
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
def test_force_flatten(self):
    # test that input is flattened when flat features are used
    featurizer = RdkitGridFeaturizer(
        feature_types=['ecfp_hashed'], flatten=False)
    featurizer.flatten = True  # False should be ignored with ecfp_hashed
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))
def test_rotations_default_box(self):
    # rotations with the featurizer's default 16 A box (16 voxels per edge);
    # renamed so it does not shadow test_rotations below
    featurizer = RdkitGridFeaturizer(
        nb_rotations=3,
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    feature_tensors = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertEqual(feature_tensors.shape, (1, 4, 16, 16, 16, 40))
def test_example_featurizer(self):
    # check that the use case from the examples works
    featurizer = RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
        ecfp_power=9,
        splif_power=9,
        flatten=True)
    feature_tensor = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)
def test_rotations(self):
    featurizer = RdkitGridFeaturizer(
        nb_rotations=3,
        box_width=75.,
        voxel_width=1.,
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    feature_tensors = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertEqual(feature_tensors.shape, (1, 300, 75, 75, 40))

    featurizer = RdkitGridFeaturizer(
        nb_rotations=3,
        box_width=75.,
        voxel_width=1.,
        feature_types=['flat_combined'],
        flatten=True,
        sanitize=True)
    feature_tensors = featurizer.featurize([(self.ligand_file, self.protein_file)])
    self.assertEqual(feature_tensors.shape, (1, 204))
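# Shape arithmetic implied by the rotation tests: nb_rotations=3 yields four
# grids (the original pose plus three rotations). They appear as a new axis in
# test_rotations_default_box ((1, 4, 16, 16, 16, 40)), are concatenated along
# the first voxel axis here (4 * 75 = 300), and are concatenated end-to-end in
# the flat case (4 * 51 = 204; likewise 6 * 51 = 306 in test_failures).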
def load_pdbbind_from_dir(data_folder,
                          index_files,
                          featurizer="grid",
                          split="random",
                          ex_ids=[],
                          save_dir=None):
    """Load and featurize raw PDBBind dataset from a local directory with the
    option to avoid certain IDs.

    Parameters
    ----------
    data_folder: String
        Path to the directory containing the raw dataset.
    index_files: List
        Paths of the location and labels index files, relative to data_folder.
    featurizer: Str
        Either "grid" or "atomic" for grid and atomic featurizations.
    split: Str
        Either "random" or "index".
    ex_ids: List
        List of PDB IDs to avoid loading if present.
    save_dir: String
        Path to store the featurized dataset.
    """
    pdbbind_tasks = ["-logKd/Ki"]

    index_file = os.path.join(data_folder, index_files[0])
    labels_file = os.path.join(data_folder, index_files[1])

    # Extract locations of data
    pdbs = []
    with open(index_file, "r") as g:
        lines = g.readlines()
        for line in lines:
            line = line.split(" ")
            pdb = line[0]
            if len(pdb) == 4:
                pdbs.append(pdb)
    # Drop excluded IDs up front so that files and labels stay aligned
    pdbs = [pdb for pdb in pdbs if pdb not in ex_ids]
    protein_files = [
        os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb) for pdb in pdbs
    ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Extract labels
    labels_tmp = {}
    with open(labels_file, "r") as f:
        lines = f.readlines()
        for line in lines:
            # Skip comment lines
            if line[0] == "#":
                continue
            # Lines have format
            # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
            line = line.split()
            # The base-10 logarithm, -log kd/pk (parsed as float, not string)
            log_label = float(line[3])
            labels_tmp[line[0]] = log_label

    labels = np.array([labels_tmp[pdb] for pdb in pdbs])
    print(labels)

    # Featurize data
    if featurizer == "grid":
        featurizer = RdkitGridFeaturizer(
            voxel_width=2.0,
            feature_types=[
                'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack',
                'cation_pi', 'charge'
            ],
            flatten=True)
    elif featurizer == "atomic":
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        frag2_num_atoms = 24000  # for protein atoms
        complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        featurizer = ComplexNeighborListFragmentAtomicCoordinates(
            frag1_num_atoms, frag2_num_atoms, complex_num_atoms,
            max_num_neighbors, neighbor_cutoff)
    else:
        raise ValueError("Featurizer not supported")

    print("Featurizing Complexes")
    features, failures = featurizer.featurize(ligand_files, protein_files)
    # Delete labels for failing elements
    labels = np.delete(labels, failures)
    dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
    # No transformations of data
    transformers = []
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)
    all_dataset = (train, valid, test)
    if save_dir:
        deepchem.utils.data_utils.save_dataset_to_disk(save_dir, train, valid,
                                                       test, transformers)
    return pdbbind_tasks, all_dataset, transformers
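def _example_load_from_dir():
    # Usage sketch for load_pdbbind_from_dir (illustrative, not part of the
    # library API). The local path, the excluded PDB ID, and the save
    # directory below are assumptions. In PDBBind v2015 a single index file
    # carries both the complex locations and the -logKd/Ki labels (as
    # load_pdbbind below relies on), so the same file is passed twice here.
    tasks, (train, valid, test), transformers = load_pdbbind_from_dir(
        data_folder="/path/to/pdbbind/v2015",  # hypothetical local path
        index_files=["INDEX_core_data.2013", "INDEX_core_data.2013"],
        featurizer="grid",
        split="random",
        ex_ids=["1abc"],  # hypothetical PDB ID to exclude
        save_dir="/tmp/pdbbind_featurized")
    print(tasks, train.X.shape)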
def load_pdbbind(reload=True,
                 data_dir=None,
                 subset="core",
                 load_binding_pocket=False,
                 featurizer="grid",
                 split="random",
                 split_seed=None,
                 save_dir=None,
                 save_timestamp=False):
    """Load raw PDBBind dataset by featurization and split.

    Parameters
    ----------
    reload: Bool, optional
        Whether to reload a previously featurized and split dataset.
    data_dir: Str, optional
        Specifies the directory storing the raw dataset.
    subset: Str
        Specifies which subset of PDBBind, only "core" or "refined" for now.
    load_binding_pocket: Bool, optional
        Load the binding pocket or the full protein.
    featurizer: Str
        Either "grid" or "atomic" for grid and atomic featurizations.
    split: Str
        Either "random" or "index".
    split_seed: Int, optional
        Specifies the random seed for the splitter.
    save_dir: Str, optional
        Specifies the directory to store the featurized and split dataset
        when reload is False. If reload is True, the saved dataset inside
        save_dir will be loaded.
    save_timestamp: Bool, optional
        Whether to save the featurized and split dataset with a timestamp.
        Set it to True when running similar or identical jobs simultaneously
        on multiple compute nodes.
    """
    pdbbind_tasks = ["-logKd/Ki"]

    deepchem_dir = deepchem.utils.data_utils.get_data_dir()

    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    data_folder = os.path.join(data_dir, "pdbbind", "v2015")

    if save_dir is None:
        save_dir = os.path.join(DEFAULT_DATA_DIR, "from-pdbbind")
    if load_binding_pocket:
        save_folder = os.path.join(
            save_dir, "protein_pocket-%s-%s-%s" % (subset, featurizer, split))
    else:
        save_folder = os.path.join(
            save_dir, "full_protein-%s-%s-%s" % (subset, featurizer, split))

    if save_timestamp:
        save_folder = "%s-%s-%s" % (
            save_folder, time.strftime("%Y%m%d", time.localtime()),
            re.search(r"\.(.*)", str(time.time())).group(1))

    if reload:
        if not os.path.exists(save_folder):
            print("Dataset does not exist at {}. Reconstructing...".format(
                save_folder))
        else:
            print("\nLoading featurized and split dataset from:\n%s\n" %
                  save_folder)
        loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
            save_folder)
        if loaded:
            return pdbbind_tasks, all_dataset, transformers

    dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
    if not os.path.exists(dataset_file):
        logger.warning("About to download PDBBind full dataset. Large file, 2GB")
        deepchem.utils.data_utils.download_url(
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/pdbbind_v2015.tar.gz",
            dest_dir=data_dir)
    if os.path.exists(data_folder):
        logger.info("PDBBind full dataset already exists.")
    else:
        print("Untarring full dataset...")
        deepchem.utils.data_utils.untargz_file(
            dataset_file, dest_dir=os.path.join(data_dir, "pdbbind"))

    print("\nRaw dataset:\n%s" % data_folder)
    print("\nFeaturized and split dataset:\n%s" % save_folder)

    if subset == "core":
        index_labels_file = os.path.join(data_folder, "INDEX_core_data.2013")
    elif subset == "refined":
        index_labels_file = os.path.join(data_folder, "INDEX_refined_data.2015")
    else:
        raise ValueError("Other subsets not supported")

    # Extract locations of data
    with open(index_labels_file, "r") as g:
        pdbs = [line[:4] for line in g.readlines() if line[0] != "#"]
    if load_binding_pocket:
        protein_files = [
            os.path.join(data_folder, pdb, "%s_pocket.pdb" % pdb)
            for pdb in pdbs
        ]
    else:
        protein_files = [
            os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb)
            for pdb in pdbs
        ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Extract labels
    with open(index_labels_file, "r") as g:
        labels = np.array([
            # Lines have format
            # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
            # The base-10 logarithm, -log kd/pk
            float(line.split()[3])
            for line in g.readlines()
            if line[0] != "#"
        ])

    # Featurize data
    if featurizer == "grid":
        featurizer = RdkitGridFeaturizer(
            voxel_width=2.0,
            feature_types=[
                'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack',
                'cation_pi', 'charge'
            ],
            flatten=True)
    elif featurizer == "atomic" or featurizer == "atomic_conv":
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        if load_binding_pocket:
            frag2_num_atoms = 1000
            complex_num_atoms = 1070
        else:
            frag2_num_atoms = 24000  # for protein atoms
            complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        if featurizer == "atomic":
            featurizer = ComplexNeighborListFragmentAtomicCoordinates(
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                max_num_neighbors=max_num_neighbors,
                neighbor_cutoff=neighbor_cutoff)
        if featurizer == "atomic_conv":
            featurizer = AtomicConvFeaturizer(
                labels=labels,
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                neighbor_cutoff=neighbor_cutoff,
                max_num_neighbors=max_num_neighbors,
                batch_size=64)
    else:
        raise ValueError("Featurizer not supported")

    print("\nFeaturizing complexes for \"%s\" ...\n" % data_folder)
    feat_t1 = time.time()
    features, failures = featurizer.featurize(ligand_files, protein_files)
    feat_t2 = time.time()
    print("\nFeaturization finished, took %0.3f s." % (feat_t2 - feat_t1))

    # Delete labels and ids for failing elements
    labels = np.delete(labels, failures)
    labels = labels.reshape((len(labels), 1))
    ids = np.delete(pdbs, failures)

    print("\nConstructing dataset excluding failing featurization elements...")
    dataset = deepchem.data.DiskDataset.from_numpy(features, y=labels, ids=ids)

    # No transformations of data
    transformers = []

    # Split dataset
    print("\nSplitting dataset...\n")
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         seed=split_seed)
    all_dataset = (train, valid, test)

    print("\nSaving dataset to \"%s\" ..." % save_folder)
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
    return pdbbind_tasks, all_dataset, transformers
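def _example_load_pdbbind():
    # Usage sketch for load_pdbbind (illustrative, not part of the library
    # API): featurize the PDBBind core-set binding pockets with the grid
    # featurizer and a seeded random split. The first call downloads the ~2GB
    # v2015 archive into the default data directory; later calls with
    # reload=True reuse the cached dataset under save_dir. The seed value is
    # arbitrary.
    tasks, (train, valid, test), transformers = load_pdbbind(
        reload=True,
        subset="core",
        load_binding_pocket=True,
        featurizer="grid",
        split="random",
        split_seed=123)
    print(tasks, train.X.shape)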