예제 #1
0
    def test_full_complex_featurization(self):
        """Check the outputs of ComplexNeighborListFragmentAtomicCoordinates.

        Featurizes the 3zso ligand/protein pair and verifies, for the ligand
        fragment, the protein fragment and the full complex, that the
        coordinate array has one 3-vector per atom, that the neighbor list is
        keyed by every atom index, and that the ``z`` array has one entry per
        atom.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        ligand_file = os.path.join(here, "data/3zso_ligand_hyd.pdb")
        protein_file = os.path.join(here, "data/3zso_protein.pdb")

        # Exact atom counts pulled from the PDB files. For larger datasets
        # with more PDBs, the maximum number of atoms would be used instead.
        frag1_num_atoms = 44  # ligand
        frag2_num_atoms = 2336  # protein
        complex_num_atoms = 2380  # ligand + protein
        max_num_neighbors = 4
        neighbor_cutoff = 4  # angstroms

        featurizer = ComplexNeighborListFragmentAtomicCoordinates(
            frag1_num_atoms, frag2_num_atoms, complex_num_atoms,
            max_num_neighbors, neighbor_cutoff)
        outputs = featurizer._featurize_complex(ligand_file, protein_file)
        (frag1_coords, frag1_nbrs, frag1_z, frag2_coords, frag2_nbrs, frag2_z,
         complex_coords, complex_nbrs, complex_z) = outputs

        # Each entry: (coordinates, neighbor list, z array, expected atom count).
        expectations = (
            (frag1_coords, frag1_nbrs, frag1_z, frag1_num_atoms),
            (frag2_coords, frag2_nbrs, frag2_z, frag2_num_atoms),
            (complex_coords, complex_nbrs, complex_z, complex_num_atoms),
        )
        for coords, nbrs, z, num_atoms in expectations:
            self.assertEqual(coords.shape, (num_atoms, 3))
            self.assertEqual(sorted(nbrs.keys()), list(range(num_atoms)))
            self.assertEqual(z.shape, (num_atoms, ))
예제 #2
0
  def test_atomic_conv_with_feat(self):
    """Smoke-test fitting an AtomicConvModel on one featurized complex.

    Featurizes the 3zso ligand/protein pair, wraps the features and an
    arbitrary label in a DiskDataset, and runs ``fit`` on it.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(
        test_dir, "../../../feat/tests/data/3zso_ligand_hyd.pdb")
    protein_file = os.path.join(
        test_dir, "../../../feat/tests/data/3zso_protein.pdb")

    # Exact atom counts pulled from the PDB files. For larger datasets with
    # more PDBs, the maximum number of atoms would be used instead.
    atom_counts = {
        "frag1_num_atoms": 44,  # ligand
        "frag2_num_atoms": 2336,  # protein
        "complex_num_atoms": 2380,  # ligand + protein
    }
    max_num_neighbors = 4
    neighbor_cutoff = 4  # angstroms
    featurizer = ComplexNeighborListFragmentAtomicCoordinates(
        atom_counts["frag1_num_atoms"], atom_counts["frag2_num_atoms"],
        atom_counts["complex_num_atoms"], max_num_neighbors, neighbor_cutoff)

    # The label value is arbitrary; only the fitting path is exercised.
    labels = np.array([0])
    features, _ = featurizer.featurize_complexes([ligand_file], [protein_file])
    dataset = deepchem.data.DiskDataset.from_numpy(features, labels)

    print("Constructing Atomic Conv model")
    model = atomic_conv.AtomicConvModel(batch_size=1, **atom_counts)

    print("About to call fit")
    # Run a fitting operation
    model.fit(dataset)
예제 #3
0
def compute_atomic_conv_features(tasks, data_dir, pdbbind_dir, y, ids):
  """Featurize PDBBind complexes for atomic convolutions into a DiskDataset.

  For every PDB code in ``ids`` the pocket PDB file and ligand SDF file under
  ``pdbbind_dir/<code>/`` are featurized with
  ComplexNeighborListFragmentAtomicCoordinates. Each successfully featurized
  complex becomes one shard of the resulting dataset; complexes whose
  featurization fails are skipped.

  Parameters
  ----------
  tasks: list
    Task names passed through to ``DiskDataset.create_dataset``.
  data_dir: str
    Directory in which the dataset is created.
  pdbbind_dir: str
    Directory containing one sub-directory per PDB code.
  y: np.ndarray
    Labels, indexed in the same order as ``ids``.
  ids: iterable of str
    PDB codes to featurize.

  Returns
  -------
  The dataset returned by ``dc.data.DiskDataset.create_dataset``.

  Raises
  ------
  ValueError
    If the featurizer does not return exactly nine outputs for a complex.
  """
  num_outputs = 9  # number of values _featurize_complex returns per complex
  frag1_num_atoms = 140
  frag2_num_atoms = 821
  complex_num_atoms = 908
  max_num_neighbors = 8
  neighbor_cutoff = 12.0
  featurizer = ComplexNeighborListFragmentAtomicCoordinates(
      frag1_num_atoms, frag2_num_atoms, complex_num_atoms, max_num_neighbors,
      neighbor_cutoff)

  # Every sample gets the same weight as its label array shape dictates.
  w = np.ones_like(y)

  # Currently featurizes with shard_size=1
  # Dataset can be reshard: dataset = dataset.reshard(48) for example
  def shard_generator():
    for ind, pdb_code in enumerate(ids):
      print("Processing %s" % str(pdb_code))
      pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
      protein_file = os.path.join(pdb_subdir, "%s_pocket.pdb" % pdb_code)
      ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
      computed_feature = featurizer._featurize_complex(
          str(ligand_file), str(protein_file))
      if computed_feature[0] is None:
        print("Bad featurization")
        continue
      if len(computed_feature) != num_outputs:
        raise ValueError("Expected %d featurizer outputs, got %d" %
                         (num_outputs, len(computed_feature)))

      # The outputs are heterogeneous (arrays of different shapes and
      # neighbor lists), so fill an object array slot by slot. Implicit
      # ragged conversion via np.array(...) raises on NumPy >= 1.24.
      X_b = np.empty((1, num_outputs), dtype=object)
      for col, item in enumerate(computed_feature):
        X_b[0, col] = item
      y_b = np.reshape(y[ind], (1, -1))
      w_b = np.reshape(w[ind], (1, -1))
      yield (X_b, y_b, w_b, [pdb_code])

  dataset = dc.data.DiskDataset.create_dataset(
      shard_generator(), data_dir=data_dir, tasks=tasks)

  return dataset
예제 #4
0
def load_pdbbind(featurizer="grid",
                 split="random",
                 subset="core",
                 reload=True):
    """Load, featurize and split the raw PDBBind v2015 dataset.

    The archive is downloaded and untarred into the DeepChem data directory
    when it is not already present.

    Parameters
    ----------
    featurizer: Str
        One of "grid", "atomic" or "atomic_conv".
    split: Str or None
        Either "random" or "index". If None, the dataset is returned unsplit
        as ``(dataset, None, None)``.
    subset: Str
        Only "core" or "refined" for now.
    reload: Bool
        If True, first try to load a previously saved dataset from the data
        directory, and save the freshly split dataset there afterwards.

    Returns
    -------
    tuple
        ``(pdbbind_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
        If ``subset`` or ``featurizer`` is not supported.
    """
    pdbbind_tasks = ["-logKd/Ki"]
    data_dir = deepchem.utils.get_data_dir()
    if reload:
        # NOTE(review): the cache path does not include `subset`, so "core"
        # and "refined" runs share one cache directory -- confirm intended.
        save_dir = os.path.join(data_dir,
                                "pdbbind/" + featurizer + "/" + str(split))
        loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
            save_dir)
        if loaded:
            return pdbbind_tasks, all_dataset, transformers

    dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
    data_folder = os.path.join(data_dir, "v2015")
    if not os.path.exists(dataset_file):
        logger.warning(
            "About to download PDBBind full dataset. Large file, 2GB")
        deepchem.utils.download_url(
            'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
            "pdbbind_v2015.tar.gz")
    if os.path.exists(data_folder):
        logger.info("Data directory for %s already exists" % subset)
    else:
        print("Untarring full dataset")
        deepchem.utils.untargz_file(dataset_file, dest_dir=data_dir)

    if subset == "core":
        index_file = os.path.join(data_folder, "INDEX_core_name.2013")
        labels_file = os.path.join(data_folder, "INDEX_core_data.2013")
    elif subset == "refined":
        index_file = os.path.join(data_folder, "INDEX_refined_name.2013")
        labels_file = os.path.join(data_folder, "INDEX_refined_data.2013")
    else:
        raise ValueError("Other subsets not supported")

    # Extract locations of data. Only lines whose first space-separated token
    # is a 4-character PDB code are kept, which also drops comment lines.
    with open(index_file, "r") as g:
        first_tokens = (line.split(" ")[0] for line in g)
        pdbs = [token for token in first_tokens if len(token) == 4]
    protein_files = [
        os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb) for pdb in pdbs
    ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Extract labels
    labels = []
    with open(labels_file, "r") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines and comment lines
            if not fields or line[0] == "#":
                continue
            # Lines have format
            # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
            # The label is parsed as a float so the dataset holds numeric
            # targets rather than strings.
            labels.append(float(fields[3]))
    labels = np.array(labels)

    # Featurize Data
    if featurizer == "grid":
        featurizer = rgf.RdkitGridFeaturizer(voxel_width=2.0,
                                             feature_types=[
                                                 'ecfp', 'splif', 'hbond',
                                                 'salt_bridge', 'pi_stack',
                                                 'cation_pi', 'charge'
                                             ],
                                             flatten=True)
    elif featurizer == "atomic":
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        frag2_num_atoms = 24000  # for protein atoms
        complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        featurizer = ComplexNeighborListFragmentAtomicCoordinates(
            frag1_num_atoms, frag2_num_atoms, complex_num_atoms,
            max_num_neighbors, neighbor_cutoff)
    elif featurizer == "atomic_conv":
        frag1_num_atoms = 70  # for ligand atoms
        frag2_num_atoms = 24000  # for protein atoms
        complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        featurizer = AtomicConvFeaturizer(labels=labels,
                                          frag1_num_atoms=frag1_num_atoms,
                                          frag2_num_atoms=frag2_num_atoms,
                                          complex_num_atoms=complex_num_atoms,
                                          neighbor_cutoff=neighbor_cutoff,
                                          max_num_neighbors=max_num_neighbors,
                                          batch_size=64)
    else:
        raise ValueError("Featurizer not supported")

    print("Featurizing Complexes")
    features, failures = featurizer.featurize_complexes(
        ligand_files, protein_files)
    # Delete labels for failing elements
    labels = np.delete(labels, failures)
    dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
    print('Featurization complete.')
    # No transformations of data
    transformers = []
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)
    all_dataset = (train, valid, test)
    if reload:
        deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                                 transformers)
    return pdbbind_tasks, all_dataset, transformers
예제 #5
0
def load_pdbbind(reload=True,
                 data_dir=None,
                 subset="core",
                 load_binding_pocket=False,
                 featurizer="grid",
                 split="random",
                 split_seed=None,
                 save_dir=None,
                 save_timestamp=False):
    """Load raw PDBBind dataset by featurization and split.

    Parameters
    ----------
    reload: Bool, optional
        If True, try to load a previously saved featurized and split dataset
        first; if that fails the dataset is rebuilt.
    data_dir: Str, optional
        Specifies the directory storing the raw dataset.
    subset: Str
        Specifies which subset of PDBBind, only "core" or "refined" for now.
    load_binding_pocket: Bool, optional
        Load binding pocket or full protein.
    featurizer: Str
        One of "grid", "atomic" or "atomic_conv".
    split: Str or None
        Either "random" or "index". If None, the dataset is returned unsplit
        as ``(dataset, None, None)`` and nothing is saved.
    split_seed: Int, optional
        Specifies the random seed for splitter.
    save_dir: Str, optional
        Directory under which the featurized and split dataset is stored, and
        from which it is loaded when reload is True.
    save_timestamp: Bool, optional
        Save featurized and split dataset with timestamp or not. Set it as
        True when running similar or same jobs simultaneously on multiple
        compute nodes.

    Returns
    -------
    tuple
        ``(pdbbind_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
        If ``subset`` or ``featurizer`` is not supported.
    """

    pdbbind_tasks = ["-logKd/Ki"]

    # NOTE(review): DEFAULT_DATA_DIR is not defined in the visible code;
    # presumably a module-level constant -- confirm.
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    data_folder = os.path.join(data_dir, "pdbbind", "v2015")

    if save_dir is None:
        save_dir = os.path.join(DEFAULT_DATA_DIR, "from-pdbbind")
    # `featurizer` is still the string argument here, so it can be embedded
    # in the folder name.
    if load_binding_pocket:
        save_folder = os.path.join(
            save_dir, "protein_pocket-%s-%s-%s" % (subset, featurizer, split))
    else:
        save_folder = os.path.join(
            save_dir, "full_protein-%s-%s-%s" % (subset, featurizer, split))

    if save_timestamp:
        # Suffix with the date plus the fractional part of the current time
        # so that concurrent jobs get distinct folders.
        save_folder = "%s-%s-%s" % (
            save_folder, time.strftime("%Y%m%d", time.localtime()),
            re.search(r"\.(.*)", str(time.time())).group(1))

    if reload:
        if not os.path.exists(save_folder):
            print("Dataset does not exist at {}. Reconstructing...".format(
                save_folder))
        else:
            print("\nLoading featurized and splitted dataset from:\n%s\n" %
                  save_folder)
        loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
            save_folder)
        if loaded:
            return pdbbind_tasks, all_dataset, transformers

    dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
    if not os.path.exists(dataset_file):
        logger.warning(
            "About to download PDBBind full dataset. Large file, 2GB")
        deepchem.utils.download_url(
            'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
            "pdbbind_v2015.tar.gz",
            dest_dir=data_dir)
    if os.path.exists(data_folder):
        logger.info("PDBBind full dataset already exists.")
    else:
        print("Untarring full dataset...")
        deepchem.utils.untargz_file(dataset_file,
                                    dest_dir=os.path.join(data_dir, "pdbbind"))

    print("\nRaw dataset:\n%s" % data_folder)
    print("\nFeaturized and splitted dataset:\n%s" % save_folder)

    if subset == "core":
        index_labels_file = os.path.join(data_folder, "INDEX_core_data.2013")
    elif subset == "refined":
        index_labels_file = os.path.join(data_folder,
                                         "INDEX_refined_data.2015")
    else:
        raise ValueError("Other subsets not supported")

    # Extract PDB codes and labels in a single pass over the index file.
    # Lines have format
    # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
    pdbs = []
    labels = []
    with open(index_labels_file, "r") as g:
        for line in g:
            # Skip comment lines and blank lines
            if line[0] == "#" or not line.strip():
                continue
            pdbs.append(line[:4])
            # The base-10 logarithm, -log kd/pk
            labels.append(float(line.split()[3]))
    labels = np.array(labels)

    protein_suffix = "pocket" if load_binding_pocket else "protein"
    protein_files = [
        os.path.join(data_folder, pdb, "%s_%s.pdb" % (pdb, protein_suffix))
        for pdb in pdbs
    ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Featurize Data
    if featurizer == "grid":
        featurizer = rgf.RdkitGridFeaturizer(voxel_width=2.0,
                                             feature_types=[
                                                 'ecfp', 'splif', 'hbond',
                                                 'salt_bridge', 'pi_stack',
                                                 'cation_pi', 'charge'
                                             ],
                                             flatten=True)
    elif featurizer == "atomic" or featurizer == "atomic_conv":
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        if load_binding_pocket:
            frag2_num_atoms = 1000
            complex_num_atoms = 1070
        else:
            frag2_num_atoms = 24000  # for protein atoms
            complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        # Use if/else: once `featurizer` is rebound to an object it must not
        # be compared against the option strings again.
        if featurizer == "atomic":
            featurizer = ComplexNeighborListFragmentAtomicCoordinates(
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                max_num_neighbors=max_num_neighbors,
                neighbor_cutoff=neighbor_cutoff)
        else:
            featurizer = AtomicConvFeaturizer(
                labels=labels,
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                neighbor_cutoff=neighbor_cutoff,
                max_num_neighbors=max_num_neighbors,
                batch_size=64)
    else:
        raise ValueError("Featurizer not supported")

    print("\nFeaturizing Complexes for \"%s\" ...\n" % data_folder)
    feat_t1 = time.time()
    features, failures = featurizer.featurize_complexes(
        ligand_files, protein_files)
    feat_t2 = time.time()
    print("\nFeaturization finished, took %0.3f s." % (feat_t2 - feat_t1))

    # Delete labels and ids for failing elements
    labels = np.delete(labels, failures)
    labels = labels.reshape((len(labels), 1))
    ids = np.delete(pdbs, failures)

    print("\nConstruct dataset excluding failing featurization elements...")
    dataset = deepchem.data.DiskDataset.from_numpy(features, y=labels, ids=ids)

    # No transformations of data
    transformers = []

    # Split dataset
    print("\nSplit dataset...\n")
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         seed=split_seed)

    all_dataset = (train, valid, test)
    print("\nSaving dataset to \"%s\" ..." % save_folder)
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
    return pdbbind_tasks, all_dataset, transformers
예제 #6
0
def load_pdbbind_from_dir(data_folder,
                          index_files,
                          featurizer="grid",
                          split="random",
                          ex_ids=[],
                          save_dir=None):
  """Load and featurize raw PDBBind dataset from a local directory with the option to avoid certain IDs.

  Parameters
  ----------
  data_folder: String
    Directory holding the index files and one sub-directory per PDB ID.
  index_files: List
    Two paths relative to data_folder: the index (names) file followed by
    the labels (data) file.
  featurizer: Str
    Either "grid" or "atomic" for grid and atomic featurizations.
  split: Str or None
    Either "random" or "index". If None, the dataset is returned unsplit as
    ``(dataset, None, None)``.
  ex_ids: List
    List of PDB IDs to avoid loading if present. The default list is never
    mutated.
  save_dir: String
    Path to store featurized datasets. Nothing is saved when falsy.

  Returns
  -------
  tuple
    ``(pdbbind_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  ValueError
    If ``featurizer`` is not supported.
  KeyError
    If a retained PDB ID has no entry in the labels file.
  """
  pdbbind_tasks = ["-logKd/Ki"]

  index_file = os.path.join(data_folder, index_files[0])
  labels_file = os.path.join(data_folder, index_files[1])

  # Build the exclusion set once for O(1) membership tests.
  excluded = set(ex_ids or ())

  # Extract locations of data. Excluded IDs are dropped here so that the
  # file lists and the labels below are built from the same ID list and stay
  # index-aligned with each other and with the featurizer's failure indices.
  with open(index_file, "r") as g:
    first_tokens = (line.split(" ")[0] for line in g)
    pdbs = [
        token for token in first_tokens
        if len(token) == 4 and token not in excluded
    ]
  protein_files = [
      os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb) for pdb in pdbs
  ]
  ligand_files = [
      os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
  ]

  # Extract labels, keyed by PDB ID
  labels_by_pdb = {}
  with open(labels_file, "r") as f:
    for line in f:
      fields = line.split()
      # Skip blank lines and comment lines
      if not fields or line[0] == "#":
        continue
      # Lines have format
      # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
      # The label is parsed as a float so the dataset holds numeric targets.
      labels_by_pdb[fields[0]] = float(fields[3])

  labels = np.array([labels_by_pdb[pdb] for pdb in pdbs])
  print(labels)

  # Featurize Data
  if featurizer == "grid":
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=2.0,
        feature_types=[
            'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack', 'cation_pi',
            'charge'
        ],
        flatten=True)
  elif featurizer == "atomic":
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.
    frag1_num_atoms = 70  # for ligand atoms
    frag2_num_atoms = 24000  # for protein atoms
    complex_num_atoms = 24070  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4
    featurizer = ComplexNeighborListFragmentAtomicCoordinates(
        frag1_num_atoms, frag2_num_atoms, complex_num_atoms, max_num_neighbors,
        neighbor_cutoff)
  else:
    raise ValueError("Featurizer not supported")

  print("Featurizing Complexes")
  # NOTE(review): this assumes featurize(ligand_files, protein_files) returns
  # (features, failures) in the installed DeepChem version -- confirm.
  features, failures = featurizer.featurize(ligand_files, protein_files)
  # Delete labels for failing elements
  labels = np.delete(labels, failures)
  dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
  # No transformations of data
  transformers = []
  if split is None:
    return pdbbind_tasks, (dataset, None, None), transformers

  # TODO(rbharath): This should be modified to contain a cluster split so
  # structures of the same protein aren't in both train/test
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  all_dataset = (train, valid, test)
  if save_dir:
    deepchem.utils.data_utils.save_dataset_to_disk(save_dir, train, valid, test,
                                                   transformers)
  return pdbbind_tasks, all_dataset, transformers
예제 #7
0
def load_pdbbind(reload=True,
                 data_dir=None,
                 subset="core",
                 load_binding_pocket=False,
                 featurizer="grid",
                 split="random",
                 split_seed=None,
                 save_dir=None,
                 save_timestamp=False):
  """Load raw PDBBind dataset by featurization and split.

  Parameters
  ----------
  reload: Bool, optional
    If True, load a previously saved featurized and split dataset and fail
    if it cannot be found or loaded. If False, build the dataset from the
    raw files and save it.
  data_dir: Str, optional
    Specifies the directory storing the raw dataset. Defaults to the
    DeepChem data directory.
  subset: Str
    Specifies which subset of PDBBind, only "core" or "refined" for now.
  load_binding_pocket: Bool, optional
    Load binding pocket or full protein.
  featurizer: Str
    One of "grid", "atomic" or "atomic_conv".
  split: Str or None
    Either "random" or "index". If None, the dataset is returned unsplit as
    ``(dataset, None, None)`` and nothing is saved.
  split_seed: Int, optional
    Specifies the random seed for splitter.
  save_dir: Str, optional
    Specifies the directory to store the featurized and split dataset when
    reload is False. If reload is True, it will load saved dataset inside
    save_dir.
  save_timestamp: Bool, optional
    Save featurized and split dataset with timestamp or not. Set it as True
    when running similar or same jobs simultaneously on multiple compute
    nodes.

  Returns
  -------
  tuple
    ``(pdbbind_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  IOError
    If reload is True and the saved dataset is missing or fails to load.
  ValueError
    If ``subset`` or ``featurizer`` is not supported.
  """

  pdbbind_tasks = ["-logKd/Ki"]

  deepchem_dir = deepchem.utils.get_data_dir()

  if data_dir is None:
    data_dir = deepchem_dir
  data_folder = os.path.join(data_dir, "pdbbind", "v2015")

  if save_dir is None:
    save_dir = os.path.join(deepchem_dir, "from-pdbbind")
  # `featurizer` is still the string argument here, so it can be embedded in
  # the folder name.
  if load_binding_pocket:
    save_folder = os.path.join(
        save_dir, "protein_pocket-%s-%s-%s" % (subset, featurizer, split))
  else:
    save_folder = os.path.join(
        save_dir, "full_protein-%s-%s-%s" % (subset, featurizer, split))

  if save_timestamp:
    # Suffix with the date plus the fractional part of the current time so
    # that concurrent jobs get distinct folders.
    save_folder = "%s-%s-%s" % (save_folder,
                                time.strftime("%Y%m%d", time.localtime()),
                                re.search(r"\.(.*)", str(time.time())).group(1))

  if reload:
    if not os.path.exists(save_folder):
      raise IOError("Cannot find saved dataset from %s!" % save_folder)
    print("\nLoading featurized and splitted dataset from:\n%s\n" % save_folder)
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return pdbbind_tasks, all_dataset, transformers
    else:
      raise IOError("Failed to load featurized and splitted dataset from:\n%s\n"
                    % save_folder)

  dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
  if not os.path.exists(dataset_file):
    logger.warning("About to download PDBBind full dataset. Large file, 2GB")
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
        "pdbbind_v2015.tar.gz",
        dest_dir=data_dir)
  if os.path.exists(data_folder):
    logger.info("PDBBind full dataset already exists.")
  else:
    print("Untarring full dataset...")
    deepchem.utils.untargz_file(
        dataset_file, dest_dir=os.path.join(data_dir, "pdbbind"))

  print("\nRaw dataset:\n%s" % data_folder)
  print("\nFeaturized and splitted dataset:\n%s" % save_folder)

  if subset == "core":
    index_labels_file = os.path.join(data_folder, "INDEX_core_data.2013")
  elif subset == "refined":
    index_labels_file = os.path.join(data_folder, "INDEX_refined_data.2015")
  else:
    raise ValueError("Other subsets not supported")

  # Extract PDB codes and labels in a single pass over the index file.
  # Lines have format
  # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
  pdbs = []
  labels = []
  with open(index_labels_file, "r") as g:
    for line in g:
      # Skip comment lines and blank lines
      if line[0] == "#" or not line.strip():
        continue
      pdbs.append(line[:4])
      # The base-10 logarithm, -log kd/pk
      labels.append(float(line.split()[3]))
  labels = np.array(labels)

  protein_suffix = "pocket" if load_binding_pocket else "protein"
  protein_files = [
      os.path.join(data_folder, pdb, "%s_%s.pdb" % (pdb, protein_suffix))
      for pdb in pdbs
  ]
  ligand_files = [
      os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
  ]

  # Featurize Data
  if featurizer == "grid":
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=2.0,
        feature_types=[
            'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack', 'cation_pi',
            'charge'
        ],
        flatten=True)
  elif featurizer == "atomic" or featurizer == "atomic_conv":
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.
    frag1_num_atoms = 70  # for ligand atoms
    if load_binding_pocket:
      frag2_num_atoms = 1000
      complex_num_atoms = 1070
    else:
      frag2_num_atoms = 24000  # for protein atoms
      complex_num_atoms = 24070  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4
    # Use if/else: once `featurizer` is rebound to an object it must not be
    # compared against the option strings again.
    if featurizer == "atomic":
      featurizer = ComplexNeighborListFragmentAtomicCoordinates(
          frag1_num_atoms=frag1_num_atoms,
          frag2_num_atoms=frag2_num_atoms,
          complex_num_atoms=complex_num_atoms,
          max_num_neighbors=max_num_neighbors,
          neighbor_cutoff=neighbor_cutoff)
    else:
      featurizer = AtomicConvFeaturizer(
          labels=labels,
          frag1_num_atoms=frag1_num_atoms,
          frag2_num_atoms=frag2_num_atoms,
          complex_num_atoms=complex_num_atoms,
          neighbor_cutoff=neighbor_cutoff,
          max_num_neighbors=max_num_neighbors,
          batch_size=64)
  else:
    raise ValueError("Featurizer not supported")

  print("\nFeaturizing Complexes for \"%s\" ...\n" % data_folder)
  feat_t1 = time.time()
  features, failures = featurizer.featurize_complexes(ligand_files,
                                                      protein_files)
  feat_t2 = time.time()
  print("\nFeaturization finished, took %0.3f s." % (feat_t2 - feat_t1))

  # Delete labels and ids for failing elements
  labels = np.delete(labels, failures)
  labels = labels.reshape((len(labels), 1))
  ids = np.delete(pdbs, failures)

  print("\nConstruct dataset excluding failing featurization elements...")
  dataset = deepchem.data.DiskDataset.from_numpy(features, y=labels, ids=ids)

  # No transformations of data
  transformers = []

  # Split dataset
  print("\nSplit dataset...\n")
  if split is None:
    return pdbbind_tasks, (dataset, None, None), transformers

  # TODO(rbharath): This should be modified to contain a cluster split so
  # structures of the same protein aren't in both train/test
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset, seed=split_seed)

  all_dataset = (train, valid, test)
  print("\nSaving dataset to \"%s\" ..." % save_folder)
  deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                           transformers)
  return pdbbind_tasks, all_dataset, transformers