Example #1
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> np.ndarray:
        """
    Uses self to make predictions on the provided Dataset object.

    Parameters
    ----------
    dataset: Dataset
      Dataset to make predictions on
    transformers: List[Transformer]
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    np.ndarray
      A numpy array of predictions the model produces.
    """
        y_preds = []
        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
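A minimal usage sketch of this predict contract, assuming a dc.models.SklearnModel wrapping scikit-learn's LinearRegression together with a dc.trans.NormalizationTransformer; the data here is random and purely illustrative:

import numpy as np
import deepchem as dc
from sklearn.linear_model import LinearRegression

# Illustrative data: 20 samples, 10 features, one regression task.
X = np.random.rand(20, 10)
y = np.random.rand(20, 1)
dataset = dc.data.NumpyDataset(X, y)

# Normalize the labels; predict() passes its outputs back through this
# transformer, so predictions come out in the original label space.
transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
dataset = transformer.transform(dataset)

model = dc.models.SklearnModel(LinearRegression())
model.fit(dataset)
y_pred = model.predict(dataset, transformers=[transformer])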
Example #2
    def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
        """
    Fits a model on data in a Dataset object.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for

    Returns
    -------
    float
      The average loss over the most recent epoch.
    """
        for epoch in range(nb_epoch):
            logger.info("Starting epoch %s" % str(epoch + 1))
            losses = []
            for (X_batch, y_batch, w_batch,
                 ids_batch) in dataset.iterbatches():
                losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
            logger.info("Avg loss for epoch %d: %f" %
                        (epoch + 1, np.array(losses).mean()))
        return np.array(losses).mean()
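Because the return value is the mean loss over the most recent epoch, fit can be called with nb_epoch=1 in a loop to watch convergence. A sketch, assuming model is a Model subclass implementing fit_on_batch and dataset is a dc.data.Dataset:

# Sketch: track the per-epoch loss by running one epoch at a time.
epoch_losses = []
for _ in range(10):
    epoch_losses.append(model.fit(dataset, nb_epoch=1))
    # Hypothetical stopping rule: quit once the loss stops improving.
    if len(epoch_losses) > 1 and epoch_losses[-2] - epoch_losses[-1] < 1e-4:
        break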
Example #3
File: models.py  Project: tonibois/deepchem
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
        """
    Uses self to make predictions on the provided Dataset object.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make predictions on
    transformers: list of dc.trans.Transformer
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    OneOrMany[np.ndarray]
      A NumPy array if the model produces a single output, or a list of
      arrays if it produces multiple outputs.
    """
        y_preds = []
        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
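Both predict variants lean on undo_transforms to map each prediction batch back to the original label space. A sketch of the inversion it performs, assuming each transformer exposes an untransform method and a transform_y flag (as dc.trans.Transformer does):

def undo_transforms_sketch(y, transformers):
    # Transformations must be inverted in reverse order of application.
    for transformer in reversed(transformers):
        if transformer.transform_y:
            y = transformer.untransform(y)
    return y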
Example #4
File: models.py  Project: tonibois/deepchem
    def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
        """
    Fits a model on data in a Dataset object.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for

    Returns
    -------
    float
      the average loss over the most recent epoch
    """
        # TODO(rbharath/enf): We need a structured way to deal with potential GPU
        #                     memory overflows.
        for epoch in range(nb_epoch):
            logger.info("Starting epoch %s" % str(epoch + 1))
            losses = []
            for (X_batch, y_batch, w_batch,
                 ids_batch) in dataset.iterbatches():
                losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
            logger.info("Avg loss for epoch %d: %f" %
                        (epoch + 1, np.array(losses).mean()))
        return np.array(losses).mean()
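The epoch loop delegates all learning to fit_on_batch, so a subclass only has to consume one (X, y, w) batch and return its loss. A hypothetical minimal class illustrating that contract (MeanBaselineModel is not part of DeepChem):

import numpy as np

class MeanBaselineModel:
    """Hypothetical example: 'learns' the running mean of the labels."""

    def __init__(self):
        self.mean = 0.0
        self.count = 0

    def fit_on_batch(self, X, y, w):
        # Loss of the current estimate on this batch, before updating.
        loss = float(np.mean((y - self.mean) ** 2))
        # Update the running mean with this batch's labels.
        n = y.size
        self.mean = (self.mean * self.count + y.sum()) / (self.count + n)
        self.count += n
        return loss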
Example #5
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Create a generator that iterates batches for a dataset.

    Subclasses may override this method to customize how model inputs are
    generated from the data.

    Parameters
    ----------
    dataset: Dataset
      the data to iterate
    epochs: int
      the number of times to iterate over the full dataset
    mode: str
      allowed values are 'fit' (called during training), 'predict' (called
      during prediction), and 'uncertainty' (called during uncertainty
      prediction)
    deterministic: bool
      whether to iterate over the dataset in order, or randomly shuffle the
      data for each epoch
    pad_batches: bool
      whether to pad each batch up to this model's preferred batch size

    Returns
    -------
    a generator that iterates batches, each represented as a tuple of lists:
    ([inputs], [outputs], [weights])
    """
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        yield ([X_b], [y_b], [w_b])
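The ([inputs], [outputs], [weights]) tuples this generator yields are exactly what fit_generator consumes, so the two compose directly. A sketch, assuming model is a dc.models.KerasModel:

# Sketch: drive training from the generator instead of calling fit().
generator = model.default_generator(
    dataset, epochs=5, mode='fit', deterministic=False, pad_batches=True)
avg_loss = model.fit_generator(generator)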
Example #6
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Convert a dataset into the tensors needed for learning.

    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to convert
    epochs: int, optional (Default 1)
      Number of times to walk over `dataset`
    mode: str, optional (Default 'fit')
      Ignored in this implementation.
    deterministic: bool, optional (Default True)
      Whether the dataset should be walked in a deterministic fashion
    pad_batches: bool, optional (Default True)
      If true, each returned batch will have size `self.batch_size`.

    Returns
    -------
    Iterator which walks over the batches
    """

    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        if y_b is not None:
          if self.mode == 'classification':
            y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                -1, self.n_tasks, self.n_classes)
        inputs = self.compute_features_on_batch(X_b)
        yield (inputs, [y_b], [w_b])
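The one-hot step flattens the (batch, n_tasks) label matrix, encodes it, and reshapes it to (batch, n_tasks, n_classes). A small numeric sketch using deepchem.metrics.to_one_hot:

import numpy as np
from deepchem.metrics import to_one_hot

n_tasks, n_classes = 3, 2
# Two samples with binary labels for three tasks: shape (2, 3).
y_b = np.array([[0, 1, 1],
                [1, 0, 1]])

one_hot = to_one_hot(y_b.flatten(), n_classes).reshape(-1, n_tasks, n_classes)
print(one_hot.shape)   # (2, 3, 2)
print(one_hot[0, 1])   # [0. 1.] -- sample 0, task 1 had label 1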
Example #7
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Convert a dataset into the tensors needed for learning.

    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to convert
    epochs: int, optional (Default 1)
      Number of times to walk over `dataset`
    mode: str, optional (Default 'fit')
      Ignored in this implementation.
    deterministic: bool, optional (Default True)
      Whether the dataset should be walked in a deterministic fashion
    pad_batches: bool, optional (Default True)
      If true, each returned batch will have size `self.batch_size`.

    Returns
    -------
    Iterator which walks over the batches
    """

    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        if y_b is not None:
          if self.mode == 'classification':
            y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                -1, self.n_tasks, self.n_classes)
        atom_feat = []
        pair_feat = []
        atom_split = []
        atom_to_pair = []
        pair_split = []
        start = 0
        for im, mol in enumerate(X_b):
          n_atoms = mol.get_num_atoms()
          # molecule index for each atom in the batch
          atom_split.extend([im] * n_atoms)
          # global atom indices for every ordered (atom_i, atom_j) pair
          C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
          atom_to_pair.append(
              np.transpose(
                  np.array([C1.flatten() + start,
                            C0.flatten() + start])))
          # atom index each pair belongs to (for pooling pair features per atom)
          pair_split.extend(C1.flatten() + start)
          start = start + n_atoms

          # atom features
          atom_feat.append(mol.get_atom_features())
          # pair features
          pair_feat.append(
              np.reshape(mol.get_pair_features(),
                         (n_atoms * n_atoms, self.n_pair_feat[0])))

        inputs = [
            np.concatenate(atom_feat, axis=0),
            np.concatenate(pair_feat, axis=0),
            np.array(pair_split),
            np.array(atom_split),
            np.concatenate(atom_to_pair, axis=0)
        ]
        yield (inputs, [y_b], [w_b])
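The meshgrid bookkeeping is easiest to see on a toy batch. A numeric sketch with two hypothetical molecules of 2 and 3 atoms, showing how atom_to_pair enumerates every ordered atom pair with a running offset into the batch:

import numpy as np

start = 0
atom_split, atom_to_pair, pair_split = [], [], []
for im, n_atoms in enumerate([2, 3]):  # hypothetical molecule sizes
    atom_split.extend([im] * n_atoms)
    C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
    # Rows are ordered (atom_i, atom_j) pairs, offset by atoms seen so far.
    atom_to_pair.append(
        np.transpose(np.array([C1.flatten() + start, C0.flatten() + start])))
    pair_split.extend(C1.flatten() + start)
    start += n_atoms

atom_to_pair = np.concatenate(atom_to_pair, axis=0)
print(atom_to_pair[:4].tolist())  # [[0, 0], [0, 1], [1, 0], [1, 1]]
print(len(atom_to_pair))          # 13 = 2*2 + 3*3 ordered pairs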