Example #1
    def evaluate_error_class2(self, dataset, transformers=[], batch_size=50):
        """
    Evaluate the error in energy and gradient components, forcebalance-style.

    TODO(rbharath): Should be a PhysicalModel subclass method. Also, need to
    find a better name for this method (class2 doesn't tell us anything about
    the semantics of this method).
    """
        y_preds = []
        y_train = []
        grads = []
        for (X_batch, y_batch, w_batch,
             ids_batch) in dataset.iterbatches(batch_size):

            # untransformed E is needed for undo_grad_transform
            energy_batch = self.predict_on_batch(X_batch)
            grad_batch = self.predict_grad_on_batch(X_batch)
            grad_batch = undo_grad_transforms(grad_batch, energy_batch,
                                              transformers)
            grads.append(grad_batch)
            y_pred_batch = np.reshape(energy_batch, y_batch.shape)

            # y_pred_batch gives us the pred E and pred multitask trained gradE
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)

            # undo transforms on y_batch should know how to handle E and gradE separately
            y_batch = undo_transforms(y_batch, transformers)
            y_train.append(y_batch)

        y_pred = np.vstack(y_preds)
        y = np.vstack(y_train)
        grad = np.vstack(grads)

        n_samples, n_tasks = len(dataset), self.get_num_tasks()
        n_atoms = int((n_tasks - 1) / 3)

        y_pred = np.reshape(y_pred, (n_samples, n_tasks))
        y = np.reshape(y, (n_samples, n_tasks))
        grad_train = y[:, 1:]

        energy_error = y[:, 0] - y_pred[:, 0]
        energy_error = np.sqrt(np.mean(
            energy_error * energy_error)) * 2625.5002

        grad = np.reshape(grad, (n_samples, n_atoms, 3))
        grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))

        grad_error = grad - grad_train
        grad_error = np.sqrt(np.mean(grad_error * grad_error)) * 4961.47596096

        print("Energy error (RMSD): %f kJ/mol" % energy_error)
        print("Grad error (RMSD): %f kJ/mol/A" % grad_error)

        return energy_error, grad_error
Example #2
  def evaluate_error_class2(self, dataset, transformers=[], batch_size=50):
    """
    Evaluate the error in energy and gradient components, forcebalance-style.

    TODO(rbharath): Should be a PhysicalModel subclass method. Also, need to
    find a better name for this method (class2 doesn't tell us anything about
    the semantics of this method).
    """
    y_preds = []
    y_train = []
    grads = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      # untransformed E is needed for undo_grad_transform
      energy_batch = self.predict_on_batch(X_batch)
      grad_batch = self.predict_grad_on_batch(X_batch)
      grad_batch = undo_grad_transforms(grad_batch, energy_batch, transformers)      
      grads.append(grad_batch)
      y_pred_batch = np.reshape(energy_batch, y_batch.shape)

      # y_pred_batch gives us the pred E and pred multitask trained gradE
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)

      # undo transforms on y_batch should know how to handle E and gradE separately
      y_batch = undo_transforms(y_batch, transformers)
      y_train.append(y_batch)

    y_pred = np.vstack(y_preds)
    y = np.vstack(y_train)
    grad = np.vstack(grads)

    n_samples, n_tasks = len(dataset), self.get_num_tasks()
    n_atoms = int((n_tasks-1)/3)

    y_pred = np.reshape(y_pred, (n_samples, n_tasks)) 
    y = np.reshape(y, (n_samples, n_tasks))
    grad_train = y[:,1:]

    energy_error = y[:,0]-y_pred[:,0]
    energy_error = np.sqrt(np.mean(energy_error*energy_error))*2625.5002
 
    grad = np.reshape(grad, (n_samples, n_atoms, 3))
    grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))    
  
    grad_error = grad-grad_train
    grad_error = np.sqrt(np.mean(grad_error*grad_error))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
    
    return energy_error, grad_error
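
The two conversion constants above come from standard atomic units: one hartree is about 2625.50 kJ/mol, and dividing by the Bohr radius in angstrom converts hartree/bohr to kJ/mol/Å. A quick check of the gradient factor:

import numpy as np

# Sanity check of the unit-conversion factors used in evaluate_error_class2.
HARTREE_TO_KJ_PER_MOL = 2625.5002      # hartree -> kJ/mol, as used above
BOHR_TO_ANGSTROM = 0.52917721067       # Bohr radius in angstrom

grad_factor = HARTREE_TO_KJ_PER_MOL / BOHR_TO_ANGSTROM
print(grad_factor)  # ~4961.476, the kJ/mol/A factor used above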
Example #3
  def compute_model_performance(self, metrics, per_task_metrics=False):
    """
    Computes statistics of model on test data.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
    self.model.build()
    y = []
    w = []

    def generator_closure():
      for feed_dict in self.generator:
        y.append(feed_dict[self.label_keys[0]])
        if len(self.weights) > 0:
          w.append(feed_dict[self.weights[0]])
        yield feed_dict

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    y_pred = self.model.predict_on_generator(generator_closure())
    y = np.concatenate(y, axis=0)
    multitask_scores = {}
    all_task_scores = {}

    y = undo_transforms(y, self.output_transformers)
    y_pred = undo_transforms(y_pred, self.output_transformers)
    if len(w) != 0:
      w = np.array(w)
      w = np.reshape(w, newshape=y.shape)

    # Compute multitask metrics
    for metric in metrics:
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True, n_classes=self.n_classes)
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False, n_classes=self.n_classes)

    if not per_task_metrics:
      return multitask_scores
    else:
      return multitask_scores, all_task_scores
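
A minimal usage sketch of compute_model_performance. The Metric construction is standard DeepChem; the trained evaluator object is hypothetical:

import numpy as np
import deepchem as dc

# Standard DeepChem metric; np.mean averages the score across tasks.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

# With a trained evaluator in hand (hypothetical here), the two call forms are:
#   scores = evaluator.compute_model_performance([metric])
#   scores, per_task = evaluator.compute_model_performance([metric], per_task_metrics=True)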
Example #4
    def evaluate_error(self, dataset, transformers=[], batch_size=50):
        """
    Evaluate the error in energy and gradient components, forcebalance-style.

    TODO(rbharath): This looks like it should be a subclass method for a
    PhysicalMethod class. forcebalance style errors aren't meaningful for most
    chem-informatic datasets.
    """
        y_preds = []
        y_train = []
        for (X_batch, y_batch, w_batch,
             ids_batch) in dataset.iterbatches(batch_size):

            y_pred_batch = self.predict_on_batch(X_batch)
            y_pred_batch = np.reshape(y_pred_batch, y_batch.shape)

            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)

            y_batch = undo_transforms(y_batch, transformers)
            y_train.append(y_batch)

        y_pred = np.vstack(y_preds)
        y = np.vstack(y_train)

        n_samples, n_tasks = len(dataset), self.get_num_tasks()
        n_atoms = int((n_tasks - 1) / 3)

        y_pred = np.reshape(y_pred, (n_samples, n_tasks))
        y = np.reshape(y, (n_samples, n_tasks))
        grad = y_pred[:, 1:]
        grad_train = y[:, 1:]

        energy_error = y[:, 0] - y_pred[:, 0]
        # convert Hartree to kJ/mol
        energy_error = np.sqrt(np.mean(
            energy_error * energy_error)) * 2625.5002

        grad = np.reshape(grad, (n_samples, n_atoms, 3))
        grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))

        grad_error = grad - grad_train
        # convert Hartree/bohr to kJ/mol/Angstrom
        grad_error = np.sqrt(np.mean(grad_error * grad_error)) * 4961.47596096

        print("Energy error (RMSD): %f kJ/mol" % energy_error)
        print("Grad error (RMSD): %f kJ/mol/A" % grad_error)

        return energy_error, grad_error
Example #5
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
        """
    Uses self to make predictions on provided Dataset object.


    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    a NumPy array if the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
        y_preds = []
        n_tasks = self.get_num_tasks()
        ind = 0

        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
Example #6
 def predict_proba_on_generator(self, generator, transformers=[]):
   """
   Returns:
     y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
   """
   if not self.built:
     self.build()
   with self._get_tf("Graph").as_default():
     with tf.Session() as sess:
       saver = tf.train.Saver()
       self._initialize_weights(sess, saver)
       out_tensors = [x.out_tensor for x in self.outputs]
       results = []
       for feed_dict in generator:
         # Extract number of unique samples in the batch from w_b
         n_valid_samples = len(np.nonzero(feed_dict[self.weights][:, 0])[0])
         feed_dict = {
             self.layers[k.name].out_tensor: v
             for k, v in six.iteritems(feed_dict)
         }
         feed_dict[self._training_placeholder] = 0.0
         result = np.array(sess.run(out_tensors, feed_dict=feed_dict))
         if len(result.shape) == 3:
           result = np.transpose(result, axes=[1, 0, 2])
         result = undo_transforms(result, transformers)
         # Only fetch the first set of unique samples
         results.append(result[:n_valid_samples])
       return np.concatenate(results, axis=0)
Example #7
    def predict_proba(self, dataset, transformers=[], batch_size=None):
        """
    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
        if not self.built:
            self.build()
        if batch_size is None:
            batch_size = self.batch_size
        with self._get_tf("Graph").as_default():
            saver = tf.train.Saver()
            with tf.Session() as sess:
                saver.restore(sess, self.last_checkpoint)
                y_preds = []
                n_tasks = self.get_num_tasks()
                for (X_batch, y_batch, w_batch,
                     ids_batch) in dataset.iterbatches(batch_size,
                                                       deterministic=True):
                    n_samples = len(X_batch)
                    y_pred_batch = self.predict_proba_on_batch(X_batch,
                                                               sess=sess)
                    y_pred_batch = y_pred_batch[:n_samples]
                    y_pred_batch = undo_transforms(y_pred_batch, transformers)
                    y_preds.append(y_pred_batch)
                y_pred = np.vstack(y_preds)
                # iterbatches pads the last batch with zero-weight examples.
                # Remove the padded examples.
                n_samples = len(dataset)
                y_pred = y_pred[:n_samples]
                return y_pred
Example #8
  def predict(self, dataset, transformers=[]):
    """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    y_preds = []
    n_tasks = self.get_num_tasks()
    ind = 0

    for (X_batch, _, _, ids_batch) in dataset.iterbatches(
        self.batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_on_batch(X_batch)
      # Discard any padded predictions
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks))
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
    y_pred = np.vstack(y_preds)

    # iterbatches pads the last batch with zero-weight examples.
    # Remove the padded examples.
    n_samples = len(dataset)
    y_pred = np.reshape(y_pred, (n_samples, n_tasks))
    # Special case to handle singletasks.
    if n_tasks == 1:
      y_pred = np.reshape(y_pred, (n_samples,))
    return y_pred
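
Because iterbatches pads the final batch, the per-batch slice y_pred_batch[:n_samples] and the final trim to len(dataset) are what keep predictions aligned with the dataset. A self-contained numpy sketch of the same trim pattern:

import numpy as np

n_samples, n_tasks = 5, 3
# Two full batches of size 2 plus one padded batch (1 real row + 1 pad row).
batches = [np.ones((2, n_tasks)) * i for i in range(3)]
y_pred = np.vstack(batches)        # shape (6, 3): includes one padded row
y_pred = y_pred[:n_samples]        # drop the padding -> shape (5, 3)
if n_tasks == 1:                   # singletask special case, as above
    y_pred = np.reshape(y_pred, (n_samples,))
assert y_pred.shape == (n_samples, n_tasks)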
Example #9
  def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                                threshold=None):
    """
    Computes statistics of model on test data and saves results to csv.
    """
    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
      y_pred_print = self.model.predict(
          self.dataset, self.output_transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.output_transformers)
      y_pred_print = y_pred
    multitask_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbosity)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)
    
    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbosity)
      self.output_statistics(multitask_scores, stats_out)
  
    return multitask_scores
Example #10
  def bayesian_predict(self,
                       dataset,
                       transformers=[],
                       n_passes=4,
                       untransform=False):
    """Generates predictions and confidences on a dataset object
     https://arxiv.org/pdf/1506.02142.pdf

    # Returns:
      mu: numpy ndarray of shape (n_samples, n_tasks)
      sigma: numpy ndarray of shape (n_samples, n_tasks)
    """
    X = dataset.X
    max_index = X.shape[0] - 1
    num_batches = (max_index // self.batch_size) + 1

    mus = []
    sigmas = []
    for i in range(num_batches):
      start = i * self.batch_size
      end = min((i + 1) * self.batch_size, max_index + 1)
      batch = X[start:end]
      mu, sigma = self.bayesian_predict_on_batch(
          batch, transformers=[], n_passes=n_passes)
      mus.append(mu)
      sigmas.append(sigma)
    mu = np.concatenate(mus, axis=0)
    sigma = np.concatenate(sigmas, axis=0) + 0.55

    if untransform:
      mu = undo_transforms(mu, transformers)
      for i in range(sigma.shape[1]):
        sigma[:, i] = sigma[:, i] * transformers[0].y_stds[i]

    return mu[:max_index + 1], sigma[:max_index + 1]
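
bayesian_predict follows the Monte Carlo dropout recipe of Gal & Ghahramani (arXiv:1506.02142): run several stochastic forward passes, read the mean as the prediction and the spread as the confidence. A self-contained numpy sketch of the idea, where the noisy stochastic_predict stands in for a network with dropout left active:

import numpy as np

rng = np.random.default_rng(0)

def stochastic_predict(X):
    # Stand-in for a forward pass with dropout active at inference time.
    return X.sum(axis=1, keepdims=True) + rng.normal(scale=0.1, size=(len(X), 1))

X = rng.normal(size=(8, 4))
passes = np.stack([stochastic_predict(X) for _ in range(4)])  # n_passes=4, as above
mu = passes.mean(axis=0)     # predictive mean, shape (n_samples, n_tasks)
sigma = passes.std(axis=0)   # predictive spread, shape (n_samples, n_tasks)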
Example #11
  def predict_proba(self, dataset, transformers=[], n_classes=2):
    """
    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    y_preds = []
    n_tasks = self.get_num_tasks()

    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        self.batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_proba_on_batch(X_batch)
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks, n_classes))
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
    y_pred = np.vstack(y_preds)
    # iterbatches pads the last batch with zero-weight examples.
    # Remove the padded examples.
    n_samples = len(dataset)
    y_pred = y_pred[:n_samples]
    y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
    return y_pred
Example #12
  def predict_on_smiles(self, smiles, transformers=[], untransform=False):
    """Generates predictions on a numpy array of smile strings

            # Returns:
              y_: numpy ndarray of shape (n_samples, n_tasks)
            """
    max_index = len(smiles) - 1
    n_tasks = len(self.outputs)
    num_batches = (max_index // self.batch_size) + 1
    featurizer = ConvMolFeaturizer()

    y_ = []
    for i in range(num_batches):
      start = i * self.batch_size
      end = min((i + 1) * self.batch_size, max_index + 1)
      smiles_batch = smiles[start:end]
      y_.append(
          self.predict_on_smiles_batch(smiles_batch, featurizer, transformers))
    y_ = np.concatenate(y_, axis=0)[:max_index + 1]
    y_ = y_.reshape(-1, n_tasks)

    if untransform:
      y_ = undo_transforms(y_, transformers)

    return y_
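
The batching arithmetic above (num_batches = (max_index // batch_size) + 1 with an end clamp) covers every index exactly once. A quick self-contained check:

batch_size = 3
smiles = ['C', 'CC', 'CCO', 'c1ccccc1', 'CCN']    # 5 hypothetical entries
max_index = len(smiles) - 1
num_batches = (max_index // batch_size) + 1        # 2 batches: [0:3] and [3:5]

covered = []
for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, max_index + 1)
    covered.extend(range(start, end))
assert covered == list(range(len(smiles)))         # every index exactly once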
Example #13
  def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                                threshold=None):
    """
    Computes statistics of model on test data and saves results to csv.
    """
    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
      y_pred_print = self.model.predict(
          self.dataset, self.output_transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.output_transformers)
      y_pred_print = y_pred
    multitask_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbose)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)
    
    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbose)
      self.output_statistics(multitask_scores, stats_out)
  
    return multitask_scores
Example #14
    def predict(self, dataset, transformers=[], batch_size=None):
        """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
        y_preds = []
        n_tasks = self.get_num_tasks()
        ind = 0

        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(batch_size, deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks))
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.vstack(y_preds)

        # iterbatches pads the last batch with zero-weight examples.
        # Remove the padded examples.
        n_samples = len(dataset)
        y_pred = np.reshape(y_pred, (n_samples, n_tasks))
        # Special case to handle singletasks.
        if n_tasks == 1:
            y_pred = np.reshape(y_pred, (n_samples, ))
        return y_pred
Example #15
  def predict_on_smiles(self, smiles, transformers=[], untransform=False):
    """Generates predictions on a numpy array of smile strings

    # Returns:
      y_: numpy ndarray of shape (n_samples, n_tasks)
    """
    max_index = len(smiles) - 1
    n_tasks = len(self.outputs)
    num_batches = (max_index // self.batch_size) + 1
    featurizer = ConvMolFeaturizer()

    y_ = []
    for i in range(num_batches):
      start = i * self.batch_size
      end = min((i + 1) * self.batch_size, max_index + 1)
      smiles_batch = smiles[start:end]
      y_.append(
          self.predict_on_smiles_batch(smiles_batch, featurizer, transformers))
    y_ = np.concatenate(y_, axis=0)[:max_index + 1]
    y_ = y_.reshape(-1, n_tasks)

    if untransform:
      y_ = undo_transforms(y_, transformers)

    return y_
Example #16
    def predict(self, dataset, transformers=[], batch_size=None):
        """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
        if not self.built:
            self.build()
        with self._get_tf("Graph").as_default():
            saver = tf.train.Saver()
            with tf.Session() as sess:
                saver.restore(sess, self.last_checkpoint)
                y_preds = []
                n_tasks = self.get_num_tasks()
                for (X_batch, y_b, w_b,
                     ids_batch) in dataset.iterbatches(batch_size,
                                                       deterministic=True):
                    y_pred_batch = self.predict_on_batch(X_batch, sess=sess)
                    y_pred_batch = undo_transforms(y_pred_batch, transformers)
                    y_preds.append(y_pred_batch)
                y_pred = np.vstack(y_preds)

                # iterbatches pads the last batch with zero-weight examples.
                # Remove the padded examples.
                n_samples = len(dataset)
                y_pred = y_pred[:n_samples]
                y_pred = np.reshape(y_pred, (n_samples, n_tasks))
                return y_pred
Example #17
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> np.ndarray:
        """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: Dataset
      Dataset to make prediction on
    transformers: List[Transformer]
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    np.ndarray
      A numpy array of predictions the model produces.
    """
        y_preds = []
        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
Example #18
  def predict_proba(self, dataset, transformers=[], batch_size=None):
    """
    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    if not self.built:
      self.build()
    if batch_size is None:
      batch_size = self.batch_size
    with self._get_tf("Graph").as_default():
      saver = tf.train.Saver()
      with tf.Session() as sess:
        saver.restore(sess, self.last_checkpoint)
        y_preds = []
        n_tasks = self.get_num_tasks()
        for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
            batch_size, deterministic=True):
          n_samples = len(X_batch)
          y_pred_batch = self.predict_proba_on_batch(X_batch, sess=sess)
          y_pred_batch = y_pred_batch[:n_samples]
          y_pred_batch = undo_transforms(y_pred_batch, transformers)
          y_preds.append(y_pred_batch)
        y_pred = np.vstack(y_preds)
        # iterbatches pads the last batch with zero-weight examples.
        # Remove the padded examples.
        n_samples = len(dataset)
        y_pred = y_pred[:n_samples]
        return y_pred
Example #19
    def predict_proba(self,
                      dataset,
                      transformers=[],
                      batch_size=None,
                      n_classes=2):
        """
    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
        y_preds = []
        n_tasks = self.get_num_tasks()
        for (X_batch, y_batch, w_batch,
             ids_batch) in dataset.iterbatches(batch_size, deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_proba_on_batch(X_batch)
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = np.reshape(y_pred_batch,
                                      (n_samples, n_tasks, n_classes))
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.vstack(y_preds)
        # iterbatches pads the last batch with zero-weight examples.
        # Remove the padded examples.
        n_samples = len(dataset)
        y_pred = y_pred[:n_samples]
        y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
        return y_pred
Example #20
  def evaluate_error(self, dataset, transformers=[], batch_size=50):
    """
    Evaluate the error in energy and gradient components, forcebalance-style.

    TODO(rbharath): This looks like it should be a subclass method for a
    PhysicalMethod class. forcebalance style errors aren't meaningful for most
    chem-informatic datasets.
    """
    y_preds = []
    y_train = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      y_pred_batch = self.predict_on_batch(X_batch)
      y_pred_batch = np.reshape(y_pred_batch, y_batch.shape)

      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)

      y_batch = undo_transforms(y_batch, transformers)
      y_train.append(y_batch)

    y_pred = np.vstack(y_preds)
    y = np.vstack(y_train)

    n_samples, n_tasks = len(dataset), self.get_num_tasks()
    n_atoms = int((n_tasks-1)/3)

    y_pred = np.reshape(y_pred, (n_samples, n_tasks)) 
    y = np.reshape(y, (n_samples, n_tasks))
    grad = y_pred[:,1:]
    grad_train = y[:,1:]

    energy_error = y[:,0]-y_pred[:,0]
    # convert Hartree to kJ/mol
    energy_error = np.sqrt(np.mean(energy_error*energy_error))*2625.5002
 
    grad = np.reshape(grad, (n_samples, n_atoms, 3))
    grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))    
  
    grad_error = grad-grad_train
    # convert Hartree/bohr to kJ/mol/Angstrom
    grad_error = np.sqrt(np.mean(grad_error*grad_error))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
    
    return energy_error, grad_error
Example #21
    def test_fd_grad(self, dataset, transformers=[], batch_size=50):
        """
    Uses self to calculate finite difference gradient on provided Dataset object.
    Currently only useful if your task is energy and self contains predict_grad_on_batch.

    TODO(rbharath): This shouldn't be a method of the Model class. Perhaps a
    method of a PhysicalModel subclass. Leaving it in for the time being while
    refactoring continues.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
        y_preds = []
        for (X_batch, y_batch, w_batch,
             ids_batch) in dataset.iterbatches(batch_size):

            for xb in X_batch:

                num_atoms = xb.shape[0]
                coords = 3

                h = 0.001
                fd_batch = []
                # Filling a new batch with displaced geometries
                for i in range(num_atoms):
                    for j in range(coords):
                        displace = np.zeros((num_atoms, coords))
                        displace[i][j] += h / 2
                        fd_batch.append(xb + displace)
                        fd_batch.append(xb - displace)

                fd_batch = np.asarray(fd_batch)
                # Predict energy on displaced geometry batch
                y_pred_batch = self.predict_on_batch(fd_batch)
                energy = y_pred_batch[:, 0]
                y_pred_batch = undo_transforms(y_pred_batch, transformers)
                y_pred_batch = y_pred_batch[:, 0]
                y_pred_batch = np.reshape(y_pred_batch, (3 * num_atoms, 2))

                fd_grads = []
                # Calculate numerical gradient by centered finite difference
                for x in y_pred_batch:
                    fd_grads.append((x[0] - x[1]) / h)

                fd_grads = np.asarray(fd_grads)
                fd_grads = np.reshape(fd_grads, (num_atoms, coords))

                xb = np.asarray([xb])
                y_pred_batch = self.predict_grad_on_batch(xb)
                y_pred_batch = undo_grad_transforms(energy, y_pred_batch,
                                                    transformers)
                # Calculate error between symbolic gradient and numerical gradient
                y_pred_batch = y_pred_batch - fd_grads
                #print(y_pred_batch)
                y_preds.append(y_pred_batch)

        y_pred = np.vstack(y_preds)

        return y_pred
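
The displacement scheme above is a centered finite difference: each coordinate is displaced by ±h/2, and the gradient component is (E(x + h/2) - E(x - h/2)) / h. A self-contained numpy sketch against an analytic gradient, with a toy potential standing in for the model:

import numpy as np

def energy(coords):
    # Toy potential: E = sum of squared coordinates, so grad E = 2 * coords.
    return np.sum(coords ** 2)

coords = np.arange(6, dtype=float).reshape(2, 3)   # 2 atoms x 3 coordinates
h = 0.001
fd_grad = np.zeros_like(coords)
for i in range(coords.shape[0]):
    for j in range(coords.shape[1]):
        displace = np.zeros_like(coords)
        displace[i, j] = h / 2
        fd_grad[i, j] = (energy(coords + displace) - energy(coords - displace)) / h

assert np.allclose(fd_grad, 2 * coords, atol=1e-6)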
Example #22
  def compute_model_performance(self,
                                metrics,
                                csv_out=None,
                                stats_out=None,
                                per_task_metrics=False):
    """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
      y_pred_print = self.model.predict(self.dataset,
                                        self.output_transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.output_transformers)
      y_pred_print = y_pred
    multitask_scores = {}
    all_task_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbose)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True)
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False)

    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbose)
      self.output_statistics(multitask_scores, stats_out)

    if not per_task_metrics:
      return multitask_scores
    else:
      return multitask_scores, all_task_scores
Example #24
  def test_fd_grad(self, dataset, transformers=[], batch_size=50):
    """
    Uses self to calculate finite difference gradient on provided Dataset object.
    Currently only useful if your task is energy and self contains predict_grad_on_batch.

    TODO(rbharath): This shouldn't be a method of the Model class. Perhaps a
    method of a PhysicalModel subclass. Leaving it in for the time being while
    refactoring continues.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    y_preds = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      for xb in X_batch:

        num_atoms = xb.shape[0]
        coords = 3

        h = 0.001
        fd_batch = []
        # Filling a new batch with displaced geometries
        for i in range(num_atoms):
          for j in range(coords):
            displace = np.zeros((num_atoms, coords))
            displace[i][j] += h/2
            fd_batch.append(xb+displace)
            fd_batch.append(xb-displace)

        fd_batch = np.asarray(fd_batch)
        # Predict energy on displaced geometry batch
        y_pred_batch = self.predict_on_batch(fd_batch)
        energy = y_pred_batch[:,0]
        y_pred_batch = undo_transforms(y_pred_batch, transformers)
        y_pred_batch = y_pred_batch[:,0]
        y_pred_batch = np.reshape(y_pred_batch, (3*num_atoms, 2))

        fd_grads = []
        # Calculate numerical gradient by centered finite difference
        for x in y_pred_batch:
          fd_grads.append((x[0]-x[1])/h)

        fd_grads = np.asarray(fd_grads)
        fd_grads = np.reshape(fd_grads, (num_atoms, coords))

        xb = np.asarray([xb])
        y_pred_batch = self.predict_grad_on_batch(xb)
        y_pred_batch = undo_grad_transforms(energy, y_pred_batch, transformers)
        # Calculate error between symbolic gradient and numerical gradient
        y_pred_batch = y_pred_batch-fd_grads
        #print(y_pred_batch)
        y_preds.append(y_pred_batch)

    y_pred = np.vstack(y_preds)
  
    return y_pred
Example #25
  def predict_on_generator(self, generator, transformers=[], outputs=None):
    out = super(TextCNNModel, self).predict_on_generator(
        generator, transformers=[], outputs=outputs)
    if outputs is None:
      outputs = self.outputs
    if len(outputs) > 1:
      out = np.stack(out, axis=1)

    out = undo_transforms(out, transformers)
    return out
Example #27
    def predict_on_generator(self, generator, transformers=[], outputs=None):
        """
    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
        if not self.built:
            self.build()
        if outputs is None:
            outputs = self.outputs
        elif not isinstance(outputs, collections.Sequence):
            outputs = [outputs]
        with self._get_tf("Graph").as_default():
            with tf.Session() as sess:
                saver = tf.train.Saver()
                self._initialize_weights(sess, saver)
                out_tensors = [x.out_tensor for x in self.outputs]
                # Gather results for each output
                results = [[] for out in out_tensors]
                for feed_dict in generator:
                    feed_dict = {
                        self.layers[k.name].out_tensor: v
                        for k, v in six.iteritems(feed_dict)
                    }
                    feed_dict[self._training_placeholder] = 0.0
                    feed_results = sess.run(out_tensors, feed_dict=feed_dict)
                    if len(feed_results) > 1:
                        if len(transformers):
                            raise ValueError(
                                "Does not support transformations "
                                "for multiple outputs.")
                    elif len(feed_results) == 1:
                        result = undo_transforms(feed_results[0], transformers)
                        feed_results = [result]
                    for ind, result in enumerate(feed_results):
                        results[ind].append(result)

                final_results = []
                for result_list in results:
                    final_results.append(np.concatenate(result_list, axis=0))
                # If only one output, just return array
                if len(final_results) == 1:
                    return final_results[0]
                else:
                    return final_results
Example #28
 def predict(self, dataset, transformers=[], outputs=None):
   if outputs is None:
     outputs = self.outputs
   if transformers != [] and not isinstance(outputs, collections.Sequence):
     raise ValueError(
         "DTNN does not support single tensor output with transformers")
   retval = super(DTNNTensorGraph, self).predict(dataset, outputs=outputs)
   if not isinstance(outputs, collections.Sequence):
     return retval
   retval = np.concatenate(retval, axis=-1)
   return undo_transforms(retval, transformers)
Example #29
    def predict(self, dataset, transformers=[]):
        """
    Prediction for multitask models.
    """
        n_tasks = len(self.tasks)
        n_samples = len(dataset)
        y_pred = np.zeros((n_samples, n_tasks))
        for ind, task in enumerate(self.tasks):
            task_model = self.model_builder(self.task_model_dirs[task])
            task_model.reload()

            y_pred[:, ind] = task_model.predict(dataset, [])
        y_pred = undo_transforms(y_pred, transformers)
        return y_pred
Example #30
  def predict(self, dataset, transformers=[]):
    """
    Prediction for multitask models. 
    """
    n_tasks = len(self.tasks)
    n_samples = len(dataset) 
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = task_model.predict(dataset, [])
    y_pred = undo_transforms(y_pred, transformers)
    return y_pred
Example #31
  def predict(self, dataset, transformers=[]):
    """
    Prediction for multitask models.
    """
    n_tasks = len(self.tasks)
    n_samples = len(dataset)
    y_preds = []
    for ind, task in enumerate(self.tasks):
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_preds.append(task_model.predict(dataset, []))
    y_pred = np.stack(y_preds, axis=1)
    y_pred = undo_transforms(y_pred, transformers)
    return y_pred
Example #32
    def predict(self, dataset, transformers=[]):
        """
    Prediction for multitask models.
    """
        n_tasks = len(self.tasks)
        n_samples = len(dataset)
        y_preds = []
        for ind, task in enumerate(self.tasks):
            task_model = self.model_builder(self.task_model_dirs[task])
            task_model.reload()

            y_preds.append(task_model.predict(dataset, []))
        y_pred = np.stack(y_preds, axis=1)
        y_pred = undo_transforms(y_pred, transformers)
        return y_pred
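
Examples #29-#32 are two variants of the same singletask-to-multitask assembly: fill a preallocated (n_samples, n_tasks) array column by column, or collect per-task vectors and stack them on axis 1. A quick check that the two agree for 1-D per-task outputs:

import numpy as np

n_samples, n_tasks = 4, 3
per_task = [np.full(n_samples, float(ind)) for ind in range(n_tasks)]

# Variant 1: preallocate and fill columns (Examples #29-30).
y_a = np.zeros((n_samples, n_tasks))
for ind, col in enumerate(per_task):
    y_a[:, ind] = col

# Variant 2: stack along a new task axis (Examples #31-32).
y_b = np.stack(per_task, axis=1)

assert np.array_equal(y_a, y_b)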
Example #33
 def predict_proba(self, dataset, transformers=[], n_classes=2):
   y_preds = []
   n_tasks = self.n_tasks
   for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
       self.batch_size, deterministic=True):
     n_samples = len(X_batch)
     y_pred_batch = self.predict_proba_on_batch(X_batch)
     assert y_pred_batch.shape == (n_samples, n_tasks, n_classes)
     y_pred_batch = undo_transforms(y_pred_batch, transformers)
     y_preds.append(y_pred_batch)
   y_pred = np.vstack(y_preds)
   # iterbatches pads the last batch with zero-weight examples.
   # Remove the padded examples.
   n_samples = len(dataset)
   y_pred = y_pred[:n_samples]
   y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
   return y_pred
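
Across these examples, undo_transforms maps predictions back to the original label space by inverting each transformer. A minimal self-contained sketch of the idea for z-score normalization; the ZScore class and the helper below are illustrative stand-ins, not the DeepChem API:

import numpy as np

class ZScore:
    # Illustrative stand-in for a y-normalization transformer.
    def __init__(self, y):
        self.mean, self.std = y.mean(axis=0), y.std(axis=0)
    def transform(self, y):
        return (y - self.mean) / self.std
    def untransform(self, y):
        return y * self.std + self.mean

def undo_transforms_sketch(y, transformers):
    # Apply inverses in reverse of the order the transforms were applied in.
    for t in reversed(transformers):
        y = t.untransform(y)
    return y

y = np.array([[1.0], [2.0], [3.0]])
t = ZScore(y)
assert np.allclose(undo_transforms_sketch(t.transform(y), [t]), y)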
Example #34
 def predict_proba(self, dataset, transformers=[], n_classes=2):
     y_preds = []
     n_tasks = self.n_tasks
     for (X_batch, y_batch, w_batch,
          ids_batch) in dataset.iterbatches(self.batch_size,
                                            deterministic=True):
         n_samples = len(X_batch)
         y_pred_batch = self.predict_proba_on_batch(X_batch)
         assert y_pred_batch.shape == (n_samples, n_tasks, n_classes)
         y_pred_batch = undo_transforms(y_pred_batch, transformers)
         y_preds.append(y_pred_batch)
     y_pred = np.vstack(y_preds)
     # iterbatches pads the last batch with zero-weight examples.
     # Remove the padded examples.
     n_samples = len(dataset)
     y_pred = y_pred[:n_samples]
     y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
     return y_pred
Example #35
 def predict_proba_on_generator(self, generator, transformers=[]):
   if not self.built:
     self.build()
   with self._get_tf("Graph").as_default():
     out_tensors = [x.out_tensor for x in self.outputs]
     results = []
     for feed_dict in generator:
       feed_dict = {
           self.layers[k.name].out_tensor: v
           for k, v in six.iteritems(feed_dict)
       }
        feed_dict[self._training_placeholder] = 1.0  # training flag left at 1.0: dropout stays active
       result = np.array(self.session.run(out_tensors, feed_dict=feed_dict))
       if len(result.shape) == 3:
         result = np.transpose(result, axes=[1, 0, 2])
       if len(transformers) > 0:
         result = undo_transforms(result, transformers)
       results.append(result)
     return np.concatenate(results, axis=0)
Example #36
    def predict(self, dataset, transformers=[], batch_size=None):
        """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
        y_preds = []
        n_tasks = self.get_num_tasks()
        ind = 0

        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(batch_size, deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
Example #37
 def predict_proba_on_generator(self, generator, transformers=[]):
   if not self.built:
     self.build()
   with self._get_tf("Graph").as_default():
     with tf.Session() as sess:
       saver = tf.train.Saver()
       saver.restore(sess, self.last_checkpoint)
       out_tensors = [x.out_tensor for x in self.outputs]
       results = []
       for feed_dict in generator:
         feed_dict = {
             self.layers[k.name].out_tensor: v
             for k, v in six.iteritems(feed_dict)
         }
         result = np.array(sess.run(out_tensors, feed_dict=feed_dict))
         if len(result.shape) == 3:
           result = np.transpose(result, axes=[1, 0, 2])
         if len(transformers) > 0:
           result = undo_transforms(result, transformers)
         results.append(result)
       return np.concatenate(results, axis=0)
Example #38
  def predict(self, dataset, transformers=[], batch_size=None):
    """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    y_preds = []
    n_tasks = self.get_num_tasks()
    ind = 0

    for (X_batch, _, _, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_on_batch(X_batch)
      # Discard any padded predictions
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
    y_pred = np.concatenate(y_preds)
    return y_pred
Example #39
    def predict_on_generator(self, generator, transformers=[], outputs=None):
        if not self.built:
            self.build()
        if outputs is None:
            outputs = self.outputs
        elif not isinstance(outputs, collections.Sequence):
            outputs = [outputs]
        with self._get_tf("Graph").as_default():
            # Gather results for each output
            results = [[] for out in outputs]
            for feed_dict in generator:
                feed_dict = {
                    self.layers[k.name].out_tensor: v
                    for k, v in six.iteritems(feed_dict)
                }
                # Recording the number of samples in the input batch
                n_samples = max(feed_dict[self.membership.out_tensor]) + 1
                feed_dict[self._training_placeholder] = 0.0
                feed_results = self.session.run(outputs, feed_dict=feed_dict)
                if len(feed_results) > 1:
                    if len(transformers):
                        raise ValueError("Does not support transformations "
                                         "for multiple outputs.")
                elif len(feed_results) == 1:
                    result = undo_transforms(feed_results[0], transformers)
                    feed_results = [result]
                for ind, result in enumerate(feed_results):
                    # GraphConvTensorGraph always outputs batch_size rows;
                    # append only the valid samples to the final results.
                    results[ind].append(result[:n_samples])

            final_results = []
            for result_list in results:
                final_results.append(np.concatenate(result_list, axis=0))
            # If only one output, just return array
            if len(final_results) == 1:
                return final_results[0]
            else:
                return final_results
Example #40
    def _predict(
            self, generator: Iterable[Tuple[Any, Any, Any]],
            transformers: List[Transformer], uncertainty: bool,
            other_output_types: Optional[OneOrMany[str]]
    ) -> OneOrMany[np.ndarray]:
        """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not
    call it directly.  Instead call one of the public prediction
    methods.

    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    other_output_types: list, optional
      Provides a list of other output_types (strings) to predict from model.
    Returns:
      a NumPy array if the model produces a single output, or a list of arrays
      if it produces multiple outputs
    """
        results: Optional[List[np.ndarray]] = None
        variances: Optional[List[np.ndarray]] = None
        if uncertainty and (other_output_types is not None):
            raise ValueError(
                'This model cannot compute uncertainties and other output types simultaneously. Please invoke one at a time.'
            )
        if uncertainty:
            if self._variance_outputs is None or len(
                    self._variance_outputs) == 0:
                raise ValueError('This model cannot compute uncertainties')
            if len(self._variance_outputs) != len(self._prediction_outputs):
                raise ValueError(
                    'The number of variances must exactly match the number of outputs'
                )
        if other_output_types:
            if self._other_outputs is None or len(self._other_outputs) == 0:
                raise ValueError(
                    'This model cannot compute other outputs since no other output_types were specified.'
                )
        self._ensure_built()
        self.model.eval()
        for batch in generator:
            inputs, labels, weights = batch
            inputs, _, _ = self._prepare_batch((inputs, None, None))

            # Invoke the model.
            if len(inputs) == 1:
                inputs = inputs[0]
            output_values = self.model(inputs)
            if isinstance(output_values, torch.Tensor):
                output_values = [output_values]
            output_values = [t.detach().cpu().numpy() for t in output_values]

            # Apply transformers and record results.
            if uncertainty:
                var = [output_values[i] for i in self._variance_outputs]
                if variances is None:
                    variances = [var]
                else:
                    for i, t in enumerate(var):
                        variances[i].append(t)
            access_values = []
            if other_output_types:
                access_values += self._other_outputs
            elif self._prediction_outputs is not None:
                access_values += self._prediction_outputs

            if len(access_values) > 0:
                output_values = [output_values[i] for i in access_values]

            if len(transformers) > 0:
                if len(output_values) > 1:
                    raise ValueError(
                        "predict() does not support Transformers for models with multiple outputs."
                    )
                elif len(output_values) == 1:
                    output_values = [
                        undo_transforms(output_values[0], transformers)
                    ]
            if results is None:
                results = [[] for i in range(len(output_values))]
            for i, t in enumerate(output_values):
                results[i].append(t)

        # Concatenate arrays to create the final results.
        final_results = []
        final_variances = []
        if results is not None:
            for r in results:
                final_results.append(np.concatenate(r, axis=0))
        if uncertainty and variances is not None:
            for v in variances:
                final_variances.append(np.concatenate(v, axis=0))
            return zip(final_results, final_variances)
        if len(final_results) == 1:
            return final_results[0]
        else:
            return final_results
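
Example #40's PyTorch _predict follows the standard inference skeleton: put the module in eval mode, run batches without autograd, and detach outputs to numpy before any post-processing. A minimal self-contained sketch of that skeleton, using a toy model rather than the DeepChem class:

import numpy as np
import torch

model = torch.nn.Linear(4, 2)
model.eval()                                  # disable dropout/batch-norm updates
results = []
with torch.no_grad():                         # inference only; no autograd graph
    for batch in (torch.randn(8, 4) for _ in range(3)):
        out = model(batch)
        results.append(out.detach().cpu().numpy())
y_pred = np.concatenate(results, axis=0)      # shape (24, 2)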
Example #41
    mode='regression',
    model_dir=MODEL_DIR,
    error_bars=ERROR_BARS)

model.fit(train_dataset, nb_epoch=8)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

model.save()
model.load_from_dir('model_saves')

mu, sigma = model.bayesian_predict(
    valid_dataset, transformers, untransform=True, n_passes=24)
print(mu[:4])
print(sigma[:4])

target = undo_transforms(valid_dataset.y, transformers)

print(r2_score(target, mu))

mu = mu[:, 0].tolist()
sigma = sigma[:, 0].tolist()
target = target[:, 0].tolist()

print(mu[:4])
print(sigma[:4])
print(target[:4])

in_one_sigma = 0
in_two_sigma = 0
in_four_sigma = 0
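
The script is truncated here, but the three counters suggest it goes on to measure calibration: the fraction of targets falling within one, two, or four predicted standard deviations of the mean. A plausible continuation, offered only as a sketch and not the original code:

for m, s, t in zip(mu, sigma, target):
    if abs(t - m) <= s:
        in_one_sigma += 1
    if abs(t - m) <= 2 * s:
        in_two_sigma += 1
    if abs(t - m) <= 4 * s:
        in_four_sigma += 1

n = len(target)
print("within 1 sigma: %.3f" % (in_one_sigma / n))
print("within 2 sigma: %.3f" % (in_two_sigma / n))
print("within 4 sigma: %.3f" % (in_four_sigma / n))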
Example #42
    def compute_model_performance(self,
                                  metrics,
                                  csv_out=None,
                                  stats_out=None,
                                  per_task_metrics=False,
                                  no_r2=False,
                                  no_concordance_index=False,
                                  plot=False):
        """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    no_r2: bool, optional
      If true, skip R^2-based metrics (r2_score, pearson_r2_score).
    no_concordance_index: bool, optional
      If true, skip the concordance index when evaluating the training set.
    plot: bool, optional
      If true, generate plots while computing metrics.
    """
        self.model.build()
        y = []
        w = []

        def generator_closure():
            for feed_dict in self.generator:
                y.append(feed_dict[self.label_keys[0]])
                if len(self.weights) > 0:
                    w.append(feed_dict[self.weights[0]])
                yield feed_dict

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        if mode == "classification":
            y_pred = self.model.predict_proba_on_generator(generator_closure())
            y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
            y = np.reshape(y, newshape=(-1, self.n_tasks, self.n_classes))
            y = from_one_hot(y, axis=-1)
        else:
            y_pred = self.model.predict_proba_on_generator(generator_closure())
            y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
            y = np.reshape(y, newshape=(-1, self.n_tasks))
            y_pred = np.reshape(y_pred, newshape=(-1, self.n_tasks))
        y_pred = self.model.predict_on_generator(generator_closure())
        y = np.concatenate(y, axis=0)
        multitask_scores = {}
        all_task_scores = {}

        y = undo_transforms(y, self.output_transformers)
        y_pred = undo_transforms(y_pred, self.output_transformers)
        if len(w) != 0:
            w = np.array(w)
            w = np.reshape(w, newshape=y.shape)

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbose)
            self.output_predictions(y_pred, csv_out)

        plot_finished = False
        # Compute multitask metrics
        for i, metric in enumerate(metrics):
            mtc_name = metric.metric.__name__
            if no_r2 and (mtc_name == 'r2_score'
                          or mtc_name == 'pearson_r2_score'):
                continue
            if per_task_metrics:
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        all_task_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[
                            metric.
                            name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                n_classes=self.n_classes,
                                plot=True,
                                all_metrics=metrics,
                                is_training_set=self.is_training_set,
                                no_concordance_index=no_concordance_index,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics
                        plot_finished = True
                    else:
                        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                n_classes=self.n_classes,
                                plot=False,
                                is_training_set=self.is_training_set,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics

                elif plot and not plot_finished and (
                        i == len(metrics) - 1 or
                        metric.metric.__name__ == "concordance_index"):
                    multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            n_classes=self.n_classes,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics
                    plot_finished = True

                else:  # Otherwise don't need to plot.
                    multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            n_classes=self.n_classes,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics

            else:
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            n_classes=self.n_classes,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            no_concordance_index=no_concordance_index,
                            tasks=self.tasks,
                            model_name=self.model_name)
                        plot_finished = True
                    else:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            n_classes=self.n_classes,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)

                elif plot and not plot_finished and (
                        i == len(metrics) - 1 or
                        metric.metric.__name__ == "concordance_index"):
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        n_classes=self.n_classes,
                        plot=True,
                        all_metrics=metrics,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)
                    plot_finished = True

                else:  # Otherwise don't need to plot.
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        n_classes=self.n_classes,
                        plot=False,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)

        if not per_task_metrics:
            return multitask_scores
        else:
            return multitask_scores, all_task_scores
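A minimal driver for the method above might look like the sketch below; the `evaluator` instance and the metric choices are illustrative assumptions, not part of the listing.

# Hypothetical usage; assumes `evaluator` is an instance of the class defining
# compute_model_performance, already wired to a model, generator, and
# output transformers.
import numpy as np
import deepchem as dc

metrics = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean),
]
scores = evaluator.compute_model_performance(
    metrics, per_task_metrics=False, no_r2=False,
    no_concordance_index=False, plot=False)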
Example No. 43
    def compute_model_performance(self,
                                  metrics,
                                  csv_out=None,
                                  stats_out=None,
                                  per_task_metrics=False,
                                  no_concordance_index=False,
                                  plot=False,
                                  no_r2=False):
        """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
        y = self.dataset.y
        y = undo_transforms(y, self.output_transformers)
        w = self.dataset.w

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        y_pred = self.model.predict(self.dataset, self.output_transformers)
        if mode == "classification":
            y_pred_print = np.argmax(y_pred, -1)
        else:
            y_pred_print = y_pred
        multitask_scores = {}
        all_task_scores = {}

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbose)
            self.output_predictions(y_pred_print, csv_out)

        plot_finished = False
        # Compute multitask metrics
        for i, metric in enumerate(metrics):
            mtc_name = metric.metric.__name__
            if no_r2 and mtc_name in ('r2_score', 'pearson_r2_score'):
                continue
            if per_task_metrics:
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        all_task_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        # If this dataset is the training data set, don't calculate CI if no_concordance_index.
                        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                plot=True,
                                all_metrics=metrics,
                                is_training_set=self.is_training_set,
                                no_concordance_index=no_concordance_index,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics
                        plot_finished = True
                    else:
                        # No longer need to plot. Could be wasting time calculating metrics again, but they
                        # are super fast so it is no big deal.
                        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                plot=False,
                                is_training_set=self.is_training_set,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics

                # Now deal with validation or test sets.
                elif plot and not plot_finished and (
                        i == len(metrics) - 1 or
                        metric.metric.__name__ == "concordance_index"):
                    multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics
                    plot_finished = True
                else:  # Otherwise don't need to plot.
                    multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics

            else:
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            no_concordance_index=no_concordance_index,
                            tasks=self.tasks,
                            model_name=self.model_name)
                        plot_finished = True
                    else:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)

                elif plot and not plot_finished and (
                        i == len(metrics) - 1 or
                        metric.metric.__name__ == "concordance_index"):
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        plot=True,
                        all_metrics=metrics,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)
                    plot_finished = True
                else:
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        plot=False,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)

        if stats_out is not None:
            log("Saving stats to %s" % stats_out, self.verbose)
            self.output_statistics(multitask_scores, stats_out)

        if not per_task_metrics:
            return multitask_scores
        else:
            return multitask_scores, all_task_scores
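With per_task_metrics=True the method returns a pair, as its final return shows. A hedged sketch (the file names and the `evaluator` instance are assumptions):

# Hypothetical usage showing the two-value return plus CSV/stats output.
scores, per_task_scores = evaluator.compute_model_performance(
    metrics, csv_out='predictions.csv', stats_out='stats.txt',
    per_task_metrics=True)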
Example No. 44
    mode='regression',
    model_dir=MODEL_DIR,
    error_bars=ERROR_BARS)

model.fit(train_dataset, nb_epoch=8)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

model.save()
model.load_from_dir('model_saves')

mu, sigma = model.bayesian_predict(
    valid_dataset, transformers, untransform=True, n_passes=24)
print(mu[:4])
print(sigma[:4])

target = undo_transforms(valid_dataset.y, transformers)

print(r2_score(target, mu))

mu = mu[:, 0].tolist()
sigma = sigma[:, 0].tolist()
target = target[:, 0].tolist()

print(mu[:4])
print(sigma[:4])
print(target[:4])

in_one_sigma = 0
in_two_sigma = 0
in_four_sigma = 0
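The listing breaks off after the counters are initialized. They presumably tally how many targets fall within 1, 2, and 4 predicted standard deviations; the loop below is an assumed reconstruction, not part of the original snippet.

# Assumed continuation: measure coverage of the Bayesian error bars.
for m, s, t in zip(mu, sigma, target):
    if abs(t - m) <= 1 * s:
        in_one_sigma += 1
    if abs(t - m) <= 2 * s:
        in_two_sigma += 1
    if abs(t - m) <= 4 * s:
        in_four_sigma += 1

n = len(target)
print("fraction within 1 sigma: %.3f" % (in_one_sigma / n))
print("fraction within 2 sigma: %.3f" % (in_two_sigma / n))
print("fraction within 4 sigma: %.3f" % (in_four_sigma / n))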
Example No. 45
def compute_model_performance(metrics,
                              y_pred,
                              y,
                              w,
                              transformers,
                              tasks,
                              n_classes=2,
                              per_task_metrics=False):
    """
    Computes statistics of a model's predictions against ground truth.

    :param metrics: list
        List of :Metric objects.
    :param y_pred: ndarray
        The predicted values.
    :param y: ndarray
        The ground truths.
    :param w: ndarray
        Label weights.
    :param transformers: list
        DeepChem/PADME data transformers used in the loading pipeline.
    :param tasks: list
        List of task names.
    :param n_classes: int, optional
        Number of classes in the data (for classification tasks only).
    :param per_task_metrics: bool, optional
        If true, return computed metric for each task on multitask dataset.
    :return:
    """

    if not len(metrics):
        return {}
    multitask_scores = {}
    all_task_scores = {}

    y = undo_transforms(y, transformers)
    y_pred = undo_transforms(y_pred, transformers)
    if len(w) != 0:
        w = np.array(w)
        w = np.reshape(w, newshape=y.shape)

    # Compute multitask metrics
    for metric in metrics:
        if per_task_metrics:
            multitask_scores[metric.name], computed_metrics = metric.compute_metric(
                    y,
                    y_pred,
                    w,
                    per_task_metrics=True,
                    n_classes=n_classes,
                    tasks=tasks)
            all_task_scores[metric.name] = computed_metrics
        else:
            multitask_scores[metric.name] = metric.compute_metric(
                y,
                y_pred,
                w,
                per_task_metrics=False,
                n_classes=n_classes,
                tasks=tasks)

    if not per_task_metrics:
        return multitask_scores
    else:
        return multitask_scores, all_task_scores
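A short usage sketch for this standalone helper. It assumes the PADME-style Metric class (whose compute_metric accepts the tasks keyword used above) and arrays gathered from an earlier prediction pass; all names are placeholders.

# Hypothetical call; y_pred, y, w, and transformers are assumed to exist.
import numpy as np
import deepchem as dc

metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
scores, per_task = compute_model_performance(
    [metric], y_pred, y, w, transformers,
    tasks=['task0', 'task1'], n_classes=2, per_task_metrics=True)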
Example No. 46
  def _predict(self, generator, transformers, outputs, uncertainty):
    """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not call it directly.
    Instead call one of the public prediction methods.

    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    Returns
    -------
    y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    if not self.built:
      self.build()
    if outputs is None:
      outputs = self.outputs
    elif not isinstance(outputs, collections.abc.Sequence):
      outputs = [outputs]
    if uncertainty:
      if len(self.variances) == 0:
        raise ValueError('This model cannot compute uncertainties')
      if len(self.variances) != len(outputs):
        raise ValueError(
            'The number of variances must exactly match the number of outputs')
      tensors = outputs + self.variances
    else:
      tensors = outputs

    with self._get_tf("Graph").as_default():
      # Gather results for each output
      results = [[] for out in tensors]
      n_samples = 0
      n_enqueued = [0]
      final_sample = [None]
      if self.queue_installed:
        enqueue_thread = threading.Thread(
            target=_enqueue_batch,
            args=(self, generator, self._get_tf("Graph"), self.session,
                  n_enqueued, final_sample))
        enqueue_thread.start()
      for feed_dict in self._create_feed_dicts(generator, uncertainty):
        if self.queue_installed:
          # Don't let this thread get ahead of the enqueue thread, since if
          # we try to read more batches than the total number that get queued,
          # this thread will hang indefinitely.
          while n_enqueued[0] <= n_samples:
            if n_samples == final_sample[0]:
              break
            time.sleep(0)
          if n_samples == final_sample[0]:
            break
        n_samples += 1
        feed_results = self._run_graph(tensors, feed_dict, uncertainty)
        if tfe.in_eager_mode():
          feed_results = [f.numpy() for f in feed_results]
        if len(feed_results) > 1:
          if len(transformers):
            raise ValueError("Does not support transformations "
                             "for multiple outputs.")
        elif len(feed_results) == 1:
          result = undo_transforms(feed_results[0], transformers)
          feed_results = [result]
        for ind, result in enumerate(feed_results):
          results[ind].append(result)

      final_results = []
      for result_list in results:
        final_results.append(np.concatenate(result_list, axis=0))
      # If only one output, just return array
      if len(final_results) == 1:
        return final_results[0]
      elif uncertainty:
        return zip(final_results[:len(outputs)], final_results[len(outputs):])
      else:
        return final_results
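The public prediction methods mentioned in the docstring are thin wrappers over this routine; a sketch of their likely shape (the exact wrapper signatures are assumptions here):

  # Hypothetical public wrappers living in the same class as _predict;
  # only the uncertainty flag distinguishes them.
  def predict_on_generator(self, generator, transformers=[], outputs=None):
    return self._predict(generator, transformers, outputs, uncertainty=False)

  def predict_uncertainty_on_generator(self, generator, transformers=[]):
    return self._predict(generator, transformers, None, uncertainty=True)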
Example No. 47
  def _predict(
      self, generator: Iterable[Tuple[Any, Any, Any]],
      transformers: List[Transformer], outputs: Optional[OneOrMany[tf.Tensor]],
      uncertainty: bool,
      other_output_types: Optional[OneOrMany[str]]) -> OneOrMany[np.ndarray]:
    """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not
    call it directly.  Instead call one of the public prediction
    methods.

    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    outputs: Tensor or list of Tensors
      The outputs to return.  If this is None, the model's standard prediction
      outputs will be returned.  Alternatively one or more Tensors within the
      model may be specified, in which case the output of those Tensors will be
      returned.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    other_output_types: list, optional
      Provides a list of other output_types (strings) to predict from model.

    Returns
    -------
    a NumPy array if the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
    results: Optional[List[List[np.ndarray]]] = None
    variances: Optional[List[List[np.ndarray]]] = None
    if (outputs is not None) and (other_output_types is not None):
      raise ValueError(
          'This model cannot compute outputs and other output_types '
          'simultaneously. Please invoke one at a time.')
    if uncertainty and (other_output_types is not None):
      raise ValueError(
          'This model cannot compute uncertainties and other output types '
          'simultaneously. Please invoke one at a time.')
    if uncertainty:
      assert outputs is None
      if self._variance_outputs is None or len(self._variance_outputs) == 0:
        raise ValueError('This model cannot compute uncertainties')
      if len(self._variance_outputs) != len(self._prediction_outputs):
        raise ValueError(
            'The number of variances must exactly match the number of outputs')
    if other_output_types:
      assert outputs is None
      if self._other_outputs is None or len(self._other_outputs) == 0:
        raise ValueError(
            'This model cannot compute other outputs since no other output_types were specified.'
        )
    if (outputs is not None and self.model.inputs is not None and
        len(self.model.inputs) == 0):
      raise ValueError(
          "Cannot use 'outputs' argument with a model that does not specify its inputs."
          "Note models defined in imperative subclassing style cannot specify outputs"
      )
    if tf.is_tensor(outputs):
      outputs = [outputs]
    for batch in generator:
      inputs, labels, weights = batch
      self._create_inputs(inputs)
      inputs, _, _ = self._prepare_batch((inputs, None, None))

      # Invoke the model.
      if len(inputs) == 1:
        inputs = inputs[0]
      if outputs is not None:
        outputs = tuple(outputs)
        key = tuple(t.ref() for t in outputs)
        if key not in self._output_functions:
          self._output_functions[key] = tf.keras.backend.function(
              self.model.inputs, outputs)
        output_values = self._output_functions[key](inputs)
      else:
        output_values = self._compute_model(inputs)
        if tf.is_tensor(output_values):
          output_values = [output_values]
        output_values = [t.numpy() for t in output_values]

      # Apply transformers and record results.
      if uncertainty:
        var = [output_values[i] for i in self._variance_outputs]
        if variances is None:
          # Start one accumulator list per variance output so later batches
          # can be appended output-by-output.
          variances = [[v] for v in var]
        else:
          for i, t in enumerate(var):
            variances[i].append(t)
      access_values = []
      if other_output_types:
        access_values += self._other_outputs
      elif self._prediction_outputs is not None:
        access_values += self._prediction_outputs

      if len(access_values) > 0:
        output_values = [output_values[i] for i in access_values]

      if len(transformers) > 0:
        if len(output_values) > 1:
          raise ValueError(
              "predict() does not support Transformers for models with multiple outputs."
          )
        elif len(output_values) == 1:
          output_values = [undo_transforms(output_values[0], transformers)]
      if results is None:
        results = [[] for i in range(len(output_values))]
      for i, t in enumerate(output_values):
        results[i].append(t)

    # Concatenate arrays to create the final results.
    final_results = []
    final_variances = []
    if results is not None:
      for r in results:
        final_results.append(np.concatenate(r, axis=0))
    if uncertainty and variances is not None:
      for v in variances:
        final_variances.append(np.concatenate(v, axis=0))
      return zip(final_results, final_variances)
    if len(final_results) == 1:
      return final_results[0]
    else:
      return final_results
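For reference, a minimal way to drive this implementation is through a generator of (inputs, labels, weights) batches, as the docstring requires; the public wrapper name and batch shapes below are assumptions.

import numpy as np

# Hypothetical generator; labels and weights may be None at prediction time.
def batches(X, batch_size=32):
  for i in range(0, len(X), batch_size):
    yield ([X[i:i + batch_size]], None, None)

X = np.random.rand(100, 10).astype(np.float32)
# y_pred = model.predict_on_generator(batches(X), transformers=[])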