Example #1
  def evaluate(self, dataset, metric, n_pos,
               n_neg, n_trials=1000, exclude_support=True):
    """Evaluate performance on dataset according to metrics


    Evaluates the performance of the trained model by sampling supports randomly
    for each task in dataset. For each sampled support, the accuracy of the
    model with support provided is computed on all data for that task. If
    exclude_support is True (by default), the support set is excluded from this
    accuracy calculation. exclude_support should be set to false if model's
    memorization capacity wants to be evaluated. 
    

    Since the accuracy on a task is dependent on the choice of random support,
    the evaluation experiment is repeated n_trials times for each task.
    (Each task gets n_trials experiments). The computed accuracies
    are averaged across trials.

    TODO(rbharath): Currently does not support any transformers.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to test on.
    metric: dc.metrics.Metric
      Evaluation metric.
    n_pos: int
      Number of positive samples per support.
    n_neg: int
      Number of negative samples per support.
    n_trials: int, optional
      Number of random supports sampled per task. Default 1000.
    exclude_support: bool, optional
      Whether support set should be excluded when computing model accuracy.

    Returns
    -------
    mean_task_scores: dict
      Maps each task index to the mean metric score across trials.
    std_task_scores: dict
      Maps each task index to the standard deviation of the metric score
      across trials.
    """
    # Get batches
    test_tasks = range(len(dataset.get_task_names()))
    task_scores = {task: [] for task in test_tasks}
    support_generator = SupportGenerator(dataset, n_pos, n_neg, n_trials)
    for ind, (task, support) in enumerate(support_generator):
      print("Eval sample %d from task %s" % (ind, str(task)))
      # TODO(rbharath): Add test for get_task_dataset_minus_support for
      # multitask case with missing data...
      if exclude_support:
        print("Removing support datapoints for eval.")
        task_dataset = get_task_dataset_minus_support(dataset, support, task)
      else:
        print("Keeping support datapoints for eval.")
        task_dataset = get_task_dataset(dataset, task)
      y_pred = self.predict_proba(support, task_dataset)
      task_scores[task].append(metric.compute_metric(
          task_dataset.y, y_pred, task_dataset.w))

    # Join information for all tasks.
    mean_task_scores = {}
    std_task_scores = {}
    for task in test_tasks:
      mean_task_scores[task] = np.mean(np.array(task_scores[task]))
      std_task_scores[task] = np.std(np.array(task_scores[task]))
    return mean_task_scores, std_task_scores
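
A minimal usage sketch for the method above. The names `model` (a trained support-based model exposing this `evaluate` method) and `test_dataset` are assumptions for illustration; `dc.metrics.Metric` and `dc.metrics.roc_auc_score` are standard DeepChem objects.

import deepchem as dc

# Hypothetical setup: `model` is assumed to be a trained low-data/support model
# exposing the evaluate() method shown above, and `test_dataset` is assumed to
# be a dc.data.Dataset holding the evaluation tasks.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
mean_scores, std_scores = model.evaluate(
    test_dataset, metric, n_pos=10, n_neg=10, n_trials=50)
for task in mean_scores:
  print("Task %d: %.3f +/- %.3f" % (task, mean_scores[task], std_scores[task]))
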
Example #2
    def evaluate(self,
                 dataset,
                 metric,
                 n_pos,
                 n_neg,
                 n_trials=1000,
                 exclude_support=True):
        """Evaluate performance on dataset according to metrics


    Evaluates the performance of the trained model by sampling supports randomly
    for each task in dataset. For each sampled support, the accuracy of the
    model with support provided is computed on all data for that task. If
    exclude_support is True (by default), the support set is excluded from this
    accuracy calculation. exclude_support should be set to false if model's
    memorization capacity wants to be evaluated. 
    

    Since the accuracy on a task is dependent on the choice of random support,
    the evaluation experiment is repeated n_trials times for each task.
    (Each task gets n_trials experiments). The computed accuracies
    are averaged across trials.

    TODO(rbharath): Currently does not support any transformers.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to test on.
    metric: dc.metrics.Metric
      Evaluation metric.
    n_pos: int
      Number of positive samples per support.
    n_neg: int
      Number of negative samples per support.
    n_trials: int, optional
      Number of random supports sampled per task. Default 1000.
    exclude_support: bool, optional
      Whether support set should be excluded when computing model accuracy.

    Returns
    -------
    mean_task_scores: dict
      Maps each task index to the mean metric score across trials.
    std_task_scores: dict
      Maps each task index to the standard deviation of the metric score
      across trials.
    """
        # Get batches
        test_tasks = range(len(dataset.get_task_names()))
        task_scores = {task: [] for task in test_tasks}
        support_generator = SupportGenerator(dataset, n_pos, n_neg, n_trials)
        for ind, (task, support) in enumerate(support_generator):
            print("Eval sample %d from task %s" % (ind, str(task)))
            # TODO(rbharath): Add test for get_task_dataset_minus_support for
            # multitask case with missing data...
            if exclude_support:
                print("Removing support datapoints for eval.")
                task_dataset = get_task_dataset_minus_support(
                    dataset, support, task)
            else:
                print("Keeping support datapoints for eval.")
                task_dataset = get_task_dataset(dataset, task)
            y_pred = self.predict_proba(support, task_dataset)
            task_scores[task].append(
                metric.compute_metric(task_dataset.y, y_pred, task_dataset.w))

        # Join information for all tasks.
        mean_task_scores = {}
        std_task_scores = {}
        for task in test_tasks:
            mean_task_scores[task] = np.mean(np.array(task_scores[task]))
            std_task_scores[task] = np.std(np.array(task_scores[task]))
        return mean_task_scores, std_task_scores
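
A short aggregation sketch, assuming `mean_task_scores` and `std_task_scores` are the two dicts returned by evaluate(); collapsing them into a single number across tasks is illustrative post-processing, not part of the method itself.

import numpy as np

# Average the per-task means (and per-task standard deviations) into one summary.
overall_mean = np.mean(list(mean_task_scores.values()))
overall_std = np.mean(list(std_task_scores.values()))
print("Mean score across tasks: %.3f (avg per-task std %.3f)" %
      (overall_mean, overall_std))
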