Example #1
    def test_load_curves_unordered(self):
        # Generate a curve that is unordered (according to env step).
        log_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
        writer = tf.summary.create_file_writer(log_dir)
        with writer.as_default():
            for global_step, env_step, avg_return in [(0, 5, 5.1), (1, 3, 3.2),
                                                      (2, 7, 7.3),
                                                      (2, 9, 9.5)]:
                tf.summary.scalar('Metrics/EnvironmentSteps',
                                  env_step,
                                  step=global_step)
                tf.summary.scalar('Metrics/AverageReturn',
                                  avg_return,
                                  step=global_step)
        # Test load_input_data using the steps of the dependent variable as the
        # timepoint variable. Check that, for repeated steps, only the last
        # value written is loaded.
        curves = data_loading.load_input_data([log_dir],
                                              'Metrics/AverageReturn',
                                              None,
                                              align_on_global_step=True)
        expected = np.array([[0, 1, 2], [5.1, 3.2, 9.5]])
        np.testing.assert_allclose(expected, curves[0])

        # Test load_input_data using EnvironmentSteps as the timepoint variable.
        # Check that, for repeated steps, only the last value written is loaded,
        # and that the curve is now ordered by EnvironmentSteps.
        curves = data_loading.load_input_data([log_dir],
                                              'Metrics/AverageReturn',
                                              'Metrics/EnvironmentSteps',
                                              align_on_global_step=True)
        expected = np.array([[3, 5, 9], [3.2, 5.1, 9.5]])
        np.testing.assert_allclose(expected, curves[0])
Example #2
    def evaluate(self, run_dirs, outfile_prefix='/tmp/robustness_results_'):
        """Evaluate robustness metrics on a set of run directories.

        Args:
          run_dirs: List of paths to directories containing Tensorboard summaries
            for all the runs of an experiment, one directory per run. Summaries must
            include a scalar or tensor summary that defines the variable to be
            analyzed (the 'dependent_variable'). Optionally they may also have a
            scalar or tensor summary that defines a "timepoint" (the
            'timepoint_variable').
          outfile_prefix: Prefix for JSON output files, where we write results and
            metric parameters.

        Returns:
          A dictionary of robustness values {metric_name: metric_value}
        """
        curves = data_loading.load_input_data(run_dirs,
                                              self.dependent_variable,
                                              self.timepoint_variable,
                                              self.align_on_global_step)

        results = self.compute_metrics(curves)
        self.write_results(results, outfile_prefix)

        return results
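A hedged usage sketch follows. The Evaluator class name, its constructor arguments, and my_metrics are assumptions inferred from the attributes evaluate() reads (dependent_variable, timepoint_variable, align_on_global_step, plus whatever metric objects compute_metrics uses); only the evaluate() call itself mirrors the method above.

# Hypothetical usage; 'Evaluator' and 'my_metrics' are placeholder names, not
# confirmed by the snippet above.
evaluator = Evaluator(
    metrics=my_metrics,                             # assumed: list of robustness metric objects
    dependent_variable='Metrics/AverageReturn',
    timepoint_variable='Metrics/EnvironmentSteps',  # or None to align on the global step alone
    align_on_global_step=True)

run_dirs = ['/tmp/experiment/run_%d/train' % i for i in range(3)]
results = evaluator.evaluate(run_dirs, outfile_prefix='/tmp/robustness_results_')
# results maps {metric_name: metric_value}; the same results are also written
# to JSON files starting with outfile_prefix.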
Example #3
    def test_load_curves_steps_cleanup_on_global_step(self):
        # Generate a curve where the steps differ for the timepoint and dependent
        # variables, and where there are repeated values of the step. Use the global
        # step to align timepoint and dependent variables.
        log_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
        writer = tf.summary.create_file_writer(log_dir)

        with writer.as_default():
            for global_step, env_step, avg_return in [(0, 3, 3.2), (1, 5, 5.1),
                                                      (2, 5, 7.3),
                                                      (2, 7, 7.4)]:
                tf.summary.scalar('Metrics/EnvironmentSteps',
                                  env_step,
                                  step=global_step)
                tf.summary.scalar('Metrics/AverageReturn',
                                  avg_return,
                                  step=global_step)

            # Add an extra summary only for the timepoint variable, with no
            # corresponding summary for the dependent variable.
            tf.summary.scalar('Metrics/EnvironmentSteps', 10, step=3)

        # Test load_input_data, check that we only load the summaries that have step
        # values in common for both variables, and that we only load the latest
        # summary for each step value.
        curves = data_loading.load_input_data([log_dir],
                                              'Metrics/AverageReturn',
                                              'Metrics/EnvironmentSteps',
                                              align_on_global_step=True)
        expected = np.array([[3, 5, 7], [3.2, 5.1, 7.4]])
        np.testing.assert_allclose(expected, curves[0])
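The cleanup rules this test asserts can be shown in a small standalone sketch. The function below is not data_loading's actual implementation; it is only an illustration, under the assumption that alignment on the global step keeps the last value written at each repeated step, keeps only steps present in both summaries, and orders the curve by the timepoint value.

import numpy as np

# Illustrative sketch only -- not data_loading's implementation.
def align_summaries_on_global_step(timepoint_events, dependent_events):
    """Each argument is a list of (global_step, value) pairs in write order."""
    timepoints = dict(timepoint_events)  # later writes at a repeated step win
    dependents = dict(dependent_events)
    common_steps = set(timepoints) & set(dependents)  # steps present in both summaries
    ordered = sorted(common_steps, key=lambda s: timepoints[s])  # order by timepoint value
    return np.array([[timepoints[s] for s in ordered],
                     [dependents[s] for s in ordered]])

# Reproduces the expectation asserted above.
curve = align_summaries_on_global_step(
    timepoint_events=[(0, 3), (1, 5), (2, 5), (2, 7), (3, 10)],
    dependent_events=[(0, 3.2), (1, 5.1), (2, 7.3), (2, 7.4)])
np.testing.assert_allclose(curve, [[3, 5, 7], [3.2, 5.1, 7.4]])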
Example #4
    def test_load_curves(self):
        curves = data_loading.load_input_data(self.run_dirs,
                                              'Metrics/AverageReturn',
                                              'Metrics/EnvironmentSteps',
                                              align_on_global_step=True)
        self.assertLen(curves, 3)
        self.assertEqual(curves[0].shape, (2, 3))
Example #5
    def test_load_curves_steps_cleanup_on_timestep_variable(
            self, align_on_global_step, expected):
        # Generate a curve where the steps differ for the timepoint and dependent
        # variables, and where there are repeated values of the step. Use the
        # timepoint variable to align timepoint and dependent variables.
        log_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
        with tf.summary.create_file_writer(log_dir).as_default():
            for global_step, env_step, avg_return in [(0, 5, 5.1), (1, 3, 3.2),
                                                      (2, 3, 7.3),
                                                      (2, 7, 7.4)]:
                # Write the timestep variable.
                tf.summary.scalar('Metrics/EnvironmentSteps',
                                  env_step,
                                  step=global_step)
                # Write the dependent variable.
                tf.summary.scalar('Metrics/AverageReturn',
                                  avg_return,
                                  step=env_step)
                tf.summary.scalar('Metrics/EnvironmentSteps', 10, step=3)

        curves = data_loading.load_input_data([log_dir],
                                              'Metrics/AverageReturn',
                                              'Metrics/EnvironmentSteps',
                                              align_on_global_step)
        np.testing.assert_allclose(expected, curves[0])
    def test_load_curves_with_restart_in_global_step(self):
        # Generate a curve where there is a restart in the global step variable.
        log_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
        with tf.summary.create_file_writer(log_dir).as_default():
            # Write the timestep variable.
            for global_step, env_step in [(0, 10), (1, 20), (1, 21), (2, 30)]:
                tf.summary.scalar(
                    'Metrics/EnvironmentSteps', env_step, step=global_step)

            # Write the dependent variable.
            for global_step, avg_return in [(0, 1), (1, 2), (2, 3), (3, 4)]:
                tf.summary.scalar('Metrics/AverageReturn', avg_return, step=global_step)

        curves = data_loading.load_input_data([log_dir],
                                              'Metrics/AverageReturn',
                                              'Metrics/EnvironmentSteps',
                                              align_on_global_step=True)
        expected = np.array([[10, 21, 30], [1, 2, 3]])
        np.testing.assert_allclose(expected, curves[0])
    def evaluate_with_permutations(
            self,
            run_dirs_1,
            run_dirs_2,
            outfile_prefix='/tmp/robustness_results_permuted',
            n_permutations=1000,
            permutation_start_idx=0,
            random_seed=0):
        """Evaluate robustness metrics on runs permuted across two sets.

        This method is useful for computing permutation tests to evaluate
        statistical significance on the difference in metric values between two
        sets of runs (e.g. for one algorithm vs another algorithm). In particular,
        this method is necessary to run permutation tests for across-run metrics
        (for per-run metrics, we can run permutation tests just by permuting the
        original metrics values or rankings).

        We permute the runs across the two sets and divide into two sets of the
        same size as the original two sets. We evaluate the metrics on the
        two permuted sets. This is performed n_permutations times. This provides a
        null distribution that can later be loaded to compute a p-value for a
        permutation test.

        Args:
          run_dirs_1: List of paths to directories containing Tensorboard summaries
            for all the runs of an experiment, one directory per run. Summaries must
            include a scalar or tensor summary that defines the variable to be
            analyzed (the 'dependent_variable'). Optionally they may also have a
            scalar or tensor summary that defines a "timepoint" (the
            'timepoint_variable').
          run_dirs_2: Another list of paths.
          outfile_prefix: Prefix for JSON output files, where we write results and
            metric parameters.
          n_permutations: Number of permutations to perform.
          permutation_start_idx: If desired, the indexing of permutations can start
            at any integer. This affects the naming of the output files.
          random_seed: Numpy random seed.

        Returns:
          A dict of robustness results with one entry per permutation, of the form
          {'permutation%%PERMUTATION_IDX%%': {'curves1': ..., 'curves2': ...}}.
          Each of 'curves1' and 'curves2' maps to a dictionary of robustness
          values {metric_name: metric_value} for one of the two permuted sets.
        """
        np.random.seed(random_seed)

        curves_1 = data_loading.load_input_data(run_dirs_1,
                                                self.dependent_variable,
                                                self.timepoint_variable,
                                                self.align_on_global_step)
        curves_2 = data_loading.load_input_data(run_dirs_2,
                                                self.dependent_variable,
                                                self.timepoint_variable,
                                                self.align_on_global_step)
        all_curves = curves_1 + curves_2

        all_results = {}
        for i_permutation in range(permutation_start_idx,
                                   permutation_start_idx + n_permutations):
            logging.info('Permutation %d...', i_permutation)
            curves_permuted = permute_curves(all_curves)
            curves_permuted_1 = curves_permuted[:len(curves_1)]
            curves_permuted_2 = curves_permuted[len(curves_1):]

            results_1 = self.compute_metrics(curves_permuted_1)
            results_2 = self.compute_metrics(curves_permuted_2)
            all_results['permutation%d' % i_permutation] = {
                'curves1': results_1,
                'curves2': results_2
            }

        permutation_end_idx = permutation_start_idx + n_permutations - 1
        outfile_prefix_extended = '%spermutations%dto%d_' % (
            outfile_prefix, permutation_start_idx, permutation_end_idx)
        self.write_results(all_results, outfile_prefix_extended)

        return all_results
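The dictionary returned here is the null distribution for a permutation test. As a rough sketch, assuming each metric value is a single scalar (as for across-run metrics), a two-sided p-value could be read off it with a helper like the one below; the helper is not part of the library.

import numpy as np

# Hypothetical helper, not part of the library. 'all_results' is the dict
# returned by evaluate_with_permutations; 'observed_difference' is the metric
# difference computed on the original (unpermuted) pair of run sets.
def permutation_p_value(all_results, metric_name, observed_difference):
    """Fraction of permuted differences at least as extreme as the observed one."""
    null_differences = np.array([
        res['curves1'][metric_name] - res['curves2'][metric_name]
        for res in all_results.values()
    ])
    return np.mean(np.abs(null_differences) >= abs(observed_difference))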
    def evaluate_with_bootstraps(
            self,
            run_dirs,
            outfile_prefix='/tmp/robustness_results_bootstrapped',
            n_bootstraps=1000,
            bootstrap_start_idx=0,
            random_seed=0):
        """Evaluate robustness metrics on bootstrapped runs.

        I.e. the runs are resampled with replacement.

        This method is useful for computing bootstrapped confidence intervals on
        the metric values for a single set of runs (e.g. for a single algorithm).
        In particular, this method is necessary to obtain confidence intervals for
        across-run metrics (for per-run metrics, we can obtain confidence intervals
        just by bootstrapping the original metrics values or rankings).

        We bootstrap the runs (resample with replacement) n_bootstraps times, each
        time re-computing the metrics. This provides bootstrap distributions on
        the metric values that can later be loaded to compute confidence intervals.

        Args:
          run_dirs: List of paths to directories containing Tensorboard summaries
            for all the runs of an experiment, one directory per run. Summaries must
            include a scalar or tensor summary that defines the variable to be
            analyzed (the 'dependent_variable'). Optionally they may also have a
            scalar or tensor summary that defines a "timepoint" (the
            'timepoint_variable').
          outfile_prefix: Prefix for JSON output files, where we write results and
            metric parameters.
          n_bootstraps: Number of bootstraps to perform.
          bootstrap_start_idx: If desired, the indexing of bootstraps can start at
            any integer. This affects the naming of the output files.
          random_seed: Numpy random seed.

        Returns:
          A dict of robustness results. Each entry in the dict has the form
          {'bootstrap%%BOOTSTRAP_IDX%%': metric_result_for_this_resampling}.
          Each metric result is a dictionary of metric values
          {metric_name: metric_value}.
        """
        np.random.seed(random_seed)

        curves = data_loading.load_input_data(run_dirs,
                                              self.dependent_variable,
                                              self.timepoint_variable,
                                              self.align_on_global_step)

        all_results = {}
        for i_boot in range(bootstrap_start_idx,
                            bootstrap_start_idx + n_bootstraps):
            logging.info('Bootstrap %d...', i_boot)
            curves_resampled = resample_curves(curves)
            results_resampled = self.compute_metrics(curves_resampled)

            all_results['bootstrap%d' % i_boot] = results_resampled

        bootstrap_end_idx = bootstrap_start_idx + n_bootstraps - 1
        outfile_prefix_extended = '%sbootstraps%dto%d_' % (
            outfile_prefix, bootstrap_start_idx, bootstrap_end_idx)
        self.write_results(all_results, outfile_prefix_extended)

        return all_results
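The returned dictionary is a bootstrap distribution per metric. A minimal sketch of turning it into a percentile confidence interval, again assuming each metric value is a single scalar; the helper below is not part of the library.

import numpy as np

# Hypothetical helper, not part of the library. 'all_results' is the dict
# returned by evaluate_with_bootstraps.
def bootstrap_confidence_interval(all_results, metric_name, alpha=0.05):
    """(lower, upper) percentile interval for one metric across the resamples."""
    values = np.array([res[metric_name] for res in all_results.values()])
    return (np.percentile(values, 100 * alpha / 2),
            np.percentile(values, 100 * (1 - alpha / 2)))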