Example #1
def run_exp(env,
            policy,
            hp,
            steps,
            dir_name,
            evaluate,
            seed,
            eval_interval,
            log_interval,
            save_interval,
            initial_exploration_steps):
    """Run a single training procedure.

    Parameters
    ----------
    env : str or gym.Env
        the training/testing environment
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy class to use
    hp : dict
        additional algorithm hyper-parameters
    steps : int
        total number of training steps
    dir_name : str
        the location where the results files are meant to be stored
    evaluate : bool
        whether to include an evaluation environment
    seed : int
        specifies the random seed for numpy, tensorflow, and random
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    log_interval : int
        the number of training steps before logging training results
    save_interval : int
        number of simulation steps in the training environment before the model
        is saved
    initial_exploration_steps : int
        number of timesteps that the policy is run before training to
        initialize the replay buffer with samples
    """
    eval_env = env if evaluate else None

    alg = OffPolicyRLAlgorithm(
        policy=policy,
        env=env,
        eval_env=eval_env,
        **hp
    )

    # perform training
    alg.learn(
        total_timesteps=steps,
        log_dir=dir_name,
        log_interval=log_interval,
        eval_interval=eval_interval,
        save_interval=save_interval,
        initial_exploration_steps=initial_exploration_steps,
        seed=seed,
    )
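The snippet below is a minimal, hypothetical call to run_exp; the environment id, hyper-parameter dictionary, and interval values are illustrative placeholders rather than settings from the original experiments.

# Hypothetical usage of run_exp; all values below are placeholders.
from hbaselines.fcnet.td3 import FeedForwardPolicy

run_exp(
    env="MountainCarContinuous-v0",   # any gym environment id or gym.Env
    policy=FeedForwardPolicy,
    hp={},                            # additional algorithm hyper-parameters
    steps=1000000,
    dir_name="results/example",
    evaluate=False,
    seed=1,
    eval_interval=50000,
    log_interval=2000,
    save_interval=50000,
    initial_exploration_steps=10000,
)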
Example #2
def run_exp(env,
            hp,
            steps,
            dir_name,
            evaluate,
            seed,
            eval_interval,
            log_interval,
            save_interval):
    """Run a single training procedure.

    Parameters
    ----------
    env : str or gym.Env
        the training/testing environment
    hp : dict
        additional algorithm hyper-parameters
    steps : int
        total number of training steps
    dir_name : str
        the location where the results files are meant to be stored
    evaluate : bool
        whether to include an evaluation environment
    seed : int
        specifies the random seed for numpy, tensorflow, and random
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    log_interval : int
        the number of training steps before logging training results
    save_interval : int
        number of simulation steps in the training environment before the model
        is saved
    """
    eval_env = env if evaluate else None

    alg = OffPolicyRLAlgorithm(
        policy=FeedForwardPolicy,
        env=env,
        eval_env=eval_env,
        **hp
    )

    # perform training
    alg.learn(
        total_timesteps=steps,
        log_dir=dir_name,
        log_interval=log_interval,
        eval_interval=eval_interval,
        save_interval=save_interval,
        seed=seed,
    )
Example #3
    def test_setup_model_feedforward(self):
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = FeedForwardPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # check the policy_kwargs term
        policy_kwargs = FEEDFORWARD_PARAMS.copy()
        policy_kwargs.update(TD3_PARAMS)
        policy_kwargs['verbose'] = self.init_parameters['verbose']
        self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

        with alg.graph.as_default():
            expected_vars = sorted([var.name for var in get_trainable_vars()])

        # Check that all trainable variables have been created in the
        # TensorFlow graph.
        self.assertListEqual(
            expected_vars,
            ['model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/qf_0/fc0/bias:0',
             'model/qf_0/fc0/kernel:0',
             'model/qf_0/fc1/bias:0',
             'model/qf_0/fc1/kernel:0',
             'model/qf_0/qf_output/bias:0',
             'model/qf_0/qf_output/kernel:0',
             'model/qf_1/fc0/bias:0',
             'model/qf_1/fc0/kernel:0',
             'model/qf_1/fc1/bias:0',
             'model/qf_1/fc1/kernel:0',
             'model/qf_1/qf_output/bias:0',
             'model/qf_1/qf_output/kernel:0',
             'target/pi/fc0/bias:0',
             'target/pi/fc0/kernel:0',
             'target/pi/fc1/bias:0',
             'target/pi/fc1/kernel:0',
             'target/pi/output/bias:0',
             'target/pi/output/kernel:0',
             'target/qf_0/fc0/bias:0',
             'target/qf_0/fc0/kernel:0',
             'target/qf_0/fc1/bias:0',
             'target/qf_0/fc1/kernel:0',
             'target/qf_0/qf_output/bias:0',
             'target/qf_0/qf_output/kernel:0',
             'target/qf_1/fc0/bias:0',
             'target/qf_1/fc0/kernel:0',
             'target/qf_1/fc1/bias:0',
             'target/qf_1/fc1/kernel:0',
             'target/qf_1/qf_output/bias:0',
             'target/qf_1/qf_output/kernel:0']
        )
Example #4
    def test_learn_init(self):
        """Test the non-loop components of the `learn` method."""
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Run the learn operation for zero timesteps.
        alg.learn(0, log_dir='results', initial_exploration_steps=0)
        self.assertEqual(alg.episodes, 0)
        self.assertEqual(alg.total_steps, 0)
        self.assertEqual(alg.epoch, 0)
        self.assertEqual(len(alg.episode_rewards_history), 0)
        self.assertEqual(alg.epoch_episodes, 0)
        self.assertEqual(len(alg.epoch_actions), 0)
        self.assertEqual(len(alg.epoch_q1s), 0)
        self.assertEqual(len(alg.epoch_q2s), 0)
        self.assertEqual(len(alg.epoch_actor_losses), 0)
        self.assertEqual(len(alg.epoch_q1_losses), 0)
        self.assertEqual(len(alg.epoch_q2_losses), 0)
        self.assertEqual(len(alg.epoch_episode_rewards), 0)
        self.assertEqual(len(alg.epoch_episode_steps), 0)
        shutil.rmtree('results')

        # Test the seeds.
        alg.learn(0, log_dir='results', seed=1, initial_exploration_steps=0)
        self.assertEqual(np.random.sample(), 0.417022004702574)
        self.assertEqual(random.uniform(0, 1), 0.13436424411240122)
        shutil.rmtree('results')
Example #5
    def test_evaluate(self):
        """Validate the functionality of the _evaluate method.

        This is done for the following cases:

        1. policy = FeedForwardPolicy
        2. policy = GoalConditionedPolicy
        """
        # Set the random seeds.
        random.seed(0)
        np.random.seed(0)
        tf.compat.v1.set_random_seed(0)

        # =================================================================== #
        # test case 1                                                         #
        # =================================================================== #

        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = FeedForwardPolicy
        policy_params['eval_env'] = 'MountainCarContinuous-v0'
        policy_params['nb_eval_episodes'] = 1
        policy_params['verbose'] = 2
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Run the _evaluate operation.
        ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

        # Test the output from the operation.
        self.assertEqual(len(ep_rewards), 1)
        self.assertEqual(len(ep_successes), 0)
        self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

        # Clear memory.
        del alg

        # =================================================================== #
        # test case 2                                                         #
        # =================================================================== #

        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['eval_env'] = 'MountainCarContinuous-v0'
        policy_params['nb_eval_episodes'] = 1
        policy_params['verbose'] = 2
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Run the _evaluate operation.
        ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

        # Test the output from the operation.
        self.assertEqual(len(ep_rewards), 1)
        self.assertEqual(len(ep_successes), 0)
        self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

        # Clear memory.
        del alg
Example #6
    def test_fingerprints(self):
        """Validate the functionality of the fingerprints.

        When the fingerprint functionality is turned on, the observation within
        the algorithm (stored under self.obs) should always include the
        fingerprint element.

        Policy-specific features of the fingerprint implementation are also
        tested here. This feature should add a fingerprint dimension to the
        observation spaces, but NOT the context space of the lower-level or the
        action space of the higher-level. The intrinsic reward function should
        also be ignoring the fingerprint elements during its computation. The
        fingerprint elements are passed by the algorithm, and tested under
        test_algorithm.py.
        """
        # Create the algorithm.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['nb_rollout_steps'] = 1
        policy_params['policy_kwargs'] = {'use_fingerprints': True}
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Test the observation spaces of the policies, as well as the context
        # space of the lower-level policy and action space of the higher-level
        # policy.
        self.assertTupleEqual(alg.policy_tf.policy[0].ob_space.shape, (4,))
        self.assertTupleEqual(alg.policy_tf.policy[0].ac_space.shape, (2,))
        self.assertTupleEqual(alg.policy_tf.policy[-1].ob_space.shape, (4,))
        self.assertTupleEqual(alg.policy_tf.policy[-1].co_space.shape, (2,))

        # Test intrinsic_reward method within the policy.
        self.assertAlmostEqual(
            alg.policy_tf.intrinsic_reward_fn(
                states=np.array([1, 2, 3]),
                goals=np.array([0, 0]),
                next_states=np.array([1, 2, 3])),
            -np.sqrt(1**2 + 2**2)
        )

        # Validate that observations include the fingerprint elements upon
        # initializing the `learn` procedure and during a step in the
        # `_collect_samples` method.
        alg.learn(1, log_dir='results', log_interval=1,
                  initial_exploration_steps=0)
        self.assertEqual(len(alg.obs[0]), alg.ob_space.shape[0])
        np.testing.assert_almost_equal(
            alg.obs[0][-alg.policy_tf.fingerprint_dim[0]:], np.array([0, 5]))

        # Validate that observations include the fingerprint elements during
        # a reset in the `_collect_samples` method.
        alg.learn(500, log_dir='results', log_interval=500,
                  initial_exploration_steps=0)
        self.assertEqual(len(alg.obs[0]), alg.ob_space.shape[0])
        np.testing.assert_almost_equal(
            alg.obs[0][-alg.policy_tf.fingerprint_dim[0]:],
            np.array([4.99, 0.01]))

        # Delete generated files.
        shutil.rmtree('results')
Example #7
    def test_log_eval(self):
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['_init_setup_model'] = False
        alg = OffPolicyRLAlgorithm(**policy_params)

        # test for one evaluation environment
        rewards = [0, 1, 2]
        successes = [True, False, False]
        info = {"test": 5}
        alg._log_eval(file_path="test_eval.csv",
                      start_time=0,
                      rewards=rewards,
                      successes=successes,
                      info=info)

        # check that the file was generated
        self.assertTrue(os.path.exists('test_eval_0.csv'))

        # import the stored data
        reader = csv.DictReader(open('test_eval_0.csv', 'r'))
        results = {"successes": [], "rewards": [], "test": []}
        for line in reader:
            results["successes"].append(float(line["success_rate"]))
            results["rewards"].append(float(line["average_return"]))
            results["test"].append(float(line["test"]))

        # test that the data matches expected values
        self.assertListEqual(results["rewards"], [1])
        self.assertListEqual(results["successes"], [1 / 3])
        self.assertListEqual(results["test"], [5])

        # Delete generated files.
        os.remove('test_eval_0.csv')

        # test for one evaluation environment with no successes
        successes = []
        alg._log_eval(file_path="test_eval.csv",
                      start_time=0,
                      rewards=rewards,
                      successes=successes,
                      info=info)

        # check that the file was generated
        self.assertTrue(os.path.exists('test_eval_0.csv'))

        # import the stored data
        reader = csv.DictReader(open('test_eval_0.csv', 'r'))
        results = {"successes": []}
        for line in reader:
            results["successes"].append(float(line["success_rate"]))

        # test that the successes are all zero
        self.assertListEqual(results["successes"], [0])

        # Delete generated files.
        os.remove('test_eval_0.csv')
Example #8
    def test_learn_initial_exploration_steps(self):
        """Test the initial_exploration_steps parameter in the learn method.

        This is done for the following cases:

        1. initial_exploration_steps = 0
        2. initial_exploration_steps = 100
        """
        # =================================================================== #
        # test case 1                                                         #
        # =================================================================== #

        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = FeedForwardPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Run the learn operation for zero exploration steps.
        alg.learn(0, log_dir='results', initial_exploration_steps=0)

        # Check the size of the replay buffer
        self.assertEqual(len(alg.policy_tf.replay_buffer), 1)

        # Clear memory.
        del alg
        shutil.rmtree('results')

        # =================================================================== #
        # test case 2                                                         #
        # =================================================================== #

        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = FeedForwardPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Run the learn operation with 100 initial exploration steps.
        alg.learn(0, log_dir='results', initial_exploration_steps=100)

        # Check the size of the replay buffer
        self.assertEqual(len(alg.policy_tf.replay_buffer), 100)

        # Clear memory.
        del alg
        shutil.rmtree('results')
Example #9
    def test_init(self):
        """Ensure that the parameters at init are as expected."""
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['_init_setup_model'] = False
        alg = OffPolicyRLAlgorithm(**policy_params)

        # Test the attribute values.
        self.assertEqual(alg.policy, self.init_parameters['policy'])
        self.assertEqual(alg.eval_env, self.init_parameters['eval_env'])
        self.assertEqual(alg.nb_train_steps,
                         self.init_parameters['nb_train_steps'])
        self.assertEqual(alg.nb_rollout_steps,
                         self.init_parameters['nb_rollout_steps'])
        self.assertEqual(alg.nb_eval_episodes,
                         self.init_parameters['nb_eval_episodes'])
        self.assertEqual(alg.reward_scale,
                         self.init_parameters['reward_scale'])
        self.assertEqual(alg.render, self.init_parameters['render'])
        self.assertEqual(alg.render_eval, self.init_parameters['render_eval'])
        self.assertEqual(alg.verbose, self.init_parameters['verbose'])
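For reference, a hypothetical self.init_parameters dictionary consistent with the attributes checked above might look as follows; the concrete values used by the actual test suite may differ.

# Assumed shape of the init_parameters dictionary; values are placeholders.
init_parameters = {
    'policy': None,                      # set per test case
    'env': 'MountainCarContinuous-v0',   # assumed environment id
    'eval_env': None,
    'nb_train_steps': 1,
    'nb_rollout_steps': 1,
    'nb_eval_episodes': 50,
    'reward_scale': 1.0,
    'render': False,
    'render_eval': False,
    'verbose': 0,
    'num_envs': 1,
    '_init_setup_model': True,
}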
Example #10
    def test_setup_model_goal_conditioned(self):
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # check the policy_kwargs term
        policy_kwargs = GOAL_CONDITIONED_PARAMS.copy()
        policy_kwargs.update(TD3_PARAMS)
        policy_kwargs['verbose'] = self.init_parameters['verbose']
        policy_kwargs['env_name'] = self.init_parameters['env']
        policy_kwargs['num_envs'] = self.init_parameters['num_envs']
        self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

        with alg.graph.as_default():
            expected_vars = sorted([var.name for var in get_trainable_vars()])

        # Check that all trainable variables have been created in the
        # TensorFlow graph.
        self.assertListEqual(
            expected_vars,
            ['level_0/model/pi/fc0/bias:0',
             'level_0/model/pi/fc0/kernel:0',
             'level_0/model/pi/fc1/bias:0',
             'level_0/model/pi/fc1/kernel:0',
             'level_0/model/pi/output/bias:0',
             'level_0/model/pi/output/kernel:0',
             'level_0/model/qf_0/fc0/bias:0',
             'level_0/model/qf_0/fc0/kernel:0',
             'level_0/model/qf_0/fc1/bias:0',
             'level_0/model/qf_0/fc1/kernel:0',
             'level_0/model/qf_0/qf_output/bias:0',
             'level_0/model/qf_0/qf_output/kernel:0',
             'level_0/model/qf_1/fc0/bias:0',
             'level_0/model/qf_1/fc0/kernel:0',
             'level_0/model/qf_1/fc1/bias:0',
             'level_0/model/qf_1/fc1/kernel:0',
             'level_0/model/qf_1/qf_output/bias:0',
             'level_0/model/qf_1/qf_output/kernel:0',
             'level_0/target/pi/fc0/bias:0',
             'level_0/target/pi/fc0/kernel:0',
             'level_0/target/pi/fc1/bias:0',
             'level_0/target/pi/fc1/kernel:0',
             'level_0/target/pi/output/bias:0',
             'level_0/target/pi/output/kernel:0',
             'level_0/target/qf_0/fc0/bias:0',
             'level_0/target/qf_0/fc0/kernel:0',
             'level_0/target/qf_0/fc1/bias:0',
             'level_0/target/qf_0/fc1/kernel:0',
             'level_0/target/qf_0/qf_output/bias:0',
             'level_0/target/qf_0/qf_output/kernel:0',
             'level_0/target/qf_1/fc0/bias:0',
             'level_0/target/qf_1/fc0/kernel:0',
             'level_0/target/qf_1/fc1/bias:0',
             'level_0/target/qf_1/fc1/kernel:0',
             'level_0/target/qf_1/qf_output/bias:0',
             'level_0/target/qf_1/qf_output/kernel:0',
             'level_1/model/pi/fc0/bias:0',
             'level_1/model/pi/fc0/kernel:0',
             'level_1/model/pi/fc1/bias:0',
             'level_1/model/pi/fc1/kernel:0',
             'level_1/model/pi/output/bias:0',
             'level_1/model/pi/output/kernel:0',
             'level_1/model/qf_0/fc0/bias:0',
             'level_1/model/qf_0/fc0/kernel:0',
             'level_1/model/qf_0/fc1/bias:0',
             'level_1/model/qf_0/fc1/kernel:0',
             'level_1/model/qf_0/qf_output/bias:0',
             'level_1/model/qf_0/qf_output/kernel:0',
             'level_1/model/qf_1/fc0/bias:0',
             'level_1/model/qf_1/fc0/kernel:0',
             'level_1/model/qf_1/fc1/bias:0',
             'level_1/model/qf_1/fc1/kernel:0',
             'level_1/model/qf_1/qf_output/bias:0',
             'level_1/model/qf_1/qf_output/kernel:0',
             'level_1/target/pi/fc0/bias:0',
             'level_1/target/pi/fc0/kernel:0',
             'level_1/target/pi/fc1/bias:0',
             'level_1/target/pi/fc1/kernel:0',
             'level_1/target/pi/output/bias:0',
             'level_1/target/pi/output/kernel:0',
             'level_1/target/qf_0/fc0/bias:0',
             'level_1/target/qf_0/fc0/kernel:0',
             'level_1/target/qf_0/fc1/bias:0',
             'level_1/target/qf_0/fc1/kernel:0',
             'level_1/target/qf_0/qf_output/bias:0',
             'level_1/target/qf_0/qf_output/kernel:0',
             'level_1/target/qf_1/fc0/bias:0',
             'level_1/target/qf_1/fc0/kernel:0',
             'level_1/target/qf_1/fc1/bias:0',
             'level_1/target/qf_1/fc1/kernel:0',
             'level_1/target/qf_1/qf_output/bias:0',
             'level_1/target/qf_1/qf_output/kernel:0']
        )
Example #11
def train_h_baselines(env_name, args, multiagent):
    """Train policies using SAC and TD3 with h-baselines."""
    from hbaselines.algorithms import OffPolicyRLAlgorithm
    from hbaselines.utils.train import parse_options, get_hyperparameters

    # Get the command-line arguments that are relevant here
    args = parse_options(description="", example_usage="", args=args)

    # the base directory that the logged data will be stored in
    base_dir = "training_data"

    for i in range(args.n_training):
        # value of the next seed
        seed = args.seed + i

        # The time when the current experiment started.
        now = strftime("%Y-%m-%d-%H:%M:%S")

        # Create a save directory (if it doesn't exist).
        dir_name = os.path.join(base_dir, '{}/{}'.format(args.env_name, now))
        ensure_dir(dir_name)

        # Get the policy class.
        if args.alg == "TD3":
            if multiagent:
                from hbaselines.multi_fcnet.td3 import MultiFeedForwardPolicy
                policy = MultiFeedForwardPolicy
            else:
                from hbaselines.fcnet.td3 import FeedForwardPolicy
                policy = FeedForwardPolicy
        elif args.alg == "SAC":
            if multiagent:
                from hbaselines.multi_fcnet.sac import MultiFeedForwardPolicy
                policy = MultiFeedForwardPolicy
            else:
                from hbaselines.fcnet.sac import FeedForwardPolicy
                policy = FeedForwardPolicy
        else:
            raise ValueError("Unknown algorithm: {}".format(args.alg))

        # Get the hyperparameters.
        hp = get_hyperparameters(args, policy)

        # Add the seed for logging purposes.
        params_with_extra = hp.copy()
        params_with_extra['seed'] = seed
        params_with_extra['env_name'] = args.env_name
        params_with_extra['policy_name'] = policy.__name__
        params_with_extra['algorithm'] = args.alg
        params_with_extra['date/time'] = now

        # Save the hyperparameters to the results folder.
        with open(os.path.join(dir_name, 'hyperparameters.json'), 'w') as f:
            json.dump(params_with_extra, f, sort_keys=True, indent=4)

        # Create the algorithm object.
        alg = OffPolicyRLAlgorithm(
            policy=policy,
            env="flow:{}".format(env_name),
            eval_env="flow:{}".format(env_name) if args.evaluate else None,
            **hp)

        # Perform training.
        alg.learn(
            total_steps=args.total_steps,
            log_dir=dir_name,
            log_interval=args.log_interval,
            eval_interval=args.eval_interval,
            save_interval=args.save_interval,
            initial_exploration_steps=args.initial_exploration_steps,
            seed=seed,
        )
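A hypothetical way to invoke train_h_baselines is sketched below; the environment name is an assumed placeholder, and the command-line flags are defined by hbaselines.utils.train.parse_options and are not reproduced here.

import sys

if __name__ == "__main__":
    train_h_baselines(
        env_name="ring",      # assumed Flow network name, for illustration
        args=sys.argv[1:],    # CLI flags forwarded to parse_options
        multiagent=False,
    )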
Example #12
def main(args):
    """Replay a trained policy and report episode returns."""
    flags = parse_options(args)

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['render'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the eval environment in
    # this object to perform the rollout.
    alg = OffPolicyRLAlgorithm(policy=policy,
                               env=env_name,
                               eval_env=env_name,
                               **hp)

    # setup the seed value
    random.seed(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # path to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.eval_env

    # Perform the evaluation procedure.
    episode_rewards = []

    for episode_num in range(flags.num_rollouts):
        # Run a rollout.
        obs = env.reset()
        total_reward = 0
        while True:
            context = [env.current_context] \
                if hasattr(env, "current_context") else None
            action = policy.get_action(
                np.asarray([obs]),
                context=context,
                apply_noise=False,
                random_actions=False,
            )
            obs, reward, done, _ = env.step(action)
            if not flags.no_render:
                env.render()
            total_reward += reward
            if done:
                break

        # Print total returns from a given episode.
        episode_rewards.append(total_reward)
        print("Round {}, return: {}".format(episode_num, total_reward))

    # Print total statistics.
    print("Average, std return: {}, {}".format(np.mean(episode_rewards),
                                               np.std(episode_rewards)))
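A minimal, assumed entry point for this replay script is shown below; the original module may wire up main differently.

import sys

if __name__ == "__main__":
    main(sys.argv[1:])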
Example #13
def main(args):
    """Replay a trained policy and report episode returns."""
    flags = parse_options(args)

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['render'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the environment within the
    # sampler of this object to perform the rollout.
    alg = OffPolicyRLAlgorithm(policy=policy, env=env_name, **hp)

    # setup the seed value
    if not flags.random_seed:
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # path to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.sampler.env

    # Perform the evaluation procedure.
    episode_rewards = []

    # Add an emission path to Flow environments.
    if env_name in FLOW_ENV_NAMES:
        sim_params = deepcopy(env.wrapped_env.sim_params)
        sim_params.emission_path = "./flow_results"
        env.wrapped_env.restart_simulation(sim_params,
                                           render=not flags.no_render)

    for episode_num in range(flags.num_rollouts):
        # Run a rollout.
        obs = env.reset()
        total_reward = 0
        while True:
            context = [env.current_context] \
                if hasattr(env, "current_context") else None
            action = policy.get_action(
                np.asarray([obs]),
                context=context,
                apply_noise=False,
                random_actions=False,
            )
            obs, reward, done, _ = env.step(action[0])
            if not flags.no_render:
                env.render()
            total_reward += reward
            if done:
                break

        # Print total returns from a given episode.
        episode_rewards.append(total_reward)
        print("Round {}, return: {}".format(episode_num, total_reward))

    # Print total statistics.
    print("Average, std return: {}, {}".format(np.mean(episode_rewards),
                                               np.std(episode_rewards)))

    if env_name in FLOW_ENV_NAMES:
        # wait a short period of time to ensure the xml file is readable
        time.sleep(0.1)

        # collect the location of the emission file
        dir_path = env.wrapped_env.sim_params.emission_path
        emission_filename = "{0}-emission.xml".format(
            env.wrapped_env.network.name)
        emission_path = os.path.join(dir_path, emission_filename)

        # convert the emission file into a csv
        emission_to_csv(emission_path)

        # Delete the .xml version of the emission file.
        os.remove(emission_path)
Example #14
def main(args):
    """Replay a trained policy, optionally rendering or saving videos."""
    flags = parse_options(args)

    # Run assertions.
    assert not (flags.no_render and flags.save_video), \
        "If saving the rendering, no_render cannot be set to True."

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['num_envs'] = 1
    hp['render_eval'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the eval environment in
    # this object to perform the rollout.
    alg = OffPolicyRLAlgorithm(policy=policy,
                               env=env_name,
                               eval_env=env_name,
                               **hp)

    # setup the seed value
    if not flags.random_seed:
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # path to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.eval_env

    # Perform the evaluation procedure.
    episode_rewards = []

    # Add an emission path to Flow environments.
    if env_name in FLOW_ENV_NAMES:
        sim_params = deepcopy(env.wrapped_env.sim_params)
        sim_params.emission_path = "./flow_results"
        env.wrapped_env.restart_simulation(sim_params,
                                           render=not flags.no_render)

    if not isinstance(env, list):
        env_list = [env]
    else:
        env_list = env

    for env_num, env in enumerate(env_list):
        for episode_num in range(flags.num_rollouts):
            if not flags.no_render and env_name not in FLOW_ENV_NAMES:
                out = FFmpegWriter("{}_{}_{}.mp4".format(
                    flags.video, env_num, episode_num))
            else:
                out = None

            obs, total_reward = env.reset(), 0

            while True:
                context = [env.current_context] \
                    if hasattr(env, "current_context") else None

                action = policy.get_action(
                    obs=np.asarray([obs]),
                    context=context,
                    apply_noise=False,
                    random_actions=False,
                )

                # Visualize the sub-goals of the hierarchical policy.
                if hasattr(policy, "_meta_action") \
                        and policy._meta_action is not None \
                        and hasattr(env, "set_goal"):
                    goal = policy._meta_action[0][0] + (
                        obs[policy.goal_indices]
                        if policy.relative_goals else 0)
                    env.set_goal(goal)

                new_obs, reward, done, _ = env.step(action[0])
                if not flags.no_render:
                    if flags.save_video:
                        if alg.env_name == "AntGather":
                            out.writeFrame(env.render(mode='rgb_array'))
                        else:
                            out.writeFrame(
                                env.render(mode='rgb_array',
                                           height=1024,
                                           width=1024))
                    else:
                        env.render()
                total_reward += reward
                if done:
                    break

                policy.store_transition(
                    obs0=obs,
                    context0=context[0] if context is not None else None,
                    action=action[0],
                    reward=reward,
                    obs1=new_obs,
                    context1=context[0] if context is not None else None,
                    done=done,
                    is_final_step=done,
                    evaluate=True,
                )

                obs = new_obs

            # Print total returns from a given episode.
            episode_rewards.append(total_reward)
            print("Round {}, return: {}".format(episode_num, total_reward))

            # Save the video.
            if not flags.no_render and env_name not in FLOW_ENV_NAMES \
                    and flags.save_video:
                out.close()

    # Print total statistics.
    print("Average, std return: {}, {}".format(np.mean(episode_rewards),
                                               np.std(episode_rewards)))
Example #15
def main(args):
    """Measure and plot policy non-stationarity across training checkpoints."""
    flags = parse_options(args)

    data = {
        'name': [],
        'step': [],
        'distance': [],
    }

    for dir_name, name in zip(flags.dir_name, flags.name):

        # get the hyperparameters
        env_name, policy, hp, seed = get_hyperparameters_from_dir(dir_name)

        print(hp.keys())
        del hp['algorithm']
        del hp['date/time']

        # create the algorithm object. We will be using the eval environment in
        # this object to perform the rollout.
        alg = OffPolicyRLAlgorithm(
            policy=policy, env=env_name, eval_env=env_name, **hp)

        # setup the seed value
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

        filenames = os.listdir(os.path.join(dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = list(sorted([int(f.split("-")[-1]) for f in metafiles]))[:-1]

        # get the checkpoint number
        ckpt_num = max(metanum)

        # path to the checkpoint
        ckpt = os.path.join(dir_name, "checkpoints/itr-{}".format(ckpt_num))

        # restore the previous checkpoint
        alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
        alg.load(ckpt)

        # some variables that will be needed when replaying the rollout
        policy = alg.policy_tf

        batches = []
        for b in range(flags.num_batches):
            worker_obs0 = policy.replay_buffer.sample(with_additional=False)[5]
            batches.append(worker_obs0)

        for ckpt_num_one, ckpt_num_two in zip(metanum[1:], metanum[:-1]):

            # get the checkpoint number
            ckpt_num = ckpt_num_one

            # path to the checkpoint
            ckpt = os.path.join(dir_name, "checkpoints/itr-{}".format(ckpt_num))

            # restore the previous checkpoint
            alg.load(ckpt)

            # some variables that will be needed when replaying the rollout
            policy = alg.policy_tf

            mean_one = []
            for b in batches:
                a = policy.policy[-1].get_action(b, None, False, False)
                mean_one.append(a)

            # get the checkpoint number
            ckpt_num = ckpt_num_two

            # path to the checkpoint
            ckpt = os.path.join(dir_name, "checkpoints/itr-{}".format(ckpt_num))

            # restore the previous checkpoint
            alg.load(ckpt)

            # some variables that will be needed when replaying the rollout
            policy = alg.policy_tf

            mean_two = []
            for b in batches:
                a = policy.policy[-1].get_action(b, None, False, False)
                mean_two.append(a)

            # compute a distance metric between the policies
            mean_one = np.concatenate(mean_one, axis=0)
            mean_two = np.concatenate(mean_two, axis=0)
            kl = np.sum((mean_one - mean_two) ** 2, axis=1).mean()
            print("{},{},{},{}".format(name, ckpt_num_one, ckpt_num_two, kl))

            data['name'].append(name)
            data['step'].append(ckpt_num_one)
            data['distance'].append(kl)

    df = pd.DataFrame(data, columns=['name', 'step', 'distance'])
    plt.title("Manager MDP Non-Stationarity")
    ax = sns.lineplot(x='step', y='distance', hue='name', data=df)
    plt.savefig('ns.png')