Exemplo n.º 1
0
    def test_ddpg_trainer(self):
        """Train a DDPG trainer on gridworld samples for several epochs.

        The critic predictor is evaluated at the start of every epoch and
        once more after training. The final error is printed, not asserted.
        """
        env = GridworldContinuous()
        samples = env.generate_samples(200000, 1.0)
        epochs = 3
        trainer = DDPGTrainer(
            self.get_ddpg_parameters(),
            env.normalization,
            env.normalization_action,
        )
        evaluator = GridworldDDPGEvaluator(env, True)
        pages = env.preprocess_samples(samples, self.minibatch_size)

        for epoch_number in range(1, epochs + 1):
            print("On epoch {} of {}".format(epoch_number, epochs))
            evaluator.evaluate_critic(trainer.predictor())
            for page in pages:
                states = page.states
                actions = page.actions
                rewards = page.rewards.flatten()
                next_states = page.next_states
                done = 1 - page.not_terminals.flatten()
                # One time-diff entry of 1 per state in the page.
                time_diff = [1] * len(page.states)
                trainer.train([
                    states,
                    actions,
                    rewards,
                    next_states,
                    None,
                    done,
                    None,
                    None,
                    time_diff,
                ])

        final_error = evaluator.evaluate_critic(trainer.predictor())
        print("gridworld MAE: {0:.3f}".format(final_error))
Exemplo n.º 2
0
    def test_ddpg_trainer(self):
        """Train DDPG once over gridworld samples and exercise both predictors.

        The critic predictor is evaluated before and after training, and the
        actor predictor is evaluated after training. The final critic error
        is printed rather than asserted.
        """
        env = GridworldContinuous()
        samples = env.generate_samples(500000, 0.25)
        trainer = DDPGTrainer(
            self.get_ddpg_parameters(),
            env.normalization,
            env.normalization_action,
            env.min_action_range,
            env.max_action_range,
        )
        evaluator = GridworldDDPGEvaluator(
            env, True, DISCOUNT, False, samples
        )
        pages = env.preprocess_samples(samples, self.minibatch_size)

        # Critic evaluation before any training step.
        evaluator.evaluate_critic(trainer.predictor(actor=False))
        for page in pages:
            # Rewards and not_terminals are flattened in place before training.
            page.rewards = page.rewards.flatten()
            page.not_terminals = page.not_terminals.flatten()
            trainer.train(page)

        # Make sure the actor predictor works
        evaluator.evaluate_actor(trainer.predictor(actor=True))

        # Evaluate the critic predictor for correctness
        final_error = evaluator.evaluate_critic(trainer.predictor(actor=False))
        print("gridworld MAE: {0:.3f}".format(final_error))
    def test_evaluator_ground_truth(self):
        """Check evaluator losses when reward timelines hold the true values.

        The generated reward timelines are discarded and replaced with
        ``{0: true_value}`` entries computed by the environment.
        """
        env = GridworldContinuous()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            _,
        ) = env.generate_samples(100000, 1.0)
        ground_truth = env.true_values_for_sample(states, actions, False)
        # Hijack the reward timeline to insert the ground truth
        reward_timelines = [{0: value} for value in ground_truth]
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(trainer, DISCOUNT)
        pages = env.preprocess_samples(
            states, actions, rewards, next_states, next_actions,
            is_terminal, possible_next_actions, reward_timelines,
            self.minibatch_size,
        )

        for page in pages:
            trainer.train_numpy(page, evaluator)

        # Only the most recent loss entries are checked.
        self.assertLess(evaluator.td_loss[-1], 0.05)
        self.assertLess(evaluator.mc_loss[-1], 0.12)
    def test_trainer_sarsa(self):
        """Verify the evaluated error drops after one pass of SARSA training.

        The error must exceed 0.15 before training and fall below 0.05
        afterwards.
        """
        env = GridworldContinuous()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(env)
        predictor = trainer.predictor()
        evaluator = GridworldContinuousEvaluator(env, False)
        pages = env.preprocess_samples(
            states, actions, rewards, next_states, next_actions,
            is_terminal, possible_next_actions, reward_timelines,
            self.minibatch_size,
        )

        error_before = evaluator.evaluate(predictor)
        self.assertGreater(error_before, 0.15)

        for page in pages:
            trainer.train_numpy(page, None)
        evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.05)
    def test_trainer_maxq(self):
        """Train a max-Q continuous-action DQN trainer and bound its error.

        Training and KNN settings are reused from ``get_sarsa_parameters``;
        only the RL parameters are replaced (``maxq_learning=True``). The
        error must exceed 0.2 before training and be below 0.15 after two
        passes over the data.
        """
        env = GridworldContinuous()
        base_parameters = self.get_sarsa_parameters()
        maxq_rl = RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        )
        maxq_parameters = ContinuousActionModelParameters(
            rl=maxq_rl,
            training=base_parameters.training,
            knn=base_parameters.knn,
        )
        maxq_trainer = ContinuousActionDQNTrainer(
            maxq_parameters,
            env.normalization,
            env.normalization_action,
        )

        samples = env.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        pages = env.preprocess_samples(samples, self.minibatch_size)
        evaluator = GridworldContinuousEvaluator(env, True)
        error_before = evaluator.evaluate(predictor)
        self.assertGreater(error_before, 0.2)

        # Two full passes over the preprocessed pages, evaluating after each.
        for _ in range(2):
            for page in pages:
                maxq_trainer.train_numpy(page, None)
            evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.15)
Exemplo n.º 6
0
 def test_gridworld_continuous_generate_samples(self):
     """Generate multi-step samples and validate them with ``_check_samples``."""
     environment = GridworldContinuous()
     sample_count = 1000
     step_count = 5
     generated = environment.generate_samples(
         sample_count,
         epsilon=1.0,
         discount_factor=0.9,
         multi_steps=step_count,
     )
     self._check_samples(generated, sample_count, step_count, True)
    def test_evaluator_timeline(self):
        """Train the SARSA trainer with an Evaluator and bound its last losses."""
        env = GridworldContinuous()
        samples = env.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(1)

        for page in env.preprocess_samples(samples, self.minibatch_size):
            trainer.train_numpy(page, evaluator)

        # Only the most recent loss entries are checked.
        last_td_loss = evaluator.td_loss[-1]
        self.assertLess(last_td_loss, 0.2)
        last_mc_loss = evaluator.mc_loss[-1]
        self.assertLess(last_mc_loss, 0.2)
Exemplo n.º 8
0
 def test_gridworld_continuous_generate_samples(self):
     """Generate multi-step samples, including shorter ones, and validate them.

     Shorter samples are requested at both the start and the end via the
     ``include_shorter_samples_at_*`` flags.
     """
     environment = GridworldContinuous()
     sample_count = 1000
     step_count = 5
     generated = environment.generate_samples(
         sample_count,
         epsilon=1.0,
         discount_factor=0.9,
         multi_steps=step_count,
         include_shorter_samples_at_start=True,
         include_shorter_samples_at_end=True,
     )
     self._check_samples(generated, sample_count, step_count, True)
Exemplo n.º 9
0
    def test_evaluator_ground_truth(self):
        """Check the evaluator's MC loss when episode values are the true values."""
        env = GridworldContinuous()
        samples = env.generate_samples(500000, 1.0, DISCOUNT)
        # Hijack the episode values to insert the ground truth
        ground_truth = env.true_values_for_sample(
            samples.states, samples.actions, False
        )
        samples.episode_values = ground_truth
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(None, 10, DISCOUNT, None, None)

        for page in env.preprocess_samples(samples, self.minibatch_size):
            trainer.train(page, evaluator)

        # Only the most recent MC loss entry is checked.
        last_mc_loss = evaluator.mc_loss[-1]
        self.assertLess(last_mc_loss, 0.15)
Exemplo n.º 10
0
    def test_trainer_sarsa(self):
        """Train the SARSA trainer once over the data and bound the error."""
        env = GridworldContinuous()
        samples = env.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(env)
        predictor = trainer.predictor()
        evaluator = GridworldContinuousEvaluator(
            env, False, DISCOUNT, False, samples
        )

        for page in env.preprocess_samples(samples, self.minibatch_size):
            trainer.train_numpy(page, None)
        evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.15)
Exemplo n.º 11
0
 def generate_samples(self, num_transitions, epsilon,
                      discount_factor) -> Samples:
     """Generate samples whose states are re-encoded as ``{0: float(key)}``.

     Delegates to ``GridworldContinuous.generate_samples`` and rewrites both
     ``states`` and ``next_states``; every other field is copied through.
     Each rewritten state keeps only the first key of the original state
     dict (presumably the active cell of a one-hot encoding — confirm).
     """
     base = GridworldContinuous.generate_samples(
         self, num_transitions, epsilon, discount_factor
     )

     def _first_key_as_feature(state_dicts):
         # Map each state dict to a single feature 0 holding its first key.
         return [{0: float(list(entry.keys())[0])} for entry in state_dicts]

     enum_states = _first_key_as_feature(base.states)
     enum_next_states = _first_key_as_feature(base.next_states)
     return Samples(
         mdp_ids=base.mdp_ids,
         sequence_numbers=base.sequence_numbers,
         states=enum_states,
         actions=base.actions,
         action_probabilities=base.action_probabilities,
         rewards=base.rewards,
         possible_actions=base.possible_actions,
         next_states=enum_next_states,
         next_actions=base.next_actions,
         terminals=base.terminals,
         possible_next_actions=base.possible_next_actions,
         episode_values=base.episode_values,
     )
 def true_values_for_sample(self, enum_states, actions, assume_optimal_policy: bool):
     """Return true values for states given in the single-feature encoding.

     Each entry of ``enum_states`` is turned into ``{int(first_value): 1}``
     before delegating to ``GridworldContinuous.true_values_for_sample``.
     """
     sparse_states = [
         {int(list(entry.values())[0]): 1} for entry in enum_states
     ]
     return GridworldContinuous.true_values_for_sample(
         self, sparse_states, actions, assume_optimal_policy
     )
Exemplo n.º 13
0
    def _test_trainer_sarsa(self,
                            use_gpu=False,
                            use_all_avail_gpus=False,
                            modular=False):
        """Build a SARSA trainer/exporter pair and run the gridworld evaluation.

        Args:
            use_gpu: Forwarded to the trainer builder and ``evaluate_gridworld``.
            use_all_avail_gpus: Forwarded to the trainer builder.
            modular: When true, use ``get_modular_sarsa_trainer_exporter``
                instead of ``get_sarsa_trainer_exporter``.
        """
        env = GridworldContinuous()
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
            use_int_features=False,
        )

        if modular:
            # FIXME: the exporter should make a copy of the model; moving it to CPU inplace
            if use_gpu:
                self.run_pre_training_eval = False
            if use_all_avail_gpus:
                self.tolerance_threshold = 0.11
            build_pair = self.get_modular_sarsa_trainer_exporter
        else:
            build_pair = self.get_sarsa_trainer_exporter
        trainer, exporter = build_pair(
            env, None, use_gpu, use_all_avail_gpus
        )

        self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
Exemplo n.º 14
0
    def _test_sac_trainer(self, use_2_q_functions=False, use_gpu=False):
        """Run the gridworld evaluation for a SAC trainer and check its actor.

        Args:
            use_2_q_functions: Forwarded to ``get_sac_parameters``.
            use_gpu: Forwarded to the trainer builder and the evaluation;
                when true, ``run_pre_training_eval`` is switched off.
        """
        env = GridworldContinuous()
        sac_parameters = self.get_sac_parameters(use_2_q_functions)
        trainer = self.get_sac_trainer(env, sac_parameters, use_gpu)
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
            use_int_features=False,
        )

        exporter = self.get_critic_exporter(trainer, env)

        self.tolerance_threshold = 0.2
        if use_gpu:
            self.run_pre_training_eval = False
        self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)

        # Make sure the actor predictor works
        actor_predictor = self.get_actor_predictor(trainer, env)
        # Just test that it doesn't blow up
        predictions = actor_predictor.predict(evaluator.logged_states, None)
        self._test_save_load_actor(
            predictions, actor_predictor, evaluator.logged_states
        )
 def generate_samples(self,
                      num_transitions,
                      epsilon,
                      with_possible=True) -> Samples:
     """Generate samples whose states are re-encoded as ``{0: float(key)}``.

     Delegates to ``GridworldContinuous.generate_samples`` and rewrites both
     ``states`` and ``next_states``; every other field is copied through.
     Each rewritten state keeps only the first key of the original state
     dict (presumably the active cell of a one-hot encoding — confirm).
     """
     base = GridworldContinuous.generate_samples(
         self, num_transitions, epsilon, with_possible
     )

     def _first_key_as_feature(state_dicts):
         # Map each state dict to a single feature 0 holding its first key.
         return [{0: float(list(entry.keys())[0])} for entry in state_dicts]

     enum_states = _first_key_as_feature(base.states)
     enum_next_states = _first_key_as_feature(base.next_states)
     return Samples(
         mdp_ids=base.mdp_ids,
         sequence_numbers=base.sequence_numbers,
         states=enum_states,
         actions=base.actions,
         propensities=base.propensities,
         rewards=base.rewards,
         next_states=enum_next_states,
         next_actions=base.next_actions,
         terminals=base.terminals,
         possible_next_actions=base.possible_next_actions,
         reward_timelines=base.reward_timelines,
     )
Exemplo n.º 16
0
    def test_trainer_sarsa_factorized(self):
        """Train a SARSA trainer with factorized parameters and bound the error."""
        env = GridworldContinuous()
        samples = env.generate_samples(500000, 1.0, DISCOUNT)
        factorized_parameters = self.get_sarsa_parameters_factorized()
        trainer = self.get_sarsa_trainer(env, factorized_parameters)
        # A predictor is built before training; it is replaced by a fresh
        # one once training has finished.
        predictor = trainer.predictor()
        evaluator = GridworldContinuousEvaluator(
            env, False, DISCOUNT, False, samples
        )

        for page in env.preprocess_samples(samples, self.minibatch_size):
            trainer.train(page)

        predictor = trainer.predictor()
        evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.15)
Exemplo n.º 17
0
    def test_trainer_sarsa(self):
        """Train the SARSA trainer on flattened pages and bound the error."""
        env = GridworldContinuous()
        samples = env.generate_samples(150000, 1.0)
        trainer = self.get_sarsa_trainer(env)
        # A predictor is built before training; it is replaced by a fresh
        # one once training has finished.
        predictor = trainer.predictor()
        evaluator = GridworldContinuousEvaluator(
            env, False, DISCOUNT, False, samples
        )

        for page in env.preprocess_samples(samples, self.minibatch_size):
            # Rewards and not_terminals are flattened in place before training.
            page.rewards = page.rewards.flatten()
            page.not_terminals = page.not_terminals.flatten()
            trainer.train(page)

        predictor = trainer.predictor()
        evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.15)
Exemplo n.º 18
0
    def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
        """Build a trainer via ``get_trainer`` and run the gridworld evaluation.

        Args:
            use_gpu: Forwarded to ``get_trainer`` and ``evaluate_gridworld``.
            use_all_avail_gpus: Forwarded to ``get_trainer``.
        """
        env = GridworldContinuous()
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
        )

        trainer = self.get_trainer(env, None, use_gpu, use_all_avail_gpus)

        self.evaluate_gridworld(env, evaluator, trainer, use_gpu)
Exemplo n.º 19
0
    def test_evaluator_ground_truth(self):
        """Check evaluator losses when reward timelines hold the true values."""
        env = GridworldContinuous()
        samples = env.generate_samples(100000, 1.0)
        ground_truth = env.true_values_for_sample(
            samples.states, samples.actions, False
        )
        # Hijack the reward timeline to insert the ground truth
        samples.reward_timelines = [{0: value} for value in ground_truth]
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(1)

        for page in env.preprocess_samples(samples, self.minibatch_size):
            trainer.train_numpy(page, evaluator)

        # Only the most recent loss entries are checked.
        last_td_loss = evaluator.td_loss[-1]
        self.assertLess(last_td_loss, 0.05)
        last_mc_loss = evaluator.mc_loss[-1]
        self.assertLess(last_mc_loss, 0.12)
Exemplo n.º 20
0
    def test_evaluator_ground_truth(self):
        """Check the evaluator's MC loss when reward timelines hold true values."""
        env = GridworldContinuous()
        samples = env.generate_samples(200000, 1.0)
        ground_truth = env.true_values_for_sample(
            samples.states, samples.actions, False
        )
        # Hijack the reward timeline to insert the ground truth
        samples.reward_timelines = [{0: value} for value in ground_truth]
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(None, 10, DISCOUNT)

        for page in env.preprocess_samples(samples, self.minibatch_size):
            # Rewards and not_terminals are flattened in place before training.
            page.rewards = page.rewards.flatten()
            page.not_terminals = page.not_terminals.flatten()
            trainer.train(page, evaluator)

        # Only the most recent MC loss entry is checked.
        last_mc_loss = evaluator.mc_loss[-1]
        self.assertLess(last_mc_loss, 0.15)
Exemplo n.º 21
0
    def test_trainer_maxq(self):
        """Stream pages through a max-Q DQN trainer and bound its error.

        Training and KNN settings are reused from ``get_sarsa_parameters``;
        only the RL parameters are replaced (``maxq_learning=True``). The
        error must exceed 0.4 before training and be below 0.1 after two
        passes over the data.
        """
        env = GridworldContinuous()
        base_parameters = self.get_sarsa_parameters()
        maxq_rl = RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        )
        maxq_parameters = ContinuousActionModelParameters(
            rl=maxq_rl,
            training=base_parameters.training,
            knn=base_parameters.knn,
        )
        maxq_trainer = ContinuousActionDQNTrainer(
            maxq_parameters,
            env.normalization,
            env.normalization_action,
        )

        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        pages = env.preprocess_samples(
            states, actions, rewards, next_states, next_actions,
            is_terminal, possible_next_actions, reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldContinuousEvaluator(env, True)
        error_before = evaluator.evaluate(predictor)
        self.assertGreater(error_before, 0.4)

        # Two full passes over the preprocessed pages, evaluating after each.
        for _ in range(2):
            for page in pages:
                maxq_trainer.stream_tdp(page)
            evaluator.evaluate(predictor)

        error_after = evaluator.evaluate(predictor)
        self.assertLess(error_after, 0.1)
Exemplo n.º 22
0
 def _test_trainer_sarsa_factorized(self, use_gpu=False, use_all_avail_gpus=False):
     """Run the gridworld evaluation for a factorized-parameter SARSA trainer.

     Sets ``check_tolerance`` to False and ``tolerance_threshold`` to 0.15
     on the test instance before building the trainer.

     Args:
         use_gpu: Forwarded to the trainer builder and ``evaluate_gridworld``.
         use_all_avail_gpus: Forwarded to the trainer builder.
     """
     self.check_tolerance = False
     self.tolerance_threshold = 0.15
     env = GridworldContinuous()
     factorized_parameters = self.get_sarsa_parameters_factorized()
     trainer, exporter = self.get_sarsa_trainer_exporter(
         env, factorized_parameters, use_gpu, use_all_avail_gpus
     )
     evaluator = GridworldContinuousEvaluator(env, False, DISCOUNT)
     self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
Exemplo n.º 23
0
    def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
        """Run the gridworld evaluation for the modular SARSA trainer/exporter.

        Args:
            use_gpu: Forwarded to the trainer builder and ``evaluate_gridworld``.
            use_all_avail_gpus: Forwarded to the trainer builder; when true,
                ``tolerance_threshold`` is set to 0.11 first.
        """
        env = GridworldContinuous()
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
        )

        if use_all_avail_gpus:
            self.tolerance_threshold = 0.11
        build_pair = self.get_modular_sarsa_trainer_exporter
        trainer, exporter = build_pair(env, None, use_gpu, use_all_avail_gpus)

        self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
    def test_evaluator_timeline(self):
        """Train the SARSA trainer with an Evaluator and bound its last losses."""
        env = GridworldContinuous()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(env)
        evaluator = Evaluator(trainer, DISCOUNT)

        pages = env.preprocess_samples(
            states, actions, rewards, next_states, next_actions,
            is_terminal, possible_next_actions, reward_timelines,
            self.minibatch_size,
        )
        for page in pages:
            trainer.train_numpy(page, evaluator)

        # Only the most recent loss entries are checked.
        last_td_loss = evaluator.td_loss[-1]
        self.assertLess(last_td_loss, 0.2)
        last_mc_loss = evaluator.mc_loss[-1]
        self.assertLess(last_mc_loss, 0.2)
Exemplo n.º 25
0
    def _test_sac_trainer(self, use_gpu=False, **kwargs):
        """Run the gridworld evaluation for a SAC trainer and check its actor.

        Args:
            use_gpu: Forwarded to ``get_sac_trainer`` and ``evaluate_gridworld``.
            **kwargs: Forwarded unchanged to ``get_sac_trainer``.
        """
        env = GridworldContinuous()
        trainer = self.get_sac_trainer(env, use_gpu, **kwargs)
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
        )

        self.evaluate_gridworld(env, evaluator, trainer, use_gpu)

        # Make sure the actor predictor works
        actor_predictor = self.get_actor_predictor(trainer, env)
        # Just test that it doesn't blow up
        predictions = actor_predictor.predict(evaluator.logged_states)
        self._test_save_load_actor(
            predictions, actor_predictor, evaluator.logged_states
        )
Exemplo n.º 26
0
 def _test_ddpg_trainer(self, use_gpu=False, use_all_avail_gpus=False):
     """Run the gridworld evaluation for a DDPG trainer.

     Sets ``check_tolerance`` to False and ``tolerance_threshold`` to 1.0
     on the test instance before building the trainer.

     Args:
         use_gpu: Forwarded to ``DDPGTrainer`` and ``evaluate_gridworld``.
         use_all_avail_gpus: Forwarded to ``DDPGTrainer``.
     """
     self.check_tolerance = False
     self.tolerance_threshold = 1.0
     env = GridworldContinuous()
     ddpg_parameters = self.get_ddpg_parameters()
     trainer = DDPGTrainer(
         ddpg_parameters,
         env.normalization,
         env.normalization_action,
         env.min_action_range,
         env.max_action_range,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
     evaluator = GridworldDDPGEvaluator(env, DISCOUNT)
     # The trainer fills two consecutive positional slots of the call.
     self.evaluate_gridworld(env, evaluator, trainer, trainer, use_gpu)
 def preprocess_samples(
     self,
     states: List[Dict[str, float]],
     actions: List[Dict[str, float]],
     rewards: List[float],
     next_states: List[Dict[str, float]],
     next_actions: List[Dict[str, float]],
     is_terminals: List[bool],
     possible_next_actions: List[List[Dict[str, float]]],
     reward_timelines: List[Dict[int, float]],
 ) -> TrainingDataPage:
     """Preprocess samples, then collapse state matrices to index columns.

     Delegates to ``GridworldContinuous.preprocess_samples`` and replaces
     ``states`` and ``next_states`` on the returned page with a float32
     column vector holding the second-axis index of every entry equal to
     1.0 (presumably one hot entry per row — confirm).
     """
     page = GridworldContinuous.preprocess_samples(
         self,
         states,
         actions,
         rewards,
         next_states,
         next_actions,
         is_terminals,
         possible_next_actions,
         reward_timelines,
     )
     state_columns = np.where(page.states == 1.0)[1]
     page.states = state_columns.reshape(-1, 1).astype(np.float32)
     next_state_columns = np.where(page.next_states == 1.0)[1]
     page.next_states = next_state_columns.reshape(-1, 1).astype(np.float32)
     return page
 def generate_samples(
     self, num_transitions, epsilon, with_possible=True
 ) -> Tuple[List[Dict[str, float]], List[Dict[str, float]], List[float],
            List[Dict[str, float]], List[Dict[str, float]], List[bool],
            List[List[Dict[str, float]]], List[Dict[int, float]]]:
     """Generate samples whose states are re-encoded as ``{'0': float(key)}``.

     Delegates to ``GridworldContinuous.generate_samples`` and rewrites the
     states and next states; the other six tuple members are returned
     unchanged. Each rewritten state keeps only the first key of the
     original state dict (presumably the active cell of a one-hot
     encoding — confirm).
     """
     (
         states,
         actions,
         rewards,
         next_states,
         next_actions,
         is_terminals,
         possible_next_actions,
         reward_timelines,
     ) = GridworldContinuous.generate_samples(
         self, num_transitions, epsilon, with_possible
     )

     def _first_key_as_feature(state_dicts):
         # Map each state dict to a single feature '0' holding its first key.
         return [{'0': float(list(entry.keys())[0])} for entry in state_dicts]

     enum_states = _first_key_as_feature(states)
     enum_next_states = _first_key_as_feature(next_states)
     return (
         enum_states,
         actions,
         rewards,
         enum_next_states,
         next_actions,
         is_terminals,
         possible_next_actions,
         reward_timelines,
     )
Exemplo n.º 29
0
    def _test_td3_trainer(self, use_gpu=False, **kwargs):
        """Run the gridworld evaluation for each TD3 critic, then check the actor.

        ``current_predictor_network`` is pointed at ``q1_network`` for the
        first evaluation and, when the trainer has one, at ``q2_network``
        for a second evaluation.

        Args:
            use_gpu: Forwarded to ``get_td3_trainer`` and ``evaluate_gridworld``.
            **kwargs: Forwarded unchanged to ``get_td3_parameters``.
        """
        env = GridworldContinuous()
        td3_parameters = self.get_td3_parameters(**kwargs)
        trainer = self.get_td3_trainer(env, td3_parameters, use_gpu)
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
        )

        self.current_predictor_network = trainer.q1_network
        self.evaluate_gridworld(env, evaluator, trainer, use_gpu)

        # The second critic is optional on the trainer.
        second_critic = trainer.q2_network
        if second_critic is not None:
            self.current_predictor_network = second_critic
            self.evaluate_gridworld(env, evaluator, trainer, use_gpu)

        # Make sure the actor predictor works
        actor_predictor = self.get_actor_predictor(trainer, env)
        predictions = actor_predictor.predict(evaluator.logged_states)
        self._test_save_load_actor(
            predictions, actor_predictor, evaluator.logged_states
        )
Exemplo n.º 30
0
    def _test_td3_trainer(self, use_gpu=False, **kwargs):
        """Run the gridworld evaluation per TD3 critic exporter, then check the actor.

        ``get_critic_exporter`` yields two exporters; the second one is only
        evaluated when it is truthy.

        Args:
            use_gpu: Forwarded to ``get_td3_trainer`` and ``evaluate_gridworld``.
            **kwargs: Forwarded unchanged to ``get_td3_parameters``.
        """
        env = GridworldContinuous()
        td3_parameters = self.get_td3_parameters(**kwargs)
        trainer = self.get_td3_trainer(env, td3_parameters, use_gpu)
        evaluator = GridworldContinuousEvaluator(
            env,
            assume_optimal_policy=False,
            gamma=DISCOUNT,
        )

        first_exporter, second_exporter = self.get_critic_exporter(
            trainer, env
        )

        self.evaluate_gridworld(
            env, evaluator, trainer, first_exporter, use_gpu
        )

        if second_exporter:
            self.evaluate_gridworld(
                env, evaluator, trainer, second_exporter, use_gpu
            )

        # Make sure the actor predictor works
        actor_predictor = self.get_actor_predictor(trainer, env)
        predictions = actor_predictor.predict(evaluator.logged_states)
        self._test_save_load_actor(
            predictions, actor_predictor, evaluator.logged_states
        )