    def __init__(self, alpha, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n,
                 sigma, dim_out, fully_connected_layers):
        self.env = MountainCar()
        self.tpolicy = EpsilonGreedyPolicy(
            initial_epsilon=epsilon_tpolicy,
            numActions=self.env.get_num_actions())
        self.bpolicy = EpsilonGreedyPolicy(
            initial_epsilon=epsilon_bpolicy,
            numActions=self.env.get_num_actions())

        " Model Parameters "
        name = "experiment"
        observation_dimensions = self.env.get_observation_dimensions()
        num_actions = self.env.get_num_actions()
        gate_fun = tf.nn.relu
        self.model = Model_mFO(name=name,
                               dim_out=dim_out,
                               observation_dimensions=observation_dimensions,
                               num_actions=num_actions,
                               gate_fun=gate_fun,
                               fully_connected_layers=fully_connected_layers)

        " Neural Network Parameters "
        batch_size = 1
        self.tf_session = tf.Session()
        number_of_percentiles = 0
        percentile_index = 0
        training_steps = 1
        self.fa = NeuralNetwork_wPrioritizedTraining(
            model=self.model,
            optimizer=tf.train.GradientDescentOptimizer,
            numActions=num_actions,
            batch_size=batch_size,
            alpha=alpha,
            tf_session=self.tf_session,
            observation_dimensions=observation_dimensions,
            layer_training_print_freq=5000000,
            number_of_percentiles=number_of_percentiles,
            training_steps=training_steps,
            percentile_to_train_index=percentile_index)

        " Agent Parameters "
        self.agent = QSigma(n=n,
                            gamma=gamma,
                            beta=beta,
                            sigma=sigma,
                            environment=self.env,
                            function_approximator=self.fa,
                            target_policy=self.tpolicy,
                            behaviour_policy=self.bpolicy)
        self.agent_parameters = {
            "beta": beta,
            "gamma": gamma,
            "n": n,
            "bpolicy": self.bpolicy,
            "tpolicy": self.tpolicy
        }
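
        # Summary of this variant (drawn from the code above): the QSigma(n, sigma) agent is
        # paired with a fully connected network (Model_mFO) as its action-value function
        # approximator, trained one sample at a time (batch_size = 1, training_steps = 1)
        # with plain gradient descent and with the prioritized-training machinery disabled
        # (number_of_percentiles = 0). The agent_parameters dict exists only so that
        # reset_agent (below) can rebuild the agent with the same hyperparameters.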
    def __init__(self, alpha, numTilings, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n, sigma):
        self.env = MountainCar()
        self.fa = TileCoderFA(numTilings=numTilings, numActions=self.env.get_num_actions(), alpha=alpha,
                              state_space_range=(self.env.get_high() - self.env.get_low()),
                              state_space_size=len(self.env.get_current_state()), tile_side_length=10)
        self.tpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_tpolicy, numActions=self.env.get_num_actions())
        self.bpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_bpolicy, numActions=self.env.get_num_actions())

        self.agent = QSigma(n=n, gamma=gamma, beta=beta, sigma=sigma, environment=self.env,
                            function_approximator=self.fa, target_policy=self.tpolicy, behaviour_policy=self.bpolicy)
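
        # This second variant (drawn from the code above) keeps the same QSigma agent and
        # epsilon-greedy policies but swaps the neural network for a TileCoderFA: the state
        # range and dimensionality are read off the environment, and each of the numTilings
        # tilings is 10 tiles per side (tile_side_length = 10).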
    def reset_agent(self):
        self.env = MountainCar()
        for var in tf.global_variables():
            self.tf_session.run(var.initializer)
        self.agent = QSigma(beta=self.agent_parameters["beta"],
                            gamma=self.agent_parameters["gamma"],
                            n=self.agent_parameters["n"],
                            behaviour_policy=self.agent_parameters["bpolicy"],
                            target_policy=self.agent_parameters["tpolicy"],
                            environment=self.env,
                            function_approximator=self.fa)
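
        # Note: reset_agent re-randomizes the network by re-running every TensorFlow variable
        # initializer and then rebuilds the QSigma agent from the stored agent_parameters,
        # while the function-approximator wrapper itself (self.fa) is reused. The loop over
        # tf.global_variables() is equivalent to the single TF1-style call
        #     self.tf_session.run(tf.global_variables_initializer())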
class Test_MountainCar_Environment(unittest.TestCase):

    def setUp(self):
        config = Config()
        self.env = MountainCar(config)

        " Target Policy Parameters "
        config.num_actions = self.env.get_num_actions()
        config.target_policy = Config()
        config.target_policy.num_actions = self.env.get_num_actions()
        config.target_policy.initial_epsilon = 0.1
        config.target_policy.anneal_epsilon = False

        " FA Parameters "
        config.num_tilings = 32
        config.tiling_side_length = 8
        config.num_actions = 3
        config.num_dims = 2
        config.alpha = (1/4) / 32
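        # i.e. the usual tile-coding convention of dividing a base step size by the number of
        # tilings: alpha = 0.25 / 32 = 0.0078125 per active tile.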

        self.tpolicy = EpsilonGreedyPolicy(config, behaviour_policy=False)

        ### Test 1 Setup ###
        self.fa1 = TileCoderFA(config)
        config.behaviour_policy = config.target_policy
        self.bpolicy = EpsilonGreedyPolicy(config, behaviour_policy=True)

        config1 = Config()
        config1.n = 4
        config1.gamma = 1
        config1.beta = 1
        config1.sigma = 0.5
        config1.save_summary = True

        self.summary = {}
        self.agent1 = QSigma(config=config1, environment=self.env, function_approximator=self.fa1,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy, summary=self.summary)

        ### Test 2 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 0.5
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.annealing_period = 10000
        config.behaviour_policy.anneal_epsilon = True

        config2 = Config()
        config2.n = 3
        config2.gamma = 1
        config2.beta = 1
        config2.sigma = 0.5
        self.bpolicy2 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa2 = TileCoderFA(config)
        self.agent2 = QSigma(config=config2, environment=self.env, function_approximator=self.fa2,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy2)

        ### Test 3 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 1
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.anneal_epsilon = True

        config3 = Config()
        config3.n = 3
        config3.gamma = 1
        config3.beta = 1
        config3.sigma = 0.5
        config3.initial_rand_steps = 5000
        config3.rand_steps_count = 0
        self.bpolicy3 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa3 = TileCoderFA(config)
        self.agent3 = QSigma(config=config3, environment=self.env, function_approximator=self.fa3,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy3)

    def test_train(self):
        print("\n############ Testing Training Function ##############")
        print("Training 50 episodes:")
        for i in range(50):
            # print("\tTraining episode:", i+1)
            self.agent1.train(1)

        print("\tThe average return after 50 episodes is:", np.average(self.summary['return_per_episode']))

        print("Training 450 more episodes:")
        for i in range(9):
            print("\tTraining", 50, "more episodes...")
            self.agent1.train(50)
            print("\tThe average return after", (i+1) * 50 + 50, "episodes is:",
                  np.average(self.summary['return_per_episode']))
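
        # Note: this is a smoke test for the training loop. It asserts nothing; it only prints
        # the running average of summary['return_per_episode'], which is populated because
        # config1.save_summary = True in setUp.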

    def test_annealing_epsilon(self):
        print("\n############ Testing Annealing Epsilon ###############")
        print("The initial epsilon is:", self.agent2.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent2.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent2.bpolicy.annealing_period)
        print("Training for 1 episodes...")
        self.agent2.train(1)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 10 more episodes...")
        self.agent2.train(10)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 100 more episodes...")
        self.agent2.train(100)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)


    def test_steps_before_training(self):
        print("\n############ Testing Steps Before Training ###############")
        print("The initial epsilon is:", self.agent3.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent3.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent3.bpolicy.annealing_period)
        print("The number of steps before training is:", self.agent3.config.initial_rand_steps)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 1 episodes...")
        self.agent3.train(1)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:", self.agent3.config.rand_steps_count)
        print("Training for 10 more episodes...")
        self.agent3.train(10)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 100 more episodes...")
        self.agent3.train(100)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
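
        # agent3 is configured with initial_rand_steps = 5000 and rand_steps_count = 0;
        # presumably rand_steps_count tracks how many of the initial random warm-up steps
        # have been taken so far, and it should grow towards initial_rand_steps as episodes
        # are run (the QSigma implementation itself is not shown in this snippet).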