Example #1
def looping(qt=None, epsilon=config.epsilon, visu=False):
    """Tabular Q-learning training loop for CartPole."""
    plt.ion()
    cart = CartPoleEnv()
    data = []     # episode lengths
    data_rm = []  # running mean of the last 100 episode lengths
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999  # slowly decay exploration every episode
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10  # penalize the terminal (falling) step
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if visu:
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return data, qt
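The helpers used in Examples #1 and #2 (initialize_Qtable, choose_action, update_qt_new) are not part of this listing. The sketch below shows one plausible tabular implementation, assuming the continuous CartPole state is discretized into bins; the bin boundaries, hyperparameters and the discretize helper are illustrative assumptions, not the original project's code.

import numpy as np

N_BINS = 10     # bins per state dimension (assumed)
N_ACTIONS = 2   # push left / push right
ALPHA = 0.1     # learning rate (assumed)
GAMMA = 0.99    # discount factor (assumed)

# Value ranges used for discretization: position, velocity, angle, angular velocity (assumed)
BOUNDS = np.array([[-2.4, 2.4], [-3.0, 3.0], [-0.21, 0.21], [-3.0, 3.0]])


def discretize(state):
    """Map the continuous 4-d state to a tuple of bin indices (hypothetical helper)."""
    ratios = (np.asarray(state) - BOUNDS[:, 0]) / (BOUNDS[:, 1] - BOUNDS[:, 0])
    return tuple(np.clip((ratios * N_BINS).astype(int), 0, N_BINS - 1))


def initialize_Qtable():
    """One Q-value per (discretized state, action) pair."""
    return np.zeros((N_BINS,) * 4 + (N_ACTIONS,))


def choose_action(state, qt, epsilon):
    """Epsilon-greedy action selection (signature as used in Example #1)."""
    if np.random.random() < epsilon:
        return np.random.randint(N_ACTIONS)
    return int(np.argmax(qt[discretize(state)]))


def update_qt_new(qt, state, reward, action, new_state):
    """One-step Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    s, s2 = discretize(state), discretize(new_state)
    target = reward + GAMMA * np.max(qt[s2])
    qt[s][action] += ALPHA * (target - qt[s][action])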
Example #2
def loop(qt=None, epsilon=1, visu=False):
    """Tabular Q-learning loop; epsilon decays within an episode and is restored afterwards."""
    plt.ion()
    cart = CartPoleEnv()
    data = []     # episode lengths
    data_rm = []  # running mean of the last 100 episode lengths
    config.epsilon = epsilon
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon  # remember epsilon so it can be restored after the episode
        while not end:
            config.epsilon *= 0.97  # decay exploration within the episode
            if visu:
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:",
              config.epsilon)
        config.epsilon = epsilon_tmp  # restore epsilon for the next episode
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return data, qt
Example #3
from cartpole import CartPoleEnv
import numpy as np
cart = CartPoleEnv()
cart.reset()

for _ in range(1000):

    # Calculate the gradients

    # Update the thetas

    # Sample a u trajectory

    # Apply u[0] to the actual system
    cart.step(10)  # apply some fixed force as a placeholder

    # Update the new state in the learner

    # Shift the thetas

    # Simulate
    cart.render()

cart.close()
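The skeleton above leaves every planning step empty. As an illustration only, the sketch below fills the loop with a simple random-shooting planner instead of the gradient-based update the comments describe; it assumes, as the skeleton suggests, that step() accepts a signed force value and returns a (state, reward, done, info) tuple, and that a second environment's .state attribute can be overwritten to serve as a simulator.

import numpy as np
from cartpole import CartPoleEnv


def plan_random_shooting(real_env, sim_env, horizon=20, n_candidates=64):
    """Evaluate random force sequences on the simulator env and return
    the first force of the cheapest sequence (random-shooting MPC sketch)."""
    best_cost = np.inf
    best_u = 0.0
    for _ in range(n_candidates):
        u_seq = np.random.uniform(-10.0, 10.0, size=horizon)
        sim_env.reset()
        sim_env.state = np.array(real_env.state)  # start the rollout from the real state
        cost = 0.0
        for u in u_seq:
            state, _, done, _ = sim_env.step(u)
            cost += state[2] ** 2 + 0.1 * state[0] ** 2  # quadratic cost on angle and position
            if done:
                cost += 1000.0  # heavy penalty for falling over
                break
        if cost < best_cost:
            best_cost, best_u = cost, u_seq[0]
    return best_u


cart = CartPoleEnv()
cart.reset()
sim = CartPoleEnv()
sim.reset()

for _ in range(200):
    u0 = plan_random_shooting(cart, sim)  # plan, then apply only the first force
    cart.step(u0)
    cart.render()

cart.close()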
Example #4
            if done:
                print('Episode {} ended after {} time steps, eps = {:.2}.'.
                      format(episode, t, agent.epsilon))
                episode_data.append(episode)
                score_data.append(t)
                break
        agent.replay()  # experience replay training step
    # agent.save_model('bot.h5')

    # Performance: 30 greedy evaluation episodes (no exploration)
    for episode in range(30):
        state = env.reset()
        state = np.reshape(state, [1, 4])
        agent.epsilon = 0  # act greedily during evaluation
        for t in range(1000):
            env.render()
            action, force = agent.act(state)
            # The agent also picks how hard to push the cart
            if force == 2:
                env.force_mag = 6
            elif force == 3:
                env.force_mag = 8
            else:
                env.force_mag = 10
            state_, reward, done, info = env.step(action)

            state_ = np.reshape(state_, [1, 4])
            state = state_
            if done or t == 999:  # 999 is the last step of range(1000)
                episode_data_.append(episode)
                score_data_.append(t)
                print('Trial {} ended with score {}'.format(episode, t))
                break
Example #5
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()                     # prediction (target) network
        self.updat_net = deepcopy(self.predi_net)  # network that is trained each step
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  # e.g. 300: render a few episodes every 300 episodes
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        """Reward shaping: -25 on termination, otherwise a bonus for keeping the
        pole upright and the cart near the centre."""
        reward = -25 if end else 1
        if reward == 1:
            # Angle reward modification (0.418 / 2 ~= 0.21 rad is the failure angle)
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position reward modification (note: reuses the angle scale rather than
            # the cart position limit)
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Init
                state = self.cart.state
                # 2. Choose action
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Compute the target Q-value
                q_values[a] = reward + net_config.gamma * \
                    torch.max(q_values_next).item()

                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                # Train the update network every 20 turns and at episode end
                if self.turn % 20 == 0 or end:
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []

                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()

            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                # Periodically copy the trained weights into the prediction (target) network
                print("Update")
                self.predi_net.model.load_state_dict(
                    self.updat_net.model.state_dict())
            self.end()
            n += 1

        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        """Train the update network on a random minibatch drawn from memory."""
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from a simple replay is this line:
                # the next Q-values are predicted with the target network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(
                    q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)  # train on the whole minibatch

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                # Repeated on purpose so the win is hard to miss in the console
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
            # Save whenever the best streak has clearly improved since the last save
            if (self.last_save * 1.2 < self.best_consecutive_wins
                    and self.best_consecutive_wins >= 50):
                self.save()
                self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn, "\tEpsilon:",
              self.epsilon, "\tWins: ", "{:3}".format(self.consecutive_wins),
              "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            # Render a window of `visu_window` episodes every `visu_update` episodes
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)  # decay exploration, floor at 0.01

    def save(self):
        pass  # model persistence is not implemented in this snippet
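Assuming the project-level names this snippet relies on are importable elsewhere (CartPoleEnv, DQN, PlotData, config, net_config, choose_action_net, deepcopy, torch, random), the trainer could be driven as follows; the __main__ guard is illustrative, not part of the original example.

if __name__ == "__main__":
    learner = CuteLearning()
    learner.visu = True  # render episodes while training (optional)
    learner.learn()      # runs the DQN training loop defined above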