Example #1
from collections import deque


def execute(times):
    for t in range(times):
        print(t)

        # INITIALIZATION
        state     = init_state
        numMov    = 0               # Number of moves
        coinObtAt = 0               # Number of moves needed to pick up the coin
        coinObt   = False           # Whether the coin has been picked up
        done      = False           # Whether the episode has finished (final state reached)
        observaciones = deque()     # Observation buffer for this run

        for o in range(OBSERVACIONES):

            action, next = DQN.take_action(state)
            GUI.visualize(state, next, o, t)

            if next == end_state:
                reward = 5 / (numMov + 1)
                done = True
            elif next == coin_pos and not coinObt:
                coinObt = True
                reward = 15 / (numMov + 1)
                coinObtAt = numMov + 1
            else:
                reward = 0

            observaciones.append((state, action, next, reward, done))

            state = next[:]
            numMov += 1

            if done:
                GUI.visualize(state, init_state, o, t)
                print('Finished in %i movements, coin obtained in %i' % (numMov, coinObtAt))
                state     = init_state
                numMov    = 0
                coinObtAt = 0
                coinObt   = False
                done      = False

                DQN.updateExploration(-0.005)

        print('Observation finished')
        GUI.visualize(state, init_state, o, t)

        DQN.learn(observaciones, BATCH_SIZE)
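
The rewards in this first example are shaped by the move count: reaching the goal pays 5 / (numMov + 1) and picking up the coin pays 15 / (numMov + 1), so shorter routes earn strictly more. A quick stand-alone check of how that payoff decays (the move counts below are arbitrary):

for numMov in (0, 4, 9):
    print(numMov, 5 / (numMov + 1), 15 / (numMov + 1))
# 0 5.0 15.0
# 4 1.0 3.0
# 9 0.5 1.5
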
Example #2
    def __init__(
        self,
        n_actions,
        n_features,
        n_episodes,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=300,
        memory_size=500,
        batch_size=32,
        e_greedy_increment=None,
        output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.n_episodes = n_episodes
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        self.target_net = DQN(n_features, n_actions)
        self.eval_net = DQN(n_features, n_actions)
        # self.eval_net.load_state_dict(torch.load('/content/params.pkl'))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.eval_net.parameters(),
                                         lr=self.lr)
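
The replay memory declared above is a flat array with one row per transition and n_features * 2 + 2 columns. A small stand-alone illustration of that row layout, assuming n_features = 2; the packing with np.hstack and the slicing mirror what store_transition and learn do in Example #4:

import numpy as np

n_features = 2
s, a, r, s_ = np.array([0.1, 0.2]), 3, 1.0, np.array([0.3, 0.4])
row = np.hstack((s, [a, r], s_))   # -> [0.1, 0.2, 3.0, 1.0, 0.3, 0.4]

state      = row[:n_features]      # [0.1, 0.2]
action     = int(row[n_features])  # 3
reward     = row[n_features + 1]   # 1.0
next_state = row[-n_features:]     # [0.3, 0.4]
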
Example #3
from collections import deque


def execute(times):
    for t in range(times):
        print(t)

        # INITIALIZATION
        state = init_state
        numMov = 0  # Number of moves
        coinObtAt = 0  # Number of moves needed to pick up the coin
        coinObt = False  # Whether the coin has been picked up
        done = False  # Whether the episode has finished (final state reached)
        batch_con_recompensa = False  # Whether this batch contains at least one rewarded transition
        observaciones = deque()  # Observation buffer for this run

        tabla = DQN.calcularTabla()  # recompute the table of values from the network

        GUI.actualizar(tabla)  # refresh the GUI with the recomputed table

        for o in range(OBSERVACIONES):

            action, next = DQN.take_action(state)
            GUI.visualize(state, next, o, t, coinObt)

            #if next[:2] == end_state:
            if next == end_state:
                reward = 25  #/ (numMov + 1)
                done = True
                batch_con_recompensa = True
            elif next[:2] == coin_pos and not coinObt:
                coinObt = True
                next[2] = 1
                reward = 15  #/ (numMov + 1)
                coinObtAt = numMov + 1
                batch_con_recompensa = True
            else:
                reward = 0

            observaciones.append((state, action, next, reward, done))

            state = next[:]
            numMov += 1

            if done:
                GUI.visualize(state, init_state, o, t, coinObt)
                print('Finished in %i movements, coin obtained in %i' %
                      (numMov, coinObtAt))
                state = init_state
                numMov = 0
                coinObtAt = 0
                coinObt = False
                done = False

                DQN.updateExploration(-0.01)

        print('Observation finished')
        GUI.visualize(state, init_state, o, t, coinObt)

        if batch_con_recompensa:
            print("Aprendiendo...")
            DQN.learn(observaciones)
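
Compared with Example #1, the state in this variant carries a third component that tracks the coin: the position is compared with next[:2] == coin_pos, and the flag is raised with next[2] = 1 once the coin is collected. A minimal sketch of a state consistent with that (the concrete coordinates are made up):

state = [2, 5, 0]   # [row, column, coin_flag]: coin not yet collected
state[2] = 1        # flipped once the agent's position matches coin_pos
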
Example #4
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable


class Train():
    def __init__(
        self,
        n_actions,
        n_features,
        n_episodes,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=300,
        memory_size=500,
        batch_size=32,
        e_greedy_increment=None,
        output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.n_episodes = n_episodes
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        self.target_net = DQN(n_features, n_actions)
        self.eval_net = DQN(n_features, n_actions)
        # self.eval_net.load_state_dict(torch.load('/content/params.pkl'))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.eval_net.parameters(),
                                         lr=self.lr)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        # add a batch dimension before feeding the observation to the network
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward the observation through eval_net and get a Q value for every action
            actions_value = self.eval_net(
                Variable(torch.from_numpy(
                    observation).float())).cpu().detach().numpy()
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            # self.sess.run(self.target_replace_op)
            self.target_net.load_state_dict(self.eval_net.state_dict())
            # print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size,
                                            size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter,
                                            size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        self.s = Variable(torch.from_numpy(
            batch_memory[:, :self.n_features]).float(),
                          requires_grad=True)
        self.a = Variable(
            torch.from_numpy(batch_memory[:, self.n_features]).long())
        self.r = Variable(
            torch.from_numpy(batch_memory[:, self.n_features + 1]).float())
        self.s_ = Variable(
            torch.from_numpy(batch_memory[:, -self.n_features:]).float())

        current_Q_values = self.eval_net(self.s).gather(
            1, self.a.unsqueeze(1)).view(-1)
        next_Q_values = self.target_net(self.s_).detach().max(1)[0]
        # Compute the target of the current Q values
        target_Q_values = self.r + (self.gamma * next_Q_values)
        # Compute Bellman error
        loss = self.criterion(target_Q_values, current_Q_values)

        self.optimizer.zero_grad()
        # run backward pass
        loss.backward()

        # Perform the update
        self.optimizer.step()

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
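
A minimal sketch of how this Train class could be driven, assuming the DQN network class it instantiates is available and assuming a hypothetical environment whose reset() returns a 1-D float array of n_features and whose step(action) returns (next_state, reward, done); none of this loop comes from the original example:

def run_training(env, n_features, n_actions, episodes=100):
    agent = Train(n_actions=n_actions, n_features=n_features,
                  n_episodes=episodes, e_greedy_increment=0.001)
    step = 0
    for episode in range(episodes):
        s = env.reset()
        done = False
        while not done:
            a = agent.choose_action(s)            # epsilon-greedy action from eval_net
            s_, r, done = env.step(a)             # assumed environment signature
            agent.store_transition(s, a, r, s_)   # pack [s, a, r, s_] into the replay memory
            if step > agent.batch_size:           # learn once enough transitions are stored
                agent.learn()
            s = s_
            step += 1
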
Example #5
                reward = 0

            observaciones.append((state, action, next, reward, done))

            state = next[:]
            numMov += 1

            if done:
                GUI.visualize(state, init_state, o, t, coinObt)
                print('Finished in %i movements, coin obtained in %i' %
                      (numMov, coinObtAt))
                state = init_state
                numMov = 0
                coinObtAt = 0
                coinObt = False
                done = False

                DQN.updateExploration(-0.01)

        print('Observation finished')
        GUI.visualize(state, init_state, o, t, coinObt)

        if batch_con_recompensa:
            print("Aprendiendo...")
            DQN.learn(observaciones)


if __name__ == '__main__':
    # Module-level agent and maze GUI used inside execute(); note that the DQN
    # class name is rebound here to the agent instance.
    DQN = DQN(acciones, alpha, gamma, exploration)
    GUI = Maze(8, 8, coin_pos, end_state)
    execute(1000)