Example #1
def train():

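    # Replay buffer: a plain list of Transition namedtuples, capped at flags.memory_size in the loop below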
    memory = []
    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state"])

    model = DeepQNetwork(flags.n_actions, flags.n_features, flags.lr,
                         flags.gamma, flags.epsilon_max, empty_goal_action)

    loss_his = []
    reward_his = []
    step_his = []

    for ii in range(flags.max_epoch):
        state = env.reset()

        reward_all = 0
        done = False
        steps = 0
        loss = 0

        while not done:
            action = model.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_all += reward
            steps += 1

            if len(memory) > flags.memory_size:
                memory.pop(0)
            memory.append(Transition(state, action, reward, next_state))

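            # Once the buffer holds more than two batches, sample a random minibatch and run one training step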
            if len(memory) > flags.batch_size * 2:
                batch_transition = random.sample(memory, flags.batch_size)
                batch_state, batch_action, batch_reward, batch_next_state = map(
                    np.array, zip(*batch_transition))
                loss = model.train(state=batch_state,
                                   action=batch_action,
                                   reward=batch_reward,
                                   state_=batch_next_state)

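            # Periodically copy the online network's weights into the target network and decay epsilon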
            if (ii + 1) % flags.replace_target_freq == 0:
                model.replace_target()
                model.decay_epsilon()

            state = next_state

        if loss > 0:
            loss_his.append(loss)
            reward_his.append(reward_all)
            step_his.append(steps)
            print("epoch=", ii, "/loss=", loss, "/reward_all=", reward_all,
                  "/steps=", steps)

    return loss_his, reward_his, step_his
Example #2
def testing_trained_agent():
    """Testing Trained Agent"""
    global frame_size, stack_size

    with tf.Session() as sess:
        game = Doom()

        state_size = list(frame_size)
        state_size.append(stack_size)

        no_actions = len(game.actions)

        learning_rate = 0.0002

        deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate)

        totalScore = 0

        saver = tf.train.Saver()

        saver.restore(sess, "./models/model.ckpt")

        game.start_game()

        for i in range(1):
            done = False

            game.restart_episode()

            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)

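            # Act greedily with the restored Q-network until the episode terminates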
            while not game.is_episode_finished():
                Qs = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: state.reshape((1, *state.shape))})

                choice = np.argmax(Qs)
                action = game.actions[int(choice)]

                game.take_action(action)

                done = game.is_episode_finished()

                score = game.game_environment.get_total_reward()

                if done:
                    break

                else:
                    next_img, next_game_vars = game.get_environment_state()
                    next_state = frame_stacking(next_img, False)
                    state = next_state

            score = game.game_environment.get_total_reward()
            print("Score: ", score)
        game.close_environment()
Example #3
class agent():
    def __init__(self,
                 gamma,
                 epsilon,
                 alpha,
                 maxMemSize,
                 epsEnd,
                 replace=25000,
                 actionSpace=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]):
        self.Q_values = np.zeros([9, 6, 11])  #state size 9
        self.GAMMA = gamma
        self.ALPHA = alpha
        self.EPSILON = epsilon
        self.EPS_END = epsEnd
        self.actionSpace = actionSpace
        self.memSize = maxMemSize
        self.steps = 0
        self.learn_step_counter = 0  #target network replacement
        self.memory = []
        self.memCntr = 0
        self.replace_target_cnt = replace
        self.Q_eval = DeepQNetwork(alpha=alpha)
        self.Q_next = DeepQNetwork(alpha=alpha)
        #https://github.com/dennybritz/reinforcement-learning/blob/master/DP/Policy%20Iteration%20Solution.ipynb

    def storeTransition(self, state, action, reward, nextState):
        if self.memCntr < self.memSize:
            self.memory.append([state, action, reward, nextState])
        else:
            self.memory[self.memCntr %
                        self.memSize] = [state, action, reward, nextState]
        self.memCntr += 1

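    # Draw a contiguous slice of replay memory and unpack it into current-state, next-state, and reward arrays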
    def load_memory(self, batch_size):
        if self.memCntr + batch_size < self.memSize:
            memStart = int(np.random.choice((range(self.memCntr))))
        else:
            memStart = int(np.random.choice(range(self.memCntr - batch_size)))
        minibatch = self.memory[memStart:memStart + batch_size]
        resCurrent = np.zeros([batch_size, 54])
        resNext = np.zeros([batch_size, 54])
        rewards = np.zeros([batch_size])
        i = 0
        for state, action, reward, nextState in minibatch:
            resCurrent[i, :] = state
            resNext[i, :] = nextState
            rewards[i] = reward
            i += 1
        return resCurrent, resNext, rewards

    def chooseAction(self, observation):
        rand = np.random.random()
        actions = self.Q_eval.forward(observation)
        if rand < 1 - self.EPSILON:
            action = T.argmax(actions)
        else:
            action = np.random.choice(self.actionSpace)
        self.steps += 1
        return action

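    # One DQN update: periodically sync the target network, build TD targets from the sampled batch, and step the optimizer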
    def learn(self, batch_size):
        self.Q_eval.optimizer.zero_grad()
        if self.replace_target_cnt is not None and \
                self.learn_step_counter % self.replace_target_cnt == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())

        Qpred, Qnext, rewards = self.load_memory(batch_size)
        Qpred = self.Q_eval.forward(Qpred)
        Qnext = self.Q_next.forward(Qnext)
        maxA = T.argmax(Qnext, dim=1).cuda()
        rewards = T.Tensor(rewards).cuda()
        # Copy-and-detach the predictions so the target is fixed; writing into Qpred itself would zero the loss
        Qtarget = Qpred.clone().detach()
        batch_index = T.arange(Qtarget.shape[0], dtype=T.long).cuda()
        # Per-sample TD target r + gamma * max_a Q_next(s', a), written into each row's greedy next-action slot
        Qtarget[batch_index, maxA] = rewards + self.GAMMA * T.max(Qnext, dim=1)[0]

        if self.steps > 500:
            self.EPSILON = np.max([self.EPS_END, self.EPSILON - 1e-4])
        loss = self.Q_eval.loss(Qtarget, Qpred).cuda()
        loss.backward()
        self.Q_eval.optimizer.step()
        self.learn_step_counter += 1
Example #5
def main():
    global frame_size, stack_size

    state_size = list(frame_size)
    state_size.append(stack_size)

    game = Doom()
    no_actions = len(game.actions)

    learning_rate = 0.002
    no_episodes = 500
    max_steps = 100
    batch_size = 32

    explore_max = 1.
    explore_min = 0.01
    decay_rate = 0.00001

    gamma = 0.95

    pretrain_length = batch_size
    memory_size = 1000000

    training = True

    episode_render = True

    tf.reset_default_graph()

    deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate)

    memory = Memory(max_size=memory_size)
    game.start_game()
    game.restart_episode()

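    # Pre-fill the replay memory with transitions gathered under a random policy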
    for i in range(pretrain_length):
        if i == 0:
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)

        action = random.choice(game.actions)

        reward = game.take_action(action)

        done = game.is_episode_finished()

        if done:
            next_state = np.zeros(state.shape)
            memory.add((state, action, reward, next_state, done))

            game.restart_episode()
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)

        else:
            next_img, next_game_vars = game.get_environment_state()
            next_state = frame_stacking(next_img, False)

            memory.add((state, action, reward, next_state, done))

            state = next_state

    writer = tf.summary.FileWriter("./tensorboard/dqn/1")

    tf.summary.scalar("Loss", deep_Q_network.loss)

    write_op = tf.summary.merge_all()

    """Prediction """

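    # Epsilon-greedy action selection; the exploration probability decays exponentially with the global step count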
    def predict_action(curr_decay_step, curr_state):
        exp_exp_tradeoff = np.random.rand()

        curr_explore_prob = explore_min + ((explore_max - explore_min) * np.exp(-decay_rate * curr_decay_step))

        if curr_explore_prob > exp_exp_tradeoff:
            curr_action = random.choice(game.actions)

        else:
            Qs = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: curr_state.reshape((1, *curr_state.shape))})

            choice = np.argmax(Qs)
            curr_action = game.actions[choice]

        return curr_action, curr_explore_prob

    """Training Agent"""
    saver = tf.train.Saver()

    if training:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            decay_step = 0

            game.start_game()

            for episode in range(no_episodes):
                step = 0
                loss_val = 0.0  # default so the first terminal print has a value before any training step

                episode_rewards = []

                game.restart_episode()
                img, game_vars = game.get_environment_state()

                state = frame_stacking(img, True)

                while step < max_steps:
                    step += 1

                    decay_step += 1

                    action, explore_prob = predict_action(decay_step, state)

                    reward = game.take_action(action)

                    done = game.is_episode_finished()

                    episode_rewards.append(reward)

                    if done:
                        next_img = np.zeros(frame_size, dtype=int)
                        next_state = frame_stacking(next_img, False)

                        step = max_steps

                        total_rewards = np.sum(episode_rewards)

                        print("Episode No. {}".format(episode),
                              "Total reward: {}".format(total_rewards),
                              "Training Loss: {:.4f}".format(loss_val),
                              "Explore Prob: {:.4f}".format(explore_prob))

                        memory.add((state, action, reward, next_state, done))

                    else:
                        next_img, next_game_vars = game.get_environment_state()
                        next_state = frame_stacking(next_img, False)

                        memory.add((state, action, reward, next_state, done))

                        state = next_state

                    """Learning Part """
                    """Get mini-batches from memory and train"""
                    batch = memory.sample(batch_size)

                    states_mb = []
                    actions_mb = []
                    rewards_mb = []
                    next_states_mb = []
                    dones_mb = []

                    for each in batch:
                        states_mb.append(each[0])
                        actions_mb.append(each[1])
                        rewards_mb.append(each[2])
                        next_states_mb.append(each[3])
                        dones_mb.append(each[4])

                    states_mb = np.array(states_mb)
                    actions_mb = np.array(actions_mb)
                    rewards_mb = np.array(rewards_mb)
                    next_states_mb = np.array(next_states_mb)
                    dones_mb = np.array(dones_mb)

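                    # Build TD targets: the raw reward for terminal transitions, r + gamma * max_a Q(s', a) otherwise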
                    target_Qs_batch = []

                    Qs_next_state = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: next_states_mb})

                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]

                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])

                        else:
                            target = rewards_mb[i] + (gamma * np.max(Qs_next_state[i]))
                            target_Qs_batch.append(target)

                    targets_mb = np.array(target_Qs_batch)

                    loss_val, _ = sess.run([deep_Q_network.loss, deep_Q_network.optimizer],
                                           feed_dict={deep_Q_network.inputs: states_mb,
                                                      deep_Q_network.target_Q: targets_mb,
                                                      deep_Q_network.actions: actions_mb})

                    summary = sess.run(write_op, feed_dict={deep_Q_network.inputs: states_mb,
                                                            deep_Q_network.target_Q: targets_mb,
                                                            deep_Q_network.actions: actions_mb})

                    writer.add_summary(summary, episode)
                    writer.flush()

                if episode % 5 == 0:
                    save_path = saver.save(sess, "./models/model.ckpt")
                    print("Model Saved")
Example #6
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()

    #    RL = DeepQNetwork(env.n_actions, env.n_features,
    #                      learning_rate=0.01,
    #                      reward_decay=0.9,
    #                      e_greedy=0.9,
    #                      replace_target_iter=200,
    #                      memory_size=2000
    #                      )

    # param tuning by hand, best version for now
    RL = DeepQNetwork(env.n_actions,
                      env.n_features,
                      learning_rate=0.005,
                      reward_decay=0.8,
                      e_greedy=0.8,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
    exit()
Example #7
File: main.py, Project: RuisongZhou/RLbase
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        lr=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #8
                print('episode: {}, Reward: {}'.format(episode, Reward))
                break


def _eval():
    for episode in range(10):
        obs = env.reset()

        Reward = 0

        while True:
            env.render()

            action = RL.choose_action(obs, True)

            obs, reward, done, _ = env.step(action)
            Reward += reward

            if done:
                print('Reward: {}'.format(Reward))
                break


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    RL = DeepQNetwork(env.observation_space.shape[0], env.action_space.n)

    train()

    _eval()
Example #9
def main():
    env = gym.make('SpaceInvaders-v0')

    memory = deque(maxlen=MEM_SIZE)

    # fill memory with random interactions with the environment
    while len(memory) < MEM_SIZE:
        observation = env.reset()
        frames = deque([np.zeros((185, 95)) for _ in range(STACK_SIZE)], maxlen=STACK_SIZE)
        frames.append(preprocess(observation))
        state = stack_frames(frames)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            frames.append(preprocess(observation_))
            state_ = stack_frames(frames)
            memory = store_transition(memory, state, action, reward, state_)
            state = state_
    print('done initializing memory')

    init_Q, pred_Q = DeepQNetwork()

    # two separate Q-Table approximations (eval and next)
    # initialize parameters, not committing to a batch size (NHWC)
    # the channel dimension carries STACK_SIZE consecutive frames stacked together
    in_shape = (-1, 185, 95, STACK_SIZE)
    if LOAD:
        path = os.path.join(WEIGHTS_PATH, "params_Q_eval.npy")
        params_Q_eval = load_params(path)
    else:
        _, params_Q_eval = init_Q(in_shape)
    params_Q_next = params_Q_eval.copy()

    # Initialize RMSProp optimizer
    opt_init, opt_update = optimizers.rmsprop(ALPHA)
    opt_state = opt_init(params_Q_eval)
    opt_step = 0

    # Define a simple mean-squared-error loss
    def loss(params, batch):
        inputs, targets = batch
        predictions = pred_Q(params, inputs)
        return np.mean((predictions - targets) ** 2)

    # Define a compiled update step
    @jit
    def step(j, opt_state, batch):
        params = optimizers.get_params(opt_state)
        g = grad(loss)(params, batch)
        return opt_update(j, g, opt_state)

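    # One learning step: sample a minibatch, periodically refresh the target parameters,
    # build TD targets from the target network, and apply one RMSProp update to the eval parameters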
    def learn(opt_step, opt_state, params_Q_eval, params_Q_next):
        mini_batch = sample(memory, BATCH_SIZE)

        if opt_step % TAU == 0:
            params_Q_next = params_Q_eval.copy()

        input_states = np.stack([transition[0] for transition in mini_batch])
        next_states = np.stack([transition[3] for transition in mini_batch])

        predicted_Q = pred_Q(params_Q_eval, input_states)
        predicted_Q_next = pred_Q(params_Q_next, next_states)

        max_action = np.argmax(predicted_Q_next, axis=1)
        rewards = np.array([transition[2] for transition in mini_batch])

        Q_target = onp.array(predicted_Q)
        # Write each sample's TD target into that row's greedy next-action entry
        Q_target[onp.arange(len(mini_batch)), max_action] = rewards + GAMMA * np.max(predicted_Q_next, axis=1)

        opt_state = step(opt_step, opt_state, (input_states, Q_target))
        params_Q_eval = optimizers.get_params(opt_state)

        return opt_state, params_Q_eval, params_Q_next

    scores = []
    eps_history = []
    eps = EPS_START if LEARN else 0

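    # Main loop: act epsilon-greedily and, when LEARN is set, take one learning step per environment frame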
    for i in range(NUM_GAMES):
        print('starting game ', i + 1, 'epsilon: %.4f' % eps)
        eps_history.append(eps)
        done = False
        observation = env.reset()
        frames = deque([np.zeros((185, 95)) for _ in range(STACK_SIZE)], maxlen=STACK_SIZE)
        frames.append(preprocess(observation))
        state = stack_frames(frames)
        score = 0
        while not done:
            action = choose_action(env, state.reshape((1, 185, 95, STACK_SIZE)),
                                   pred_Q, params_Q_eval, eps)
            observation_, reward, done, info = env.step(action)
            score += reward

            if RENDER:
                env.render()

            if LEARN:
                frames.append(preprocess(observation_))
                state_ = stack_frames(frames)
                memory = store_transition(memory, state, action, reward, state_)
                state = state_
                opt_state, params_Q_eval, params_Q_next = learn(opt_step, opt_state,
                                                                params_Q_eval, params_Q_next)
                opt_step += 1

                if opt_step > 500:
                    if eps - 1e-4 > EPS_END:
                        eps -= 1e-4
                    else:
                        eps = EPS_END

        if LEARN:
            out_path = os.path.join(WEIGHTS_PATH, 'params_Q_eval_' + str(i))

            onp.save(out_path, params_Q_eval)
        scores.append(score)
        print('score: ', score)