Example #1
    def qlearning(self):
        train_scores = []
        eval_scores = []

        cfg = read_cfg(self.config_file)

        all_scores = []

        for train_ep in range(self.epochs):
            score = 0
            env = FallingObjects(cfg)
            obs = env.reset()
            state, _ = self.get_state(obs)

            for i in range(self.moves_per_epoch):
                actions = self.actions
                action = self.epsilon_greedy(self.Q, state, actions,
                                             self.epsilon)

                obs, r, done, _ = env.step(action)
                statep, r = self.get_state(obs)
                if train_ep > 1:
                    print(statep, r)
                    cv2.imshow('hehe', obs)
                    cv2.waitKey(0)
                score += r

                maximum = -float('inf')
                actionsp = self.actions
                for actionp in actionsp:
                    if self.Q.get((statep, actionp), 0) > maximum:
                        maximum = self.Q.get((statep, actionp), 0)

                self.Q[(state, action)] = self.Q.get(
                    (state, action), 0) + self.learning_rate * (
                        r + self.discount * maximum - self.Q.get(
                            (state, action), 0))

                state = statep

                if self.epsilon > self.epsilon_min:
                    self.epsilon *= 0.99999

            print("Epoch: {}; Score: {}; Epsilon: {}".format(
                train_ep, score, self.epsilon))
            all_scores.append(score)
            if train_ep % 200 == 0 and train_ep > 0:
                self.save_q()
                print("Mean score for the last 200 epochs: {}".format(
                    np.average(all_scores[:-200])))
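Example #1 calls an epsilon_greedy helper that is not shown in the snippet. A minimal sketch of such a helper for the tabular case, assuming Q is a dict keyed by (state, action) pairs (illustrative, not the original implementation):

import random

def epsilon_greedy(Q, state, actions, epsilon):
    # With probability epsilon explore: pick a uniformly random action.
    if random.random() < epsilon:
        return random.choice(actions)
    # Otherwise exploit: pick the action with the largest Q-value,
    # treating unseen (state, action) pairs as 0.
    return max(actions, key=lambda a: Q.get((state, a), 0))
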
Example #2
    def qlearning(self):
        train_scores = []
        eval_scores = []

        cfg = read_cfg(self.config_file)

        all_scores = []

        for train_ep in range(self.epochs):
            if train_ep <= 10:
                self.epsilon = 1
            elif not self.done_pre:
                self.done_pre = True
                self.epsilon = 0.25
            score = 0
            env = FallingObjects(cfg)
            obs = env.reset()
            obs, _ = self.get_state(obs)
            stack_frame = deque([obs for _ in range(self.frame_size)],
                                maxlen=self.frame_size)
            state, stack_frame = self.get_frame(stack_frame, obs)
            state = np.reshape(state, [1, 1, self.frame_size, 86, 86])

            for i in range(self.moves_per_epoch):
                actions = self.actions
                action = self.epsilon_greedy(state, actions, self.epsilon)

                obs, r, done, _ = env.step(actions[action])
                obs, r = self.get_state(obs)
                print("Move: {}; action: {}; reward: {}; epsilon: {}".format(
                    i, actions[action], r, self.epsilon))

                statep, stack_frame = self.get_frame(stack_frame, obs)
                statep = np.reshape(statep, [1, 1, self.frame_size, 86, 86])
                score += r

                self.memory.append((state, action, r, statep))

                state = statep

                if train_ep > 10:
                    self.replay()

            print("Episode: {}; score: {}".format(train_ep, score))
            all_scores.append(score)
            if train_ep % 20 == 0 and train_ep > 0:
                print("Mean score for the last 200 epochs: {}".format(
                    np.average(all_scores[:-200])))
                torch.save(self.model, 'model.pt')
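Example #2 relies on a get_frame helper that keeps a sliding window of the most recent preprocessed frames. A minimal sketch under that assumption (the 86x86 frame size is taken from the reshape above; everything else is illustrative):

import numpy as np
from collections import deque

def get_frame(stack_frame, obs):
    # Append the newest frame; the deque's maxlen drops the oldest one.
    stack_frame.append(obs)
    # Stack the window along a new leading axis to form the state tensor.
    state = np.stack(stack_frame, axis=0)
    return state, stack_frame

# Usage mirroring the example: seed the deque with copies of the first frame.
frame_size = 4
obs = np.zeros((86, 86), dtype=np.float32)  # placeholder preprocessed frame
stack = deque([obs for _ in range(frame_size)], maxlen=frame_size)
state, stack = get_frame(stack, obs)        # state.shape == (4, 86, 86)
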
Example #3
    def qlearning(self):
        cfg = read_cfg(self.config_file)
        all_scores = []
        for train_ep in range(self.epochs):
            if train_ep <= 10:
                self.epsilon = 0.02
            elif not self.done_pre:
                self.done_pre = True
                self.epsilon = 0.6
            score = 0
            env = FallingObjects(cfg)
            obs = env.reset()
            obs, _ = self.get_state(obs)
            #stack_frame = deque([obs for _ in range(self.frames)], maxlen=self.frames)
            #state, stack_frame = self.get_frame(stack_frame, obs)
            #state = np.reshape(state, [1, self.frames, 86, 86, 1])
            state = obs

            for i in range(self.moves_per_epoch):
                actions = self.actions
                action = self.epsilon_greedy(state, actions, self.epsilon)
                #print("Move: {}; action: {}".format(i, actions[action]))

                obs, r, done, _ = env.step(actions[action])
                if train_ep > 10000:
                    print(statep, r)
                    cv2.imshow('hehe', obs)
                    cv2.waitKey(0)
                obs, r = self.get_state(obs)
                #statep, stack_frame = self.get_frame(stack_frame, obs)
                #statep = np.reshape(statep, [1, self.frames, 86, 86, 1])
                statep = obs
                score += r

                self.memory.append((state, action, r, statep))

                state = statep

                if train_ep > 0:
                    self.replay()

            print("Epoch: {}; Score: {}; Epsilon: {}".format(
                train_ep, score, self.epsilon))
            all_scores.append(score)
            if train_ep % 200 == 0:
                self.model.save('configs/model.h5')
                print("Mean score for the last 200 epochs: {}".format(
                    np.average(all_scores[:-200])))
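Example #3 pushes transitions into self.memory and calls self.replay(), which is not part of the snippet. A rough sketch of such a replay step for a Keras model, assuming a batch size, discount factor and per-sample state shape that the snippet does not confirm:

import random
import numpy as np

def replay(model, memory, batch_size=32, gamma=0.99):
    # Skip training until enough transitions have been collected.
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    # Assumes each stored state is a single, unbatched observation.
    states = np.array([s for s, a, r, sp in batch])
    next_states = np.array([sp for s, a, r, sp in batch])
    # Current Q-values and bootstrapped targets from the next states.
    q_values = model.predict(states, verbose=0)
    q_next = model.predict(next_states, verbose=0)
    for i, (s, a, r, sp) in enumerate(batch):
        q_values[i][a] = r + gamma * np.max(q_next[i])
    # Fit the network towards the updated targets for one pass.
    model.fit(states, q_values, epochs=1, verbose=0)
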
Example #4
def main_function():

    arg_parser = ArgumentParser(description="Train DQN network")
    arg_parser.add_argument(
        '-c',
        '--config-file',
        default='configs/default.yaml',
        type=str,
        dest='config_file',
        help='Default configuration file')

    arg_parser.add_argument(
        "--agent_file",
        default='DQNAgent/agent_config.yaml',
        type=str,
        help='Agent configuration file')

    arg_parser.add_argument("--optimizer", type=str, default="Adam")
    arg_parser.add_argument("--new", default=False, action='store_true')
    arg_parser.add_argument("--use_cuda", default=False, action='store_true')

    args = arg_parser.parse_args()

    config_file = args.config_file
    cfg = read_cfg(config_file)

    acfg = read_cfg(args.agent_file)

    env = FallingObjects(cfg)

    nr_actions = 2
    q_network = MyDQN(nr_actions)

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(q_network.parameters(), lr=acfg.adam_lr)

    elif args.optimizer == 'RMSProp':
        optimizer = optim.RMSprop(
            q_network.parameters(),
            lr=acfg.rms_prop_lr,
            alpha=0.99,
            eps=0.01,
            weight_decay=0,
            momentum=acfg.gradient_momentum,
            centered=False)

    else:
        raise ValueError("Unsupported optimizer: " + args.optimizer)

    agent = MyAgent(nr_actions, True, args.use_cuda, (acfg, cfg))
    with open("log_info", "wt") as f:
        agent.train(env, q_network, optimizer, f, args.new)
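main_function builds a MyDQN network whose definition is outside this snippet. One plausible shape for such a network, assuming stacks of 84x84 grayscale frames as in the other examples (layer sizes are illustrative, not the original architecture):

import torch.nn as nn

class MyDQN(nn.Module):
    """Illustrative convolutional Q-network; not the original definition."""
    def __init__(self, nr_actions, in_channels=4):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
            nn.Linear(512, nr_actions),
        )

    def forward(self, x):
        # x: (batch, in_channels, 84, 84) stack of preprocessed frames.
        return self.head(self.features(x))
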
Example #5
    def evalAgent(self, nr_episodes, q_network):

        replay_memory = ReplayBuffer(5000, 4)
        episode = 0
        episodes_score = []

        env = FallingObjects(self.initial_cfg)

        while episode < nr_episodes:
            print(episode)
            episode += 1
            reward_per_episode = 0
            env = FallingObjects(self.initial_cfg)
            obs = env.reset()
            while True:

                #return  an 84 * 84 *1 image
                current_frame = processFrame(obs)
                idx = replay_memory.store_frame(current_frame)

                #get 4 frames stacked together to forward throgh network
                current_state = replay_memory.encode_recent_observation()
                best_action = self.eval_greedyPolicy(current_state, q_network)

                obs, rew, done, _ = env.step(best_action[0, 0].item() + 2)
                if rew == 0:
                    # Reward 0 counts as a successful step; relabel it as 1.
                    reward_per_episode += 1
                    rew = 1
                    replay_memory.store_effect(idx, best_action[0, 0], rew, done)
                else:
                    # Any non-zero reward ends the evaluation episode.
                    done = True
                    replay_memory.store_effect(idx, best_action[0, 0], rew, done)
                    break

            episodes_score.append(reward_per_episode)

        mean_score = sum(episodes_score) / float(nr_episodes)
        return mean_score, episodes_score
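evalAgent depends on a processFrame helper, described in the comment as returning an 84 x 84 x 1 image. A minimal sketch of such preprocessing with OpenCV, assuming obs is an RGB frame (the exact preprocessing in the original is not shown):

import cv2
import numpy as np

def processFrame(obs):
    # Convert the RGB observation to grayscale and resize to 84 x 84.
    gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    # Add a trailing channel axis so the result has shape (84, 84, 1).
    return resized[:, :, np.newaxis]
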
Example #6
    arg_parser.add_argument(
        '-a',
        '--agent',
        default='demo_agent+DemoAgent',
        type=str,
        dest='agent',
        help='The agent to test in format <module_name>+<class_name>')

    args = arg_parser.parse_args()
    config_file = args.config_file
    cfg = read_cfg(config_file)
    test_agent_name = args.agent.split("+")
    test_steps = cfg.test_steps
    test_agent = getattr(importlib.import_module(test_agent_name[0]),
                         test_agent_name[1])

    print(f"Testing agent {test_agent_name[1]}")

    env = FallingObjects(cfg)

    #agent = test_agent(max(ACTIONS.keys()))

    # Dueling Deep Q-Learning Agent
    agent = DDQNAgent()
    all_r = 0
    obs = env.reset()

    # Instead of using a single observation as the state, we stack the last 3 images
    # at any given time to form the state, as suggested in DeepMind's DQN paper;
    # this preserves the motion of the falling objects (a sketch of the stacking
    # step follows this snippet).
    s1, _, r1, _ = env.step(0)
    s2, _, r2, _ = env.step(0)
    s3, _, r3, _ = env.step(0)
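The snippet stops right after collecting three consecutive observations. A sketch of how they could be combined into a single stacked state, as the comment above describes (the stacking axis and dtype are assumptions):

import numpy as np

def make_state(f1, f2, f3):
    # Stack the three most recent frames along a new leading axis so the
    # network can infer the motion of the falling objects.
    return np.stack([f1, f2, f3], axis=0).astype(np.float32)

# e.g. state = make_state(s1, s2, s3)
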
Example #7
    def train(self, env, q_network, optimizer, logger, reloaded=False):

        cfg = self.cfg
        steps_done = 0
        nr_updates = 0
        episode = 0

        target_q_network = MyDQN(self.nr_acts)
        target_q_network.load_state_dict(q_network.state_dict())

        replay_memory = ReplayBuffer(cfg.replay_memory_size,
                                     cfg.agent_history_lenght)

        if reloaded:
            replay_memory.loadMemory()
            with open(cfg.q_network_path, "rb") as f:
                q_network.load_state_dict(torch.load(f))

            with open(cfg.target_network_path, "rb") as f:
                target_q_network.load_state_dict(torch.load(f))

            with open(cfg.extra_params_path, "rb") as f:
                episode, steps_done, nr_updates, self.reward_history = pickle.load(
                    f)

        target_q_network.train(False)
        if self.use_cuda:
            q_network.cuda()
            target_q_network.cuda()

        while episode < cfg.nr_episodes:
            episode += 1
            reward_per_episode = 0
            env = FallingObjects(self.initial_cfg)
            obs = env.reset()
            ep_steps = 0
            while True:
                steps_done += 1
                ep_steps += 1
                #return an 84 * 84 * 1 image
                current_frame = processFrame(obs)
                idx = replay_memory.store_frame(current_frame)

                #get 4 frames stacked together to forward through the network
                current_state = replay_memory.encode_recent_observation()

                eps_threshold, best_action = self.greedyPolicy(
                    current_state, q_network, steps_done)
                obs, rew, done, _ = env.step(best_action[0, 0].item() + 2)

                if rew == 0:
                    reward_per_episode += 1
                    rew = 1
                else:
                    done = True

                replay_memory.store_effect(idx, best_action[0, 0], rew, done)

                #Let the agent explore randomly for REPLAY_START_SIZE steps
                if steps_done > cfg.replay_start_size:

                    if steps_done % 4 == 0:

                        nr_updates += 1
                        self.optimze_agent(q_network, target_q_network,
                                           optimizer, replay_memory)

                        if nr_updates % cfg.update_target_network == 0:
                            logger.write("Updated target network " +
                                         str(episode) + "\n")
                            print("Updated target network" + str(episode))
                            logger.flush()
                            target_q_network.load_state_dict(
                                q_network.state_dict())

                        plot_rewardEvaluation(cfg, self, nr_updates, q_network, logger)

                if done:
                    break

            if episode > 0 and episode % 50 == 0:
                #save replay_memory
                replay_memory.saveMemory()

                #save networks
                with open(cfg.q_network_path, "wb") as f:
                    torch.save(q_network.state_dict(), f)

                with open(cfg.target_network_path, "wb") as f:
                    torch.save(target_q_network.state_dict(), f)

                with open(cfg.extra_params_path, "wb") as f:
                    extra_params = [
                        episode, steps_done, nr_updates, self.reward_history
                    ]
                    pickle.dump(extra_params, f)
                logger.write("Saved networks and parameters after " +
                             str(episode) + "\n")
                logger.flush()

            statistics(episode, steps_done, nr_updates, reward_per_episode, eps_threshold, ep_steps)
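train delegates the gradient step to self.optimze_agent, which is not part of this snippet. A sketch of what a single DQN update typically looks like under these assumptions (the batching interface, discount factor and Huber loss are guesses, not the original code):

import torch
import torch.nn.functional as F

def dqn_update(q_network, target_q_network, optimizer, batch, gamma=0.99):
    # batch: float tensors states/next_states, a long tensor of actions,
    # and float tensors rewards/dones, all with a leading batch dimension.
    states, actions, rewards, next_states, dones = batch

    # Q(s, a) for the actions that were actually taken.
    q_values = q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrapped targets from the frozen target network.
    with torch.no_grad():
        next_q = target_q_network(next_states).max(dim=1).values
        targets = rewards + gamma * next_q * (1.0 - dones)

    # Huber loss is the usual choice for DQN; then take one optimizer step.
    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
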
Example #8
if __name__ == "__main__":
    arg_parser = ArgumentParser()

    arg_parser.add_argument('-c',
                            '--config-file',
                            default='configs/default.yaml',
                            type=str,
                            dest='config_file',
                            help='Default configuration file')

    args = arg_parser.parse_args()
    config_file = args.config_file
    cfg = read_cfg(config_file)

    env = FallingObjects(cfg)

    episode_r = []
    env.reset()
    for _ in range(1000):
        key = env.render()
        if key == "q":
            exit()
        elif key not in PLAYER_KEYS.keys():
            print(f"Unknown key: {key}")
            continue

        obs, r, done, _ = env.step(PLAYER_KEYS[key])  # take the action mapped to the pressed key
        episode_r.append(r)
        print(
            f"Reward: {r} (rewards gathered in last 100 steps: {sum(episode_r[-100:])})"
        )
Example #9
def train_dqn_model(args):
    action_size = max(ACTIONS.keys()) + 1
    env = FallingObjects(read_cfg(args.config_file))
    obs = env.reset()

    tf.reset_default_graph()

    with tf.Session() as sess:
        # Create and initialize the agent.
        agent = DQNAgent(action_size, training=True)
        agent.do_setup(args, obs, sess)

        # Tensorboard setup.
        writer = tf.summary.FileWriter("./logs")
        saver = tf.train.Saver()
        tf.summary.scalar("Loss", agent.dqn.loss)
        write_op = tf.summary.merge_all()

        # Now start learning.
        obs = env.reset()
        all_rewards = []

        # We first play a bit in order to explore the environment
        # and populate the experience buffer.
        for i in range(num_exploration_steps):
            action = agent.get_random_action()
            obs, reward, _, _ = env.step(action)
            all_rewards.append(reward)
            total_reward = sum(all_rewards[-args.stack_size:])
            # total_reward = reward
            agent.remember(obs, action, total_reward)

        all_rewards = []
        for step in range(args.num_train_steps):
            # Predict an action using an e-greedy policy, where the
            # probability of exploration is decaying in time.
            action, explore_prob = agent.predict_action(
                explore_prob_begin, explore_prob_min, decay_rate, step)

            # Apply the action and get the observation and reward from
            # the environment.
            obs, reward, _, _ = env.step(action)
            all_rewards.append(reward)

            # Save the current observation to see how the agent behaves.
            cv2.imwrite(str(step) + '.png', obs)

            # And make this part of the agent's experience.
            total_reward = sum(all_rewards[-args.stack_size:])
            agent.remember(obs, action, total_reward)
            print('Step %7d, total reward = %2d' % (step, total_reward))

            # Get a mini-batch from memory and train the net.
            mini_batch = agent.mem.sample(batch_size)
            states, actions, rewards, next_states = (list(elem)
                                                     for elem in zip(
                                                         *mini_batch))

            # Compute one-hot encodings for the actions.
            actions_one_hot = np.zeros((len(actions), action_size))
            actions_one_hot[np.arange(len(actions)), actions] = 1

            target_Qs = []

            # Q-values for the next states, predicted by the current network.
            next_Qs = agent.sess.run(
                agent.dqn.output, feed_dict={agent.dqn.inputs_: next_states})

            # Q target should be reward + gamma * maxQ(s', a')
            target_Qs = np.array([
                rewards[i] + args.discount_factor * np.max(next_Qs[i])
                for i in range(batch_size)
            ])

            loss, _ = agent.sess.run(
                [agent.dqn.loss, agent.dqn.optimizer],
                feed_dict={
                    agent.dqn.inputs_: states,
                    agent.dqn.target_Q: target_Qs,
                    agent.dqn.actions_: actions_one_hot
                })

            summary = sess.run(write_op,
                               feed_dict={
                                   agent.dqn.inputs_: states,
                                   agent.dqn.target_Q: target_Qs,
                                   agent.dqn.actions_: actions_one_hot
                               })

            writer.add_summary(summary, step)
            writer.flush()

            # Save the model every 10 steps.
            if step % 10 == 0:
                saver.save(sess, './models/' + args.model_name + '.ckpt')
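train_dqn_model treats agent.dqn as a TF1 graph exposing inputs_, actions_, target_Q, output, loss and optimizer. A minimal sketch of a graph with that interface (layer sizes and the learning rate are illustrative, not taken from the original DQNAgent):

import tensorflow as tf

class SimpleDQN:
    """Illustrative TF1 graph matching the attributes used above."""
    def __init__(self, state_shape, action_size, learning_rate=1e-4):
        self.inputs_ = tf.placeholder(tf.float32, [None, *state_shape], name="inputs")
        self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions")
        self.target_Q = tf.placeholder(tf.float32, [None], name="target")

        conv = tf.layers.conv2d(self.inputs_, 32, 8, strides=4, activation=tf.nn.relu)
        conv = tf.layers.conv2d(conv, 64, 4, strides=2, activation=tf.nn.relu)
        flat = tf.layers.flatten(conv)
        hidden = tf.layers.dense(flat, 256, activation=tf.nn.relu)
        self.output = tf.layers.dense(hidden, action_size)

        # Q-value of the chosen action, selected via the one-hot action mask.
        q_chosen = tf.reduce_sum(self.output * self.actions_, axis=1)
        self.loss = tf.reduce_mean(tf.square(self.target_Q - q_chosen))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
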
Example #10
    arg_parser.add_argument(
        '-a',
        '--agent',
        default='demo_agent+DemoAgent',
        type=str,
        dest='agent',
        help='The agent to test in format <module_name>+<class_name>')

    args = arg_parser.parse_args()
    config_file = args.config_file
    cfg = read_cfg(config_file)
    test_agent_name = args.agent.split("+")
    test_steps = cfg.test_steps
    test_agent = getattr(importlib.import_module(test_agent_name[0]),
                         test_agent_name[1])

    print(f"Testing agent {test_agent_name[1]}")

    env = FallingObjects(cfg)

    agent = test_agent(1 + max(ACTIONS.keys()))
    all_r = 0
    obs = env.reset()

    for _ in range(test_steps):
        action = agent.act(obs)
        obs, r, done, _ = env.step(action)  # take the agent's chosen action
        all_r += r

    print(f"Reward for {test_steps} steps: {all_r} ")