def qlearning(self):
    cfg = read_cfg(self.config_file)
    all_scores = []
    for train_ep in range(self.epochs):
        score = 0
        env = FallingObjects(cfg)
        obs = env.reset()
        state, _ = self.get_state(obs)
        for i in range(self.moves_per_epoch):
            actions = self.actions
            action = self.epsilon_greedy(self.Q, state, actions, self.epsilon)
            obs, r, done, _ = env.step(action)
            # get_state returns the processed state together with a reward
            # derived from the observation, which replaces the raw env reward.
            statep, r = self.get_state(obs)
            score += r

            # Tabular Q-learning update:
            # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a)).
            maximum = max(self.Q.get((statep, a), 0) for a in self.actions)
            self.Q[(state, action)] = self.Q.get((state, action), 0) + self.learning_rate * (
                r + self.discount * maximum - self.Q.get((state, action), 0))
            state = statep

            # Slowly decay the exploration rate.
            if self.epsilon > self.epsilon_min:
                self.epsilon *= 0.99999

        print("Epoch: {}; Score: {}; Epsilon: {}".format(train_ep, score, self.epsilon))
        all_scores.append(score)
        if train_ep % 200 == 0 and train_ep > 0:
            self.save_q()
            print("Mean score for the last 200 epochs: {}".format(
                np.average(all_scores[-200:])))
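The epsilon_greedy helper used above is not shown in this section. A minimal sketch of what it could look like in the tabular case, assuming Q is a dictionary keyed by (state, action) pairs with unseen pairs defaulting to 0, as in the update rule above:

import random

def epsilon_greedy(self, Q, state, actions, epsilon):
    # With probability epsilon explore with a uniformly random action,
    # otherwise exploit the greedy action under the current Q-table.
    # Hypothetical sketch; the actual helper is defined elsewhere.
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q.get((state, a), 0))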
def qlearning(self):
    cfg = read_cfg(self.config_file)
    all_scores = []
    for train_ep in range(self.epochs):
        # Act fully at random for the first 10 episodes, then drop epsilon once.
        if train_ep <= 10:
            self.epsilon = 1
        elif not self.done_pre:
            self.done_pre = True
            self.epsilon = 0.25

        score = 0
        env = FallingObjects(cfg)
        obs = env.reset()
        obs, _ = self.get_state(obs)

        # Stack the last frame_size frames so the state preserves the motion
        # of the falling objects.
        stack_frame = deque([obs for _ in range(self.frame_size)],
                            maxlen=self.frame_size)
        state, stack_frame = self.get_frame(stack_frame, obs)
        state = np.reshape(state, [1, 1, self.frame_size, 86, 86])

        for i in range(self.moves_per_epoch):
            actions = self.actions
            action = self.epsilon_greedy(state, actions, self.epsilon)
            obs, r, done, _ = env.step(actions[action])
            obs, r = self.get_state(obs)
            print("Move: {}; action: {}; reward: {}; epsilon: {}".format(
                i, actions[action], r, self.epsilon))

            statep, stack_frame = self.get_frame(stack_frame, obs)
            statep = np.reshape(statep, [1, 1, self.frame_size, 86, 86])
            score += r

            # Store the transition and learn from a sampled mini-batch.
            self.memory.append((state, action, r, statep))
            state = statep
            if train_ep > 10:
                self.replay()

        print("Episode: {}; score: {}".format(train_ep, score))
        all_scores.append(score)
        if train_ep % 20 == 0 and train_ep > 0:
            print("Mean score for the last 20 episodes: {}".format(
                np.average(all_scores[-20:])))
            torch.save(self.model, 'model.pt')
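self.replay() above, which fits the network on transitions sampled from self.memory, is not reproduced here. A minimal sketch of such an experience-replay update, assuming a PyTorch self.model that maps a stacked-frame state to one Q-value per action, plus hypothetical self.optimizer, self.batch_size and self.discount attributes:

import random
import torch

def replay(self):
    # Wait until enough transitions have been collected to form a mini-batch.
    if len(self.memory) < self.batch_size:
        return
    for state, action, r, statep in random.sample(self.memory, self.batch_size):
        state_t = torch.as_tensor(state, dtype=torch.float32)
        statep_t = torch.as_tensor(statep, dtype=torch.float32)
        # Bellman target: r + gamma * max_a' Q(s', a').
        with torch.no_grad():
            target = r + self.discount * self.model(statep_t).max().item()
        # Squared TD error on the Q-value of the action that was taken.
        q_sa = self.model(state_t).view(-1)[action]
        loss = (q_sa - target) ** 2
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()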
def qlearning(self):
    cfg = read_cfg(self.config_file)
    all_scores = []
    for train_ep in range(self.epochs):
        # Keep epsilon low for the first 10 episodes, then raise it once.
        if train_ep <= 10:
            self.epsilon = 0.02
        elif not self.done_pre:
            self.done_pre = True
            self.epsilon = 0.6

        score = 0
        env = FallingObjects(cfg)
        obs = env.reset()
        obs, _ = self.get_state(obs)
        state = obs

        for i in range(self.moves_per_epoch):
            actions = self.actions
            action = self.epsilon_greedy(state, actions, self.epsilon)
            obs, r, done, _ = env.step(actions[action])
            obs, r = self.get_state(obs)
            statep = obs
            score += r

            # Store the transition and learn from a sampled mini-batch.
            self.memory.append((state, action, r, statep))
            state = statep
            if train_ep > 0:
                self.replay()

        print("Epoch: {}; Score: {}; Epsilon: {}".format(
            train_ep, score, self.epsilon))
        all_scores.append(score)
        if train_ep % 200 == 0:
            self.model.save('configs/model.h5')
            print("Mean score for the last 200 epochs: {}".format(
                np.average(all_scores[-200:])))
def evalAgent(self, nr_episodes, q_network):
    replay_memory = ReplayBuffer(5000, 4)
    episode = 0
    episodes_score = []
    while episode < nr_episodes:
        print(episode)
        episode += 1
        reward_per_episode = 0
        env = FallingObjects(self.initial_cfg)
        obs = env.reset()
        while True:
            # Pre-process the observation into an 84 x 84 x 1 image.
            current_frame = processFrame(obs)
            idx = replay_memory.store_frame(current_frame)
            # Get the last 4 frames stacked together to forward through the network.
            current_state = replay_memory.encode_recent_observation()
            best_action = self.eval_greedyPolicy(current_state, q_network)
            obs, rew, done, _ = env.step(best_action[0, 0].item() + 2)
            if rew == 0:
                # Surviving a step counts as +1; the first non-zero reward ends
                # the evaluation episode.
                reward_per_episode += 1
                rew = 1
                replay_memory.store_effect(idx, best_action[0, 0], rew, done)
            else:
                done = True
                replay_memory.store_effect(idx, best_action[0, 0], rew, done)
                break
        episodes_score.append(reward_per_episode)
    mean_score = sum(episodes_score) / float(nr_episodes)
    return mean_score, episodes_score
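The eval_greedyPolicy helper used above is not shown in this section. A minimal sketch of a purely greedy evaluation policy under the same conventions (a PyTorch q_network and a 1x1 action tensor so that best_action[0, 0] indexes the chosen action); the input scaling and tensor layout are assumptions:

import torch

def eval_greedyPolicy(self, current_state, q_network):
    # Greedy (epsilon = 0) action selection for evaluation: forward the stacked
    # frames through the network and pick the action with the highest Q-value.
    # Assumes encode_recent_observation() returns a uint8 NumPy array in the
    # layout the network expects; scaling to [0, 1] is an assumption.
    state = torch.from_numpy(current_state).float().unsqueeze(0) / 255.0
    with torch.no_grad():
        q_values = q_network(state)
    return q_values.max(1)[1].view(1, 1)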
print(f"Testing agent {test_agent_name[1]}") env = FallingObjects(cfg) #agent = test_agent(max(ACTIONS.keys())) # Dueling Deep Q-Learning Agent agent = DDQNAgent() all_r = 0 obs = env.reset() # In lieu of having a state comprised of a single observation, we stack the last 3 images # at any given time in order to create a state, as suggested in DeepMind's DQN paper; # we do this in order to preserve the movement of the falling objects. s1, _, r1, _ = env.step(0) s2, _, r2, _ = env.step(0) s3, _, r3, _ = env.step(0) all_r += (r1 + r2 + r3) curr_obs = [s1, s2, s3] # Lambda function to reshape, convert to grayscale and stack the images in our observation list. make_obs = lambda obs_list: np.stack( (cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY).reshape((1, 86, 86)) for obs in obs_list), axis=3) for i in range(test_steps): # curr_obs is a list of the last 3 frames action = agent.act(make_obs(curr_obs))
def train(self, env, q_network, optimzer, logger, reloaded=False):
    cfg = self.cfg
    steps_done = 0
    nr_updates = 0
    episode = 0

    # The target network starts as a copy of the online Q-network.
    target_q_network = MyDQN(self.nr_acts)
    target_q_network.load_state_dict(q_network.state_dict())
    replay_memory = ReplayBuffer(cfg.replay_memory_size, cfg.agent_history_lenght)

    # Optionally resume training from saved networks, replay memory and counters.
    if reloaded:
        replay_memory.loadMemory()
        with open(cfg.q_network_path, "rb") as f:
            q_network.load_state_dict(torch.load(f))
        with open(cfg.target_network_path, "rb") as f:
            target_q_network.load_state_dict(torch.load(f))
        with open(cfg.extra_params_path, "rb") as f:
            episode, steps_done, nr_updates, self.reward_history = pickle.load(f)

    target_q_network.train(False)
    if self.use_cuda:
        q_network.cuda()
        target_q_network.cuda()

    while episode < cfg.nr_episodes:
        episode += 1
        reward_per_episode = 0
        env = FallingObjects(self.initial_cfg)
        obs = env.reset()
        ep_steps = 0
        while True:
            steps_done += 1
            ep_steps += 1
            # Pre-process the observation into an 84 x 84 x 1 image.
            current_frame = processFrame(obs)
            idx = replay_memory.store_frame(current_frame)
            # Get the last 4 frames stacked together to forward through the network.
            current_state = replay_memory.encode_recent_observation()
            eps_threshold, best_action = self.greedyPolicy(
                current_state, q_network, steps_done)
            obs, rew, done, _ = env.step(best_action[0, 0].item() + 2)
            if rew == 0:
                # Surviving a step counts as +1; any non-zero reward ends the episode.
                reward_per_episode += 1
                rew = 1
            else:
                done = True
            replay_memory.store_effect(idx, best_action[0, 0], rew, done)

            # Let the agent explore for replay_start_size steps before learning.
            if steps_done > cfg.replay_start_size:
                if steps_done % 4 == 0:
                    nr_updates += 1
                    self.optimze_agent(q_network, target_q_network, optimzer,
                                       replay_memory)
                    # Periodically refresh the frozen target network and evaluate.
                    if nr_updates % cfg.update_target_network == 0:
                        logger.write("Updated target network " + str(episode) + "\n")
                        print("Updated target network " + str(episode))
                        logger.flush()
                        target_q_network.load_state_dict(q_network.state_dict())
                        plot_rewardEvaluation(cfg, self, nr_updates, q_network, logger)
            if done:
                break

        # Checkpoint the replay memory, networks and counters every 50 episodes.
        if episode > 0 and episode % 50 == 0:
            replay_memory.saveMemory()
            with open(cfg.q_network_path, "wb") as f:
                torch.save(q_network.state_dict(), f)
            with open(cfg.target_network_path, "wb") as f:
                torch.save(target_q_network.state_dict(), f)
            with open(cfg.extra_params_path, "wb") as f:
                extra_params = [episode, steps_done, nr_updates, self.reward_history]
                pickle.dump(extra_params, f)
            logger.write("Saved networks and parameters after " + str(episode) + "\n")
            logger.flush()

        statistics(episode, steps_done, nr_updates, reward_per_episode,
                   eps_threshold, ep_steps)
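The optimze_agent call above is not reproduced in this section. As a rough sketch of the update it stands for, here is a single DQN optimization step against the frozen target network; the batch arguments, hyperparameters and Huber loss are assumptions, not the author's exact implementation, and sampling from the replay memory is left out:

import torch
import torch.nn.functional as F

def dqn_update_step(q_network, target_q_network, optimizer,
                    states, actions, rewards, next_states, dones, gamma=0.99):
    # states/next_states: float tensors of stacked frames, shape (B, 4, 84, 84);
    # actions: int64 tensor of shape (B,); rewards, dones: float tensors of shape (B,).
    # These are assumed to have been sampled from the replay memory beforehand.

    # Q(s, a) for the actions that were actually taken.
    q_values = q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bellman target r + gamma * max_a' Q_target(s', a'), with no bootstrapping
    # past terminal transitions; the target network is only refreshed every
    # update_target_network updates, as in the loop above.
    with torch.no_grad():
        next_q = target_q_network(next_states).max(1)[0]
    targets = rewards + gamma * next_q * (1.0 - dones)

    # Huber loss between predicted and target Q-values.
    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()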
arg_parser = ArgumentParser()
arg_parser.add_argument('-c',
                        '--config-file',
                        default='configs/default.yaml',
                        type=str,
                        dest='config_file',
                        help='Path to the configuration file')
args = arg_parser.parse_args()
config_file = args.config_file

cfg = read_cfg(config_file)
env = FallingObjects(cfg)

episode_r = []
env.reset()
for _ in range(1000):
    key = env.render()
    if key == "q":
        exit()
    elif key not in PLAYER_KEYS.keys():
        print(f"Unknown key: {key}")
        continue
    # Take the action mapped to the pressed key.
    obs, r, done, _ = env.step(PLAYER_KEYS[key])
    episode_r.append(r)
    print(f"Reward: {r} (rewards gathered in last 100 steps: {sum(episode_r[-100:])})")
def train_dqn_model(args):
    action_size = max(ACTIONS.keys()) + 1
    env = FallingObjects(read_cfg(args.config_file))
    obs = env.reset()

    tf.reset_default_graph()
    with tf.Session() as sess:
        # Create and initialize the agent.
        agent = DQNAgent(action_size, training=True)
        agent.do_setup(args, obs, sess)

        # Tensorboard setup.
        writer = tf.summary.FileWriter("./logs")
        saver = tf.train.Saver()
        tf.summary.scalar("Loss", agent.dqn.loss)
        write_op = tf.summary.merge_all()

        # Now start learning.
        obs = env.reset()
        all_rewards = []

        # We first play a bit in order to explore the environment
        # and populate the experience buffer.
        for i in range(num_exploration_steps):
            action = agent.get_random_action()
            obs, reward, _, _ = env.step(action)
            all_rewards.append(reward)
            total_reward = sum(all_rewards[-args.stack_size:])
            agent.remember(obs, action, total_reward)

        all_rewards = []
        for step in range(args.num_train_steps):
            # Predict an action using an e-greedy policy, where the
            # probability of exploration decays over time.
            action, explore_prob = agent.predict_action(explore_prob_begin,
                                                        explore_prob_min,
                                                        decay_rate, step)

            # Apply the action and get the observation and reward from
            # the environment.
            obs, reward, _, _ = env.step(action)
            all_rewards.append(reward)

            # Save the current observation to see how the agent behaves.
            cv2.imwrite(str(step) + '.png', obs)

            # And make this part of the agent's experience.
            total_reward = sum(all_rewards[-args.stack_size:])
            agent.remember(obs, action, total_reward)
            print('Step %7d, total reward = %2d' % (step, total_reward))

            # Get a mini-batch from memory and train the net.
            mini_batch = agent.mem.sample(batch_size)
            states, actions, rewards, next_states = (list(elem)
                                                     for elem in zip(*mini_batch))

            # Compute one-hot encodings for the actions.
            actions_one_hot = np.zeros((len(actions), action_size))
            actions_one_hot[np.arange(len(actions)), actions] = 1

            # Q values for the next states, predicted by the current network.
            next_Qs = agent.sess.run(agent.dqn.output,
                                     feed_dict={agent.dqn.inputs_: next_states})

            # Q target should be reward + gamma * maxQ(s', a').
            target_Qs = np.array([
                rewards[i] + args.discount_factor * np.max(next_Qs[i])
                for i in range(batch_size)
            ])

            loss, _ = agent.sess.run(
                [agent.dqn.loss, agent.dqn.optimizer],
                feed_dict={
                    agent.dqn.inputs_: states,
                    agent.dqn.target_Q: target_Qs,
                    agent.dqn.actions_: actions_one_hot
                })

            summary = sess.run(write_op,
                               feed_dict={
                                   agent.dqn.inputs_: states,
                                   agent.dqn.target_Q: target_Qs,
                                   agent.dqn.actions_: actions_one_hot
                               })
            writer.add_summary(summary, step)
            writer.flush()

            # Save the model every 10 steps.
            if step % 10 == 0:
                saver.save(sess, './models/' + args.model_name + '.ckpt')
arg_parser.add_argument(
    '-a',
    '--agent',
    default='demo_agent+DemoAgent',
    type=str,
    dest='agent',
    help='The agent to test, in the format <module_name>+<class_name>')
args = arg_parser.parse_args()
config_file = args.config_file

cfg = read_cfg(config_file)
test_agent_name = args.agent.split("+")
test_steps = cfg.test_steps
test_agent = getattr(importlib.import_module(test_agent_name[0]),
                     test_agent_name[1])

print(f"Testing agent {test_agent_name[1]}")

env = FallingObjects(cfg)
agent = test_agent(1 + max(ACTIONS.keys()))

all_r = 0
obs = env.reset()
for _ in range(test_steps):
    # Take the action chosen by the agent.
    action = agent.act(obs)
    obs, r, done, _ = env.step(action)
    all_r += r
print(f"Reward for {test_steps} steps: {all_r}")