def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2, output=4)  # 4 actions output: up, right, down, left
    replay_buffer = ReplayBuffer()

    # Play with a random policy and see
    # run_current_policy(env.env, policy)

    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0

        env.__init__()  # todo : the first current state should be 0
        cur_state = env.cur_state

        counter = 0
        done = False
        while not done:
            # Let each episode be of 30 steps
            counter += 1
            done = counter >= 30  # todo : check if this line is working

            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to the replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample a minibatch of transitions from the replay buffer;
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()

    return policy.q_model
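The loop above relies on a ReplayBuffer with add and sample methods, where sample must return a dict whose keys match the parameters of DQNPolicy.update_policy (it is unpacked with **sample_transitions). A minimal sketch of such a buffer follows; the capacity, batch size, and key names (states, actions, next_states, rewards, dones) are assumptions for illustration, not taken from the original code.

import random
import numpy as np

class ReplayBuffer:
    """Fixed-size store of (state, action, next_state, reward, done) transitions (sketch)."""

    def __init__(self, capacity=10000, batch_size=32):
        self.capacity = capacity      # assumed maximum number of stored transitions
        self.batch_size = batch_size  # assumed minibatch size
        self.buffer = []

    def add(self, cur_state, action, next_state, reward, done):
        # Drop the oldest transition once the buffer is full
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((cur_state, action, next_state, reward, done))

    def sample(self):
        # Sample with replacement so this also works before batch_size transitions exist
        batch = random.choices(self.buffer, k=min(self.batch_size, len(self.buffer)))
        states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        # The key names below are assumptions; they must match the parameter names
        # of DQNPolicy.update_policy, since the training loop calls
        # update_policy(**sample_transitions)
        return {'states': states, 'actions': actions, 'next_states': next_states,
                'rewards': rewards, 'dones': dones}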
while not done:
    # select action
    action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)

    # take action in the environment
    next_state, reward, done, info = cp_env.step(action)

    # add the transition to the replay buffer
    replay_buffer.add(cur_state, action, next_state, reward, done)

    # sample a minibatch of transitions from the replay buffer;
    # the sampling is done every timestep and not every episode
    sample_transitions = replay_buffer.sample()

    # update the policy using the sampled transitions
    cp_policy.update_policy(**sample_transitions)

    episode_reward += reward
    episode_timestep += 1
    cur_state = next_state

avg_reward += episode_reward
avg_timestep += episode_timestep

if (episode_i + 1) % agg_interval == 0:
    cp_avg_history['episodes'].append(episode_i + 1)
    cp_avg_history['timesteps'].append(avg_timestep / float(agg_interval))
    cp_avg_history['reward'].append(avg_reward / float(agg_interval))
    avg_timestep = 0
    avg_reward = 0.0
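This CartPole fragment refers to setup that is not shown here (cp_env, cp_policy, cp_epsilon, agg_interval, cp_avg_history, and the outer episode loop). A minimal sketch of that scaffolding is below, assuming the classic Gym API in which reset() returns the initial observation and step() returns (next_state, reward, done, info) as in the loop above; the hyperparameter values and episode budget are placeholders.

import gym

train_episodes = 1000   # hypothetical episode budget
cp_epsilon = 0.1        # exploration rate (assumed fixed here)
agg_interval = 100      # aggregate statistics every 100 episodes

cp_env = gym.make('CartPole-v0')
# CartPole has a 4-dimensional observation and 2 discrete actions;
# the lr and gamma values are illustrative, not from the original code
cp_policy = DQNPolicy(cp_env, lr=0.001, gamma=0.99, input=4, output=2)
replay_buffer = ReplayBuffer()

cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
avg_reward, avg_timestep = 0.0, 0

for episode_i in range(train_episodes):
    episode_reward, episode_timestep = 0.0, 0
    cur_state = cp_env.reset()  # classic Gym API: reset() returns the observation
    done = False
    # ... the `while not done:` loop shown above runs here ...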
    env.render()

    # Keep track of max position
    if next_state[0] > max_position:
        max_position = next_state[0]

    # Adjust reward for task completion
    if next_state[0] >= 0.5:
        reward += 10

    replay_buffer.add(cur_state, action, next_state, reward, done)

    # TODO : Change the sample size and check any improvements
    sampled_transitions = replay_buffer.sample()

    # the Q update occurs for all transitions in all episodes, just like TD updates
    env_policy.update_policy(**sampled_transitions)

    ep_reward += reward
    ep_timesteps += 1
    cur_state = next_state

history['reward'].append(ep_reward)
history['timesteps'].append(ep_timesteps)
history['episodes'].append(episode + 1)

if episode % mod_episode == 0:
    print('Epoch : {} Success : {} Avg Reward : {} Timesteps : {} Max position : {}'.format(
        episode, max_position >= 0.5, history['reward'][-1],
        history['timesteps'][-1], max_position))

# decay the epsilon after every episode
epsilon -= epsilon_decay
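The MountainCar fragment likewise depends on setup outside the snippet (env_policy, max_position, mod_episode, epsilon, epsilon_decay, history, and the outer episode loop). One possible arrangement is sketched below; all numeric values are illustrative, and the linear epsilon schedule is an assumption consistent with the per-episode `epsilon -= epsilon_decay` decay above.

import gym

env = gym.make('MountainCar-v0')
# MountainCar has a 2-dimensional state (position, velocity) and 3 discrete actions;
# the lr and gamma values are illustrative, not from the original code
env_policy = DQNPolicy(env, lr=0.001, gamma=0.99, input=2, output=3)
replay_buffer = ReplayBuffer()

num_episodes = 500                       # hypothetical episode budget
mod_episode = 10                         # print progress every 10 episodes
epsilon = 1.0                            # start fully exploratory
epsilon_decay = epsilon / num_episodes   # assumed linear decay towards 0 over training
max_position = -1.2                      # lowest possible car position in MountainCar
history = {'episodes': [], 'timesteps': [], 'reward': []}

for episode in range(num_episodes):
    cur_state = env.reset()  # classic Gym API: reset() returns the observation
    ep_reward, ep_timesteps = 0.0, 0
    done = False
    # ... the `while not done:` body shown above runs here ...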