# --- end of the training loop for the SMDP option agent (agent_smdp) ---
        switch = True
        next_state = states[-1]
        if len(states) == 1:
            # this happens if the option was chosen in its termination state
            tdes = [0.]  # no update
        else:
            tdes = util.q_learning_update_option_sequence(
                gamma, alpha, agent_smdp.q_func, states, rewards,
                opt.identifier)
        tot_td += np.sum(tdes)
        reward_record.append(rewards)
        cur_state = next_state
        steps += len(states)
        if done:
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    ret = util.discounted_return(reward_record, gamma)
    greedy_steps, greedy_choices, greedy_ret, greedy_success = \
        util.switching_greedy_eval(agent_smdp, gamma, max_options, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_td / steps, ret, greedy_ret,
        greedy_success, greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_smdp, hist)
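# ---------------------------------------------------------------------------
# util.q_learning_update_option_sequence is defined elsewhere in the
# repository and is not shown here.  The function below is only a minimal,
# hypothetical sketch of an update of that shape: a one-step Q-learning
# update applied at every state visited while the option ran.  The tabular
# q_func[state, option] indexing, the greedy bootstrap from the successor
# state, and the list-of-TD-errors return value are assumptions for
# illustration, not the author's implementation.
# ---------------------------------------------------------------------------
import numpy as np


def q_learning_update_option_sequence_sketch(gamma, alpha, q_func, states,
                                             rewards, option):
    """Sketch: update Q(s, option) for each transition the option generated.

    states  -- [s_0, s_1, ..., s_T] visited while the option executed
    rewards -- [r_1, ..., r_T] received along the way (len(states) - 1 items)
    option  -- column index of the executed option in the tabular q_func
    Returns one TD error per transition.
    """
    td_errors = []
    for t in range(len(states) - 1):
        s, s_next, r = states[t], states[t + 1], rewards[t]
        # bootstrap from the greedy value over all options at the next state
        target = r + gamma * np.max(q_func[s_next])
        td = target - q_func[s, option]
        q_func[s, option] += alpha * td
        td_errors.append(td)
    return td_errors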
# --- end of the training loop for the replay-based Q agent (agent_q) ---
        rewards.append(reward)
        stp += 1

        # replay for agent
        if batch_size < len(agent_q.memory):
            err, _ = replay(batch_size)
            tot_td += err
            # stp += replay_stp

        cur_state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(
                itr, iterations, stp, epsilon))
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = \
        util.greedy_eval(agent_q, gamma, max_steps, 100)
    hist[itr, :] = np.array([
        prev_steps + stp, tot_td / stp,
        util.discounted_return(rewards, gamma), greedy_ret,
        greedy_success, greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_q, hist)
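# ---------------------------------------------------------------------------
# replay(batch_size) and agent_q are defined elsewhere in the repository.
# The sketch below shows one common shape for such a replay step: sample a
# minibatch of stored transitions and apply a Q-learning update to each.
# The (state, action, reward, next_state, done) layout of agent_q.memory,
# the tabular agent_q.q_func, and the (summed TD error, batch size) return
# value are assumptions for illustration, not the author's implementation.
# ---------------------------------------------------------------------------
import random

import numpy as np


def replay_sketch(agent, batch_size, gamma, alpha):
    """Sketch: one experience-replay pass over a sampled minibatch."""
    batch = random.sample(list(agent.memory), batch_size)
    total_td = 0.0
    for state, action, reward, next_state, done in batch:
        target = reward
        if not done:
            # bootstrap from the greedy value of the next state
            target += gamma * np.max(agent.q_func[next_state])
        td = target - agent.q_func[state, action]
        agent.q_func[state, action] += alpha * td
        total_td += abs(td)
    return total_td, len(batch)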
# --- end of the training loop for the plan-option agent (agent_plan) ---
    option_index = plan[0] * agent_plan.num_actions + plan[1]
    states, actions, rewards, done = env.step_plan(agent_plan.sebango)
    ret = util.discounted_return(rewards, gamma)

    # Append the discounted value of the subsequent options as a bonus at
    # the end of the first option's reward sequence
    rewards2 = rewards[0] + [
        util.discounted_return(rewards[1:], gamma) *
        gamma**(len(rewards[0]) + 1)
    ]
    steps += np.sum([len(s) for s in states])

    if actions[0] == [None]:
        # no valid action chosen
        states[0] = [cur_state, cur_state]  # no transition

    # update q-table
    tdes = util.q_learning_update_plan_options(
        gamma, alpha, agent_plan.q_func, states[0], rewards2, option_index)
    tot_tde += np.sum(tdes)

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = \
        util.greedy_eval(agent_plan, gamma, max_plans, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_tde / steps, ret, greedy_ret,
        greedy_success, greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_plan, hist)
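# ---------------------------------------------------------------------------
# util.discounted_return is defined elsewhere in the repository.  The sketch
# below shows one plausible implementation consistent with how it is called
# above: it accepts either a flat list of rewards or a list of per-option
# reward lists, flattens them in order, and returns sum_t gamma**t * r_t.
# The flattening behaviour is an assumption for illustration, not the
# author's implementation.
# ---------------------------------------------------------------------------
def discounted_return_sketch(rewards, gamma):
    """Sketch: discounted return of a flat or nested reward sequence."""
    flat = []
    for r in rewards:
        if isinstance(r, (list, tuple)):
            flat.extend(r)
        else:
            flat.append(r)
    return sum(gamma**t * r for t, r in enumerate(flat))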