        # epsilon = np.max([0.1, 1. - itr / (iterations / 2.)])  # linear epsilon decay
        opt = agent_smdp.pick_option_greedy_epsilon(cur_state, eps=epsilon)
        states, actions, rewards, done = env.step_option(opt, agent_smdp.sebango)
        next_state = states[-1]
        # one Q-update per state the option passed through, credited to the option
        tdes = util.q_learning_update_option_sequence(gamma, alpha,
                                                      agent_smdp.q_func, states,
                                                      rewards, opt.identifier)
        tot_td += np.sum(tdes)
        reward_record.append(rewards)
        cur_state = next_state
        steps += len(states)
        if done:
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    ret = util.discounted_return(reward_record, gamma)
    # NOTE: swap in util.switching_greedy_eval to evaluate with option switching:
    # greedy_steps, greedy_choices, greedy_ret, greedy_success = \
    #     util.switching_greedy_eval(agent_smdp, gamma, max_options, 100)
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_smdp, gamma, max_options, 100)
    hist[itr, :] = np.array([prev_steps + steps, tot_td / steps, ret,
                             greedy_ret, greedy_success, greedy_steps,
                             greedy_choices])
    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_smdp, hist)
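# For reference, a minimal sketch of what a helper like
# util.q_learning_update_option_sequence is assumed to do, inferred only from
# the call site above: run a one-step tabular Q-learning update at every state
# the option visited, crediting the option's own slot in the Q-table, and
# return the per-transition TD errors. The Q-table layout and the alignment of
# rewards to transitions are assumptions, not the actual util implementation.
def q_learning_update_option_sequence(gamma, alpha, q_func, states, rewards, option_id):
    tdes = []
    for k in range(len(states) - 1):
        s, s_next, r = states[k], states[k + 1], rewards[k]
        target = r + gamma * np.max(q_func[s_next])  # bootstrap on best next value
        tde = target - q_func[s][option_id]          # TD error for this transition
        q_func[s][option_id] += alpha * tde          # tabular update
        tdes.append(tde)
    return tdes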
        rewards.append(reward)
        stp += 1
        # replay for agent: train on a minibatch once enough memory has accumulated
        if batch_size < len(agent_q.memory):
            err, _ = replay(batch_size)
            tot_td += err
            # stp += replay_stp
        cur_state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(
                itr, iterations, stp, epsilon))
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_q, gamma, max_steps, 100)
    hist[itr, :] = np.array([prev_steps + stp, tot_td / stp,
                             util.discounted_return(rewards, gamma),
                             greedy_ret, greedy_success, greedy_steps,
                             greedy_choices])
    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_q, hist)
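# A minimal sketch of the return computation used above. In this loop rewards
# is a flat list of floats, but the SMDP loop feeds the same helper a list of
# per-option reward lists, so the real util.discounted_return presumably
# flattens its input first; handling both cases here is an assumption:
def discounted_return(rewards, gamma):
    flat = [r for item in rewards
            for r in (item if isinstance(item, list) else [item])]
    return sum(r * gamma**t for t, r in enumerate(flat))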
start_time = time.time()
for itr in range(iterations):
    initial_state = env.reset(random_placement=True)
    done = False
    # Since the plan length is 2, each "action" is evaluated once (no replanning)
    plan = agent_plan.make_plan_epsilon_greedy(initial_state, epsilon=epsilon)
    option_index = plan[0] * agent_plan.num_actions + plan[1]
    states, actions, rewards, done = env.step_plan(plan, agent_plan.sebango)
    ret = util.discounted_return(rewards, gamma)
    # Append a terminal bonus to the first option's rewards that stands in for
    # the discounted return of the subsequent options
    rewards2 = rewards[0] + [util.discounted_return(rewards[1:], gamma)
                             * gamma**(len(rewards[0]) + 1)]
    steps = np.sum([len(s) for s in states])
    if actions[0] == [None]:  # no valid action chosen
        states[0] = [initial_state, initial_state]  # no transition
    # update the q-table along the first option's trajectory
    tdes = util.q_learning_update_plan_options(gamma, alpha,
                                               agent_plan.q_func, states[0],
                                               rewards2, option_index)
    tot_td = np.sum(tdes)

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_plan, gamma, 1, 100)
    hist[itr, :] = np.array([prev_steps + steps, tot_td / steps, ret,
                             greedy_ret, greedy_success, greedy_steps,
                             greedy_choices])
    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_plan, hist)
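# A small worked check of the rewards2 shaping above, with hypothetical
# numbers (gamma = 0.9, two options of 3 and 2 steps): the later options'
# return is discounted back past the length of the first option, so a single
# terminal bonus stands in for everything that happens after option 1.
gamma_demo = 0.9
rewards_demo = [[0.0, 0.0, 1.0], [0.0, 2.0]]  # per-option reward lists
tail = sum(r * gamma_demo**t for t, r in enumerate(rewards_demo[1]))  # = 1.8
bonus = tail * gamma_demo**(len(rewards_demo[0]) + 1)  # 1.8 * 0.9**4 ~= 1.181
rewards2_demo = rewards_demo[0] + [bonus]              # [0.0, 0.0, 1.0, 1.181]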