Example #1
        # epsilon = np.max([0.1, 1. - itr / (iterations / 2.)])  # linear epsilon-decay
        opt = agent_smdp.pick_option_greedy_epsilon(cur_state, eps=epsilon)
        states, actions, rewards, done = env.step_option(
            opt, agent_smdp.sebango)
        next_state = states[-1]
        tdes = util.q_learning_update_option_sequence(
            gamma, alpha, agent_smdp.q_func, states, rewards, opt.identifier)
        tot_td += np.sum(tdes)
        reward_record.append(rewards)
        cur_state = next_state
        steps += len(states)
        if done:
            break
    prev_steps = hist[itr - 1, 0]
    ret = util.discounted_return(reward_record, gamma)
    # Alternative evaluation: swap util.greedy_eval below for util.switching_greedy_eval
    # greedy_steps, greedy_choices, greedy_ret, greedy_success = \
    #     util.switching_greedy_eval(agent_smdp, gamma, max_options, 100)
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_smdp, gamma, max_options, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_td / steps, ret, greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_smdp, hist)
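
The loop above relies on util.q_learning_update_option_sequence to back up the reward collected while an option ran. Its implementation is not shown on this page; the sketch below is one plausible SMDP-style version, under the assumptions that q_func is indexable as q_func[state][option] and that a backup is applied from every state visited during the option, so the returned per-state TD errors match the tot_td += np.sum(tdes) bookkeeping above.

import numpy as np

def q_learning_update_option_sequence(gamma, alpha, q_func, states, rewards, option_id):
    """Hypothetical SMDP Q-learning backup along an option's trajectory.

    states  : [s_0, ..., s_k] visited while the option executed
    rewards : [r_1, ..., r_k] received at each intra-option step
    Returns one TD error per updated start state.
    """
    tdes = []
    terminal = states[-1]
    for i in range(len(states) - 1):
        s = states[i]
        k = len(states) - 1 - i  # remaining duration of the option from s
        # discounted reward accumulated from s until the option terminates
        g = sum(gamma**j * r for j, r in enumerate(rewards[i:]))
        target = g + gamma**k * np.max(q_func[terminal])
        tde = target - q_func[s][option_id]
        q_func[s][option_id] += alpha * tde
        tdes.append(tde)
    return tdes

With a tabular q_func of shape (num_states, num_options) and integer state labels, this is a standard SMDP backup applied at each intermediate state; the actual util helper may differ.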
Example #2
        rewards.append(reward)
        stp += 1
        # replay for agent
        if batch_size < len(agent_q.memory):
            err, _ = replay(batch_size)
            tot_td += err
            # stp += replay_stp

        cur_state = next_state

        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(
                itr, iterations, stp, epsilon))
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_q, gamma, max_steps, 100)
    hist[itr, :] = np.array([
        prev_steps + stp, tot_td / stp,
        util.discounted_return(rewards, gamma), greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_q, hist)
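
The replay(batch_size) call above is presumably a closure or bound method over agent_q that samples stored transitions and returns a summed error plus a second, discarded value. The sketch below illustrates the mechanics with a tabular backup; agent.memory, agent.alpha, and a table-like agent.q_func are assumptions for illustration, and the agent is passed explicitly here rather than captured.

import random
import numpy as np

def replay(agent, batch_size, gamma=0.95):
    """Hypothetical experience-replay step: sample (s, a, r, s', done)
    tuples from agent.memory and apply a one-step Q-learning backup to
    each, assuming agent.q_func is indexable as q_func[state][action]."""
    batch = random.sample(agent.memory, batch_size)
    total_err = 0.0
    for state, action, reward, next_state, done in batch:
        target = reward
        if not done:
            target += gamma * np.max(agent.q_func[next_state])
        tde = target - agent.q_func[state][action]
        agent.q_func[state][action] += agent.alpha * tde
        total_err += abs(tde)
    return total_err, len(batch)

If the real agent uses a neural network instead of a table, the same loop would compute targets with the network and fit on the sampled batch; the sampling and target construction stay the same.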
Example #3
start_time = time.time()

for itr in range(iterations):
    initial_state = env.reset(random_placement=True)
    done = False
    # Since plan length is 2, each "action" is evaluated once (no replan) 
    plan = agent_plan.make_plan_epsilon_greedy(initial_state, epsilon=epsilon)
    option_index = plan[0] * agent_plan.num_actions + plan[1]
    states, actions, rewards, done = env.step_plan(agent_plan.sebango)
    ret = util.discounted_return(rewards, gamma)
    # Add in a bonus at the end of the first option for subsequent options
    rewards2 = rewards[0] + [util.discounted_return(rewards[1:], gamma)
                             * gamma**(len(rewards[0]) + 1)]
    steps = np.sum([len(s) for s in states])
    if actions[0] == [None]:  # no valid action chosen
        states[0] = [initial_state, initial_state]  # no transition

    # update q-table
    tdes = util.q_learning_update_plan_options(
        gamma, alpha, agent_plan.q_func, states[0], rewards2, option_index)
    tot_td = np.sum(tdes)
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_plan, gamma, 1, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_td / steps, ret, greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])
    
    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_plan, hist)
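
All three loops log util.discounted_return(...), and the rewards2 line above folds the discounted return of the later options into the first option's reward sequence. That helper is not shown on this page; the sketch below is an assumed implementation that accepts both a flat reward list (Example #2) and nested per-option reward lists (Examples #1 and #3) by flattening first.

import numpy as np

def discounted_return(rewards, gamma):
    """Hypothetical helper: gamma-discounted sum of a reward sequence.
    Nested per-option reward lists are flattened, so both
    [r0, r1, ...] and [[r0, r1], [r2, ...]] are accepted."""
    flat = []
    for r in rewards:
        if isinstance(r, (list, tuple, np.ndarray)):
            flat.extend(r)
        else:
            flat.append(r)
    return float(sum(gamma**t * r for t, r in enumerate(flat)))

For example, discounted_return([1.0, 0.0, 1.0], 0.9) evaluates to 1.0 + 0.0 + 0.81 = 1.81.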