Example #1
                        switch = True
        next_state = states[-1]
        if len(states) == 1:
            # this happens if the option was chosen in its termination state
            tdes = [0.]  # no update
        else:
            tdes = util.q_learning_update_option_sequence(
                gamma, alpha, agent_smdp.q_func, states, rewards, opt.identifier)
        tot_td += np.sum(tdes)
        reward_record.append(rewards)
        cur_state = next_state
        steps += len(states)
        if done:
            break
    prev_steps = hist[itr - 1, 0]
    ret = util.discounted_return(reward_record, gamma)
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.switching_greedy_eval(
        agent_smdp, gamma, max_options, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_td / steps, ret, greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_smdp, hist)
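The per-option TD updates above are delegated to util.q_learning_update_option_sequence. A minimal tabular sketch of what such a helper could do, assuming q_func is a NumPy array indexed by [state, option] and rewards[t] is the reward for the transition states[t] -> states[t + 1] (the actual implementation in the source may differ):

# Hypothetical helper, for illustration only.
import numpy as np

def q_learning_update_option_sequence(gamma, alpha, q_func, states, rewards, option):
    tdes = []
    for t in range(len(states) - 1):
        s, s_next, r = states[t], states[t + 1], rewards[t]
        target = r + gamma * np.max(q_func[s_next])  # bootstrap on the best option in s_next
        tde = target - q_func[s, option]             # temporal-difference error
        q_func[s, option] += alpha * tde             # one-step Q-learning backup
        tdes.append(tde)
    return tdes

Returning the list of TD errors matches how the loop above sums them into tot_td.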
Example #2
        rewards.append(reward)
        stp += 1
        # replay for agent
        if batch_size < len(agent_q.memory):
            err, _ = replay(batch_size)
            tot_td += err
            # stp += replay_stp

        cur_state = next_state

        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(
                itr, iterations, stp, epsilon))
            break

    # record results for this iteration
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_q, gamma, max_steps, 100)
    hist[itr, :] = np.array([
        prev_steps + stp, tot_td / stp,
        util.discounted_return(rewards, gamma), greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_q, hist)
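The loop calls replay(batch_size), which is defined elsewhere in the source and returns a TD-error summary as its first value. A minimal sketch of such a replay step, assuming a tabular agent that exposes memory (a list of (state, action, reward, next_state, done) tuples) and q_func (a NumPy array indexed by [state, action]); the explicit agent, gamma, and alpha parameters are assumptions of this sketch, not the source's signature:

# Hypothetical sketch of an experience-replay step, for illustration only.
import random
import numpy as np

def replay(agent, batch_size, gamma, alpha):
    batch = random.sample(agent.memory, batch_size)  # uniform minibatch of stored transitions
    total_error = 0.0
    for state, action, reward, next_state, done in batch:
        target = reward if done else reward + gamma * np.max(agent.q_func[next_state])
        tde = target - agent.q_func[state, action]   # temporal-difference error
        agent.q_func[state, action] += alpha * tde   # move the estimate toward the target
        total_error += abs(tde)
    return total_error, len(batch)

The (error, _) unpacking in the loop above only uses the first return value.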
Example #3
        option_index = plan[0] * agent_plan.num_actions + plan[1]
        states, actions, rewards, done = env.step_plan(agent_plan.sebango)
        ret = util.discounted_return(rewards, gamma)
        # Append the discounted return of the subsequent options as a
        # terminal bonus on the first option's reward sequence
        rewards2 = rewards[0] + [
            util.discounted_return(rewards[1:], gamma) *
            gamma**(len(rewards[0]) + 1)
        ]
        steps += np.sum([len(s) for s in states])
        if actions[0] == [None]:  # no valid action chosen
            states[0] = [cur_state, cur_state]  # no transition

        # update q-table
        tdes = util.q_learning_update_plan_options(
            gamma, alpha, agent_plan.q_func, states[0], rewards2, option_index)
        tot_tde += np.sum(tdes)
    prev_steps = hist[itr - 1, 0]
    greedy_steps, greedy_choices, greedy_ret, greedy_success = util.greedy_eval(
        agent_plan, gamma, max_plans, 100)
    hist[itr, :] = np.array([
        prev_steps + steps, tot_tde / steps, ret, greedy_ret, greedy_success,
        greedy_steps, greedy_choices
    ])

    if itr % report_freq == 0:  # evaluation
        print("Itr %i # Average reward: %.2f" % (itr, hist[itr, 3]))

print("DONE. ({} seconds elapsed)".format(time.time() - start_time))
util.plot_and_pickle(env, agent_plan, hist)
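All three examples rely on util.discounted_return, which is called both with a flat list of rewards (Example #2) and with a list of per-option reward lists (Examples #1 and #3). A minimal sketch consistent with both call sites, assuming nested input is flattened before discounting over primitive time steps (the source's implementation may differ):

# Hypothetical sketch of a discounted-return helper, for illustration only.
def discounted_return(rewards, gamma):
    if rewards and isinstance(rewards[0], (list, tuple)):
        # flatten per-option reward lists into one primitive-step sequence
        rewards = [r for segment in rewards for r in segment]
    return sum(gamma ** t * r for t, r in enumerate(rewards))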