Пример #1
0
def main(lr, epsilon, gamma, decay_lr, decay_epsilon, modelfile, *,
         seed=42, num_epochs=500000, max_steps=60, print_freq=10000):
    """Train an ``Agent`` on ``GridWorld`` and pickle the result to disk.

    Each epoch runs one episode: the agent picks an action via
    ``agent.Pi``, the environment is stepped, and the agent is updated
    unless the environment reports an interrupt for that step.  Every
    ``print_freq`` epochs the logger emits a report and ``agent.decay()``
    is called.  After the last epoch the agent object is pickled to
    ``modelfile``.

    Args:
        lr: Learning rate, forwarded to ``Agent``.
        epsilon: Exploration parameter, forwarded to ``Agent``.
        gamma: Discount factor, forwarded to ``Agent``.
        decay_lr: Learning-rate decay setting, forwarded to ``Agent``.
        decay_epsilon: Epsilon decay setting, forwarded to ``Agent``.
        modelfile: Path the trained agent is pickled to (opened in
            binary write mode, so an existing file is overwritten).
        seed: Seed passed to ``np.random.seed``.
        num_epochs: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        print_freq: Number of epochs between log reports / decay calls;
            also passed to ``Logger``.

    Note:
        ``print_traj`` is read as a name from the enclosing module scope;
        it is not defined in this function.
    """
    np.random.seed(seed)
    env = GridWorld()
    agent = Agent(env.get_state_dims(),
                  env.action_size,
                  lr,
                  epsilon,
                  gamma,
                  decay_lr,
                  decay_epsilon,
                  supervisor=True)

    logger = Logger(print_freq)

    # Maps the interrupt index (computed below) to the Action recorded in
    # the trajectory for interrupted steps.  Built once: it never changes.
    interrupt_actions = {
        0: Action.Drop1,
        1: Action.Drop2,
        2: Action.Drop3,
        3: Action.Pick1,
        4: Action.Pick2,
        5: Action.Pick3,
    }

    for epoch in range(num_epochs):
        s, done, trajectory, score, steps = env.reset(), False, [], 0, 0
        while not done and steps < max_steps:
            a = agent.Pi(s, env)
            sprime, r, done, interrupt = env.step(a)

            # By default the trajectory records the real state and repeats
            # the agent's action in the second action slot.
            logged_state, logged_action = s, a
            if not interrupt:
                agent.update(s, a, r, sprime, done)
            else:
                # Interrupted step: the agent is deliberately NOT updated.
                # For debugging, the trajectory entry stores the interrupt
                # index in place of the state, plus the matching Action.
                logged_state = (env.ar.get_state() - 1
                                if env.ar.is_active() else 5)
                logged_action = interrupt_actions[logged_state]

            trajectory.append([logged_state, a, logged_action, r, sprime, done])
            s = sprime
            score += r
            steps += 1

        if print_traj:
            print(trajectory)
        logger.update(epoch, score, steps, env)
        # Fires on the last epoch of every print_freq-sized window.
        if epoch % print_freq == print_freq - 1:
            logger.log(epoch)
            agent.decay()

    # Context manager guarantees the file is closed even if pickling fails.
    with open(modelfile, "wb") as f:
        pickle.dump(agent, f)