def main(lr, epsilon, gamma, decay_lr, decay_epsilon, modelfile):
    """Train an Agent on GridWorld and pickle the trained agent to disk.

    Args:
        lr: learning rate forwarded to the Agent.
        epsilon: exploration rate forwarded to the Agent.
        gamma: discount factor forwarded to the Agent.
        decay_lr: learning-rate decay setting forwarded to the Agent.
        decay_epsilon: epsilon decay setting forwarded to the Agent.
        modelfile: path the trained agent is pickled to.
    """
    seed = 42
    np.random.seed(seed)  # fixed seed so runs are reproducible

    env = GridWorld()
    agent = Agent(env.get_state_dims(), env.action_size, lr, epsilon, gamma,
                  decay_lr, decay_epsilon, supervisor=True)

    print_freq = 10000
    logger = Logger(print_freq)

    # Supervisor-state index -> corrective action. Hoisted out of the step
    # loop so the dict is built once instead of on every interrupting step.
    action_dict = {
        0: Action.Drop1,
        1: Action.Drop2,
        2: Action.Drop3,
        3: Action.Pick1,
        4: Action.Pick2,
        5: Action.Pick3,
    }

    for epochs in range(500000):
        s, done, trajectory, score, steps = env.reset(), False, [], 0, 0
        # Episodes are capped at 60 steps (was 100 per the old comment).
        while not done and steps < 60:
            a = agent.Pi(s, env)
            sprime, r, done, interrupt = env.step(a)
            a2 = a
            if not interrupt:
                agent.update(s, a, r, sprime, done)
            else:
                # Interrupt service routine will handle this transition:
                # do not update the Q table's values here.
                # NOTE(review): `s` is deliberately overwritten with the
                # supervisor state index ("for easy debugging purposes" in the
                # original), so this trajectory entry records that index
                # rather than the environment state — confirm this is wanted.
                s = env.ar.get_state() - 1 if env.ar.is_active() else 5
                a2 = action_dict[s]
            trajectory.append([s, a, a2, r, sprime, done])
            s = sprime
            score += r
            steps += 1

        if print_traj:  # NOTE(review): `print_traj` is a module-level flag defined elsewhere
            print(trajectory)

        logger.update(epochs, score, steps, env)
        if epochs % print_freq == print_freq - 1:
            logger.log(epochs)
            # print(generate_best_trajectory(env, agent))
        agent.decay()

    # Context manager guarantees the file is closed even if pickling raises
    # (the original open/close pair leaked the handle on error).
    with open(modelfile, "wb") as f:
        pickle.dump(agent, f)