def learn_model_fn(model, timesteps, save, period):
    # The save-score threshold only applies when training in self-play mode.
    save_thresh = 19.2 if args.selfplay else None
    return learn(model, model_name=model_name, model_path=model_path,
                 timesteps=timesteps, save=save, period=period,
                 save_thresh=save_thresh)
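
# Hedged usage sketch: `model` and the numeric arguments below are
# illustrative assumptions, not taken from the original code;
# learn_model_fn closes over model_name, model_path, and args, which
# must be defined earlier in this script.
trained_model = learn_model_fn(model, timesteps=1_000_000, save=True, period=50_000)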
                  optim_param=[alpha], loss_function=nn.MSELoss(), tau=1, device=device)

# Replay buffer and the exploration policies used during learning and play
buffer = QBuffer(memory_size, batch_size, device)
learning_policy = EpsDecay(eps_start, eps_min, eps_decay, env.action_space.n)
playing_policy = Greedy()

# Assemble the agent from the model, replay buffer, and policies
agent = Agent(model=model, buffer=buffer, learn_every=4, update_every=4,
              policy_learning=learning_policy, policy_playing=playing_policy)

# Run the training loop and collect the per-episode scores
scores = util.learn(env, goal_size, average_goal, agent, max_step, nb_epi_max,
                    gamma, learning_policy)
print(len(buffer))  # number of transitions stored in the replay buffer

# Plot the score obtained in each training episode
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# Watch the trained agent play a few episodes
for i in range(10):
    state = env.reset()
    score = 0
    env.render()
    for j in range(max_step):
        action = agent.act(state)
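        # Hedged completion of the truncated loop above (assumes the classic
        # Gym step API where env.step returns (state, reward, done, info));
        # none of these lines are from the original code.
        state, reward, done, _ = env.step(action)
        score += reward
        env.render()
        if done:
            break
    print(f"Episode {i}: score = {score}")
env.close()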