            n_actions=n_actions,
            memory_size=memory_size,
            batch_size=batch_size,
            gamma=gamma,
            alpha=alpha,
            lr=lr,
            action_bounds=action_bounds,
            reward_scale=reward_scale)

if TRAIN:
    for episode in range(1, MAX_EPISODES + 1):
        state = env.reset()
        episode_reward = 0
        done = 0
        start_time = time.time()
        while not done:
            # Interact with the environment, store the transition, and update the networks.
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, reward, done, action, next_state)
            value_loss, q_loss, policy_loss = agent.train()
            if episode % 250 == 0:
                agent.save_weights()
            episode_reward += reward
            state = next_state
        log(episode, start_time, episode_reward, value_loss, q_loss, policy_loss, len(agent.memory))
else:
    # Evaluation mode: run the trained policy instead of training.
    player = Play(env, agent)
    player.evaluate()
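# The `log` helper called at the end of each episode is not defined in this snippet.
# Below is a minimal sketch consistent with the call site above; the printed fields and
# their formatting are assumptions, not the original implementation.
import time

def log(episode, start_time, episode_reward, value_loss, q_loss, policy_loss, memory_length):
    # Print a one-line training summary per episode.
    print(f"Episode: {episode} | "
          f"Reward: {episode_reward:.1f} | "
          f"Value loss: {value_loss:.3f} | "
          f"Q loss: {q_loss:.3f} | "
          f"Policy loss: {policy_loss:.3f} | "
          f"Memory size: {memory_length} | "
          f"Duration: {time.time() - start_time:.1f}s")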
        brain.schedule_lr()
        brain.schedule_clip_range(iteration)
        episode_reward = evaluate_policy(env_name, brain, state_shape)

        # Smooth the episode reward with an exponential moving average.
        if iteration == 1:
            running_reward = episode_reward
        else:
            running_reward = 0.99 * running_reward + 0.01 * episode_reward

        if iteration % log_period == 0:
            print(f"Iter: {iteration}| "
                  f"Ep_reward: {episode_reward:.3f}| "
                  f"Running_reward: {running_reward:.3f}| "
                  f"Total_loss: {total_loss:.3f}| "
                  f"Explained variance: {ev:.3f}| "
                  f"Entropy: {entropy:.3f}| "
                  f"Iter_duration: {time.time() - start_time:.3f}| "
                  f"Lr: {brain.scheduler.get_last_lr()}| "
                  f"Clip_range: {brain.epsilon:.3f}")
            brain.save_params(iteration, running_reward)

        # Log training statistics to TensorBoard.
        with SummaryWriter(env_name + "/logs") as writer:
            writer.add_scalar("running reward", running_reward, iteration)
            writer.add_scalar("episode reward", episode_reward, iteration)
            writer.add_scalar("explained variance", ev, iteration)
            writer.add_scalar("loss", total_loss, iteration)
            writer.add_scalar("entropy", entropy, iteration)
else:
    # Evaluation mode: run the trained policy instead of training.
    play = Play(env_name, brain)
    play.evaluate()
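# `evaluate_policy` is used above to measure the current policy's return but is not shown
# here. Below is a minimal sketch assuming a classic Gym environment (reset() returning a
# state, step() returning a 4-tuple) and that `brain` exposes a `choose_action(state)`
# method; both interfaces are assumptions, not confirmed by the snippet.
import gym

def evaluate_policy(env_name, brain, state_shape):
    env = gym.make(env_name)
    state = env.reset()
    episode_reward = 0
    done = False
    while not done:
        action = brain.choose_action(state)  # assumed action-selection API
        state, reward, done, _ = env.step(action)
        episode_reward += reward
    env.close()
    # state_shape is accepted only to match the call site; a full implementation might
    # use it for observation preprocessing (assumption).
    return episode_reward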