    state = next_state
    episode_reward += reward

    # Train agent after collecting sufficient data
    if t >= int(args.start_timesteps):
        policy.train(_replay_buffer, args.batch_size)

    if done:
        print(
            f"Total T: {t+1} Episode Num: {episode_num+1} "
            f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
        )
        logger.store(EpRet=episode_reward, EpLen=episode_timesteps)
        # Reset environment
        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    if (t + 1) % args.eval_freq == 0:
        test_agent(policy, eval_env, args.seed, logger)
        if args.save_model:
            policy.save(f"./models/{file_name}")
        logger.log_tabular("EpRet", with_min_and_max=True)
        logger.log_tabular("TestEpRet", with_min_and_max=True)
        logger.log_tabular("EpLen", average_only=True)
        logger.log_tabular("TestEpLen", average_only=True)
        logger.log_tabular("TotalEnvInteracts", t + 1)
        logger.log_tabular("Time", time.time() - start_time)
        logger.dump_tabular()
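# The loop above leans on `_replay_buffer` being an off-policy transition
# store. For reference only, here is a minimal sketch of a uniform replay
# buffer in that role; the class name and the add/sample signatures below are
# illustrative assumptions, not this repo's actual API.
import numpy as np


class SimpleReplayBuffer:
    """Fixed-size ring buffer storing (s, a, r, s', done) transitions."""

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0   # next write index
        self.size = 0  # number of stored transitions
        self.state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((max_size, action_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.next_state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.done = np.zeros((max_size, 1), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.reward[self.ptr] = reward
        self.next_state[self.ptr] = next_state
        self.done[self.ptr] = float(done)
        self.ptr = (self.ptr + 1) % self.max_size  # overwrite oldest when full
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniform sampling over whatever has been stored so far
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.state[idx], self.action[idx], self.reward[idx],
                self.next_state[idx], self.done[idx])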
print(
    f"Warning: trajectory cut off by local epoch at {episode_timesteps} steps.",
    flush=True)
if timeout_done or epoch_done:
    # Trajectory was cut off, not truly terminal: bootstrap the return
    # with the current value estimate
    _, _, v = policy.select_action(state)
else:
    v = 0
_replay_buffer.finish_path(v)
if terminal:
    # Only save EpRet / EpLen if the trajectory finished
    logger.store(EpRet=episode_reward, EpLen=episode_timesteps)
# Reset environment
state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0

# Perform VPG update
policy.train(_replay_buffer)

test_agent(policy, eval_env, args.seed, logger)
if args.save_model:
    policy.save(f"./models/{file_name}")
logger.log_tabular("EpRet", with_min_and_max=True)
logger.log_tabular("TestEpRet", with_min_and_max=True)
logger.log_tabular("EpLen", average_only=True)
logger.log_tabular("TestEpLen", average_only=True)
logger.log_tabular("TotalEnvInteracts", (epoch + 1) * args.steps_per_epoch)
logger.log_tabular("Time", time.time() - start_time)
logger.dump_tabular()
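# `_replay_buffer.finish_path(v)` above closes out a trajectory, bootstrapping
# with the value estimate `v` when the path was cut off rather than truly
# terminal. A minimal sketch of that computation (a Spinning Up-style
# GAE-lambda formulation) follows; the function names, the gamma/lam defaults,
# and the standalone layout are assumptions for illustration, not this repo's
# actual buffer internals.
import numpy as np


def discount_cumsum(x, discount):
    """y[t] = sum_{k >= t} discount^(k-t) * x[k], computed right-to-left."""
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def finish_path(rews, vals, last_val, gamma=0.99, lam=0.95):
    """Close out one trajectory, bootstrapping with `last_val` if cut off.

    rews: rewards collected along the path, shape (T,)
    vals: value estimates V(s_t) along the path, shape (T,)
    Returns (advantages, rewards-to-go) for the policy / value updates.
    """
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    # GAE-lambda advantage estimation
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)
    # Rewards-to-go, the targets for the value-function update
    ret = discount_cumsum(rews, gamma)[:-1]
    return adv, ret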
step += 1
eps_schedule.update(step)  # Anneal epsilon for the exploration strategy
states = next_states

# Once the replay memory holds 50K transitions, update the policy only
# every `update_freq` (= 4) steps, skipping steps for speed
if (step >= args.start_timesteps) and (step % args.update_freq == 0):
    policy.train(_replay_buffer, batch_size=args.batch_size)

# Print the log and save the model at a fixed interval
if t % args.eval_freq == 0:
    if args.save_model:
        policy.save(f"./models/{file_name}")
    # Elapsed wall-clock time
    time_interval = round(time.time() - start_time, 2)
    # Mean return over the recent episodes in the buffer
    mean_100_ep_return = round(
        np.mean([epinfo['r'] for epinfo in epinfobuf]), 2)
    print(
        f"Used Step: {step} | Epsilon: {round(eps_schedule.value, 3)} "
        f"| Mean ep 100 return: {mean_100_ep_return} "
        f"| Used Time: {time_interval}")
    # Store the logger
    logger.log_tabular("MeanEpReward", mean_100_ep_return)
    logger.log_tabular("TotalEnvInteracts", step)
    logger.log_tabular("Time", time_interval)
    logger.dump_tabular()

print("The training is done!")
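# `eps_schedule` above exposes `update(step)` and `.value`. A minimal linear
# annealing schedule with that same interface might look like the sketch
# below; the start/end/duration defaults are placeholders, not necessarily
# this repo's settings.
class LinearSchedule:
    """Linearly anneal epsilon from `start` to `end` over `duration` steps."""

    def __init__(self, start=1.0, end=0.01, duration=1_000_000):
        self.start = start
        self.end = end
        self.duration = duration
        self.value = start  # current epsilon, read at action-selection time

    def update(self, step):
        # Fraction of the annealing period elapsed, clipped to [0, 1]
        frac = min(max(step / self.duration, 0.0), 1.0)
        self.value = self.start + frac * (self.end - self.start)

# Example use at action-selection time (illustrative):
#   eps_schedule = LinearSchedule()
#   eps_schedule.update(step)
#   explore = np.random.rand() < eps_schedule.value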