# Start training
logger.info('Train agent on {} env'.format(env.unwrapped.spec.id))
logger.info('Doing {} timesteps'.format(args.timesteps))
logger.info('Start at timestep {0} with t = {1}'.format(timestep, t))
logger.info('Start training at {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.localtime())))

while timestep <= args.timesteps:
    ou_noise.reset()
    epoch_return = 0

    state = torch.Tensor([env.reset()]).to(device)
    while True:
        if args.render_train:
            env.render()

        # Select an action with exploration noise and step the environment
        action = agent.calc_action(state, ou_noise)
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        timestep += 1
        epoch_return += reward

        mask = torch.Tensor([done]).to(device)
        reward = torch.Tensor([reward]).to(device)
        next_state = torch.Tensor([next_state]).to(device)

        # Store the transition in the replay buffer
        memory.push(state, action, mask, next_state, reward)

        state = next_state

        epoch_value_loss = 0
        epoch_policy_loss = 0
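        # What typically follows here is the learning step (a sketch, not part
        # of the snippet above): once the replay buffer holds more than one
        # batch of transitions, sample a batch and update actor and critic.
        # `Transition`, `memory.sample` and `agent.update_params` are assumed
        # names from a typical DDPG training script and may differ.
        if len(memory) > args.batch_size:
            transitions = memory.sample(args.batch_size)
            batch = Transition(*zip(*transitions))  # transpose the batch of transitions
            value_loss, policy_loss = agent.update_params(batch)
            epoch_value_loss += value_loss
            epoch_policy_loss += policy_loss

        if done:
            break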
# Evaluate the trained agent
agent.load_checkpoint()  # Load the agent's parameters
agent.set_eval()

returns = list()
for _ in range(args.episodes):
    step = 0
    state = torch.Tensor([env.reset()]).to(device)
    episode_return = 0
    while True:
        if args.render:
            env.render()

        # Act without exploration noise during evaluation
        action = agent.calc_action(state, action_noise=None)
        q_value = agent.critic(state, action)  # critic's Q-value estimate for this state-action pair
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        episode_return += reward

        state = torch.Tensor([next_state]).to(device)
        step += 1

        if done:
            logger.info(episode_return)
            returns.append(episode_return)
            break

mean = np.mean(returns)
variance = np.var(returns)
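# Hypothetical reporting step (an assumption, not shown above): log the
# evaluation statistics and release the environment.
logger.info('Mean return over {} episodes: {:.2f} (variance: {:.2f})'.format(args.episodes, mean, variance))
env.close()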