Example #1
    # Start training
    logger.info('Train agent on {} env'.format(env.unwrapped.spec.id))
    logger.info('Doing {} timesteps'.format(args.timesteps))
    logger.info('Start at timestep {0} with t = {1}'.format(timestep, t))
    logger.info('Start training at {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())))

    while timestep <= args.timesteps:
        ou_noise.reset()
        epoch_return = 0

        state = torch.Tensor([env.reset()]).to(device)
        while True:
            if args.render_train:
                env.render()

            # Select an action, perturbed by the OU process for exploration
            action = agent.calc_action(state, ou_noise)
            next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
            timestep += 1
            epoch_return += reward

            mask = torch.Tensor([done]).to(device)
            reward = torch.Tensor([reward]).to(device)
            next_state = torch.Tensor([next_state]).to(device)

            # Store the transition in the replay buffer
            memory.push(state, action, mask, next_state, reward)

            state = next_state

            # Accumulators for the critic and actor losses of the parameter update step
            epoch_value_loss = 0
            epoch_policy_loss = 0
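The loop above resets `ou_noise` at the start of each episode and hands it to `agent.calc_action` to perturb actions for exploration. A minimal sketch of an Ornstein-Uhlenbeck action-noise class that would fit this usage follows; the class name, the `noise()` method, and the default parameters are assumptions, not taken from the source.

    # Hypothetical OU noise class; name, API, and defaults are assumptions.
    import numpy as np

    class OrnsteinUhlenbeckActionNoise:
        def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
            self.action_dim = action_dim
            self.mu = mu        # long-run mean the process reverts to
            self.theta = theta  # mean-reversion rate
            self.sigma = sigma  # scale of the Gaussian increments
            self.reset()

        def reset(self):
            # Restart the process at its mean before each episode.
            self.state = np.ones(self.action_dim) * self.mu

        def noise(self):
            # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1):
            # temporally correlated noise, well suited to continuous control.
            dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
            self.state = self.state + dx
            return self.state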
Example #2
    # Load the trained agent's parameters and switch the networks to evaluation mode
    agent.load_checkpoint()
    agent.set_eval()

    returns = list()

    for _ in range(args.episodes):
        step = 0
        state = torch.Tensor([env.reset()]).to(device)
        episode_return = 0
        while True:
            if args.render:
                env.render()

            action = agent.calc_action(state, action_noise=None)
            # Query the critic's Q-value estimate for the chosen action (e.g. for inspection)
            q_value = agent.critic(state, action)
            next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
            episode_return += reward

            state = torch.Tensor([next_state]).to(device)

            step += 1

            if done:
                logger.info(episode_return)
                returns.append(episode_return)
                break

    mean = np.mean(returns)
    variance = np.var(returns)
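With `returns` collected across all evaluation episodes, the summary statistics can then be reported; the exact log format below is an assumption, not taken from the source:

    logger.info('Evaluation over {} episodes: mean return {:.2f}, variance {:.2f}'.format(
        args.episodes, mean, variance))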