Example #1
import torch
from torch.distributions import Categorical

def fill_replay_separated(env, actor, critic, replay):
    # reset the environment and convert the observation to a torch tensor
    obs = torch.from_numpy(env.reset()).float()
    total_reward = 0
    current_reward = 0
    num_episodes = 0
    for i in range(replay._capacity):
        # query the separate actor and critic networks, then sample an action
        s0 = obs
        action_logits = actor(obs)
        value = critic(obs)
        action = Categorical(logits=action_logits).sample()
        obs, rews, dones, infos = env.step(action.numpy())
        obs = torch.from_numpy(obs).float()
        total_reward += rews
        current_reward += rews
        # store the transition for later use
        replay.remember(s0, action_logits, action, rews, value, dones)

        # episode finished: reset the environment and the per-episode tally
        if dones:
            obs = torch.from_numpy(env.reset()).float()
            num_episodes += 1
            print(current_reward)
            current_reward = 0
        env.render()

    # bootstrap the return computation with the critic's value for the final state
    final_value = critic(obs)
    replay.prep(final_value)
    return total_reward / num_episodes
Example #2
import torch
from torch.distributions import Categorical

def fill_replay(env, agent, replay):
    """
    This function uses our agent and our environment
    to fill our replay buffer.
    """

    #initialize the env and convert the observation to a torch tensor
    obs = torch.from_numpy(env.reset()).float()

    #book keeping variables
    total_reward = 0
    current_reward = 0
    num_episodes = 0

    for i in range(replay._capacity):
        #see storage.py for details on the Memory class
        s0 = obs
        action_logits, value = agent(obs)
        """
        for cartpole, we have two actions: left and right.
        the output of our neural network is just a vector of two numbers.
        we need to represent those numbers as probabilities, so they 
        must be on [0,1] and sum to 1. 
        
        now, we COULD just shift and normalize the output, but this is 
        bad because we would like larger outputs to correspond to more 
        confidence.  if we just shift and norm, then only the relative 
        ratio between the outputs will affect the confidence.  to get around 
        this we normalize after an exponential transform, which is known as the 
        Softmax function: prob[i] = exp(outputs[i]) / sum(exp(outputs)).

        once we have this vector of probabilities, we want to sample it 
        to get the action.  we could have used np.random.choice with a 
        specific probability array, but the torch categorical distribution
        handles all the softmax crap for us internally, so we use that instead
        """
        action = Categorical(logits=action_logits).sample()
        obs, rews, dones, infos = env.step(action.numpy())
        obs = torch.from_numpy(obs).float()
        current_reward += rews

        #now that we have our transition, we store it for later
        replay.remember(s0, action_logits, action, rews, value, dones)

        if dones:
            obs = torch.from_numpy(env.reset()).float()
            num_episodes += 1
            total_reward += current_reward
            print("episode " + str(num_episodes) + ":", current_reward)
            current_reward = 0

        env.render()
    """
    the infinite horizon stochastic return is defined by a sum over an 
    infinate number of time steps.  we obviously cannot do this, so we bootstrap
    the calculation using our value funciton to approximate the sum of the terms
    from N to infinity.
    """
    _, final_value = agent(obs)
    replay.compute_returns(final_value)
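    # assumption (storage.py is not shown here): compute_returns presumably walks
    # the buffer backwards and computes something like
    #   returns[t] = rews[t] + gamma * (1 - dones[t]) * returns[t + 1]
    # using final_value as the bootstrap term returns[N].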
    return total_reward / num_episodes
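A minimal sketch of how fill_replay might be called, assuming the classic Gym API these examples use (reset() returning only the observation, step() returning four values), a CartPole environment, a tiny actor-critic module that returns (action_logits, value), and a stand-in for the Memory class from storage.py; all of the class names and hyperparameters below are illustrative, not taken from the original code:

import gym
import torch.nn as nn


class Memory:
    # illustrative stand-in for the Memory class in storage.py (not shown here)
    def __init__(self, capacity, gamma=0.99):
        self._capacity = capacity
        self.gamma = gamma
        self.transitions = []
        self.returns = []

    def remember(self, s0, action_logits, action, reward, value, done):
        self.transitions.append((s0, action_logits, action, reward, value, done))

    def compute_returns(self, final_value):
        # bootstrapped discounted returns, computed backwards through the buffer
        R = final_value.item()
        self.returns = []
        for _, _, _, reward, _, done in reversed(self.transitions):
            R = reward + self.gamma * R * (1.0 - float(done))
            self.returns.insert(0, R)


class TinyAgent(nn.Module):
    # illustrative actor-critic: a shared body, a policy head, and a value head
    def __init__(self, obs_dim=4, n_actions=2, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh())
        self.policy = nn.Linear(hidden, n_actions)
        self.value = nn.Linear(hidden, 1)

    def forward(self, obs):
        h = self.body(obs)
        return self.policy(h), self.value(h)


env = gym.make("CartPole-v1")
agent = TinyAgent()
replay = Memory(capacity=512)
avg_reward = fill_replay(env, agent, replay)
print("average episode reward:", avg_reward)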