import torch
from torch.distributions import Categorical


def fill_replay_separated(env, actor, critic, replay):
    # same as fill_replay below, but with separate actor and critic networks
    # instead of a single combined agent
    obs = torch.from_numpy(env.reset()).float()
    # book keeping variables
    total_reward = 0
    current_reward = 0
    num_episodes = 0
    for i in range(replay._capacity):
        s0 = obs
        action_logits = actor(obs)
        value = critic(obs)
        action = Categorical(logits=action_logits).sample()
        obs, rews, dones, infos = env.step(action.numpy())
        obs = torch.from_numpy(obs).float()
        total_reward += rews
        current_reward += rews
        # store the transition for later
        replay.remember(s0, action_logits, action, rews, value, dones)
        if dones:
            obs = torch.from_numpy(env.reset()).float()
            num_episodes += 1
            print(current_reward)
            current_reward = 0
        env.render()
    # bootstrap the return calculation with the critic's value of the final state
    final_value = critic(obs)
    replay.prep(final_value)
    return total_reward / num_episodes
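# The Memory class referenced above lives in storage.py and is not shown here.
# The sketch below is a hypothetical minimal version, written only to
# illustrate the interface the two fill functions rely on (_capacity,
# remember, and a bootstrapped-return step); the real class may differ.
class MinimalMemory:
    def __init__(self, capacity, gamma=0.99):
        self._capacity = capacity
        self._gamma = gamma
        self.transitions = []   # (s0, logits, action, reward, value, done)
        self.returns = []       # filled in by prep()

    def remember(self, s0, action_logits, action, reward, value, done):
        # store one transition per environment step
        self.transitions.append((s0, action_logits, action, reward, value, done))

    def prep(self, final_value):
        # walk the buffer backwards, accumulating discounted rewards and
        # bootstrapping from the value of the last observed state
        R = float(final_value)
        self.returns = [0.0] * len(self.transitions)
        for t in reversed(range(len(self.transitions))):
            _, _, _, reward, _, done = self.transitions[t]
            if done:
                R = 0.0  # no reward flows across episode boundaries
            R = reward + self._gamma * R
            self.returns[t] = R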
def fill_replay(env, agent, replay):
    """
    This function uses our agent and our environment to fill our replay buffer.
    """
    # initialize the env and convert the obs to torch
    obs = torch.from_numpy(env.reset()).float()
    # book keeping variables
    total_reward = 0
    current_reward = 0
    num_episodes = 0
    for i in range(replay._capacity):
        # see storage.py for details on the Memory class
        s0 = obs
        action_logits, value = agent(obs)
        """
        for cartpole, we have two actions: left and right. the output of our
        neural network is just a vector of two numbers. we need to represent
        those numbers as probabilities, so they must be on [0, 1] and sum to 1.
        now, we COULD just shift and normalize the output, but this is bad
        because we would like larger outputs to correspond to more confidence,
        and if we just shift and norm, only the relative ratio between the
        outputs affects the confidence. to get around this we normalize after
        an exponential transform, which is known as the softmax function:
        prob[i] = exp(outputs[i]) / sum(exp(outputs)). once we have this
        vector of probabilities, we sample it to get the action. we could
        have used np.random.choice with a specific probability array, but
        the torch Categorical distribution handles all of the softmax work
        for us internally, so we use that instead.
        """
        action = Categorical(logits=action_logits).sample()
        obs, rews, dones, infos = env.step(action.numpy())
        obs = torch.from_numpy(obs).float()
        current_reward += rews
        # now that we have our transition, we store it for later
        replay.remember(s0, action_logits, action, rews, value, dones)
        if dones:
            obs = torch.from_numpy(env.reset()).float()
            num_episodes += 1
            total_reward += current_reward
            print("episode " + str(num_episodes) + ":", current_reward)
            current_reward = 0
        env.render()
    """
    the infinite horizon stochastic return is defined by a sum over an
    infinite number of time steps. we obviously cannot compute that, so we
    bootstrap the calculation using our value function to approximate the
    sum of the terms from N to infinity.
    """
    _, final_value = agent(obs)
    replay.compute_returns(final_value)
    return total_reward / num_episodes
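# The long comment inside fill_replay describes what Categorical(logits=...)
# does under the hood. The standalone demo below (not part of the training
# code) shows that sampling from Categorical with raw logits is the same as
# applying softmax by hand and then sampling the resulting probability vector.
if __name__ == "__main__":
    logits = torch.tensor([1.5, -0.5])     # raw network outputs for 2 actions
    probs = torch.softmax(logits, dim=-1)  # exp(x_i) / sum_j exp(x_j)

    # the two constructions assign identical probabilities to each action
    assert torch.allclose(Categorical(logits=logits).probs, probs)

    # sampling either way gives an action index in {0, 1}
    action = Categorical(probs=probs).sample()
    print("probs:", probs, "sampled action:", action.item())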