# calculate features
features, next_hidden_state_h, next_hidden_state_c = agent.feature_net(
    map_state, depth_state, goal_state, hidden_state_h, hidden_state_c)

# calculate action and state values
dist, value, std = agent.ac_model(features)
total_std.append(std[1].cpu().numpy())
action = dist.sample()  # an (x, 1) tensor containing the sampled action for every environment

# .cpu() moves the tensor from the GPU to the CPU before it is handed to the envs
next_map_state, next_depth_state, next_goal_state, reward, done, _ = envs.step(
    action.cpu().numpy())

# count reached goals and reset the stacked frames of finished environments
for i in range(num_envs):
    if done[i]:
        number_of_episodes += 1
        if reward[i] >= 0.2:
            number_reached_goal += 1
            reach_goal.append(1)
        else:
            reach_goal.append(0)
        _, stacked_map_frames = reset_single_frame(stacked_map_frames,
                                                   next_map_state[i],
                                                   stack_size, i)
        _, stacked_depth_frames = reset_single_frame(stacked_depth_frames,
                                                     next_depth_state[i],
                                                     stack_size, i)
        _, stacked_goal_frames = reset_single_frame(stacked_goal_frames,
                                                    next_goal_state[i],
                                                    stack_size, i)
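# `reset_single_frame` is used above but not defined in this snippet. Below is a
# minimal sketch, assuming the stacked frames are held in a numpy array of shape
# (num_envs, stack_size, H, W) and that resetting env i fills its stack with copies
# of the fresh observation; the real helper may store frames differently. In the
# original code it would be defined before the loop above.
import numpy as np

def reset_single_frame(stacked_frames, frame, stack_size, env_index):
    """Refill the frame stack of one environment with its first observation."""
    frame = np.asarray(frame)
    # repeat the new frame stack_size times along the stack axis for this env
    stacked_frames[env_index] = np.repeat(frame[np.newaxis, ...], stack_size, axis=0)
    return stacked_frames[env_index], stacked_frames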
while not early_stop:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for _ in range(NB_STEP):
        state = torch.FloatTensor(state)
        value = model.predict_value(state)
        action = model.get_action(state)
        action = action.squeeze(0)
        next_state, reward, done, _ = envs.step(action)
        log_prob = model.get_log_prob(state, action)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
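# `test_env` is referenced above but not defined in this snippet. A minimal sketch,
# assuming a single (non-vectorised) evaluation environment `env` and the same model
# API as the rollout loop; the original helper may differ, and it would be defined
# before the training loop above.
def test_env():
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state_t = torch.FloatTensor(state).unsqueeze(0)
        action = model.get_action(state_t)
        state, reward, done, _ = env.step(action.squeeze(0).cpu().numpy())
        total_reward += reward
    return total_reward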
def main(): mode = "regular" num_envs = 16 def make_env(): def _thunk(): env = MiniPacman(mode, 1000) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape #a2c hyperparams: gamma = 0.99 entropy_coef = 0.01 value_loss_coef = 0.5 max_grad_norm = 0.5 num_steps = 5 num_frames = int(10e3) #rmsprop hyperparams: lr = 7e-4 eps = 1e-5 alpha = 0.99 #Init a2c and rmsprop actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n) optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha) #if USE_CUDA: # actor_critic = actor_critic.cuda() rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape) #rollout.cuda() all_rewards = [] all_losses = [] state = envs.reset() state = torch.FloatTensor(np.float32(state)) rollout.states[0].copy_(state) episode_rewards = torch.zeros(num_envs, 1) final_rewards = torch.zeros(num_envs, 1) for i_update in tqdm(range(num_frames)): for step in range(num_steps): action = actor_critic.act(autograd.Variable(state)) next_state, reward, done, _ = envs.step( action.squeeze(1).cpu().data.numpy()) reward = torch.FloatTensor(reward).unsqueeze(1) episode_rewards += reward masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks #if USE_CUDA: # masks = masks.cuda() state = torch.FloatTensor(np.float32(next_state)) rollout.insert(step, state, action.data, reward, masks) _, next_value = actor_critic( autograd.Variable(rollout.states[-1], volatile=True)) next_value = next_value.data returns = rollout.compute_returns(next_value, gamma) logit, action_log_probs, values, entropy = actor_critic.evaluate_actions( autograd.Variable(rollout.states[:-1]).view(-1, *state_shape), autograd.Variable(rollout.actions).view(-1, 1)) values = values.view(num_steps, num_envs, 1) action_log_probs = action_log_probs.view(num_steps, num_envs, 1) advantages = autograd.Variable(returns) - values value_loss = advantages.pow(2).mean() action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef loss.backward() nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm) optimizer.step() if i_update % num_frames == 0: all_rewards.append(final_rewards.mean()) all_losses.append(loss.item()) #clear_output(True) plt.figure(figsize=(20, 5)) plt.subplot(131) plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:]))) plt.plot(all_rewards) plt.subplot(132) plt.title('loss %s' % all_losses[-1]) plt.plot(all_losses) plt.show() rollout.after_update() torch.save(actor_critic.state_dict(), "actor_critic_" + mode) import time def displayImage(image, step, reward): #clear_output(True) s = "step: " + str(step) + " reward: " + str(reward) plt.figure(figsize=(10, 3)) plt.title(s) plt.imshow(image) plt.show() time.sleep(0.1) env = MiniPacman(mode, 1000) done = False state = env.reset() total_reward = 0 step = 1 while not done: current_state = torch.FloatTensor(state).unsqueeze(0) #if USE_CUDA: # current_state = current_state.cuda() action = actor_critic.act(autograd.Variable(current_state)) next_state, reward, done, _ = env.step(action.data[0, 0]) total_reward += reward state = next_state image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy() displayImage(image, step, total_reward) step += 1
if __name__ == '__main__': envs = [make_env for i in range(N_ENVS)] envs = SubprocVecEnv(envs) obs = envs.reset() print("OBSERVATION ", obs[0]) obs = obs.reshape(-1) obs_shape = obs.shape envs = VecNormalize(envs, obs_shape, ob=False, gamma=0.99) n_steps = 100 bar = ProgBar(n_steps, bar_char='█') for i_episode in range(2): ## reinitialize the environment observation = envs.reset() ## the simulation for n_steps timesteps for t in range(n_steps): ## value, is_rate, is_producer, is_open actions_inje = [[randint(410, 430), False, False, True] for _ in range(8)] actions_prod = [[randint(220, 250), False, True, True] for _ in range(4)] ## Advance the simulation forward observation, reward, done, observation_full = \ envs.step([(actions_inje + actions_prod) for _ in range(N_ENVS)]) # print (reward) bar.update() if done.any(): print("Episode finished after {} timesteps".format(t + 1)) break envs.close()
class RolloutCollector: def __init__(self, num_env_workers, make_env_func, agent, batch_size, rollout_length, num_recurrence_steps, state_shape, action_shape, stats): ''' -one agent is assigned to a collector. -a collector runs a bunch of envs in paralel to feed to that agent -you could run a bunch of collectors simultaniously, |- and then use weight mixing on the agents seperately ''' self.num_env_workers = num_env_workers self.envs = SubprocVecEnv( [make_env_func() for i in range(num_env_workers)]) self.agent = agent self.batch_size = batch_size self.rollout_length = rollout_length self.num_recurrence_steps = num_recurrence_steps self.state_shape = state_shape self.action_shape = action_shape self.stats = stats self.buffer_full = False self.GAE_calculated = False self.gamma = 0.8 self.tau = 0.8 self.rollout_indices = np.zeros(batch_size) self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1 self.states = torch.zeros( (batch_size, self.buffer_width + 1, *state_shape), dtype=torch.float32).to(self.agent.device) self.actions = torch.zeros( (batch_size, self.buffer_width + 1, *action_shape), dtype=torch.float32).to(self.agent.device) self.log_probs = torch.zeros( (batch_size, self.buffer_width + 1, *action_shape), dtype=torch.float32).to(self.agent.device) self.values = torch.zeros((batch_size, self.buffer_width + 1, 1), dtype=torch.float32).to(self.agent.device) self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1), dtype=torch.float32).to(self.agent.device) self.done_masks = torch.zeros( (batch_size, self.buffer_width + 1, 1), dtype=torch.float32).to(self.agent.device) self.advantages = torch.zeros( (batch_size, self.buffer_width + 1, 1), dtype=torch.float32).to(self.agent.device) self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1), dtype=torch.float32).to(self.agent.device) self.state = self.envs.reset() self.hidden_state = torch.zeros( (1, self.num_env_workers, self.agent.hidden_state_size)).to(self.agent.device) self.cell_state = torch.zeros( (1, self.num_env_workers, self.agent.hidden_state_size)).to(self.agent.device) def collect_samples(self): if self.buffer_full: raise Exception( "tried to collect more samples when buffer already full") num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers) with torch.no_grad(): self.hidden_state = torch.zeros( (1, self.num_env_workers, self.agent.hidden_state_size)).to(self.agent.device) self.cell_state = torch.zeros( (1, self.num_env_workers, self.agent.hidden_state_size)).to(self.agent.device) for collection_run in range(num_runs_to_full): start_index = collection_run * self.num_env_workers end_index_exclusive = min(start_index + self.num_env_workers, self.batch_size) run_indices = torch.arange(start_index, end_index_exclusive, dtype=torch.long) worker_indices = run_indices % self.num_env_workers for rollout_idx in range(self.buffer_width + 1): state = torch.Tensor(self.state).float().to( self.agent.device) # for recurrences lstm_input = state.view(-1, 1, *self.state_shape) output, (hidden, cell) = self.agent.lstm( lstm_input, (self.hidden_state, self.cell_state)) output = output.reshape(self.num_env_workers, self.agent.hidden_state_size) policy_dist = self.agent.actor(output) action = policy_dist.sample() action = action.clamp(-1, 1) # depends on env state_, reward, done, info = self.envs.step( action.cpu().numpy()) value = self.agent.critic(output) log_prob = policy_dist.log_prob(action) reward = torch.Tensor(reward).float().unsqueeze(1).to( self.agent.device) done_masks = 
torch.Tensor(1.0 - done).float().unsqueeze(1).to( self.agent.device) self.states[run_indices, rollout_idx] = state[worker_indices] self.actions[run_indices, rollout_idx] = action[worker_indices] self.log_probs[run_indices, rollout_idx] = log_prob[worker_indices] self.values[run_indices, rollout_idx] = value[worker_indices] self.rewards[run_indices, rollout_idx] = reward[worker_indices] self.done_masks[run_indices, rollout_idx] = done_masks[worker_indices] self.hidden_state[0, worker_indices] *= self.done_masks[ run_indices, rollout_idx].expand(-1, self.agent.hidden_state_size) self.cell_state[0, worker_indices] *= self.done_masks[ run_indices, rollout_idx].expand(-1, self.agent.hidden_state_size) self.state = state_ self.buffer_full = True self.stats.update_collection_stats( num_samples_collected_inc=self.batch_size * self.rollout_length) def compute_gae(self): if not self.buffer_full: raise Exception( "buffer is not full of new samples yet (so not ready for GAE)") gae = torch.zeros((self.batch_size, 1)).to(self.agent.device) for i in reversed(range(self.buffer_width)): delta = self.rewards[:, i] + self.gamma * self.values[:, i + 1] * self.done_masks[:, i] - self.values[:, i] gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae self.returns[:, i] = gae + self.values[:, i] self.advantages[:, i] = gae self.GAE_calculated = True def get_leading_states(self, index): indices_with_leading_states = torch.arange( self.num_recurrence_steps) - self.num_recurrence_steps + 1 + index leading_states = self.states[:, indices_with_leading_states] # some of the leading states might be from previous episodes # # in which case, we dont want to consider those at all. leading_state_indices = indices_with_leading_states[:-1] leading_dones = 1 - self.done_masks[:, leading_state_indices] last_leading_dones = leading_dones.nonzero()[:, :2] for batch_index, last_done in last_leading_dones: previous_episode_indices = torch.arange(last_done + 1) leading_states[batch_index, previous_episode_indices] = 0 return leading_states def random_batch_iter(self): if not self.buffer_full and not self.GAE_calculated: raise Exception( "buffer is not ready for sampling yet. (not full/no GAE)") '''-theres no way all the workers are aligned, especially after an episode or so. so we might just be able to use a vertical index''' batch_indices = torch.randperm(self.rollout_length) # recurrence stuff if self.num_recurrence_steps > 0: batch_indices = torch.randperm( self.rollout_length) + self.num_recurrence_steps - 1 self.hidden_state = torch.zeros( (1, self.batch_size, self.agent.hidden_state_size)).to(self.agent.device) self.cell_state = torch.zeros( (1, self.batch_size, self.agent.hidden_state_size)).to(self.agent.device) for i in range(self.rollout_length): index = batch_indices[i] leading_states = self.get_leading_states(index) output, (hidden, cell) = self.agent.lstm( leading_states, (self.hidden_state, self.cell_state)) state = output[:, -1, :] action = self.actions[:, index] log_prob = self.log_probs[:, index] advantage = self.advantages[:, index] return_ = self.returns[:, index] yield state, action, log_prob, advantage, return_ def reset(self): self.buffer_full = False self.GAE_calculated = False
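# A sketch of how this recurrent RolloutCollector might be driven from a PPO-style
# training loop, assuming an `agent` that exposes the attributes used above (device,
# lstm, actor, critic, hidden_state_size) and some `ppo_update` callable; the names
# outside the class are illustrative only.
def training_loop(collector, agent, ppo_update, num_iterations):
    for _ in range(num_iterations):
        collector.collect_samples()   # fill the rollout buffer from the parallel envs
        collector.compute_gae()       # compute advantages and returns
        for state, action, old_log_prob, advantage, return_ in collector.random_batch_iter():
            ppo_update(agent, state, action, old_log_prob, advantage, return_)
        collector.reset()             # mark the buffer stale for the next iteration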
print("=> loaded checkpoint '{}' (global_t {})" .format(best_path, checkpoint['global_t'])) else: global_t=0 print("=> no checkpoint found at '{}'".format(best_path)) count=0 running_corrects=0 running_corrects1 =0 writer=SummaryWriter() for i_update in range(num_frames): optimizer.zero_grad() for step in range(num_steps): action= actor_critic.act(Variable(pstate1),Variable(pstate2),Variable(pstate3),Variable(state),Variable(gstate),Variable(pre_action)) #print("act:",action) pim1,pim2,pim3,next_state,g_state, reward, done,gt_action,gt_state, shortest,pre_action = envs.step(action.cpu().data.numpy()) for i in range(num_envs): my_path[i][-1]+=1 if reward[i]>5: episode_success[i][-1]+=1 if done[i]: shortest_path[i].append(shortest[i]) episode_success[i].append(0) my_path[i].append(1) #===================================== reward = torch.FloatTensor(reward).unsqueeze(1) episode_rewards += reward masks = torch.FloatTensor(1-np.array(done)).unsqueeze(1) count+=(1-masks).sum() #final_rewards *= masks final_rewards += (1-masks) * episode_rewards
action_bound=None, rollout_steps=ROLLOUT_STEPS, memory_capacity=4096, summary_writer=None, mode=0) states = envs.reset() states = [utils.combine_env_states(*state) for state in states] padded_states = np.array( [utils.pad_data(state, MAX_NUM_NODES, [1]) for state in states]) for step in range(PPO_STEPS): if (step + 1) % 10 == 0: print('Step', step) padded_actions = swarmnet_agent.act_batch( [padded_states, padded_edge_types], masks)[0] next_states, rewards, dones, infos = envs.step([ padded_action[-num_boid:, :] for padded_action, num_boid in zip(padded_actions, env_num_boids) ]) padded_states = np.array([ utils.pad_data(utils.combine_env_states(*state), MAX_NUM_NODES, [1]) for state in next_states ]) end_t = time.time() print("Time spent", end_t - start_t)
class PPO(object): """Main PPO class""" def __init__(self, args): """"Constructor which allows the PPO class to initialize the attributes of the class""" self.args = args self.random_seed() # Check if GPU is available via CUDA driver self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") # Initialize the actor critic class self.actor_critic = ActorCritic( self.args.nb_states, self.args.nb_actions, self.args.hidden_layer_size).to(self.device) # Define the optimizer used for the optimization of the surrogate loss self.optimizer = self.args.optimizer(self.actor_critic.parameters(), self.args.lr) # For training multiple instances of the env are needed (Shoulder model) self.envs = [self.make_env() for i in range(self.args.num_envs)] self.envs = SubprocVecEnv(self.envs) # To validate the intermediate learning process one test env is needed self.env_test = self.args.env self.env_test.seed(self.args.seed) self.env_test.set_scaling(self.args.output_scaling) # Lists for Tensorboard to visualize learning process during learning self.test_rewards = [] self.loss = [] self.lr = [] self.actor_grad_weight = [] self.action_bang_bang = [] self.lr.append(self.args.lr) # Dump bin files if self.args.play is False: self.output_path = "trained_models" + '/PPO_{}'.format( datetime.now().strftime('%Y%b%d_%H%M%S')) + "/" os.mkdir(self.output_path) self.writer = SummaryWriter(self.output_path) #self.delta = (self.args.lr-self.args.lr_end)/1e6 def train(self): """Main training function""" frame_idx = 0 state = self.envs.reset() mean_100_reward = -np.inf self.info() while frame_idx < self.args.max_frames: log_probs = [] values = [] states = [] actions = [] rewards = [] masks = [] entropy = self.args.entropy for _ in range(self.args.nb_steps): state = torch.FloatTensor(state).to(self.device) dist, value = self.actor_critic(state) action = dist.sample() # Make sure action is loaded to CPU (not GPU) next_state, reward, done, _ = self.envs.step( action.cpu().numpy()) log_prob = dist.log_prob(action) entropy += dist.entropy().mean() log_probs.append(log_prob) values.append(value) rewards.append( torch.FloatTensor(reward).unsqueeze(1).to(self.device)) masks.append( torch.FloatTensor(1 - done).unsqueeze(1).to(self.device)) states.append(state) actions.append(action) state = next_state frame_idx += 1 #self.scheduler() # Evaluate training process and write data to tensorboard if frame_idx % 1000 == 0: test_reward = np.mean( [self.test_env(self.args.vis) for _ in range(10)]) self.test_rewards.append(test_reward) if self.args.play is False: print("Mean reward: ", np.round(np.mean(self.test_rewards[-101:-1]), 0)) if mean_100_reward < np.round( np.mean(self.test_rewards[-101:-1]), 0): mean_100_reward = np.round( np.mean(self.test_rewards[-101:-1]), 0) self.save_network(mean_100_reward) if len(self.test_rewards) >= 10: self.writer.add_scalar( 'data/reward', np.mean(self.test_rewards[-11:-1]), frame_idx * self.args.num_envs) self.writer.add_scalar( 'data/ppo_loss', np.mean(self.loss[-11:-1]), frame_idx * self.args.num_envs) self.writer.add_scalar( 'data/nb_actions_outside_range', np.mean(self.action_bang_bang[-11:-1]), frame_idx * self.args.num_envs) # if test_reward > threshold_reward: early_stop = True next_state = torch.FloatTensor(next_state).to(self.device) _, next_value = self.actor_critic(next_state) returns = self.calc_gae(next_value, rewards, masks, values, self.args.gamma, self.args.tau) # detach() to take it away from the graph i.e. 
this operations are ignored for gradient calculations returns = torch.cat(returns).detach() log_probs = torch.cat(log_probs).detach() values = torch.cat(values).detach() states = torch.cat(states) actions = torch.cat(actions) advantage = returns - values self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size, states, actions, log_probs, returns, advantage, self.args.clip) def make_env(self): # Private trunk function for calling the SubprocVecEnv class def _trunk(): env = self.args.env # in this simple case the class TestEnv() is called (see openAI for more envs) env.seed(self.args.seed) env.set_scaling(self.args.output_scaling) return env return _trunk def test_env(self, vis=False): state = self.env_test.reset() if vis: self.env_test.render() done = False total_reward = 0 action_bang_bang = 0 step = 0 while not done: step += 1 state = torch.FloatTensor(state).unsqueeze(0).to(self.device) dist, _ = self.actor_critic(state) action = dist.sample().cpu().numpy()[0] force = action * self.args.output_scaling next_state, reward, done, _ = self.env_test.step(action) if force > 0.5 or force < -0.5: action_bang_bang += 1 state = next_state if vis: self.env_test.render() total_reward += reward self.action_bang_bang.append(action_bang_bang / step) return total_reward # Plain functions except that one can call them from an instance or the class @staticmethod def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95): values = values + [next_value] gae = 0 returns = [] for step in reversed(range(len(rewards))): delta = rewards[step] + gamma * values[ step + 1] * masks[step] - values[step] gae = delta + gamma * tau * masks[step] * gae returns.insert(0, gae + values[step]) return returns @staticmethod def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage): batch_size = states.size(0) for _ in range(batch_size // mini_batch_size): rand_ids = np.random.randint(0, batch_size, mini_batch_size) yield states[rand_ids, :], actions[rand_ids, :], log_probs[ rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :] def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2): for _ in range(ppo_epochs): for state, action, old_log_probs, return_, advantage in self.ppo_iter( mini_batch_size, states, actions, log_probs, returns, advantages): dist, value = self.actor_critic(state) entropy = dist.entropy().mean() new_log_probs = dist.log_prob(action) ratio = (new_log_probs - old_log_probs).exp() surr1 = ratio * advantage surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage actor_loss = -torch.min(surr1, surr2).mean() critic_loss = (return_ - value).pow(2).mean() loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy self.loss.append(loss.item()) # Important step: self.optimizer.zero_grad() #pdb.set_trace() loss.backward() if self.args.grad_norm is not None: nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.args.grad_norm) self.optimizer.step() def save_network(self, reward): network_path = self.output_path + "/network" + str(reward) pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb")) def load_network(self, path): network_new = pickle.load(open(path, "rb")) self.actor_critic.load_state_dict(network_new) def random_seed(self): torch.manual_seed(self.args.seed) random.seed(self.args.seed) np.random.seed(self.args.seed) def scheduler(self): for g in self.optimizer.param_groups: lr = g["lr"] if self.args.lr_end > lr: lr = self.args.lr_end else: lr -= self.delta 
self.lr.append(lr) g["lr"] = lr def info(self): fhandler = logging.FileHandler(filename=self.output_path + '/mylog.log', mode='a') logger.addHandler(fhandler) logger.info("--- INFO ---") logger.info("args: {}".format(self.args))
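# A sketch of how this PPO class might be launched, assuming an `args` object (e.g.
# from argparse) carrying the fields referenced above (env, seed, lr, optimizer,
# num_envs, nb_states, nb_actions, hidden_layer_size, play, ...). `parse_arguments`
# and `args.model_path` are hypothetical names; the real entry point is not shown
# in this snippet.
if __name__ == "__main__":
    args = parse_arguments()               # hypothetical helper returning the populated args namespace
    ppo = PPO(args)
    if args.play:
        ppo.load_network(args.model_path)  # hypothetical path argument for a saved policy
    else:
        ppo.train()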
sess.run(tf.global_variables_initializer())

while frame_idx < max_frames and not early_stop:
    log_probs = []
    values = []
    obs = []
    acs = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        ac = ppo.get_action(ob)
        next_ob, reward, done, _ = envs.step(ac)
        value = ppo.get_value(ob)

        values.append(value)
        rewards.append(reward[:, np.newaxis])
        masks.append((1 - done)[:, np.newaxis])
        obs.append(ob)
        acs.append(ac)

        ob = next_ob
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env(ppo) for _ in range(10)])
            test_rewards.append(test_reward)
def main():
    num_envs = 16
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    # Hyper params:
    hidden_size = 256
    lr = 3e-4
    num_steps = 5

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 20000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # each parallel worker runs num_steps steps, giving n-step sampling
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            # record the per-worker quantities for these num_steps steps
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % 100 == 0:
                test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
                plot(frame_idx, test_rewards)

        # pass the workers' results to the main network and perform the parameter update
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        # concatenate the values from the 5 steps
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        # compute the mean losses
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
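# `compute_returns` is called above but not defined in this snippet. A minimal sketch
# of the usual n-step discounted return with done-masks, bootstrapped from the
# critic's value of the last state; the original implementation may differ.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # the mask zeroes out the bootstrap across episode boundaries
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns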
def train(env, agent, flags): """""" # set random seeds (for reproducibility) torch.manual_seed(flags['seed']) torch.cuda.manual_seed_all(flags['seed']) envs = [make_env(flags['env'], flags['seed'], i) for i in range(flags['num_envs'])] envs = SubprocVecEnv(envs) # instantiate the policy and optimiser num_inputs = envs.observation_space.shape[0] num_outputs = envs.action_space.n optimizer = optim.Adam(model.parameters(), lr=learning_rate) current_step_number = 0 test_rewards = [] state = envs.reset() while current_step_number < flags['max_steps']: log_probs = [] values = [] rewards = [] masks = [] entropy = 0 for _ in range(flags['num_step_td_update']): # sample an action from the distribution action = agent.act(state) # take a step in the environment next_state, reward, done, _ = envs.step(action.cpu().numpy()) # compute the log probability log_prob = dist.log_prob(action) # compute the entropy entropy += dist.entropy().mean() # save the log probability, value and reward log_probs.append(log_prob) values.append(value) rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) # if done, save episode rewards state = next_state current_step_number += 1 if current_step_number % 1000 and flags['plot_test'] == 0: test_rewards.append(np.mean([test_env(model) for _ in range(10)])) plot(current_step_number, test_rewards) next_state = torch.FloatTensor(next_state).to(device) _, next_value = model(next_state) # calculate the discounted return of the episode returns = compute_returns(next_value, rewards, masks) log_probs = torch.cat(log_probs) returns = torch.cat(returns).detach() values = torch.cat(values) advantage = returns - values actor_loss = -(log_probs * advantage.detach()).mean() critic_loss = advantage.pow(2).mean() # loss function loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy optimizer.zero_grad() loss.backward() optimizer.step() return rewards
def dqn_algorithm(ENV_NAME, NUM_ENV=8, SEED=1, TOTAL_TIMESTEPS=100000, GAMMA=0.95, MEMORY_SIZE=1000, BATCH_SIZE=32, EXPLORATION_MAX=1.0, EXPLORATION_MIN=0.02, EXPLORATION_FRACTION=0.7, TRAINING_FREQUENCY=1000, FILE_PATH='results/', SAVE_MODEL=False, MODEL_FILE_NAME='model', LOG_FILE_NAME='log', TIME_FILE_NAME='time', PRINT_FREQ=100, N_EP_AVG=100, VERBOSE='False', MLP_LAYERS=[64, 64], MLP_ACTIVATIONS=['relu', 'relu'], LEARNING_RATE=1e-3, EPOCHS=1, GRAD_CLIP=False, DOUBLE_DQN=False, USE_TARGET_NETWORK=True, TARGET_UPDATE_FREQUENCY=5000, LOAD_WEIGHTS=False, LOAD_WEIGHTS_MODEL_PATH='results/model0.h5'): ''' DQN Algorithm execution env_name : string for a gym environment num_env : no. for environment vectorization (multiprocessing env) total_timesteps : Total number of timesteps training_frequency : frequency of training (experience replay) gamma : discount factor : buffer_size : Replay buffer size batch_size : batch size for experience replay exploration_max : maximum exploration at the begining exploration_min : minimum exploration at the end exploration_fraction : fraction of total timesteps on which the exploration decay takes place output_folder : output filepath save_model : boolean to specify whether the model is to be saved model_file_name : name of file to save the model at the end learning log_file_name : name of file to store DQN results time_file_name : name of file to store computation time print_frequency : results printing episodic frequency n_ep_avg : no. of episodes to be considered while computing average reward verbose : print episodic results mlp_layers : list of neurons in each hodden layer of the DQN network mlp_activations : list of activation functions in each hodden layer of the DQN network learning_rate : learning rate for the neural network epochs : no. of epochs in every experience replay grad_clip : boolean to specify whether to use gradient clipping in the optimizer (graclip value 10.0) double_dqn : boolean to specify whether to employ double DQN use_target_network : boolean to use target neural network in DQN target_update_frequency : timesteps frequency to do weight update from online network to target network load_weights : boolean to specify whether to use a prespecified model to initializa the weights of neural network load_weights_model_path : path for the model to use for weight initialization ''' before = time.time() num_envs = NUM_ENV env_name = ENV_NAME if TOTAL_TIMESTEPS % NUM_ENV: print('Error: total timesteps is not divisible by no. 
of envs') return def make_env(): def _thunk(): env = gym.make(env_name) env.seed(SEED) return env return _thunk envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) # for reproducibility set_seed(SEED) observation_space = envs.observation_space.shape[0] action_space = envs.action_space.n dqn_solver = DQNSolver(observation_space, action_space, MLP_LAYERS, MLP_ACTIVATIONS, LEARNING_RATE, EPOCHS, USE_TARGET_NETWORK, GRAD_CLIP, DOUBLE_DQN, LOAD_WEIGHTS, LOAD_WEIGHTS_MODEL_PATH, TOTAL_TIMESTEPS, MEMORY_SIZE, BATCH_SIZE, GAMMA, EXPLORATION_MAX, EXPLORATION_MIN, EXPLORATION_FRACTION) envs = ParallelEnvWrapper(envs) t = 0 episode_rewards = [0.0] * num_envs explore_percent, episodes, mean100_rew, steps, NN_tr_loss = [],[],[],[],[] while True: state = envs.reset() # state = np.reshape(state, [1, observation_space]) while True: t += num_envs dqn_solver.eps_timestep_decay(t) action = dqn_solver.act(state) state_next, reward, terminal, _ = envs.step(action) # print(terminal) # reward = reward if not terminal else -reward # state_next = np.reshape(state_next, [1, observation_space]) dqn_solver.remember(state, action, reward, state_next, terminal) if t % TRAINING_FREQUENCY == 0: dqn_solver.experience_replay() state = state_next episode_rewards[-num_envs:] = [ i + j for (i, j) in zip(episode_rewards[-num_envs:], reward) ] # num_episodes = len(episode_rewards) # print(terminal) if (t % PRINT_FREQ == 0): explore_percent.append(dqn_solver.exploration_rate * 100) episodes.append(len(episode_rewards)) mean100_rew.append( round(np.mean(episode_rewards[(-1 - N_EP_AVG):-1]), 1)) steps.append(t) NN_tr_loss.append(dqn_solver.loss) if VERBOSE: print('Exploration %: ' + str(int(explore_percent[-1])) + ' ,Episodes: ' + str(episodes[-1]) + ' ,Mean_reward: ' + str(mean100_rew[-1]) + ' ,timestep: ' + str(t) + ' , tr_loss: ' + str(round(NN_tr_loss[-1], 4))) if t > TOTAL_TIMESTEPS: output_table = np.stack((steps, mean100_rew, episodes, explore_percent, NN_tr_loss)) if not os.path.exists(FILE_PATH): os.makedirs(FILE_PATH) file_name = str(FILE_PATH) + LOG_FILE_NAME + '.csv' np.savetxt( file_name, np.transpose(output_table), delimiter=',', header= 'Timestep,Rewards,Episodes,Exploration %,Training Score') after = time.time() time_taken = after - before np.save(str(FILE_PATH) + TIME_FILE_NAME, time_taken) if SAVE_MODEL: file_name = str(FILE_PATH) + MODEL_FILE_NAME + '.h5' dqn_solver.model.save(file_name) return dqn_solver.model if USE_TARGET_NETWORK and t % TARGET_UPDATE_FREQUENCY == 0: dqn_solver.update_target_network() # print(t) if terminal.all(): episode_rewards += [0.0] * num_envs break
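# A sketch of invoking dqn_algorithm on a simple Gym task; every keyword shown has a
# default in the signature above, so only the environment name is strictly required.
# The environment name and settings here are illustrative only.
if __name__ == '__main__':
    trained_model = dqn_algorithm('CartPole-v0',
                                  NUM_ENV=8,
                                  TOTAL_TIMESTEPS=100000,
                                  SAVE_MODEL=True,
                                  FILE_PATH='results/',
                                  VERBOSE=True)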
n_updates = int(N_FRAMES // N_STEPS // N_ENVS) for update_i in tqdm(range(n_updates)): # Generate samples for step in range(N_STEPS): # Generate and take an action with torch.no_grad(): value, action, action_log_prob = policy.act( rollouts.observations[step]) take_actions = action.squeeze(1).cpu().numpy() if len(take_actions.shape) == 1: take_actions = np.expand_dims(take_actions, axis=-1) obs, reward, done, info = envs.step(take_actions) # convert to pytorch tensor reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done]) # update reward info for logging episode_rewards += reward final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks # Update our current observation tensor current_obs *= masks update_current_obs(obs)
while frame_idx < max_frames:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(
            np.clip(action.cpu().numpy(), 0, 1))

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1
early_stop = False

while frame_idx < max_frames and not early_stop:
    i_update += 1

    values = []
    obs = []
    acs = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        ac = ppo.get_action(ob)
        next_ob, _, done, _ = envs.step(ac)
        reward = discriminator.get_reward(np.concatenate([ob, ac], axis=1))
        value = ppo.get_value(ob)

        values.append(value)
        rewards.append(reward[:, np.newaxis])
        masks.append((1 - done)[:, np.newaxis])
        obs.append(ob)
        acs.append(ac)

        ob = next_ob
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env(ppo) for _ in range(10)])
while frame_idx < max_frames:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(np.clip(action.cpu().numpy(), 0, 1))

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1
while not early_stop:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for _ in tqdm(range(PPO_STEPS), ascii=True):
        state = torch.FloatTensor(state).permute(0, 3, 1, 2).to(device)
        dist, value = model(state)

        action = dist.sample()
        # each state, reward, done is a list of results from each parallel environment
        next_state, reward, done, _ = envs.step(
            torch.argmax(action, dim=1, keepdim=True).cpu().numpy())

        log_prob = dist.log_prob(action)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1

    next_state = torch.FloatTensor(next_state).permute(0, 3, 1, 2).to(device)
    _, next_value = model(next_state)
def main(): envs = [make_env() for i in range(num_envs)] envs = SubprocVecEnv(envs) state_shape = envs.observation_space.shape num_actions = envs.action_space.n num_rewards = len(task_rewards[mode]) full_rollout = True env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards) env_model.load_state_dict(torch.load("env_model_" + mode)) distil_policy = ActorCritic(envs.observation_space.shape, envs.action_space.n) distil_optimizer = optim.Adam(distil_policy.parameters()) imagination = ImaginationCore(1, state_shape, num_actions, num_rewards, env_model, distil_policy, full_rollout=full_rollout) actor_critic = I2A(state_shape, num_actions, num_rewards, 256, imagination, full_rollout=full_rollout) #rmsprop hyperparams: lr = 7e-4 eps = 1e-5 alpha = 0.99 optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha) #if USE_CUDA: # env_model = env_model.cuda() # distil_policy = distil_policy.cuda() # actor_critic = actor_critic.cuda() gamma = 0.99 entropy_coef = 0.01 value_loss_coef = 0.5 max_grad_norm = 0.5 num_steps = 5 num_frames = int(10e5) rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape) #rollout.cuda() all_rewards = [] all_losses = [] state = envs.reset() current_state = torch.FloatTensor(np.float32(state)) rollout.states[0].copy_(current_state) episode_rewards = torch.zeros(num_envs, 1) final_rewards = torch.zeros(num_envs, 1) for i_update in tqdm(range(num_frames)): for step in range(num_steps): #if USE_CUDA: # current_state = current_state.cuda() action = actor_critic.act(autograd.Variable(current_state)) next_state, reward, done, _ = envs.step( action.squeeze(1).cpu().data.numpy()) reward = torch.FloatTensor(reward).unsqueeze(1) episode_rewards += reward masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks #if USE_CUDA: # masks = masks.cuda() current_state = torch.FloatTensor(np.float32(next_state)) rollout.insert(step, current_state, action.data, reward, masks) _, next_value = actor_critic( autograd.Variable(rollout.states[-1], volatile=True)) next_value = next_value.data returns = rollout.compute_returns(next_value, gamma) logit, action_log_probs, values, entropy = actor_critic.evaluate_actions( autograd.Variable(rollout.states[:-1]).view(-1, *state_shape), autograd.Variable(rollout.actions).view(-1, 1)) distil_logit, _, _, _ = distil_policy.evaluate_actions( autograd.Variable(rollout.states[:-1]).view(-1, *state_shape), autograd.Variable(rollout.actions).view(-1, 1)) distil_loss = 0.01 * (F.softmax(logit).detach() * F.log_softmax(distil_logit)).sum(1).mean() values = values.view(num_steps, num_envs, 1) action_log_probs = action_log_probs.view(num_steps, num_envs, 1) advantages = autograd.Variable(returns) - values value_loss = advantages.pow(2).mean() action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef loss.backward() nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm) optimizer.step() distil_optimizer.zero_grad() distil_loss.backward() optimizer.step() if i_update % 100 == 0: all_rewards.append(final_rewards.mean()) all_losses.append(loss.item()) #clear_output(True) plt.figure(figsize=(20, 5)) plt.subplot(131) plt.title('epoch %s. 
reward: %s' % (i_update, np.mean(all_rewards[-10:]))) plt.plot(all_rewards) plt.subplot(132) plt.title('loss %s' % all_losses[-1]) plt.plot(all_losses) plt.show() rollout.after_update() torch.save(actor_critic.state_dict(), "i2a_" + mode)
log_probs = []
values = []
rewards = []
masks = []
entropy = 0

# rollout trajectory
for _ in range(num_steps):
    state = torch.FloatTensor(state).to(device)  # convert the current env states to a tensor
    dist, value = model(state)                   # run the states through the network to get an action distribution and state values
    action = dist.sample()                       # pick an action from the action distribution output by the model

    next_state, reward, done, _ = envs.step(action.cpu().numpy())  # take the actions and get the new states and rewards

    log_prob = dist.log_prob(action)     # log probability of the action
    entropy += dist.entropy().mean()     # accumulate the entropy of the distribution

    log_probs.append(log_prob)                                         # add the log prob to a list
    values.append(value)                                               # add the predicted values to a list
    rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))  # add to a list of rewards
    masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))  # mask out environments that just terminated

    state = next_state
    frame_idx += 1

    if frame_idx % 100 == 0:
        test_rewards.append(np.mean([test_env() for _ in range(10)]))
final_rewards = torch.zeros(num_envs, 1)

actor_critic.to(DEVICE)
rollout.to(DEVICE)

state = envs.reset()
state = torch.FloatTensor(np.float32(state)).to(DEVICE)
rollout.states[0].copy_(state)

for i_update in range(last_num_frames, num_frames):
    for step in range(num_steps):
        action = actor_critic.act(state)

        next_state, reward, done, _ = envs.step(
            action.squeeze(1).cpu().data.numpy())

        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        masks = masks.to(DEVICE)  # .to() is not in-place, so the result must be reassigned

        state = torch.FloatTensor(np.float32(next_state)).to(DEVICE)
        rollout.insert(step, state, action.data, reward, masks)

    with torch.no_grad():
        _, next_value = actor_critic(rollout.states[-1])
class RolloutCollector: def __init__(self, num_env_workers, make_env_func, agent, batch_size, rollout_length, state_shape, action_shape, stats): ''' -one agent is assigned to a collector. -a collector runs a bunch of envs in paralel to feed to that agent -you could run a bunch of collectors simultaniously, |- and then use weight mixing on the agents seperately ''' #self.storage_device = torch.device("cpu") self.num_env_workers = num_env_workers self.envs = SubprocVecEnv( [make_env_func() for i in range(num_env_workers)]) self.agent = agent self.batch_size = batch_size self.rollout_length = rollout_length self.state_shape = state_shape self.action_shape = action_shape self.stats = stats self.buffer_full = False self.GAE_calculated = False self.gamma = 0.8 self.tau = 0.8 self.rollout_indices = np.zeros(batch_size) self.states = torch.zeros( (batch_size, rollout_length + 1, *state_shape), dtype=torch.float32).to(self.agent.device) self.actions = torch.zeros( (batch_size, rollout_length + 1, *action_shape), dtype=torch.float32).to(self.agent.device) self.log_probs = torch.zeros( (batch_size, rollout_length + 1, *action_shape), dtype=torch.float32).to(self.agent.device) self.values = torch.zeros((batch_size, rollout_length + 1, 1), dtype=torch.float32).to(self.agent.device) self.rewards = torch.zeros((batch_size, rollout_length + 1, 1), dtype=torch.float32).to(self.agent.device) self.done_masks = torch.zeros( (batch_size, rollout_length + 1, 1), dtype=torch.float32).to(self.agent.device) self.advantages = torch.zeros( (batch_size, rollout_length + 1, 1), dtype=torch.float32).to(self.agent.device) self.returns = torch.zeros((batch_size, rollout_length + 1, 1), dtype=torch.float32).to(self.agent.device) self.state = self.envs.reset() def collect_samples(self): if self.buffer_full: raise Exception( "tried to collect more samples when buffer already full") num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers) with torch.no_grad(): for collection_run in range(num_runs_to_full): start_index = collection_run * self.num_env_workers end_index_exclusive = min(start_index + self.num_env_workers, self.batch_size) run_indices = torch.arange(start_index, end_index_exclusive, dtype=torch.long) worker_indices = run_indices % self.num_env_workers for rollout_idx in range(self.rollout_length + 1): state = torch.Tensor(self.state).float().to( self.agent.device) policy_dist = self.agent.actor(state) action = policy_dist.sample() if self.agent.tanh_action_clamping: action = torch.tanh(action) else: action = action.clamp(-1, 1) # depends on env cpu_actions = action.cpu().numpy() state_, reward, done, info = self.envs.step(cpu_actions) value = self.agent.critic(state) log_prob = policy_dist.log_prob(action) reward = torch.Tensor(reward).float().unsqueeze(1).to( self.agent.device) done_masks = torch.Tensor(1.0 - done).float().unsqueeze(1).to( self.agent.device) self.states[run_indices, rollout_idx] = state[worker_indices] self.actions[run_indices, rollout_idx] = action[worker_indices] self.log_probs[run_indices, rollout_idx] = log_prob[worker_indices] self.values[run_indices, rollout_idx] = value[worker_indices] self.rewards[run_indices, rollout_idx] = reward[worker_indices] self.done_masks[run_indices, rollout_idx] = done_masks[worker_indices] self.state = state_ self.buffer_full = True self.stats.update_collection_stats( num_samples_collected_inc=self.batch_size * self.rollout_length) def compute_gae(self): if not self.buffer_full: raise Exception( "buffer is not full of new samples yet (so not ready 
for GAE)") gae = torch.zeros((self.batch_size, 1)).to(self.agent.device) for i in reversed(range(self.rollout_length)): delta = self.rewards[:, i] + self.gamma * self.values[:, i + 1] * self.done_masks[:, i] - self.values[:, i] gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae self.returns[:, i] = gae + self.values[:, i] self.advantages[:, i] = gae self.GAE_calculated = True def random_batch_iter(self): if not self.buffer_full and not self.GAE_calculated: raise Exception( "buffer is not ready for sampling yet. (not full/no GAE)") '''-theres no way all the workers are aligned, especially after an episode or so. so we might just be able to use a vertical index''' batch_indices = torch.randperm(self.rollout_length) for i in range(self.rollout_length): index = batch_indices[i] state = self.states[:, index] action = self.actions[:, index] log_prob = self.log_probs[:, index] advantage = self.advantages[:, index] return_ = self.returns[:, index] yield state, action, log_prob, advantage, return_ def reset(self): self.buffer_full = False self.GAE_calculated = False
while not completed:
    # reset the data for the new epoch, i.e. on-policy training
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for i in range(num_steps_for_batch):
        state = torch.tensor(state, dtype=torch.float32, device=device)
        normal_dist, critic_value = model.forward(state)
        action = normal_dist.sample()

        next_state, reward, done, _ = envs.step(action.detach().numpy())

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        masks.append(1 - done)
        log_probs.append(normal_dist.log_prob(action))
        values.append(critic_value)

        state = next_state
        total_steps += 1

        # validation every 4000 steps
        if total_steps % 4000 == 0:
            test_reward, max_distance = test_model(number_of_test_runs)
class env_cover(): def __init__(self, config, dev): self.dev = dev self.num_env = config['num_envs'] self.get_img_from_render = config['get_img_from_render'] self.obs_shape = (self.num_env, ) + config['obs_space'][1:] # print(self.obs_shape) self.reward_shape = (self.num_env, ) + config['reward_space'][1:] self.gamma_shape = (self.num_env, ) + config['gamma_space'][1:] if self.num_env == 1: self.env = gym.make(config['game_name']) else: def make_env(): def _thunk(): env = gym.make(config['game_name']) return env return _thunk envs = [make_env() for i in range(self.num_env)] self.env = SubprocVecEnv(envs) # #def obs_preproc(x): # if IMG_GET_RENDER ==False: # return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0) # x = np.dot(x, np.array([[0.299, 0.587, 0.114]]).T) # x = np.reshape(x, (1,x.shape[1], x.shape[0])) # return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)/255 # def reset(self): st = self.env.reset() if self.get_img_from_render: st = self.env.render(mode='rgb_array') st = np.resize(st, self.obs_shape) / 255. return torch.FloatTensor(st).reshape(self.obs_shape).to( self.dev), torch.zeros(self.reward_shape).to( self.dev), torch.zeros(self.gamma_shape).to(self.dev) #return st, 0,False # def get_obs(self,obs): # return torch.from_numpy(obs).detach().float().view(1,config['obs_space']) def step(self, action): st, rt, dt, _ = self.env.step(action) if self.get_img_from_render: st = self.env.render(mode='rgb_array') st = np.resize(st, self.obs_shape) / 255. # print(st) st = torch.FloatTensor(st).reshape(self.obs_shape).to(self.dev) rt = torch.FloatTensor([rt]).reshape(self.reward_shape).to(self.dev) if self.num_env == 1: dt = torch.FloatTensor([dt]).reshape(self.gamma_shape).to(self.dev) else: dt = torch.FloatTensor(dt.astype(int)).reshape( self.gamma_shape).to(self.dev) return st, rt, dt def end_dummy(self): return torch.zeros(self.obs_shape).to(self.dev), torch.zeros( self.reward_shape).to(self.dev), torch.zeros(self.gamma_shape).to( self.dev) def render(self): self.env.render() def close(self): self.env.close()
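# A sketch of constructing and stepping env_cover, assuming the config keys read in
# __init__ above; the game name and shape values are illustrative (CartPole-v1 with a
# single env and the classic 4-dimensional observation).
config = {
    'game_name': 'CartPole-v1',
    'num_envs': 1,
    'get_img_from_render': False,
    'obs_space': (1, 4),     # (batch, observation dim)
    'reward_space': (1, 1),
    'gamma_space': (1, 1),
}
device = torch.device('cpu')
env = env_cover(config, device)

state, reward, done = env.reset()
state, reward, done = env.step(env.env.action_space.sample())
env.close()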
while frame_idx < max_frames:
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0

    # rollout trajectory
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 100 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            plot(frame_idx, test_rewards)
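# `plot` is called above but not shown. A minimal sketch using matplotlib, assuming it
# simply draws the running test reward against the frame index (the notebook versions
# elsewhere in this collection also clear the output between calls); in the original
# it would be defined before the loop above.
import matplotlib.pyplot as plt

def plot(frame_idx, rewards):
    plt.figure(figsize=(5, 5))
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()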
def train(args): # hyper-params: frame_idx = 0 hidden_size = args.hidden_size lr = args.lr num_steps = args.num_steps mini_batch_size = args.mini_batch_size ppo_epochs = args.ppo_epochs threshold_reward = args.threshold_reward max_frames = args.max_frames # test_rewards = [] num_envs = args.num_envs test_epochs = args.test_epochs resume_training = args.resume_training best_test_reward = 0.0 urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf") log_dir = args.log_dir now = datetime.now() log_dir = log_dir + '_' + now.strftime('%d_%m_%Y_%H_%M_%S') # Check cuda availability. use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") p.connect(p.DIRECT) writer = SummaryWriter(log_dir) # Create training log. textio = utils.IOStream(os.path.join(log_dir, 'train.log'), args=args) # textio.log_params(device, num_envs, lr, threshold_reward) utils.logFiles(log_dir) # create multiple environments. envs = [utils.make_env(p, urdf_path, args=args) for i in range(num_envs)] envs = SubprocVecEnv(envs) # pdb.set_trace() # Debug num_inputs = envs.observation_space.shape[0] num_outputs = envs.action_space.shape[0] # Create Policy/Network net = ActorCritic(num_inputs, num_outputs, hidden_size).to(device) optimizer = optim.Adam(net.parameters(), lr=lr) # If use pretrained policy. if resume_training: if os.path.exists(resume_training): checkpoint = torch.load(resume_training) frame_idx = checkpoint['frame_idx'] net.load_state_dict(checkpoint['model']) best_test_reward = checkpoint['best_test_reward'] # Initial Reset for Environment. state = envs.reset() early_stop = False # Create env for policy testing. robot = snake.Snake(p, urdf_path, args=args) env = SnakeGymEnv(robot, args=args) print_('\nTraining Begins ...', color='r', style='bold') textio.log('Training Begins ...') while frame_idx < max_frames and not early_stop: print_('\nTraining Policy!', color='r', style='bold') textio.log('\n############## Epoch: %0.5d ##############'%(int(frame_idx/20))) # Memory buffers log_probs = [] values = [] states = [] actions = [] rewards = [] masks = [] entropy = 0 total_reward = 0.0 for i in range(num_steps): print('Steps taken: {} & Epoch: {}\r'.format(i, int(frame_idx/20)), end="") state = torch.FloatTensor(state).to(device) # Find action using policy. dist, value = net(state) action = dist.sample() action = action #HACK # Take actions and find MDP. next_state, reward, done, _ = envs.step(action.cpu().numpy()) total_reward += sum(reward) textio.log('Steps: {} and Reward: {}'.format(int(frame_idx%20), total_reward)) # Calculate log(policy) log_prob = dist.log_prob(action) entropy += dist.entropy().mean() # Create Experiences log_probs.append(log_prob) values.append(value) rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) states.append(state) actions.append(action) # Update state. state = next_state frame_idx += 1 # Test Trained Policy. if frame_idx % 40 == 0: print_('\n\nEvaluate Policy!', color='bl', style='bold') test_reward = np.mean([utils.test_env(env, net, test_idx) for test_idx in range(test_epochs)]) # test_rewards.append(test_reward) # utils.plot(frame_idx, test_rewards) # not required due to tensorboardX. writer.add_scalar('test_reward', test_reward, frame_idx) print_('\nTest Reward: {}\n'.format(test_reward), color='bl', style='bold') textio.log('Test Reward: {}'.format(test_reward)) # Save various factors of training. 
snap = {'frame_idx': frame_idx, 'model': net.state_dict(), 'best_test_reward': best_test_reward, 'optimizer' : optimizer.state_dict()} if best_test_reward < test_reward: save_checkpoint(snap, os.path.join(log_dir, 'weights_bestPolicy.pth')) best_test_reward = test_reward save_checkpoint(snap, os.path.join(log_dir,'weights.pth')) if test_reward > threshold_reward: early_stop = True if frame_idx % 1000 == 0: if not os.path.exists(os.path.join(log_dir, 'models')): os.mkdir(os.path.join(log_dir, 'models')) save_checkpoint(snap, os.path.join(log_dir, 'models', 'weights_%0.5d.pth'%frame_idx)) # Calculate Returns next_state = torch.FloatTensor(next_state).to(device) _, next_value = net(next_state) returns = compute_gae(next_value, rewards, masks, values) # Concatenate experiences for multiple environments. returns = torch.cat(returns).detach() log_probs = torch.cat(log_probs).detach() values = torch.cat(values).detach() states = torch.cat(states) actions = torch.cat(actions) advantage = returns - values writer.add_scalar('reward/episode', total_reward, frame_idx) textio.log('Total Training Reward: {}'.format(total_reward)) # Update the Policy. ppo_update(net, optimizer, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage, writer, frame_idx)
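# `compute_gae` and `ppo_update` are called above but not defined in this snippet.
# A minimal sketch of compute_gae, mirroring the calc_gae helper shown earlier in this
# collection; the project's own version may use different default gamma/tau values.
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        # one-step TD error, masked at episode boundaries
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        # exponentially weighted sum of TD errors
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns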