def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    for n in range(self.nsteps):
        obs = np.float32(self.obs / 255.)
        policy = self.step_actor.predict(obs)
        actions = []
        for i in range(policy.shape[0]):
            action = np.random.choice(self.action_size, 1, p=policy[i])
            actions.append(action)
        actions = np.array(actions)
        values = self.step_critic.predict(obs)
        values = values[:, 0]
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_dones = mb_dones[:, 1:]
    last_values = self.step_critic.predict(self.obs).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            # each entry of last_values is a 1-element list (predict returns shape (nenv, 1)),
            # so `rewards + value` appends the bootstrap value
            rewards = discount_with_dones(rewards + value, dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    return mb_obs, mb_actions, mb_rewards, mb_values
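Every runner in this section bootstraps its returns through a discount_with_dones helper that is never defined here. The sketch below is the standard baselines-style implementation, included only as an assumed reference; the actual helper in each original repo may differ slightly.

# Assumed reference implementation of discount_with_dones (baselines-style);
# not part of the original snippets, reproduced here for context only.
def discount_with_dones(rewards, dones, gamma):
    """Discount rewards backwards in time, resetting the running return at episode ends."""
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # a terminal step zeroes the carried return
        discounted.append(r)
    return discounted[::-1]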
def run_nsteps(self):
    b_observations, b_rewards, b_actions, b_values, b_dones = [], [], [], [], []
    b_states = self.states
    for m in range(self.nsteps):
        time.sleep(0.001)
        # action, ap, value, states, _ = self.model.step([self.obs], [self.done], self.states)
        # action = np.random.choice(np.arange(self.n_actions), p=ap[0])
        # reward = env.act(self.action_set[action])
        a_dist, value, states = self.model.step([self.obs], [self.done], self.states)
        action = np.random.choice(np.arange(self.n_actions), p=a_dist[0])
        reward = env.act(self.action_set[action])
        reward += 0.1
        # print('%s %s -- %s' % (a_dist[0], action, reward))
        if env.game_over():
            done = 1
            obs = list(env.getGameState())
            if abs(self.obs[0] - self.obs[3]) <= 24 or abs(self.obs[0] - self.obs[4]) <= 24:
                reward = -3.  # penalize less if the agent hits the tunnel edges
        else:
            done = 0
            obs = list(env.getGameState())
        if reward == 1.:
            print(reward, self.obs)
            print(reward, obs)
        self.states = states
        self.done = done
        b_dones.append(done)
        b_observations.append(np.copy(self.obs))
        b_actions.append(action)
        b_rewards.append(reward)
        b_values.append(value)
        self.obs = obs  # obs = next_obs
        if done:
            break
        self.total_return += reward
    # convert lists to numpy arrays and flatten arrays
    b_observations = np.asarray(b_observations, dtype=np.float32)
    # float32: rewards carry fractional shaping (+0.1, -3.); the original int8 cast would truncate them
    b_rewards = np.asarray(b_rewards, dtype=np.float32).flatten()
    b_actions = np.asarray(b_actions, dtype=np.uint8).flatten()
    b_values = np.asarray(b_values, dtype=np.float32).flatten()
    b_dones = np.asarray(b_dones, dtype=np.int8).flatten()
    next_value = self.model.value([self.obs], [self.done], self.states)
    if b_dones[-1] == 0:
        discounted_rewards = discount_with_dones(
            b_rewards.tolist() + next_value.tolist(),
            b_dones.tolist() + [0], self.discount)[:-1]
    else:
        discounted_rewards = discount_with_dones(b_rewards, b_dones, self.discount)
    return b_observations, discounted_rewards, b_actions, b_values, b_dones, b_states
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_raw_rewards = [], [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, entropy, states, _ = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs_all, raw_rewards, dones, _ = self.env.step(actions)
        obs = [obs_index['image'] for obs_index in obs_all]
        obs = np.asarray(obs)
        # rewards = raw_rewards + entropy * self.entropy_coef
        rewards = raw_rewards
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
        mb_raw_rewards.append(raw_rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_raw_rewards = np.asarray(mb_raw_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_raw_rewards = mb_raw_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_raw_rewards
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    if self.from_pixels:
        # shape (nsteps, nenv, 100, 150, nstack) before the reshape below
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    else:
        mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def collect_rollout(env, agent, initial_obs, max_t, gamma=0.99):
    roll_obs, roll_rew, roll_act, roll_vals, roll_dones = [], [], [], [], []
    obs = initial_obs
    dones = [False for _ in range(env.num_env)]
    for n in range(max_t):
        actions, values = agent(obs)
        roll_obs.append(obs)
        roll_act.append(actions)
        roll_vals.append(values)
        roll_dones.append(dones)
        obs, rewards, dones, _ = env.step(actions)
        roll_rew.append(rewards)
    roll_obs = np.asarray(roll_obs, dtype=np.float32).swapaxes(1, 0).reshape(
        (env.num_env * max_t,) + env.obs_shape)
    roll_rew = np.asarray(roll_rew, dtype=np.float32).swapaxes(1, 0)
    roll_act = np.asarray(roll_act, dtype=np.int32).swapaxes(1, 0)
    roll_vals = np.asarray(roll_vals, dtype=np.float32).swapaxes(1, 0)
    # convert dones to (num_env, max_t) as well, so the per-env zip below lines up
    roll_dones = np.asarray(roll_dones, dtype=bool).swapaxes(1, 0)
    _, last_values = agent(obs)
    for n, (rewards, ep_dones, value) in enumerate(zip(roll_rew, roll_dones, last_values)):
        rewards = rewards.tolist()
        ep_dones = ep_dones.tolist()
        if ep_dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], ep_dones + [0], gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, ep_dones, gamma)
        roll_rew[n] = rewards
    roll_rew = roll_rew.flatten()
    roll_act = roll_act.flatten()
    roll_vals = roll_vals.flatten()
    return obs, roll_obs, roll_rew, roll_act, roll_vals
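A minimal usage sketch for collect_rollout, assuming a hypothetical DummyVecEnv and random_agent defined below; they only illustrate the interfaces the function expects (env.num_env, env.obs_shape, env.step, and agent(obs) -> (actions, values)) and assume discount_with_dones from above is in scope.

# Hypothetical toy environment and agent, for illustration only.
import numpy as np

class DummyVecEnv:
    """Toy vectorized env: random observations and rewards, episodes end at random."""
    num_env = 4
    obs_shape = (8,)

    def reset(self):
        return np.random.randn(self.num_env, *self.obs_shape).astype(np.float32)

    def step(self, actions):
        obs = np.random.randn(self.num_env, *self.obs_shape).astype(np.float32)
        rewards = np.random.rand(self.num_env).astype(np.float32)
        dones = np.random.rand(self.num_env) < 0.05
        return obs, rewards, dones, {}

def random_agent(obs):
    # uniform-random actions and zero value estimates, just to exercise the rollout
    actions = np.random.randint(0, 3, size=obs.shape[0])
    values = np.zeros(obs.shape[0], dtype=np.float32)
    return actions, values

env = DummyVecEnv()
obs, roll_obs, roll_rew, roll_act, roll_vals = collect_rollout(
    env, random_agent, env.reset(), max_t=5, gamma=0.99)
print(roll_obs.shape, roll_rew.shape)  # (num_env * max_t, 8) and (num_env * max_t,)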
def learn(self):
    if not self.args.no_sil:
        sil_model = sil_module(self.net, self.args, self.optimizer)
    num_updates = self.args.total_frames // (self.args.num_processes * self.args.nsteps)
    # track rewards for logging
    episode_rewards = torch.zeros([self.args.num_processes, 1])
    final_rewards = torch.zeros([self.args.num_processes, 1])
    # start to update
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                _, pi = self.net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # store the rollout information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            # step the environments
            obs, rewards, dones, _ = self.envs.step(cpu_actions)
            # process rewards: keep the raw values for SIL, clip the training signal
            raw_rewards = copy.deepcopy(rewards)
            rewards = np.sign(rewards)
            self.dones = dones
            if not self.args.no_sil:
                sil_model.step(input_tensor.detach().cpu().numpy(), cpu_actions, raw_rewards, dones)
            mb_rewards.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            raw_rewards = torch.from_numpy(np.expand_dims(np.stack(raw_rewards), 1)).float()
            episode_rewards += raw_rewards
            # get the masks
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in dones])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # record the dones after the last step
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(input_tensor)
        # compute returns
        for n, (rewards, dones, value) in enumerate(
                zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.args.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.args.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        # update the network
        vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
        # update the sil module
        if not self.args.no_sil:
            mean_adv, num_samples = sil_model.train_sil_model()
        if update % self.args.log_interval == 0:
            if not self.args.no_sil:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, '
                      'Ent: {:.2f}, Min: {}, Max: {}, BR: {}, E: {}, VS: {}, S: {}'.format(
                          datetime.now(), update, num_updates,
                          (update + 1) * (self.args.num_processes * self.args.nsteps),
                          final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(),
                          sil_model.get_best_reward(), sil_model.num_episodes(), num_samples,
                          sil_model.num_steps()))
            else:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, '
                      'Ent: {:.2f}, Min: {}, Max: {}'.format(
                          datetime.now(), update, num_updates,
                          (update + 1) * (self.args.num_processes * self.args.nsteps),
                          final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
        torch.save(self.net.state_dict(), self.model_path + 'model.pt')
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_rawrewards = [], [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, pi, values, states, _ = self.model.step(self.obs, self.states)  # , self.dones) ?
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        obs = normalize_obs(obs)
        self.logger.debug('Observations: %s' % obs)
        # render only every i-th episode
        if self.show_interval != 0:
            if (self.ep_idx[0] % self.show_interval) == 0:
                self.env.render()
        # TODO: use the functions already implemented in run_ple_utils
        self.eplength = [self.eplength[i] + 1 for i in range(self.nenv)]
        self.epreturn = [self.epreturn[i] + rewards[i] for i in range(self.nenv)]
        [self.reward_window[i].append(rewards[i]) for i in range(self.nenv)]
        # check for terminal states in every env
        for i, done in enumerate(dones):  # i -> environment ID
            if done:
                self.ep_idx[i] += 1
                self.obs[i] = self.obs[i] * 0
                # update tensorboard summary
                if self.summary_writer is not None:
                    summary = tf.Summary()
                    summary.value.add(tag='envs/environment%s/episode_length' % i,
                                      simple_value=self.eplength[i])
                    summary.value.add(tag='envs/environment%s/episode_reward' % i,
                                      simple_value=self.epreturn[i])
                    self.summary_writer.add_summary(summary, self.ep_idx[i])  # self.global_step.eval()
                    self.summary_writer.flush()
                # self.retbuffer.append(self.epreturn[i])
                if self.epreturn[i] > self.return_threshold:
                    self.return_threshold = self.epreturn[i]
                    self.logger.info('Save model at max reward %s' % self.return_threshold)
                    self.model.save('inter_model')
                self.eplength[i] = 0
                self.epreturn[i] = 0
        # # Not necessary, as the environment is continuous now!
        # # Reset the RNN state vector to 0 if the previous sample is a terminal one,
        # # as no history should be used in RNN training then.
        # if states:
        #     env_was_done = False
        #     for i, done in enumerate(self.dones):
        #         if done and not env_was_done:
        #             env_was_done = True
        #             c_new = states[0]
        #             h_new = states[1]
        #             c_new[i] = np.zeros_like(c_new[i])
        #             h_new[i] = np.zeros_like(h_new[i])
        #         elif done:
        #             c_new[i] = np.zeros_like(c_new[i])
        #             h_new[i] = np.zeros_like(h_new[i])
        #     if env_was_done:
        #         states = tf.contrib.rnn.LSTMStateTuple(c_new, h_new)
        #     # print(states)
        self.states = states
        self.dones = dones
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    # mb_masks = mb_dones[:, :-1] ?
    mb_rawrewards = np.copy(mb_rewards)
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        self.logger.debug('Discounted rewards: %s' % rewards)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    self.logger.debug('Actions: %s' % mb_actions)
    self.logger.debug('Q values: %s' % mb_values)
    self.logger.debug('Observations: %s' % mb_obs)
    return mb_obs, mb_states, mb_rewards, mb_actions, mb_values, self.reward_window, mb_rawrewards  # self.avg_return_n_episodes
def learn(self):
    num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
    # track rewards for logging
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    # start to update
    for update in range(num_updates):
        if self.args.lr_decay:
            self._adjust_learning_rate(update, num_updates)
        mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_obs_, mb_v_ex, mb_v_mix = [], [], [], [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                v_mix, pi = self.net(input_tensor)
                _, v_ex = self.intrinsic_net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # store the rollout information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            mb_v_ex.append(v_ex.detach().cpu().numpy().squeeze())
            mb_v_mix.append(v_mix.detach().cpu().numpy().squeeze())
            # step the environments
            obs_, rewards, dones, _ = self.envs.step(cpu_actions)
            # store the next observation
            mb_obs_.append(np.copy(obs_))
            # store the extrinsic rewards
            self.dones = dones
            mb_rewards_ex.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs_
            episode_rewards += rewards
            # get the masks
            masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # record the dones after the last step
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        """
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0)
        """
        # calculate the intrinsic rewards and make sure the dimensions are right
        mb_rewards_in = self._compute_intrinsic_rewards(mb_obs, mb_obs_)
        mb_rewards_in = mb_rewards_in.reshape((self.args.num_workers, self.args.nsteps))
        mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32).swapaxes(1, 0)
        # calculate the mixed reward
        mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
        mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
        # masks
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        # calculate the last values
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values_mix, _ = self.net(input_tensor)
            last_values_mix = last_values_mix.detach().cpu().numpy().squeeze()
            _, last_values_ex = self.intrinsic_net(input_tensor)
            last_values_ex = last_values_ex.detach().cpu().numpy().squeeze()
        # compute extrinsic and mixed returns
        mb_returns_ex, mb_returns_mix = np.zeros(mb_rewards_ex.shape), np.zeros(mb_rewards_in.shape)
        for n, (rewards_ex, rewards_mix, dones, value_mix, value_ex) in enumerate(
                zip(mb_rewards_ex, mb_rewards_mix, mb_dones, last_values_mix, last_values_ex)):
            rewards_ex = rewards_ex.tolist()
            rewards_mix = rewards_mix.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                returns_ex = discount_with_dones(rewards_ex + [value_ex], dones + [0], self.args.gamma)[:-1]
                returns_mix = discount_with_dones(rewards_mix + [value_mix], dones + [0], self.args.gamma)[:-1]
            else:
                returns_ex = discount_with_dones(rewards_ex, dones, self.args.gamma)
                returns_mix = discount_with_dones(rewards_mix, dones, self.args.gamma)
            mb_returns_ex[n] = returns_ex
            mb_returns_mix[n] = returns_mix
        # flatten everything
        mb_rewards_ex = mb_rewards_ex.flatten()
        mb_rewards_in = mb_rewards_in.flatten()
        mb_returns_ex = mb_returns_ex.flatten()
        mb_returns_mix = mb_returns_mix.flatten()
        mb_actions = mb_actions.flatten()
        mb_v_ex = mb_v_ex.flatten()
        mb_v_mix = mb_v_mix.flatten()
        mb_dones = mb_dones.flatten()
        mb_masks = mb_masks.flatten()
        # before training, calculate the discounting coefficient matrix
        dis_v_mix_last = np.zeros([mb_obs.shape[0]], np.float32)
        coef_mat = np.zeros([mb_obs.shape[0], mb_obs.shape[0]], np.float32)
        for i in range(mb_obs.shape[0]):
            dis_v_mix_last[i] = self.args.gamma ** (self.args.nsteps - i % self.args.nsteps) * \
                last_values_mix[i // self.args.nsteps]
            coef = 1.0
            for j in range(i, mb_obs.shape[0]):
                if j > i and j % self.args.nsteps == 0:
                    break
                coef_mat[i][j] = coef
                coef *= self.args.gamma
                if mb_dones[j]:
                    dis_v_mix_last[i] = 0
                    break
        # update the networks
        vl, al, ent = self._update_network(mb_obs, mb_obs_, mb_masks, mb_actions, mb_rewards_ex,
                                           mb_returns_ex, mb_v_ex, mb_v_mix, dis_v_mix_last, coef_mat)
        if update % self.args.log_interval == 0:
            print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, '
                  'Ent: {:.2f}, Min: {}, Max: {}, R_in: {:.3f}'.format(
                      datetime.now(), update, num_updates,
                      (update + 1) * (self.args.num_workers * self.args.nsteps),
                      final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(),
                      np.mean(mb_rewards_in)))
        torch.save(self.net.state_dict(), self.model_path + 'model.pt')
def run(self):
    mb_obs, mb_rewards, mb_rewards_square, mb_actions, mb_values, mb_moments, mb_dones = [], [], [], [], [], [], []
    mb_states = self.states
    epinfos = []
    for n in range(self.nsteps):
        actions, values, moments, states, _ = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_moments.append(moments)
        mb_dones.append(self.dones)
        obs, rewards, dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
                self.counters_fixed.append(self.counters[n])
                self.counters[n] = 0
            else:
                self.counters[n] += rewards[n]
        self.obs = obs
        rewards = np.sign(rewards)
        mb_rewards.append(rewards)
        mb_rewards_square.append(rewards)  # NOTE: possibly an error -- appends the signed rewards, not their squares
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_rewards_square = np.asarray(mb_rewards_square, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_moments = np.asarray(mb_moments, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    values_temp, moments_temp = self.model.value(self.obs, self.states, self.dones)
    last_values = values_temp.tolist()
    last_moments = moments_temp.tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value, moment) in enumerate(
            zip(mb_rewards, mb_dones, last_values, last_moments)):
        rewards_square = rewards.copy()
        rewards_square = rewards_square.tolist()
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            sigma = np.sqrt(np.maximum(moment - value**2, 0))
            prob_value = np.random.normal(loc=value, scale=sigma)
            # prob_value = value + sigma
            rewards = discount_with_dones(rewards + [prob_value], dones + [0], self.gamma)[:-1]
            rewards_square = discount_moments_with_dones(rewards_square, dones, self.gamma,
                                                         flag=True, value=value, moment=moment)
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
            rewards_square = discount_moments_with_dones(rewards_square, dones, self.gamma)
        mb_rewards[n] = rewards
        mb_rewards_square[n] = rewards_square
    mb_rewards = mb_rewards.flatten()
    mb_rewards_square = mb_rewards_square.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_moments = mb_moments.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_rewards_square, mb_masks, mb_actions, mb_values, mb_moments, epinfos
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_actions_n = []
    mb_raw_rewards = []
    mb_states = self.states
    int_reward = np.zeros(self.nenvs)
    if self.first_flag == 1:
        self.actions, self.values, self.act_pred, _, _ = self.model.step(self.obs, self.states, self.dones)
        self.first_flag = 0
        self.last_actpred = np.copy(self.act_pred)
        print('first_flag:', self.first_flag)
    for n in range(self.nsteps):
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(self.actions)
        mb_values.append(self.values)
        mb_dones.append(self.dones)
        obs_all, raw_rewards, dones, _ = self.env.step(self.actions)
        obs = [obs_index['image'] for obs_index in obs_all]
        obs = np.asarray(obs)
        rewards = np.array(raw_rewards, dtype=np.float32)
        self.dones = dones
        # added by lilijuan at 2018.9.25
        last_obs = np.copy(self.obs)
        last_actions = np.copy(self.actions)
        # small intrinsic bonus whenever the taken action differs from the predicted action
        for i in range(self.nenvs):
            int_reward[i] = self.actions[i] != self.last_actpred[i]
        int_reward = np.array(int_reward * 0.001, dtype=np.float32)
        rewards += int_reward
        self.last_actpred = np.copy(self.act_pred)
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
        mb_raw_rewards.append(raw_rewards)
        self.actions, self.values, self.act_pred, _, _ = self.model.step(self.obs, self.states, self.dones)
        mb_actions_n.append(self.actions)
        # added by lilijuan at 2018.9.25: store(obs, action, action_n, reward, done)
        # self.model.sil.step(last_obs, last_actions,
        #                     self.actions, raw_rewards, dones)
        # self.model.sil.step(last_obs, last_actions, self.actions,
        #                     raw_rewards, rewards, dones)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_raw_rewards = np.asarray(mb_raw_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_actions_n = np.asarray(mb_actions_n, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    # last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    last_values = self.values.tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_raw_rewards = mb_raw_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_actions_n = mb_actions_n.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_raw_rewards, mb_masks, mb_actions, mb_values, mb_actions_n, int_reward
def run(self):
    mb_obs, mb_maps, mb_coords, mb_states, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [], [], [], []
    for n in range(self.nsteps):
        # actions, values, states = self.model.step(self.obs, self.states, self.dones)
        actions, values, states, maps = self.model.step(self.sf.obs, maps=self.maps, coords=self.sf.coords)
        mb_obs.append(np.copy(self.sf.obs))
        mb_coords.append(np.copy(self.sf.coords))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        # the outer loop is per-env done,
        # the inner check is per-player done
        for i, done in enumerate(dones):
            if done[0]:
                self.sf.obs[i, :] = self.sf.obs[i, :] * 0
                self.sf.coords[i, :] = self.sf.coords[i, :] * 0
                if self.maps != []:
                    self.maps[i] = self.maps[i] * 0
        self.sf.update_obs(obs)
        mb_states.append(np.copy(self.states))
        mb_maps.append(np.copy(maps))
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_states = np.asarray(mb_states, dtype=np.float32).swapaxes(1, 0).reshape([self.nenv * self.nsteps, -1])
    mb_maps = np.asarray(mb_maps, dtype=np.float32).swapaxes(1, 0).reshape([self.nenv * self.nsteps] + self.map_size)
    mb_coords = np.asarray(mb_coords, dtype=np.uint8).swapaxes(1, 0).reshape([self.nenv * self.nsteps, -1, 2])
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0).swapaxes(2, 1)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0).swapaxes(2, 1)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0).swapaxes(2, 1)
    mb_masks = mb_dones[:, :, :-1]
    mb_dones = mb_dones[:, :, 1:]
    last_values = self.model.value(self.sf.obs, maps=self.maps, coords=self.sf.coords)
    # discount/bootstrap off value fn (per env, then per player)
    for i, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        for j, (r, d, v) in enumerate(zip(rewards, dones, value)):
            if d[-1] == 0:
                r = discount_with_dones(np.asarray(r + [v]), d + [0], self.gamma)[:-1]
            else:
                r = discount_with_dones(np.asarray(r), d, self.gamma)
            mb_rewards[i, j] = r
    """
    if dones[-1] == 0:
        rewards = discount_with_dones(np.asarray(rewards+[value]), dones+[0], self.gamma)[:-1]
    else:
        rewards = discount_with_dones(np.asarray(rewards), dones, self.gamma)
    mb_rewards[n] = rewards
    """

    def _flatten(arr):
        return arr.reshape(-1)

    mb_rewards = _flatten(mb_rewards.swapaxes(2, 1))
    mb_values = _flatten(mb_values.swapaxes(2, 1))
    mb_masks = mb_masks.swapaxes(2, 1)
    mb_masks = mb_masks.reshape(mb_masks.shape[0] * mb_masks.shape[1], -1)
    mb_actions = _flatten(mb_actions)
    return mb_obs, mb_maps, mb_coords, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(self):
    num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
    # track rewards for logging
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    # start to update
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                _, pi = self.net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # store the rollout information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            # step the environments
            obs, rewards, dones, _ = self.envs.step(cpu_actions)
            # store the rewards
            self.dones = dones
            mb_rewards.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            episode_rewards += rewards
            # get the masks
            masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # record the dones after the last step
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        # calculate the last value
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(input_tensor)
        # compute returns
        for n, (rewards, dones, value) in enumerate(
                zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.args.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.args.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        # update the network
        vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
        if update % self.args.log_interval == 0:
            print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, '
                  'Ent: {:.2f}, Min: {}, Max: {}'.format(
                      datetime.now(), update, num_updates,
                      (update + 1) * (self.args.num_workers * self.args.nsteps),
                      final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
        torch.save(self.net.state_dict(), self.model_path + 'model.pt')