def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        if self.render:
            self.env.render()
        if self.record:
            frame = obs[0, :, :, :3]
            self.recording.append(frame)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        # Iterate rewards step-by-step and add to final scores if done
        for i in range(self.nsteps):
            # Add reward to episode reward
            self.episode_rewards[n] += rewards[i]
            if dones[i] == 1:
                # Add final result to episode rewards
                self.final_rewards.append(self.episode_rewards[n])
                # Reset local episode reward
                self.episode_rewards[n] = 0
                # Save current game as a video
                if self.record:
                    self.makevideo()
        # Discount rewards
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
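# Every runner in this collection calls discount_with_dones but none of them defines it.
# For reference, this is a minimal sketch of the OpenAI baselines helper (as found in
# baselines/a2c/utils.py); if a fork ships its own variant, that version takes precedence.
import numpy as np

def discount_with_dones(rewards, dones, gamma):
    """Discounted returns for one environment, reset at episode boundaries.

    rewards, dones: equal-length sequences for a single rollout segment.
    Returns a list of the same length with the discounted return at each step.
    """
    discounted = []
    r = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        # walking backwards: zero the accumulated tail whenever an episode ended
        r = reward + gamma * r * (1.0 - done)
        discounted.append(r)
    return discounted[::-1]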
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)  # TODO: surrogate reward
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    if self.gamma > 0.0:
        # discount/bootstrap off value fn
        last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
    mb_actions = mb_actions.reshape(self.batch_action_shape)
    mb_rewards = mb_rewards.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self):
    # We initialize the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    epinfos = []
    for n in range(self.nsteps):
        # Given observations, take action and value (V(s))
        # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
        actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
        # Append the experiences
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        # Take actions in env and look at the results
        obs, rewards, dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        self.states = states
        self.dones = dones
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # Batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    if self.gamma > 0.0:
        # Discount/bootstrap off value fn
        last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
    mb_actions = mb_actions.reshape(self.batch_action_shape)
    mb_rewards = mb_rewards.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_raw_rewards = [], [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        # print('actions:', actions)
        obs_all, raw_rewards, dones, _ = self.env.step(actions)
        obs = [obs_index['image'] for obs_index in obs_all]
        obs = np.asarray(obs)
        rewards = raw_rewards
        self.states = states
        self.dones = dones
        if hasattr(self.model, 'sil'):
            self.model.sil.step(self.obs, actions, raw_rewards, dones)
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
        mb_raw_rewards.append(raw_rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_raw_rewards = np.asarray(mb_raw_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_raw_rewards = mb_raw_rewards.flatten()
    # print('mb_rewards:', mb_rewards.shape)
    # print('mb_raw_rewards:', mb_raw_rewards.shape)
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_raw_rewards
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    epinfos = []
    for n in range(self.nsteps):
        actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
def run(self): """ Run a learning step of the model :return: ([float], [float], [float], [bool], [float], [float]) observations, states, rewards, masks, actions, values """ mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] mb_states = self.states for _ in range(self.n_steps): actions, values, states, _ = self.model.step( self.obs, self.states, self.dones) mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) mb_values.append(values) mb_dones.append(self.dones) obs, rewards, dones, _ = self.env.step(actions) self.states = states self.dones = dones for n, done in enumerate(dones): if done: self.obs[n] = self.obs[n] * 0 self.obs = obs mb_rewards.append(rewards) mb_dones.append(self.dones) # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] last_values = self.model.value(self.obs, self.states, self.dones).tolist() # discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_rewards = mb_rewards.flatten() mb_actions = mb_actions.flatten() mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        episode_values = mb_values[:, n]
        episode_rewards = mb_rewards[:, n]
        mean_value_error = np.absolute(np.subtract(episode_values, episode_rewards))
        self.env.report(mean_value_error)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self, verbose=False):
    """ Get batchwise data for training """
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_states = [], [], [], [], [], []
    for n in range(self.p.N_STEPS):
        actions, values, _ = self.model.step(self.obs, self.states)
        mb_obs.append(np.copy(self.obs))
        mb_states.append(np.copy(self.states))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, states = self.env.step(actions, verbose=verbose)
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        self.states = states
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.p.BATCH_OBS_SHAPE)
    mb_states = np.asarray(mb_states, dtype=np.float).swapaxes(1, 0).reshape(self.p.BATCH_STATE_SHAPE)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = np.array(dones).astype('int32').tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.p.GAMMA_DF)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.p.GAMMA_DF)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, mb_states
def discount_reward(self, gamma, last_values):
    if gamma > 0.0:
        # Discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(self.r, self.dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, gamma)
            self.r[n] = rewards
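# A small worked example of the bootstrap branch above, using made-up numbers and the
# discount_with_dones sketch given earlier. When the segment does not end in a terminal
# state, the critic's estimate for the next state is appended, discounted through, and
# then dropped again, so every reward in the segment sees the bootstrapped tail.
gamma = 0.99
rewards = [0.0, 0.0, 1.0]   # one environment's rewards for a 3-step segment
dones = [0, 0, 0]           # segment ends mid-episode
last_value = 2.0            # critic's V(s') for the state after the last step

# Not terminal: append V(s'), discount, then drop the appended element.
returns = discount_with_dones(rewards + [last_value], dones + [0], gamma)[:-1]
# returns[2] = 1.0 + 0.99 * 2.0      = 2.98
# returns[1] = 0.99 * 2.98           ~ 2.9502
# returns[0] = 0.99 * 2.9502         ~ 2.9207

# Terminal: no bootstrap, the return after the final reward is zero.
terminal_returns = discount_with_dones(rewards, [0, 0, 1], gamma)
# terminal_returns == [0.9801, 0.99, 1.0]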
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        actions, values, states = self.model.step(self.stochastic, self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for i, done in enumerate(dones):
            if done:
                self.obs[i] = 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    raw_rewards = mb_rewards.flatten()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_obs = mb_obs.reshape([-1] + list(mb_obs.shape[2:]))
    mb_returns = mb_rewards.flatten()  # because it contains returns
    mb_actions = np.reshape(mb_actions, [-1] + list(mb_actions.shape[2:]))
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, raw_rewards, mb_returns, mb_masks, mb_actions, mb_values
def add_episode(self, trajectory):
    obs = []
    actions = []
    rewards = []
    dones = []
    if self.stack > 1:
        ob_shape = list(trajectory[0][0].shape)
        nc = ob_shape[-1]
        ob_shape[-1] = nc * self.stack
        stacked_ob = np.zeros(ob_shape, dtype=trajectory[0][0].dtype)
    for (ob, action, reward) in trajectory:
        if ob is not None:
            x = self.fn_obs(ob) if self.fn_obs is not None else ob
            if self.stack > 1:
                stacked_ob = np.roll(stacked_ob, shift=-nc, axis=2)
                stacked_ob[:, :, -nc:] = x
                obs.append(stacked_ob)
            else:
                obs.append(x)
        else:
            obs.append(None)
        actions.append(action)
        rewards.append(self.fn_reward(reward) if self.fn_reward is not None else reward)
        dones.append(False)
    dones[len(dones) - 1] = True
    returns = discount_with_dones(rewards, dones, self.gamma)
    for (ob, action, R) in list(zip(obs, actions, returns)):
        self.buffer.add(ob, action, R)
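# A standalone sketch of the np.roll-based frame stacking used in add_episode above.
# Shapes, the stack_frame name and the toy values are illustrative assumptions, not part
# of the original code: the oldest channels are shifted out and the newest observation is
# written into the last nc channels.
import numpy as np

def stack_frame(stacked_ob, new_ob):
    """Shift the oldest frame out of the rolling buffer and append the newest one.

    stacked_ob: (H, W, nc * stack) rolling buffer; new_ob: (H, W, nc).
    """
    nc = new_ob.shape[-1]
    stacked_ob = np.roll(stacked_ob, shift=-nc, axis=2)
    stacked_ob[:, :, -nc:] = new_ob
    return stacked_ob

# Toy usage: a 4-frame stack of 84x84 single-channel observations.
stack = np.zeros((84, 84, 4), dtype=np.uint8)
for frame_idx in range(5):
    frame = np.full((84, 84, 1), frame_idx, dtype=np.uint8)
    stack = stack_frame(stack, frame)
print(stack[0, 0, :])  # [1 2 3 4]: the four most recent frames, oldest first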
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):  # run tmax steps
        actions, values, states = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)  # next obs after taking the step
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0  # reset
        self.update_obs(obs)  # roll the obs; keeps latest 4 frames
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    mb_batchactions = []
    mb_states = self.states
    epinfos = []
    for _ in range(self.nsteps):
        actions, values, self.states, neglogpacs, batchactions = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(self.dones)
        mb_batchactions.append(batchactions)
        self.obs[:], rewards, self.dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        mb_rewards.append(rewards)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=np.bool)
    mb_batchactions = np.asarray(mb_batchactions, dtype=np.float32)
    last_values = self.model.value(self.obs, self.states, self.dones)
    # discount/bootstrap off value fn
    if self.gae is True:
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
    else:
        mb_returns = discount_with_dones(mb_rewards, mb_dones, self.gamma)
        mb_returns = np.array(mb_returns)
    return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values,
                        mb_neglogpacs, mb_batchactions)), mb_states, epinfos)
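# A standalone sketch of the GAE(lambda) recursion used in the runner above, written for a
# single environment with fabricated inputs. The gae_advantages name and the toy numbers
# are assumptions for illustration; the loop mirrors the delta/lastgaelam computation but
# without the vectorised-environment bookkeeping.
import numpy as np

def gae_advantages(rewards, values, dones, last_value, last_done, gamma=0.99, lam=0.95):
    """Generalised Advantage Estimation for one trajectory segment.

    rewards, values, dones: arrays of length T; last_value/last_done describe the state
    after the final step. Returns (advantages, returns).
    """
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            nextnonterminal = 1.0 - last_done
            nextvalue = last_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalue = values[t + 1]
        # TD error at step t, then the exponentially weighted sum of future TD errors.
        delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    return advs, advs + values

# Toy usage with made-up numbers.
advs, rets = gae_advantages(
    rewards=np.array([0.0, 0.0, 1.0], dtype=np.float32),
    values=np.array([0.5, 0.6, 0.7], dtype=np.float32),
    dones=np.array([0.0, 0.0, 0.0], dtype=np.float32),
    last_value=0.8, last_done=0.0)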
def learn_hierarchical(env, batch_size, total_epoches, gamma,
                       g_ob_size, g_act_size, g_latents, g_lr, g_ac,
                       l_ob_size, l_act_size, l_latents, l_lr, l_ac):
    """ Learn a hierarchical model """
    model = Hierarchical(g_ob_size, g_act_size, g_latents, g_lr, g_ac,
                         l_ob_size, l_act_size, l_latents, l_lr, l_ac)
    mb_obs, mb_acts, mb_rews, mb_vals, mb_dones = [], [], [], [], []
    tqdm.write('training hierarchical model')
    for ep in tqdm(range(total_epoches)):
        obs = env.reset(model.local_model, model.predictor)  # initial observation
        done = False
        while not done:
            action = model.step(obs)   # RL chooses action based on observation
            value = model.value(obs)
            action = int(action)
            value = int(value)
            mb_obs.append(obs)
            mb_acts.append(action)
            mb_vals.append(value)
            obs_, reward, done, info = env.step(action)
            mb_rews.append(reward)
            mb_dones.append(done)
            obs = obs_
        if ep % batch_size == 0:
            mb_rews = discount_with_dones(mb_rews, mb_dones, gamma)
            model.global_model.train(np.array(mb_obs), np.array(mb_acts),
                                     np.array(mb_rews), np.array(mb_vals))
            mb_obs, mb_acts, mb_rews, mb_vals, mb_dones = [], [], [], [], []
    return model
def run(self):
    # curiosity = True
    # curiosity = False
    # We initialize the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_next_states = [], [], [], [], [], []
    mb_states = self.states
    icm_testing_rewards = []
    for n in range(self.nsteps):
        # Given observations, take action and value (V(s))
        # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
        actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
        # Append the experiences
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        if self.curiosity == True:
            icm_states = self.obs
        # Take actions in env and look at the results
        obs, rewards, dones, _ = self.env.step(actions)
        if self.curiosity == True:
            icm_next_states = obs
            icm_rewards = self.icm.calculate_intrinsic_reward(icm_states, icm_next_states, actions)
            icm_testing_rewards.append(icm_rewards)
            # Alternatives that were tried here:
            # icm_rewards = [icm_rewards] * len(rewards)
            # icm_rewards = np.clip(icm_rewards, -constants['REWARD_CLIP'], constants['REWARD_CLIP'])
            # rewards = icm_rewards + rewards
            # rewards = np.clip(rewards, -constants['REWARD_CLIP'], +constants['REWARD_CLIP'])
        mb_next_states.append(np.copy(obs))
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # Batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_next_states = np.asarray(mb_next_states, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    if self.curiosity:
        icm_testing_rewards = np.asarray(icm_testing_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    # Pass the intrinsic reward through the reward forward filter, normalise it, and add it
    # to the extrinsic reward.
    if self.curiosity == True:
        rffs = np.array([self.rff.update(rew) for rew in icm_testing_rewards.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        rews = icm_testing_rewards / np.sqrt(self.rff_rms.var)
        # mb_rewards = rews
        mb_rewards = mb_rewards + rews
        # Clipping and threshold-based alternatives that were tried here:
        # mb_rewards = np.clip(mb_rewards, -constants['REWARD_CLIP'], constants['REWARD_CLIP'])
        # icm_testing_rewards = (icm_testing_rewards > rffs_mean).astype(np.float32)
        # icm_testing_rewards[icm_testing_rewards > rffs_mean] = 0.5
        # icm_testing_rewards[icm_testing_rewards < rffs_mean] = 0
        # mb_rewards = icm_testing_rewards + mb_rewards
    if self.curiosity == True:
        if self.gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
            for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                # With curiosity the value is always bootstrapped, regardless of dones[-1]:
                # if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
                # else:
                #     rewards = discount_with_dones(rewards, dones, self.gamma)
                mb_rewards[n] = rewards
    else:
        if self.gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
            for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)
                mb_rewards[n] = rewards
    # Normalising the discounted reward afterwards was also tried:
    # rffs_mean, rffs_std, rffs_count = mpi_moments(mb_rewards.ravel())
    # self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
    # mb_rewards = mb_rewards / np.sqrt(self.rff_rms.var)
    mb_actions = mb_actions.reshape(self.batch_action_shape)
    mb_rewards = mb_rewards.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    if self.curiosity == True:
        mb_rews_icm = rews.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_next_states
    # , mb_rews_icm, mb_new_updated_reward, mb_new_rew
def run(self):
    mb_obs, mb_options, mb_rewards, mb_actions, mb_values, mb_dones, mb_costs = [], [], [], [], [], [], []
    for n in range(self.nsteps):
        actions, values = self.model.step(self.obs, self.options)
        mb_obs.append(np.copy(self.obs))
        mb_options.append(np.copy(self.options))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        # Update current option
        self.options, costs = self.model.update_options(self.obs, self.options,
                                                        self.option_eps, self.delib_cost)
        mb_costs.append(costs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_options = np.asarray(mb_options, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_costs = np.asarray(mb_costs, dtype=np.float32).swapaxes(1, 0)
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value[0]], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_options = mb_options.flatten()
    mb_values = mb_values.flatten()
    mb_costs = mb_costs.flatten()
    return mb_obs, mb_options, mb_rewards, mb_actions, mb_values, mb_costs
def run(self):
    mb_obs, mb_td_targets, mb_base_actions, \
        mb_xy0, mb_xy1, \
        mb_values, mb_dones \
        = [], [], [], [], [], [], []
    mb_states = self.states
    for n in range(self.nsteps):
        # pi, pi2, x1, y1, x2, y2, v0
        pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones)
        pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3
        base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1)
        xy0 = np.argmax(pi_xy0, axis=1)
        x0 = (xy0 % 32).astype(int)
        y0 = (xy0 / 32).astype(int)
        xy1 = np.argmax(pi_xy1, axis=1)
        x1 = (xy1 % 32).astype(int)
        y1 = (xy1 / 32).astype(int)

        # Scripted Agent Hacking
        for env_num in range(self.nenv):
            if env_num >= self.nscripts:  # only for scripted agents
                continue
            ob = self.obs[env_num, :, :, :]
            player_relative = ob[:, :, -1]
            self.group_list[env_num] = common.update_group_list2(self.control_groups[env_num])
            if len(self.action_queue[env_num]) == 0:
                self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \
                    common.solve_tsp(player_relative,
                                     self.selected[env_num][0],
                                     self.group_list[env_num],
                                     self.group_id[env_num],
                                     self.dest_per_marine[env_num],
                                     self.xy_per_marine[env_num])
                base_actions[env_num] = 0
                x0[env_num] = 0
                y0[env_num] = 0
                x1[env_num] = 0
                y1[env_num] = 0
            if len(self.action_queue[env_num]) > 0:
                action = self.action_queue[env_num].pop(0)
                base_actions[env_num] = action.get("base_action", 0)
                x0[env_num] = action.get("x0", 0)
                y0[env_num] = action.get("y0", 0)
                xy0[env_num] = y0[env_num] * 32 + x0[env_num]
                x1[env_num] = action.get("x1", 0)
                y1[env_num] = action.get("y1", 0)
                xy1[env_num] = y1[env_num] * 32 + x1[env_num]

        base_actions = self.valid_base_action(base_actions)
        new_base_actions = self.trans_base_actions(base_actions)
        base_action_spec = self.env.action_spec(new_base_actions)
        # print("base_actions:", base_actions)
        actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1)

        mb_obs.append(np.copy(self.obs))
        mb_base_actions.append(base_actions)
        mb_xy0.append(xy0)
        mb_xy1.append(xy1)
        mb_values.append(values)
        mb_dones.append(self.dones)

        # print("final actions : ", actions)
        obs, rewards, dones, \
            available_actions, army_counts, \
            control_groups, selected, xy_per_marine \
            = self.env.step(actions=actions)
        self.army_counts = army_counts
        self.control_groups = control_groups
        self.selected = selected
        for env_num, data in enumerate(xy_per_marine):
            self.xy_per_marine[env_num] = data
        self.update_available(available_actions)

        self.states = states
        self.dones = dones
        mean_100ep_reward_a2c = 0
        for n, done in enumerate(dones):
            self.total_reward[n] += float(rewards[n])
            if done:
                self.obs[n] = self.obs[n] * 0
                self.episodes += 1
                num_episodes = self.episodes
                self.episode_rewards.append(self.total_reward[n])
                model = self.model
                mean_100ep_reward = round(np.mean(self.episode_rewards[-101:]), 1)
                if n < self.nscripts:  # scripted agents
                    self.episode_rewards_script.append(self.total_reward[n])
                    mean_100ep_reward_script = round(np.mean(self.episode_rewards_script[-101:]), 1)
                    nsml.report(
                        reward_script=self.total_reward[n],
                        mean_reward_script=mean_100ep_reward_script,
                        reward=self.total_reward[n],
                        mean_100ep_reward=mean_100ep_reward,
                        episodes=self.episodes,
                        step=self.episodes,
                        scope=locals())
                else:
                    self.episode_rewards_a2c.append(self.total_reward[n])
                    mean_100ep_reward_a2c = round(np.mean(self.episode_rewards_a2c[-101:]), 1)
                    nsml.report(
                        reward_a2c=self.total_reward[n],
                        mean_reward_a2c=mean_100ep_reward_a2c,
                        reward=self.total_reward[n],
                        mean_100ep_reward=mean_100ep_reward,
                        episodes=self.episodes,
                        step=self.episodes,
                        scope=locals())
                    print("mean_100ep_reward_a2c", mean_100ep_reward_a2c)
                if self.callback is not None:
                    self.callback(locals(), globals())
                self.total_reward[n] = 0
                self.group_list[n] = []

        self.update_obs(obs)
        mb_td_targets.append(rewards)
    mb_dones.append(self.dones)

    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0)
    mb_base_actions = np.asarray(mb_base_actions, dtype=np.int32).swapaxes(1, 0)
    mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0)
    mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_td_targets, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_td_targets[n] = rewards
    mb_td_targets = mb_td_targets.flatten()
    mb_base_actions = mb_base_actions.flatten()
    mb_xy0 = mb_xy0.flatten()
    mb_xy1 = mb_xy1.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_td_targets, mb_masks, \
        mb_base_actions, mb_xy0, mb_xy1, mb_values
####################### modify the dataframe dimension ########################
if (sample_time - 1) % 10 == 0:
    IsPlot = True
else:
    IsPlot = False

if sample_time % train_freq == 0:
    states = np.vstack(states)
    actions_idx = np.vstack(actions_idx)
    actions = np.array(actions)
    rewards_tmp = rewards.copy()
    last_value = expected_sarsa(model, last_state, K, C, action_low, action_high,
                                False, random_choose, num=100)
    rewards_tmp.append(last_value)
    Q_target = discount_with_dones(rewards_tmp, dones + [last_done], gamma)
    Q_target = np.float32(np.vstack(Q_target))[:-1]

    R_buffer_sample = replay_buffer.sample(np.min([minibatch, timestep]))
    next_states_sampled = np.squeeze(R_buffer_sample[3], axis=1)
    dones_sampled = R_buffer_sample[4]
    reward_sampled = R_buffer_sample[2]
    last_v = [expected_sarsa(model, np.reshape(state_tmp, (1, -1)), K, C, action_low,
                             action_high, True, random_choose, num=100)
              for state_tmp in next_states_sampled]
    last_v = np.vstack(last_v)
    Q_target_hist = reward_sampled + last_v * (1 - dones_sampled) * gamma
    states_sampled1 = np.squeeze(R_buffer_sample[0], axis=1)
    states_sampled2 = states
    states_sampled = np.concatenate((states_sampled1, states_sampled2), axis=0)
    actions_sampled1 = R_buffer_sample[1]
def run(self):
    mb_obs, mb_r_ex, mb_r_in, mb_ac, mb_v_ex, mb_v_mix, mb_dones = [], [], [], [], [], [], []
    mb_policy_states = []
    ep_info, ep_r_ex, ep_r_in, ep_len = [], [], [], []
    for n in range(self.nsteps):
        mb_policy_states.append(self.policy_states)
        ac, v_ex, v_mix, policy_states, _ = self.model.step(self.obs, self.policy_states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_ac.append(ac)
        mb_v_ex.append(v_ex)
        mb_v_mix.append(v_mix)
        mb_dones.append(self.dones)
        obs, r_ex, dones, infos = self.env.step(ac)
        r_in = self.model.intrinsic_reward(self.obs, ac)
        mb_r_ex.append(r_ex)
        mb_r_in.append(r_in)
        self.policy_states = policy_states
        self.dones = dones
        self.ep_r_ex += r_ex
        self.ep_r_in += r_in
        self.ep_len += 1
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                ep_info.append(maybeepinfo)
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
                ep_r_ex.append(self.ep_r_ex[n])
                ep_r_in.append(self.ep_r_in[n])
                ep_len.append(self.ep_len[n])
                self.ep_r_ex[n], self.ep_r_in[n], self.ep_len[n] = 0, 0, 0
        self.obs = obs
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_r_ex = np.asarray(mb_r_ex, dtype=np.float32).swapaxes(1, 0)
    mb_r_in = np.asarray(mb_r_in, dtype=np.float32).swapaxes(1, 0)
    mb_r_mix = self.r_ex_coef * mb_r_ex + self.r_in_coef * mb_r_in
    mb_ac = np.asarray(mb_ac, dtype=np.int32).swapaxes(1, 0)
    mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
    mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_v_ex, last_v_mix = self.model.value(self.obs, self.policy_states, self.dones)
    last_v_ex, last_v_mix = last_v_ex.tolist(), last_v_mix.tolist()
    # discount/bootstrap off value fn
    mb_ret_ex, mb_ret_mix = np.zeros(mb_r_ex.shape), np.zeros(mb_r_mix.shape)
    for n, (r_ex, r_mix, dones, v_ex, v_mix) in enumerate(
            zip(mb_r_ex, mb_r_mix, mb_dones, last_v_ex, last_v_mix)):
        r_ex, r_mix = r_ex.tolist(), r_mix.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            ret_ex = discount_with_dones(r_ex + [v_ex], dones + [0], self.gamma)[:-1]
            ret_mix = discount_with_dones(r_mix + [v_mix], dones + [0], self.gamma)[:-1]
        else:
            ret_ex = discount_with_dones(r_ex, dones, self.gamma)
            ret_mix = discount_with_dones(r_mix, dones, self.gamma)
        mb_ret_ex[n], mb_ret_mix[n] = ret_ex, ret_mix
    mb_r_ex = mb_r_ex.flatten()
    mb_r_in = mb_r_in.flatten()
    mb_ret_ex = mb_ret_ex.flatten()
    mb_ret_mix = mb_ret_mix.flatten()
    mb_ac = mb_ac.flatten()
    mb_v_ex = mb_v_ex.flatten()
    mb_v_mix = mb_v_mix.flatten()
    mb_masks = mb_masks.flatten()
    mb_dones = mb_dones.flatten()
    return mb_obs, mb_ac, mb_policy_states, mb_r_in, mb_r_ex, mb_ret_ex, mb_ret_mix, \
        mb_v_ex, mb_v_mix, last_v_ex, last_v_mix, mb_masks, mb_dones, \
        ep_info, ep_r_ex, ep_r_in, ep_len
def run(self):
    self.tot_rewards = []
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_rs, mb_rr = [], []
    for n in range(self.nsteps):
        actions, values, rs, rr = self.model.act(self.obs)
        actions = np.array(actions)
        values = np.array(values)
        mb_rs.append(rs)
        mb_rr.append(rr)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.ep_rewards += rewards
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.tot_rewards.append(self.ep_rewards[n])
                self.ep_rewards[n] = 0
                self.obs[n] = self.obs[n] * 0
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rs = np.asarray(mb_rs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_rs_shape)
    mb_rr = np.asarray(mb_rr, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_rr_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = []
    for i in range(self.nenv):
        last_values.append(self.model.value(np.expand_dims(self.obs[i], axis=0)).tolist())
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    ep_reward_means = np.mean(self.tot_rewards) if len(self.tot_rewards) > 0 else None
    return mb_obs, mb_rs, mb_rr, mb_rewards, mb_masks, mb_actions, mb_values, ep_reward_means
def run(self):
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_decoded = [], [], [], [], [], []
    mb_states = self.states
    if self.render:
        plt.ion()  # note this correction
        fig = plt.figure(figsize=(5, 4))
        axes = []
        axes.append(fig.add_subplot(3, 4, 1))
        axes.append(fig.add_subplot(3, 4, 2))
        axes.append(fig.add_subplot(3, 4, 3))
        axes.append(fig.add_subplot(3, 4, 4))
        axes.append(fig.add_subplot(3, 1, 2))
        axes.append(fig.add_subplot(3, 4, 9))
        axes.append(fig.add_subplot(3, 4, 10))
        axes.append(fig.add_subplot(3, 4, 11))
        axes.append(fig.add_subplot(3, 4, 12))
    for n in range(self.nsteps):
        actions, values, decoded, encoded, states = self.model.step(self.obs, self.states, self.dones)
        if self.render:
            obs = self.obs[0]
            obs = np.swapaxes(obs, 0, 2)
            obs = np.swapaxes(obs, 1, 2)
            imgs = decoded[0]
            imgs = np.swapaxes(imgs, 0, 2)
            imgs = np.swapaxes(imgs, 1, 2)
            for i in range(len(imgs)):
                ob = obs[i]
                axes[i].clear()
                axes[i].set_yticklabels([])
                axes[i].imshow(ob, cmap='gray', interpolation='nearest', aspect='equal')
                img = imgs[i]
                axes[i + 5].clear()
                axes[i + 5].set_yticklabels([])
                axes[i + 5].imshow(img, cmap='gray', interpolation='nearest', aspect='equal')
            axes[4].clear()
            axes[4].imshow(encoded[0].reshape(8, 98), interpolation='nearest', aspect='equal')
            plt.show()
            plt.pause(0.000001)  # note this correction
        mb_obs.append(np.copy(self.obs))
        mb_decoded.append(decoded)
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_decoded = np.asarray(mb_decoded, dtype=np.float32).swapaxes(1, 0)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states, self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_decoded = mb_decoded.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_decoded
def run(self):
    # reset env
    self.obs = np.zeros(self.obs.shape)
    obs = self.env.reset()
    self.update_obs(obs)

    # run env until all threads finish
    episode_over = [-1 for i in range(self.nenv)]
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_u1, mb_u2 = [], [], [], [], [], [], []
    mb_states = self.states
    step = 0
    while not all([e >= 0 for e in episode_over]):
        actions, u1, u2, values, states = self.model.step(self.obs, self.states, self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        mb_u1.append(u1)
        mb_u2.append(u2)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
                if episode_over[n] == -1:
                    episode_over[n] = step
        self.update_obs(obs)
        mb_rewards.append(rewards)
        step += 1
    mb_dones.append(self.dones)

    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs).swapaxes(1, 0)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    mb_u1 = np.asarray(mb_u1, dtype=np.float32).swapaxes(1, 0)
    mb_u2 = np.asarray(mb_u2, dtype=np.float32).swapaxes(1, 0)

    # discount/bootstrap off value fn
    _obs, _rewards, _actions, _values, _masks, _u1, _u2 = [], [], [], [], [], [], []
    for n, (obs, rewards, actions, values, dones, masks, u1, u2) in enumerate(
            zip(mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_masks, mb_u1, mb_u2)):
        # pull out data
        rewards = rewards.tolist()
        self.rewards.append(sum(rewards))
        actions = actions.tolist()
        values = values.tolist()
        dones = dones.tolist()
        masks = masks.tolist()
        u1, u2 = u1.tolist(), u2.tolist()
        # get length of this episode
        episode_length = episode_over[n] + 1
        # crop out only played experience
        obs = obs[:episode_length]
        rewards = rewards[:episode_length]
        actions = actions[:episode_length]
        values = values[:episode_length]
        dones = dones[:episode_length]
        u1 = u1[:episode_length]
        u2 = u2[:episode_length]
        assert dones[-1] == True
        masks = masks[:episode_length]
        # discount the rewards
        rewards = discount_with_dones(rewards, dones, self.gamma)
        _obs.extend(obs)
        _rewards.extend(rewards)
        _actions.extend(actions)
        _values.extend(values)
        _masks.extend(masks)
        _u1.extend(u1)
        _u2.extend(u2)
    self.rewards = self.rewards[-100:]

    # make numpy
    mb_obs = np.asarray(_obs)
    mb_rewards = np.asarray(_rewards)
    mb_actions = np.asarray(_actions)
    mb_values = np.asarray(_values)
    mb_masks = np.asarray(_masks)
    mb_u1 = np.asarray(_u1)
    mb_u2 = np.asarray(_u2)

    self._num_rollouts += 1
    self._num_steps += len(rewards) * 4  # FRAME STACK
    ave_r = np.mean(self.rewards)
    # print("Episode {}, Ave R {}".format(self._num_rollouts, ave_r))
    logger.record_tabular("ave_r", ave_r)
    logger.record_tabular("last_r", self.rewards[-1])
    logger.record_tabular("num_rollouts", self._num_rollouts)
    logger.record_tabular("l", len(rewards) * 4)
    # logger.dump_tabular()
    END = False
    # print(self._num_steps, len(rewards))
    # if self._num_steps > 5000000:
    # if self._num_rollouts > 1000:
    if np.mean(self.rewards) >= 195.:
        logger.record_tabular("finished_in", self._num_rollouts)
        logger.record_tabular("total_steps", self._num_steps)
        logger.dump_tabular()
        END = True
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_u1, mb_u2, END
def run(self):
    if hasattr(self.model, 'step_env_random'):
        step_env_random = self.model.step_env_random
        sample_normal_op = tf.truncated_normal(shape=[7, 7, 1])
        new_normal_placeholder = tf.placeholder(shape=[self.nenvs, 7, 7, 1], dtype=tf.float32)
        assign_normal_op = tf.assign(ref=step_env_random, value=new_normal_placeholder)
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    dropout_strength_tup = (self.model.DROPOUT_STRENGTH,
                            get_dropout_strength(self.hparams, self.model.lr.n + self.nbatch))
    for n in range(self.nsteps):
        actions, values, states = self.model.step(self.obs, self.states, self.dones,
                                                  _dropout_strength=dropout_strength_tup)
        if hasattr(self.model, 'target_model'):
            values = self.model.target_model.value(self.obs, self.states, self.dones,
                                                   _dropout_strength=dropout_strength_tup).tolist()
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                self.obs[n] = self.obs[n] * 0
                if hasattr(self.model, 'step_env_random'):
                    [cur_rand, new_rand] = self.model.sess.run([step_env_random, sample_normal_op])
                    cur_rand[n] = new_rand
                    self.model.sess.run(assign_normal_op,
                                        feed_dict={new_normal_placeholder: cur_rand})
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    if hasattr(self.model, 'target_model'):
        last_values = self.model.target_model.value(self.obs, self.states, self.dones,
                                                    _dropout_strength=dropout_strength_tup).tolist()
    else:
        last_values = self.model.value(self.obs, self.states, self.dones,
                                       _dropout_strength=dropout_strength_tup).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards
    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self):
    # We initialize the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    stepdict = {"S": self.states, "M": self.dones}
    if self.dropoutpi != 1.0:
        stepdict["dropoutpi_keep_prob"] = 1.0
    if self.dropoutvf != 1.0:
        stepdict["dropoutvf_keep_prob"] = 1.0
    if self.isbnpitrainmode != None:
        stepdict["isbnpitrainmode"] = False
    if self.isbnvftrainmode != None:
        stepdict["isbnvftrainmode"] = False
    for n in range(self.nsteps):
        # Given observations, take action and value (V(s))
        actions, values, states, _ = self.model.step(self.obs, **stepdict)
        # Append the experiences
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        # Take actions in env and look at the results
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)
    # Batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    if self.gamma > 0.0:
        # Discount/bootstrap off value fn
        last_values = self.model.value(self.obs, **stepdict).tolist()
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
    mb_actions = mb_actions.reshape(self.batch_action_shape)
    mb_rewards = mb_rewards.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_chosen_probs = [],[],[],[],[],[] # Minibatch = mb mb_states = self.states for n in range(self.nsteps): actions, values, aprobs, states = self.model.step( self.obs, self.states, self.dones) mb_chosen_probs.append(aprobs[range(len(actions)), actions]) mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) mb_values.append(values) mb_dones.append(self.dones) obs, rewards, dones, _ = self.env.step(actions) self.states = states self.dones = dones for n, done in enumerate(dones): if done: self.obs[n] = self.obs[n] * 0 self.update_obs(obs) mb_rewards.append(rewards) mb_dones.append(self.dones) #batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_chosen_probs = np.asarray(mb_chosen_probs, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] last_values = self.model.value(self.obs, self.states, self.dones).tolist() #discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_rewards = mb_rewards.flatten() mb_chosen_probs = mb_chosen_probs.flatten() mb_actions = mb_actions.flatten() mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() mb_advs = mb_rewards - mb_values corrected_advs = [] if 'MIS_ADV' in os.environ: for adv_i, adv in enumerate(mb_advs): if adv < 0: adv_scale = 1. + mb_chosen_probs[adv_i] / ( 1. - mb_chosen_probs[adv_i]) # adv_scale = 1.0 # Disable with minimal code change # adv_scale = min(adv_scale, 5.0) corrected_advs.append(adv * adv_scale) else: corrected_advs.append(adv) mb_advs = np.array(corrected_advs) return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_advs
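# When the MIS_ADV environment flag is set, the variant above rescales
# negative advantages by 1 + p/(1 - p) = 1/(1 - p), where p is the sampled
# action's probability, so confidently chosen bad actions are penalised more
# strongly. A self-contained sketch of the same correction (assumes p < 1):
import numpy as np

def mis_correct_advantages(advs, chosen_probs):
    """Scale only the negative advantages by 1/(1 - p_chosen)."""
    advs = np.asarray(advs, dtype=np.float32)
    probs = np.asarray(chosen_probs, dtype=np.float32)
    scale = np.where(advs < 0, 1.0 / (1.0 - probs), 1.0)
    return advs * scale

# Example: an advantage of -1.0 taken with probability 0.8 becomes -5.0,
# while positive advantages are left untouched.
# mis_correct_advantages([-1.0, 2.0], [0.8, 0.8]) -> array([-5., 2.])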
def run(self): mb_obs, mb_td_targets, mb_base_actions, \ mb_xy0, mb_xy1, \ mb_values, mb_dones \ = [], [], [], [], [], [], [] mb_states = self.states for n in range(self.nsteps): # pi, pi2, x1, y1, x2, y2, v0 pi1, pi_xy0, pi_xy1, values, states = self.model.step( self.obs, self.states, self.dones) pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 base_actions = np.argmax( pi1 * self.base_act_mask + pi1_noise, axis=1) xy0 = np.argmax(pi_xy0, axis=1) x0 = (xy0 % 32).astype(int) y0 = (xy0 / 32).astype(int) xy1 = np.argmax(pi_xy1, axis=1) x1 = (xy1 % 32).astype(int) y1 = (xy1 / 32).astype(int) # Scripted Agent Hacking for env_num in range(self.nenv): if env_num >= self.nscripts: # only for scripted agents continue ob = self.obs[env_num, :, :, :] player_relative = ob[:, :, -1] self.group_list[env_num] = common.update_group_list2( self.control_groups[env_num]) if len(self.action_queue[env_num]) == 0: self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \ common.solve_tsp(player_relative, self.selected[env_num][0], self.group_list[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num]) base_actions[env_num] = 0 x0[env_num] = 0 y0[env_num] = 0 x1[env_num] = 0 y1[env_num] = 0 if len(self.action_queue[env_num]) > 0: action = self.action_queue[env_num].pop(0) base_actions[env_num] = action.get("base_action", 0) x0[env_num] = action.get("x0", 0) y0[env_num] = action.get("y0", 0) xy0[env_num] = y0[env_num] * 32 + x0[env_num] x1[env_num] = action.get("x1", 0) y1[env_num] = action.get("y1", 0) xy1[env_num] = y1[env_num] * 32 + x1[env_num] base_actions = self.valid_base_action(base_actions) new_base_actions = self.trans_base_actions(base_actions) base_action_spec = self.env.action_spec(new_base_actions) # print("base_actions:", base_actions) actions = self.construct_action( base_actions, base_action_spec, x0, y0, x1, y1 ) mb_obs.append(np.copy(self.obs)) mb_base_actions.append(base_actions) mb_xy0.append(xy0) mb_xy1.append(xy1) mb_values.append(values) mb_dones.append(self.dones) #print("final acitons : ", actions) obs, rewards, dones,\ available_actions, army_counts,\ control_groups, selected, xy_per_marine\ = self.env.step( actions=actions) self.army_counts = army_counts self.control_groups = control_groups self.selected = selected for env_num, data in enumerate(xy_per_marine): self.xy_per_marine[env_num] = data self.update_available(available_actions) self.states = states self.dones = dones mean_100ep_reward_a2c = 0 for n, done in enumerate(dones): self.total_reward[n] += float(rewards[n]) if done: self.obs[n] = self.obs[n] * 0 self.episodes += 1 num_episodes = self.episodes self.episode_rewards.append(self.total_reward[n]) model = self.model mean_100ep_reward = round( np.mean(self.episode_rewards[-101:]), 1) if (n < self.nscripts): # scripted agents self.episode_rewards_script.append( self.total_reward[n]) mean_100ep_reward_script = round( np.mean(self.episode_rewards_script[-101:]), 1) nsml.report( reward_script=self.total_reward[n], mean_reward_script=mean_100ep_reward_script, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, episodes=self.episodes, step=self.episodes, scope=locals() ) else: self.episode_rewards_a2c.append(self.total_reward[n]) mean_100ep_reward_a2c = round( np.mean(self.episode_rewards_a2c[-101:]), 1) nsml.report( reward_a2c=self.total_reward[n], mean_reward_a2c=mean_100ep_reward_a2c, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, 
episodes=self.episodes, step=self.episodes, scope=locals() ) print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) if self.callback is not None: self.callback(locals(), globals()) self.total_reward[n] = 0 self.group_list[n] = [] self.update_obs(obs) mb_td_targets.append(rewards) mb_dones.append(self.dones) #batch of steps to batch of rollouts mb_obs = np.asarray( mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape) mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) mb_base_actions = np.asarray( mb_base_actions, dtype=np.int32).swapaxes(1, 0) mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] last_values = self.model.value(self.obs, self.states, self.dones).tolist() #discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate( zip(mb_td_targets, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_td_targets[n] = rewards mb_td_targets = mb_td_targets.flatten() mb_base_actions = mb_base_actions.flatten() mb_xy0 = mb_xy0.flatten() mb_xy1 = mb_xy1.flatten() mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_td_targets, mb_masks, \ mb_base_actions, mb_xy0, mb_xy1, mb_values
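# The StarCraft II variant above flattens 32x32 screen coordinates into a
# single index (xy = y * 32 + x) for the spatial action heads and unpacks
# them again with modulo/division. A small sketch of that encoding:
import numpy as np

SCREEN = 32  # screen resolution assumed by the runner above

def xy_to_index(x, y, size=SCREEN):
    return y * size + x

def index_to_xy(xy, size=SCREEN):
    return (xy % size).astype(int), (xy // size).astype(int)

# Round trip: index 70 on a 32-wide screen is (x=6, y=2).
# index_to_xy(np.array([70])) -> (array([6]), array([2]))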
def run(self, update): # We initialize the lists that will contain the mb of experiences mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_pos, mb_nm, mb_nm_xy = [],[],[],[],[],[],[],[] epinfos = [] for n in range(self.nsteps): # Given observations, take action and value (V(s)) # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init # Prepare nm_xy for i in range(self.neural_map.shape[0]): if self.use_extended_write_op: self.neural_map_xy[i, 0, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] if self.pos[i, 2] == 0: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) + 1, int(self.pos[i, 0] // self.pos_x_divisor), :] elif self.pos[i, 2] == 1: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) + 1, :] elif self.pos[i, 2] == 2: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) - 1, int(self.pos[i, 0] // self.pos_x_divisor), :] elif self.pos[i, 2] == 3: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) - 1, :] else: self.neural_map_xy[i, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] actions, values, write_vector, _ = self.model.step( self.obs, S=self.neural_map, M=self.neural_map_xy) # Append the experiences mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) mb_values.append(values) mb_dones.append(self.dones) mb_pos.append(self.pos.copy()) mb_nm.append(self.neural_map.copy()) mb_nm_xy.append(self.neural_map_xy.copy()) # Update neural map with write vector for i in range(self.neural_map.shape[0]): if self.use_extended_write_op: self.neural_map[i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] = write_vector[ i, 0, :] if self.pos[i, 2] == 0: self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) + 1, int(self.pos[i, 0] // self.pos_x_divisor), :] = write_vector[i, 1, :] elif self.pos[i, 2] == 1: self.neural_map[i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) + 1, :] = write_vector[i, 1, :] elif self.pos[i, 2] == 2: self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) - 1, int(self.pos[i, 0] // self.pos_x_divisor), :] = write_vector[i, 1, :] elif self.pos[i, 2] == 3: self.neural_map[i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) - 1, :] = write_vector[i, 1, :] else: self.neural_map[i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] = write_vector[ i, :] # Take actions in env and look the results # Infos contains a ton of useful informations tmp, rewards, dones, infos = self.env.step(actions) obs = tmp[:, :-3] self.pos = tmp[:, -3:] #if (update > 120000) and (update <= 125000): step_offset = (update - 1) * self.nsteps #ex.log_scalar('neural_map', mb_nm[-1].tolist(), step_offset+n) #ex.log_scalar('obs', mb_obs[-1].tolist(), step_offset+n) #ex.log_scalar('pos', mb_pos[-1].tolist(), step_offset+n) #ex.log_scalar('action', actions.tolist(), step_offset+n) #ex.log_scalar('reward', rewards.tolist(), step_offset+n) #ex.log_scalar('done', dones.tolist(), step_offset+n) #if 'goal_positions' in infos[0]: #ex.log_scalar('ep_goal_positions', infos[0]['goal_positions'], step_offset+n) if 'episode' in 
infos[0]: ex.log_scalar('ep_length', infos[0]['episode']['l'], step_offset + n) ex.log_scalar('ep_reward', infos[0]['episode']['r'], step_offset + n) for info in infos: maybeepinfo = info.get('episode') if maybeepinfo: epinfos.append(maybeepinfo) self.dones = dones self.obs = obs mb_rewards.append(rewards) mb_dones.append(self.dones) # Batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes( 1, 0).reshape(self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray( mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] mb_pos = np.asarray(mb_pos, dtype=self.pos.dtype) mb_pos = mb_pos.swapaxes(0, 1).reshape(mb_pos.shape[0] * mb_pos.shape[1], mb_pos.shape[2]) mb_nm = np.asarray(mb_nm, dtype=self.neural_map.dtype) mb_nm = mb_nm.swapaxes(0, 1).reshape(mb_nm.shape[0] * mb_nm.shape[1], *mb_nm.shape[2:]) mb_nm_xy = np.asarray(mb_nm_xy, dtype=self.neural_map_xy.dtype) mb_nm_xy = mb_nm_xy.swapaxes(0, 1).reshape( mb_nm_xy.shape[0] * mb_nm_xy.shape[1], *mb_nm_xy.shape[2:]) if self.gamma > 0.0: # Discount/bootstrap off value fn # Prepare nm_xy for i in range(self.neural_map.shape[0]): if self.use_extended_write_op: self.neural_map_xy[i, 0, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] if self.pos[i, 2] == 0: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) + 1, int(self.pos[i, 0] // self.pos_x_divisor), :] elif self.pos[i, 2] == 1: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) + 1, :] elif self.pos[i, 2] == 2: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor) - 1, int(self.pos[i, 0] // self.pos_x_divisor), :] elif self.pos[i, 2] == 3: self.neural_map_xy[i, 1, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor) - 1, :] else: self.neural_map_xy[i, :] = self.neural_map[ i, int(self.pos[i, 1] // self.pos_y_divisor), int(self.pos[i, 0] // self.pos_x_divisor), :] last_values = self.model.value(self.obs, S=self.neural_map, M=self.neural_map_xy).tolist() for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_actions = mb_actions.reshape(self.batch_action_shape) mb_rewards = mb_rewards.flatten() mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() return mb_obs, mb_nm, mb_rewards, mb_nm_xy, mb_actions, mb_values, mb_pos, epinfos
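# The Neural Map variant above repeatedly converts the agent's (x, y, heading)
# position into a cell of the spatial memory, plus the cell directly ahead
# when the extended write op is enabled. A hedged sketch of that indexing
# (treating heading codes 0-3 as the row/column offsets used above is an
# assumption about their meaning):
import numpy as np

HEADING_OFFSETS = {0: (1, 0), 1: (0, 1), 2: (-1, 0), 3: (0, -1)}  # (drow, dcol)

def map_cells(pos, x_div, y_div, extended=True):
    """Return the memory cells addressed for one agent position."""
    row = int(pos[1] // y_div)
    col = int(pos[0] // x_div)
    cells = [(row, col)]
    if extended:
        dr, dc = HEADING_OFFSETS[int(pos[2])]
        cells.append((row + dr, col + dc))
    return cells

# map_cells(np.array([13.0, 7.0, 1]), x_div=4, y_div=4)
# -> [(1, 3), (1, 4)]   # current cell plus its heading-1 neighbour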
def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] mb_states = self.states for n in range(self.nsteps): actions, values, states = self.model.step( self.obs, self.states, self.dones ) # Comments by Fei: step_model (nstep = 1)! nenv, nenv, nenv * 2nlstm mb_obs.append( np.copy(self.obs) ) # Comments by Fei: finally will be nsteps * nenv * nh * nw * (nc*nstack) mb_actions.append( actions) # Comments by Fei: finally will be nsteps * nenv mb_values.append( values) # Comments by Fei: finally will be nsteps * nenv mb_dones.append(self.dones) obs, rewards, dones, _ = self.env.step( actions) # Comments by Fei: nenv * nh * nw * 1, nenv, nenv self.states = states self.dones = dones for n, done in enumerate(dones): if done: self.obs[n] = self.obs[n] * 0 self.update_obs(obs) mb_rewards.append( rewards) # Comments by Fei: finally will be nsteps * nenv mb_dones.append( self.dones) # Comments by Fei: finally will be (nsteps+1) * nenv #batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape ) # Comments by Fei: (nenv*nsteps, nh, nw, nc*nstack) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes( 1, 0) # Comments by Fei: nenv * nsteps mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes( 1, 0) # Comments by Fei: nenv * nsteps mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes( 1, 0) # Comments by Fei: nenv * nsteps mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes( 1, 0) # Comments by Fei: nenv * (nsteps+1) mb_masks = mb_dones[:, : -1] # Comments by Fei: masks is nenv * nsteps (missing the last done) mb_dones = mb_dones[:, 1:] # Comments by Fei: dones is nenv * nsteps (missing the first done) last_values = self.model.value( self.obs, self.states, self.dones).tolist( ) # Comments by Fei: step_model (nstep = 1)! nenv vector #discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate( zip(mb_rewards, mb_dones, last_values)): # Comments by Fei: nenv | nsteps, nsteps, 1 rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_rewards = mb_rewards.flatten() # Comments by Fei: nbatch vector now mb_actions = mb_actions.flatten() # Comments by Fei: nbatch vector now mb_values = mb_values.flatten() # Comments by Fei: nbatch vector now mb_masks = mb_masks.flatten() # Comments by Fei: nbatch vector now return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
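# The shape comments in the variant above document the standard A2C batching
# trick: buffers collected as (nsteps, nenv, ...) are swapped to
# (nenv, nsteps, ...) and then flattened so each environment's rollout stays
# contiguous. A tiny illustration of that ordering:
import numpy as np

nsteps, nenv = 3, 2
per_step = np.arange(nsteps * nenv).reshape(nsteps, nenv)  # rows are steps
per_env = per_step.swapaxes(1, 0)                          # rows are envs
flat = per_env.flatten()

# per_step: [[0, 1], [2, 3], [4, 5]]
# flat:     [0, 2, 4, 1, 3, 5]  -> env 0's steps first, then env 1's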
def run( self, *, # EPOpt specific - could go in __init__ but epsilon is callable paths, epsilon): """Instead of doing a trajectory of nsteps (ie, "horizon"), do a sample N "paths" and then return the bottom epsilon-percentile """ # FIXME(cpacker): currently only works with single-threading assert (self.env.num_envs == 1) # Store all N trajectories sampled then return data of bottom-epsilon # lists -> lists of lists n_mb_obs, n_mb_rewards, n_mb_actions, n_mb_values, n_mb_dones = [ [] for _ in range(paths) ], [[] for _ in range(paths)], [[] for _ in range(paths) ], [[] for _ in range(paths) ], [[] for _ in range(paths)] num_episodes = 0 mb_states = self.states for N in range(paths): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = n_mb_obs[ N], n_mb_rewards[N], n_mb_actions[N], n_mb_values[ N], n_mb_dones[N] for _ in range(self.env.venv.envs[0].spec.max_episode_steps): actions, values, states, _ = self.model.step( self.obs, self.states, self.dones) mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) mb_values.append(values) mb_dones.append(self.dones) obs, rewards, dones, _ = self.env.step(actions) self.states = states self.dones = dones for i, done in enumerate(dones): if done: self.obs[i] = self.obs[i] * 0 self.obs = obs mb_rewards.append(rewards) # We only want to do one episode if self.dones: break mb_dones.append(self.dones) # Compute the worst epsilon paths and concatenate them episode_returns = [sum(r) for r in n_mb_rewards] cutoff = np.percentile(episode_returns, 100 * epsilon) mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] for N in range(paths): #if n_mb_rewards[N] <= cutoff: if episode_returns[N] <= cutoff: # only count the episodes that are returned num_episodes += 1 # "cache" values to keep track of final ones next_obs = n_mb_obs[N] next_rewards = n_mb_rewards[N] next_actions = n_mb_actions[N] next_values = n_mb_values[N] next_dones = n_mb_dones[N] # concatenate mb_obs.extend(next_obs) mb_rewards.extend(next_rewards) mb_actions.extend(next_actions) mb_values.extend(next_values) # when constructing mb_dones, only append # next_dones[:-1] except for the last episode mb_dones.extend(next_dones[:-1]) mb_dones.append(next_dones[-1]) # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).squeeze() #mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1,0).reshape(self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) if self.discrete: mb_actions = np.asarray(mb_actions, dtype=np.int).swapaxes(1, 0) else: mb_actions = np.asarray(mb_actions, dtype=np.float32).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] # We can't just use self.obs etc, because the last of the N paths # may not be included in the update last_values = self.model.value(self.obs, self.states, self.dones).tolist() # last_values = self.model.value(next_obs[-1], n_last_states[-1], next_dones[-1]).tolist() # discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_rewards = mb_rewards.flatten() if self.discrete: mb_actions = mb_actions.reshape(mb_rewards.shape) else: 
mb_actions = mb_actions.reshape( (mb_rewards.shape[0], self.ac_space.shape[0])) mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, num_episodes
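# The EPOpt-style runner above samples `paths` full episodes and keeps only
# those whose return falls at or below the epsilon-percentile, so the policy
# is updated on its worst-case trajectories. A minimal sketch of just the
# selection step:
import numpy as np

def worst_epsilon_paths(episode_returns, epsilon):
    """Return indices of the episodes in the bottom epsilon fraction."""
    returns = np.asarray(episode_returns, dtype=np.float32)
    cutoff = np.percentile(returns, 100.0 * epsilon)
    return [i for i, ret in enumerate(returns) if ret <= cutoff]

# Example: with returns [5, 1, 3, 9] and epsilon = 0.5 the cutoff is 4.0,
# so episodes 1 and 2 (returns 1 and 3) are kept for the update.
# worst_epsilon_paths([5, 1, 3, 9], 0.5) -> [1, 2]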
def run(self): mb_obs, prev_rewards, mb_rewards, prev_actions, mb_actions, mb_values, mb_dones, mb_masks = [], [], [], [], [], [], [], [] mb_states = self.states for n in range(self.nsteps): actions, values, states, _ = self.model.step( self.obs, self.states, self.actions, self.rewards, self.dones, self.masks) mb_obs.append(np.copy(self.obs)) prev_actions.append(self.actions) mb_actions.append(actions) mb_values.append(values) prev_rewards.append(self.rewards) mb_masks.append(self.masks) mb_dones.append(self.dones) # if end_of_trial, if episode gets done in the next step, we need to reset environment parameters end_of_trial = [ self.episode_in_trial[i] == (self.episodes_per_trial - 1) for i in range(self.nenv) ] obs, rewards, dones, _ = self.env.step(actions, end_of_trial) self.actions = actions self.states = states self.dones = dones self.masks = [False for _ in range(self.nenv)] self.rewards = rewards self.obs[:] = obs mb_rewards.append(rewards) for i, done in enumerate(self.dones): if done: self.episode_in_trial[ i] += 1 # episode finished in the current step self.episode_in_trial[i] %= self.episodes_per_trial if self.episode_in_trial[i] == 0: self.masks[i] = True self.rewards[i] = 0.0 self.dones[i] = False if self.discrete: self.actions[i] = -1 else: self.actions[i] = np.zeros( (self.ac_space.shape[0]), dtype=np.float32) mb_masks.append(self.masks) # nsteps+1 records # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape( self.batch_ob_shape) prev_rewards = np.asarray(prev_rewards, dtype=np.float32).swapaxes(1, 0) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) if self.discrete: prev_actions = np.asarray(prev_actions, dtype=np.int).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.int).swapaxes(1, 0) else: prev_actions = np.asarray(prev_actions, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.float32).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = np.asarray(mb_masks, dtype=np.bool).swapaxes(1, 0) last_values = self.model.value(self.obs, self.states, self.actions, self.rewards, self.dones, self.masks).tolist() num_trials = np.sum(mb_masks[:, 1:]) # discount/bootstrap off value fn discounted_rewards = [] for n, (rewards, masks, value) in enumerate( zip(mb_rewards, mb_masks[:, 1:], last_values)): rewards = rewards.tolist() masks = masks.tolist() if masks[-1] == 0: rewards = discount_with_dones(rewards + [value], masks + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, masks, self.gamma) discounted_rewards.append(rewards) discounted_rewards = np.asarray(discounted_rewards, dtype=np.float32) prev_rewards = prev_rewards.flatten() discounted_rewards = discounted_rewards.flatten() prev_actions = prev_actions.reshape(self.batch_ac_shape) mb_actions = mb_actions.reshape(self.batch_ac_shape) mb_values = mb_values.flatten() mb_masks = mb_masks[:, :-1] mb_masks = mb_masks.flatten() mb_dones = mb_dones.flatten() return mb_obs, mb_states, discounted_rewards, prev_rewards, mb_masks, prev_actions, mb_actions, mb_values, mb_dones, num_trials
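# The meta-RL variant above groups `episodes_per_trial` episodes into a trial:
# the recurrent state, previous action and previous reward are only reset
# (mask = True) when a whole trial finishes, not at every episode boundary.
# A stripped-down sketch of that bookkeeping for a single environment:
def advance_trial(episode_in_trial, done, episodes_per_trial):
    """Return (new_episode_in_trial, trial_finished)."""
    if not done:
        return episode_in_trial, False
    episode_in_trial = (episode_in_trial + 1) % episodes_per_trial
    trial_finished = (episode_in_trial == 0)  # wrapped around -> reset state
    return episode_in_trial, trial_finished

# With episodes_per_trial = 2: the first done keeps the hidden state, the
# second done ends the trial and triggers the mask/reward/action reset.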
def run(self): # We initialize the lists that will contain the mb of experiences mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] # mb_test = [] mb_states = self.states epinfos = [] for n in range(self.nsteps): # Given observations, take action and value (V(s)) # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init obs_chunks = [] dones_chunks = [] # l = 2 * self.m * (self.m - 1) # mb_test.append(list(range(n * l, (n + 1) * l))) for i in range(self.m): obs_tmp = self.obs[self.models_indexes[i]] dones_tmp = np.array(self.dones)[self.models_indexes[i]] obs_chunks.append(obs_tmp) dones_chunks.append(dones_tmp) models_results = [] for i, model in enumerate(self.models): res = model.step(obs_chunks[i], S=[None], M=dones_chunks[i]) models_results.append(res) # models_results = self.tp.map(self.model_step, zip(self.models, obs_chunks, [None]*self.m, dones_chunks)) actions, values, states, _ = zip(*models_results) actions_to_send = np.zeros(shape=(2 * self.m * (self.m - 1))) values_to_send = np.zeros(shape=(2 * self.m * (self.m - 1))) for i in range(self.m): actions_to_send[self.models_indexes[i]] = actions[i] values_to_send[self.models_indexes[i]] = values[i] states = np.squeeze(states) # Append the experiences mb_obs.append(np.copy(self.obs)) mb_actions.append(actions_to_send) mb_values.append(values_to_send) mb_dones.append(self.dones) # Take actions in env and look the results obs, rewards, dones, infos = self.env.step(actions_to_send) result_scores = [info['score'] for info in infos[::2]] self.process_winners(result_scores) # for info in infos: # maybeepinfo = info.get('episode') # if maybeepinfo: epinfos.append(maybeepinfo) self.states = states self.dones = dones self.obs = obs mb_rewards.append(rewards) mb_dones.append(self.dones) # Batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) # mb_obs = np.concatenate(mb_obs) # TODO: ho cambiato io mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) # mb_test = np.asarray(mb_test, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] if self.gamma > 0.0: # Discount/bootstrap off value fn # obs_chunks = [] # dones_chunks = [] last_values = [] for i, model in enumerate(self.models): obs_tmp = self.obs[self.models_indexes[i]] dones_tmp = np.array(self.dones)[self.models_indexes[i]] # obs_chunks.append(obs_tmp) # dones_chunks.append(dones_tmp) last_values_tmp = model.value(obs_tmp, S=[None], M=dones_tmp).tolist() last_values.append(last_values_tmp) # last_values = self.tp.map(self.model_value, zip(self.models, obs_chunks, [None]*self.m, dones_chunks)) # actions, values, states, _ = zip(*models_results) last_values_to_send = np.zeros(shape=(2 * self.m * (self.m - 1))) for i in range(self.m): last_values_to_send[self.models_indexes[i]] = last_values[i] last_values = last_values_to_send.tolist() # last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = 
discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards mb_actions = mb_actions.reshape(self.batch_action_shape) # mb_actions = mb_actions.T.flatten() # TODO: I changed this # mb_test = mb_test.T.flatten() # TODO: I changed this mb_rewards = mb_rewards.flatten() # mb_rewards = mb_rewards.T.flatten() # TODO: I changed this mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
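# The multi-agent variant above splits the 2*m*(m-1) environment slots among
# m independent models via `models_indexes`, steps each model on its own
# slice of observations, and scatters the results back into one flat action
# vector. A hedged sketch of that gather/scatter pattern (the round-robin
# index layout in the example is an assumption, not taken from this file):
import numpy as np

def scatter_model_outputs(per_model_outputs, models_indexes, total_slots):
    """Place each model's outputs back at its environment indices."""
    flat = np.zeros(total_slots, dtype=np.float32)
    for idxs, outputs in zip(models_indexes, per_model_outputs):
        flat[idxs] = outputs
    return flat

# Example with m = 2 models and 4 slots, alternating assignment:
# scatter_model_outputs([[1, 1], [2, 2]], [[0, 2], [1, 3]], 4)
# -> array([1., 2., 1., 2.])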