def space_step(self, action_e):
    action = action_e[(0, 0)]  # single body
    if self.done:  # space envs run continually without a central reset signal
        return self.space_reset()
    if not self.is_discrete:
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    # one-hot encode discrete states before writing into state_e, otherwise the encoding is lost
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        reward_e[ab] = reward
        state_e[ab] = state
        done_e[ab] = done
    logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
    return reward_e, state_e, done_e
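# guard_reward is called above but not defined in this section. A minimal
# sketch of what it plausibly does (an assumption, not the library's verbatim
# code): coerce rewards that some gym envs return as one-element arrays or
# lists into plain scalars, so the reward_scale multiplication behaves
# predictably downstream.
import numpy as np

def guard_reward(reward):
    '''Coerce a possibly array-like reward into a plain scalar.'''
    if np.isscalar(reward):
        return reward
    return np.asarray(reward).item()  # raises if reward has more than one element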
def sample(self):
    '''
    Samples a batch from memory.
    Note that multitask's bodies are parallelized copies with similar envs, just to gather more batch data
    '''
    batches = []
    for body in self.agent.nanflat_body_a:
        body_batch = body.memory.sample()
        # one-hot actions to calc q_targets
        if body.is_discrete:
            body_batch['actions'] = util.to_one_hot(body_batch['actions'], body.action_space.high)
        body_batch = util.to_torch_batch(body_batch, self.net.gpu)
        batches.append(body_batch)
    # Concat states at dim=1 for feedforward
    batch = {
        'states': torch.cat([body_batch['states'] for body_batch in batches], dim=1),
        'next_states': torch.cat([body_batch['next_states'] for body_batch in batches], dim=1),
    }
    # retain body-batches for body-wise q_targets calc
    batch['body_batches'] = batches
    return batch
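# Illustration of the dim=1 concat above: each body contributes a
# (batch_size, state_dim) tensor, and concatenation along dim=1 yields one
# wide (batch_size, sum_of_state_dims) input for a single feedforward net.
# The shapes here are made up for the example.
import torch

body_states = [torch.zeros(32, 4), torch.zeros(32, 6)]  # two bodies, same batch size
states = torch.cat(body_states, dim=1)
assert states.shape == (32, 10)  # one row per sample, per-body features side by side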
def sample(self):
    '''Samples a batch from memory of size self.memory_spec['batch_size']'''
    batch = self.body.memory.sample()
    # one-hot actions to calc q_targets
    if self.body.is_discrete:
        batch['actions'] = util.to_one_hot(batch['actions'], self.body.action_space.high)
    if self.normalize_state:
        batch = policy_util.normalize_states_and_next_states(self.body, batch)
    batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic)
    return batch
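# util.to_one_hot is used throughout these samplers to expand discrete action
# indices (and discrete states) into one-hot vectors for the q_targets
# arithmetic. A minimal sketch of the assumed behavior, not necessarily the
# library's exact code; note the calls above pass body.action_space.high as
# the depth, implying the discrete space exposes its cardinality there.
import numpy as np

def to_one_hot(data, depth):
    '''Convert an int, or a list/array of ints, into one-hot row vectors.'''
    return np.eye(depth)[np.asarray(data, dtype=int)]

# e.g. actions [0, 2, 1] in a 3-action space:
# to_one_hot([0, 2, 1], 3) -> [[1,0,0], [0,0,1], [0,1,0]]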
def sample(self):
    '''Samples a batch from memory of size self.memory_spec['batch_size']'''
    batches = []
    for body in self.agent.nanflat_body_a:
        body_batch = body.memory.sample()
        # one-hot actions to calc q_targets
        if body.is_discrete:
            body_batch['actions'] = util.to_one_hot(body_batch['actions'], body.action_space.high)
        batches.append(body_batch)
    batch = util.concat_batches(batches)
    batch = util.to_torch_batch(batch, self.net.gpu)
    return batch
def reset(self):
    _reward = np.nan
    state = self.u_env.reset()
    self.done = done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}')
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    return _reward, state, done
def sample(self):
    '''Samples a batch from memory'''
    batches = []
    for body in self.agent.nanflat_body_a:
        body_batch = body.memory.sample()
        # one-hot actions to calc q_targets
        if body.is_discrete:
            body_batch['one_hot_actions'] = util.to_one_hot(body_batch['actions'], body.action_space.high)
        batches.append(body_batch)
    batch = util.concat_batches(batches)
    # this is safe for next_action at done since the calculated act_next_q_preds will be multiplied by (1 - batch['dones'])
    batch['next_actions'] = np.zeros_like(batch['actions'])
    batch['next_actions'][:-1] = batch['actions'][1:]
    batch = util.to_torch_batch(batch, self.net.gpu)
    return batch
def sample(self):
    '''Samples a batch from memory'''
    batch = self.body.memory.sample()
    # one-hot actions to calc q_targets
    if self.body.is_discrete:
        batch['one_hot_actions'] = util.to_one_hot(batch['actions'], self.body.action_space.high)
    # this is safe for next_action at done since the calculated act_next_q_preds will be multiplied by (1 - batch['dones'])
    batch['next_actions'] = np.zeros_like(batch['actions'])
    batch['next_actions'][:-1] = batch['actions'][1:]
    if self.normalize_state:
        batch = policy_util.normalize_states_and_next_states(self.body, batch)
    batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic)
    return batch
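# Demonstration of the next_actions shift above: actions[t+1] becomes
# next_actions[t], and the final slot stays zero. At a terminal step the zero
# 'next action' is harmless because act_next_q_preds is later multiplied by
# (1 - dones), zeroing the bootstrap term anyway. Values are illustrative.
import numpy as np

actions = np.array([[1, 0], [0, 1], [1, 0]])  # three one-hot actions
next_actions = np.zeros_like(actions)
next_actions[:-1] = actions[1:]
# next_actions == [[0, 1], [1, 0], [0, 0]]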
def space_reset(self):
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        # one-hot encode discrete states before writing into state_e, otherwise the encoding is lost
        if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
            state = util.to_one_hot(state, self.u_env.observation_space.n)
        state_e[ab] = state
        done_e[ab] = self.done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e
def step(self, action):
    if not self.is_discrete:  # guard for continuous
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    return reward, state, done
def sample(self):
    '''Samples a batch per body, since each body may experience a different environment'''
    batches = []
    for body in self.agent.nanflat_body_a:
        body_batch = body.memory.sample()
        # one-hot actions to calc q_targets
        if body.is_discrete:
            body_batch['actions'] = util.to_one_hot(body_batch['actions'], body.action_space.high)
        body_batch = util.to_torch_batch(body_batch, self.net.gpu)
        batches.append(body_batch)
    # collect per body for feedforward to hydra heads
    batch = {
        'states': [body_batch['states'] for body_batch in batches],
        'next_states': [body_batch['next_states'] for body_batch in batches],
    }
    # retain body-batches for body-wise q_targets calc
    batch['body_batches'] = batches
    return batch
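# Contrast with the multitask sampler above: the hydra variant keeps per-body
# batches as a list (one entry per network head) instead of concatenating them
# into one wide tensor, so each head sees only its own body's state
# dimensions. Shapes are illustrative.
import torch

hydra_states = [torch.zeros(32, 4), torch.zeros(32, 6)]  # one tensor per head
assert [s.shape[1] for s in hydra_states] == [4, 6]  # heads keep distinct state dims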