Example #1
 def update(self, action_a, reward_a, state_a, done_a):
     '''
     Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
     '''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.update(action_a[(e, b)], reward_a[(e, b)], state_a[(e, b)], done_a[(e, b)])
     loss_a = self.algorithm.train()
     loss_a = util.guard_data_a(self, loss_a, 'loss')
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.loss = loss_a[(e, b)]
     explore_var_a = self.algorithm.update()
     explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
     return loss_a, explore_var_a
Example #2
 def update(self, action_a, reward_a, state_a, done_a):
     '''
     Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
     '''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.update(action_a[(e, b)], reward_a[(e, b)], state_a[(e, b)], done_a[(e, b)])
     loss_a = self.algorithm.train()
     loss_a = util.guard_data_a(self, loss_a, 'loss')
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.loss = loss_a[(e, b)]
     explore_var_a = self.algorithm.update()
     explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
     return loss_a, explore_var_a
Example #3
 def space_step(self, action_e):
     action = action_e[(0, 0)]  # single body
     if self.done:  # space envs run continually without a central reset signal
         return self.space_reset()
     if not self.is_discrete:
         action = np.array([action])
     state, reward, done, _info = self.u_env.step(action)
     if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
         # one-hot encode discrete observations before they are broadcast into state_e
         state = util.to_one_hot(state, self.u_env.observation_space.n)
     reward = guard_reward(reward)
     reward *= self.reward_scale
     if util.to_render():
         self.u_env.render()
     self.done = done = done or self.clock.get('t') > self.max_timestep
     reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for ab, body in util.ndenumerate_nonan(self.body_e):
         reward_e[ab] = reward
         state_e[ab] = state
         done_e[ab] = done
     logger.debug(
         f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}'
     )
     return reward_e, state_e, done_e
Example #4
 def act(self, state_a):
     '''Interface-level agent act method for all its bodies. Resolves each body's state from state_a, gets its action, and composes the results into action_a.'''
     data_names = ['action']
     action_a, = self.agent.agent_space.aeb_space.init_data_s(data_names, a=self.agent.a)
     for (e, b), body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[(e, b)]
         action_a[(e, b)] = self.body_act(body, state)
     return action_a
Example #5
 def update(self, action_a, reward_a, state_a, done_a):
     '''
     Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
     '''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.update(action_a[(e, b)], reward_a[(e, b)],
                            state_a[(e, b)], done_a[(e, b)])
     # TODO finer loss and explore_var per body
     loss = self.algorithm.train()
     explore_var = self.algorithm.update()
     data_names = ['loss', 'explore_var']
     loss_a, explore_var_a = self.agent_space.aeb_space.init_data_s(
         data_names, a=self.a)
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         loss_a[(e, b)] = loss
         explore_var_a[(e, b)] = explore_var
     return loss_a, explore_var_a
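Examples #1 and #2 collapse this manual per-body broadcast into a single util.guard_data_a call. The helper's actual behavior is not shown in these examples; the sketch below is only a plausible reading inferred from the loop in Example #5, and its specifics (scalar handling, NaN padding) are assumptions rather than SLM-Lab's real implementation.

import numpy as np


def guard_data_a(agent, data_a, data_name):
    '''Hypothetical sketch: broadcast a scalar (or None) returned by the algorithm into a
    per-body object array shaped like agent.body_a; existing arrays pass through unchanged.
    data_name is kept only to mirror the call sites in Examples #1 and #2.'''
    if isinstance(data_a, np.ndarray):
        return data_a  # already a per-body data array
    value = np.nan if data_a is None else data_a
    guarded = np.full(agent.body_a.shape, np.nan, dtype=object)
    for idx, body in np.ndenumerate(agent.body_a):
        if isinstance(body, float) and np.isnan(body):
            continue  # skip the NaN padding of the (e, b) grid
        guarded[idx] = value
    return guarded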
Example #6
 def act(self, state_a):
     '''Interface-level agent act method for all its bodies. Resolves each body's state from state_a, gets its action, and composes the results into action_a.'''
     data_names = ['action']
     action_a, = self.agent.agent_space.aeb_space.init_data_s(data_names, a=self.agent.a)
     for (e, b), body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[(e, b)]
         action_a[(e, b)] = self.body_act(body, state)
     return action_a
Example #7
def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb.
    @returns {dict} session_data
    '''
    session_data = {}
    for aeb, body in util.ndenumerate_nonan(session.aeb_space.body_space.data):
        session_data[aeb] = body.df.copy()
    return session_data
Example #8
 def space_reset(self):
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for ab, body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         state_e[ab] = state
         done_e[ab] = self.done = False
     if util.to_render():
         self.u_env.render()
     logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
     return _reward_e, state_e, done_e
Example #9
 def space_update(self, action_a, reward_a, state_a, done_a):
     '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
     for eb, body in util.ndenumerate_nonan(self.body_a):
         body.action_pd_update()
         body.memory.update(action_a[eb], reward_a[eb], state_a[eb],
                            done_a[eb])
     loss_a = self.algorithm.space_train()
     loss_a = util.guard_data_a(self, loss_a, 'loss')
     for eb, body in util.ndenumerate_nonan(self.body_a):
         if not np.isnan(loss_a[eb]):  # set for log_summary()
             body.loss = loss_a[eb]
     explore_var_a = self.algorithm.space_update()
     explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
     logger.debug(
         f'Agent {self.a} loss: {loss_a}, explore_var_a {explore_var_a}')
     for eb, body in util.ndenumerate_nonan(self.body_a):
         if body.env.done:
             body.epi_update()
     return loss_a, explore_var_a
Example #10
 def reset(self):
     self.done = False
     env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self.get_env_info(env_info_dict, a)
         self.check_u_agent_to_body(env_info_a, a)
         state = env_info_a.states[b]
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     return _reward_e, state_e, done_e
Example #11
 def space_act(self, state_a):
     '''Interface-level agent act method for all its bodies. Resolves each body's state from state_a, gets its action, and composes the results into action_a.'''
     data_names = ('action',)
     action_a, = self.agent.agent_space.aeb_space.init_data_s(data_names, a=self.agent.a)
     for eb, body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[eb]
         self.body = body
         action_a[eb] = self.act(state)
     # set body reference back to default
     self.body = self.agent.nanflat_body_a[0]
     return action_a
Example #12
 def reset(self):
     self.done = False
     env_info_dict = self.u_env.reset(train_mode=self.train_mode,
                                      config=self.spec.get('unity'))
     _reward_e, state_e, _done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self.get_env_info(env_info_dict, a)
         self.check_u_agent_to_body(env_info_a, a)
         state_e[(a, b)] = env_info_a.states[b]
     return _reward_e, state_e, _done_e
Example #13
 def space_reset(self):
     self._check_u_brain_to_agent()
     self.done = False
     env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self._get_env_info(env_info_dict, a)
         self._check_u_agent_to_body(env_info_a, a)
         state = env_info_a.states[b]
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
     return _reward_e, state_e, done_e
Example #14
def get_session_data(session, body_df_kind='eval', tmp_space_session_sub=False):
    '''
    Gather data from all the bodies in the session.
    Depending on body_df_kind, uses eval_df or train_df.
    '''
    session_data = {}
    for aeb, body in util.ndenumerate_nonan(session.aeb_space.body_space.data):
        aeb_df = body.eval_df if body_df_kind == 'eval' else body.train_df
        # TODO tmp substitution since SpaceSession does not have run_eval_episode yet
        if tmp_space_session_sub:
            aeb_df = body.train_df
        session_data[aeb] = aeb_df.copy()
    return session_data
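A brief usage sketch for the function above; the session object and its populated aeb_space.body_space, along with the per-body eval_df/train_df frames, are taken from the surrounding examples and otherwise assumed, not verified against any API.

# assuming `session` comes from a finished lab run
session_data = get_session_data(session, body_df_kind='eval', tmp_space_session_sub=False)
for aeb, df in session_data.items():
    print(aeb, df.shape)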
Example #15
 def reset(self):
     self.done = False
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     # TODO internalize render code
     if util.get_lab_mode() == 'dev':
         self.u_env.render()
     non_nan_cnt = util.count_nonan(state_e.flatten())
     assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
     return _reward_e, state_e, done_e
Example #16
 def reset(self):
     self.done = False
     _reward_e, state_e, _done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         state_e[(a, b)] = state
     # TODO internalize render code
     if not self.train_mode:
         self.u_env.render()
     non_nan_cnt = util.count_nonan(state_e.flatten())
     assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
     return _reward_e, state_e, _done_e
Example #17
 def step(self, action_e):
     # TODO implement clock_speed: step only if self.clock.to_step()
     if self.done:
         return self.reset()
     action_e = util.nanflatten(action_e)
     env_info_dict = self.u_env.step(action_e)
     reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self.get_env_info(env_info_dict, a)
         reward_e[(a, b)] = env_info_a.rewards[b]
         state_e[(a, b)] = env_info_a.states[b]
         done_e[(a, b)] = env_info_a.local_done[b]
     self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
     return reward_e, state_e, done_e
Example #18
 def space_act(self, state_a):
     '''Non-atomizable act that overrides agent.act(): do a single pass on the entire state_a instead of composing act() via iteration.'''
     # gather and flatten
     states = []
     for eb, body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[eb]
         if self.normalize_state:
             state = policy_util.update_online_stats_and_normalize_state(body, state)
         states.append(state)
     xs = [torch.from_numpy(state).float() for state in states]
     pdparam = self.calc_pdparam(xs, evaluate=False)
     # use multi-policy. note arg change
     action_a, action_pd_a = self.action_policy(states, self, self.agent.nanflat_body_a, pdparam)
     for idx, body in enumerate(self.agent.nanflat_body_a):
         body.action_tensor, body.action_pd = action_a[idx], action_pd_a[idx]  # used for body.action_pd_update later
     return action_a.cpu().numpy()
Example #19
 def reset(self):
     self.done = False
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     if util.get_lab_mode() == 'dev':
         self.u_env.render()
     non_nan_cnt = util.count_nonan(state_e.flatten())
     assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
     logger.debug(
         f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}'
     )
     return _reward_e, state_e, done_e
Example #20
 def update(self, action_a, reward_a, state_a, done_a):
     '''
     Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
     '''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.update(action_a[(e, b)], reward_a[(e, b)],
                            state_a[(e, b)], done_a[(e, b)])
         if self.len_state_buffer > 0:
             if len(body.state_buffer) == self.len_state_buffer:
                 del body.state_buffer[0]
             body.state_buffer.append(state_a[(e, b)])
     loss_a = self.algorithm.train()
     loss_a = util.guard_data_a(self, loss_a, 'loss')
     explore_var_a = self.algorithm.update()
     explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var')
     return loss_a, explore_var_a
Example #21
 def space_reset(self):
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for ab, body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
             # one-hot encode discrete observations before they are broadcast into state_e
             state = util.to_one_hot(state, self.u_env.observation_space.n)
         state_e[ab] = state
         done_e[ab] = self.done = False
     if util.to_render():
         self.u_env.render()
     logger.debug(
         f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}'
     )
     return _reward_e, state_e, done_e
Example #22
 def step(self, action_e):
     assert len(action_e) == 1, 'OpenAI Gym supports only single body'
     # TODO implement clock_speed: step only if self.clock.to_step()
     if self.done:  # t will actually be 0
         return self.reset()
     action = action_e[(0, 0)]
     (state, reward, done, _info) = self.u_env.step(action)
     if util.get_lab_mode() == 'dev':
         self.u_env.render()
     reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         reward_e[(a, b)] = reward
         state_e[(a, b)] = state
         done_e[(a, b)] = done
     self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
     return reward_e, state_e, done_e
Example #23
 def act(self, state_a):
     '''Non-atomizable act that overrides agent.act(): do a single pass on the entire state_a instead of composing body_act per body.'''
     # gather and flatten
     states = []
     for (e, b), body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[(e, b)]
         states.append(state)
     state = torch.tensor(states).view(-1).unsqueeze_(0).float()
     if torch.cuda.is_available() and self.net.gpu:
         state = state.cuda()
     pdparam = self.calc_pdparam(state, evaluate=False)
     # use multi-policy. note arg change
     action_a, action_pd_a = self.action_policy(pdparam, self, self.body_list)
     for idx, body in enumerate(self.body_list):
         action_pd = action_pd_a[idx]
         body.entropies.append(action_pd.entropy())
         body.log_probs.append(action_pd.log_prob(action_a[idx].float()))
     return action_a.cpu().numpy()
Example #24
 def space_step(self, action_e):
     # TODO implement clock_speed: step only if self.clock.to_step()
     if self.done:
         return self.space_reset()
     action_e = util.nanflatten(action_e)
     env_info_dict = self.u_env.step(action_e)
     reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(
         ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self._get_env_info(env_info_dict, a)
         reward_e[(a, b)] = env_info_a.rewards[b] * self.reward_scale
         state_e[(a, b)] = env_info_a.states[b]
         done_e[(a, b)] = env_info_a.local_done[b]
     self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t)
     logger.debug(
         f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}'
     )
     return reward_e, state_e, done_e
Example #25
 def space_act(self, state_a):
     '''Non-atomizable act that overrides agent.act(): do a single pass on the entire state_a instead of composing act() via iteration.'''
     # gather and flatten
     states = []
     for eb, body in util.ndenumerate_nonan(self.agent.body_a):
         state = state_a[eb]
         if self.normalize_state:
             state = policy_util.update_online_stats_and_normalize_state(
                 body, state)
         states.append(state)
     state = torch.tensor(
         states, device=self.net.device).view(-1).unsqueeze_(0).float()
     pdparam = self.calc_pdparam(state, evaluate=False)
     # use multi-policy. note arg change
     action_a, action_pd_a = self.action_policy(states, self,
                                                self.agent.nanflat_body_a,
                                                pdparam)
     for idx, body in enumerate(self.agent.nanflat_body_a):
         action_pd = action_pd_a[idx]
         body.entropies.append(action_pd.entropy())
         body.log_probs.append(action_pd.log_prob(action_a[idx].float()))
         assert not torch.isnan(body.log_probs[-1])
     return action_a.cpu().numpy()
Example #26
def test_ndenumerate_nonan():
    arr = np.full((2, 3), np.nan, dtype=object)
    np.fill_diagonal(arr, 1)
    for (a, b), body in util.ndenumerate_nonan(arr):
        assert a == b
        assert body == 1
Example #27
def test_ndenumerate_nonan():
    arr = np.full((2, 3), np.nan, dtype=object)
    np.fill_diagonal(arr, 1)
    for (a, b), body in util.ndenumerate_nonan(arr):
        assert a == b
        assert body == 1
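The two identical tests above (Examples #26 and #27) pin down the contract of util.ndenumerate_nonan: iterate an object ndarray and yield (index, value) pairs only for non-NaN entries. Below is a minimal reference sketch consistent with those tests; SLM-Lab's actual helper may differ in detail.

import numpy as np


def ndenumerate_nonan(arr):
    '''Yield (index, value) for every non-NaN element of an object ndarray.'''
    for idx, val in np.ndenumerate(arr):
        # object arrays mix float NaN padding with arbitrary payloads (e.g. body objects)
        if isinstance(val, float) and np.isnan(val):
            continue
        yield idx, val

Running the test body above against this sketch yields only the diagonal entries, which satisfies both assertions.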
Example #28
 def space_reset(self, state_a):
     '''Do agent reset per session, such as memory pointer'''
     logger.debug(f'Agent {self.a} reset')
     for eb, body in util.ndenumerate_nonan(self.body_a):
         body.memory.epi_reset(state_a[eb])
Example #29
 def reset(self, state_a):
     '''Do agent reset per session, such as memory pointer'''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.epi_reset(state_a[(e, b)])
Example #30
 def reset(self, state_a):
     '''Do agent reset per session, such as memory pointer'''
     for (e, b), body in util.ndenumerate_nonan(self.body_a):
         body.memory.epi_reset(state_a[(e, b)])