Example #1
 def train(self):
     '''Trains the algorithm'''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.share_architecture:
         return self.train_shared()
     else:
         return self.train_separate()
Example #2
def epsilon_greedy(state, algorithm, body):
    '''Epsilon-greedy policy: with probability epsilon, do random action, otherwise do default sampling.'''
    if util.get_lab_mode() == 'enjoy':
        return default(state, algorithm, body)
    epsilon = body.explore_var
    if epsilon > np.random.rand():
        return random(state, algorithm, body)
    else:
        return default(state, algorithm, body)
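For reference, the same rule outside SLM-Lab reduces to a few lines; below is a minimal standalone sketch over a vector of Q-values (plain NumPy; the function name and arguments are illustrative, not part of the library):

import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    '''With probability epsilon take a random action, otherwise the greedy (argmax) one.'''
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))  # explore
    return int(np.argmax(q_values))  # exploit

# usage: action = epsilon_greedy_action(np.array([0.1, 0.5, 0.2]), epsilon=0.1)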
Example #3
def boltzmann(state, algorithm, body):
    '''
    Boltzmann policy: adjust pdparam with temperature tau; the higher the more randomness/noise in action.
    '''
    if util.get_lab_mode() == 'enjoy':
        return default(state, algorithm, body)
    tau = body.explore_var
    ActionPD, pdparam, body = init_action_pd(state, algorithm, body)
    pdparam /= tau
    action, action_pd = sample_action_pd(ActionPD, pdparam, body)
    return action, action_pd
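The temperature scaling above amounts to sampling from softmax(pdparam / tau); here is a minimal sketch with PyTorch (illustrative names, independent of init_action_pd and sample_action_pd):

import torch

def boltzmann_sample(logits, tau):
    '''Sample an action index from softmax(logits / tau); higher tau gives a flatter, noisier distribution.'''
    probs = torch.softmax(logits / tau, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

# usage: action = boltzmann_sample(torch.tensor([1.0, 2.0, 0.5]), tau=1.5)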
Example #4
 def post_init_nets(self):
     '''
     Method to conditionally load models.
     Call at the end of init_net() after setting self.net_names
     '''
     assert hasattr(self, 'net_names')
     if util.get_lab_mode() == 'enjoy':
         logger.info('Loaded algorithm models for lab_mode: enjoy')
         self.load()
     else:
         logger.info(f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}')
Example #5
 def reset(self):
     self.done = False
     env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self.get_env_info(env_info_dict, a)
         self.check_u_agent_to_body(env_info_a, a)
         state = env_info_a.states[b]
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     return _reward_e, state_e, done_e
Example #6
 def log_summary(self, df_mode):
     '''
     Log the summary for this body when its environment is done
     @param str:df_mode 'train' or 'eval'
     '''
     prefix = self.get_log_prefix()
     df = getattr(self, f'{df_mode}_df')
     last_row = df.iloc[-1]
     row_str = '  '.join([f'{k}: {v:g}' for k, v in last_row.items()])
     msg = f'{prefix} [{df_mode}_df] {row_str}'
     logger.info(msg)
     if util.get_lab_mode() == 'dev' and df_mode == 'train':  # log tensorboard only on dev mode and train df data
         self.log_tensorboard()
Example #7
 def space_reset(self):
     self._check_u_brain_to_agent()
     self.done = False
     env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         env_info_a = self._get_env_info(env_info_dict, a)
         self._check_u_agent_to_body(env_info_a, a)
         state = env_info_a.states[b]
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
     return _reward_e, state_e, done_e
Example #8
 def reset(self):
     _reward = np.nan
     env_info_dict = self.u_env.reset(
         train_mode=(util.get_lab_mode() != 'dev'),
         config=self.env_spec.get('unity'))
     a, b = 0, 0  # default singleton aeb
     env_info_a = self._get_env_info(env_info_dict, a)
     state = env_info_a.states[b]
     self.done = done = False
     logger.debug(
         f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}'
     )
     return _reward, state, done
Example #9
 def run(self):
     while self.env.clock.get(self.env.max_tick_unit) < self.env.max_tick:
         self.run_episode()
         if util.get_lab_mode() not in ('enjoy', 'eval') and analysis.all_solved(self.agent):
             logger.info('All environments solved. Early exit.')
             break
     if self.eval_proc is not None:  # wait for final eval before closing
         util.run_cmd_wait(self.eval_proc)
     self.data = analysis.analyze_session(self)  # session fitness
     self.close()
     return self.data
Example #10
 def reset(self):
     self.done = False
     _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         state = self.u_env.reset()
         state_e[(a, b)] = state
         done_e[(a, b)] = self.done
     # TODO internalize render code
     if util.get_lab_mode() == 'dev':
         self.u_env.render()
     non_nan_cnt = util.count_nonan(state_e.flatten())
     assert non_nan_cnt == 1, 'OpenAI Gym supports only single body'
     return _reward_e, state_e, done_e
Example #11
def analyze_trial(trial_spec, session_metrics_list):
    '''Analyze trial and save data, then return metrics'''
    info_prepath = trial_spec['meta']['info_prepath']
    # calculate metrics
    trial_metrics = calc_trial_metrics(session_metrics_list, info_prepath)
    # plot graphs
    viz.plot_trial(trial_spec, trial_metrics)
    # zip files
    if util.get_lab_mode() == 'train':
        predir, _, _, _, _, _ = util.prepath_split(info_prepath)
        shutil.make_archive(predir, 'zip', predir)
        logger.info(f'All trial data zipped to {predir}.zip')
    return trial_metrics
Example #12
 def post_init_nets(self):
     '''
     Method to conditionally load models.
     Call at the end of init_net() after setting self.net_names
     '''
     assert hasattr(self, 'net_names')
     if util.get_lab_mode() == 'enjoy':
         logger.info('Loaded algorithm models for lab_mode: enjoy')
         self.load()
     else:
         logger.info(
             f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}'
         )
Example #13
def gather_aeb_rewards_df(aeb, session_datas, max_tick_unit):
    '''Gather rewards from each session for a body into a df'''
    aeb_session_rewards = {}
    for s, session_data in session_datas.items():
        aeb_df = session_data[aeb]
        aeb_reward_sr = aeb_df['reward']
        aeb_reward_sr.index = aeb_df[max_tick_unit]
        if util.get_lab_mode() in ('enjoy', 'eval'):
            # guard against eval data being appended out of order
            aeb_reward_sr.sort_index(inplace=True)
        aeb_session_rewards[s] = aeb_reward_sr
    aeb_rewards_df = pd.DataFrame(aeb_session_rewards)
    return aeb_rewards_df
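The final step above relies on pandas aligning the per-session reward Series on their index when the DataFrame is built; a tiny illustration with made-up values:

import pandas as pd

s0 = pd.Series([1.0, 2.0], index=[100, 200])  # session 0 rewards, indexed by tick
s1 = pd.Series([1.5, 2.5], index=[100, 200])  # session 1 rewards, indexed by tick
rewards_df = pd.DataFrame({0: s0, 1: s1})  # columns are sessions, index is the tick unit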
Example #14
 def train(self):
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.to_train == 1:
         batch = self.sample()
         loss = self.calc_policy_loss(batch)
         self.net.training_step(loss=loss)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Policy loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #15
 def run(self):
     num_cpus = ps.get(self.spec['meta'], 'resources.num_cpus', util.NUM_CPUS)
     info_spaces = []
     for _s in range(self.spec['meta']['max_session']):
         self.info_space.tick('session')
         info_spaces.append(deepcopy(self.info_space))
     if util.get_lab_mode() == 'train' and len(info_spaces) > 1:
         session_datas = util.parallelize_fn(self.init_session_and_run, info_spaces, num_cpus)
     else:  # don't parallelize when debugging, to allow render
         session_datas = [self.init_session_and_run(info_space) for info_space in info_spaces]
     self.session_data_dict = {data.index[0]: data for data in session_datas}
     self.data = analysis.analyze_trial(self)
     self.close()
     return self.data
Example #16
    def try_ckpt(self, agent, env):
        '''Try to checkpoint the agent and run_online_eval at the start, every save_frequency ticks, and at the end'''
        clock = env.clock
        tick = clock.get(env.max_tick_unit)
        to_ckpt = False
        if util.get_lab_mode() not in ('enjoy',
                                       'eval') and tick <= env.max_tick:
            to_ckpt = (tick % env.save_frequency == 0) or tick == env.max_tick
        if env.max_tick_unit == 'epi':  # extra condition for epi
            to_ckpt = to_ckpt and env.done

        if to_ckpt:
            if analysis.new_best(agent):
                agent.save(ckpt='best')
            # run online eval for train mode
            if util.get_lab_mode() == 'train' and self.spec['meta'].get(
                    'training_eval', False):
                ckpt = f'epi{clock.epi}-totalt{clock.total_t}'
                agent.save(ckpt=ckpt)
                # set reference to eval process for handling
                self.eval_proc = analysis.run_online_eval(
                    self.spec, self.info_space, ckpt)
            if tick > 0:  # nothing to analyze at start
                analysis.analyze_session(self)
Example #17
 def step(self, action_e):
     assert len(action_e) == 1, 'OpenAI Gym supports only single body'
     # TODO implement clock_speed: step only if self.clock.to_step()
     if self.done:  # t will actually be 0
         return self.reset()
     action = action_e[(0, 0)]
     (state, reward, done, _info) = self.u_env.step(action)
     if util.get_lab_mode() == 'dev':
         self.u_env.render()
     reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
     for (a, b), body in util.ndenumerate_nonan(self.body_e):
         reward_e[(a, b)] = reward
         state_e[(a, b)] = state
         done_e[(a, b)] = done
     self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
     return reward_e, state_e, done_e
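The dev-mode render pattern above follows the classic (pre-0.26) gym loop with a 4-tuple step return; a minimal sketch with a random policy standing in for the agent (CartPole-v1 is just an example environment):

import gym

env = gym.make('CartPole-v1')
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # stand-in for the agent's action
    state, reward, done, _info = env.step(action)  # classic 4-tuple step API
    env.render()  # only when running interactively, e.g. in dev lab_mode
env.close()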
Example #18
def session_data_dict_from_file(predir, trial_index):
    '''Build trial.session_data_dict from file'''
    ckpt_str = 'ckpt-eval' if util.get_lab_mode() in ('enjoy', 'eval') else ''
    session_data_dict = {}
    for filename in os.listdir(predir):
        if f'_t{trial_index}_' in filename and filename.endswith(
                f'{ckpt_str}_session_fitness_df.csv'):
            filepath = f'{predir}/{filename}'
            fitness_df = util.read(filepath,
                                   header=[0, 1, 2, 3],
                                   index_col=0,
                                   dtype=np.float32)
            util.fix_multi_index_dtype(fitness_df)
            session_index = fitness_df.index[0]
            session_data_dict[session_index] = fitness_df
    return session_data_dict
Example #19
    def space_train(self):
        '''
        Completes one training step for the agent if it is time to train.
        i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
        For each batch, the target Q values (q_targets) are computed and a training step is taken k times.
        Otherwise this function does nothing.
        '''
        if util.get_lab_mode() == 'enjoy':
            return np.nan
        total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
        self.to_train = (total_t > self.training_min_timestep
                         and total_t % self.training_frequency == 0)
        is_per = util.get_class_name(
            self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
        if self.to_train == 1:
            total_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                batch = self.space_sample()
                for _ in range(self.training_batch_epoch):
                    with torch.no_grad():
                        q_targets = self.calc_q_targets(batch)
                        if is_per:
                            q_preds = self.net.wrap_eval(batch['states'])
                            errors = torch.abs(q_targets - q_preds)
                            errors = errors.sum(dim=1).unsqueeze_(dim=1)
                            for body in self.agent.nanflat_body_a:
                                body.memory.update_priorities(errors)
                    loss = self.net.training_step(
                        batch['states'],
                        q_targets,
                        global_net=self.global_nets.get('net'))
                    total_loss += loss
            loss = total_loss / (self.training_epoch *
                                 self.training_batch_epoch)
            # reset
            self.to_train = 0
            for body in self.agent.nanflat_body_a:
                body.entropies = []
                body.log_probs = []
            logger.debug(
                f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
Example #20
def analyze_trial(trial_spec, session_metrics_list):
    '''Analyze trial and save data, then return metrics'''
    info_prepath = trial_spec['meta']['info_prepath']
    # calculate metrics
    trial_metrics = calc_trial_metrics(session_metrics_list, info_prepath)
    # plot graphs
    viz.plot_trial(trial_spec, trial_metrics)
    viz.plot_trial(trial_spec, trial_metrics, ma=True)
    # manually shut down orca server to avoid zombie processes
    viz.pio.orca.shutdown_server()
    # zip files
    if util.get_lab_mode() == 'train':
        predir, _, _, _, _, _ = util.prepath_split(info_prepath)
        zipdir = util.smart_path(predir)
        shutil.make_archive(zipdir, 'zip', zipdir)
        logger.info(f'All trial data zipped to {predir}.zip')
    return trial_metrics
Example #21
    def end_init_nets(self):
        '''Checkers and conditional loaders called at the end of init_nets()'''
        # check all nets naming
        assert hasattr(self, 'net_names')
        for net_name in self.net_names:
            assert net_name.endswith(
                'net'
            ), f'Naming convention: net_name must end with "net"; got {net_name}'

        # load algorithm if is in train@ resume or enjoy mode
        lab_mode = util.get_lab_mode()
        if self.agent.spec['meta']['resume'] or lab_mode == 'enjoy':
            self.load()
            logger.info(f'Loaded algorithm models for lab_mode: {lab_mode}')
        else:
            logger.info(
                f'Initialized algorithm models for lab_mode: {lab_mode}')
Example #22
 def run_sessions(self):
     logger.info('Running sessions')
     if util.get_lab_mode() in (
             'train', 'eval') and self.spec['meta']['max_session'] > 1:
         # when training a single spec over multiple sessions
         session_datas = self.parallelize_sessions()
     else:
         session_datas = []
         for _s in range(self.spec['meta']['max_session']):
             self.info_space.tick('session')
             session = self.SessionClass(deepcopy(self.spec),
                                         deepcopy(self.info_space))
             session_data = session.run()
             session_datas.append(session_data)
             if analysis.is_unfit(session_data, session):
                 break
     return session_datas
Example #23
def multi_boltzmann(pdparam, algorithm, body_list):
    '''Apply Boltzmann policy body-wise'''
    # pdparam.squeeze_(dim=0)
    assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}'
    if util.get_lab_mode() == 'enjoy':
        return multi_default(pdparam, algorithm, body_list)
    action_list, action_pd_a = [], []
    for idx, sub_pdparam in enumerate(pdparam):
        body = body_list[idx]
        tau = body.explore_var
        sub_pdparam /= tau
        ActionPD = getattr(distributions, body.action_pdtype)
        action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body)
        action_list.append(action)
        action_pd_a.append(action_pd)
    action_a = torch.tensor(action_list).unsqueeze_(dim=1)
    return action_a, action_pd_a
Example #24
 def post_init_nets(self):
     '''
     Method to conditionally load models.
     Call at the end of init_nets() after setting self.net_names
     '''
     assert hasattr(self, 'net_names')
     if not ps.is_empty(self.global_nets):
         assert all(
             k in self.net_names for k in self.global_nets
         ), f'Provided global_nets keys: {list(self.global_nets.keys())} are inconsistent with self.net_names: {self.net_names}'
     if util.get_lab_mode() == 'enjoy':
         logger.info('Loaded algorithm models for lab_mode: enjoy')
         self.load()
     else:
         logger.info(
             f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}'
         )
Example #25
def multi_epsilon_greedy(pdparam, algorithm, body_list):
    '''Apply epsilon-greedy policy body-wise'''
    assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}'
    if util.get_lab_mode() == 'enjoy':
        return multi_default(pdparam, algorithm, body_list)
    action_list, action_pd_a = [], []
    for idx, sub_pdparam in enumerate(pdparam):
        body = body_list[idx]
        epsilon = body.explore_var
        if epsilon > np.random.rand():
            action, action_pd = random(None, algorithm, body)
        else:
            ActionPD = getattr(distributions, body.action_pdtype)
            action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body)
        action_list.append(action)
        action_pd_a.append(action_pd)
    action_a = torch.tensor(action_list).unsqueeze_(dim=1)
    return action_a, action_pd_a
Example #26
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.get_lab_mode() in ('enjoy', 'eval'):
        # load specific model in eval mode
        prepath = agent.info_space.eval_model_prepath
    else:
        prepath = util.get_prepath(agent.spec,
                                   agent.info_space,
                                   unit='session')
    logger.info(
        f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names}')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{prepath}_{net_name}_model.pth'
        load(net, model_path)
        optim_path = f'{prepath}_{net_name}_optim.pth'
        load(net.optim, optim_path)
Example #27
File: sarsa.py Project: vhcg77/SLM-Lab
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.to_train == 1:
         batch = self.sample()
         with torch.no_grad():
             q_targets = self.calc_q_targets(batch)
         loss = self.net.training_step(batch['states'], q_targets)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #28
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    model_prepath = agent.spec['meta']['model_prepath']
    if util.get_lab_mode() == 'enjoy':
        model_prepath += '_ckpt-best'
    logger.info(
        f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt'
    )
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{model_prepath}_{net_name}_model.pt'
        load(net, model_path)
        optim_name = net_name.replace('net', 'optim')
        optim = getattr(algorithm, optim_name, None)
        if optim is not None:  # only trainable net has optim
            optim_path = f'{model_prepath}_{net_name}_optim.pt'
            load(optim, optim_path)
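The load(net, model_path) helper is not shown here; in PyTorch it is typically a thin wrapper around load_state_dict. A plausible sketch, not necessarily SLM-Lab's exact implementation:

import torch

def load(net, model_path):
    '''Restore a module's (or optimizer's) parameters from a saved state_dict file.'''
    state_dict = torch.load(model_path, map_location='cpu')
    net.load_state_dict(state_dict)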
Example #29
def save_session_df(session_data, prepath, info_space):
    '''Save session_df; if in eval mode, modify it and save in append mode'''
    filepath = f'{prepath}_session_df.csv'
    if util.get_lab_mode() in ('enjoy', 'eval'):
        ckpt = util.find_ckpt(info_space.eval_model_prepath)
        epi = int(re.search(r'epi(\d+)', ckpt)[1])
        totalt = int(re.search(r'totalt(\d+)', ckpt)[1])
        session_df = pd.concat(session_data, axis=1)
        eval_session_df = pd.DataFrame(data=[session_df.mean()])
        for aeb in util.get_df_aeb_list(eval_session_df):
            eval_session_df.loc[:, aeb + ('epi', )] = epi
            eval_session_df.loc[:, aeb + ('total_t', )] = totalt
        # if eval, save with append mode
        header = not os.path.exists(filepath)
        with open(filepath, 'a') as f:
            eval_session_df.to_csv(f, header=header)
    else:
        session_df = pd.concat(session_data, axis=1)
        util.write(session_df, filepath)
Example #30
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.to_train == 1:
         batch = self.sample()
         with torch.no_grad():
             q_targets = self.calc_q_targets(batch)
         loss = self.net.training_step(batch['states'], q_targets)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #31
    def train(self):
        if util.get_lab_mode() == 'enjoy':
            return np.nan
        if self.to_train == 1:
            batch = self.sample()
            loss = self.calc_policy_loss(batch)
            self.net.training_step(loss=loss,
                                   global_net=self.global_nets.get('net'))
            # reset
            self.to_train = 0
            self.body.entropies = []
            self.body.log_probs = []
            logger.debug(
                f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
Example #32
    def train(self):
        if util.get_lab_mode() in ('enjoy', 'eval'):
            self.body.flush()
            return np.nan
        clock = self.body.env.clock
        if self.to_train == 1:
            batch = self.sample()
            loss = self.calc_policy_loss(batch)
            self.net.training_step(loss=loss, lr_clock=clock)
            # reset
            self.to_train = 0
            self.body.flush()
            logger.debug(
                f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
Example #33
    def run_sessions(self):
        logger.info('Running sessions')
        info_spaces = []
        for _s in range(self.spec['meta']['max_session']):
            self.info_space.tick('session')
            info_spaces.append(deepcopy(self.info_space))

        if util.get_lab_mode() == 'train' and len(info_spaces) > 1:
            # when training a single spec over multiple sessions
            session_datas = util.parallelize_fn(
                self.init_session_and_run, info_spaces,
                ps.get(self.spec['meta'], 'resources.num_cpus', util.NUM_CPUS))
        else:
            session_datas = []
            for info_space in info_spaces:
                session_data = self.init_session_and_run(info_space)
                session_datas.append(session_data)
                if analysis.is_unfit(session_data):
                    break
        return session_datas
Example #34
    def __init__(self, spec):
        super().__init__(spec)
        try_register_env(spec)  # register if it's a custom gym env
        seed = ps.get(spec, 'meta.random_seed')
        episode_life = util.in_train_lab_mode()
        if self.is_venv:  # make vector environment
            self.u_env = make_gym_venv(name=self.name,
                                       num_envs=self.num_envs,
                                       seed=seed,
                                       frame_op=self.frame_op,
                                       frame_op_len=self.frame_op_len,
                                       image_downsize=self.image_downsize,
                                       reward_scale=self.reward_scale,
                                       normalize_state=self.normalize_state,
                                       episode_life=episode_life)
        else:
            self.u_env = make_gym_env(name=self.name,
                                      seed=seed,
                                      frame_op=self.frame_op,
                                      frame_op_len=self.frame_op_len,
                                      image_downsize=self.image_downsize,
                                      reward_scale=self.reward_scale,
                                      normalize_state=self.normalize_state,
                                      episode_life=episode_life)
        if self.name.startswith('Unity'):
            # Unity is always initialized as singleton gym env, but the Unity runtime can be vec_env
            self.num_envs = self.u_env.num_envs
            # update variables dependent on num_envs
            self._infer_venv_attr()
            self._set_clock()
        self._set_attr_from_u_env(self.u_env)
        self.max_t = self.max_t or self.u_env.spec.max_episode_steps
        assert self.max_t is not None

        # If single PyBullet env with lab_mode==dev, enable PyBullet's built-in GUI (before first env.reset())
        if not self.is_venv and "BulletEnv" in self.name and util.get_lab_mode() == 'dev':
            self.u_env.render()

        logger.info(util.self_desc(self))
Example #35
def multi_boltzmann(states, algorithm, body_list, pdparam):
    '''Apply Boltzmann policy body-wise'''
    # pdparam.squeeze_(dim=0)
    assert len(pdparam) > 1 and len(pdparam) == len(
        body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}'
    if util.get_lab_mode() == 'enjoy':
        return multi_default(states, algorithm, body_list, pdparam)
    action_list, action_pd_a = [], []
    for idx, sub_pdparam in enumerate(pdparam):
        body = body_list[idx]
        try_preprocess(
            states[idx], algorithm, body,
            append=True)  # for consistency with init_action_pd inner logic
        tau = body.explore_var
        sub_pdparam /= tau
        ActionPD = getattr(distributions, body.action_pdtype)
        action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body)
        action_list.append(action)
        action_pd_a.append(action_pd)
    action_a = torch.tensor(action_list,
                            device=algorithm.net.device).unsqueeze_(dim=1)
    return action_a, action_pd_a
Example #36
File: base.py Project: c-w-m/slm-lab
 def __init__(self, spec):
     self.env_spec = spec['env'][0]  # idx 0 for single-env
     # set default
     util.set_attr(
         self,
         dict(
             eval_frequency=10000,
             log_frequency=10000,
             frame_op=None,
             frame_op_len=None,
             image_downsize=(84, 84),
             normalize_state=False,
             reward_scale=None,
             num_envs=1,
         ))
     util.set_attr(self, spec['meta'], [
         'eval_frequency',
         'log_frequency',
     ])
     util.set_attr(self, self.env_spec, [
         'name',
         'frame_op',
         'frame_op_len',
         'image_downsize',
         'normalize_state',
         'reward_scale',
         'num_envs',
         'max_t',
         'max_frame',
     ])
     if util.get_lab_mode() == 'eval':  # override if env is for eval
         self.num_envs = ps.get(spec, 'meta.rigorous_eval')
     self.to_render = util.to_render()
     self._infer_frame_attr(spec)
     self._infer_venv_attr()
     self._set_clock()
     self.done = False
     self.total_reward = np.nan
Example #37
File: dqn.py Project: ronald-xie/SLM-Lab
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each batch, the target Q values (q_targets) are computed and a training step is taken k times.
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
     is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                     if is_per:
                         q_preds = self.net.wrap_eval(batch['states'])
                         errors = torch.abs(q_targets - q_preds)
                         errors = errors.sum(dim=1).unsqueeze_(dim=1)
                         for body in self.agent.nanflat_body_a:
                             body.memory.update_priorities(errors)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss.cpu()
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
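calc_q_targets is defined elsewhere in the algorithm; schematically, a one-step Q-learning target computed under no_grad looks like the sketch below (target_net, gamma, and the batch keys are placeholders, not the exact SLM-Lab API):

import torch

def calc_td_targets(batch, target_net, gamma=0.99):
    '''One-step TD target: r + gamma * max_a Q(s', a), zeroed at terminal states.'''
    with torch.no_grad():  # targets must not carry gradients
        next_q = target_net(batch['next_states']).max(dim=1)[0]
        return batch['rewards'] + gamma * (1 - batch['dones']) * next_q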
Example #38
def to_assert_trained():
    '''Condition for running assert_trained'''
    return os.environ.get('PY_ENV') == 'test' or util.get_lab_mode() == 'dev'
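For context, util.get_lab_mode() can be thought of as reading a globally set mode string ('train', 'dev', 'eval', 'enjoy'); a hypothetical sketch assuming the mode is kept in an environment variable, which may differ from SLM-Lab's actual storage:

import os

def get_lab_mode():
    '''Return the current lab mode, e.g. train, dev, eval, or enjoy.'''
    return os.environ.get('lab_mode', 'train')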
Example #39
 def train(self):
     '''Implement algorithm train, or raise NotImplementedError'''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     raise NotImplementedError