Example #1
 def add_experience(self,
                    state,
                    action,
                    reward,
                    next_state,
                    done,
                    priority=1):
     '''Interface helper method for update() to add experience to memory'''
     self.states.append(state)
     self.actions.append(action)
     self.rewards.append(reward)
     self.next_states.append(next_state)
     self.dones.append(done)
     self.priorities.append(priority)
     # Set most recent
     self.most_recent[0] = state
     self.most_recent[1] = action
     self.most_recent[2] = reward
     self.most_recent[3] = next_state
     self.most_recent[4] = done
     self.most_recent[5] = priority
     # Track memory size and num experiences
     self.true_size += 1
     if self.true_size > 1000 and self.memory_warn_flag:
         logger.warn("Large memory size: {}".format(self.true_size))
         self.memory_warn_flag = False
     self.total_experiences += 1
     # Decide if agent is to train
     if done or (len(self.states)) == self.training_frequency:
         self.body.agent.algorithm.to_train = 1
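The last two lines above are the training trigger: the agent trains once the episode ends or the buffer holds training_frequency experiences. A stripped-down sketch of just that trigger, using a hypothetical ToyMemory class rather than the SLM-Lab memory API:

# Minimal sketch of the batch-style training trigger shown above.
# ToyMemory and its attributes are hypothetical, for illustration only.
class ToyMemory:
    def __init__(self, training_frequency=4):
        self.training_frequency = training_frequency
        self.states, self.actions, self.rewards = [], [], []
        self.to_train = 0

    def add_experience(self, state, action, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        # train when the episode ends or a full batch has been collected
        if done or len(self.states) == self.training_frequency:
            self.to_train = 1

mem = ToyMemory(training_frequency=4)
for t in range(4):
    mem.add_experience(state=t, action=0, reward=1.0, done=False)
print(mem.to_train)  # 1: a full batch of 4 experiences was collected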
Example #2
def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate.
    @returns {dict, dict} session_mdp_data, session_data
    '''
    data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES
    mdp_data_names = ['t', 'epi'] + data_names
    agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys())
    data_h_v_dict = {data_name: session.aeb_space.get_history_v(data_name) for data_name in data_names}
    session_mdp_data, session_data = {}, {}
    for aeb in session.aeb_space.aeb_list:
        data_h_dict = {data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items()}
        # trim back to remove any incomplete sessions due to multienv termination
        complete_done_h = np.trim_zeros(data_h_dict['done'], 'b')
        # offset properly to bin separate episodes
        reset_bin = np.concatenate([[0.], complete_done_h[:-1]])
        data_len = len(reset_bin)
        reset_idx = reset_bin.astype('bool')
        nonreset_idx = ~reset_idx
        data_h_dict['t'] = np.ones(reset_idx.shape)
        data_h_dict['epi'] = reset_idx.astype(int).cumsum()
        mdp_df = pd.DataFrame({
            data_name: data_h_dict[data_name][:data_len]
            for data_name in mdp_data_names})
        mdp_df = mdp_df.reindex(mdp_data_names, axis=1)
        aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS)
        aeb_df.reset_index(drop=False, inplace=True)
        session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df
    logger.debug(f'{session_data}')
    data_size_in_mb = util.memory_size(session_mdp_data)
    logger.debug(f'Size of session data: {data_size_in_mb} MB')
    if data_size_in_mb > 25:
        logger.warn(f'Session data is large: {data_size_in_mb} MB')
    return session_mdp_data, session_data
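The episode index ('epi') above comes from shifting the done history forward by one step and taking a cumulative sum, so the step after each done starts a new bin. A standalone numpy sketch of that binning, assuming done is stored as 0/1 floats:

import numpy as np

# done history for two episodes of lengths 3 and 2 (1.0 marks the final step)
done_h = np.array([0., 0., 1., 0., 1.])
# shift by one so the step *after* a done opens a new episode bin
reset_bin = np.concatenate([[0.], done_h[:-1]])
epi = reset_bin.astype(bool).astype(int).cumsum()
print(epi)  # [0 0 0 1 1] -> first three steps are episode 0, last two are episode 1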
Example #3
def run_by_mode(spec_file, spec_name, run_mode):
    spec = spec_util.get(spec_file, spec_name)
    # TODO remove when analysis can save all plotly plots
    os.environ['run_mode'] = run_mode
    if run_mode == 'search':
        Experiment(spec).run()
    elif run_mode == 'train':
        Trial(spec).run()
    elif run_mode == 'enjoy':
        # TODO turn on save/load model mode
        # Session(spec).run()
        pass
    elif run_mode == 'generate_benchmark':
        benchmarker.generate_specs(spec, const='agent')
    elif run_mode == 'benchmark':
        # TODO allow changing const to env
        run_benchmark(spec, const='agent')
    elif run_mode == 'dev':
        os.environ['PY_ENV'] = 'test'  # to not save in viz
        spec = util.override_dev_spec(spec)
        Trial(spec).run()
    else:
        logger.warn(
            'run_mode not recognized; must be one of `search, train, enjoy, generate_benchmark, benchmark, dev`.'
        )
Example #4
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    no_fitness_sr = pd.Series({'strength': 0., 'speed': 0., 'stability': 0.})
    if len(aeb_df) < MA_WINDOW:
        logger.warn(
            f'Run more than {MA_WINDOW} episodes to compute proper fitness')
        return no_fitness_sr
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(
            f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.'
        )
    aeb_df['total_t'] = aeb_df['t'].cumsum()
    aeb_df['strength'] = calc_strength(aeb_df, std['rand_epi_reward'],
                                       std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW).mean()
    aeb_df['strength_mono_inc'] = is_noisy_mono_inc(
        aeb_df['strength']).astype(int)

    strength = aeb_df['strength_ma'].max()
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength,
        'speed': speed,
        'stability': stability
    })
    return aeb_fitness_sr
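The strength moving average above is a plain pandas rolling mean over MA_WINDOW episodes; until the window fills it yields NaN, which is why the function returns the zero fitness vector when there are fewer than MA_WINDOW episodes. A small pandas sketch of that smoothing, with MA_WINDOW set to 3 purely for illustration:

import pandas as pd

MA_WINDOW = 3  # illustrative value only; SLM-Lab defines its own constant
strength = pd.Series([0.0, 0.2, 0.1, 0.4, 0.5, 0.6])
strength_ma = strength.rolling(MA_WINDOW).mean()
print(strength_ma.tolist())  # [nan, nan, 0.1, 0.233..., 0.333..., 0.5]
print(strength_ma.max())     # 0.5, the value reported as strength above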
Example #5
def test_logger(test_multiline_str):
    logger.critical(test_multiline_str)
    logger.debug(test_multiline_str)
    logger.error(test_multiline_str)
    logger.exception(test_multiline_str)
    logger.info(test_multiline_str)
    logger.warn(test_multiline_str)
Example #6
def run_by_mode(spec_file, spec_name, lab_mode):
    logger.info(f'Running lab in mode: {lab_mode}')
    spec = spec_util.get(spec_file, spec_name)
    info_space = InfoSpace()
    analysis.save_spec(spec, info_space, unit='experiment')

    # '@' is reserved for 'enjoy@{prepath}'
    os.environ['lab_mode'] = lab_mode.split('@')[0]
    os.environ['PREPATH'] = util.get_prepath(spec, info_space)
    reload(logger)  # to set PREPATH properly

    if lab_mode == 'search':
        info_space.tick('experiment')
        Experiment(spec, info_space).run()
    elif lab_mode.startswith('train'):
        if '@' in lab_mode:
            prepath = lab_mode.split('@')[1]
            spec, info_space = util.prepath_to_spec_info_space(prepath)
        else:
            info_space.tick('trial')
        Trial(spec, info_space).run()
    elif lab_mode.startswith('enjoy'):
        prepath = lab_mode.split('@')[1]
        spec, info_space = util.prepath_to_spec_info_space(prepath)
        Session(spec, info_space).run()
    elif lab_mode == 'dev':
        spec = util.override_dev_spec(spec)
        info_space.tick('trial')
        Trial(spec, info_space).run()
    else:
        logger.warn('lab_mode not recognized; must be one of `search, train, enjoy, dev`.')
Example #7
def run_by_mode(spec_file, spec_name, lab_mode):
    logger.info(f'Running lab in mode: {lab_mode}')
    spec = spec_util.get(spec_file, spec_name)
    info_space = InfoSpace()
    os.environ['PREPATH'] = util.get_prepath(spec, info_space)
    reload(logger)  # to set PREPATH properly
    # expose to runtime, '@' is reserved for 'enjoy@{prepath}'
    os.environ['lab_mode'] = lab_mode.split('@')[0]
    if lab_mode == 'search':
        info_space.tick('experiment')
        Experiment(spec, info_space).run()
    elif lab_mode == 'train':
        info_space.tick('trial')
        Trial(spec, info_space).run()
    elif lab_mode.startswith('enjoy'):
        prepath = lab_mode.split('@')[1]
        spec, info_space = util.prepath_to_spec_info_space(prepath)
        Session(spec, info_space).run()
    elif lab_mode == 'generate_benchmark':
        benchmarker.generate_specs(spec, const='agent')
    elif lab_mode == 'benchmark':
        # TODO allow changing const to env
        run_benchmark(spec, const='agent')
    elif lab_mode == 'dev':
        spec = util.override_dev_spec(spec)
        info_space.tick('trial')
        Trial(spec, info_space).run()
    else:
        logger.warn(
            'lab_mode not recognized; must be one of `search, train, enjoy, generate_benchmark, benchmark, dev`.'
        )
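Both run_by_mode variants above pack the mode and an optional checkpoint prepath into a single mode@prepath argument: the part before '@' is exported as the lab_mode environment variable, and the part after it locates the saved spec and info_space. A minimal sketch of that split; the prepath value is made up for illustration:

lab_mode = 'enjoy@data/reinforce_cartpole_2018_01_01_000000/reinforce_cartpole_t0_s0'
mode = lab_mode.split('@')[0]      # 'enjoy', what gets written to os.environ['lab_mode']
prepath = lab_mode.split('@')[1]   # used to restore the saved spec and info_space
print(mode, prepath)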
Example #8
def test_logger(test_str):
    logger.critical(test_str)
    logger.debug(test_str)
    logger.error(test_str)
    logger.exception(test_str)
    logger.info(test_str)
    logger.warn(test_str)
Example #10
def is_q_learning(algorithm):
    '''Check the algorithm is a Q-learning variant and action space is discrete'''
    assert hasattr(algorithm, 'body')
    is_q_algo = any(k in algorithm.algorithm_spec['name'] for k in ('DQN', 'SARSA'))
    is_q = algorithm.body.is_discrete and is_q_algo
    if not is_q:
        logger.warn('DuelingMLPNet only appropriate for Q-Learning algorithms. Currently implemented for single body algorithms in discrete action spaces')
    return is_q
Example #11
def test_logger(test_multiline_str):
    logger.set_level('DEBUG')
    logger.critical(test_multiline_str)
    logger.debug(test_multiline_str)
    logger.error(test_multiline_str)
    logger.exception(test_multiline_str)
    logger.info(test_multiline_str)
    logger.warn(test_multiline_str)
Example #12
def calc_session_metrics(session_df, env_name, info_prepath=None, df_mode=None):
    '''
    Calculate the session metrics: strength, efficiency, stability
    @param DataFrame:session_df Dataframe containing reward, frame, opt_step
    @param str:env_name Name of the environment to get its random baseline
    @param str:info_prepath Optional info_prepath to auto-save the output to
    @param str:df_mode Optional df_mode to save with info_prepath
    @returns dict:metrics Consists of scalar metrics and series local metrics
    '''
    rand_bl = random_baseline.get_random_baseline(env_name)
    if rand_bl is None:
        mean_rand_returns = 0.0
        logger.warn('Random baseline unavailable for environment. Please generate separately.')
    else:
        mean_rand_returns = rand_bl['mean']
    mean_returns = session_df['total_reward']
    frames = session_df['frame']
    opt_steps = session_df['opt_step']

    final_return_ma = mean_returns[-viz.PLOT_MA_WINDOW:].mean()
    str_, local_strs = calc_strength(mean_returns, mean_rand_returns)
    max_str, final_str = local_strs.max(), local_strs.iloc[-1]
    with warnings.catch_warnings():  # mute np.nanmean warning
        warnings.filterwarnings('ignore')
        sample_eff, local_sample_effs = calc_efficiency(local_strs, frames)
        train_eff, local_train_effs = calc_efficiency(local_strs, opt_steps)
        sta, local_stas = calc_stability(local_strs)

    # all the scalar session metrics
    scalar = {
        'final_return_ma': final_return_ma,
        'strength': str_,
        'max_strength': max_str,
        'final_strength': final_str,
        'sample_efficiency': sample_eff,
        'training_efficiency': train_eff,
        'stability': sta,
    }
    # all the session local metrics
    local = {
        'mean_returns': mean_returns,
        'strengths': local_strs,
        'sample_efficiencies': local_sample_effs,
        'training_efficiencies': local_train_effs,
        'stabilities': local_stas,
        'frames': frames,
        'opt_steps': opt_steps,
    }
    metrics = {
        'scalar': scalar,
        'local': local,
    }
    if info_prepath is not None:  # auto-save if info_prepath is given
        util.write(metrics, f'{info_prepath}_session_metrics_{df_mode}.pkl')
        util.write(scalar, f'{info_prepath}_session_metrics_scalar_{df_mode}.json')
        # save important metrics in info_prepath directly
        util.write(scalar, f'{info_prepath.replace("info/", "")}_session_metrics_scalar_{df_mode}.json')
    return metrics
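The returned metrics dict nests the per-session scalar metrics under 'scalar' and the series local metrics under 'local'. A sketch of how a caller might read that structure; the values below are placeholders, not real results:

# Placeholder values shaped like the dict returned by calc_session_metrics above.
metrics = {
    'scalar': {'strength': 0.8, 'sample_efficiency': 1e-5, 'stability': 0.95},
    'local': {'strengths': [0.1, 0.5, 0.8], 'frames': [10000, 20000, 30000]},
}
print(metrics['scalar']['strength'])  # one summary number per session
for frame, strength in zip(metrics['local']['frames'], metrics['local']['strengths']):
    print(f'frame {frame}: strength {strength}')  # series suitable for learning curves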
Example #13
 def set_memory_flag(self):
     '''Flags if memory is episodic or discrete. This affects how the target and advantage functions are calculated'''
     body = self.agent.nanflat_body_a[0]
     memory = body.memory.__class__.__name__
     if memory.find('OnPolicyReplay') != -1:
         self.is_episodic = True
     elif memory.find('OnPolicyBatchReplay') != -1:
         self.is_episodic = False
     else:
         logger.warn(f'Error: Memory {memory} not recognized')
         raise NotImplementedError
Example #14
 def set_memory_flag(self):
     '''Flags if memory is episodic or discrete. This affects how self.sample() handles the batch it gets back from memory'''
     body = self.agent.nanflat_body_a[0]
     memory = body.memory.__class__.__name__
     if (memory.find('OnPolicyReplay') != -1) or (memory.find('OnPolicyNStepReplay') != -1):
         self.is_episodic = True
     elif (memory.find('OnPolicyBatchReplay') != -1) or (memory.find('OnPolicyNStepBatchReplay') != -1):
         self.is_episodic = False
     else:
         logger.warn(f'Error: Memory {memory} not recognized')
         raise NotImplementedError
Example #15
File: net_util.py Project: tttor/SLM-Lab
def is_q_learning(algorithm):
    '''Check the algorithm is a Q-learning variant and action space is discrete'''
    assert hasattr(algorithm, 'body')
    is_q_algo = any(k in algorithm.algorithm_spec['name']
                    for k in ('DQN', 'SARSA'))
    is_q = algorithm.body.is_discrete and is_q_algo
    if not is_q:
        logger.warn(
            'DuelingMLPNet only appropriate for Q-Learning algorithms. Currently implemented for single body algorithms in discrete action spaces'
        )
    return is_q
Example #16
def save_image(figure, filepath=None):
    if os.environ['PY_ENV'] == 'test':
        return
    if filepath is None:
        filepath = f'{PLOT_FILEDIR}/{ps.get(figure, "layout.title")}.png'
    filepath = util.smart_path(filepath)
    try:
        pio.write_image(figure, filepath)
        logger.info(f'Graph saved to {filepath}')
    except Exception as e:
        logger.warn(
            f'{e}\nFailed to generate graph. Fix the issue and run retro-analysis to generate graphs.'
        )
Example #17
    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name),
                                      worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn(
            'Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.'
        )
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'], )
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0,
                                                   high=1,
                                                   shape=observation_shape,
                                                   dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0,
                                                   high=1.0,
                                                   shape=observation_shape,
                                                   dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0,
                                              high=1.0,
                                              shape=(1, ),
                                              dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces,
                                                   self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False
Example #18
    def check_fn(*args, **kwargs):
        if not to_check_training_step():
            return fn(*args, **kwargs)

        net = args[0]  # first arg self
        # get pre-update parameters to compare
        pre_params = [param.clone() for param in net.parameters()]

        # run training_step, get loss
        loss = fn(*args, **kwargs)

        # get post-update parameters to compare
        post_params = [param.clone() for param in net.parameters()]
        if loss == 0.0:
            # if loss is 0, there should be no updates
            # TODO if without momentum, parameters should not change too
            for p_name, param in net.named_parameters():
                assert param.grad.norm() == 0
        else:
            # check parameter updates
            try:
                assert not all(
                    torch.equal(w1, w2)
                    for w1, w2 in zip(pre_params, post_params)
                ), f'Model parameter is not updated in training_step(), check if your tensor is detached from graph. Loss: {loss:g}'
                logger.info(
                    f'Model parameter is updated in training_step(). Loss: {loss:g}'
                )
            except Exception as e:
                logger.error(e)
                if os.environ.get('PY_ENV') == 'test':
                    # raise error if in unit test
                    raise e

            # check grad norms
            min_norm, max_norm = 0.0, 1e5
            for p_name, param in net.named_parameters():
                try:
                    grad_norm = param.grad.norm()
                    assert min_norm < grad_norm < max_norm, f'Gradient norm for {p_name} is {grad_norm:g}, fails the extreme value check {min_norm} < grad_norm < {max_norm}. Loss: {loss:g}. Check your network and loss computation.'
                    logger.info(
                        f'Gradient norm for {p_name} is {grad_norm:g}; passes value check.'
                    )
                except Exception as e:
                    logger.warn(e)
        logger.debug('Passed network parameter update check.')
        # store grad norms for debugging
        net.store_grad_norms()
        return loss
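check_fn above snapshots the parameters before and after training_step and asserts that at least one parameter changed and that gradient norms stay in a sane range. A self-contained sketch of the same before/after comparison on a toy PyTorch model, independent of SLM-Lab's net classes:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)

pre_params = [param.clone() for param in net.parameters()]
loss = net(torch.randn(8, 4)).pow(2).mean()  # dummy loss for illustration
optimizer.zero_grad()
loss.backward()
optimizer.step()
post_params = [param.clone() for param in net.parameters()]

# same check as above: at least one parameter must have changed after the step
assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_params, post_params))
# and gradient norms should be finite and non-extreme
for p_name, param in net.named_parameters():
    assert 0.0 < param.grad.norm() < 1e5, f'bad gradient norm for {p_name}'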
Example #19
 def assert_trained(post_model, loss):
     post_weights = [param.clone() for param in post_model.parameters()]
     if loss == 0:
         # TODO if without momentum, weights should not change too
         for p_name, param in post_model.named_parameters():
             assert param.grad.norm() == 0
     else:
         # pre_weights is expected to be captured from the enclosing scope (snapshot taken before the update)
         assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_weights, post_weights)), f'Model parameter is not updated in training_step(), check if your tensor is detached from graph. loss: {loss}'
         min_norm = 0
         max_norm = 1e5
         for p_name, param in post_model.named_parameters():
             try:
                 assert min_norm < param.grad.norm() < max_norm, f'Gradient norm fails the extreme value check {min_norm} < {p_name}:{param.grad.norm()} < {max_norm}, which is bad. Loss: {loss}. Check your network and loss computation. Consider using the "clip_grad_val" net parameter.'
             except Exception as e:
                 logger.warn(e)
     logger.debug('Passed network weight update assertion in dev lab_mode.')
Example #20
 def set_action_fn(self):
     '''Sets the function used to select actions. Automatically selects appropriate discrete or continuous action policy under default setting'''
     body = self.agent.nanflat_body_a[0]
     algorithm_spec = self.agent.spec['algorithm']
     action_fn = algorithm_spec['action_policy']
     if action_fn == 'default':
         if self.is_discrete:
             self.action_policy = act_fns['softmax']
         else:
             if body.action_dim > 1:
                 logger.warn(f'Action dim: {body.action_dim}. Continuous multidimensional action space not supported yet. Contact author')
                 raise NotImplementedError
             else:
                 self.action_policy = act_fns['gaussian']
     else:
         self.action_policy = act_fns[action_fn]
Example #21
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.')

    # calculate the strength sr and the moving-average (to denoise) first before calculating fitness
    aeb_df['strength'] = calc_strength_sr(aeb_df, std['rand_epi_reward'], std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW, min_periods=0, center=False).mean()

    strength = calc_strength(aeb_df)
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength, 'speed': speed, 'stability': stability})
    return aeb_fitness_sr
Example #22
 def add_experience(self,
                    state,
                    action,
                    reward,
                    next_state,
                    done,
                    priority=1):
     '''Interface helper method for update() to add experience to memory'''
     self.current_episode['states'].append(state)
     self.current_episode['actions'].append(action)
     self.current_episode['rewards'].append(reward)
     self.current_episode['next_states'].append(next_state)
     self.current_episode['dones'].append(done)
     self.current_episode['priorities'].append(priority)
     # Set most recent
     self.most_recent[0] = state
     self.most_recent[1] = action
     self.most_recent[2] = reward
     self.most_recent[3] = next_state
     self.most_recent[4] = done
     self.most_recent[5] = priority
     # If episode ended, add to memory and clear current_episode
     if done:
         self.states.append(self.current_episode['states'])
         self.actions.append(self.current_episode['actions'])
         self.rewards.append(self.current_episode['rewards'])
         self.next_states.append(self.current_episode['next_states'])
         self.dones.append(self.current_episode['dones'])
         self.priorities.append(self.current_episode['priorities'])
         self.current_episode = {
             'states': [],
             'actions': [],
             'rewards': [],
             'next_states': [],
             'dones': [],
             'priorities': []
         }
         # If agent has collected the desired number of episodes, it is ready to train
         if len(self.states) == self.num_epis_to_collect:
             self.body.agent.algorithm.to_train = 1
     # Track memory size and num experiences
     self.true_size += 1
     if self.true_size > 1000 and self.memory_warn_flag:
         logger.warn("Large memory size: {}".format(self.true_size))
         self.memory_warn_flag = False
     self.total_experiences += 1
Example #23
 def __init__(self, memory_spec, algorithm, body):
     super(OnPolicyReplay, self).__init__(memory_spec, algorithm, body)
     # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames
     util.set_attr(self, self.agent_spec['algorithm'], ['training_frequency'])
     self.state_buffer = deque(maxlen=0)  # for API consistency
     # Don't want total experiences reset when memory is
     self.is_episodic = True
     self.total_experiences = 0
     self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
     self.reset()
Example #24
 def __init__(self, memory_spec, algorithm, body):
     super(OnPolicyReplay, self).__init__(memory_spec, algorithm, body)
     # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames
     util.set_attr(self, self.agent_spec['algorithm'],
                   ['training_frequency'])
     self.state_buffer = deque(maxlen=0)  # for API consistency
     # Don't want total experiences reset when memory is
     self.is_episodic = True
     self.total_experiences = 0
     self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
     self.reset()
Example #25
def run_by_mode(spec_file, spec_name, run_mode):
    spec = spec_util.get(spec_file, spec_name)
    if run_mode == 'search':
        Experiment(spec).run()
    elif run_mode == 'train':
        Trial(spec).run()
    elif run_mode == 'enjoy':
        # TODO turn on save/load model mode
        # Session(spec).run()
        pass
    elif run_mode == 'benchmark':
        # TODO need to spread benchmark over spec on Experiment
        pass
    elif run_mode == 'dev':
        os.environ['PY_ENV'] = 'test'  # to not save in viz
        logger.set_level('DEBUG')
        spec = util.override_dev_spec(spec)
        Trial(spec).run()
    else:
        logger.warn(
            'run_mode not recognized; must be one of `search, train, enjoy, benchmark, dev`.'
        )
Example #26
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    no_fitness_sr = pd.Series({
        'strength': 0., 'speed': 0., 'stability': 0.})
    if len(aeb_df) < MA_WINDOW:
        logger.warn(f'Run more than {MA_WINDOW} episodes to compute proper fitness')
        return no_fitness_sr
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.')
    aeb_df['total_t'] = aeb_df['t'].cumsum()
    aeb_df['strength'] = calc_strength(aeb_df, std['rand_epi_reward'], std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW).mean()
    aeb_df['strength_mono_inc'] = is_noisy_mono_inc(aeb_df['strength']).astype(int)

    strength = aeb_df['strength_ma'].max()
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength, 'speed': speed, 'stability': stability})
    return aeb_fitness_sr
Example #27
    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False
Example #28
File: base.py Project: vmuthuk2/SLM-Lab
    def __init__(self, memory_spec, body):
        '''
        @param {*} body is the unit that stores its experience in this memory. Each body has a distinct memory.
        '''
        self.memory_spec = memory_spec
        self.body = body

        # declare what data keys to store
        self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities']
        # the basic variables for every memory
        self.last_state = None
        # method to log size warning only once to prevent spamming log
        self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
        # for API consistency, reset to some max_len in your specific memory class
        self.state_buffer = deque(maxlen=0)
        # total_reward and its history over episodes
        self.total_reward = 0
示例#29
0
 def init_nets(self):
     '''Initialize the neural networks used to learn the actor and critic from the spec'''
     body = self.agent.nanflat_body_a[0]  # singleton algo
     state_dim = body.state_dim
     action_dim = body.action_dim
     self.is_discrete = body.is_discrete
     net_spec = self.agent.spec['net']
     mem_spec = self.agent.spec['memory']
     net_type = self.agent.spec['net']['type']
     actor_kwargs = util.compact_dict(
         dict(
             hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
             optim_param=_.get(net_spec, 'optim_actor'),
             loss_param=_.get(net_spec,
                              'loss'),  # Note: Not used for training actor
             clamp_grad=_.get(net_spec, 'clamp_grad'),
             clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
             gpu=_.get(net_spec, 'gpu'),
         ))
     if self.agent.spec['net']['use_same_optim']:
         logger.info('Using same optimizer for actor and critic')
         critic_kwargs = actor_kwargs
     else:
         logger.info('Using different optimizer for actor and critic')
         critic_kwargs = util.compact_dict(
             dict(
                 hid_layers_activation=_.get(net_spec,
                                             'hid_layers_activation'),
                 optim_param=_.get(net_spec, 'optim_critic'),
                 loss_param=_.get(net_spec, 'loss'),
                 clamp_grad=_.get(net_spec, 'clamp_grad'),
                 clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
                 gpu=_.get(net_spec, 'gpu'),
             ))
     '''
       Below we automatically select an appropriate net based on three conditions:
         1. If the action space is discrete or continuous
                - Networks for continuous action spaces have two heads and return two values, the first is a tensor containing the mean of the action policy, the second is a tensor containing the std deviation of the action policy. The distribution is assumed to be a Gaussian (Normal) distribution.
                - Networks for discrete action spaces have a single head and return the logits for a categorical probability distribution over the discrete actions
        2. If the actor and critic are separate or share weights
                - If the networks share weights then the single network returns a list.
                     - Continuous action spaces: The return list contains 3 elements: The first element contains the mean output for the actor (policy), the second element the std dev of the policy, and the third element is the state-value estimated by the network.
                     - Discrete action spaces: The return list contains 2 elements. The first element is a tensor containing the logits for a categorical probability distribution over the actions. The second element contains the state-value estimated by the network.
        3. If the network type is feedforward, convolutional, or recurrent
                 - Feedforward and convolutional networks take a single state as input and require an OnPolicyReplay or OnPolicyBatchReplay memory
                 - Recurrent networks take n states as input and require an OnPolicyNStepReplay or OnPolicyNStepBatchReplay memory
     '''
     if net_type == 'MLPseparate':
         self.is_shared_architecture = False
         self.is_recurrent = False
         if self.is_discrete:
             self.actor = getattr(net, 'MLPNet')(state_dim,
                                                 net_spec['hid_layers'],
                                                 action_dim, **actor_kwargs)
             logger.info(
                 "Feedforward net, discrete action space, actor and critic are separate networks"
             )
         else:
             self.actor = getattr(net, 'MLPHeterogenousHeads')(
                 state_dim, net_spec['hid_layers'],
                 [action_dim, action_dim], **actor_kwargs)
             logger.info(
                 "Feedforward net, continuous action space, actor and critic are separate networks"
             )
         self.critic = getattr(net,
                               'MLPNet')(state_dim, net_spec['hid_layers'],
                                         1, **critic_kwargs)
     elif net_type == 'MLPshared':
         self.is_shared_architecture = True
         self.is_recurrent = False
         if self.is_discrete:
             self.actorcritic = getattr(net, 'MLPHeterogenousHeads')(
                 state_dim, net_spec['hid_layers'], [action_dim, 1],
                 **actor_kwargs)
             logger.info(
                 "Feedforward net, discrete action space, actor and critic combined into single network, sharing params"
             )
         else:
             self.actorcritic = getattr(net, 'MLPHeterogenousHeads')(
                 state_dim, net_spec['hid_layers'],
                 [action_dim, action_dim, 1], **actor_kwargs)
             logger.info(
                 "Feedforward net, continuous action space, actor and critic combined into single network, sharing params"
             )
     elif net_type == 'Convseparate':
         self.is_shared_architecture = False
         self.is_recurrent = False
         if self.is_discrete:
             self.actor = getattr(net,
                                  'ConvNet')(state_dim,
                                             net_spec['hid_layers'],
                                             action_dim, **actor_kwargs)
             logger.info(
                 "Convolutional net, discrete action space, actor and critic are separate networks"
             )
         else:
             self.actor = getattr(net, 'ConvNet')(state_dim,
                                                  net_spec['hid_layers'],
                                                  [action_dim, action_dim],
                                                  **actor_kwargs)
             logger.info(
                 "Convolutional net, continuous action space, actor and critic are separate networks"
             )
         self.critic = getattr(net,
                               'ConvNet')(state_dim, net_spec['hid_layers'],
                                          1, **critic_kwargs)
     elif net_type == 'Convshared':
         self.is_shared_architecture = True
         self.is_recurrent = False
         if self.is_discrete:
             self.actorcritic = getattr(net,
                                        'ConvNet')(state_dim,
                                                   net_spec['hid_layers'],
                                                   [action_dim, 1],
                                                   **actor_kwargs)
             logger.info(
                 "Convolutional net, discrete action space, actor and critic combined into single network, sharing params"
             )
         else:
             self.actorcritic = getattr(net, 'ConvNet')(
                 state_dim, net_spec['hid_layers'],
                 [action_dim, action_dim, 1], **actor_kwargs)
             logger.info(
                 "Convolutional net, continuous action space, actor and critic combined into single network, sharing params"
             )
     elif net_type == 'Recurrentseparate':
         self.is_shared_architecture = False
         self.is_recurrent = True
         if self.is_discrete:
             self.actor = getattr(net, 'RecurrentNet')(
                 state_dim, net_spec['hid_layers'], action_dim,
                 mem_spec['length_history'], **actor_kwargs)
             logger.info(
                 "Recurrent net, discrete action space, actor and critic are separate networks"
             )
         else:
             self.actor = getattr(net, 'RecurrentNet')(
                 state_dim, net_spec['hid_layers'],
                 [action_dim, action_dim], mem_spec['length_history'],
                 **actor_kwargs)
             logger.info(
                 "Recurrent net, continuous action space, actor and critic are separate networks"
             )
         self.critic = getattr(net,
                               'RecurrentNet')(state_dim,
                                               net_spec['hid_layers'], 1,
                                               mem_spec['length_history'],
                                               **critic_kwargs)
     elif net_type == 'Recurrentshared':
         self.is_shared_architecture = True
         self.is_recurrent = True
         if self.is_discrete:
             self.actorcritic = getattr(net, 'RecurrentNet')(
                 state_dim, net_spec['hid_layers'], [action_dim, 1],
                 mem_spec['length_history'], **actor_kwargs)
             logger.info(
                 "Recurrent net, discrete action space, actor and critic combined into single network, sharing params"
             )
         else:
             self.actorcritic = getattr(net, 'RecurrentNet')(
                 state_dim, net_spec['hid_layers'],
                 [action_dim, action_dim, 1], mem_spec['length_history'],
                 **actor_kwargs)
             logger.info(
                 "Recurrent net, continuous action space, actor and critic combined into single network, sharing params"
             )
     else:
         logger.warn(
             "Incorrect network type. Please use 'MLPshared', 'MLPseparate', 'Convshared', 'Convseparate', 'Recurrentshared', or 'Recurrentseparate'."
         )
         raise NotImplementedError
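The branching above keys off the net type in the spec plus whether the action space is discrete, and reads hid_layers, hid_layers_activation, use_same_optim and, for recurrent nets, the memory length_history. A sketch of two illustrative spec fragments and the branch each would take; the values are placeholders, not tested configurations:

# Illustrative spec fragments only; key names follow the fields read above, values are placeholders.
shared_mlp_spec = {
    'net': {
        'type': 'MLPshared',          # -> single MLPHeterogenousHeads acting as actor and critic
        'hid_layers': [64, 64],
        'hid_layers_activation': 'relu',
        'use_same_optim': True,
    },
}
separate_recurrent_spec = {
    'net': {
        'type': 'Recurrentseparate',  # -> separate RecurrentNet actor and critic
        'hid_layers': [64],
        'hid_layers_activation': 'tanh',
        'use_same_optim': False,
    },
    'memory': {'length_history': 4},  # recurrent nets consume a history of n states
}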