Example #1
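This and the following examples are `init` helpers from the same olbr-based reinforcement-learning codebase, and they all rely on module-level imports that the listing omits. A minimal sketch of those imports is given below; the torch-as-K alias and the olbr paths that also appear inline in the examples are taken from the code itself, while the remaining paths (Critic, Noise, Normalizer, DDPG_BD, SubprocVecEnv) are assumptions and may differ in the original project.

# Minimal sketch of the shared imports assumed by the snippets in this listing.
# Lines marked "assumption" are guesses; the others also appear inline in the examples.
import gym
import numpy as np
import torch as K                      # the snippets use torch under the alias K
import torch.optim as optim
import torch.nn.functional as F

from olbr.agents.basic import Actor                        # imported inline in Example #6
from olbr.replay_buffer import ReplayBuffer                # imported inline in several examples
from olbr.her_sampler import make_sample_her_transitions   # imported inline in several examples

from olbr.agents.basic import Critic                       # assumption
from olbr.exploration import Noise                         # assumption
from olbr.utils import Normalizer                          # assumption
from olbr.algorithms.ddpg import DDPG_BD                   # assumption; Example #2 imports a ddpg_q_schedule variant

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv  # assumption: OpenAI Baselines wrapper
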
def init(config,
         agent='robot',
         her=False,
         reward_fun=None,
         obj_traj=None,
         obj_mean=None,
         obj_std=None):

    #hyperparameters
    ENV_NAME = config['env_id'] 
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', stack_prob=None):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], 
                                    obj_action_type=config['obj_action_type'], 
                                    observe_obj_grp=config['observe_obj_grp'],
                                    obj_range=config['obj_range']
                                    )
            elif env_type == 'FetchStack':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], 
                                    obj_action_type=config['obj_action_type'], 
                                    observe_obj_grp=config['observe_obj_grp'],
                                    obj_range=config['obj_range'],
                                    change_stack_order=config['change_stack_order']
                                    )
            elif env_type == 'Hand':
                env = gym.make(env_id, obj_action_type=config['obj_action_type'])
            elif env_type == 'Others':
                env = gym.make(env_id)
            

            #env._max_episode_steps *= config['max_nb_objects']
            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED+10*i_env)
            if stack_prob is not None:
                env.unwrapped.stack_prob = stack_prob
            return env
        return _f  

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Stack' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], 
                                    obj_action_type=config['obj_action_type'], 
                                    observe_obj_grp=config['observe_obj_grp'],
                                    obj_range=config['obj_range'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['train_stack_prob'])
            for i_env in range(N_ENVS)
        ])
        envs_test = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob'])
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob'])
            for i_env in range(1)
        ])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], 
                                    obj_action_type=config['obj_action_type'], 
                                    observe_obj_grp=config['observe_obj_grp'],
                                    obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(1)])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRob' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], 
                                    obj_action_type=config['obj_action_type'], 
                                    observe_obj_grp=config['observe_obj_grp'],
                                    obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Others') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = None

    def make_her_reward_fun(nb_critics, use_step_reward_fun=False):

        def _her_reward_fun(ag_2, g, info):  # vectorized
            goal_len = ag_2.shape[1]//nb_critics
            
            rew = dummy_env.compute_reward(achieved_goal=ag_2[:,0:goal_len], desired_goal=g[:,0:goal_len], info=info)
            all_rew = rew.copy()[:, np.newaxis]

            for i_reward in range(1,nb_critics):
                if not use_step_reward_fun:
                    obj_rew = dummy_env.compute_reward(achieved_goal=ag_2[:,goal_len*i_reward:goal_len*(i_reward+1)], 
                                                    desired_goal=g[:,goal_len*i_reward:goal_len*(i_reward+1)], info=info)
                else:
                    goal_a = ag_2[:,goal_len*i_reward:goal_len*(i_reward+1)].reshape(-1,dummy_env.env.n_objects,3)
                    goal_b = g[:,goal_len*i_reward:goal_len*(i_reward+1)].reshape(-1,dummy_env.env.n_objects,3)
                    d = np.linalg.norm(goal_a - goal_b, axis=-1)
                    #obj_rew = - (d > dummy_env.env.distance_threshold).astype(np.float32).sum(-1)
                    obj_rew = - (d > dummy_env.env.distance_threshold).astype(np.float32)
                all_rew = np.concatenate((all_rew, obj_rew.copy()[:, np.newaxis]), axis=-1)
                #all_rew = np.concatenate((all_rew, obj_rew.copy()), axis=-1)
            return all_rew
        
        return _her_reward_fun

    her_reward_fun = make_her_reward_fun(2, config['use_step_reward_fun'])

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = (dummy_env.observation_space.spaces['observation'].shape[1]*2 + dummy_env.observation_space.spaces['desired_goal'].shape[0]*2, 
                        dummy_env.observation_space.spaces['observation'].shape[1]*1 + dummy_env.observation_space.spaces['desired_goal'].shape[0]*2)
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions-n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau'] 
    ACTOR_LR = config['plcy_lr'] 
    CRITIC_LR = config['crtc_lr'] 

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh 
    MODEL = DDPG_BD

    #exploration initialization
    noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3) 
    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)) # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space, action_space, optimizer, 
                  Actor, Critic, loss_func, GAMMA, TAU, out_func=OUT_FUNC, discrete=False, 
                  regularization=REGULARIZATION, normalized_rewards=NORMALIZED_REWARDS,
                  reward_fun=reward_fun, clip_Q_neg=clip_Q_neg, nb_critics=config['max_nb_objects']+1
                  )

    model.n_objects = config['max_nb_objects']

    class NormalizerObj(object):
        def __init__(self, mean, std):
            self.mean = mean
            self.std = std

        def process(self, achieved, desired):
            achieved_out = achieved - K.tensor(self.mean[0], dtype=achieved.dtype, device=achieved.device)
            achieved_out /= K.tensor(self.std[0], dtype=achieved.dtype, device=achieved.device)

            desired_out = desired - K.tensor(self.mean[1], dtype=desired.dtype, device=desired.device)
            desired_out /= K.tensor(self.std[1], dtype=desired.dtype, device=desired.device)

            return achieved_out, desired_out

    normalizer = [Normalizer(), Normalizer(), NormalizerObj(obj_mean, obj_std)]

    model.obj_traj = obj_traj.to('cpu')
    model.obj_traj.eval()

    for _ in range(1):
        state_all = dummy_env.reset()
        for i_step in range(config['episode_length']):

            model.to_cpu()

            obs = [K.tensor(obs, dtype=K.float32).unsqueeze(0) for obs in state_all['observation']]
            goal = K.tensor(state_all['desired_goal'], dtype=K.float32).unsqueeze(0)
            if i_step%config['objtraj_goal_horizon'] == 0:
                achieved_goal = K.tensor(state_all['achieved_goal'], dtype=K.float32).unsqueeze(0)

                objtraj_goal = []
                goal_len_per_obj = goal.shape[1]//model.n_objects
                for i_object in range(model.n_objects):
                    
                    achieved_goal_per_obj = achieved_goal[:,i_object*goal_len_per_obj:(i_object+1)*goal_len_per_obj]
                    goal_per_obj = goal[:,i_object*goal_len_per_obj:(i_object+1)*goal_len_per_obj]

                    normed_achieved_goal_per_obj, normed_goal_per_obj = normalizer[2].process(achieved_goal_per_obj, goal_per_obj)
                    with K.no_grad():
                        objtraj_goal_per_obj = model.obj_traj(normed_achieved_goal_per_obj, normed_goal_per_obj)

                    objtraj_goal.append(objtraj_goal_per_obj)
                objtraj_goal = K.cat(objtraj_goal, dim=-1)

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[0], obs[1], goal, objtraj_goal], dim=-1))
            if normalizer[0] is not None:
                obs_goal[0] = normalizer[0].preprocess_with_update(obs_goal[0])

            if config['agent_alg'] == 'DDPG_BD':
                action = model.select_action(obs_goal[0], noise).cpu().numpy().squeeze(0)
            elif config['agent_alg'] == 'MADDPG_BD':
                action = model.select_action(obs_goal[0], noise, goal_size=goal.shape[1]).cpu().numpy().squeeze(0)
            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            action_to_env[0:action.shape[0]] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o' : (config['episode_length'], dummy_env.observation_space.spaces['observation'].shape[1]*3),
        'ag' : (config['episode_length'], dummy_env.observation_space.spaces['achieved_goal'].shape[0]*2),
        'g' : (config['episode_length'], dummy_env.observation_space.spaces['desired_goal'].shape[0]*2),
        'u' : (config['episode_length']-1, action_space[2].shape[0])
        }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'], sample_her_transitions)

    experiment_args = ((envs, envs_test, envs_render), memory, noise, config, normalizer, None)
          
    return model, experiment_args
Example #2
def init(config,
         agent='robot',
         her=False,
         reward_fun=None,
         obj_traj=None,
         obj_mean=None,
         obj_std=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', stack_prob=None):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id,
                               n_objects=config['max_nb_objects'],
                               obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'],
                               obj_range=config['obj_range'])
            elif env_type == 'FetchStack':
                env = gym.make(env_id,
                               n_objects=config['max_nb_objects'],
                               obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'],
                               obj_range=config['obj_range'],
                               change_stack_order=config['change_stack_order'])
            elif env_type == 'Hand':
                env = gym.make(env_id,
                               obj_action_type=config['obj_action_type'])
            elif env_type == 'FetchMaRobSeq':
                env = gym.make(env_id,
                               n_objects=config['max_nb_objects'],
                               obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'],
                               obj_range=np.array([0.15, 0.60]),
                               widerangeobj=True)
            elif env_type == 'FetchMaRobSeqTest':
                env = gym.make(env_id,
                               n_objects=config['max_nb_objects'],
                               obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'],
                               obj_range=config['obj_range'],
                               widerangeobj=False)
            elif env_type == 'Others':
                env = gym.make(env_id)

            #env._max_episode_steps *= config['max_nb_objects']
            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED + 10 * i_env)
            if stack_prob is not None:
                env.unwrapped.stack_prob = stack_prob
            return env

        return _f

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Stack' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['train_stack_prob'])
            for i_env in range(N_ENVS)
        ])
        envs_test = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob'])
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob'])
            for i_env in range(1)
        ])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Hand') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Hand') for i_env in range(1)])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRobLong' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRobSeq' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchMaRobSeq')
            for i_env in range(N_ENVS)
        ])
        envs_test = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchMaRobSeqTest')
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'FetchMaRobSeqTest')
            for i_env in range(1)
        ])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv(
            [make_env(ENV_NAME, i_env, 'Others') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = None

    def her_reward_fun(ag_2, g, info):  # vectorized
        return dummy_env.compute_reward(achieved_goal=ag_2,
                                        desired_goal=g,
                                        info=info).reshape(-1, 1)

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = (
        dummy_env.observation_space.spaces['observation'].shape[1] * 2 +
        dummy_env.observation_space.spaces['desired_goal'].shape[0],
        dummy_env.observation_space.spaces['observation'].shape[1] +
        dummy_env.observation_space.spaces['desired_goal'].shape[0])
    action_space = (gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        from olbr.algorithms.ddpg_q_schedule import DDPG_BD
    elif config['agent_alg'] == 'MADDPG_BD':
        from olbr.algorithms.maddpg_q_schedule import DDPG_BD
    MODEL = DDPG_BD

    #exploration initialization
    noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(
        observation_space,
        action_space,
        optimizer,
        Actor,
        Critic,
        loss_func,
        GAMMA,
        TAU,
        out_func=OUT_FUNC,
        discrete=False,
        regularization=REGULARIZATION,
        normalized_rewards=NORMALIZED_REWARDS,
        reward_fun=reward_fun,
        clip_Q_neg=clip_Q_neg,
        nb_critics=config['max_nb_objects']  #or fixing to 3
    )

    model.n_objects = config['max_nb_objects']

    class NormalizerObj(object):
        def __init__(self, mean, std):
            self.mean = mean
            self.std = std

        def process(self, achieved, desired, step):
            achieved_out = achieved - K.tensor(
                self.mean[0], dtype=achieved.dtype, device=achieved.device)
            achieved_out /= K.tensor(self.std[0],
                                     dtype=achieved.dtype,
                                     device=achieved.device)

            desired_out = desired - K.tensor(
                self.mean[1], dtype=desired.dtype, device=desired.device)
            desired_out /= K.tensor(self.std[1],
                                    dtype=desired.dtype,
                                    device=desired.device)

            step_out = step - K.tensor(
                self.mean[2], dtype=desired.dtype, device=desired.device)
            step_out /= K.tensor(self.std[2],
                                 dtype=desired.dtype,
                                 device=desired.device)

            return achieved_out, desired_out, step_out

    normalizer = [Normalizer(), Normalizer(), NormalizerObj(obj_mean, obj_std)]

    model.obj_traj = obj_traj.to('cuda')
    model.obj_traj.eval()

    for _ in range(1):
        state_all = dummy_env.reset()
        for i_step in range(config['episode_length']):

            model.to_cpu()

            obs = [
                K.tensor(obs, dtype=K.float32).unsqueeze(0)
                for obs in state_all['observation']
            ]
            goal = K.tensor(state_all['desired_goal'],
                            dtype=K.float32).unsqueeze(0)
            if i_step == 0:
                objtraj_goal = goal

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[0], obs[1], objtraj_goal], dim=-1))
            if normalizer[0] is not None:
                obs_goal[0] = normalizer[0].preprocess_with_update(obs_goal[0])

            if config['agent_alg'] == 'DDPG_BD':
                action = model.select_action(obs_goal[0],
                                             noise).cpu().numpy().squeeze(0)
            elif config['agent_alg'] == 'MADDPG_BD':
                action = model.select_action(
                    obs_goal[0], noise,
                    goal_size=goal.shape[1]).cpu().numpy().squeeze(0)
            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            action_to_env[0:action.shape[0]] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions(
            'future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions(
            'none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (config['episode_length'],
              dummy_env.observation_space.spaces['observation'].shape[1] * 3),
        'ag': (config['episode_length'],
               dummy_env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (config['episode_length'],
              dummy_env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (config['episode_length'] - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'],
                          sample_her_transitions)

    experiment_args = ((envs, envs_test, envs_render), memory, noise, config,
                       normalizer, None)

    print("0.20 - 0.25 - boundary_sample iff original_goal update norm")

    return model, experiment_args
Example #3
def init(config,
         agent='robot',
         her=False,
         object_Qfunc=None,
         backward_dyn=None,
         object_policy=None,
         reward_fun=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        env = gym.make(ENV_NAME,
                       n_objects=config['max_nb_objects'],
                       obj_action_type=config['obj_action_type'],
                       observe_obj_grp=config['observe_obj_grp'])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        env = gym.make(ENV_NAME)

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2,
                                  desired_goal=g,
                                  info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    #if config['obj_action_type'] == 'all':
    #    n_actions = config['max_nb_objects'] * 7 + 4
    #elif config['obj_action_type'] == 'slide_only':
    #    n_actions = config['max_nb_objects'] * 3 + 4
    #elif config['obj_action_type'] == 'rotation_only':
    #    n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[
        1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions

    #exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = (Noise(action_space[0].shape[0], sigma=0.2, eps=0.3),
                 Noise(action_space[1].shape[0], sigma=0.2, eps=0.3))

        env._max_episode_steps *= config['max_nb_objects']
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)
        noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space,
                  action_space,
                  optimizer,
                  Actor,
                  Critic,
                  loss_func,
                  GAMMA,
                  TAU,
                  out_func=OUT_FUNC,
                  discrete=False,
                  regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS,
                  agent_id=agent_id,
                  object_Qfunc=object_Qfunc,
                  backward_dyn=backward_dyn,
                  object_policy=object_policy,
                  reward_fun=reward_fun,
                  masked_with_r=config['masked_with_r'])
    normalizer = [Normalizer(), Normalizer()]

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions(
            'future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions(
            'none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (env._max_episode_steps,
              env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps,
               env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps,
              env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = (ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps,
                           sample_her_transitions),
              ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps,
                           sample_her_transitions))

    experiment_args = (env, memory, noise, config, normalizer, agent_id)

    print('singleseeding')

    return model, experiment_args
Example #4
def init(config):

    if config['resume'] != '':
        resume_path = config['resume']
        saver = Saver(config)
        config, start_episode, save_dict = saver.resume_ckpt()
        config['resume'] = resume_path
    else:
        start_episode = 0

    #hyperparameters
    ENV_NAME = config['env_id']  #'simple_spread'
    SEED = config['random_seed']  # 1

    GAMMA = config['gamma']  # 0.95
    TAU = config['tau']  # 0.01

    ACTOR_LR = config['plcy_lr']  # 0.01
    CRITIC_LR = config['crtc_lr']  # 0.01

    MEM_SIZE = config['buffer_length']  # 1000000

    REGULARIZATION = config['regularization']  # True
    NORMALIZED_REWARDS = config['reward_normalization']  # True

    if (ENV_NAME == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1'):
        env = gym.make(ENV_NAME,
                       n_objects=config['max_nb_objects'],
                       obj_action_type=config['obj_action_type'])
    else:
        env = gym.make(ENV_NAME)
    env.seed(SEED)

    if config['obj_action_type'] == 'all':
        n_actions = config['max_nb_objects'] * 7 + 4
    elif config['obj_action_type'] == 'slide_only':
        n_actions = config['max_nb_objects'] * 3 + 4
    elif config['obj_action_type'] == 'rotation_only':
        n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[
        1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4, ), dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - 4, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))
    if env.action_space.low[0] == -1 and env.action_space.high[0] == 1:
        OUT_FUNC = K.tanh
    elif env.action_space.low[0] == 0 and env.action_space.high[0] == 1:
        OUT_FUNC = K.sigmoid
    else:
        OUT_FUNC = K.sigmoid

    K.manual_seed(SEED)
    np.random.seed(SEED)

    if config['agent_alg'] == 'MADDPG':
        MODEL = MADDPG
    elif config['agent_alg'] == 'DDPG':
        MODEL = DDPG
    elif config['agent_alg'] == 'MADDPG_R':
        MODEL = MADDPG_R
    elif config['agent_alg'] == 'MADDPG_RAE':
        MODEL = MADDPG_RAE

    if config['verbose'] > 1:
        # utils
        summaries = (Summarizer(config['dir_summary_train'], config['port'],
                                config['resume']),
                     Summarizer(config['dir_summary_test'], config['port'],
                                config['resume']))
        saver = Saver(config)
    else:
        summaries = None
        saver = None

    #exploration initialization
    noise = (Noise(action_space[0].shape[0], sigma=0.2, eps=0.3),
             Noise(action_space[1].shape[0], sigma=0.05, eps=0.1))
    #noise = OUNoise(action_space.shape[0])

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space,
                  action_space,
                  optimizer,
                  Actor,
                  Critic,
                  loss_func,
                  GAMMA,
                  TAU,
                  out_func=OUT_FUNC,
                  discrete=False,
                  regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS)

    if config['resume'] != '':
        for i, param in enumerate(save_dict['model_params']):
            model.entities[i].load_state_dict(param)

    #memory initialization
    #memory = ReplayMemory(MEM_SIZE)
    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2,
                                  desired_goal=g,
                                  info=info)

    sample_her_transitions = make_sample_her_transitions(
        'future', 4, reward_fun)
    buffer_shapes = {
        'o': (env._max_episode_steps,
              env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps,
               env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps,
              env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps,
                          sample_her_transitions)

    normalizer = (Normalizer(), Normalizer())

    experiment_args = (env, memory, noise, config, summaries, saver,
                       start_episode, normalizer)

    return model, experiment_args
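
Example #4 is the only variant whose init takes nothing but a config dictionary, which makes it the easiest one to sketch a call against. The keys below mirror the lookups made inside that function; the concrete values and the 'FetchPushMulti-v1' environment id are illustrative placeholders, not settings taken from the original project.

# Hypothetical config for Example #4's init(config); keys follow the lookups in the
# function body, values are placeholders chosen only for illustration.
config = {
    'resume': '',                     # empty string: start fresh, nothing to resume
    'env_id': 'FetchPushMulti-v1',
    'random_seed': 1,
    'gamma': 0.98,
    'tau': 0.05,
    'plcy_lr': 1e-3,
    'crtc_lr': 1e-3,
    'buffer_length': 1000000,
    'regularization': True,
    'reward_normalization': False,
    'max_nb_objects': 1,
    'obj_action_type': 'slide_only',  # this variant expects 'all', 'slide_only' or 'rotation_only'
    'agent_alg': 'DDPG',
    'verbose': 0,                     # > 1 would additionally build Summarizer/Saver utilities
}

model, experiment_args = init(config)
(env, memory, noise, config, summaries, saver,
 start_episode, normalizer) = experiment_args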
Example #5
def init(config,
         agent='robot',
         her=False,
         object_Qfunc=None,
         backward_dyn=None,
         object_policy=None,
         reward_fun=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', ai_object=False):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id,
                               n_objects=config['max_nb_objects'],
                               obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'],
                               obj_range=config['obj_range'])
            elif env_type == 'Hand':
                env = gym.make(env_id,
                               obj_action_type=config['obj_action_type'])
            elif env_type == 'Others':
                env = gym.make(env_id)

            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED + 10 * i_env)
            env.unwrapped.ai_object = ai_object
            return env

        return _f

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Flex' not in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Fetch', agent == 'object')
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Fetch', agent == 'object')
            for i_env in range(1)
        ])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Flex' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             n_objects=config['max_nb_objects'],
                             obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'],
                             obj_range=config['obj_range'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Fetch', agent == 'object')
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Fetch', agent == 'object')
            for i_env in range(1)
        ])
        n_rob_actions = 4
        n_actions = 2 * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME,
                             obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Hand', agent == 'object')
            for i_env in range(N_ENVS)
        ])
        envs_render = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Hand', agent == 'object')
            for i_env in range(1)
        ])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv([
            make_env(ENV_NAME, i_env, 'Others', agent == 'object')
            for i_env in range(N_ENVS)
        ])
        envs_render = None

    def her_reward_fun(ag_2, g, info):  # vectorized
        return dummy_env.compute_reward(achieved_goal=ag_2,
                                        desired_goal=g,
                                        info=info)

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = dummy_env.observation_space.spaces['observation'].shape[
        1] + dummy_env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions

    #exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)
        noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.2)
    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space,
                  action_space,
                  optimizer,
                  Actor,
                  Critic,
                  loss_func,
                  GAMMA,
                  TAU,
                  out_func=OUT_FUNC,
                  discrete=False,
                  regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS,
                  agent_id=agent_id,
                  object_Qfunc=object_Qfunc,
                  backward_dyn=backward_dyn,
                  object_policy=object_policy,
                  reward_fun=reward_fun,
                  clip_Q_neg=clip_Q_neg,
                  goal_space=dummy_env.reset()['desired_goal'].shape[0])
    normalizer = [Normalizer(), Normalizer()]

    for _ in range(1):
        state_all = dummy_env.reset()
        for _ in range(config['episode_length']):

            model.to_cpu()

            obs = [
                K.tensor(obs, dtype=K.float32).unsqueeze(0)
                for obs in state_all['observation']
            ]
            goal = K.tensor(state_all['desired_goal'],
                            dtype=K.float32).unsqueeze(0)

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[agent_id], goal], dim=-1))
            if normalizer[agent_id] is not None:
                obs_goal[0] = normalizer[agent_id].preprocess_with_update(
                    obs_goal[0])

            action = model.select_action(obs_goal[0],
                                         noise).cpu().numpy().squeeze(0)
            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            if agent_id == 0:
                action_to_env[0:action.shape[0]] = action
            else:
                action_to_env[-action.shape[0]::] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions(
            'future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions(
            'none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (config['episode_length'],
              dummy_env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (config['episode_length'],
               dummy_env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (config['episode_length'],
              dummy_env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (config['episode_length'] - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'],
                          sample_her_transitions)

    experiment_args = ((envs, envs_render), memory, noise, config, normalizer,
                       agent_id)

    return model, experiment_args
Example #6
def init(config,
         agent='robot',
         her=False,
         object_Qfunc=None,
         backward_dyn=None,
         object_policy=None,
         reward_fun=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if (ENV_NAME
            == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1') or (
                ENV_NAME == 'FetchPickAndPlaceMulti-v1'):
        env = gym.make(ENV_NAME,
                       n_objects=config['max_nb_objects'],
                       obj_action_type=config['obj_action_type'],
                       observe_obj_grp=config['observe_obj_grp'])
    else:
        env = gym.make(ENV_NAME)

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2,
                                  desired_goal=g,
                                  info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    #if config['obj_action_type'] == 'all':
    #    n_actions = config['max_nb_objects'] * 7 + 4
    #elif config['obj_action_type'] == 'slide_only':
    #    n_actions = config['max_nb_objects'] * 3 + 4
    #elif config['obj_action_type'] == 'rotation_only':
    #    n_actions = config['max_nb_objects'] * 4 + 4
    n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + 4

    observation_space = env.observation_space.spaces['observation'].shape[
        1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4, ), dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - 4, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        OUT_FUNC = K.tanh
        from olbr.agents.basic import Actor
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        OUT_FUNC = K.tanh
        from olbr.agents.basic import Actor
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions
    elif config['agent_alg'] == 'PPO_BD':
        MODEL = PPO_BD
        OUT_FUNC = 'linear'
        from olbr.agents.basic import ActorStoch as Actor
        from olbr.replay_buffer import RolloutStorage as ReplayBuffer

    #exploration initialization
    if agent == 'robot':
        agent_id = 0
        if config['agent_alg'] == 'PPO_BD':
            noise = True
        else:
            noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)
        noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    if config['agent_alg'] == 'PPO_BD':
        model = MODEL(observation_space,
                      action_space,
                      optimizer,
                      Actor,
                      Critic,
                      config['clip_param'],
                      config['ppo_epoch'],
                      config['n_batches'],
                      config['value_loss_coef'],
                      config['entropy_coef'],
                      eps=config['eps'],
                      max_grad_norm=config['max_grad_norm'],
                      use_clipped_value_loss=True,
                      out_func=OUT_FUNC,
                      discrete=False,
                      agent_id=agent_id,
                      object_Qfunc=object_Qfunc,
                      backward_dyn=backward_dyn,
                      object_policy=object_policy,
                      reward_fun=reward_fun,
                      masked_with_r=config['masked_with_r'])
    else:
        model = MODEL(observation_space,
                      action_space,
                      optimizer,
                      Actor,
                      Critic,
                      loss_func,
                      GAMMA,
                      TAU,
                      out_func=OUT_FUNC,
                      discrete=False,
                      regularization=REGULARIZATION,
                      normalized_rewards=NORMALIZED_REWARDS,
                      agent_id=agent_id,
                      object_Qfunc=object_Qfunc,
                      backward_dyn=backward_dyn,
                      object_policy=object_policy,
                      reward_fun=reward_fun,
                      masked_with_r=config['masked_with_r'])

    normalizer = [Normalizer(), Normalizer()]

    #memory initialization
    if config['agent_alg'] == 'PPO_BD':
        memory = ReplayBuffer(env._max_episode_steps - 1, config['n_rollouts'],
                              (observation_space, ), action_space[0])
    else:
        if her:
            sample_her_transitions = make_sample_her_transitions(
                'future', 4, her_reward_fun)
        else:
            sample_her_transitions = make_sample_her_transitions(
                'none', 4, her_reward_fun)

        buffer_shapes = {
            'o': (env._max_episode_steps,
                  env.observation_space.spaces['observation'].shape[1] * 2),
            'ag': (env._max_episode_steps,
                   env.observation_space.spaces['achieved_goal'].shape[0]),
            'g': (env._max_episode_steps,
                  env.observation_space.spaces['desired_goal'].shape[0]),
            'u': (env._max_episode_steps - 1, action_space[2].shape[0]),
            'r': (env._max_episode_steps - 1, 1)
        }
        memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps,
                              sample_her_transitions)

    experiment_args = (env, memory, noise, config, normalizer, agent_id)

    print('clipped between -1 and 0, and masked with abs(r), and + r')

    return model, experiment_args
Example #7
def init(config,
         agent='robot',
         her=False,
         object_Qfunc=None,
         backward_dyn=None,
         object_policy=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if (ENV_NAME == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1'):
        env = gym.make(ENV_NAME,
                       n_objects=config['max_nb_objects'],
                       obj_action_type=config['obj_action_type'])
    else:
        env = gym.make(ENV_NAME)

    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2,
                                  desired_goal=g,
                                  info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    if config['obj_action_type'] == 'all':
        n_actions = config['max_nb_objects'] * 7 + 4
    elif config['obj_action_type'] == 'slide_only':
        n_actions = config['max_nb_objects'] * 3 + 4
    elif config['obj_action_type'] == 'rotation_only':
        n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[
        1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4, ), dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - 4, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    MODEL = DDPG_BD

    #exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space,
                  action_space,
                  optimizer,
                  Actor,
                  Critic,
                  loss_func,
                  GAMMA,
                  TAU,
                  out_func=OUT_FUNC,
                  discrete=False,
                  regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS,
                  agent_id=agent_id,
                  object_Qfunc=object_Qfunc,
                  backward_dyn=backward_dyn,
                  object_policy=object_policy)
    normalizer = [Normalizer(), Normalizer()]

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions(
            'future', 4, reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions(
            'none', 4, reward_fun)

    buffer_shapes = {
        'o': (env._max_episode_steps,
              env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps,
               env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps,
              env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps,
                          sample_her_transitions)

    experiment_args = (env, memory, noise, config, normalizer, agent_id)

    return model, experiment_args
Example #8
def init(config,
         agent='robot',
         her=False,
         object_Qfunc=None,
         backward_dyn=None,
         object_policy=None,
         reward_fun=None):

    #hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    env = []
    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(
                gym.make(ENV_NAME,
                         n_objects=config['max_nb_objects'],
                         obj_action_type=config['obj_action_type'],
                         observe_obj_grp=config['observe_obj_grp'],
                         obj_range=config['obj_range']))
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(
            config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(
                gym.make(ENV_NAME, obj_action_type=config['obj_action_type']))
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME))

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env[0].compute_reward(achieved_goal=ag_2,
                                     desired_goal=g,
                                     info=info)

    for i_env in range(N_ENVS):
        env[i_env].seed(SEED + 10 * i_env)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = env[0].observation_space.spaces['observation'].shape[
        1] + env[0].observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions - n_rob_actions, ),
                                   dtype='float32'),
                    gym.spaces.Box(-1.,
                                   1.,
                                   shape=(n_actions, ),
                                   dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']

    MEM_SIZE = config['buffer_length']

    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    if config['agent_alg'] == 'PPO_BD':
        MODEL = PPO_BD
        OUT_FUNC = 'linear'
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
        from olbr.replay_buffer import RolloutStorage

    #exploration initialization
    env[0]._max_episode_steps *= config['max_nb_objects']
    noise = (True, Noise(action_space[1].shape[0], sigma=0.2, eps=0.3))

    #model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR)
                 )  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space,
                  action_space,
                  optimizer,
                  Actor,
                  Critic,
                  config['clip_param'],
                  config['ppo_epoch'],
                  config['n_batches'],
                  config['value_loss_coef'],
                  config['entropy_coef'],
                  eps=config['eps'],
                  max_grad_norm=config['max_grad_norm'],
                  use_clipped_value_loss=True,
                  out_func=OUT_FUNC,
                  discrete=False,
                  agent_id=0,
                  object_Qfunc=object_Qfunc,
                  backward_dyn=backward_dyn,
                  object_policy=object_policy,
                  reward_fun=reward_fun,
                  masked_with_r=config['masked_with_r'])
    normalizer = [Normalizer(), Normalizer()]

    #memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions(
            'future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions(
            'none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (env[0]._max_episode_steps,
              env[0].observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env[0]._max_episode_steps,
               env[0].observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env[0]._max_episode_steps,
              env[0].observation_space.spaces['desired_goal'].shape[0]),
        'u': (env[0]._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = (RolloutStorage(env[0]._max_episode_steps - 1,
                             config['n_rollouts'], (observation_space, ),
                             action_space[0]),
              ReplayBuffer(buffer_shapes, MEM_SIZE, env[0]._max_episode_steps,
                           sample_her_transitions))

    experiment_args = (env, memory, noise, config, normalizer, 0)

    return model, experiment_args