Example #1
    def act(self, obs):
        with tx.device_scope(self.gpu_ids):
            if self.sleep_time > 0.0:
                time.sleep(self.sleep_time)
            if not self.frame_stack_concatenate_on_env:
                # The environment outputs pixel observations as a list of
                # frames; concatenate them into a single numpy array.
                obs = copy.deepcopy(obs)
                if 'pixel' in obs:
                    for key in obs['pixel']:
                        obs['pixel'][key] = np.concatenate(obs['pixel'][key],
                                                           axis=0)
            # Convert to pytorch tensor
            obs_tensor = collections.OrderedDict()
            for modality in obs:
                modality_dict = collections.OrderedDict()
                for key in obs[modality]:
                    modality_dict[key] = torch.tensor(
                        obs[modality][key], dtype=torch.float32).unsqueeze(0)
                obs_tensor[modality] = modality_dict
            action, _ = self.model(obs_tensor, calculate_value=False)
            if self.param_noise and self.param_noise_type == 'adaptive_normal':
                self.param_noise.compute_action_distance(obs_tensor, action)
            action = action.data.cpu().numpy()[0]

            action = action.clip(-1, 1)

            if self.agent_mode != 'eval_deterministic':
                action += self.noise()

            action = action.clip(-1, 1)
            return action
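The observation handed to act() above is a nested dict of modality -> key -> array; when frame_stack_concatenate_on_env is False, each pixel entry arrives as a list of frames that is concatenated along the channel axis before the tensor conversion. Below is a standalone sketch of just that conversion with illustrative shapes (the 84x84 RGB frames and the 'camera0' key are assumptions, not part of the project):

import collections

import numpy as np
import torch

# A fake observation: one 'pixel' modality whose value is a list of 3 RGB frames.
obs = {'pixel': {'camera0': [np.zeros((3, 84, 84), dtype=np.uint8)] * 3}}

# Concatenate the frame list along the channel axis, as the agent above does.
for key in obs['pixel']:
    obs['pixel'][key] = np.concatenate(obs['pixel'][key], axis=0)

# Convert every array to a float32 tensor with a batch dimension of 1.
obs_tensor = collections.OrderedDict()
for modality in obs:
    obs_tensor[modality] = collections.OrderedDict(
        (k, torch.tensor(v, dtype=torch.float32).unsqueeze(0))
        for k, v in obs[modality].items())

print(obs_tensor['pixel']['camera0'].shape)  # torch.Size([1, 9, 84, 84])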
Example #2
def mytest_data_parallel(devices):
    """
    Run this in a new process: otherwise the GPU memory reported by nvidia-smi
    cannot be emptied, and pytest does not work with multiprocessing.
    """
    dtype = torch.double
    with tx.device_scope(devices, dtype=dtype):
        net = MyNet(19, 78, 25)
        x = torch.empty(64, 19).uniform_(0, 0.1)
        y = torch.randn(64, 25)
        z = y - net(x)

        devices = tx.ids_to_devices(devices)
        assert z.device == devices[0]
        assert z.dtype == dtype
        # GPUs that DataParallel did not allocate should report 0 memory usage
        # os.system('nvidia-smi')
        should_have_mem = [tx.device_to_int(d) for d in devices]
        actual_mems = tx.cuda_memory('all', mode='cache', unit='kb')
        print('IDs', should_have_mem, '\tmemory:', actual_mems)
        for i, actual_mem in enumerate(actual_mems):
            if i in should_have_mem:
                assert actual_mem > 0, ('device', i, actual_mem)
            else:
                assert actual_mem == 0, ('device', i, actual_mem)
Example #3
    def act(self, obs):
        '''
            Returns an action based on the input observation. If in training
            mode, also returns action infos, which include the current
            probability distribution, RNN hidden states, etc.
            Args:
                obs: numpy array of (1, obs_dim)

            Returns:
                action_choice: sampled or max-likelihood action to feed to the env
                action_info: list of auxiliary information - [onetime, persistent]
                    Note: this includes the probability distribution the action
                    is sampled from and the RNN hidden states
        '''
        # Note: we collect two kinds of action info, one onetime and one persistent.
        # Persistent info is collected for every step in the rollout (e.g. the policy
        # probability distribution); onetime info is collected only for the first
        # step of a partial trajectory (e.g. the RNN hidden state). See
        # ExpSenderWrapperMultiStepMovingWindowWithInfo in exp_sender_wrapper for details.
        action_info = [[], []]

        with tx.device_scope(self.gpu_ids):
            obs_tensor = {}
            for mod in obs.keys():
                obs_tensor[mod] = {}
                for k in obs[mod].keys():
                    obs_tensor[mod][k] = torch.tensor(
                        obs[mod][k], dtype=torch.float32).unsqueeze(0)

            if self.rnn_config.if_rnn_policy:
                action_info[0].append(self.cells[0].squeeze(1).cpu().numpy())
                action_info[0].append(self.cells[1].squeeze(1).cpu().numpy())

            action_pd, self.cells = self.model.forward_actor_expose_cells(
                obs_tensor, self.cells)
            action_pd = action_pd.detach().cpu().numpy()
            action_pd[:, self.action_dim:] *= np.exp(self.noise)

            if self.agent_mode != 'eval_deterministic':
                action_choice = self.pd.sample(action_pd)
            else:
                action_choice = self.pd.maxprob(action_pd)
            np.clip(action_choice, -1, 1, out=action_choice)

            action_choice = action_choice.reshape((-1, ))
            action_pd = action_pd.reshape((-1, ))
            action_info[1].append(action_pd)

            if self.env_config.action_spec['type'] == 'discrete':
                action_choice = np.argmax(action_choice)

            if self.agent_mode != 'training':
                return action_choice
            else:
                time.sleep(self.env_config.sleep_time)
                return action_choice, action_info
Example #4
File: ddpg.py  Project: wwxFromTju/surreal
    def preprocess(self, batch):
        '''
        Override of learner/base/preprocess. Before learn() is called,
        preprocess() takes the batch and converts the numpy arrays to PyTorch
        tensors. Note that this operation transfers the data to the GPU if a
        GPU is used.

        Arguments:
            batch: a batch of numpy arrays from the replay memory
        '''
        # Convert all numpy arrays to PyTorch tensors and transfer them to the
        # GPU if applicable.
        with tx.device_scope(self.gpu_ids):
            obs, actions, rewards, obs_next, done = (batch['obs'],
                                                     batch['actions'],
                                                     batch['rewards'],
                                                     batch['obs_next'],
                                                     batch['dones'])
            device_name = 'cpu'
            if self._num_gpus > 0:
                device_name = 'cuda'

            for modality in obs:
                for key in obs[modality]:
                    if modality == 'pixel':
                        obs[modality][key] = (torch.tensor(
                            obs[modality][key], dtype=torch.uint8).to(
                                torch.device(device_name))).float().detach()
                    else:
                        obs[modality][key] = (torch.tensor(
                            obs[modality][key], dtype=torch.float32).to(
                                torch.device(device_name))).detach()

            for modality in obs_next:
                for key in obs_next[modality]:
                    if modality == 'pixel':
                        obs_next[modality][key] = (torch.tensor(
                            obs_next[modality][key], dtype=torch.uint8).to(
                                torch.device(device_name))).float().detach()
                    else:
                        obs_next[modality][key] = (torch.tensor(
                            obs_next[modality][key], dtype=torch.float32).to(
                                torch.device(device_name))).detach()

            actions = torch.tensor(actions, dtype=torch.float32).to(
                torch.device(device_name))
            rewards = torch.tensor(rewards, dtype=torch.float32).to(
                torch.device(device_name))
            done = torch.tensor(done, dtype=torch.float32).to(
                torch.device(device_name))

            (batch['obs'], batch['actions'], batch['rewards'],
             batch['obs_next'], batch['dones']) = (obs, actions, rewards,
                                                   obs_next, done)
            return batch
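A note on the uint8/float32 split above: pixel observations presumably stay uint8 in the replay to keep memory and transfer cost low, and are cast to float32 only after being moved to the target device. A minimal standalone sketch of that two-step move (batch shape and device selection are illustrative assumptions):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A compact uint8 pixel batch, e.g. 32 observations of 9x84x84.
pixels = torch.randint(0, 256, (32, 9, 84, 84), dtype=torch.uint8)

# Transfer the small uint8 tensor first, then cast to float32 on the device.
pixels = pixels.to(device).float().detach()
print(pixels.dtype, pixels.device)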
Example #5
    def reset(self):
        '''
            Reset LSTM hidden and cell states.
        '''
        if self.rnn_config.if_rnn_policy:
            # Note that .detach() is necessary here to keep memory bounded:
            # otherwise a rollout thousands of steps long would keep all
            # previously accumulated hidden/cell states from being freed.
            with tx.device_scope(self.gpu_ids):
                self.cells = (torch.zeros(self.rnn_config.rnn_layer,
                                          1,  # batch_size is 1
                                          self.rnn_config.rnn_hidden).detach(),
                              torch.zeros(self.rnn_config.rnn_layer,
                                          1,  # batch_size is 1
                                          self.rnn_config.rnn_hidden).detach())
Example #6
    def __init__(self,
                 learner_config,
                 env_config,
                 session_config,
                 agent_id,
                 agent_mode,
                 render=False):
        super().__init__(
            learner_config=learner_config,
            env_config=env_config,
            session_config=session_config,
            agent_id=agent_id,
            agent_mode=agent_mode,
            render=render,
        )
        self.action_dim = self.env_config.action_spec.dim[0]
        self.obs_spec = self.env_config.obs_spec
        self.use_z_filter = self.learner_config.algo.use_z_filter

        self.init_log_sig = self.learner_config.algo.consts.init_log_sig
        self.log_sig_range = self.learner_config.algo.consts.log_sig_range

        # setting agent mode
        if self.agent_mode != 'training':
            if self.env_config.stochastic_eval:
                self.agent_mode = 'eval_stochastic'
            else:
                self.agent_mode = 'eval_deterministic'

        if self.agent_mode != 'training':
            self.noise = 0
        else:
            self.noise = np.random.uniform(low=-self.log_sig_range,
                                           high=self.log_sig_range)
        self.rnn_config = self.learner_config.algo.rnn

        # GPU setup
        # TODO: deprecate
        self._num_gpus = session_config.agent.num_gpus

        if torch.cuda.is_available():
            self.gpu_ids = 'cuda:all'
            self.log.info('PPO agent is using GPU')
            # Note that the user is responsible for providing only one GPU to the program
            self.log.info('cudnn version: {}'.format(
                torch.backends.cudnn.version()))
            torch.backends.cudnn.benchmark = True
        else:
            self.gpu_ids = 'cpu'
            self.log.info('PPO agent is using CPU')

        self.pd = DiagGauss(self.action_dim)
        self.cells = None

        with tx.device_scope(self.gpu_ids):
            if self.rnn_config.if_rnn_policy:
                # Note that .detach() is necessary here to keep memory bounded:
                # otherwise a rollout thousands of steps long would keep all
                # previously accumulated hidden/cell states from being freed.
                self.cells = (
                    torch.zeros(
                        self.rnn_config.rnn_layer,
                        1,  # batch_size is 1
                        self.rnn_config.rnn_hidden).detach(),
                    torch.zeros(
                        self.rnn_config.rnn_layer,
                        1,  # batch_size is 1
                        self.rnn_config.rnn_hidden).detach())

            self.model = PPOModel(
                obs_spec=self.obs_spec,
                action_dim=self.action_dim,
                model_config=self.learner_config.model,
                use_cuda=False,
                init_log_sig=self.init_log_sig,
                use_z_filter=self.use_z_filter,
                if_pixel_input=self.env_config.pixel_input,
                rnn_config=self.rnn_config,
            )
Example #7
File: ddpg.py  Project: wwxFromTju/surreal
    def __init__(self, learner_config, env_config, session_config):
        super().__init__(learner_config, env_config, session_config)

        self.current_iteration = 0

        # load multiple optimization instances onto a single gpu
        self.batch_size = self.learner_config.replay.batch_size
        self.discount_factor = self.learner_config.algo.gamma
        self.n_step = self.learner_config.algo.n_step
        self.is_pixel_input = self.env_config.pixel_input
        self.use_layernorm = self.learner_config.model.use_layernorm
        self.use_double_critic = self.learner_config.algo.network.use_double_critic
        self.use_action_regularization = self.learner_config.algo.network.use_action_regularization

        self.frame_stack_concatenate_on_env = self.env_config.frame_stack_concatenate_on_env

        self.log.info('Initializing DDPG learner')
        self._num_gpus = session_config.learner.num_gpus
        if not torch.cuda.is_available():
            self.gpu_ids = 'cpu'
            self.log.info('Using CPU')
        else:
            self.gpu_ids = 'cuda:all'
            self.log.info('Using GPU')
            self.log.info('cudnn version: {}'.format(
                torch.backends.cudnn.version()))
            torch.backends.cudnn.benchmark = True
            self._num_gpus = 1

        with tx.device_scope(self.gpu_ids):
            self._target_update_init()

            self.clip_actor_gradient = self.learner_config.algo.network.clip_actor_gradient
            if self.clip_actor_gradient:
                self.actor_gradient_clip_value = self.learner_config.algo.network.actor_gradient_value_clip
                self.log.info('Clipping actor gradient at {}'.format(
                    self.actor_gradient_clip_value))

            self.clip_critic_gradient = self.learner_config.algo.network.clip_critic_gradient
            if self.clip_critic_gradient:
                self.critic_gradient_clip_value = self.learner_config.algo.network.critic_gradient_value_clip
                self.log.info('Clipping critic gradient at {}'.format(
                    self.critic_gradient_clip_value))

            self.action_dim = self.env_config.action_spec.dim[0]
            self.model = DDPGModel(
                obs_spec=self.env_config.obs_spec,
                action_dim=self.action_dim,
                use_layernorm=self.use_layernorm,
                actor_fc_hidden_sizes=self.learner_config.model.
                actor_fc_hidden_sizes,
                critic_fc_hidden_sizes=self.learner_config.model.
                critic_fc_hidden_sizes,
                conv_out_channels=self.learner_config.model.conv_spec.
                out_channels,
                conv_kernel_sizes=self.learner_config.model.conv_spec.
                kernel_sizes,
                conv_strides=self.learner_config.model.conv_spec.strides,
                conv_hidden_dim=self.learner_config.model.conv_spec.
                hidden_output_dim,
            )

            self.model_target = DDPGModel(
                obs_spec=self.env_config.obs_spec,
                action_dim=self.action_dim,
                use_layernorm=self.use_layernorm,
                actor_fc_hidden_sizes=self.learner_config.model.
                actor_fc_hidden_sizes,
                critic_fc_hidden_sizes=self.learner_config.model.
                critic_fc_hidden_sizes,
                conv_out_channels=self.learner_config.model.conv_spec.
                out_channels,
                conv_kernel_sizes=self.learner_config.model.conv_spec.
                kernel_sizes,
                conv_strides=self.learner_config.model.conv_spec.strides,
                conv_hidden_dim=self.learner_config.model.conv_spec.
                hidden_output_dim,
            )

            if self.use_double_critic:
                self.model2 = DDPGModel(
                    obs_spec=self.env_config.obs_spec,
                    action_dim=self.action_dim,
                    use_layernorm=self.use_layernorm,
                    actor_fc_hidden_sizes=self.learner_config.model.
                    actor_fc_hidden_sizes,
                    critic_fc_hidden_sizes=self.learner_config.model.
                    critic_fc_hidden_sizes,
                    conv_out_channels=self.learner_config.model.conv_spec.
                    out_channels,
                    conv_kernel_sizes=self.learner_config.model.conv_spec.
                    kernel_sizes,
                    conv_strides=self.learner_config.model.conv_spec.strides,
                    conv_hidden_dim=self.learner_config.model.conv_spec.
                    hidden_output_dim,
                    critic_only=True,
                )

                self.model_target2 = DDPGModel(
                    obs_spec=self.env_config.obs_spec,
                    action_dim=self.action_dim,
                    use_layernorm=self.use_layernorm,
                    actor_fc_hidden_sizes=self.learner_config.model.
                    actor_fc_hidden_sizes,
                    critic_fc_hidden_sizes=self.learner_config.model.
                    critic_fc_hidden_sizes,
                    conv_out_channels=self.learner_config.model.conv_spec.
                    out_channels,
                    conv_kernel_sizes=self.learner_config.model.conv_spec.
                    kernel_sizes,
                    conv_strides=self.learner_config.model.conv_spec.strides,
                    conv_hidden_dim=self.learner_config.model.conv_spec.
                    hidden_output_dim,
                    critic_only=True,
                )

            self.critic_criterion = nn.MSELoss()

            self.log.info('Using Adam for critic with learning rate {}'.format(
                self.learner_config.algo.network.lr_critic))
            self.critic_optim = torch.optim.Adam(
                self.model.get_critic_parameters(),
                lr=self.learner_config.algo.network.lr_critic,
                weight_decay=self.learner_config.algo.network.
                critic_regularization  # Weight regularization term
            )

            self.log.info('Using Adam for actor with learning rate {}'.format(
                self.learner_config.algo.network.lr_actor))
            self.actor_optim = torch.optim.Adam(
                self.model.get_actor_parameters(),
                lr=self.learner_config.algo.network.lr_actor,
                weight_decay=self.learner_config.algo.network.
                actor_regularization  # Weight regularization term
            )

            if self.use_double_critic:
                self.log.info(
                    'Using Adam for critic with learning rate {}'.format(
                        self.learner_config.algo.network.lr_critic))
                self.critic_optim2 = torch.optim.Adam(
                    self.model2.get_critic_parameters(),
                    lr=self.learner_config.algo.network.lr_critic,
                    weight_decay=self.learner_config.algo.network.
                    critic_regularization  # Weight regularization term
                )

            self.log.info('Using {}-step bootstrapped return'.format(
                self.learner_config.algo.n_step))
            self.frame_stack_preprocess = FrameStackPreprocessor(
                self.env_config.frame_stacks)
            self.aggregator = SSARAggregator(self.env_config.obs_spec,
                                             self.env_config.action_spec)

            self.model_target.actor.hard_update(self.model.actor)
            self.model_target.critic.hard_update(self.model.critic)

            if self.use_double_critic:
                self.model_target2.critic.hard_update(self.model2.critic)

            self.total_learn_time = U.TimeRecorder()
            self.forward_time = U.TimeRecorder()
            self.critic_update_time = U.TimeRecorder()
            self.actor_update_time = U.TimeRecorder()
Example #8
File: ddpg.py  Project: wwxFromTju/surreal
    def _optimize(self, obs, actions, rewards, obs_next, done):
        '''
        Note that while the replay contains uint8, the
        aggregator returns float32 tensors

        Arguments:
            obs: an observation from the minibatch, often denoted s_n in the
                literature. Dimensionality: (N, C) for low-dimensional inputs,
                (N, C, H, W) for pixel inputs
            actions: actions taken given observations obs, often denoted a_n in
                the literature. Dimensionality: (N, A), where A is the
                dimensionality of a single action
            rewards: rewards received after the action is taken. Dimensionality: (N,)
            obs_next: the next observation from the minibatch, often denoted
                s_{n+1} in the literature
            done: 1 if obs_next is terminal, 0 otherwise. Dimensionality: (N,)
        '''
        with tx.device_scope(self.gpu_ids):

            with self.forward_time.time():
                assert actions.max().item() <= 1.0
                assert actions.min().item() >= -1.0

                # estimate the target value from the next state:
                # y = r + gamma^n * Q'(s_{t+1}, mu'(s_{t+1}))

                model_policy, next_Q_target = self.model_target.forward(
                    obs_next)
                if self.use_action_regularization:
                    # https://github.com/sfujim/TD3/blob/master/TD3.py -- action regularization
                    policy_noise = 0.2
                    noise_clip = 0.5
                    batch_size = self.batch_size
                    noise = np.clip(
                        np.random.normal(0,
                                         policy_noise,
                                         size=(batch_size, self.action_dim)),
                        -noise_clip, noise_clip)
                    device_name = 'cpu'
                    if self._num_gpus > 0:
                        device_name = 'cuda'
                    model_policy += torch.tensor(
                        noise, dtype=torch.float32).to(device_name).detach()
                    model_policy = model_policy.clamp(-1, 1).to(device_name)
                y = rewards + pow(self.discount_factor,
                                  self.n_step) * next_Q_target * (1.0 - done)
                if self.use_double_critic:
                    _, next_Q_target2 = self.model_target2.forward(
                        obs_next, action=model_policy)
                    y2 = rewards + pow(self.discount_factor, self.n_step
                                       ) * next_Q_target2 * (1.0 - done)
                    y = torch.min(y, y2)
                y = y.detach()

                # compute Q(s_t, a_t)
                perception = self.model.forward_perception(obs)
                y_policy = self.model.forward_critic(perception,
                                                     actions.detach())

                y_policy2 = None
                if self.use_double_critic:
                    perception2 = self.model2.forward_perception(obs)
                    y_policy2 = self.model2.forward_critic(
                        perception2, actions.detach())

            # critic update
            with self.critic_update_time.time():
                self.model.critic.zero_grad()
                if self.is_pixel_input:
                    self.model.perception.zero_grad()
                critic_loss = self.critic_criterion(y_policy, y)
                critic_loss.backward()
                if self.clip_critic_gradient:
                    self.model.critic.clip_grad_value(
                        self.critic_gradient_clip_value)
                self.critic_optim.step()

                if self.use_double_critic:
                    self.model2.critic.zero_grad()
                    if self.is_pixel_input:
                        self.model2.perception.zero_grad()
                    critic_loss = self.critic_criterion(y_policy2, y)
                    critic_loss.backward()
                    if self.clip_critic_gradient:
                        self.model2.critic.clip_grad_value(
                            self.critic_gradient_clip_value)
                    self.critic_optim2.step()

            # actor update
            with self.actor_update_time.time():
                self.model.actor.zero_grad()
                actor_loss = -self.model.forward_critic(
                    perception.detach(),
                    self.model.forward_actor(perception.detach()))
                actor_loss = actor_loss.mean()
                actor_loss.backward()
                if self.clip_actor_gradient:
                    self.model.actor.clip_grad_value(
                        self.actor_gradient_clip_value)
                self.actor_optim.step()

            tensorplex_update_dict = {
                'actor_loss': actor_loss.item(),
                'critic_loss': critic_loss.item(),
                'action_norm': actions.norm(2, 1).mean().item(),
                'rewards': rewards.mean().item(),
                'Q_target': y.mean().item(),
                'Q_policy': y_policy.mean().item(),
                'performance/forward_time': self.forward_time.avg,
                'performance/critic_update_time': self.critic_update_time.avg,
                'performance/actor_update_time': self.actor_update_time.avg,
            }
            if self.use_double_critic:
                tensorplex_update_dict['Q_policy2'] = y_policy2.mean().item()

            # (possibly) update target networks
            self._target_update()

            return tensorplex_update_dict
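For reference, the target computed above is the n-step DDPG target, optionally combined with a TD3-style clipped double critic: y = r + gamma^n * min(Q'_1, Q'_2)(s_{t+1}, mu'(s_{t+1})) * (1 - done). A toy numeric sketch of just that arithmetic, with made-up values and no models:

import torch

gamma, n_step = 0.99, 3
rewards = torch.tensor([1.0, 0.0])
done = torch.tensor([0.0, 1.0])             # second transition is terminal
next_Q_target = torch.tensor([5.0, 4.0])    # Q'_1(s_{t+1}, mu'(s_{t+1}))
next_Q_target2 = torch.tensor([4.5, 6.0])   # Q'_2 from the second target critic

y = rewards + gamma ** n_step * next_Q_target * (1.0 - done)
y2 = rewards + gamma ** n_step * next_Q_target2 * (1.0 - done)
y = torch.min(y, y2).detach()               # clipped double-Q target
print(y)                                    # the terminal entry reduces to its reward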
Example #9
def new_tensor(device_id, value):
    with tx.device_scope(device_id, dtype=torch.float64):
        return torch.ones(SHAPE) * value
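This helper is the smallest illustration of the pattern shared by all the examples: tensors created inside tx.device_scope pick up the scope's device and dtype. A hedged usage sketch, assuming the torchx package from the Surreal ecosystem is importable as tx and defining SHAPE locally (in the original test it is a module-level constant defined elsewhere):

import torch
import torchx as tx  # assumed import alias used throughout these examples

SHAPE = (2, 3)  # assumed value; the original constant is not shown here

def new_tensor(device_id, value):
    with tx.device_scope(device_id, dtype=torch.float64):
        return torch.ones(SHAPE) * value

t = new_tensor('cpu', 7.0)         # 'cpu' works without a GPU, as in Example #7
print(t.device, t.dtype, t[0, 0])  # the dtype is expected to follow the scope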
Example #10
    def __init__(self,
                 learner_config,
                 env_config,
                 session_config,
                 agent_id,
                 agent_mode,
                 render=False):
        '''
        Constructor for DDPGAgent class.
        Important attributes:
            learner_config, env_config, session_config: experiment configurations
            agent_id: unique id in the range [0, num_agents)
            agent_mode: toggles between agent noise and deterministic behavior
        '''
        super().__init__(
            learner_config=learner_config,
            env_config=env_config,
            session_config=session_config,
            agent_id=agent_id,
            agent_mode=agent_mode,
            render=render,
        )

        self.agent_id = agent_id
        self.action_dim = self.env_config.action_spec.dim[0]
        self.obs_spec = self.env_config.obs_spec
        self.use_layernorm = self.learner_config.model.use_layernorm
        self.sleep_time = self.env_config.sleep_time

        self.param_noise = None
        self.param_noise_type = self.learner_config.algo.exploration.param_noise_type
        self.param_noise_sigma = self.learner_config.algo.exploration.param_noise_sigma
        self.param_noise_alpha = self.learner_config.algo.exploration.param_noise_alpha
        self.param_noise_target_stddev = self.learner_config.algo.exploration.param_noise_target_stddev

        self.frame_stack_concatenate_on_env = self.env_config.frame_stack_concatenate_on_env

        self.noise_type = self.learner_config.algo.exploration.noise_type

        if env_config.num_agents == 1:
            # If only one agent, we don't want a sigma of 0
            self.sigma = self.learner_config.algo.exploration.max_sigma / 3.0
        else:
            self.sigma = self.learner_config.algo.exploration.max_sigma * (
                float(agent_id) / (env_config.num_agents))
        #self.sigma = self.learner_config.algo.exploration.sigma
        print('Using exploration sigma', self.sigma)

        if torch.cuda.is_available():
            self.gpu_ids = 'cuda:all'
            self.log.info('DDPG agent is using GPU')
            # Note that the user is responsible for providing only one GPU to the program
            self.log.info('cudnn version: {}'.format(
                torch.backends.cudnn.version()))
            torch.backends.cudnn.benchmark = True
        else:
            self.gpu_ids = 'cpu'
            self.log.info('DDPG agent is using CPU')

        with tx.device_scope(self.gpu_ids):
            self.model = DDPGModel(
                obs_spec=self.obs_spec,
                action_dim=self.action_dim,
                use_layernorm=self.use_layernorm,
                actor_fc_hidden_sizes=self.learner_config.model.
                actor_fc_hidden_sizes,
                critic_fc_hidden_sizes=self.learner_config.model.
                critic_fc_hidden_sizes,
                conv_out_channels=self.learner_config.model.conv_spec.
                out_channels,
                conv_kernel_sizes=self.learner_config.model.conv_spec.
                kernel_sizes,
                conv_strides=self.learner_config.model.conv_spec.strides,
                conv_hidden_dim=self.learner_config.model.conv_spec.
                hidden_output_dim,
            )
            self.model.eval()

            self._init_noise()
Example #11
    def __init__(self, learner_config, env_config, session_config):
        super().__init__(learner_config, env_config, session_config)

        # GPU setting
        self.current_iteration = 0
        self.global_step = 0
        if not torch.cuda.is_available():
            self.gpu_option = 'cpu'
        else:
            self.gpu_option = 'cuda:all'
        self.use_cuda = torch.cuda.is_available()

        if not self.use_cuda:
            self.log.info('Using CPU')
        else:
            self.log.info('Using GPU: {}'.format(self.gpu_option))

        # RL general parameters
        self.gamma = self.learner_config.algo.gamma
        self.lam = self.learner_config.algo.advantage.lam
        self.n_step = self.learner_config.algo.n_step
        self.use_z_filter = self.learner_config.algo.use_z_filter
        self.use_r_filter = self.learner_config.algo.use_r_filter
        self.norm_adv = self.learner_config.algo.advantage.norm_adv
        self.batch_size = self.learner_config.replay.batch_size

        self.action_dim = self.env_config.action_spec.dim[0]
        self.obs_spec = self.env_config.obs_spec
        self.init_log_sig = self.learner_config.algo.consts.init_log_sig

        # PPO parameters
        self.ppo_mode = self.learner_config.algo.ppo_mode
        self.if_rnn_policy = self.learner_config.algo.rnn.if_rnn_policy
        self.horizon = self.learner_config.algo.rnn.horizon
        self.lr_actor = self.learner_config.algo.network.lr_actor
        self.lr_critic = self.learner_config.algo.network.lr_critic
        self.epoch_policy = self.learner_config.algo.consts.epoch_policy
        self.epoch_baseline = self.learner_config.algo.consts.epoch_baseline
        self.kl_target = self.learner_config.algo.consts.kl_target
        self.adjust_threshold = self.learner_config.algo.consts.adjust_threshold
        self.reward_scale = self.learner_config.algo.advantage.reward_scale

        # PPO mode 'adjust'
        self.kl_cutoff_coeff = self.learner_config.algo.adapt_consts.kl_cutoff_coeff
        self.beta_init = self.learner_config.algo.adapt_consts.beta_init
        self.beta_range = self.learner_config.algo.adapt_consts.beta_range

        # PPO mode 'clip'
        self.clip_range = self.learner_config.algo.clip_consts.clip_range
        self.clip_epsilon_init = self.learner_config.algo.clip_consts.clip_epsilon_init

        if self.ppo_mode == 'adapt':
            self.beta = self.beta_init
            self.eta = self.kl_cutoff_coeff
            self.beta_upper = self.beta_range[1]
            self.beta_lower = self.beta_range[0]
            self.beta_adjust_threshold = self.adjust_threshold
        else:  # ppo_mode == 'clip'
            self.clip_epsilon = self.clip_epsilon_init
            self.clip_adjust_threshold = self.adjust_threshold
            self.clip_upper = self.clip_range[1]
            self.clip_lower = self.clip_range[0]

        # learning rate setting:
        self.min_lr = self.learner_config.algo.network.anneal.min_lr
        self.lr_update_frequency = self.learner_config.algo.network.anneal.lr_update_frequency
        self.frames_to_anneal = self.learner_config.algo.network.anneal.frames_to_anneal
        num_updates = int(self.frames_to_anneal /
                          self.learner_config.parameter_publish.exp_interval)
        lr_scheduler = eval(
            self.learner_config.algo.network.anneal.lr_scheduler)

        self.exp_counter = 0
        self.kl_record = []

        with tx.device_scope(self.gpu_option):
            self.model = PPOModel(
                obs_spec=self.obs_spec,
                action_dim=self.action_dim,
                model_config=self.learner_config.model,
                use_cuda=self.use_cuda,
                init_log_sig=self.init_log_sig,
                use_z_filter=self.use_z_filter,
                if_pixel_input=self.env_config.pixel_input,
                rnn_config=self.learner_config.algo.rnn,
            )
            self.ref_target_model = PPOModel(
                obs_spec=self.obs_spec,
                action_dim=self.action_dim,
                model_config=self.learner_config.model,
                use_cuda=self.use_cuda,
                init_log_sig=self.init_log_sig,
                use_z_filter=self.use_z_filter,
                if_pixel_input=self.env_config.pixel_input,
                rnn_config=self.learner_config.algo.rnn,
            )
            self.ref_target_model.update_target_params(self.model)

            # Learning parameters and optimizer
            self.clip_actor_gradient = self.learner_config.algo.network.clip_actor_gradient
            self.actor_gradient_clip_value = self.learner_config.algo.network.actor_gradient_norm_clip
            self.clip_critic_gradient = self.learner_config.algo.network.clip_critic_gradient
            self.critic_gradient_clip_value = self.learner_config.algo.network.critic_gradient_norm_clip

            self.critic_optim = torch.optim.Adam(
                self.model.get_critic_params(),
                lr=self.lr_critic,
                weight_decay=self.learner_config.algo.network.
                critic_regularization)
            self.actor_optim = torch.optim.Adam(
                self.model.get_actor_params(),
                lr=self.lr_actor,
                weight_decay=self.learner_config.algo.network.
                actor_regularization)

            # learning rate scheduler
            self.actor_lr_scheduler = lr_scheduler(
                self.actor_optim,
                num_updates,
                update_freq=self.lr_update_frequency,
                min_lr=self.min_lr)
            self.critic_lr_scheduler = lr_scheduler(
                self.critic_optim,
                num_updates,
                update_freq=self.lr_update_frequency,
                min_lr=self.min_lr)

            # Experience Aggregator
            self.aggregator = MultistepAggregatorWithInfo(
                self.env_config.obs_spec, self.env_config.action_spec)

            # probability distribution. Gaussian only for now
            self.pd = DiagGauss(self.action_dim)

            # placeholder for RNN hidden cells
            self.cells = None

            # Reward White-filtering
            if self.use_r_filter:
                self.reward_filter = RewardFilter()
Example #12
    def _optimize(self, obs, actions, rewards, obs_next, persistent_infos,
                  onetime_infos, dones):
        '''
            Main optimization method: calls _adapt_update/_clip_update and
            _value_update epoch_policy and epoch_baseline times respectively.
            Args:
                obs: batch of observations (batch_size, N-step, obs_dim)
                obs_next: batch of next observations (batch_size, 1, obs_dim)
                actions: batch of actions (batch_size, N-step, act_dim)
                rewards: batch of rewards (batch_size, N-step)
                dones: batch of termination flags (batch_size, N-step)
                persistent_infos, onetime_infos: lists of batched auxiliary
                    attributes tracked, such as the behavior policy and RNN
                    hidden states
            Returns:
                dictionary of recorded statistics
        '''
        # convert everything to float tensor:
        with tx.device_scope(self.gpu_option):
            pds = persistent_infos[-1]

            if self.if_rnn_policy:
                h = (onetime_infos[0].transpose(0, 1).contiguous()).detach()
                c = (onetime_infos[1].transpose(0, 1).contiguous()).detach()
                self.cells = (h, c)

            advantages, returns = self._gae_and_return(obs, obs_next, rewards,
                                                       dones)
            advantages = advantages.detach()
            returns = returns.detach()

            if self.if_rnn_policy:
                h = self.cells[0].detach()
                c = self.cells[1].detach()
                self.cells = (h, c)
                eff_len = self.n_step - self.horizon + 1
                behave_pol = pds[:, :eff_len, :].contiguous().detach()
                actions_iter = actions[:, :eff_len, :].contiguous().detach()
            else:
                behave_pol = pds[:, 0, :].contiguous().detach()
                actions_iter = actions[:, 0, :].contiguous().detach()

            obs_iter = {}
            for mod in obs.keys():
                obs_iter[mod] = {}
                for k in obs[mod].keys():
                    if self.if_rnn_policy:
                        obs_iter[mod][k] = obs[mod][k][:, :self.n_step -
                                                       self.horizon +
                                                       1, :].contiguous(
                                                       ).detach()
                    else:
                        obs_iter[mod][k] = obs[mod][k][:, 0, :].contiguous(
                        ).detach()

            ref_pol = self.ref_target_model.forward_actor(
                obs_iter, self.cells).detach()

            for ep in range(self.epoch_policy):
                if self.ppo_mode == 'clip':
                    stats = self._clip_update(obs_iter, actions_iter,
                                              advantages, behave_pol)
                else:
                    stats = self._adapt_update(obs_iter, actions_iter,
                                               advantages, behave_pol, ref_pol)
                curr_pol = self.model.forward_actor(obs_iter,
                                                    self.cells).detach()
                kl = self.pd.kl(ref_pol, curr_pol).mean()
                stats['_pol_kl'] = kl.item()
                if kl.item() > self.kl_target * 4:
                    break

            self.kl_record.append(stats['_pol_kl'])

            for _ in range(self.epoch_baseline):
                baseline_stats = self._value_update(obs_iter, returns)

            # Collecting metrics and updating tensorplex
            for k in baseline_stats:
                stats[k] = baseline_stats[k]

            behave_likelihood = self.pd.likelihood(actions_iter, behave_pol)
            curr_likelihood = self.pd.likelihood(actions_iter, curr_pol)

            stats['_avg_return_targ'] = returns.mean().item()
            stats['_avg_log_sig'] = self.model.actor.log_var.mean().item()
            stats['_avg_behave_likelihood'] = behave_likelihood.mean().item()
            stats['_avg_is_weight'] = (
                curr_likelihood / (behave_likelihood + 1e-4)).mean().item()
            stats['_ref_behave_diff'] = self.pd.kl(ref_pol,
                                                   behave_pol).mean().item()
            stats['_lr'] = self.actor_lr_scheduler.get_lr()[0]

            if self.use_z_filter:
                self.model.z_update(obs_iter)
                stats['obs_running_mean'] = np.mean(
                    self.model.z_filter.running_mean())
                stats['obs_running_square'] = np.mean(
                    self.model.z_filter.running_square())
                stats['obs_running_std'] = np.mean(
                    self.model.z_filter.running_std())
            if self.use_r_filter:
                stats['reward_mean'] = self.reward_filter.reward_mean()

            return stats
Example #13
    def _preprocess_batch_ppo(self, batch):
        '''
            Loads experiences from numpy arrays into torch.FloatTensor type.
            Args:
                batch: BeneDict of experiences containing the following attributes:
                        'obs' - observation
                        'actions' - actions
                        'rewards' - rewards
                        'obs_next' - next observation
                        'persistent_infos' - action policy
                        'onetime_infos' - RNN hidden cells or None
            Returns:
                BeneDict of torch.FloatTensors
        '''
        with tx.device_scope(self.gpu_option):

            obs, actions, rewards, obs_next, done, persistent_infos, onetime_infos = (
                batch['obs'],
                batch['actions'],
                batch['rewards'],
                batch['obs_next'],
                batch['dones'],
                batch['persistent_infos'],
                batch['onetime_infos'],
            )

            for modality in obs:
                for key in obs[modality]:
                    obs[modality][key] = (torch.tensor(
                        obs[modality][key], dtype=torch.float32)).detach()
                    obs_next[modality][key] = (torch.tensor(
                        obs_next[modality][key],
                        dtype=torch.float32)).detach()

            actions = torch.tensor(actions, dtype=torch.float32)
            rewards = torch.tensor(rewards,
                                   dtype=torch.float32) * self.reward_scale
            if self.use_r_filter:
                normed_reward = self.reward_filter.forward(rewards)
                self.reward_filter.update(rewards)
                rewards = normed_reward

            done = torch.tensor(done, dtype=torch.float32)

            if persistent_infos is not None:
                for i in range(len(persistent_infos)):
                    persistent_infos[i] = torch.tensor(
                        persistent_infos[i], dtype=torch.float32).detach()
            if onetime_infos is not None:
                for i in range(len(onetime_infos)):
                    onetime_infos[i] = torch.tensor(
                        onetime_infos[i], dtype=torch.float32).detach()

            (
                batch['obs'],
                batch['actions'],
                batch['rewards'],
                batch['obs_next'],
                batch['dones'],
                batch['persistent_infos'],
                batch['onetime_infos'],
            ) = (obs, actions, rewards, obs_next, done, persistent_infos,
                 onetime_infos)
            return batch
Example #14
    def _gae_and_return(self, obs, obs_next, rewards, dones):
        '''
        Computes the generalized advantage estimate and the corresponding
        N-step return. Details of the algorithm can be found here:
        https://arxiv.org/pdf/1506.02438.pdf
        Args:
            obs: batch of observations (batch_size, N-step, obs_dim)
            obs_next: batch of next observations (batch_size, 1, obs_dim)
            rewards: batch of rewards (batch_size, N-step)
            dones: batch of termination flags (batch_size, N-step)
        Returns:
            advantages: batch of advantages, (batch_size, 1) for the
                feed-forward policy or (batch_size, n_step - horizon + 1)
                for the RNN policy
            returns: batch of return targets with the same shape as advantages
        '''
        with tx.device_scope(self.gpu_option):
            index_set = torch.tensor(range(self.n_step), dtype=torch.float32)
            gamma = torch.pow(self.gamma, index_set)
            lam = torch.pow(self.lam, index_set)

            obs_concat_var = {}
            for mod in obs.keys():
                obs_concat_var[mod] = {}
                for k in obs[mod].keys():
                    obs_concat_var[mod][k] = (torch.cat(
                        [obs[mod][k], obs_next[mod][k]], dim=1))
                    if not self.if_rnn_policy:
                        obs_shape = obs_concat_var[mod][k].size()
                        obs_concat_var[mod][k] = obs_concat_var[mod][k].view(
                            -1, *obs_shape[2:])

            values = self.model.forward_critic(obs_concat_var, self.cells)
            values = values.view(self.batch_size, self.n_step + 1)
            values[:, 1:] *= 1 - dones

            if self.if_rnn_policy:
                tds = rewards + self.gamma * values[:, 1:] - values[:, :-1]
                eff_len = self.n_step - self.horizon + 1
                gamma = gamma[:self.horizon]
                lam = lam[:self.horizon]

                returns = torch.zeros(self.batch_size, eff_len)
                advs = torch.zeros(self.batch_size, eff_len)
                for step in range(eff_len):
                    returns[:, step] = torch.sum(gamma * rewards[:, step:step + self.horizon], 1) + \
                                       values[:, step + self.horizon] * (self.gamma ** self.horizon)
                    advs[:, step] = torch.sum(
                        tds[:, step:step + self.horizon] * gamma * lam, 1)

                if self.norm_adv:
                    std = advs.std()
                    mean = advs.mean()
                    advs = (advs - mean) / max(std, 1e-4)
                return advs, returns

            else:
                returns = torch.sum(
                    gamma * rewards,
                    1) + values[:, -1] * (self.gamma**self.n_step)
                tds = rewards + self.gamma * values[:, 1:] - values[:, :-1]
                gae = torch.sum(tds * gamma * lam, 1)

                if self.norm_adv:
                    std = gae.std()
                    mean = gae.mean()
                    gae = (gae - mean) / max(std, 1e-4)

                return gae.view(-1, 1), returns.view(-1, 1)
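The non-RNN branch above implements the truncated generalized advantage estimate from the linked paper: with TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), the advantage is the sum over l of (gamma * lambda)^l * delta_{t+l}, and the return target is the sum of gamma^l * r_{t+l} plus gamma^N * V(s_{t+N}). A standalone numeric sketch of the same computation with toy values and no model (the termination masking of the values is omitted for brevity):

import torch

gamma, lam, n_step = 0.99, 0.95, 4
rewards = torch.tensor([[1.0, 0.5, 0.0, 2.0]])      # (batch, N)
values = torch.tensor([[0.8, 0.7, 0.6, 0.9, 1.1]])  # (batch, N + 1): V(s_t) .. V(s_{t+N})

idx = torch.arange(n_step, dtype=torch.float32)
gamma_pow = gamma ** idx                            # gamma^l
gae_decay = (gamma * lam) ** idx                    # (gamma * lambda)^l

# TD residuals, exactly as in the code above: r_l + gamma * V_{l+1} - V_l
tds = rewards + gamma * values[:, 1:] - values[:, :-1]

advantages = torch.sum(tds * gae_decay, 1).view(-1, 1)
returns = (torch.sum(gamma_pow * rewards, 1)
           + values[:, -1] * gamma ** n_step).view(-1, 1)
print(advantages, returns)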