Пример #1
0
    def __init__(self, action_size, transition_model, encoder, reward_model,
                 observation_model):
        """Set up the merge networks, the actor/value pools and their workers.

        The four world-model components are stored as passed in. Network
        sizes, device, learning rates and the results directory are read from
        the module-level ``args``. One ``Worker_actor`` is created and started
        for every entry of the actor pool, each with its own ``Pipe``.
        """
        self.encoder = encoder
        self.reward_model = reward_model
        self.transition_model = transition_model
        self.observation_model = observation_model

        device = args.device
        net_dims = (args.belief_size, args.state_size, args.hidden_size)
        activation = args.dense_activation_function

        # Merge networks; both are placed in shared memory.
        self.merge_value_model = ValueModel(*net_dims,
                                            activation).to(device=device)
        self.merge_actor_model = MergeModel(*net_dims, action_size,
                                            args.pool_len,
                                            activation).to(device=device)
        self.merge_actor_model.share_memory()
        self.merge_value_model.share_memory()

        # Pools of sub-actors and sub-values (all actors are built first,
        # then all values, so parameter initialisation order is unchanged).
        actors = []
        for _ in range(args.pool_len):
            actors.append(
                ActorModel(*net_dims, action_size,
                           activation).to(device=device))
        self.actor_pool = actors

        values = []
        for _ in range(args.pool_len):
            values.append(ValueModel(*net_dims, activation).to(device=device))
        self.value_pool = values

        for pooled_actor in self.actor_pool:
            pooled_actor.share_memory()
        for pooled_value in self.value_pool:
            pooled_value.share_memory()

        # Module groups used later when freezing parameters.
        world_models = [
            self.transition_model, self.encoder, self.observation_model,
            self.reward_model
        ]
        self.env_model_modules = get_modules(world_models)
        self.actor_pool_modules = get_modules(self.actor_pool)
        self.model_modules = self.env_model_modules + self.actor_pool_modules

        self.merge_value_model_modules = get_modules([self.merge_value_model])

        # When a learning-rate schedule is configured the optimisers start
        # at lr=0; otherwise they use the configured rate directly.
        scheduled = args.learning_rate_schedule != 0
        self.merge_actor_optimizer = optim.Adam(
            self.merge_actor_model.parameters(),
            lr=0 if scheduled else args.actor_learning_rate,
            eps=args.adam_epsilon)
        self.merge_value_optimizer = optim.Adam(
            self.merge_value_model.parameters(),
            lr=0 if scheduled else args.value_learning_rate,
            eps=args.adam_epsilon)

        # One (parent, child) pipe per pooled actor.
        self.actor_pipes = [Pipe() for _ in self.actor_pool]

        # Worker k (1-based id) owns the k-th actor/value pair and the child
        # end of the k-th pipe.
        workers = []
        for index, (_parent_end, child_end) in enumerate(self.actor_pipes):
            workers.append(
                Worker_actor(actor_l=self.actor_pool[index],
                             value_l=self.value_pool[index],
                             transition_model=self.transition_model,
                             encoder=self.encoder,
                             observation_model=self.observation_model,
                             reward_model=self.reward_model,
                             child_conn=child_end,
                             results_dir=args.results_dir,
                             id=index + 1))
        self.workers_actor = workers

        for worker in self.workers_actor:
            worker.start()

        self.metrics = {
            'episodes': [],
            'merge_actor_loss': [],
            'merge_value_loss': []
        }
        self.merge_losses = []
Пример #2
0
# Initialise model parameters randomly
# Each network is built from the sizes/activations in `args` and moved to
# `args.device`.
transition_model = TransitionModel(
    args.belief_size, args.state_size, env.action_size, args.hidden_size,
    args.embedding_size, args.dense_activation_function).to(device=args.device)
observation_model = ObservationModel(
    args.symbolic_env, env.observation_size, args.belief_size, args.state_size,
    args.embedding_size, args.cnn_activation_function).to(device=args.device)
reward_model = RewardModel(
    args.belief_size, args.state_size, args.hidden_size,
    args.dense_activation_function).to(device=args.device)
encoder = Encoder(args.symbolic_env, env.observation_size, args.embedding_size,
                  args.cnn_activation_function).to(device=args.device)
actor_model = ActorModel(args.belief_size, args.state_size, args.hidden_size,
                         env.action_size,
                         args.dense_activation_function).to(device=args.device)
value_model = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                         args.dense_activation_function).to(device=args.device)
# Parameters of the world model (transition, observation, reward, encoder).
param_list = list(transition_model.parameters()) + list(
    observation_model.parameters()) + list(reward_model.parameters()) + list(
        encoder.parameters())
# Parameters of the value and actor networks.
value_actor_param_list = list(value_model.parameters()) + list(
    actor_model.parameters())
# Every trainable parameter: world model + value + actor.
params_list = param_list + value_actor_param_list
# Optimisers start at lr=0 when a learning-rate schedule is configured
# (args.learning_rate_schedule != 0); otherwise the configured rate is used.
model_optimizer = optim.Adam(
    param_list,
    lr=0 if args.learning_rate_schedule != 0 else args.model_learning_rate,
    eps=args.adam_epsilon)
actor_optimizer = optim.Adam(
    actor_model.parameters(),
    lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate,
    eps=args.adam_epsilon)
value_optimizer = optim.Adam(
Пример #3
0
      D.append(observation, action, reward, done)
      observation = next_observation
      t += 1
    print('this random get reward',single_trial_reward,'pass gate num:',env._env.gate_counter)
    # print()
    metrics['steps'].append(t * args.action_repeat + (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
    metrics['episodes'].append(s)


# Initialise model parameters randomly
# Each network is built from the sizes/activations in `args` and moved to
# `args.device`.
transition_model = TransitionModel(args.belief_size, args.state_size, env.action_size, args.hidden_size, args.embedding_size, args.dense_activation_function).to(device=args.device)
observation_model = ObservationModel(args.symbolic_env, env.observation_size, args.belief_size, args.state_size, args.embedding_size, args.cnn_activation_function).to(device=args.device)
reward_model = RewardModel(args.belief_size, args.state_size, args.hidden_size, args.dense_activation_function).to(device=args.device)
encoder = Encoder(args.symbolic_env, env.observation_size, args.embedding_size, args.cnn_activation_function).to(device=args.device)
actor_model = ActorModel(args.belief_size, args.state_size, args.hidden_size, env.action_size, args.dense_activation_function).to(device=args.device)
value_model = ValueModel(args.belief_size, args.state_size, args.hidden_size, args.dense_activation_function).to(device=args.device)
# World-model parameters, value+actor parameters, and the union of both.
param_list = list(transition_model.parameters()) + list(observation_model.parameters()) + list(reward_model.parameters()) + list(encoder.parameters())
value_actor_param_list = list(value_model.parameters()) + list(actor_model.parameters())
params_list = param_list + value_actor_param_list
# Optimisers start at lr=0 when a learning-rate schedule is configured
# (args.learning_rate_schedule != 0); otherwise the configured rate is used.
model_optimizer = optim.Adam(param_list, lr=0 if args.learning_rate_schedule != 0 else args.model_learning_rate, eps=args.adam_epsilon)
actor_optimizer = optim.Adam(actor_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate, eps=args.adam_epsilon)
value_optimizer = optim.Adam(value_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate, eps=args.adam_epsilon)
# Restore a checkpoint when a non-empty path is given and the file exists.
# A truthiness test replaces the former `is not ''` identity comparison,
# whose result depends on string interning and which let None through to
# os.path.exists (TypeError).
if args.models and os.path.exists(args.models):
  # NOTE(review): torch.load unpickles the file; only load checkpoints from
  # trusted sources.
  model_dicts = torch.load(args.models)
  transition_model.load_state_dict(model_dicts['transition_model'])
  observation_model.load_state_dict(model_dicts['observation_model'])
  reward_model.load_state_dict(model_dicts['reward_model'])
  encoder.load_state_dict(model_dicts['encoder'])
  actor_model.load_state_dict(model_dicts['actor_model'])
  value_model.load_state_dict(model_dicts['value_model'])
  model_optimizer.load_state_dict(model_dicts['model_optimizer'])
Пример #4
0
class Algorithms(object):
    def __init__(self, action_size, transition_model, encoder, reward_model,
                 observation_model):
        """Set up the merge networks, the actor/value pools and their workers.

        The four world-model components are stored as passed in. Network
        sizes, device, learning rates and the results directory are read from
        the module-level ``args``. One ``Worker_actor`` is created and started
        for every entry of the actor pool, each with its own ``Pipe``.
        """
        self.encoder = encoder
        self.reward_model = reward_model
        self.transition_model = transition_model
        self.observation_model = observation_model

        device = args.device
        net_dims = (args.belief_size, args.state_size, args.hidden_size)
        activation = args.dense_activation_function

        # Merge networks; both are placed in shared memory.
        self.merge_value_model = ValueModel(*net_dims,
                                            activation).to(device=device)
        self.merge_actor_model = MergeModel(*net_dims, action_size,
                                            args.pool_len,
                                            activation).to(device=device)
        self.merge_actor_model.share_memory()
        self.merge_value_model.share_memory()

        # Pools of sub-actors and sub-values (all actors are built first,
        # then all values, so parameter initialisation order is unchanged).
        actors = []
        for _ in range(args.pool_len):
            actors.append(
                ActorModel(*net_dims, action_size,
                           activation).to(device=device))
        self.actor_pool = actors

        values = []
        for _ in range(args.pool_len):
            values.append(ValueModel(*net_dims, activation).to(device=device))
        self.value_pool = values

        for pooled_actor in self.actor_pool:
            pooled_actor.share_memory()
        for pooled_value in self.value_pool:
            pooled_value.share_memory()

        # Module groups used later when freezing parameters.
        world_models = [
            self.transition_model, self.encoder, self.observation_model,
            self.reward_model
        ]
        self.env_model_modules = get_modules(world_models)
        self.actor_pool_modules = get_modules(self.actor_pool)
        self.model_modules = self.env_model_modules + self.actor_pool_modules

        self.merge_value_model_modules = get_modules([self.merge_value_model])

        # When a learning-rate schedule is configured the optimisers start
        # at lr=0; otherwise they use the configured rate directly.
        scheduled = args.learning_rate_schedule != 0
        self.merge_actor_optimizer = optim.Adam(
            self.merge_actor_model.parameters(),
            lr=0 if scheduled else args.actor_learning_rate,
            eps=args.adam_epsilon)
        self.merge_value_optimizer = optim.Adam(
            self.merge_value_model.parameters(),
            lr=0 if scheduled else args.value_learning_rate,
            eps=args.adam_epsilon)

        # One (parent, child) pipe per pooled actor.
        self.actor_pipes = [Pipe() for _ in self.actor_pool]

        # Worker k (1-based id) owns the k-th actor/value pair and the child
        # end of the k-th pipe.
        workers = []
        for index, (_parent_end, child_end) in enumerate(self.actor_pipes):
            workers.append(
                Worker_actor(actor_l=self.actor_pool[index],
                             value_l=self.value_pool[index],
                             transition_model=self.transition_model,
                             encoder=self.encoder,
                             observation_model=self.observation_model,
                             reward_model=self.reward_model,
                             child_conn=child_end,
                             results_dir=args.results_dir,
                             id=index + 1))
        self.workers_actor = workers

        for worker in self.workers_actor:
            worker.start()

        self.metrics = {
            'episodes': [],
            'merge_actor_loss': [],
            'merge_value_loss': []
        }
        self.merge_losses = []

    def get_action(self, belief, posterior_state, explore=False):
        merge_action_list = []
        for actor_l in self.actor_pool:
            actions_l_mean, actions_l_std = actor_l.get_action_mean_std(
                belief, posterior_state)
            merge_action_list.append(actions_l_mean)
            merge_action_list.append(actions_l_std)
        merge_actions = torch.cat(merge_action_list, dim=1)
        action = self.merge_actor_model.get_merge_action(merge_actions,
                                                         belief,
                                                         posterior_state,
                                                         det=not (explore))
        return action

    def train_algorithm(self, actor_states, actor_beliefs):
        """Run one training step for the workers and the merge actor/value.

        Sends ``1`` to every worker through the parent end of its pipe and
        blocks until each has replied. Then imagines a trajectory with the
        merge policy, updates the merge actor on the negative mean lambda
        return, and updates the merge value model on the detached returns.
        The two scalar losses are appended to ``self.merge_losses``.
        """
        # Kick off every worker, then block until each one has answered.
        for idx in range(len(self.workers_actor)):
            self.actor_pipes[idx][0].send(1)
        for idx in range(len(self.actor_pool)):
            self.actor_pipes[idx][0].recv()

        # Imagine ahead with the world model and sub-actors frozen.
        with FreezeParameters(self.model_modules):
            imagination_traj = self.imagine_merge_ahead(
                prev_state=actor_states,
                prev_belief=actor_beliefs,
                policy_pool=self.actor_pool,
                transition_model=self.transition_model,
                merge_model=self.merge_actor_model)
        (imged_beliefs, imged_prior_states, imged_prior_means,
         imged_prior_std_devs) = imagination_traj

        # Reward/value predictions with the merge value model frozen as well.
        frozen_for_prediction = (self.model_modules +
                                 self.merge_value_model_modules)
        with FreezeParameters(frozen_for_prediction):
            imagined_inputs = (imged_beliefs, imged_prior_states)
            imged_reward = bottle(self.reward_model, imagined_inputs)
            value_pred = bottle(self.merge_value_model, imagined_inputs)

        with FreezeParameters(self.actor_pool_modules):
            returns = lambda_return(imged_reward,
                                    value_pred,
                                    bootstrap=value_pred[-1],
                                    discount=args.discount,
                                    lambda_=args.disclam)
            merge_actor_loss = -torch.mean(returns)
            # Merge actor optimisation step
            self.merge_actor_optimizer.zero_grad()
            merge_actor_loss.backward()
            nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(),
                                     args.grad_clip_norm,
                                     norm_type=2)
            self.merge_actor_optimizer.step()

        # Dreamer implementation: value loss calculation and optimization.
        # Inputs and targets are detached so only the value model is trained.
        with torch.no_grad():
            value_beliefs = imged_beliefs.detach()
            value_prior_states = imged_prior_states.detach()
            target_return = returns.detach()

        predicted_value = bottle(self.merge_value_model,
                                 (value_beliefs, value_prior_states))
        value_dist = Normal(predicted_value, 1)
        log_likelihood = value_dist.log_prob(target_return)
        merge_value_loss = -log_likelihood.mean(dim=(0, 1))
        # Merge value optimisation step
        self.merge_value_optimizer.zero_grad()
        merge_value_loss.backward()
        nn.utils.clip_grad_norm_(self.merge_value_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.merge_value_optimizer.step()

        step_losses = [merge_actor_loss.item(), merge_value_loss.item()]
        self.merge_losses.append(step_losses)

        # return [merge_actor_loss, merge_value_loss]

    def save_loss_data(self, metrics_episodes):
        """Flush the accumulated merge losses to the metrics and to disk.

        The per-step ``[actor_loss, value_loss]`` pairs in
        ``self.merge_losses`` are transposed into one tuple per loss, appended
        to ``self.metrics`` and written with ``Save_Txt`` under the last entry
        of ``metrics_episodes``. Each worker is then asked to save its own
        losses, and the accumulator is cleared.
        """
        per_loss = tuple(zip(*self.merge_losses))
        self.metrics['merge_actor_loss'].append(per_loss[0])
        self.metrics['merge_value_loss'].append(per_loss[1])

        for key in ('merge_actor_loss', 'merge_value_loss'):
            Save_Txt(metrics_episodes[-1], self.metrics[key][-1], key,
                     args.results_dir)

        # save sub actor loss
        for worker in self.workers_actor:
            worker.save_loss_data(metrics_episodes)

        self.merge_losses = []

    def imagine_merge_ahead(self,
                            prev_state,
                            prev_belief,
                            policy_pool,
                            transition_model,
                            merge_model,
                            planning_horizon=12):
        """Roll the transition model forward under the merged policy.

        ``prev_state`` and ``prev_belief`` have their first two dimensions
        flattened into one. At each step every actor in ``policy_pool``
        supplies an action mean/std computed from detached inputs; the merge
        model turns the concatenated statistics into an action, and the
        transition model's layers advance the belief and sample the next
        prior state.

        :param planning_horizon: trajectory length including the start step;
            ``planning_horizon - 1`` imagined steps are returned. Must be at
            least 2.
        :return: list ``[beliefs, prior_states, prior_means, prior_std_devs]``
            of tensors stacked along a new leading dimension, excluding the
            start step.
        :raises ValueError: if ``planning_horizon`` is smaller than 2.
        """
        if planning_horizon < 2:
            raise ValueError(
                'planning_horizon must be at least 2, got {}'.format(
                    planning_horizon))

        flatten = lambda x: x.view([-1] + list(x.size()[2:]))
        prev_belief = flatten(prev_belief)
        prev_state = flatten(prev_state)

        # With MultiGPU on more than one device the layers are reached
        # through `.module`; resolve that once instead of on every step.
        if args.MultiGPU and torch.cuda.device_count() > 1:
            dynamics = transition_model.module
        else:
            dynamics = transition_model

        # Python lists are used as buffers (a single tensor written in place
        # would break autograd). Index 0 of beliefs/prior_states is the start.
        beliefs, prior_states = [prev_belief], [prev_state]
        prior_means, prior_std_devs = [], []

        for _ in range(planning_horizon - 1):
            belief, state = beliefs[-1], prior_states[-1]

            # Sub-actor statistics, laid out as mean, std, mean, std, ...
            merge_action_list = []
            for actor_l in policy_pool:
                actions_l_mean, actions_l_std = actor_l.get_action_mean_std(
                    belief.detach(), state.detach())
                merge_action_list.append(actions_l_mean)
                merge_action_list.append(actions_l_std)
            merge_actions = torch.cat(merge_action_list, dim=1)

            actions = merge_model.get_merge_action(merge_actions,
                                                   belief.detach(),
                                                   state.detach())

            # Compute belief (deterministic hidden state)
            hidden = dynamics.act_fn(
                dynamics.fc_embed_state_action(
                    torch.cat([state, actions], dim=1)))
            next_belief = dynamics.rnn(hidden, belief)

            # Compute state prior by applying transition dynamics
            hidden = dynamics.act_fn(
                dynamics.fc_embed_belief_prior(next_belief))
            next_mean, raw_std_dev = torch.chunk(
                dynamics.fc_state_prior(hidden), 2, dim=1)
            next_std_dev = F.softplus(raw_std_dev) + dynamics.min_std_dev

            # Reparameterised sample of the next prior state
            next_state = next_mean + next_std_dev * torch.randn_like(next_mean)

            beliefs.append(next_belief)
            prior_means.append(next_mean)
            prior_std_devs.append(next_std_dev)
            prior_states.append(next_state)

        imagined_traj = [
            torch.stack(beliefs[1:], dim=0),
            torch.stack(prior_states[1:], dim=0),
            torch.stack(prior_means, dim=0),
            torch.stack(prior_std_devs, dim=0)
        ]
        return imagined_traj

    def train_to_eval(self):
        [actor_model.eval() for actor_model in self.actor_pool]
        [value_model.eval() for value_model in self.value_pool]
        self.merge_actor_model.eval()
        self.merge_value_model.eval()

    def eval_to_train(self):
        [actor_model.train() for actor_model in self.actor_pool]
        [value_model.train() for value_model in self.value_pool]
        self.merge_actor_model.train()
        self.merge_value_model.train()
Пример #5
0
class Dreamer():
    def __init__(self, args):
        """Build all networks, the optimisers and the free-nats tensor.

        :param args: configuration object; attributes such as sizes,
            activations, learning rates, ``device``, ``pcont`` and
            ``free_nats`` are read from it and it is kept as ``self.args``.
        """
        super().__init__()
        self.args = args
        device = args.device

        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, args.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_act).to(device=device)

        # The observation model uses the dense activation for symbolic
        # observations and the CNN activation otherwise.
        if args.symbolic:
            observation_act = args.dense_act
        else:
            observation_act = args.cnn_act
        self.observation_model = ObservationModel(
            args.symbolic,
            args.observation_size,
            args.belief_size,
            args.state_size,
            args.embedding_size,
            activation_function=observation_act).to(device=device)

        self.reward_model = RewardModel(args.belief_size, args.state_size,
                                        args.hidden_size,
                                        args.dense_act).to(device=device)

        self.encoder = Encoder(args.symbolic, args.observation_size,
                               args.embedding_size,
                               args.cnn_act).to(device=device)

        self.actor_model = ActorModel(
            args.action_size,
            args.belief_size,
            args.state_size,
            args.hidden_size,
            activation_function=args.dense_act).to(device=device)

        self.value_model = ValueModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=device)

        self.pcont_model = PCONTModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=device)

        # Frozen copy of the value model, used to compute critic targets.
        self.target_value_model = deepcopy(self.value_model)
        for target_param in self.target_value_model.parameters():
            target_param.requires_grad = False

        # Parameters updated by the world-model optimiser.
        world_modules = (self.transition_model, self.observation_model,
                         self.reward_model, self.encoder)
        self.world_param = [
            param for module in world_modules
            for param in module.parameters()
        ]
        if args.pcont:
            self.world_param.extend(self.pcont_model.parameters())

        # Optimisers
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        self.value_optimizer = optim.Adam(list(self.value_model.parameters()),
                                          lr=args.value_lr)

        # Allowed deviation in KL divergence
        self.free_nats = torch.full((1, ),
                                    args.free_nats,
                                    dtype=torch.float32,
                                    device=device)

    def process_im(self, image):
        """Turn a raw image into a network-ready observation tensor.

        The image is resized to 64x64, its axes are permuted with
        ``transpose(2, 0, 1)`` (channel first, assuming an H x W x C input),
        it is quantised to ``self.args.bit_depth`` bits and centred, uniform
        dequantisation noise is added, and a leading batch dimension is
        prepended.
        """
        resized = cv2.resize(image, (64, 64), interpolation=cv2.INTER_LINEAR)
        observation = torch.tensor(resized.transpose(2, 0, 1),
                                   dtype=torch.float32)

        bit_depth = self.args.bit_depth
        levels = 2**bit_depth

        # Quantise to the given bit depth and centre (in place).
        observation.div_(2**(8 - bit_depth)).floor_().div_(levels).sub_(0.5)

        # Dequantise (to approx. match likelihood of PDF of continuous images
        # vs. PMF of discrete images).
        noise = torch.rand_like(observation).div_(levels)
        observation.add_(noise)

        return observation.unsqueeze(dim=0)

    def _compute_loss_world(self, state, data):
        """Compute the world-model loss terms.

        :param state: 7-tuple ``(beliefs, prior_states, prior_means,
            prior_std_devs, posterior_states, posterior_means,
            posterior_std_devs)``.
        :param data: 3-tuple ``(observations, rewards, nonterminals)``.
        :return: ``(observation_loss, scaled_reward_loss, kl_loss,
            scaled_pcont_loss)``; the last entry is ``0`` when
            ``self.args.pcont`` is falsy.
        """
        (beliefs, prior_states, prior_means, prior_std_devs, posterior_states,
         posterior_means, posterior_std_devs) = state
        observations, rewards, nonterminals = data
        latent = (beliefs, posterior_states)

        # Reconstruction error: summed over the observation dimensions, then
        # averaged over the first two dimensions.
        obs_dims = 2 if self.args.symbolic else (2, 3, 4)
        reconstruction = bottle(self.observation_model, latent)
        observation_loss = F.mse_loss(reconstruction,
                                      observations,
                                      reduction='none')
        observation_loss = observation_loss.sum(dim=obs_dims).mean(dim=(0, 1))

        predicted_reward = bottle(self.reward_model, latent)
        reward_loss = F.mse_loss(predicted_reward, rewards,
                                 reduction='none').mean(dim=(0, 1))  # TODO: 5

        # Transition loss: KL(posterior || prior), bounded below by free_nats.
        posterior_dist = Independent(
            Normal(posterior_means, posterior_std_devs), 1)
        prior_dist = Independent(Normal(prior_means, prior_std_devs), 1)
        divergence = kl_divergence(posterior_dist, prior_dist)
        kl_loss = torch.max(divergence, self.free_nats).mean(dim=(0, 1))

        if self.args.pcont:
            pcont_loss = F.binary_cross_entropy(
                bottle(self.pcont_model, latent), nonterminals)

        scaled_reward_loss = self.args.reward_scale * reward_loss
        if self.args.pcont:
            scaled_pcont_loss = self.args.pcont_scale * pcont_loss
        else:
            scaled_pcont_loss = 0
        return observation_loss, scaled_reward_loss, kl_loss, scaled_pcont_loss

    def _compute_loss_actor(self,
                            imag_beliefs,
                            imag_states,
                            imag_ac_logps=None):
        """Return the actor loss over an imagined trajectory.

        Rewards and values are predicted for the imagined beliefs/states, the
        continuation factor comes from the pcont model (or the constant
        discount), and the loss is the negative mean of the discounted
        returns from ``cal_returns``. When ``imag_ac_logps`` is given,
        ``temp * logp`` is subtracted in place from the values after the
        first step.
        """
        latent = (imag_beliefs, imag_states)
        # reward and value prediction of imagined trajectories
        predicted_rewards = bottle(self.reward_model, latent)
        predicted_values = bottle(self.value_model, latent)

        with torch.no_grad():
            if self.args.pcont:
                continuation = bottle(self.pcont_model, latent)
            else:
                continuation = self.args.discount * torch.ones_like(
                    predicted_rewards)
        continuation = continuation.detach()

        if imag_ac_logps is not None:
            # add entropy here
            predicted_values[1:] -= self.args.temp * imag_ac_logps

        lambda_returns = cal_returns(predicted_rewards[:-1],
                                     predicted_values[:-1],
                                     predicted_values[-1],
                                     continuation[:-1],
                                     lambda_=self.args.disclam)

        # Cumulative discount weights: 1 for the first step, then the running
        # product of the continuation factors.
        leading_one = torch.ones_like(continuation[:1])
        factors = torch.cat([leading_one, continuation[:-2]], 0)
        weights = torch.cumprod(factors, 0).detach()

        return -torch.mean(weights * lambda_returns)

    def _compute_loss_critic(self,
                             imag_beliefs,
                             imag_states,
                             imag_ac_logps=None):
        """Return the value-model loss over an imagined trajectory.

        Targets are built without gradients from the target value network,
        the reward model and the continuation factor, then passed through
        ``cal_returns``. The loss is the MSE between the value model's
        predictions (all but the last step) and those targets, averaged over
        the first two dimensions.
        """
        latent = (imag_beliefs, imag_states)

        with torch.no_grad():
            # calculate the target with the target nn
            bootstrap_values = bottle(self.target_value_model, latent)
            predicted_rewards = bottle(self.reward_model, latent)

            if self.args.pcont:
                continuation = bottle(self.pcont_model, latent)
            else:
                continuation = self.args.discount * torch.ones_like(
                    predicted_rewards)

            if imag_ac_logps is not None:
                bootstrap_values[1:] -= self.args.temp * imag_ac_logps

        lambda_returns = cal_returns(predicted_rewards[:-1],
                                     bootstrap_values[:-1],
                                     bootstrap_values[-1],
                                     continuation[:-1],
                                     lambda_=self.args.disclam)
        targets = lambda_returns.detach()

        predictions = bottle(self.value_model, latent)[:-1]

        elementwise = F.mse_loss(predictions, targets, reduction="none")
        return elementwise.mean(dim=(0, 1))

    def _latent_imagination(self,
                            beliefs,
                            posterior_states,
                            with_logprob=False):
        # Rollout to generate imagined trajectories

        chunk_size, batch_size, _ = list(
            posterior_states.size())  # flatten the tensor
        flatten_size = chunk_size * batch_size

        posterior_states = posterior_states.detach().reshape(flatten_size, -1)
        beliefs = beliefs.detach().reshape(flatten_size, -1)

        imag_beliefs, imag_states, imag_ac_logps = [beliefs
                                                    ], [posterior_states], []

        for i in range(self.args.planning_horizon):
            imag_action, imag_ac_logp = self.actor_model(
                imag_beliefs[-1].detach(),
                imag_states[-1].detach(),
                deterministic=False,
                with_logprob=with_logprob,
            )
            imag_action = imag_action.unsqueeze(dim=0)  # add time dim

            imag_belief, imag_state, _, _ = self.transition_model(
                imag_states[-1], imag_action, imag_beliefs[-1])
            imag_beliefs.append(imag_belief.squeeze(dim=0))
            imag_states.append(imag_state.squeeze(dim=0))

            if with_logprob:
                imag_ac_logps.append(imag_ac_logp.squeeze(dim=0))

        imag_beliefs = torch.stack(imag_beliefs, dim=0).to(
            self.args.device
        )  # shape [horizon+1, (chuck-1)*batch, belief_size]
        imag_states = torch.stack(imag_states, dim=0).to(self.args.device)

        if with_logprob:
            imag_ac_logps = torch.stack(imag_ac_logps, dim=0).to(
                self.args.device)  # shape [horizon, (chuck-1)*batch]

        return imag_beliefs, imag_states, imag_ac_logps if with_logprob else None

    def update_parameters(self, data, gradient_steps):
        """Run ``gradient_steps`` updates of world model, actor and critic.

        :param data: 4-tuple ``(observations, actions, rewards,
            nonterminals)``; the same batch is used for every step.
        :param gradient_steps: number of update iterations.
        :return: list with one ``[observation, reward, kl, pcont, actor,
            critic]`` loss row per step (pcont is ``0`` when disabled).

        After the loop the target value network is synchronised with the
        value network.
        """

        def _set_world_and_value_trainable(flag):
            # Toggle gradients for the world model and the value model.
            for param in self.world_param:
                param.requires_grad = flag
            for param in self.value_model.parameters():
                param.requires_grad = flag

        history = []  # used to record loss
        for _step in tqdm(range(gradient_steps)):
            # get state and belief of samples
            observations, actions, rewards, nonterminals = data

            start_belief = torch.zeros(self.args.batch_size,
                                       self.args.belief_size,
                                       device=self.args.device)
            start_state = torch.zeros(self.args.batch_size,
                                      self.args.state_size,
                                      device=self.args.device)

            # Update belief/state using posterior from previous belief/state,
            # previous action and current observation (over entire sequence
            # at once)
            embedded = bottle(self.encoder, (observations, ))
            rollout = self.transition_model(start_state, actions,
                                            start_belief, embedded,
                                            nonterminals)  # TODO: 4
            (beliefs, prior_states, prior_means, prior_std_devs,
             posterior_states, posterior_means, posterior_std_devs) = rollout

            # ---- world model update ----
            observation_loss, reward_loss, kl_loss, pcont_loss = (
                self._compute_loss_world(
                    state=(beliefs, prior_states, prior_means, prior_std_devs,
                           posterior_states, posterior_means,
                           posterior_std_devs),
                    data=(observations, rewards, nonterminals)))
            self.world_optimizer.zero_grad()
            (observation_loss + reward_loss + kl_loss + pcont_loss).backward()
            nn.utils.clip_grad_norm_(self.world_param,
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.world_optimizer.step()

            # freeze params to save memory
            _set_world_and_value_trainable(False)

            # latent imagination
            imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination(
                beliefs, posterior_states, with_logprob=self.args.with_logprob)

            # ---- actor update ----
            actor_loss = self._compute_loss_actor(imag_beliefs,
                                                  imag_states,
                                                  imag_ac_logps=imag_ac_logps)
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.actor_optimizer.step()

            _set_world_and_value_trainable(True)

            # ---- critic update (on detached imagined latents) ----
            imag_beliefs = imag_beliefs.detach()
            imag_states = imag_states.detach()

            critic_loss = self._compute_loss_critic(
                imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)
            self.value_optimizer.zero_grad()
            critic_loss.backward()
            nn.utils.clip_grad_norm_(self.value_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.value_optimizer.step()

            row = [
                observation_loss.item(),
                reward_loss.item(),
                kl_loss.item(),
                pcont_loss.item() if self.args.pcont else 0,
                actor_loss.item(),
                critic_loss.item()
            ]
            history.append(row)

        # finally, update target value function every #gradient_steps
        with torch.no_grad():
            self.target_value_model.load_state_dict(
                self.value_model.state_dict())

        return history

    def infer_state(self, observation, action, belief=None, state=None):
        """Run one posterior step of the transition model on a new observation.

        ``observation`` is passed through ``self.encoder``; the encoded
        observation and ``action`` both receive a leading time dimension
        before being handed to ``self.transition_model`` together with the
        previous ``state`` and ``belief``.

        Returns:
            ``(belief, posterior_state)`` with the leading time dimension
            squeezed away again.
        """
        # The transition model is fed sequences, so add a length-1 time axis.
        embedding = self.encoder(observation).unsqueeze(dim=0)
        timed_action = action.unsqueeze(dim=0)

        rollout = self.transition_model(state, timed_action, belief, embedding)
        next_belief, _, _, _, next_posterior, _, _ = rollout

        return next_belief.squeeze(dim=0), next_posterior.squeeze(dim=0)

    def select_action(self, state, deterministic=False):
        """Query the actor for an action given an inferred latent state.

        Args:
            state: ``(belief, posterior_state)`` pair, e.g. as returned by
                ``infer_state``.
            deterministic: forwarded to the actor; when true no exploration
                noise is added.

        Returns:
            The action tensor. Gaussian exploration noise with std
            ``args.expl_amount`` is added, and the result clamped to
            ``[-1, 1]``, only when acting stochastically and
            ``args.with_logprob`` is off.
        """
        belief, posterior_state = state
        action, _ = self.actor_model(belief,
                                     posterior_state,
                                     deterministic=deterministic,
                                     with_logprob=False)

        # No extra noise for deterministic actions or when with_logprob is set.
        if deterministic or self.args.with_logprob:
            return action

        noisy_action = Normal(action, self.args.expl_amount).rsample()
        return noisy_action.clamp(min=-1, max=1)  # tensor
Пример #6
0
    def __init__(self, args):
        """Create every network and optimizer of the agent from ``args``.

        :param args: configuration object whose attributes (sizes, activation
            names, learning rates, ``device``, ``pcont`` flag, ...) are read
            below
        """
        super().__init__()
        self.args = args

        device = args.device
        belief_size = args.belief_size
        state_size = args.state_size
        hidden_size = args.hidden_size
        embedding_size = args.embedding_size

        # World model. The construction order is deliberately left as it was:
        # presumably each constructor draws its initial weights from the
        # global RNG, so reordering would change the initialisation.
        self.transition_model = TransitionModel(
            belief_size, state_size, args.action_size, hidden_size,
            embedding_size, args.dense_act).to(device=device)

        # Symbolic observations use the dense activation, images the CNN one.
        decoder_act = args.dense_act if args.symbolic else args.cnn_act
        self.observation_model = ObservationModel(
            args.symbolic, args.observation_size, belief_size, state_size,
            embedding_size, activation_function=decoder_act).to(device=device)

        self.reward_model = RewardModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.encoder = Encoder(
            args.symbolic, args.observation_size, embedding_size,
            args.cnn_act).to(device=device)

        # Actor and critic.
        self.actor_model = ActorModel(
            args.action_size, belief_size, state_size, hidden_size,
            activation_function=args.dense_act).to(device=device)

        self.value_model = ValueModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.pcont_model = PCONTModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        # The target critic is a copy that never receives gradients.
        self.target_value_model = deepcopy(self.value_model)
        for target_param in self.target_value_model.parameters():
            target_param.requires_grad = False

        # Parameters trained by the world-model optimizer; the pcont head is
        # included only when args.pcont is set.
        world_modules = [
            self.transition_model, self.observation_model, self.reward_model,
            self.encoder
        ]
        if args.pcont:
            world_modules.append(self.pcont_model)
        self.world_param = [
            param for module in world_modules for param in module.parameters()
        ]

        # Optimizers.
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        critic_params = list(self.value_model.parameters())
        self.value_optimizer = optim.Adam(critic_params, lr=args.value_lr)

        # Allowed deviation in KL divergence, kept as a 1-element tensor.
        self.free_nats = torch.full((1, ),
                                    args.free_nats,
                                    dtype=torch.float32,
                                    device=device)
Пример #7
0
# Model: build every network and move it to `device`.
transition_model = TransitionModel(
    args.belief_size,
    args.state_size,
    env.action_size,
    args.hidden_size,
    args.embedding_size,
    args.dense_activation_function,
).to(device)
observation_model = ObservationModel(
    env.observation_size,
    args.belief_size,
    args.state_size,
    args.embedding_size,
    args.cnn_activation_function,
).to(device)
reward_model = RewardModel(
    args.belief_size,
    args.state_size,
    args.hidden_size,
    args.dense_activation_function,
).to(device)
pcont_model = PcontModel(
    args.belief_size,
    args.state_size,
    args.hidden_size,
    args.dense_activation_function,
).to(device)
encoder = Encoder(
    env.observation_size,
    args.embedding_size,
    args.cnn_activation_function,
).to(device)
actor_model = ActorModel(
    args.belief_size,
    args.state_size,
    args.hidden_size,
    env.action_size,
    args.action_dist,
    args.dense_activation_function,
).to(device)
# enabling doubleQ?
value_model = ValueModel(
    args.belief_size,
    args.state_size,
    args.hidden_size,
    args.dense_activation_function,
    doubleQ=False,
).to(device)

# Param List: everything the model optimizer trains; the pcont head is added
# only when args.pcont is set.
param_list = [
    param
    for module in (transition_model, observation_model, reward_model, encoder)
    for param in module.parameters()
]
if args.pcont:
    param_list.extend(pcont_model.parameters())

# Optimizer: each one starts at lr=0 whenever a learning-rate schedule is
# configured (args.learning_rate_schedule != 0), otherwise at its own rate.
model_optimizer = optim.Adam(
    param_list,
    lr=args.model_learning_rate if args.learning_rate_schedule == 0 else 0,
    eps=args.adam_epsilon,
)
actor_optimizer = optim.Adam(
    actor_model.parameters(),
    lr=args.actor_learning_rate if args.learning_rate_schedule == 0 else 0,
    eps=args.adam_epsilon,
)
value_optimizer = optim.Adam(
    value_model.parameters(),
    lr=args.value_learning_rate if args.learning_rate_schedule == 0 else 0,
    eps=args.adam_epsilon,
)
Пример #8
0
    def __init__(self, args):
        """Create the networks, optimizers and replay buffer from ``args``.

        :param args: configuration object whose attributes (sizes, activation
            names, learning rates, ``device``, ``pcont`` / ``auto_temp`` /
            ``fix_speed`` flags, ...) are read below
        """
        super().__init__()
        self.args = args

        device = args.device
        belief_size = args.belief_size
        state_size = args.state_size
        hidden_size = args.hidden_size
        embedding_size = args.embedding_size

        # World model. The construction order is deliberately left as it was:
        # presumably each constructor draws its initial weights from the
        # global RNG, so reordering would change the initialisation.
        self.transition_model = TransitionModel(
            belief_size, state_size, args.action_size, hidden_size,
            embedding_size, args.dense_act).to(device=device)

        # Symbolic observations use the dense activation, images the CNN one.
        decoder_act = args.dense_act if args.symbolic else args.cnn_act
        self.observation_model = ObservationModel(
            args.symbolic, args.observation_size, belief_size, state_size,
            embedding_size, activation_function=decoder_act).to(device=device)

        self.reward_model = RewardModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.encoder = Encoder(
            args.symbolic, args.observation_size, embedding_size,
            args.cnn_act).to(device=device)

        # Actor and the two critics.
        self.actor_model = ActorModel(
            args.action_size, belief_size, state_size, hidden_size,
            activation_function=args.dense_act,
            fix_speed=args.fix_speed,
            throttle_base=args.throttle_base).to(device=device)

        self.value_model = ValueModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.value_model2 = ValueModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.pcont_model = PCONTModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        # Target critics are copies that never receive gradients.
        self.target_value_model = deepcopy(self.value_model)
        self.target_value_model2 = deepcopy(self.value_model2)
        for target_param in self.target_value_model.parameters():
            target_param.requires_grad = False
        for target_param in self.target_value_model2.parameters():
            target_param.requires_grad = False

        # Parameters trained by the world-model optimizer; the pcont head is
        # included only when args.pcont is set.
        world_modules = [
            self.transition_model, self.observation_model, self.reward_model,
            self.encoder
        ]
        if args.pcont:
            world_modules.append(self.pcont_model)
        self.world_param = [
            param for module in world_modules for param in module.parameters()
        ]

        # Optimizers; both critics share a single optimizer.
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        critic_params = [
            *self.value_model.parameters(), *self.value_model2.parameters()
        ]
        self.value_optimizer = optim.Adam(critic_params, lr=args.value_lr)

        # Allowed deviation in KL divergence, kept as a 1-element tensor.
        self.free_nats = torch.full((1, ),
                                    args.free_nats,
                                    dtype=torch.float32,
                                    device=device)

        # TODO: change it to the new replay buffer, in buffer.py
        self.D = ExperienceReplay(args.experience_size, args.symbolic,
                                  args.observation_size, args.action_size,
                                  args.bit_depth, device)

        if self.args.auto_temp:
            # Learnable log-temperature of the entropy term (alpha).
            self.log_temp = torch.zeros(1, requires_grad=True, device=device)
            # Heuristic target from the SAC paper: minus the number of action
            # dimensions, one fewer when the speed is fixed.
            entropy_dims = (args.action_size -
                            1 if args.fix_speed else args.action_size)
            self.target_entropy = -np.prod(entropy_dims).item()
            # The temperature reuses the critic learning rate.
            self.temp_optimizer = optim.Adam([self.log_temp],
                                             lr=args.value_lr)
Пример #9
0
class Dreamer(Agent):
    # The agent has its own replay buffer, update, act
    def __init__(self, args):
        """Create the networks, optimizers and replay buffer from ``args``.

        :param args: configuration object whose attributes (sizes, activation
            names, learning rates, ``device``, ``pcont`` / ``auto_temp`` /
            ``fix_speed`` flags, ...) are read below
        """
        super().__init__()
        self.args = args

        device = args.device
        belief_size = args.belief_size
        state_size = args.state_size
        hidden_size = args.hidden_size
        embedding_size = args.embedding_size

        # World model. The construction order is deliberately left as it was:
        # presumably each constructor draws its initial weights from the
        # global RNG, so reordering would change the initialisation.
        self.transition_model = TransitionModel(
            belief_size, state_size, args.action_size, hidden_size,
            embedding_size, args.dense_act).to(device=device)

        # Symbolic observations use the dense activation, images the CNN one.
        decoder_act = args.dense_act if args.symbolic else args.cnn_act
        self.observation_model = ObservationModel(
            args.symbolic, args.observation_size, belief_size, state_size,
            embedding_size, activation_function=decoder_act).to(device=device)

        self.reward_model = RewardModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.encoder = Encoder(
            args.symbolic, args.observation_size, embedding_size,
            args.cnn_act).to(device=device)

        # Actor and the two critics.
        self.actor_model = ActorModel(
            args.action_size, belief_size, state_size, hidden_size,
            activation_function=args.dense_act,
            fix_speed=args.fix_speed,
            throttle_base=args.throttle_base).to(device=device)

        self.value_model = ValueModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.value_model2 = ValueModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        self.pcont_model = PCONTModel(
            belief_size, state_size, hidden_size,
            args.dense_act).to(device=device)

        # Target critics are copies that never receive gradients.
        self.target_value_model = deepcopy(self.value_model)
        self.target_value_model2 = deepcopy(self.value_model2)
        for target_param in self.target_value_model.parameters():
            target_param.requires_grad = False
        for target_param in self.target_value_model2.parameters():
            target_param.requires_grad = False

        # Parameters trained by the world-model optimizer; the pcont head is
        # included only when args.pcont is set.
        world_modules = [
            self.transition_model, self.observation_model, self.reward_model,
            self.encoder
        ]
        if args.pcont:
            world_modules.append(self.pcont_model)
        self.world_param = [
            param for module in world_modules for param in module.parameters()
        ]

        # Optimizers; both critics share a single optimizer.
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        critic_params = [
            *self.value_model.parameters(), *self.value_model2.parameters()
        ]
        self.value_optimizer = optim.Adam(critic_params, lr=args.value_lr)

        # Allowed deviation in KL divergence, kept as a 1-element tensor.
        self.free_nats = torch.full((1, ),
                                    args.free_nats,
                                    dtype=torch.float32,
                                    device=device)

        # TODO: change it to the new replay buffer, in buffer.py
        self.D = ExperienceReplay(args.experience_size, args.symbolic,
                                  args.observation_size, args.action_size,
                                  args.bit_depth, device)

        if self.args.auto_temp:
            # Learnable log-temperature of the entropy term (alpha).
            self.log_temp = torch.zeros(1, requires_grad=True, device=device)
            # Heuristic target from the SAC paper: minus the number of action
            # dimensions, one fewer when the speed is fixed.
            entropy_dims = (args.action_size -
                            1 if args.fix_speed else args.action_size)
            self.target_entropy = -np.prod(entropy_dims).item()
            # The temperature reuses the critic learning rate.
            self.temp_optimizer = optim.Adam([self.log_temp],
                                             lr=args.value_lr)

        # TODO: print out the number of parameters used in Dreamer

    # def process_im(self, image, image_size=None, rgb=None):
    #   # Resize, put channel first, convert it to a tensor, centre it to [-0.5, 0.5] and add batch dimenstion.
    #
    #   def preprocess_observation_(observation, bit_depth):
    #     # Preprocesses an observation inplace (from float32 Tensor [0, 255] to [-0.5, 0.5])
    #     observation.div_(2 ** (8 - bit_depth)).floor_().div_(2 ** bit_depth).sub_(
    #       0.5)  # Quantise to given bit depth and centre
    #     observation.add_(torch.rand_like(observation).div_(
    #       2 ** bit_depth))  # Dequantise (to approx. match likelihood of PDF of continuous images vs. PMF of discrete images)
    #
    #   image = image[40:, :, :]  # clip the above 40 rows
    #   image = torch.tensor(cv2.resize(image, (40, 40), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1),
    #                         dtype=torch.float32)  # Resize and put channel first
    #
    #   preprocess_observation_(image, self.args.bit_depth)
    #   return image.unsqueeze(dim=0)
    def process_im(self, images, image_size=None, rgb=None):
        """Turn a raw camera frame into a network-ready grayscale tensor.

        The frame is resized to 40x40, collapsed to one channel with the
        weights (0.299, 0.587, 0.114) and rescaled from [0, 255] to
        [-0.5, 0.5].

        ``image_size`` and ``rgb`` are accepted but not used.

        :return: float32 tensor of shape [1, 1, 40, 40] (batch, channel, H, W)
        """
        resized = cv2.resize(images, (40, 40))
        grayscale = np.dot(resized, [0.299, 0.587, 0.114])

        obs = torch.tensor(grayscale, dtype=torch.float32)
        obs.div_(255.).sub_(0.5)  # range: [-0.5, 0.5]

        # First unsqueeze adds the channel axis, the second the batch axis.
        return obs.unsqueeze(dim=0).unsqueeze(dim=0)

    def append_buffer(self, new_traj):
        """Store a freshly collected trajectory in the replay buffer.

        :param new_traj: iterable of ``(observation, action, reward, done)``
            steps; no data augmentation is applied
        """
        for observation, action, reward, done in new_traj:
            # Actions are moved to the CPU before being stored.
            self.D.append(observation, action.cpu(), reward, done)

    def _compute_loss_world(self, state, data):
        """Compute the world-model losses for one sampled batch.

        :param state: 7-tuple ``(beliefs, prior_states, prior_means,
            prior_std_devs, posterior_states, posterior_means,
            posterior_std_devs)`` as produced by the transition model
        :param data: ``(observations, rewards, nonterminals)`` from the buffer
        :return: ``(observation_loss, scaled reward_loss, kl_loss,
            scaled pcont_loss)``; the last entry is the int ``0`` when
            ``args.pcont`` is off
        """
        (beliefs, prior_states, prior_means, prior_std_devs, posterior_states,
         posterior_means, posterior_std_devs) = state
        observations, rewards, nonterminals = data
        latent = (beliefs, posterior_states)

        # Reconstruction: squared error summed over the observation
        # dimensions, then averaged over the two leading dimensions. The
        # targets are used as sampled, without any time shift.
        observation_dims = 2 if self.args.symbolic else (2, 3, 4)
        reconstruction_error = F.mse_loss(bottle(self.observation_model,
                                                 latent),
                                          observations,
                                          reduction='none')
        observation_loss = reconstruction_error.sum(
            dim=observation_dims).mean(dim=(0, 1))

        # Reward prediction.
        reward_error = F.mse_loss(bottle(self.reward_model, latent),
                                  rewards,
                                  reduction='none')
        reward_loss = reward_error.mean(dim=(0, 1))  # TODO: 5

        # Transition loss: KL(posterior || prior), floored at free_nats.
        posterior_dist = Independent(
            Normal(posterior_means, posterior_std_devs), 1)
        prior_dist = Independent(Normal(prior_means, prior_std_devs), 1)
        kl_loss = torch.max(kl_divergence(posterior_dist, prior_dist),
                            self.free_nats).mean(dim=(0, 1))

        # Optional continuation-probability head.
        if self.args.pcont:
            pcont_loss = self.args.pcont_scale * F.binary_cross_entropy(
                bottle(self.pcont_model, latent), nonterminals)
        else:
            pcont_loss = 0

        scaled_reward_loss = self.args.reward_scale * reward_loss
        return observation_loss, scaled_reward_loss, kl_loss, pcont_loss

    def _compute_loss_actor(self,
                            imag_beliefs,
                            imag_states,
                            imag_ac_logps=None):
        """Actor loss: negative discounted return over imagined trajectories.

        :param imag_beliefs: imagined beliefs, time-major
        :param imag_states: imagined states, time-major
        :param imag_ac_logps: optional action log-probabilities; when given,
            ``args.temp * logp`` is subtracted from the values of every step
            after the first (entropy term)
        :return: scalar loss tensor
        """
        latent = (imag_beliefs, imag_states)

        # Rewards and the pessimistic (minimum) value estimate of both critics.
        imag_rewards = bottle(self.reward_model, latent)
        first_values = bottle(self.value_model, latent)
        second_values = bottle(self.value_model2, latent)
        imag_values = torch.min(first_values, second_values)

        # Per-step discount: predicted continuation probability or a constant.
        with torch.no_grad():
            pcont = (bottle(self.pcont_model, latent) if self.args.pcont else
                     self.args.discount * torch.ones_like(imag_rewards))
        pcont = pcont.detach()

        if imag_ac_logps is not None:
            # add entropy here
            imag_values[1:] -= self.args.temp * imag_ac_logps

        returns = cal_returns(imag_rewards[:-1],
                              imag_values[:-1],
                              imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)

        # Cumulative discount: a leading one followed by the running product
        # of pcont, cut to the same length as `returns`.
        leading_one = torch.ones_like(pcont[:1])
        discount = torch.cumprod(torch.cat([leading_one, pcont[:-2]], 0),
                                 0).detach()

        assert list(discount.size()) == list(returns.size())
        return -torch.mean(discount * returns)

    def _compute_loss_critic(self,
                             imag_beliefs,
                             imag_states,
                             imag_ac_logps=None):
        """Critic loss: both critics regress onto a shared return target.

        The target is built without gradients from the minimum of the two
        target critics, the reward model and the discount.

        :param imag_beliefs: imagined beliefs, time-major
        :param imag_states: imagined states, time-major
        :param imag_ac_logps: optional action log-probabilities; when given,
            ``args.temp * logp`` is subtracted from the target values of
            every step after the first
        :return: scalar tensor, the sum of the two critics' MSE losses
        """
        latent = (imag_beliefs, imag_states)

        with torch.no_grad():
            # Calculate the target with the target networks.
            first_target = bottle(self.target_value_model, latent)
            second_target = bottle(self.target_value_model2, latent)
            target_imag_values = torch.min(first_target, second_target)
            imag_rewards = bottle(self.reward_model, latent)

            pcont = (bottle(self.pcont_model, latent) if self.args.pcont else
                     self.args.discount * torch.ones_like(imag_rewards))

            if imag_ac_logps is not None:
                target_imag_values[1:] -= self.args.temp * imag_ac_logps

        target_return = cal_returns(imag_rewards[:-1],
                                    target_imag_values[:-1],
                                    target_imag_values[-1],
                                    pcont[:-1],
                                    lambda_=self.args.disclam).detach()

        # The last time step only serves as the bootstrap value, so it is
        # dropped from the predictions.
        first_pred = bottle(self.value_model, latent)[:-1]
        second_pred = bottle(self.value_model2, latent)[:-1]

        first_loss = F.mse_loss(first_pred, target_return,
                                reduction="none").mean(dim=(0, 1))
        second_loss = F.mse_loss(second_pred, target_return,
                                 reduction="none").mean(dim=(0, 1))
        return first_loss + second_loss

    def _latent_imagination(self,
                            beliefs,
                            posterior_states,
                            with_logprob=False):
        # Rollout to generate imagined trajectories

        chunk_size, batch_size, _ = list(
            posterior_states.size())  # flatten the tensor
        flatten_size = chunk_size * batch_size

        posterior_states = posterior_states.detach().reshape(flatten_size, -1)
        beliefs = beliefs.detach().reshape(flatten_size, -1)

        imag_beliefs, imag_states, imag_ac_logps = [beliefs
                                                    ], [posterior_states], []

        for i in range(self.args.planning_horizon):
            imag_action, imag_ac_logp = self.actor_model(
                imag_beliefs[-1].detach(),
                imag_states[-1].detach(),
                deterministic=False,
                with_logprob=with_logprob,
            )
            imag_action = imag_action.unsqueeze(dim=0)  # add time dim

            # print(imag_states[-1].shape, imag_action.shape, imag_beliefs[-1].shape)
            imag_belief, imag_state, _, _ = self.transition_model(
                imag_states[-1], imag_action, imag_beliefs[-1])
            imag_beliefs.append(imag_belief.squeeze(dim=0))
            imag_states.append(imag_state.squeeze(dim=0))
            if with_logprob:
                imag_ac_logps.append(imag_ac_logp.squeeze(dim=0))

        imag_beliefs = torch.stack(imag_beliefs, dim=0).to(
            self.args.device
        )  # shape [horizon+1, (chuck-1)*batch, belief_size]
        imag_states = torch.stack(imag_states, dim=0).to(self.args.device)
        if with_logprob:
            imag_ac_logps = torch.stack(imag_ac_logps, dim=0).to(
                self.args.device)  # shape [horizon, (chuck-1)*batch]

        return imag_beliefs, imag_states, imag_ac_logps if with_logprob else None

    def update_parameters(self, gradient_steps):
        """Run ``gradient_steps`` training iterations and return the loss log.

        Each iteration samples a batch of sequence chunks from ``self.D`` and
        then, in order:

        1. fits the world model (observation, reward, KL and optional pcont
           losses) with ``self.world_optimizer``;
        2. imagines latent trajectories starting from the posterior states;
        3. optionally updates the entropy temperature (``args.auto_temp``);
        4. updates the actor while the world model and both critics are
           frozen;
        5. updates both critics on the detached imagined trajectories.

        After the loop both target critics are overwritten with the current
        critic weights.

        :param gradient_steps: number of iterations to run
        :return: list with one entry per iteration, each
            ``[observation, reward, kl, pcont, actor, critic]`` loss values
        """
        loss_info = []  # used to record loss

        # Parameters the actor loss back-propagates through but must not
        # train: the world model and BOTH critics.
        frozen_for_actor = (list(self.world_param) +
                            list(self.value_model.parameters()) +
                            list(self.value_model2.parameters()))

        for _ in tqdm(range(gradient_steps)):
            # get state and belief of samples
            observations, actions, rewards, nonterminals = self.D.sample(
                self.args.batch_size, self.args.chunk_size)
            init_belief = torch.zeros(self.args.batch_size,
                                      self.args.belief_size,
                                      device=self.args.device)
            init_state = torch.zeros(self.args.batch_size,
                                     self.args.state_size,
                                     device=self.args.device)

            # Update belief/state over the entire sampled sequence at once.
            # The sampled tensors are used as they are, without shifting
            # actions against observations.
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
                init_state, actions, init_belief,
                bottle(self.encoder, (observations, )),
                nonterminals)  # TODO: 4

            # update paras of world model
            world_model_loss = self._compute_loss_world(
                state=(beliefs, prior_states, prior_means, prior_std_devs,
                       posterior_states, posterior_means, posterior_std_devs),
                data=(observations, rewards, nonterminals))
            observation_loss, reward_loss, kl_loss, pcont_loss = world_model_loss
            self.world_optimizer.zero_grad()
            (observation_loss + reward_loss + kl_loss + pcont_loss).backward()
            nn.utils.clip_grad_norm_(self.world_param,
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.world_optimizer.step()

            # Freeze params to save memory. The second critic used to be
            # "frozen" through a misspelt attribute (`requires_gard`), which
            # only created a stray attribute and left it trainable.
            for p in frozen_for_actor:
                p.requires_grad = False

            # latent imagination
            imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination(
                beliefs, posterior_states, with_logprob=self.args.with_logprob)

            # update temp
            if self.args.auto_temp:
                # NOTE(review): this indexes imag_ac_logps, which is None when
                # args.with_logprob is off -- auto_temp presumably requires
                # with_logprob; confirm against the argument parser.
                temp_loss = -(
                    self.log_temp *
                    (imag_ac_logps[0] + self.target_entropy).detach()).mean()
                self.temp_optimizer.zero_grad()
                temp_loss.backward()
                self.temp_optimizer.step()
                self.args.temp = self.log_temp.exp()

            # update actor
            actor_loss = self._compute_loss_actor(imag_beliefs,
                                                  imag_states,
                                                  imag_ac_logps=imag_ac_logps)

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.actor_optimizer.step()

            # Unfreeze everything again for the following updates.
            for p in frozen_for_actor:
                p.requires_grad = True

            # update critic
            imag_beliefs = imag_beliefs.detach()
            imag_states = imag_states.detach()

            critic_loss = self._compute_loss_critic(
                imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)

            self.value_optimizer.zero_grad()
            critic_loss.backward()
            # Each critic is clipped on its own gradient norm.
            nn.utils.clip_grad_norm_(self.value_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            nn.utils.clip_grad_norm_(self.value_model2.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.value_optimizer.step()

            loss_info.append([
                observation_loss.item(),
                reward_loss.item(),
                kl_loss.item(),
                pcont_loss.item() if self.args.pcont else 0,
                actor_loss.item(),
                critic_loss.item()
            ])

        # finally, update target value function every #gradient_steps
        with torch.no_grad():
            self.target_value_model.load_state_dict(
                self.value_model.state_dict())
            self.target_value_model2.load_state_dict(
                self.value_model2.state_dict())

        return loss_info

    def infer_state(self, observation, action, belief=None, state=None):
        """Run one posterior step of the transition model on a new observation.

        ``observation`` is passed through ``self.encoder``; the encoded
        observation and ``action`` both receive a leading time dimension
        before being handed to ``self.transition_model`` together with the
        previous ``state`` and ``belief``.

        Returns:
            ``(belief, posterior_state)`` with the leading time dimension
            squeezed away again.
        """
        # The transition model is fed sequences, so add a length-1 time axis.
        embedding = self.encoder(observation).unsqueeze(dim=0)
        timed_action = action.unsqueeze(dim=0)

        rollout = self.transition_model(state, timed_action, belief, embedding)
        next_belief, _, _, _, next_posterior, _, _ = rollout

        return next_belief.squeeze(dim=0), next_posterior.squeeze(dim=0)

    def select_action(self, state, deterministic=False):
        """Query the actor for an action given an inferred latent state.

        When acting stochastically with ``args.with_logprob`` off, Gaussian
        exploration noise (std ``args.expl_amount``) is added and the result
        is brought back into range: column 0 (angle) is clamped to
        ``[args.angle_min, args.angle_max]``; column 1 (throttle) is either
        overwritten with ``args.throttle_base`` (``args.fix_speed``) or
        clamped to ``[args.throttle_min, args.throttle_max]``.

        :param state: ``(belief, posterior_state)`` pair, e.g. as returned by
            ``infer_state``
        :param deterministic: forwarded to the actor; also disables the
            exploration noise
        :return: the action as a tensor (not converted to numpy), indexed
            above as [batch, action component]
        """
        belief, posterior_state = state
        action, _ = self.actor_model(belief,
                                     posterior_state,
                                     deterministic=deterministic,
                                     with_logprob=False)

        if not deterministic and not self.args.with_logprob:
            # add exploration noise
            action = Normal(action, self.args.expl_amount).rsample()

            # clip the angle
            action[:, 0].clamp_(min=self.args.angle_min,
                                max=self.args.angle_max)
            # clip the throttle
            if self.args.fix_speed:
                action[:, 1] = self.args.throttle_base
            else:
                action[:, 1].clamp_(min=self.args.throttle_min,
                                    max=self.args.throttle_max)

        # Debug prints that used to run on every call were removed.
        return action

    def import_parameters(self, params):
        """Load the weights needed for local rollouts.

        :param params: mapping with the state dicts stored under the keys
            ``"encoder"``, ``"policy"`` and ``"transition"``
        """
        targets = (
            (self.encoder, "encoder"),
            (self.actor_model, "policy"),
            (self.transition_model, "transition"),
        )
        for module, key in targets:
            module.load_state_dict(params[key])

    def export_parameters(self):
        """Return the state dicts of the models used for local rollouts.

        Each of the encoder, actor and transition model is moved to the CPU
        before its state dict is taken; afterwards all three are moved back
        to ``args.device``.

        :return: dict with the keys ``"encoder"``, ``"policy"`` and
            ``"transition"``
        """
        rollout_modules = {
            "encoder": self.encoder,
            "policy": self.actor_model,
            "transition": self.transition_model,
        }

        params = {
            key: module.cpu().state_dict()
            for key, module in rollout_modules.items()
        }

        # Restore the training device once every snapshot has been taken.
        for module in rollout_modules.values():
            module.to(self.args.device)

        return params
Пример #10
0
from circular import CircularFun, CircularSpline
from models import ValueModel
from viz import value_plot

import matplotlib.pyplot as plt
import numpy as np

# value function
# NOTE(review): the meaning of the positional arguments (3, 5) and of `w` is
# defined by circular.CircularSpline, which is not visible here -- confirm.
v = CircularSpline(3, 5, w=[0, 0, 0, 1, 0])

# movement penalty function: a * x**2, with 10 supplied through `args`
# (presumably forwarded to the lambda as `a` -- verify against CircularFun)
p = CircularFun(lambda x, a: a*x**2, p_circ=0.5, args=[10])

# model built from the value function and the penalty
# NOTE(review): the trailing arguments 0, 10, 20 are opaque from here; see
# models.ValueModel for what they configure.
m = ValueModel(v, p, 0, 10, 20)

# simulation: call simulate(2000) on the model and plot the result
x = m.simulate(2000)
value_plot(x, m)
Пример #11
0
                         args.cnn_act)).to(device=args.device)

# Remaining networks of the agent, each moved to args.device right after
# construction.
reward_model = RewardModel(args.belief_size, args.state_size, args.hidden_size,
                           args.dense_act).to(device=args.device)

encoder = Encoder(args.symbolic, env.observation_size, args.embedding_size,
                  args.cnn_act).to(device=args.device)

actor_model = ActorModel(
    env.action_size,
    args.belief_size,
    args.state_size,
    args.hidden_size,
    activation_function=args.dense_act).to(device=args.device)

# Two value networks built with identical arguments.
value_model1 = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                          args.dense_act).to(device=args.device)

value_model2 = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                          args.dense_act).to(device=args.device)

pcont_model = PCONTModel(args.belief_size, args.state_size, args.hidden_size,
                         args.dense_act).to(device=args.device)

# Target networks: deep copies of the value networks with gradients disabled
# on every parameter.
target_value_model1 = deepcopy(value_model1)
for p in target_value_model1.parameters():
    p.requires_grad = False

target_value_model2 = deepcopy(value_model2)
for p in target_value_model2.parameters():
    p.requires_grad = False