Example #1
    def _compute_loss_actor(self,
                            imag_beliefs,
                            imag_states,
                            imag_ac_logps=None):
        # reward and value prediction of imagined trajectories
        imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states))
        imag_values = bottle(self.value_model, (imag_beliefs, imag_states))

        with torch.no_grad():
            if self.args.pcont:
                pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
            else:
                pcont = self.args.discount * torch.ones_like(imag_rewards)
        pcont = pcont.detach()

        if imag_ac_logps is not None:
            # soft values: subtract the entropy term (temperature * action log-prob)
            imag_values[1:] -= self.args.temp * imag_ac_logps

        returns = cal_returns(imag_rewards[:-1],
                              imag_values[:-1],
                              imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)

        discount = torch.cumprod(
            torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], 0),
            0).detach()

        actor_loss = -torch.mean(discount * returns)
        return actor_loss
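
The `cal_returns` helper is not defined in these examples. A minimal sketch of a lambda-return computed backwards over the time dimension, consistent with the call above (per-step rewards, values, continuation probabilities `pcont`, and a bootstrap value for the step after the last one); the actual helper may differ in details:

import torch

def cal_returns(reward, value, bootstrap, pcont, lambda_):
    # R_t = r_t + pcont_t * ((1 - lambda) * V_{t+1} + lambda * R_{t+1}),
    # computed backwards along dim 0 (time); bootstrap stands in for the value
    # of the step after the last reward.
    next_values = torch.cat([value[1:], bootstrap.unsqueeze(0)], dim=0)
    inputs = reward + pcont * next_values * (1 - lambda_)
    returns, last = [], bootstrap
    for t in reversed(range(reward.shape[0])):
        last = inputs[t] + pcont[t] * lambda_ * last
        returns.append(last)
    return torch.stack(returns[::-1], dim=0)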
Example #2
    def _compute_loss_critic(self,
                             imag_beliefs,
                             imag_states,
                             imag_ac_logps=None):

        with torch.no_grad():
            # calculate the target with the target nn
            target_imag_values = bottle(self.target_value_model,
                                        (imag_beliefs, imag_states))
            imag_rewards = bottle(self.reward_model,
                                  (imag_beliefs, imag_states))

            if self.args.pcont:
                pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
            else:
                pcont = self.args.discount * torch.ones_like(imag_rewards)

            if imag_ac_logps is not None:
                target_imag_values[1:] -= self.args.temp * imag_ac_logps

        returns = cal_returns(imag_rewards[:-1],
                              target_imag_values[:-1],
                              target_imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)
        target_return = returns.detach()

        value_pred = bottle(self.value_model, (imag_beliefs, imag_states))[:-1]

        value_loss = F.mse_loss(value_pred, target_return,
                                reduction="none").mean(dim=(0, 1))

        return value_loss
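
`bottle` is used throughout these examples but never shown. A minimal sketch, assuming it simply folds the leading (time, batch) dimensions into one batch dimension, applies the module, and unfolds the result:

import torch

def bottle(f, x_tuple):
    # Fold (time, batch, ...) into (time * batch, ...), apply the module,
    # then restore the (time, batch) leading dimensions on the output.
    T, B = x_tuple[0].shape[:2]
    flat = tuple(x.reshape(T * B, *x.shape[2:]) for x in x_tuple)
    y = f(*flat)
    return y.reshape(T, B, *y.shape[1:])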
Example #3
    def _compute_loss_world(self, state, data):
        # unpack data
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = state
        observations, rewards, nonterminals = data

        observation_loss = F.mse_loss(
            bottle(self.observation_model, (beliefs, posterior_states)),
            observations,
            reduction='none').sum(
                dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))

        reward_loss = F.mse_loss(bottle(self.reward_model,
                                        (beliefs, posterior_states)),
                                 rewards,
                                 reduction='none').mean(dim=(0, 1))  # TODO: 5

        # transition loss
        kl_loss = torch.max(
            kl_divergence(
                Independent(Normal(posterior_means, posterior_std_devs), 1),
                Independent(Normal(prior_means, prior_std_devs), 1)),
            self.free_nats).mean(dim=(0, 1))

        if self.args.pcont:
            pcont_loss = F.binary_cross_entropy(
                bottle(self.pcont_model, (beliefs, posterior_states)),
                nonterminals)

        return observation_loss, self.args.reward_scale * reward_loss, kl_loss, (
            self.args.pcont_scale * pcont_loss if self.args.pcont else 0)
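
Wrapping the Gaussians in `Independent(..., 1)` makes `kl_divergence` treat the last dimension as event dimensions, which is equivalent to the `.sum(dim=2)` form used in the other examples. A quick check:

import torch
from torch.distributions import Normal, Independent, kl_divergence

mu_q, std_q = torch.randn(10, 4, 30), torch.rand(10, 4, 30) + 0.1
mu_p, std_p = torch.randn(10, 4, 30), torch.rand(10, 4, 30) + 0.1

kl_event = kl_divergence(Independent(Normal(mu_q, std_q), 1),
                         Independent(Normal(mu_p, std_p), 1))  # shape (10, 4)
kl_sum = kl_divergence(Normal(mu_q, std_q), Normal(mu_p, std_p)).sum(dim=2)
assert torch.allclose(kl_event, kl_sum)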
Example #4
    def fit_buffer(self, episode):
        # Fit the world model on sequence chunks drawn from the replay buffer

        # Model fitting
        losses = []
        tqdm.write("Fitting buffer")
        for s in tqdm(range(self.parms.collect_interval)):

            # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
            observations, actions, rewards, nonterminals = self.D.sample(self.parms.batch_size, self.parms.chunk_size)  # Transitions start at time t = 0
            # Create initial belief and state for time t = 0
            init_belief, init_state = torch.zeros(self.parms.batch_size, self.parms.belief_size, device=self.parms.device), torch.zeros(self.parms.batch_size, self.parms.state_size, device=self.parms.device)
            encoded_obs = bottle(self.encoder, (observations[1:], ))

            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(init_state, actions[:-1], init_belief, encoded_obs, nonterminals[:-1])
            
            # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
            # LOSS
            observation_loss = F.mse_loss(bottle(self.observation_model, (beliefs, posterior_states)), observations[1:], reduction='none').sum((2, 3, 4)).mean(dim=(0, 1))
            kl_loss = torch.max(kl_divergence(Normal(posterior_means, posterior_std_devs), Normal(prior_means, prior_std_devs)).sum(dim=2), self.free_nats).mean(dim=(0, 1))  
            reward_loss = F.mse_loss(bottle(self.reward_model, (beliefs, posterior_states)), rewards[:-1], reduction='none').mean(dim=(0, 1))            

            # Update model parameters
            self.optimiser.zero_grad()

            (observation_loss + reward_loss + kl_loss).backward() # BACKPROPAGATION
            nn.utils.clip_grad_norm_(self.param_list, self.parms.grad_clip_norm, norm_type=2)
            self.optimiser.step()
            # Store (0) observation loss (1) reward loss (2) KL loss
            losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])#, regularizer_loss.item()])

        #save statistics and plot them
        losses = tuple(zip(*losses))  
        self.metrics['observation_loss'].append(losses[0])
        self.metrics['reward_loss'].append(losses[1])
        self.metrics['kl_loss'].append(losses[2])
      
        lineplot(self.metrics['episodes'][-len(self.metrics['observation_loss']):], self.metrics['observation_loss'], 'observation_loss', self.statistics_path)
        lineplot(self.metrics['episodes'][-len(self.metrics['reward_loss']):], self.metrics['reward_loss'], 'reward_loss', self.statistics_path)
        lineplot(self.metrics['episodes'][-len(self.metrics['kl_loss']):], self.metrics['kl_loss'], 'kl_loss', self.statistics_path)
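
`lineplot` is a plotting helper not included here; a hypothetical matplotlib stand-in with the same call signature (x values, a list of per-episode loss sequences, a title, an output directory) might look like the sketch below. The real helper may plot percentiles or use a different backend entirely.

import os
import matplotlib
matplotlib.use('Agg')  # assumption: plots are only written to disk, no display
import matplotlib.pyplot as plt

def lineplot(xs, ys_list, title, path):
    # ys_list holds one sequence of per-update losses for each recorded episode;
    # plot the mean of each sequence against the episode number.
    means = [sum(ys) / len(ys) for ys in ys_list]
    plt.figure()
    plt.plot(list(xs), means)
    plt.xlabel('episode')
    plt.ylabel(title)
    plt.savefig(os.path.join(path, title + '.png'))
    plt.close()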
Example #5
    def _compute_loss_world(self, state, data):
        # unpack data
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = state
        observations, rewards, nonterminals = data

        # observation_loss = F.mse_loss(
        #   bottle(self.observation_model, (beliefs, posterior_states)),
        #   observations[1:],
        #   reduction='none').sum(dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))
        #
        # reward_loss = F.mse_loss(
        #   bottle(self.reward_model, (beliefs, posterior_states)),
        #   rewards[1:],
        #   reduction='none').mean(dim=(0,1))

        observation_loss = F.mse_loss(
            bottle(self.observation_model, (beliefs, posterior_states)),
            observations,
            reduction='none').sum(
                dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))

        reward_loss = F.mse_loss(bottle(self.reward_model,
                                        (beliefs, posterior_states)),
                                 rewards,
                                 reduction='none').mean(dim=(0, 1))  # TODO: 5

        # transition loss
        kl_loss = torch.max(
            kl_divergence(
                Independent(Normal(posterior_means, posterior_std_devs), 1),
                Independent(Normal(prior_means, prior_std_devs), 1)),
            self.free_nats).mean(dim=(0, 1))

        # print("check the reward", bottle(pcont_model, (beliefs, posterior_states)).shape, nonterminals[:-1].shape)
        if self.args.pcont:
            pcont_loss = F.binary_cross_entropy(
                bottle(self.pcont_model, (beliefs, posterior_states)),
                nonterminals)
            # pcont_pred = torch.distributions.Bernoulli(logits=bottle(self.pcont_model, (beliefs, posterior_states)))
            # pcont_loss = -pcont_pred.log_prob(nonterminals[1:]).mean(dim=(0, 1))

        return observation_loss, self.args.reward_scale * reward_loss, kl_loss, (
            self.args.pcont_scale * pcont_loss if self.args.pcont else 0)
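
Because `pcont_loss` uses plain `F.binary_cross_entropy` (not the `_with_logits` variant) against the `nonterminals` targets, the `pcont_model` head must already output probabilities in (0, 1). A hypothetical model consistent with that usage (the class name and layer sizes are assumptions):

import torch
from torch import nn

class PcontModel(nn.Module):
    # Predicts the probability that the episode continues from (belief, state);
    # the final Sigmoid matches the plain binary_cross_entropy call above.
    def __init__(self, belief_size, state_size, hidden_size=200):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(belief_size + state_size, hidden_size), nn.ELU(),
            nn.Linear(hidden_size, hidden_size), nn.ELU(),
            nn.Linear(hidden_size, 1), nn.Sigmoid(),
        )

    def forward(self, belief, state):
        return self.net(torch.cat([belief, state], dim=-1)).squeeze(-1)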
Example #6
def update_belief_and_act(args,
                          env,
                          actor_model,
                          transition_model,
                          encoder,
                          belief,
                          posterior_state,
                          action,
                          observation,
                          deterministic=False):
    # Infer belief over current state q(s_t|o≤t,a<t) from the history
    belief, _, _, _, posterior_state, _, _ = transition_model(
        posterior_state, action.unsqueeze(dim=0), belief,
        encoder(observation).unsqueeze(
            dim=0))  # Action and observation need extra time dimension

    belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(
        dim=0)  # Remove time dimension from belief/state

    #
    # if explore:
    #   action = actor_model(belief, posterior_state).rsample()  # batch_shape=1, event_shape=6
    #   # add exploration noise -- following the original code: line 275-280
    #   action = Normal(action, args.expl_amount).rsample()
    #
    #   # TODO: add this later
    #   # action = torch.clamp(action, [-1.0, 0.0], [1.0, 5.0])
    # else:
    #   action = actor_model(belief, posterior_state).mode()
    action, _ = actor_model(
        belief,
        posterior_state,
        deterministic=deterministic,
        with_logprob=False
    )  # with SAC there is no need to add extra exploration noise; the max-entropy objective keeps the policy exploratory
    if args.temp == 0 and not deterministic:
        action = Normal(action, args.expl_amount).rsample()
    action[:, 1] = 0.3  # TODO: fix the speed
    next_observation, reward, done = env.step(
        action.cpu() if isinstance(env, EnvBatcher) else action[0].cpu(
        ))  # Perform environment step (action repeats handled internally)

    # debug print of the current value estimate; relies on a module-level value_model1
    print(
        bottle(value_model1, (belief.unsqueeze(dim=0),
                              posterior_state.unsqueeze(dim=0))).item())
    return belief, posterior_state, action, next_observation, reward, done
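
The `actor_model(belief, posterior_state, deterministic=..., with_logprob=...)` call returns an action and, optionally, its log-probability, which suggests a SAC-style squashed-Gaussian policy. A sketch under that assumption (layer sizes, clamping bounds, and the class name are guesses):

import math
import torch
import torch.nn.functional as F
from torch import nn
from torch.distributions import Normal

class SquashedGaussianActor(nn.Module):
    def __init__(self, belief_size, state_size, action_size, hidden_size=200):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Linear(belief_size + state_size, hidden_size), nn.ELU(),
            nn.Linear(hidden_size, hidden_size), nn.ELU(),
        )
        self.mu = nn.Linear(hidden_size, action_size)
        self.log_std = nn.Linear(hidden_size, action_size)

    def forward(self, belief, state, deterministic=False, with_logprob=True):
        h = self.trunk(torch.cat([belief, state], dim=-1))
        mu, log_std = self.mu(h), self.log_std(h).clamp(-5, 2)
        dist = Normal(mu, log_std.exp())
        raw = mu if deterministic else dist.rsample()
        action = torch.tanh(raw)  # squash into [-1, 1]
        logp = None
        if with_logprob:
            # log-prob with the tanh change-of-variables correction
            logp = dist.log_prob(raw).sum(dim=-1)
            logp = logp - (2 * (math.log(2) - raw - F.softplus(-2 * raw))).sum(dim=-1)
        return action, logp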
Example #7
def compute_curious_action_values(beliefs, states, means, std_devs, actions, onestep_models, curious_actor_model, curious_value_model, discount, disclam):
  # disclam: TD(lambda) mixing coefficient (the original body read an undefined self._c.disclam)
  intrinsic_reward = compute_intrinsic_reward(beliefs, actions, onestep_models)
  reward = intrinsic_reward
  # reward -= compute_action_divergence(beliefs, states, curious_actor)
  # reward -= compute_state_divergence(means, std_devs)
  pcont = torch.ones_like(reward)
  pcont *= discount
  value = Normal(bottle(curious_value_model, (beliefs, states)), 1).mean  # .mean is a property, not a method

  reward = reward[:, :-1]
  value = value[:, :-1]
  pcont = pcont[:, :-1]
  bootstrap = value[:, -1]

  return_ = lambda_return(
      reward, value, pcont, bootstrap,
      lambda_=disclam, axis=1)

  # discount-weight the imagined trajectory, stopping gradients through the weights
  # (torch equivalents of the TensorFlow ops in the original)
  weight = torch.cumprod(
      torch.cat([torch.ones_like(pcont[:, :1]), pcont[:, :-1]], dim=1), dim=1).detach()
  return_ = return_ * weight

  return return_
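
`compute_intrinsic_reward` is not shown; in Plan2Explore-style curiosity it is usually the disagreement (variance) across an ensemble of one-step prediction models. A rough sketch, assuming each one-step model maps a concatenated, time-aligned (belief, action) input to a predicted feature vector:

import torch

def compute_intrinsic_reward(beliefs, actions, onestep_models):
    # Stack the ensemble predictions and reward the variance across ensemble
    # members, averaged over the feature dimension.
    inputs = torch.cat([beliefs, actions], dim=-1)
    preds = torch.stack([model(inputs) for model in onestep_models], dim=0)
    return preds.var(dim=0).mean(dim=-1)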
Example #8
 for s in tqdm(range(args.collect_interval)):
     # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
     observations, actions, rewards, nonterminals = D.sample(
         args.batch_size,
         args.chunk_size)  # Transitions start at time t = 0
     # Create initial belief and state for time t = 0
     init_belief, init_state = torch.zeros(args.batch_size,
                                           args.belief_size,
                                           device=args.device), torch.zeros(
                                               args.batch_size,
                                               args.state_size,
                                               device=args.device)
     # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
     beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
         init_state, actions[:-1], init_belief,
         bottle(encoder, (observations[1:], )), nonterminals[:-1])
     # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
     if args.worldmodel_LogProbLoss:
         observation_dist = Normal(
             bottle(observation_model, (beliefs, posterior_states)), 1)
         observation_loss = -observation_dist.log_prob(
             observations[1:]).sum(
                 dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
     else:
         observation_loss = F.mse_loss(
             bottle(observation_model, (beliefs, posterior_states)),
             observations[1:],
             reduction='none').sum(
                 dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
     if args.worldmodel_LogProbLoss:
         reward_dist = Normal(
Example #9
 for s in tqdm(range(args.collect_interval)):
     # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
     observations, actions, rewards, nonterminals = D.sample(
         args.batch_size,
         args.chunk_size)  # Transitions start at time t = 0
     # Create initial belief and state for time t = 0
     init_belief, init_state = torch.zeros(args.batch_size,
                                           args.belief_size,
                                           device=args.device), torch.zeros(
                                               args.batch_size,
                                               args.state_size,
                                               device=args.device)
     # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
     beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
         init_state, actions[:-1], init_belief,
         bottle(encoder, (observations[1:], )), nonterminals[:-1])
     #print("******************")
     #print(beliefs.shape)
     #print(prior_states.shape)
     #print(prior_means.shape)
     #print(prior_std_devs.shape)
     #print(actions.shape)
     #print("******************")
     # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
     observation_loss = F.mse_loss(
         bottle(observation_model, (beliefs, posterior_states)),
         observations[1:],
         reduction='none').sum(
             dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
     reward_loss = F.mse_loss(bottle(reward_model,
                                     (beliefs, posterior_states)),
Example #10
def to_image(obs):
  return torch.nn.functional.interpolate(obs.view(args.test_episodes,1,20,10),scale_factor=5)
# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1), total=args.episodes, initial=metrics['episodes'][-1] + 1):
  # Model fitting
  losses = []
  model_modules = transition_model.modules+encoder.modules+observation_model.modules+reward_model.modules

  print("training loop")
  for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(args.batch_size, args.chunk_size) # Transitions start at time t = 0
    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), torch.zeros(args.batch_size, args.state_size, device=args.device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(init_state, actions[:-1], init_belief, bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    if args.worldmodel_MSEloss:
      observation_loss = F.mse_loss(bottle(observation_model, (beliefs, posterior_states)), observations[1:], reduction='none').sum(dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    else:
      observation_dist = Normal(bottle(observation_model, (beliefs, posterior_states)), 1)
      observation_loss = -observation_dist.log_prob(observations[1:]).sum(dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    if args.algo == "p2e":
      if args.zero_shot:
        reward_dist = Normal(bottle(reward_model, (beliefs.detach(), posterior_states)),1)
      else:
        if metrics['steps'][-1]*args.action_repeat > args.adaptation_step:
          reward_dist = Normal(bottle(reward_model, (beliefs, posterior_states)),1)
        else:
          reward_dist = Normal(bottle(reward_model, (beliefs.detach(), posterior_states)),1)
      reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
Example #11
    def train_algorithm(self, actor_states, actor_beliefs):

        [
            self.actor_pipes[i][0].send(1)
            for i, w in enumerate(self.workers_actor)
        ]  # the parent end of the i'th pipe signals its worker to start
        [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)
         ]  # wait for the child processes to finish

        with FreezeParameters(self.model_modules):
            imagination_traj = self.imagine_merge_ahead(
                prev_state=actor_states,
                prev_belief=actor_beliefs,
                policy_pool=self.actor_pool,
                transition_model=self.transition_model,
                merge_model=self.merge_actor_model)
        imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

        with FreezeParameters(self.model_modules +
                              self.merge_value_model_modules):
            imged_reward = bottle(self.reward_model,
                                  (imged_beliefs, imged_prior_states))
            value_pred = bottle(self.merge_value_model,
                                (imged_beliefs, imged_prior_states))

        with FreezeParameters(self.actor_pool_modules):
            returns = lambda_return(imged_reward,
                                    value_pred,
                                    bootstrap=value_pred[-1],
                                    discount=args.discount,
                                    lambda_=args.disclam)
            merge_actor_loss = -torch.mean(returns)
            # Update model parameters
            self.merge_actor_optimizer.zero_grad()
            merge_actor_loss.backward()
            nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(),
                                     args.grad_clip_norm,
                                     norm_type=2)
            self.merge_actor_optimizer.step()

        # Dreamer implementation: value loss calculation and optimization
        with torch.no_grad():
            value_beliefs = imged_beliefs.detach()
            value_prior_states = imged_prior_states.detach()
            target_return = returns.detach()

        value_dist = Normal(
            bottle(self.merge_value_model,
                   (value_beliefs, value_prior_states)),
            1)  # detach the input tensor from the transition network.
        merge_value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
        # Update model parameters
        self.merge_value_optimizer.zero_grad()
        merge_value_loss.backward()
        nn.utils.clip_grad_norm_(self.merge_value_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.merge_value_optimizer.step()

        self.merge_losses.append(
            [merge_actor_loss.item(),
             merge_value_loss.item()])
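
`FreezeParameters` keeps world-model (and value-model) gradients from flowing through the imagination rollout. A minimal sketch of such a context manager, assuming it takes a list of modules and restores the original `requires_grad` flags on exit:

from typing import Iterable, List
from torch import nn

class FreezeParameters:
    def __init__(self, modules: Iterable[nn.Module]):
        self.params: List[nn.Parameter] = [p for m in modules for p in m.parameters()]
        self.saved: List[bool] = []

    def __enter__(self):
        self.saved = [p.requires_grad for p in self.params]
        for p in self.params:
            p.requires_grad = False

    def __exit__(self, exc_type, exc_value, traceback):
        for p, flag in zip(self.params, self.saved):
            p.requires_grad = flag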
Example #12
    def train(self):
        # Model fitting
        losses = []
        print("training loop")
        # args.collect_interval = 1
        for s in tqdm(range(args.collect_interval)):

            # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
            observations, actions, rewards, nonterminals = self.D.sample(
                args.batch_size,
                args.chunk_size)  # Transitions start at time t = 0
            # Create initial belief and state for time t = 0
            init_belief, init_state = torch.zeros(
                args.batch_size, args.belief_size,
                device=args.device), torch.zeros(args.batch_size,
                                                 args.state_size,
                                                 device=args.device)
            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            obs = bottle(self.encoder, (observations[1:], ))
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.upper_transition_model(
                prev_state=init_state,
                actions=actions[:-1],
                prev_belief=init_belief,
                obs=obs,
                nonterminals=nonterminals[:-1])

            # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
            observation_loss, reward_loss, kl_loss = self.train_env_model(
                beliefs, prior_states, prior_means, prior_std_devs,
                posterior_states, posterior_means, posterior_std_devs,
                observations, actions, rewards, nonterminals)

            # Dreamer implementation: actor loss calculation and optimization
            with torch.no_grad():
                actor_states = posterior_states.detach().to(
                    device=args.device).share_memory_()
                actor_beliefs = beliefs.detach().to(
                    device=args.device).share_memory_()

            # if not os.path.exists(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir)): os.mkdir(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir))
            torch.save(
                actor_states,
                os.path.join(os.getcwd(),
                             args.results_dir + '/actor_states.pt'))
            torch.save(
                actor_beliefs,
                os.path.join(os.getcwd(),
                             args.results_dir + '/actor_beliefs.pt'))

            # [self.actor_pipes[i][0].send(1) for i, w in enumerate(self.workers_actor)]  # Parent_pipe send data using i'th pipes
            # [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)]  # waitting the children finish

            self.algorithms.train_algorithm(actor_states, actor_beliefs)
            losses.append(
                [observation_loss.item(),
                 reward_loss.item(),
                 kl_loss.item()])

            # if self.algorithms.train_algorithm(actor_states, actor_beliefs) is not None:
            #   merge_actor_loss, merge_value_loss = self.algorithms.train_algorithm(actor_states, actor_beliefs)
            #   losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item(), merge_actor_loss.item(), merge_value_loss.item()])
            # else:
            #   losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])

        return losses
Example #13
  env.close()
  quit()

print('Training Data')
# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1), total=args.episodes, initial=metrics['episodes'][-1] + 1):
  # Model fitting
  losses = []
  for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(args.batch_size, args.chunk_size)  # Transitions start at time t = 0

    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), torch.zeros(args.batch_size, args.state_size, device=args.device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(init_state, actions[:-1], init_belief, bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)

    observation_loss = F.mse_loss(bottle(observation_model, (beliefs, posterior_states)), observations[1:], reduction='none').sum(dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    reward_loss = F.mse_loss(bottle(reward_model, (beliefs, posterior_states)), rewards[:-1], reduction='none').mean(dim=(0, 1))

    kl_loss = torch.max(kl_divergence(Normal(posterior_means, posterior_std_devs), Normal(prior_means, prior_std_devs)).sum(dim=2), free_nats).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
    # print (type(beliefs))
    if args.global_kl_beta != 0:
      kl_loss += args.global_kl_beta * kl_divergence(Normal(posterior_means, posterior_std_devs), global_prior).sum(dim=2).mean(dim=(0, 1))
    # Calculate latent overshooting objective for t > 0
    if args.overshooting_kl_beta != 0:
      overshooting_vars = []  # Collect variables for overshooting to process in batch
      for t in range(1, args.chunk_size - 1):
        d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
        t_, d_ = t - 1, d - 1  # Use t_ and d_ to deal with different time indexing for latent states
Example #14
    def update_parameters(self, data, gradient_steps):
        loss_info = []  # used to record loss
        for s in tqdm(range(gradient_steps)):
            # get state and belief of samples
            observations, actions, rewards, nonterminals = data

            init_belief = torch.zeros(self.args.batch_size,
                                      self.args.belief_size,
                                      device=self.args.device)
            init_state = torch.zeros(self.args.batch_size,
                                     self.args.state_size,
                                     device=self.args.device)

            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
                init_state, actions, init_belief,
                bottle(self.encoder, (observations, )),
                nonterminals)  # TODO: 4

            # update paras of world model
            world_model_loss = self._compute_loss_world(
                state=(beliefs, prior_states, prior_means, prior_std_devs,
                       posterior_states, posterior_means, posterior_std_devs),
                data=(observations, rewards, nonterminals))
            observation_loss, reward_loss, kl_loss, pcont_loss = world_model_loss
            self.world_optimizer.zero_grad()
            (observation_loss + reward_loss + kl_loss + pcont_loss).backward()
            nn.utils.clip_grad_norm_(self.world_param,
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.world_optimizer.step()

            # freeze params to save memory
            for p in self.world_param:
                p.requires_grad = False
            for p in self.value_model.parameters():
                p.requires_grad = False

            # latent imagination
            imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination(
                beliefs, posterior_states, with_logprob=self.args.with_logprob)

            # update actor
            actor_loss = self._compute_loss_actor(imag_beliefs,
                                                  imag_states,
                                                  imag_ac_logps=imag_ac_logps)

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.actor_optimizer.step()

            for p in self.world_param:
                p.requires_grad = True
            for p in self.value_model.parameters():
                p.requires_grad = True

            # update critic
            imag_beliefs = imag_beliefs.detach()
            imag_states = imag_states.detach()

            critic_loss = self._compute_loss_critic(
                imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)

            self.value_optimizer.zero_grad()
            critic_loss.backward()
            nn.utils.clip_grad_norm_(self.value_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.value_optimizer.step()

            loss_info.append([
                observation_loss.item(),
                reward_loss.item(),
                kl_loss.item(),
                pcont_loss.item() if self.args.pcont else 0,
                actor_loss.item(),
                critic_loss.item()
            ])

        # finally, update target value function every #gradient_steps
        with torch.no_grad():
            self.target_value_model.load_state_dict(
                self.value_model.state_dict())

        return loss_info
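
`_latent_imagination` is not included in this listing. A rough sketch under assumptions: the rollout starts from every flattened (chunk, batch) posterior, runs for `args.planning_horizon` steps, and the transition model returns (beliefs, prior_states, prior_means, prior_std_devs) when called without observations, as in PlaNet-style implementations.

import torch

def _latent_imagination(self, beliefs, posterior_states, with_logprob=False):
    # Flatten the (chunk, batch) posterior into one batch of starting points,
    # then roll the prior forward, sampling actions from the current actor.
    belief = beliefs.detach().reshape(-1, beliefs.shape[-1])
    state = posterior_states.detach().reshape(-1, posterior_states.shape[-1])

    imag_beliefs, imag_states, imag_ac_logps = [belief], [state], []
    for _ in range(self.args.planning_horizon):
        action, logp = self.actor_model(belief, state, with_logprob=with_logprob)
        # single-step prior rollout (no observation, so only prior tensors come back)
        belief, state, _, _ = self.transition_model(state, action.unsqueeze(0), belief)
        belief, state = belief.squeeze(0), state.squeeze(0)
        imag_beliefs.append(belief)
        imag_states.append(state)
        if with_logprob:
            imag_ac_logps.append(logp)

    imag_beliefs = torch.stack(imag_beliefs, dim=0)
    imag_states = torch.stack(imag_states, dim=0)
    imag_ac_logps = torch.stack(imag_ac_logps, dim=0) if with_logprob else None
    return imag_beliefs, imag_states, imag_ac_logps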
Example #15
    def train_env_model(self, beliefs, prior_states, prior_means,
                        prior_std_devs, posterior_states, posterior_means,
                        posterior_std_devs, observations, actions, rewards,
                        nonterminals):
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        if args.worldmodel_LogProbLoss:
            observation_dist = Normal(
                bottle(self.observation_model, (beliefs, posterior_states)), 1)
            observation_loss = -observation_dist.log_prob(
                observations[1:]).sum(
                    dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        else:
            observation_loss = F.mse_loss(
                bottle(self.observation_model, (beliefs, posterior_states)),
                observations[1:],
                reduction='none').sum(
                    dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        if args.worldmodel_LogProbLoss:
            reward_dist = Normal(
                bottle(self.reward_model, (beliefs, posterior_states)), 1)
            reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
        else:
            reward_loss = F.mse_loss(bottle(self.reward_model,
                                            (beliefs, posterior_states)),
                                     rewards[:-1],
                                     reduction='none').mean(dim=(0, 1))

        # transition loss
        div = kl_divergence(Normal(posterior_means, posterior_std_devs),
                            Normal(prior_means, prior_std_devs)).sum(dim=2)
        kl_loss = torch.max(div, self.free_nats).mean(
            dim=(0, 1)
        )  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
        if args.global_kl_beta != 0:
            kl_loss += args.global_kl_beta * kl_divergence(
                Normal(posterior_means, posterior_std_devs),
                self.global_prior).sum(dim=2).mean(dim=(0, 1))
        # Calculate latent overshooting objective for t > 0
        if args.overshooting_kl_beta != 0:
            overshooting_vars = [
            ]  # Collect variables for overshooting to process in batch
            for t in range(1, args.chunk_size - 1):
                d = min(t + args.overshooting_distance,
                        args.chunk_size - 1)  # Overshooting distance
                t_, d_ = t - 1, d - 1  # Use t_ and d_ to deal with different time indexing for latent states
                seq_pad = (
                    0, 0, 0, 0, 0, t - d + args.overshooting_distance
                )  # Calculate sequence padding so overshooting terms can be calculated in one batch
                # Store (0) actions, (1) nonterminals, (2) rewards, (3) beliefs, (4) prior states, (5) posterior means, (6) posterior standard deviations and (7) sequence masks
                overshooting_vars.append(
                    (F.pad(actions[t:d],
                           seq_pad), F.pad(nonterminals[t:d], seq_pad),
                     F.pad(rewards[t:d],
                           seq_pad[2:]), beliefs[t_], prior_states[t_],
                     F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                     F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(),
                           seq_pad,
                           value=1),
                     F.pad(
                         torch.ones(d - t,
                                    args.batch_size,
                                    args.state_size,
                                    device=args.device), seq_pad))
                )  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences
            overshooting_vars = tuple(zip(*overshooting_vars))
            # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
            beliefs, prior_states, prior_means, prior_std_devs = self.upper_transition_model(
                torch.cat(overshooting_vars[4], dim=0),
                torch.cat(overshooting_vars[0], dim=1),
                torch.cat(overshooting_vars[3], dim=0), None,
                torch.cat(overshooting_vars[1], dim=1))
            seq_mask = torch.cat(overshooting_vars[7], dim=1)
            # Calculate overshooting KL loss with sequence mask
            kl_loss += (
                1 / args.overshooting_distance
            ) * args.overshooting_kl_beta * torch.max((kl_divergence(
                Normal(torch.cat(overshooting_vars[5], dim=1),
                       torch.cat(overshooting_vars[6], dim=1)),
                Normal(prior_means, prior_std_devs)
            ) * seq_mask).sum(dim=2), self.free_nats).mean(dim=(0, 1)) * (
                args.chunk_size
                - 1
            )  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)
            # Calculate overshooting reward prediction loss with sequence mask
            if args.overshooting_reward_scale != 0:
                reward_loss += (
                    1 / args.overshooting_distance
                ) * args.overshooting_reward_scale * F.mse_loss(
                    bottle(self.reward_model,
                           (beliefs, prior_states)) * seq_mask[:, :, 0],
                    torch.cat(overshooting_vars[2], dim=1),
                    reduction='none'
                ).mean(dim=(0, 1)) * (
                    args.chunk_size - 1
                )  # Update reward loss (compensating for extra average over each overshooting/open loop sequence)
        # Apply linearly ramping learning rate schedule
        if args.learning_rate_schedule != 0:
            for group in self.model_optimizer.param_groups:
                group['lr'] = min(
                    group['lr'] + args.model_learning_rate /
                    args.model_learning_rate_schedule,
                    args.model_learning_rate)
        model_loss = observation_loss + reward_loss + kl_loss
        # Update model parameters
        self.model_optimizer.zero_grad()
        model_loss.backward()
        nn.utils.clip_grad_norm_(self.param_list,
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.model_optimizer.step()
        return observation_loss, reward_loss, kl_loss
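
The `seq_pad` tuple above follows `F.pad`'s convention of (left, right) padding pairs starting from the last dimension, so `(0, 0, 0, 0, 0, k)` leaves the feature and batch dimensions untouched and appends `k` zero-filled steps at the end of the time dimension. A quick illustration:

import torch
import torch.nn.functional as F

x = torch.ones(3, 4, 5)           # (time, batch, features)
y = F.pad(x, (0, 0, 0, 0, 0, 2))  # -> (5, 4, 5); the last two time steps are zeros
assert y.shape == (5, 4, 5) and y[3:].abs().sum() == 0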
Example #16
def train(args: argparse.Namespace,
          env: Env,
          D: ExperienceReplay,
          models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module],
          optimizer: Tuple[optim.Optimizer, optim.Optimizer],
          param_list: List[nn.parameter.Parameter],
          planner: nn.Module):
    # auxiliary tensors
    global_prior = Normal(
        torch.zeros(args.batch_size, args.state_size, device=args.device),
        torch.ones(args.batch_size, args.state_size, device=args.device)
    )  # Global prior N(0, I)
    # Allowed deviation in KL divergence
    free_nats = torch.full((1, ), args.free_nats, dtype=torch.float32, device=args.device)
    summary_writter = SummaryWriter(args.tensorboard_dir)

    # unpack models
    transition_model, observation_model, reward_model, encoder = models
    transition_optimizer, reward_optimizer = optimizer

    for idx_episode in trange(args.episodes, leave=False, desc="Episode"):
        for idx_train in trange(args.collect_interval, leave=False, desc="Training"):
            # Draw sequence chunks {(o[t], a[t], r[t+1], z[t+1])} ~ D uniformly at random from the dataset
            # The first two dimensions of the tensors are L (chunk size) and n (batch size)
            # We want to use o[t+1] to correct the error of the transition model,
            # so we need to convert the sequence to {(o[t+1], a[t], r[t+1], z[t+1])}
            observations, actions, rewards_dist, rewards_coll, nonterminals = D.sample(args.batch_size, args.chunk_size)
            # Create initial belief and state for time t = 0
            init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
            init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
            # Transition model forward
            # deterministic: h[t+1] = f(h[t], a[t])
            # prior:         s[t+1] ~ Prob(s|h[t+1])
            # posterior:     s[t+1] ~ Prob(s|h[t+1], o[t+1])
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
                init_state,
                actions[:-1],
                init_belief,
                bottle(encoder, (observations[1:], )),
                nonterminals[:-1]
            )

            # observation loss
            predictions = bottle(observation_model, (beliefs, posterior_states))
            visual_loss = F.mse_loss(
                predictions[:, :, :3*64*64],
                observations[1:, :, :3*64*64]
            ).mean()
            symbol_loss = F.mse_loss(
                predictions[:, :, 3*64*64:],
                observations[1:, :, 3*64*64:]
            ).mean()
            observation_loss = visual_loss + symbol_loss

            # KL divergence loss. Minimize the difference between posterior and prior
            kl_loss = torch.max(
                kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    Normal(prior_means, prior_std_devs)
                ).sum(dim=2),
                free_nats
            ).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
            if args.global_kl_beta != 0:
                kl_loss += args.global_kl_beta * kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    global_prior
                ).sum(dim=2).mean(dim=(0, 1))

            # overshooting loss
            if args.overshooting_kl_beta != 0:
                overshooting_vars = []  # Collect variables for overshooting to process in batch
                for t in range(1, args.chunk_size - 1):
                    d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
                    # Use t_ and d_ to deal with different time indexing for latent states
                    t_, d_ = t - 1, d - 1
                    # Calculate sequence padding so overshooting terms can be calculated in one batch
                    seq_pad = (0, 0, 0, 0, 0, t - d + args.overshooting_distance)
                    # Store
                    # * a[t:d],
                    # * z[t+1:d+1]
                    # * r[t+1:d+1]
                    # * h[t]
                    # * s[t] prior
                    # * E[s[t:d]] posterior
                    # * Var[s[t:d]] posterior
                    # * mask:
                    #       the last few sequences do not have enough length,
                    #       so we pad it with 0 to the same length as previous sequence for batch operation,
                    #       and use mask to indicate invalid variables.
                    overshooting_vars.append(
                        (F.pad(actions[t:d], seq_pad),
                         F.pad(nonterminals[t:d], seq_pad),
                         F.pad(rewards_dist[t:d], seq_pad[2:]),
                         beliefs[t_],
                         prior_states[t_],
                         F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                         F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1),
                         F.pad(torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad)
                         )
                    )  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences

                overshooting_vars = tuple(zip(*overshooting_vars))
                # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
                beliefs, prior_states, prior_means, prior_std_devs = transition_model(
                    torch.cat(overshooting_vars[4], dim=0),
                    torch.cat(overshooting_vars[0], dim=1),
                    torch.cat(overshooting_vars[3], dim=0),
                    None,
                    torch.cat(overshooting_vars[1], dim=1)
                )
                seq_mask = torch.cat(overshooting_vars[7], dim=1)
                # Calculate overshooting KL loss with sequence mask
                kl_loss += (1 / args.overshooting_distance) * args.overshooting_kl_beta * torch.max(
                    (kl_divergence(
                        Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)),
                        Normal(prior_means, prior_std_devs)
                    ) * seq_mask).sum(dim=2),
                    free_nats
                ).mean(dim=(0, 1)) * (args.chunk_size - 1)  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)

            # TODO: add learning rate schedule
            # Update model parameters
            transition_optimizer.zero_grad()
            loss = observation_loss * 200 + kl_loss
            loss.backward()
            nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2)
            transition_optimizer.step()

            # reward loss
            rewards_dist_predict, rewards_coll_predict = bottle(reward_model.raw, (beliefs.detach(), posterior_states.detach()))
            reward_loss = F.mse_loss(
                rewards_dist_predict,
                rewards_dist[:-1],
                reduction='mean'
            ) + F.binary_cross_entropy(
                rewards_coll_predict,
                rewards_coll[:-1],
                reduction='mean'
            )
            reward_optimizer.zero_grad()
            reward_loss.backward()
            reward_optimizer.step()

            # add tensorboard log
            global_step = idx_train + idx_episode * args.collect_interval
            summary_writter.add_scalar("observation_loss", observation_loss, global_step)
            summary_writter.add_scalar("reward_loss", reward_loss, global_step)
            summary_writter.add_scalar("kl_loss", kl_loss, global_step)

        for idx_collect in trange(1, leave=False, desc="Collecting"):
            experience = collect_experience(args, env, models, planner, True, desc="Collecting experience {}".format(idx_collect))
            T = len(experience["observation"])
            for idx_step in range(T):
                D.append(experience["observation"][idx_step],
                         experience["action"][idx_step],
                         experience["reward_dist"][idx_step],
                         experience["reward_coll"][idx_step],
                         experience["done"][idx_step])

        # Checkpoint models
        if (idx_episode + 1) % args.checkpoint_interval == 0:
            record_path = os.path.join(args.checkpoint_dir, "checkpoint")
            checkpoint_path = os.path.join(args.checkpoint_dir, 'models_%d.pth' % (idx_episode+1))
            torch.save(
                {
                    'transition_model': transition_model.state_dict(),
                    'observation_model': observation_model.state_dict(),
                    'reward_model': reward_model.state_dict(),
                    'encoder': encoder.state_dict(),
                    'transition_optimizer': transition_optimizer.state_dict(),
                    'reward_optimizer': reward_optimizer.state_dict()
                },
                checkpoint_path)
            with open(record_path, "w") as f:
                f.write('models_%d.pth' % (idx_episode+1))
            planner.save(os.path.join(args.torchscript_dir, "mpc_planner.pth"))
            transition_model.save(os.path.join(args.torchscript_dir, "transition_model.pth"))
            reward_model.save(os.path.join(args.torchscript_dir, "reward_model.pth"))
            observation_model.save(os.path.join(args.torchscript_dir, "observation_decoder.pth"))
            encoder.save(os.path.join(args.torchscript_dir, "observation_encoder.pth"))

    summary_writter.close()
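
A hypothetical loader that mirrors the checkpoint dictionary saved above (the function name and argument order are not from the original code):

import torch

def load_checkpoint(checkpoint_path, transition_model, observation_model, reward_model,
                    encoder, transition_optimizer, reward_optimizer, device='cpu'):
    # Restore every state_dict the training loop saves under the same keys.
    ckpt = torch.load(checkpoint_path, map_location=device)
    transition_model.load_state_dict(ckpt['transition_model'])
    observation_model.load_state_dict(ckpt['observation_model'])
    reward_model.load_state_dict(ckpt['reward_model'])
    encoder.load_state_dict(ckpt['encoder'])
    transition_optimizer.load_state_dict(ckpt['transition_optimizer'])
    reward_optimizer.load_state_dict(ckpt['reward_optimizer'])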
Example #17
        observations, actions, rewards, nonterminals = D.sample(
            args.batch_size,
            args.chunk_size)  # Transitions start at time t = 0
        # print("data shape check", observations.shape, actions.shape, rewards.shape, nonterminals.shape)
        """world model update"""
        init_belief = torch.zeros(args.batch_size,
                                  args.belief_size,
                                  device=args.device)
        init_state = torch.zeros(args.batch_size,
                                 args.state_size,
                                 device=args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)

        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
            init_state, actions[:-1], init_belief,
            bottle(encoder, (observations[1:], )), nonterminals[:-1])

        observation_loss = F.mse_loss(
            bottle(observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(dim=2 if args.symbolic else (2, 3, 4)).mean(
                dim=(0, 1))

        reward_loss = F.mse_loss(bottle(reward_model,
                                        (beliefs, posterior_states)),
                                 rewards[1:],
                                 reduction='none').mean(dim=(0, 1))

        # transition loss
        kl_loss = torch.max(
            kl_divergence(
Example #18
 for s in tqdm(range(args.collect_interval)):
     # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
     observations, actions, rewards, nonterminals = D.sample(
         args.batch_size,
         args.chunk_size)  # Transitions start at time t = 0
     # Create initial belief and state for time t = 0
     init_belief, init_state = torch.zeros(args.batch_size,
                                           args.belief_size,
                                           device=args.device), torch.zeros(
                                               args.batch_size,
                                               args.state_size,
                                               device=args.device)
     # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
     beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
         init_state, actions[:-1], init_belief,
         bottle(encoder, (observations[1:], )), nonterminals[:-1])
     # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
     if args.worldmodel_MSEloss:
         observation_loss = F.mse_loss(
             bottle(observation_model, (beliefs, posterior_states)),
             observations[1:],
             reduction='none').sum(
                 dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
     else:
         observation_dist = Normal(
             bottle(observation_model, (beliefs, posterior_states)), 1)
         observation_loss = -observation_dist.log_prob(
             observations[1:]).sum(
                 dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
     if args.algo == "p2e":
         if args.zero_shot:
Example #19
    def run(self) -> None:
        # print("children process {} waiting to get data".format(self.process_id))
        # Run = self.child_conn.recv()
        # print("children process {} Geted data form parent".format(self.process_id))
        actor_loss, value_loss = None, None
        while self.child_conn.recv() == 1:
            # print("Start Multi actor-critic Processing, The Process ID is {} -------------------------------".format(self.process_id))
            for _ in range(args.sub_traintime):
                with FreezeParameters(self.env_model_modules):
                    actor_states = torch.load(
                        os.path.join(os.getcwd(),
                                     self.results_dir + '/actor_states.pt'))
                    actor_beliefs = torch.load(
                        os.path.join(os.getcwd(),
                                     self.results_dir + '/actor_beliefs.pt'))
                    actor_states = actor_states.cuda(
                    ) if torch.cuda.is_available(
                    ) and not args.disable_cuda else actor_states.cpu()
                    actor_beliefs = actor_beliefs.cuda(
                    ) if torch.cuda.is_available(
                    ) and not args.disable_cuda else actor_beliefs.cpu()

                    imagination_traj = imagine_ahead(
                        actor_states,
                        actor_beliefs,
                        self.actor_l,
                        self.transition_model,
                        args.planning_horizon,
                        action_scale=self.process_id)

                imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

                # Update model parameters
                with FreezeParameters(self.env_model_modules +
                                      self.value_model_l_modules):
                    imged_reward = bottle(self.reward_model,
                                          (imged_beliefs, imged_prior_states))
                    value_pred = bottle(self.value_l,
                                        (imged_beliefs, imged_prior_states))

                returns = lambda_return(imged_reward,
                                        value_pred,
                                        bootstrap=value_pred[-1],
                                        discount=args.discount,
                                        lambda_=args.disclam)
                actor_loss = -torch.mean(returns)

                # calculate local gradients and push local parameters to global
                self.actor_optimizer_l.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_l.parameters(),
                                         args.grad_clip_norm,
                                         norm_type=2)
                # for la, ga in zip(self.actor_l.parameters(), self.actor_g.parameters()):
                #     ga._grad = la.grad
                self.actor_optimizer_l.step()

                # push global parameters
                # self.actor_l.load_state_dict(self.actor_g.state_dict())

                # Dreamer implementation: value loss calculation and optimization
                with torch.no_grad():
                    value_beliefs = imged_beliefs.detach()
                    value_prior_states = imged_prior_states.detach()
                    target_return = returns.detach()

                value_dist = Normal(
                    bottle(self.value_l, (value_beliefs, value_prior_states)),
                    1)  # detach the input tensor from the transition network.
                value_loss = -value_dist.log_prob(target_return).mean(dim=(0,
                                                                           1))
                # Update model parameters
                self.value_optimizer_l.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.value_l.parameters(),
                                         args.grad_clip_norm,
                                         norm_type=2)
                self.value_optimizer_l.step()

            # save the loss data
            self.losses.append([actor_loss.item(), value_loss.item()])
            if self.count == args.collect_interval - 1:
                losses = tuple(zip(*self.losses))
                self.metrics['actor_loss'].append(losses[0])
                self.metrics['value_loss'].append(losses[1])
                Save_Txt(self.metrics['episodes'][-1],
                         self.metrics['actor_loss'][-1],
                         'actor_loss' + str(self.process_id), self.results_dir)
                Save_Txt(self.metrics['episodes'][-1],
                         self.metrics['value_loss'][-1],
                         'value_loss' + str(self.process_id), self.results_dir)
                self.count = 0
                self.losses = []
                self.metrics['episodes'].append(self.metrics['episodes'][-1] +
                                                1)
            self.count += 1

            # print("End Multi actor-critic Processing, The Process ID is {} -------------------------------".format(self.process_id))

            self.child_conn.send(1)
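
`Save_Txt` is a small logging helper not shown here; a hypothetical stand-in consistent with the calls above (episode number, a value or sequence of values, a metric name, and an output directory), appending one line per call:

import os

def Save_Txt(episode, values, name, results_dir):
    # Append "episode v1 v2 ..." to <results_dir>/<name>.txt.
    if not isinstance(values, (list, tuple)):
        values = [values]
    with open(os.path.join(results_dir, name + '.txt'), 'a') as f:
        f.write(str(episode) + ' ' + ' '.join(str(v) for v in values) + '\n')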
Example #20
                 total=args.episodes,
                 initial=metrics['episodes'][-1] + 1):
 # Model fitting
 losses = []
 for s in tqdm(range(args.collect_interval)):
     # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
     observations, actions, rewards, nonterminals = D.sample(
         args.batch_size,
         args.chunk_size)  # Transitions start at time t = 0
     # Create initial belief and state for time t = 0
     init_state = torch.zeros(args.batch_size,
                              args.state_size,
                              device=args.device)
     # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
     prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
         init_state, actions[:-1], bottle(encoder, (observations[1:], )))
     # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
     observation_loss = F.mse_loss(
         bottle(observation_model, (posterior_states, )),
         observations[1:],
         reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
     kl_loss = torch.max(
         kl_divergence(Normal(posterior_means, posterior_std_devs),
                       Normal(prior_means,
                              prior_std_devs)).sum(dim=2), free_nats
     ).mean(
         dim=(0, 1)
     )  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
     if args.global_kl_beta != 0:
         kl_loss += args.global_kl_beta * kl_divergence(
             Normal(posterior_means, posterior_std_devs),
Example #21
 for s in tqdm(range(args.collect_interval)):
     # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
     observations, actions, rewards, nonterminals = D.sample(
         args.batch_size,
         args.chunk_size)  # Transitions start at time t = 0
     # Create initial belief and state for time t = 0
     init_belief, init_state = torch.zeros(args.batch_size,
                                           args.belief_size,
                                           device=device), torch.zeros(
                                               args.batch_size,
                                               args.state_size,
                                               device=device)
     # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
     beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
         init_state, actions[:-1], init_belief,
         bottle(encoder, (observations[1:], )), nonterminals[:-1])
     # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
     if args.worldmodel_LogProbLoss:
         observation_dist = Normal(
             bottle(observation_model, (beliefs, posterior_states)), 1)
         observation_loss = -observation_dist.log_prob(
             observations[1:]).sum(dim=(2, 3, 4)).mean(dim=(0, 1))
     else:
         observation_loss = F.mse_loss(
             bottle(observation_model, (beliefs, posterior_states)),
             observations[1:],
             reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
     if args.worldmodel_LogProbLoss:
         reward_dist = Normal(
             bottle(reward_model, (beliefs, posterior_states)), 1)
         reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
Example #22
 # we also sample obs_aug
 observations, actions, rewards, nonterminals, observations_aug0, observations_aug1 = D.sample(
     args.batch_size, args.chunk_size)  # Transitions start at time t = 0
 # combine two obs_aug in as batches
 obs_aug_both = torch.cat((observations_aug0, observations_aug1), dim=1)
 # perhaps repeat is enough
 obs_gt = torch.cat((observations, observations), dim=1)
 rewards_gt = torch.cat((rewards, rewards), dim=1)
 # Create initial belief and state for time t = 0
 init_belief, init_state = torch.zeros(args.batch_size*2, args.belief_size, device=device), torch.zeros(
     args.batch_size*2, args.state_size, device=device)
 nonterminals_both = torch.cat((nonterminals, nonterminals), dim=1)
 # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
 # obs_aug is used for state estimation
 beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
     init_state, actions[:-1], init_belief, bottle(encoder, (obs_aug_both[1:], )), nonterminals_both[:-1])
 # we use the original observation for reconstruction
 if args.worldmodel_LogProbLoss:
     observation_dist = Normal(
         bottle(observation_model, (beliefs, posterior_states)), 1)
     observation_loss = -observation_dist.log_prob(
         obs_gt[1:]).sum(dim=(2, 3, 4)).mean(dim=(0, 1))
 else:
     observation_loss = F.mse_loss(
         bottle(observation_model, (beliefs, posterior_states)), obs_gt[1:], reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
 if args.worldmodel_LogProbLoss:
     reward_dist = Normal(
         bottle(reward_model, (beliefs, posterior_states)), 1)
     reward_loss = -reward_dist.log_prob(rewards_gt[:-1]).mean(dim=(0, 1))
 else: