Example #1
    def _train_policy(self, metrics: dict, D: ExperienceReplay, epoch: int,
                      global_prior, free_nats):
        losses = []
        policy_ = deepcopy(self.policy.module)
        optimizer = optim.Adam(policy_.parameters(),
                               lr=cfg.learning_rate,
                               eps=cfg.adam_epsilon)
        # Observations and actions from the most recently collected episode
        O, A, _, _ = D.get_last()
        B, S_pos = torch.zeros(O.size(1) + 1, cfg.belief_size), torch.zeros(
            O.size(1) + 1, cfg.state_size)
        # Filter the episode through the world model to obtain beliefs and posterior states
        with torch.no_grad():
            a_0 = torch.zeros(1, 1, A.size(2))
            b_0 = torch.zeros(1, cfg.belief_size).cuda()
            s_0 = torch.zeros(1, cfg.state_size).cuda()
            B, _, _, _, S_pos, _, _ = self.wm.t_model(
                s_0,
                torch.cat((a_0, A), dim=1)[:, :-1, :], b_0, self.wm.e_model(O))
            B = torch.cat((b_0.unsqueeze(0), B), 1).squeeze(0)
            S_pos = torch.cat((s_0.unsqueeze(0), S_pos), 1).squeeze(0)

            # Query the expert planner for a target action at every visited state
            A_tgt = torch.zeros(B.size(0), A.size(-1))
            for ii in tqdm(list(
                    chunks(list(range(A_tgt.size(0))), cfg.batch_size)),
                           desc=poem(f"{epoch} Query Expert"),
                           leave=False):
                A_tgt[ii] = self.planner(B[ii], S_pos[ii])
        A_tgt = A_tgt.cuda()

        # Behavioural cloning: regress the policy copy onto the planner's target actions
        for _ in tqdm(range(cfg.collect_interval_plcy),
                      desc=poem(f"{epoch} Policy Train"),
                      leave=False):
            ii = random.sample(range(A_tgt.size(0)), cfg.batch_size)
            A_pred = policy_(B[ii], S_pos[ii])
            loss = F.mse_loss(A_pred, A_tgt[ii], reduction='mean')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        metrics['im_loss'].append(mean(losses))

        # Softly update the live policy toward the freshly trained copy
        soft_update(self.policy.module, policy_, cfg.linear_policy_update)
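The `soft_update` helper itself is not shown in these examples. A minimal sketch of a Polyak-style soft update with the same call order (live target first, trained copy second), assuming the third argument is a blending factor in [0, 1]; the actual helper in this codebase may differ:

import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.data.mul_(1.0 - tau).add_(s_p.data, alpha=tau)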
Example #2
    def _reset_realsense(self, wait=False):
        # Power-cycle every connected RealSense device
        ctx = rs.context()
        devices = ctx.query_devices()
        for dev in devices:
            dev.hardware_reset()
        if wait:
            # Give the cameras time to come back; abort early if a stop signal is queued
            for _ in tqdm(range(30), desc=poem("Resetting Realsense Camera")):
                time.sleep(1)
                if not self.local_queue.empty():
                    sys.exit(0)
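Example #2 relies on the `pyrealsense2` bindings (imported as `rs`). A stripped-down, standalone version of the same reset, without the queue-based early exit, for reference:

import time
import pyrealsense2 as rs

def reset_realsense(wait_seconds=30):
    # Power-cycle every connected RealSense device, then wait for re-enumeration
    ctx = rs.context()
    for dev in ctx.query_devices():
        dev.hardware_reset()
    time.sleep(wait_seconds)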
Example #3
    def _train_policy(self, metrics: dict, D: ExperienceReplay, epoch: int,
                      global_prior, free_nats):
        self.wm.eval()
        self.policy.train()
        losses = []
        for _ in tqdm(range(cfg.collect_interval),
                      desc=poem(f"{epoch} Policy Interval"),
                      leave=False):
            O, A, _, M = D.sample()
            # Filter the sampled chunk through the world model to get starting beliefs/states
            with torch.no_grad():
                b_0, _, _, _, s_0, _, _ = self.wm.t_model(
                    torch.zeros(cfg.batch_size, cfg.state_size), A[:, :-1],
                    torch.zeros(cfg.batch_size, cfg.belief_size),
                    bottle(self.wm.e_model, (O[:, 1:], )), M[:, :-1])
                b_0 = b_0.view(-1, cfg.belief_size)
                s_0 = s_0.view(cfg.batch_size * (cfg.chunk_size - 1),
                               cfg.state_size)
                m0 = M[:, 1:].reshape(cfg.batch_size *
                                      (cfg.chunk_size - 1)).byte()
                # b_0, s_0 = b_0[m0], s_0[m0]

            # Roll the policy out for planning_horizon imagined steps from every start state
            T = cfg.planning_horizon + 1
            B, S = [torch.empty(0)] * T, [torch.empty(0)] * T
            B[0], S[0] = b_0, s_0

            for t in range(T - 1):
                # Predict an action and step the learned transition model one imagined step
                A = self.policy(B[t], S[t])
                b_t, s_t, _, _ = self.wm.t_model(S[t], A.unsqueeze(dim=1),
                                                 B[t])
                B[t + 1], S[t + 1] = b_t.squeeze(dim=1), s_t.squeeze(dim=1)

            # Maximise the reward the world model predicts along the imagined trajectories
            loss = -self.wm.r_model(torch.cat(B, dim=0), torch.cat(
                S, dim=0)).mean()

            if cfg.learning_rate_schedule != 0:
                _linearly_ramping_lr(self.plcy_optimizer,
                                     cfg.learning_rate_plcy)

            self.plcy_optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.policy.parameters(),
                                     cfg.grad_clip_norm,
                                     norm_type=2)
            self.plcy_optimizer.step()

            losses.append(loss.item())
        metrics['p_loss'].append(mean(losses))
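The `bottle` helper used in Examples #3 and #5 is not reproduced here. In PlaNet-style code it applies a module to tensors carrying leading (chunk, batch) dimensions by temporarily folding them into one; a sketch under that assumption:

def bottle(f, x_tuple):
    # Fold the (time, batch) dimensions together, apply f, then unfold the result
    sizes = tuple(x.size() for x in x_tuple)
    y = f(*(x.reshape(s[0] * s[1], *s[2:]) for x, s in zip(x_tuple, sizes)))
    return y.view(sizes[0][0], sizes[0][1], *y.size()[1:])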
Example #4
    def collect_interval(self,
                         metrics: dict,
                         D: ExperienceReplay,
                         epoch: int,
                         save_loc=''):
        self.wm.eval()
        self.policy.eval()
        frames = []
        with torch.no_grad():
            # Fresh episode: reset the environment and the latent state
            o, r_tot = torch.tensor(self.env.reset(), dtype=torch.float32), 0
            b, s_post = torch.zeros(1, cfg.belief_size), torch.zeros(
                1, cfg.state_size)
            a = torch.zeros(1, self.env.action_size)
            for t in tqdm(range(
                    ceil(cfg.max_episode_length / cfg.action_repeat)),
                          desc=poem(f"{epoch} Collection"),
                          leave=False):
                b, _, _, _, s_post, _, _ = self.wm.t_model(
                    s_post, a.unsqueeze(dim=1), b,
                    self.wm.e_model(o.unsqueeze(dim=0)).unsqueeze(dim=0))
                b, s_post = b.squeeze(dim=1), s_post.squeeze(
                    dim=1)  # remove time dimension

                # Policy action plus exploration noise, clipped to the valid action range
                a = torch.clamp(
                    self.policy(b, s_post).cpu() +
                    self.action_noise * torch.randn_like(a), -1., 1.)

                o_, r, done = self.env.step(
                    a.view(self.env.action_size).numpy())
                frames.append(self.env.render())

                # Store the transition in the experience replay
                D.push(o, a.view(self.env.action_size), r, done)

                r_tot += r
                o = torch.tensor(o_, dtype=torch.float32)
                if done:
                    break
        if cfg.action_noise_schedule != 0:
            self._linearly_ramping_an()
        # Record cumulative environment steps, the episode index and the total reward
        metrics['steps'].append(t if len(metrics['steps']) == 0 else t +
                                metrics['steps'][-1])
        metrics['episodes'].append(epoch)
        metrics['rewards'].append(r_tot)
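`_linearly_ramping_an` is likewise not shown. A sketch of a linear exploration-noise decay using hypothetical `cfg.action_noise` and `cfg.action_noise_min` fields together with the `cfg.action_noise_schedule` seen above; the real schedule may ramp differently:

def _linearly_ramping_an(self):
    # Decay exploration noise linearly toward a floor, one step per collected episode
    step = (cfg.action_noise - cfg.action_noise_min) / cfg.action_noise_schedule
    self.action_noise = max(self.action_noise - step, cfg.action_noise_min)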
Example #5
    def _train_worldmodel(self, metrics: dict, D: ExperienceReplay, epoch: int,
                          global_prior, free_nats):
        losses = []
        for _ in tqdm(range(cfg.collect_interval_worm),
                      desc=poem(f"{epoch} Train Interval"),
                      leave=False):
            # self.optimizer.zero_grad()
            # Sample a chunk of observations, actions, rewards and masks from replay
            O, A, R, M = D.sample()

            b_0 = torch.zeros(cfg.batch_size, cfg.belief_size)
            s_0 = torch.zeros(cfg.batch_size, cfg.state_size)

            # Y := B, S_pri, MU_pri, STD_pri, S_pos, MU_pos, STD_pos
            Y = self.wm.t_model(s_0, A[:, :-1], b_0,
                                bottle(self.wm.e_model, (O[:, 1:], )),
                                M[:, :-1])

            # Observation, reward and KL loss terms for the world model
            o_loss, r_loss, kl_loss = self._reconstruction_loss(
                Y, O, R, free_nats, global_prior)

            if cfg.overshooting_kl_beta != 0:
                kl_loss += self._latent_overshooting(Y, A, M, free_nats)

            if cfg.learning_rate_schedule != 0:
                self._linearly_ramping_lr(self.wm_optimizer)

            self.wm_optimizer.zero_grad()
            (o_loss + r_loss + kl_loss).backward()
            nn.utils.clip_grad_norm_(self.param_list,
                                     cfg.grad_clip_norm,
                                     norm_type=2)
            self.wm_optimizer.step()

            losses.append([o_loss.item(), r_loss.item(), kl_loss.item()])

        # Average each loss component over the training interval
        o_loss, r_loss, kl_loss = tuple(zip(*losses))
        metrics['o_loss'].append(mean(o_loss))
        metrics['r_loss'].append(mean(r_loss))
        metrics['kl_loss'].append(mean(kl_loss))
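`_reconstruction_loss` is not reproduced in these examples. The `free_nats` argument suggests the usual PlaNet-style clamp on the KL term, illustrated below with placeholder distribution parameters (an assumption about the internals; the actual loss may differ):

import torch
from torch.distributions import Normal, kl_divergence

# Placeholder posterior/prior statistics with shape (batch, state_size)
mu_pos, std_pos = torch.zeros(8, 30), torch.ones(8, 30)
mu_pri, std_pri = torch.zeros(8, 30), 2 * torch.ones(8, 30)
free_nats = torch.full((1,), 3.0)

kl = kl_divergence(Normal(mu_pos, std_pos), Normal(mu_pri, std_pri)).sum(dim=-1)
kl_loss = torch.max(kl, free_nats).mean()  # KL below the threshold contributes no gradient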