Example #1
        def rollout(env, policy, max_path_length, add_input=None, volatile=False, reset_args=None):
            # Roll out a single trajectory for max_path_length steps, collecting
            # observations, rewards, and sampled actions.
            sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
            obs = env.reset(reset_args)
            for s in range(max_path_length):
                policy_input = Variable(from_numpy(np.array([obs])).float(), volatile=volatile)

                if add_input is not None:
                    policy_input = torch.cat([policy_input, add_input], -1)
                if s == 0:
                    policy.reset(1)
                if policy.recurrent():
                    policy_input = policy_input.unsqueeze(0)
                action_dist = policy.forward(policy_input)
                action = action_dist.sample()

                x = env.step(get_numpy(action))
                next_obs = x[0]
                sd['obs'].append(obs)
                sd['rewards'].append(x[1])
                sd['actions'].append(action)
                obs = next_obs
            sd['obs'].append(obs)  # include the final observation
            sd['obs'] = np.array(sd['obs']) # (max_path_length + 1, obs_dim)
            sd['rewards'] = np.array(sd['rewards']) # (max_path_length,)
            sd['actions'] = torch.stack(sd['actions'], 1)

            return sd
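
For reference, here is a minimal, self-contained sketch of the same rollout pattern in plain modern PyTorch. DummyEnv and DummyPolicy are hypothetical stand-ins for the repository's env and policy interfaces, not part of the project.

# Minimal rollout sketch (DummyEnv / DummyPolicy are illustrative stand-ins).
import numpy as np
import torch
from torch import nn
from torch.distributions import Categorical

class DummyEnv:
    """Gym-style toy environment: 4-dim observations, 2 discrete actions."""
    def reset(self):
        return np.zeros(4, dtype=np.float32)
    def step(self, action):
        return np.random.randn(4).astype(np.float32), 0.0, False, {}

class DummyPolicy(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 2)
    def forward(self, obs):
        return Categorical(logits=self.linear(obs))

env, policy = DummyEnv(), DummyPolicy()
obs, traj = env.reset(), dict(obs=[], actions=[], rewards=[])
for _ in range(10):                                   # max_path_length = 10
    dist = policy(torch.as_tensor(obs).unsqueeze(0))  # add a batch dimension
    action = dist.sample()
    next_obs, reward, done, _ = env.step(action.item())
    traj['obs'].append(obs)
    traj['actions'].append(action)
    traj['rewards'].append(reward)
    obs = next_obs
traj['obs'].append(obs)                               # (max_path_length + 1) observations
actions = torch.stack(traj['actions'], 1)             # shape (1, max_path_length)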
Example #2
 def __init__(self, encoder, decoder, latent_dim, step_dim, obs_dim, act_dim, policy, env, optimizer=None, loss_type='mse',
              init_kl_weight=.001, max_kl_weight=.1, kl_mul=1.07, vae_loss_weight=1, lr=1e-3, bc_weight=100, ego=False, egoidx=None):
     self.encoder = encoder
     self.obs_dim = obs_dim
     self.act_dim = act_dim
     self.decoder = decoder
     self.ego = ego
     self.egoidx = egoidx
     self.bc_weight = bc_weight
     self.env = env()
     self.policy = policy
     self.unit_n = Normal(Variable(torch.zeros(1, latent_dim)),  # standard normal prior p(z)
                          log_var=Variable(torch.zeros(1, latent_dim)))
     self.latent_dim = latent_dim
     self.step_dim = step_dim
     self.init_kl_weight = init_kl_weight
     self.max_kl_weight = max_kl_weight
     self.kl_mul = kl_mul
     if optimizer is None:
         optimizer = Adam(self.get_params(), lr=lr, eps=1e-5)
     self.loss_type = loss_type 
     self.vae_loss_weight = vae_loss_weight
     self.optimizer = optimizer
     if gpu_enabled():
         self.encoder.cuda()
         self.decoder.cuda()
Example #3
    def compute_loss(self, x, actdata, z, z_dist):
        y_dist = self.decode(x, z)
        y = x[:, self.step_dim:].contiguous()

        xview = x.view((x.size()[0], -1, self.obs_dim)).clone()
        if self.ego:
            # Re-center the ego coordinates relative to the first timestep.
            xview[:, :, self.egoidx] -= xview[:, [0], self.egoidx].unsqueeze(1)
        zexpand = z.unsqueeze(1).expand(*xview.size()[:2], z.size()[-1])
        xz = torch.cat((Variable(xview), zexpand), -1)
        xz_view = xz.view((-1, xz.size()[-1]))
        if self.policy.recurrent():
            self.policy.reset(x.size()[0])
            xz_view = xz.transpose(0, 1)
            actdata = actdata.view((actdata.size()[0], -1, self.act_dim))
            actdata = actdata.transpose(0, 1).contiguous()
        dist = self.policy.forward(xz_view)
        act_view = actdata.view((-1, self.act_dim))
        
        # Behavior-cloning term: negative log-likelihood of the expert actions
        # under the policy conditioned on (state, latent z).
        bcloss = -dist.log_likelihood(Variable(act_view))
        if self.policy.recurrent():
            bcloss = bcloss.view(*actdata.size()[:2]).mean(0)
        else:
            bcloss = bcloss.view((actdata.size()[0], -1)).mean(1)

        mse, neg_ll, kl = self.loss(Variable(y), y_dist, z_dist)
        return mse, neg_ll, kl, bcloss, z_dist
Example #4
 def test(self, dataset):
     data = FloatTensor(dataset.train_data)
     x, actdata = self.splitobs(data)
     y = x[:, self.step_dim:]
     z_dist = self.encode(Variable(x))
     z = z_dist.sample()
     y_dist = self.decode(x, z)
     # Note: despite the name, this is the mean squared error of the decoder's
     # MLE reconstruction against the target trajectory.
     log_likelihood = torch.pow(y_dist.mle - Variable(y), 2).mean(-1).mean(0)
     return get_numpy(log_likelihood).item()
Example #5
 def decode(self, x, z):
     if self.decoder.recurrent():
         initial_input = x[:, :self.step_dim].contiguous().clone()
         if self.ego:
             # Zero out the ego coordinates before decoding; the offset is added
             # back to the decoded means below.
             diff = initial_input[:, self.egoidx].clone()
             initial_input[:, self.egoidx] = 0
         output = self.decoder.forward(z, initial_input=Variable(initial_input))
         if self.ego:
             bs = output.mean.shape[0]
             mean = output.mean.view((bs, -1, self.step_dim))
             mean[:, :, self.egoidx] += Variable(diff[:, None])
             output.mean = mean.view(output.mean.shape)
         return output
     else:
         return self.decoder.forward(z)
Example #6
    def rollout(self, max_path_length, add_input=None, reset_args=None, volatile=False):
        sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[], states=[])
        obs = self.reset_envs(reset_args)
        self.policy.reset(obs.shape[0])

        for s in range(max_path_length):
            state = self.policy.get_state().data if self.policy.recurrent() else None

            if self.ego:
                obs_ego = obs.copy()
                obs_ego[:, self.egoidx] -= reset_args[:, self.egoidx]
                policy_input = Variable(from_numpy(obs_ego).float(), volatile=volatile)
            else:
                policy_input = Variable(from_numpy(obs).float(), volatile=volatile)

            if add_input is not None:
                policy_input = torch.cat([policy_input, add_input], -1)
            action_dist = self.policy.forward(policy_input)

            action = action_dist.sample()

            if self.random_action_p > 0:
                flip = np.random.binomial(1, self.random_action_p, size=len(obs))
                if flip.sum() > 0:
                    random_act = np.random.randint(0, self.policy.output_dim, size=flip.sum())
                    action[from_numpy(flip).byte()] = from_numpy(random_act)

            next_obs, rewards, done, info = self.step_envs(get_numpy(action))
            sd['obs'].append(obs)
            sd['rewards'].append(rewards)
            sd['actions'].append(action)
            sd['action_dist_lst'].append(action_dist)
            sd['states'].append(state)
            obs = next_obs
        # Append last obs
        sd['obs'].append(obs)
        sd['obs'] = np.stack(sd['obs'], 1) # (bs, max_path_length + 1, obs_dim)
        sd['states'] = torch.stack(sd['states'], 2) if self.policy.recurrent() else None
        sd['rewards'] = np.stack(sd['rewards'], 1) # (bs, max_path_length)
        sd['actions'] = torch.stack(sd['actions'], 1)

        sd['action_dist'] = sd['action_dist_lst'][0].combine(sd['action_dist_lst'],
                torch.stack, axis=1)

        return sd
Example #7
    def forward_batch(self, batch):
        obsdata, actdata = self.splitobs(batch)

        x = obsdata
        z_dist = self.encode(Variable(x))
        z = z_dist.sample()
        return self.compute_loss(x, actdata, z, z_dist)
Example #8
    def rollout(self, max_path_length, add_input=None, volatile=False):
        sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
        obs = self.envs.reset()
        self.policy.reset(len(obs))
        for s in range(max_path_length):
            policy_input = Variable(from_numpy(np.stack(obs)).float(), volatile=volatile)
            if add_input is not None:
                policy_input = torch.cat([policy_input, add_input], -1)
            action_dist = self.policy.forward(policy_input)

            action = action_dist.sample()

            if self.random_action_p > 0:
                flip = np.random.binomial(1, self.random_action_p, size=len(obs))
                if flip.sum() > 0:
                    random_act = np.random.randint(0, int(self.env.action_space.flat_dim), size=flip.sum())
                    action[from_numpy(flip).byte()] = from_numpy(random_act)

            next_obs, rewards, done, info = self.envs.step(get_numpy(action))

            sd['obs'].append(obs)
            sd['rewards'].append(rewards)
            sd['actions'].append(action)
            sd['action_dist_lst'].append(action_dist)
            obs = next_obs
        # Append last obs
        sd['obs'].append(obs)
        sd['obs'] = np.stack(sd['obs'], 1) # (bs, max_path_length + 1, obs_dim)
        sd['rewards'] = np.stack(sd['rewards'], 1) # (bs, max_path_length)
        sd['actions'] = torch.stack(sd['actions'], 1)

        sd['action_dist'] = sd['action_dist_lst'][0].combine(sd['action_dist_lst'],
                torch.stack, axis=1)

        return sd
Example #9
 def predict(self, obs_np):
     bs, path_len, obs_dim = obs_np.shape
     obs = obs_np.reshape(-1, obs_dim)
     if self._coeffs is None:
         return Variable(torch.zeros((bs, path_len)))
     returns = self._features(obs).dot(self._coeffs).reshape((-1, path_len))
     return np_to_var(returns)
Example #10
    def train_pd_match_sd(self, dataset, bs, itr, outer_itr):
        sampler = self.sampler
        expert_traj, _ = dataset.sample(bs)

        # sample from dataset to initialize trajectory from
        x, actdata = self.vae.splitobs(FloatTensor(expert_traj))

        z = Variable(torch.randn((bs, self.latent_dim)))
        pd_traj, sd_traj = self.forward(sampler, x, z)
        sd_traj_obs = get_numpy(sd_traj.mle)

        traj_3d_shape = (bs, -1, self.obs_dim)

        pd_traj_obs = np_to_var(pd_traj['obs'][:, 1:])

        # Log-likelihood of the policy rollout under the state decoder's
        # distribution; used as the reward signal below.
        se = sd_traj.reshape(traj_3d_shape).log_likelihood(pd_traj_obs)
        mse_sd_pd = self.compute_traj_mse(pd_traj_obs, sd_traj.mle,
                                          traj_3d_shape)

        pd_traj['rewards'] = get_numpy(se)

        self.policy_algo.process_samples(0, pd_traj, augment_obs=get_numpy(z))
        self.policy_algo.optimize_policy(0, pd_traj)

        traj_sets = [sd_traj_obs, pd_traj['obs'][:, 1:]]

        pd_traj['stats']['mse_sd_pd'] = get_numpy(mse_sd_pd.mean()).item()
        pd_traj['stats']['ll'] = np.mean(get_numpy(se))

        return pd_traj['stats']
Example #11
    def fit(self, obs_np, returns_np):
        self.network.apply(xavier_init)
        bs, path_len, obs_dim = obs_np.shape

        obs = from_numpy(obs_np.reshape(-1, obs_dim).astype(np.float32))
        returns = from_numpy(returns_np.reshape(-1).astype(np.float32))

        dataloader = DataLoader(TensorDataset(obs, returns), batch_size=self.batch_size,
                                 shuffle=True)
        for epoch in range(self.max_epochs):
            for x, y in dataloader:
                self.optimizer.zero_grad()
                x = Variable(x)
                y = Variable(y).float().view(-1, 1)
                loss = (self.network(x) - y).pow(2).mean()
                loss.backward()
                self.optimizer.step()
        print('loss %f' % get_numpy(loss).item())  # loss of the final minibatch
Example #12
 def log_likelihood(self, x):
     # x is (bs, path_len * sum(cat_sizes)) one hots
     bs = x.size()[0]
     x_3d = x.view(bs, self.path_len, sum(self.cat_sizes))
     count = 0
     total_ll = Variable(torch.zeros(1))
     for cat_size in self.cat_sizes:
         # Each block of columns is one categorical; accumulate its log-probability.
         prob = self.probs_3d[:, :, count:count + cat_size]
         onehot = x_3d[:, :, count:count + cat_size]
         ll = torch.log(torch.sum(prob * onehot, -1) + EPS)
         total_ll += ll.sum()
         count += cat_size
     return total_ll / bs
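
The same per-block computation can be expressed with torch.distributions.Categorical. The sketch below uses random stand-in probabilities and targets, so the shapes and names are illustrative assumptions only.

# Minimal sketch: per-block categorical log-likelihood of one-hot targets,
# summed over blocks and averaged over the batch (illustrative stand-ins).
import torch
from torch.distributions import Categorical

bs, path_len, cat_sizes = 8, 5, [3, 4]
prob_blocks = [torch.softmax(torch.randn(bs, path_len, c), -1) for c in cat_sizes]  # ~ probs_3d
target_idx = [torch.randint(c, (bs, path_len)) for c in cat_sizes]                  # ~ argmax of x_3d

total_ll = torch.zeros(())
for probs, idx in zip(prob_blocks, target_idx):
    total_ll = total_ll + Categorical(probs=probs).log_prob(idx).sum()
print(total_ll / bs)  # average log-likelihood per trajectory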
Example #13
    def plot_compare(self, dataset, itr, save_dir='trajs'):
        x = FloatTensor(dataset.sample_hard(5)[0])
        x, actdata = self.splitobs(x)
        target = x[:, self.step_dim:]

        y_dist = self.decode(x, self.encode(Variable(x)).sample())

        traj_sets = [dataset.unnormalize(get_numpy(traj_set)) for traj_set in [target, y_dist.mle]]

        traj_names = ['expert', 'sd']
        plot_traj_sets([dataset.process(traj_set) for traj_set in traj_sets], traj_names, itr, env_id=dataset.env_id)
        for traj_no in range(5):
            dataset.plot_pd_compare([x[traj_no, ...] for x in traj_sets], traj_names, itr, name='Full_State_%d' % traj_no,
                                    save_dir=save_dir)
Example #14
File: trpo.py Project: luisenp/Sectar
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params()

        # Gradients of f w.r.t. the parameters, keeping the graph so they can be
        # differentiated again inside Hx_plain.
        constraint_grads = list(autograd.grad(f, params, create_graph=True,
                                              allow_unused=True))
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = Variable(torch.zeros(param.size()),
                                                 requires_grad=True)

        def Hx_plain(xs):
            # Hessian-vector product: differentiate sum_i <g_i, x_i> w.r.t. params.
            Hx_plain_splits = list(autograd.grad(
                torch.sum(
                    torch.stack([
                        torch.sum(g * x) for g, x in zip(constraint_grads, xs)
                    ])), params, retain_graph=True, allow_unused=True))
            for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)):
                if Hx is None:
                    Hx_plain_splits[idx] = torch.zeros_like(param)
            return [x.view(-1) for x in Hx_plain_splits]

        self.f_Hx_plain = Hx_plain
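
As a standalone illustration of the Hessian-vector-product trick used in Hx_plain, here is a toy sketch on a quadratic; it is an assumption-laden example in plain PyTorch, not the repository's optimizer code.

# Toy Hessian-vector product via double differentiation (illustrative only).
import torch

w = torch.randn(3, requires_grad=True)
A = torch.diag(torch.tensor([2.0, 3.0, 4.0]))
f = w @ A @ w                                          # f(w) = w^T A w, Hessian = 2A

(g,) = torch.autograd.grad(f, w, create_graph=True)    # keep graph for the 2nd derivative

def hvp(v):
    # Differentiating <g, v> w.r.t. w gives H v without ever forming H explicitly.
    (hv,) = torch.autograd.grad(torch.dot(g, v), w, retain_graph=True)
    return hv

print(hvp(torch.ones(3)))   # equals 2 * A @ [1, 1, 1] = [4., 6., 8.]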
Example #15
    def log_likelihood_full(self, x):
        # x is (bs, path_len, sum(cat_sizes)) one hots
        bs = x.size()[0]
        x_3d = x.view(bs, self.path_len, sum(self.cat_sizes))
        count = 0
        total_ll = Variable(torch.zeros(bs, self.path_len))
        for cat_size in self.cat_sizes:
            prob = self.probs_3d[:, :, count:count + cat_size]
            onehot = x_3d[:, :, count:count + cat_size]
            # Surrogate "log likelihood": negative squared distance between the
            # predicted and target category indices for this block.
            _, p_idx = prob.max(-1)
            _, x_idx = onehot.max(-1)
            ll = -torch.pow((p_idx - x_idx).float(), 2)

            total_ll += ll
            count += cat_size
        return total_ll
Example #16
File: rnn.py Project: yuanying-cc/Sectar
 def init_hidden(self, bs):
     # Zero-initialize the (h, c) recurrent state, each of shape
     # (h_size, batch_size, hidden_dim).
     self.hidden = (Variable(torch.zeros(self.h_size, bs, self.hidden_dim)),
                    Variable(torch.zeros(self.h_size, bs, self.hidden_dim)))
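
For context, a short sketch of the state shape this zero-initializes, assuming the recurrent core is a torch.nn.LSTM and that h_size plays the role of num_layers (an assumption):

# Minimal sketch of an LSTM (h, c) state of shape (num_layers, batch, hidden_dim).
import torch
from torch import nn

num_layers, bs, hidden_dim = 1, 4, 16
lstm = nn.LSTM(input_size=8, hidden_size=hidden_dim, num_layers=num_layers)
hidden = (torch.zeros(num_layers, bs, hidden_dim),
          torch.zeros(num_layers, bs, hidden_dim))
out, hidden = lstm(torch.randn(10, bs, 8), hidden)   # input is (seq_len, batch, input_size)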
Example #17
 def init_input(self, bs):
     return Variable(torch.zeros(bs, self.output_dim))
Example #18
 def predict(self, obs_np):
     bs, path_len, obs_dim = obs_np.shape
     return Variable(torch.zeros(bs, path_len))
Example #19
 def forward(self, obs_np):
     bs, obs_dim = obs_np.size()
     return Variable(torch.zeros(bs))
Example #20
 def init_input(self, bs):
     # Return one hot for each cat
     return Variable(torch.zeros(bs, self.output_dim))
Example #21
File: ppo.py Project: yuanying-cc/Sectar
    def optimize_policy(self,
                        itr,
                        samples_data,
                        add_input_fn=None,
                        add_input_input=None,
                        add_loss_fn=None,
                        print=True):

        advantages = from_numpy(samples_data['discount_adv'].astype(
            np.float32))
        n_traj = samples_data['obs'].shape[0]
        n_obs = n_traj * self.max_path_length
        if add_input_fn is not None:
            obs = from_numpy(samples_data['obs']
                             [:, :self.max_path_length, :self.obs_dim].astype(
                                 np.float32)).view(n_obs, -1)
        else:
            obs = from_numpy(
                samples_data['obs'][:, :self.max_path_length, :].astype(
                    np.float32)).view(n_obs, -1)

        actions = samples_data['actions'].view(n_obs, -1).data
        returns = from_numpy(samples_data['discount_returns'].copy()).view(
            -1, 1).float()
        old_action_log_probs = samples_data['log_prob'].view(n_obs, -1).data
        states = samples_data['states'].view(
            samples_data['states'].size()[0], n_obs,
            -1) if self.policy.recurrent() else None

        for epoch_itr in range(self.epoch):
            sampler = BatchSampler(SubsetRandomSampler(range(n_obs)),
                                   self.ppo_batch_size,
                                   drop_last=False)
            for indices in sampler:
                indices = LongTensor(indices)
                obs_batch = Variable(obs[indices])
                actions_batch = actions[indices]
                return_batch = returns[indices]
                old_action_log_probs_batch = old_action_log_probs[indices]
                if states is not None:
                    self.policy.set_state(Variable(states[:, indices]))

                if add_input_fn is not None:
                    add_input_dist = add_input_fn(Variable(add_input_input))
                    add_input = add_input_dist.sample()
                    add_input_rep = torch.unsqueeze(add_input, 1).repeat(
                        1, self.max_path_length, 1).view(n_obs, -1)
                    add_input_batch = add_input_rep[indices]
                    obs_batch = torch.cat([obs_batch, add_input_batch], -1)

                values = self.baseline.forward(obs_batch.detach())
                action_dist = self.policy.forward(obs_batch)
                action_log_probs = action_dist.log_likelihood(
                    Variable(actions_batch)).unsqueeze(-1)
                dist_entropy = action_dist.entropy().mean()

                ratio = torch.exp(action_log_probs -
                                  Variable(old_action_log_probs_batch))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv_targ
                action_loss = -torch.min(
                    surr1,
                    surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                self.optimizer.zero_grad()

                total_loss = (value_loss + action_loss -
                              dist_entropy * self.entropy_bonus)
                if add_loss_fn is not None:
                    total_loss += add_loss_fn(add_input_dist, add_input,
                                              add_input_input)
                total_loss.backward()
                self.optimizer.step()
            if print:
                stats = {
                    'total loss': get_numpy(total_loss)[0],
                    'action loss': get_numpy(action_loss)[0],
                    'value loss': get_numpy(value_loss)[0],
                    'entropy': get_numpy(dist_entropy)[0]
                }
                with logger.prefix('Train PPO itr %d epoch itr %d | ' %
                                   (itr, epoch_itr)):
                    self.print_diagnostics(stats)

        return total_loss
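
To isolate the core of the update, here is a minimal sketch of PPO's clipped surrogate objective on random stand-in tensors; the shapes and the clip value are illustrative assumptions.

# Minimal sketch of the PPO clipped surrogate L^CLIP (illustrative stand-ins).
import torch

clip_param = 0.2
new_log_probs = torch.randn(64, 1)
old_log_probs = torch.randn(64, 1)
advantages = torch.randn(64, 1)

ratio = torch.exp(new_log_probs - old_log_probs)
surr1 = ratio * advantages
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
action_loss = -torch.min(surr1, surr2).mean()   # pessimistic surrogate to minimize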
Example #22
 def sample(self, dataset, sample_size):
     trajs, _ = dataset.sample(sample_size)
     latent = Variable(torch.randn((sample_size, self.latent_dim)))
     return self.decode(FloatTensor(trajs), latent), latent
Example #23
 def sample(self, deterministic=False):
     if deterministic:
         return self.mean
     else:
         # Reparameterized sample: mean + eps * exp(log_var), with eps ~ N(0, I).
         return Variable(torch.randn(self.mean.size())) * torch.exp(self.log_var) + self.mean
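
The stochastic branch above is the reparameterization trick; a minimal sketch with modern torch.distributions follows. Whether log_var stores a log-variance or a log-standard-deviation depends on the repository's Normal class, so treating exp(log_var) as the standard deviation here mirrors the expression above rather than the usual exp(0.5 * log_var) convention.

# Minimal reparameterized-sampling sketch (mirrors the expression above).
import torch
from torch.distributions import Normal

mean = torch.zeros(4)
log_std = torch.zeros(4)                         # plays the role of log_var above
std = torch.exp(log_std)

z = Normal(mean, std).rsample()                  # differentiable: mean + eps * std
z_manual = mean + torch.randn_like(mean) * std   # the same trick written out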