Example #1
    def get_action(self, s):
        with t.no_grad():
            # convert s to tensor if not already
            if not t.is_tensor(s):
                s = t.tensor(s)
            s = s.to(dtype=self.dtype)
            assert s.shape[-1] == self.obs_shape[0]
            assert len(s.shape) <= 2
            # Remember whether a single (unbatched) state was passed in, so
            # we can return a scalar action at the end.
            item = False
            if len(s.shape) == 1:
                s = s.unsqueeze(0)
                item = True

            # get all possible state action pairs
            s_a = s.repeat_interleave(self.act_shape, 0)
            a_s = t.arange(self.act_shape, dtype=self.dtype)
            a_s = a_s.repeat(s.shape[0]).unsqueeze(1)
            theta = self._Qdist(s_a, a_s)
            s_dist = BDR_dist(*theta, fp=True)

            # Select action with max expected value:
            means = s_dist.mean().reshape(-1,
                                          self.act_shape).argmax(-1,
                                                                 keepdim=True)
            if item:
                return means.item()
            return means
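
The repeat_interleave/repeat pairing above enumerates every (state, action) combination. A minimal, self-contained sketch of that trick in plain PyTorch (the names states/num_actions are illustrative, not from the original code):

import torch as t

states = t.tensor([[0.1, 0.2], [0.3, 0.4]])      # batch of 2 states, obs dim 2
num_actions = 3

s_a = states.repeat_interleave(num_actions, 0)   # rows: s0, s0, s0, s1, s1, s1
a_s = t.arange(num_actions, dtype=states.dtype)  # [0., 1., 2.]
a_s = a_s.repeat(states.shape[0]).unsqueeze(1)   # rows: 0, 1, 2, 0, 1, 2

# Row i of (s_a, a_s) is the i-th (state, action) pair:
# (s0, a0), (s0, a1), (s0, a2), (s1, a0), ...
print(t.cat((s_a, a_s), 1))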
Example #2
def plot_icdf(net, x_loc):
    with t.no_grad():
        num_pts = 100
        samples = 70
        xs = t.tensor([x_loc, x_loc]).unsqueeze(1)
        xs = xs.expand(samples, -1, -1).to(dtype=dtype, device=device)

        theta, _, _ = bf.propagate(net, xs)

        dist = BDR_dist(*theta, fp=True)

        p_vals = t.linspace(0.005, 0.995, num_pts).to(dtype=dtype, device=device)

        icdfs = dist.icdf(p_vals, avg=True).detach().cpu()[0]

        ic_mean = icdfs.mean(0).squeeze()
        ic_std  = icdfs.std(0).squeeze()

        fig, (ax1) = plt.subplots(1, 1, figsize=(10,6))

        p_vals = p_vals.detach().cpu()
        ax1.plot(p_vals, ic_mean)
        ax1.fill_between(p_vals, (ic_mean+ic_std), (ic_mean-ic_std), alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Return")
        ax1.set_xlabel("P")
        plt.show()
Example #3
def plot_cdf(net, x_loc):
    with t.no_grad():
        # num points on graph
        num_pts = 100
        # samples from posterior
        samples = 50

        xs = t.tensor([x_loc]).unsqueeze(1)
        xs = xs.expand(samples, -1, -1).to(dtype=dtype, device=device)

        theta, _, _ = bf.propagate(net, xs)

        dist = BDR_dist(*theta)

        # -2 to 5 is arbitrary... Seems to work well with the given data.
        ys = t.linspace(-2, 5, num_pts).to(dtype=dtype, device=device)

        cdfs = dist.cdf(ys, avg=True).detach().cpu()
        c_mean = cdfs.mean(0).squeeze()
        c_std  = cdfs.std(0).squeeze()

        fig, (ax1) = plt.subplots(1, 1, figsize=(10,6))

        ax1.plot(ys.detach().cpu(), c_mean)
        ax1.fill_between(ys.detach().cpu(), (c_mean+c_std), (c_mean-c_std), alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Cumulative probability")
        ax1.set_xlabel("Y value")
        plt.show()
Example #4
def train_net(X, y):
    samples = 20
    net = BDR(in_features=X.shape[1],
              inducing_batch=40,
              N=7,
              layer_sizes=(40,),
              dtype=X.dtype,
              f_postproc='sort'
              )
    net.to(dtype=dtype, device=device)

    opt = t.optim.Adam(net.parameters(), lr=0.05)

    for _ in tqdm(range(100)):
        for batch in range(batches):
            l = batch * train_batch
            u = l     + train_batch
            batch_X = X[l:u].expand(samples, -1, -1)
            # No need to expand batch_y manually;
            # log_prob broadcasts across the samples automatically.
            batch_y = y[l:u]
            opt.zero_grad()
            theta, logpq, _ = bf.propagate(net, batch_X)
            ll = BDR_dist(*theta).log_prob(batch_y).sum(-1).mean(-1)
            assert ll.shape == (samples,)
            assert logpq.shape == (samples,)
            elbo = ll + logpq/data_size
            (-elbo.mean()).backward()
            opt.step()
    return net
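
train_net relies on module-level names (batches, train_batch, data_size, dtype, device) defined outside the snippet. A hypothetical setup, purely for illustration and not taken from the original project:

import torch as t

dtype, device = t.float64, 'cpu'

X = t.randn(200, 1, dtype=dtype)          # toy 1-D inputs
y = t.sin(3 * X) + 0.1 * t.randn_like(X)  # noisy targets

train_batch = 50                          # minibatch size used for slicing
batches = X.shape[0] // train_batch       # minibatches per epoch
data_size = X.shape[0]                    # scales the KL term in the ELBO

net = train_net(X, y)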
Example #5
def sample_test(net, x_loc, true_ys):
    with t.no_grad():
        num_pts = 100
        samples = 100

        # We are only evaluating this at 1 x location
        # Think of this as a single (state, action) pair
        xs = t.tensor([x_loc]).unsqueeze(1)
        # Must expand to predict <samples> different parameter values for each x
        # location in the batch
        xs = xs.expand(samples, -1, -1).to(dtype=dtype, device=device)

        # In order to plot this, we will evaluate the density at the single x
        # location num_pts times:
        # ys = t.linspace(true_ys.min(), true_ys.max(), num_pts).to(dtype=dtype, device=device)
        ys = t.linspace(-5, 5, num_pts).to(dtype=dtype, device=device)

        # Generate the <samples> parameter predictions at this single x or
        # (s,a) location
        theta, _, _ = bf.propagate(net, xs)
        dist = BDR_dist(*theta, fp=True)

        # Evaluate the density of each of the <num_pts> plotting points
        ss = dist.log_prob(ys).exp().detach().cpu()
        # ss has shape [samples, batch, num_pts]
        ss_mean = ss.mean(0)
        ss_std  = ss.std(0)

        mean = dist.mean().detach().cpu()
        print("mean shape: ", mean.shape)

        fig, (ax1) = plt.subplots(1, 1, figsize=(10,6))

        (f, alpha, beta) = theta
        # f_avg is [1], because we only predicted params at 1 x location (x_loc)
        f_avg = f.mean(0)
        # ax1.vlines(f_avg, 0, 1)

        # dist is a batch of distributions, with only 1 batch!
        modes = dist.mode_at(0, avg=True).detach().cpu()
        ax1.vlines(modes, 0, 1, color='r', linewidths=2)

        # plot modes samples
        modes = dist.modes()[0].detach().cpu()
        ax1.vlines(modes, 0, 1, color='r', linewidths=0.5, alpha=0.2)

        ax1.vlines(mean, 0, 1, color='b', linewidths=2)

        samples = dist.sample(1000).detach().cpu()
        ax1.hist(samples, bins=100, density=True, color='grey', alpha=0.2)

        true_ys = true_ys.detach().cpu()
        ax1.scatter(true_ys, t.zeros_like(true_ys))

        ax1.plot(ys.detach().cpu(), ss_mean[0])
        ax1.fill_between(ys.detach().cpu(), (ss_mean+ss_std)[0], (ss_mean-ss_std)[0], alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Density")
        ax1.set_xlabel("Y value")
        plt.show()
Example #6
    def get_return_dists(self, s):
        """
        Safe to assume that s is a single state (numpy array)
        """
        s = t.tensor(s, dtype=self.dtype, device=self.device).unsqueeze(0)
        s = s.repeat_interleave(self.act_shape, 0)
        a_s = t.arange(self.act_shape, dtype=self.dtype, device=self.device).unsqueeze(1)
        theta, _, _ = self._Qdist(s, a_s, self.S_train)
        return BDR_dist(*theta, fp=True)
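
Examples #7 and #9 below build the network input by appending a one-hot action encoding to each state row via cat((s, one_hot(a, act_shape)), 1). A minimal sketch of that construction with plain torch.nn.functional.one_hot, assuming integer action indices (the project's one_hot helper may differ, e.g. in accepting float indices):

import torch as t
import torch.nn.functional as F

s = t.randn(2, 4)                   # 2 states, obs dim 4
num_actions = 3
a = t.tensor([[0], [2]])            # integer action indices, shape [2, 1]

a_onehot = F.one_hot(a.squeeze(1), num_actions).to(s.dtype)  # shape [2, 3]
batch_sa = t.cat((s, a_onehot), 1)  # shape [2, 7] = obs dim + num actions
assert batch_sa.shape[-1] == s.shape[-1] + num_actions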
Example #7
    def train_step(self):
        # Training step for BDRL agent

        # Sample transitions from buffer
        s, a, sp, r, done = self.buffer.random_batch(self.train_batch)

        # Find the optimal next action:
        with t.no_grad():
            s = s.to(dtype=self.dtype, device=self.device)

            # Get all possible subsequent state-action pairs
            ap_h = t.arange(self.act_shape, dtype=self.dtype, device=self.device)
            ap_h = ap_h.repeat(sp.shape[0]).unsqueeze(1)
            sp   = sp.repeat_interleave(self.act_shape, 0)
            batch_spap_h = t.cat((sp, one_hot(ap_h, self.act_shape)), 1)
            batch_spap_h = batch_spap_h.expand(self.S_train, -1, -1)

            assert batch_spap_h.shape[-1] == self.obs_shape[0] + self.act_shape
            theta, _, _ = bf.propagate(self.net, batch_spap_h)
            sp_dist = BDR_dist(*theta, fp=True)

            # Select the action with the max expected value
            # TODO try other exploration policies
            means = sp_dist.mean().reshape(-1, self.act_shape)
            ap = means.argmax(-1, keepdim=True)

            # Locate the parameters of the corresponding (sp, ap)
            # distribution(s) so that we can sample from these.
            f_tmp, a_tmp, b_tmp = theta
            idxs = (t.arange(ap.size(0))*self.act_shape)+ap.flatten()
            theta_spap = (f_tmp[:,idxs].squeeze(),
                          a_tmp[:,idxs].squeeze(),
                          b_tmp[:,idxs].squeeze())

            # Generate targets
            spap_dist = BDR_dist(*theta_spap, fp=True)
            samples = spap_dist.sample(self.N_train, avg=True)
            targets = r + (1-done) * self.gamma * samples

        batch_sa = t.cat((s, one_hot(a, self.act_shape)), 1)
        batch_sa = batch_sa.expand(self.S_train, -1, -1)
        theta_sa, logpq, _ = bf.propagate(self.net, batch_sa)
        ll = BDR_dist(*theta_sa).log_prob(targets).sum(-1).mean(-1)

        elbo = ll + logpq/(self.buffer.filled_buffer)
        self.opt.zero_grad()
        (-elbo.mean()).backward()
        self.opt.step()
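
The idxs arithmetic above exploits the row layout produced by repeat_interleave: the (sp, a) pair for state b and action a sits at flat row b*act_shape + a, so row b*act_shape + ap[b] is each state's greedy pair. A small self-contained sketch of that flat-index trick (names are illustrative):

import torch as t

B, A = 4, 3
means = t.randn(B, A)                  # stand-in per-action expected values
ap = means.argmax(-1, keepdim=True)    # greedy action per state, shape [B, 1]

idxs = t.arange(B) * A + ap.flatten()  # flat row index into the [B*A, ...] batch
flat_means = means.reshape(-1)         # element (b, a) sits at index b*A + a
assert t.equal(flat_means[idxs], means.max(-1).values)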
Example #8
def sample_test(net, x_loc, true_ys):
    with t.no_grad():
        num_pts = 300
        samples = 100

        xs = t.tensor([x_loc]).unsqueeze(1).to(dtype=dtype, device=device)

        ys = t.linspace(-2, 5, num_pts).to(dtype=dtype, device=device)

        theta = net.f(xs)
        dist = BDR_dist(*theta, fp=True)

        ss = dist.log_prob(ys).exp().detach().cpu()
        ss_mean = ss.mean(0)
        ss_std = ss.std(0)

        mean = dist.mean().detach().cpu()
        print("mean shape: ", mean.shape)

        fig, (ax1) = plt.subplots(1, 1, figsize=(10, 6))

        (f, alpha, beta) = theta
        f_avg = f.mean(0)

        modes = dist.mode_at(0, avg=True).detach().cpu()
        ax1.vlines(modes, 0, 1, color='r', linewidths=2)

        modes = dist.modes()[0].detach().cpu()
        ax1.vlines(modes, 0, 1, color='r', linewidths=0.5, alpha=0.2)

        ax1.vlines(mean, 0, 1, color='b', linewidths=2)

        samples = dist.sample(1000, avg=True).detach().cpu()
        ax1.hist(samples, bins=100, density=True, color='grey', alpha=0.2)

        true_ys = true_ys.detach().cpu()
        ax1.scatter(true_ys, t.zeros_like(true_ys))

        ax1.plot(ys.detach().cpu(), ss_mean[0])
        ax1.fill_between(ys.detach().cpu(), (ss_mean + ss_std)[0],
                         (ss_mean - ss_std)[0],
                         alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Density")
        ax1.set_xlabel("Y value")
        plt.show()
Example #9
    def train_step(self):
        # Training step for distributional agent

        # Sample some transitions from the buffer
        s, a, sp, r, done = self.buffer.random_batch(self.train_batch)

        # Find the optimal next actions, and the return distributions for
        # (sp, ap)
        with t.no_grad():
            s = s.to(dtype=self.dtype, device=self.device)

            # Get all the possible state-action pairs
            ap_h = t.arange(self.act_shape,
                            dtype=self.dtype,
                            device=self.device)
            ap_h = ap_h.repeat(sp.shape[0]).unsqueeze(1)
            sp = sp.repeat_interleave(self.act_shape, 0)
            batch_spap_h = t.cat((sp, one_hot(ap_h, self.act_shape)), 1)
            assert batch_spap_h.shape[-1] == self.obs_shape[0] + self.act_shape
            theta = self.net.f(batch_spap_h)
            sp_dist = BDR_dist(*theta, fp=True)

            # Select the action with the max expected value
            means = sp_dist.mean().reshape(-1, self.act_shape)
            ap = means.argmax(-1, keepdim=True)

            # Locate the parameters of the corresponding (sp,ap) distributions
            # so that we can sample from these
            f_tmp, a_tmp, b_tmp = theta
            idxs = (t.arange(ap.size(0)) * self.act_shape) + ap.flatten()
            theta_spap = (f_tmp[:, idxs].squeeze(), a_tmp[:, idxs].squeeze(),
                          b_tmp[:, idxs].squeeze())

            # Sanity check
            assert s.shape == (self.train_batch, self.obs_shape[0])
            assert a.shape == (self.train_batch, 1)
            assert a.shape == ap.shape

            # Generate targets
            spap_dist = BDR_dist(*theta_spap, fp=True)
            samples = spap_dist.sample(self.N_train, avg=True)
            targets = r + (1 - done) * self.gamma * samples

        # Evaluate the likelihood of the target points for each of the (s,a)
        # distributions in the original batch.
        batch_sa = t.cat((s, one_hot(a, self.act_shape)), 1)
        self.net.train(batch_sa, targets, epochs=2, batch_size=32)