def get_action(self, s):
    """Return the greedy action for a state, or for each state in a batch.

    Every state is paired with each of the ``self.act_shape`` action
    indices, the pairs are handed to ``self._Qdist`` and the action whose
    resulting ``BDR_dist`` has the largest mean is selected.

    Args:
        s: A single state with ``len(s.shape) == 1`` or a batch with
            ``len(s.shape) == 2``; the last dimension must equal
            ``self.obs_shape[0]``. Anything that is not already a tensor
            is converted with ``t.tensor``.

    Returns:
        For a single state, the selected action index as a Python scalar
        (``.item()``). For a batch, the tensor of argmax indices with the
        reduced dimension kept (``keepdim=True``).

    Raises:
        AssertionError: If the last dimension of ``s`` does not match
            ``self.obs_shape[0]`` or ``s`` has more than two dimensions.
    """
    with t.no_grad():
        if not t.is_tensor(s):
            s = t.tensor(s)
        s = s.to(dtype=self.dtype)

        assert s.shape[-1] == self.obs_shape[0]
        assert len(s.shape) <= 2

        # A lone state is temporarily promoted to a batch of one.
        single_state = len(s.shape) == 1
        if single_state:
            s = s.unsqueeze(0)

        # Enumerate all (state, action) pairs: each state is repeated
        # act_shape times and aligned with the indices 0..act_shape-1.
        s_a = s.repeat_interleave(self.act_shape, 0)
        # Build the action indices on the same device as the states; a
        # default-device arange cannot be combined with states that live
        # on another device.
        a_s = t.arange(self.act_shape, dtype=self.dtype, device=s.device)
        a_s = a_s.repeat(s.shape[0]).unsqueeze(1)

        theta = self._Qdist(s_a, a_s)
        s_dist = BDR_dist(*theta, fp=True)

        # Select the action with the max expected value: one row of
        # act_shape means per state, argmax along that row.
        best = s_dist.mean().reshape(-1, self.act_shape).argmax(-1, keepdim=True)

        if single_state:
            return best.item()
        return best
def plot_icdf(net, x_loc):
    """Plot the inverse CDF predicted by ``net`` at ``x_loc``.

    The input ``[x_loc, x_loc]`` is expanded along a new leading dimension
    of size 70, propagated through ``net`` with ``bf.propagate``, and the
    resulting ``BDR_dist`` is queried with ``icdf`` at 100 probability
    levels in ``[0.005, 0.995]``. The mean of the result is drawn as a line
    with a +/- one standard deviation band, and the figure is shown.

    Relies on the module-level ``dtype``, ``device``, ``bf``, ``BDR_dist``
    and ``plt``.

    Args:
        net: Network passed to ``bf.propagate``.
        x_loc: Input location; also used in the plot title.
    """
    with t.no_grad():
        num_pts = 100  # probability levels on the horizontal axis
        # Size of the leading dimension the input is expanded to;
        # presumably the number of posterior samples -- TODO confirm.
        samples = 70

        xs = t.tensor([x_loc, x_loc]).unsqueeze(1)
        xs = xs.expand(samples, -1, -1).to(dtype=dtype, device=device)

        theta, _, _ = bf.propagate(net, xs)
        dist = BDR_dist(*theta, fp=True)

        p_vals = t.linspace(0.005, 0.995, num_pts).to(dtype=dtype, device=device)
        icdfs = dist.icdf(p_vals, avg=True).detach().cpu()[0]
        ic_mean = icdfs.mean(0).squeeze()
        ic_std = icdfs.std(0).squeeze()

        # Matplotlib can only consume host tensors, so the probability grid
        # must leave `device` before plotting, like the ICDF values above.
        p_cpu = p_vals.detach().cpu()

        _, ax1 = plt.subplots(1, 1, figsize=(10, 6))
        ax1.plot(p_cpu, ic_mean)
        ax1.fill_between(p_cpu, ic_mean + ic_std, ic_mean - ic_std, alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Return")
        ax1.set_xlabel("P")
        plt.show()
def plot_cdf(net, x_loc):
    """Plot the CDF predicted by ``net`` at the single input ``x_loc``.

    The input is expanded along a new leading dimension of size 50,
    propagated with ``bf.propagate``, and the resulting ``BDR_dist`` is
    queried with ``cdf`` on a 100-point grid over ``[-2, 5]``. The mean is
    drawn as a line with a +/- one standard deviation band, and the figure
    is shown.

    Relies on the module-level ``dtype``, ``device``, ``bf``, ``BDR_dist``
    and ``plt``.

    Args:
        net: Network passed to ``bf.propagate``.
        x_loc: Input location; also used in the plot title.
    """
    with t.no_grad():
        n_grid = 100   # number of points on the graph
        n_draws = 50   # samples from the posterior

        inputs = t.tensor([x_loc]).unsqueeze(1).expand(n_draws, -1, -1)
        inputs = inputs.to(dtype=dtype, device=device)

        params, _, _ = bf.propagate(net, inputs)
        predictive = BDR_dist(*params)

        # The [-2, 5] evaluation range is arbitrary; it seems to work well
        # with the given data.
        ys = t.linspace(-2, 5, n_grid).to(dtype=dtype, device=device)
        cdf_vals = predictive.cdf(ys, avg=True).detach().cpu()
        centre = cdf_vals.mean(0).squeeze()
        spread = cdf_vals.std(0).squeeze()

        # Host copy of the grid for matplotlib.
        grid = ys.detach().cpu()

        _, axis = plt.subplots(1, 1, figsize=(10, 6))
        axis.plot(grid, centre)
        axis.fill_between(grid, centre + spread, centre - spread, alpha=0.2)
        axis.set_title(f"Distribution at x={x_loc}")
        axis.set_ylabel("Density")
        axis.set_xlabel("Y value")
        plt.show()
def train_net(X, y):
    """Train a ``BDR`` network on ``(X, y)`` and return it.

    Runs 100 passes over ``batches`` consecutive mini-batches of
    ``train_batch`` rows each, taking one Adam step (``lr=0.05``) per
    mini-batch on the negated mean of ``ll + logpq / data_size``.

    Relies on the module-level ``batches``, ``train_batch``, ``data_size``,
    ``dtype``, ``device``, ``BDR``, ``BDR_dist``, ``bf`` and ``tqdm``.

    Args:
        X: Input tensor; ``X.shape[1]`` sets ``in_features`` and
            ``X.dtype`` is passed to the ``BDR`` constructor.
        y: Target tensor, sliced with the same row ranges as ``X``.

    Returns:
        The trained network.
    """
    n_samples = 20

    model = BDR(
        in_features=X.shape[1],
        inducing_batch=40,
        N=7,
        layer_sizes=(40,),
        dtype=X.dtype,
        f_postproc='sort',
    )
    model.to(dtype=dtype, device=device)
    optimiser = t.optim.Adam(model.parameters(), lr=0.05)

    for _ in tqdm(range(100)):
        for batch_idx in range(batches):
            rows = slice(batch_idx * train_batch, (batch_idx + 1) * train_batch)

            # Only the inputs are expanded along the sample dimension;
            # log_prob broadcasts the targets across the samples itself.
            inputs = X[rows].expand(n_samples, -1, -1)
            targets = y[rows]

            optimiser.zero_grad()
            params, logpq, _ = bf.propagate(model, inputs)
            log_lik = BDR_dist(*params).log_prob(targets).sum(-1).mean(-1)

            # One objective value per sample.
            assert log_lik.shape == (n_samples,)
            assert logpq.shape == (n_samples,)

            loss = -(log_lik + logpq / data_size).mean()
            loss.backward()
            optimiser.step()

    return model
def sample_test(net, x_loc, true_ys):
    """Plot the density predicted by ``net`` at ``x_loc`` against ``true_ys``.

    The single input location is expanded to 100 copies, propagated with
    ``bf.propagate`` and wrapped in a ``BDR_dist``. The figure shows the
    mean density with a +/- one standard deviation band on a 100-point grid
    over ``[-5, 5]``, the modes (red), the distribution mean (blue), a
    histogram of 1000 draws from the distribution, and the observed
    ``true_ys`` scattered along the x axis. The shape of the mean is also
    printed.

    Relies on the module-level ``dtype``, ``device``, ``bf``, ``BDR_dist``
    and ``plt``.

    Args:
        net: Network passed to ``bf.propagate``.
        x_loc: Input location; also used in the plot title.
        true_ys: Tensor of observed values to scatter on the plot.
    """
    with t.no_grad():
        num_pts = 100
        samples = 100

        # Only one x location is evaluated; think of it as a single
        # (state, action) pair.
        xs = t.tensor([x_loc]).unsqueeze(1)
        # Expand so that <samples> different parameter values are predicted
        # for each x location in the batch.
        xs = xs.expand(samples, -1, -1).to(dtype=dtype, device=device)

        # The density at the single x location is evaluated num_pts times
        # on a fixed [-5, 5] grid (not on the range spanned by true_ys).
        ys = t.linspace(-5, 5, num_pts).to(dtype=dtype, device=device)

        # Generate the <samples> parameter predictions at this single x
        # (or (s, a)) location.
        theta, _, _ = bf.propagate(net, xs)
        dist = BDR_dist(*theta, fp=True)

        # Density at each plotting point; ss has shape
        # [samples, batch, num_pts].
        ss = dist.log_prob(ys).exp().detach().cpu()
        ss_mean = ss.mean(0)
        ss_std = ss.std(0)

        # Everything handed to matplotlib below is moved to the host first;
        # tensors left on a non-CPU `device` cannot be plotted.
        mean = dist.mean().detach().cpu()
        print("mean shape: ", mean.shape)

        ys_cpu = ys.detach().cpu()
        true_ys_cpu = true_ys.detach().cpu()

        _, ax1 = plt.subplots(1, 1, figsize=(10, 6))

        # dist is a batch of distributions with only one batch entry.
        avg_modes = dist.mode_at(0, avg=True).detach().cpu()
        ax1.vlines(avg_modes, 0, 1, color='r', linewidths=2)

        # Per-sample modes, drawn faintly.
        sample_modes = dist.modes()[0].detach().cpu()
        ax1.vlines(sample_modes, 0, 1, color='r', linewidths=0.5, alpha=0.2)

        ax1.vlines(mean, 0, 1, color='b', linewidths=2)

        draws = dist.sample(1000).detach().cpu()
        ax1.hist(draws, bins=100, density=True, color='grey', alpha=0.2)

        ax1.scatter(true_ys_cpu, t.zeros_like(true_ys_cpu))

        ax1.plot(ys_cpu, ss_mean[0])
        ax1.fill_between(ys_cpu, (ss_mean + ss_std)[0], (ss_mean - ss_std)[0], alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Density")
        ax1.set_xlabel("Y value")
        plt.show()
def get_return_dists(self, s):
    """Return the return distributions of every action in state ``s``.

    The state is repeated once per action, paired with the action indices
    ``0..self.act_shape-1`` and passed to ``self._Qdist`` together with
    ``self.S_train``.

    Args:
        s: A single state (safe to assume a numpy array); it is converted
            to a tensor with ``self.dtype`` on ``self.device``.

    Returns:
        ``BDR_dist`` built from the first element returned by
        ``self._Qdist``, constructed with ``fp=True``.
    """
    s = t.tensor(s, dtype=self.dtype, device=self.device).unsqueeze(0)
    # One copy of the state per action, aligned with the action indices.
    s = s.repeat_interleave(self.act_shape, 0)
    a_s = t.arange(self.act_shape, dtype=self.dtype, device=self.device).unsqueeze(1)

    # The states and action indices are passed to _Qdist separately; no
    # concatenated (state, one-hot action) batch is needed here.
    theta, _, _ = self._Qdist(s, a_s, self.S_train)
    return BDR_dist(*theta, fp=True)
def train_step(self):
    """Run one training step of the BDRL agent.

    Samples ``self.train_batch`` transitions from ``self.buffer``, picks
    for every next state the action with the largest expected return,
    draws ``self.N_train`` samples from the return distribution of that
    (next state, action) pair to form the targets
    ``r + (1 - done) * self.gamma * samples``, and then takes one optimiser
    step on the negated mean of ``ll + logpq / self.buffer.filled_buffer``.

    Target construction runs under ``t.no_grad()``; only the likelihood of
    the targets under the (s, a) distributions is differentiated.
    """
    # Sample transitions from the buffer.
    s, a, sp, r, done = self.buffer.random_batch(self.train_batch)

    # Find the optimal next action and build the targets without gradients.
    with t.no_grad():
        # NOTE(review): only s is moved explicitly; sp, a, r and done are
        # assumed to already be on self.device -- confirm against the buffer.
        s = s.to(dtype=self.dtype, device=self.device)

        # Get all possible subsequent state-action pairs: every next state
        # is repeated act_shape times and aligned with 0..act_shape-1.
        ap_h = t.arange(self.act_shape, dtype=self.dtype, device=self.device)
        ap_h = ap_h.repeat(sp.shape[0]).unsqueeze(1)
        sp = sp.repeat_interleave(self.act_shape, 0)
        batch_spap_h = t.cat((sp, one_hot(ap_h, self.act_shape)), 1)
        batch_spap_h = batch_spap_h.expand(self.S_train, -1, -1)
        assert batch_spap_h.shape[-1] == self.obs_shape[0] + self.act_shape

        theta, _, _ = bf.propagate(self.net, batch_spap_h)
        sp_dist = BDR_dist(*theta, fp=True)

        # Select the action with the max expected value.
        # TODO try other exploration policies
        means = sp_dist.mean().reshape(-1, self.act_shape)
        ap = means.argmax(-1, keepdim=True)

        # Locate the parameters of the corresponding (sp, ap)
        # distribution(s) so that we can sample from these. Rows are laid
        # out state-major, so the pair (state i, action k) is row
        # i * act_shape + k.
        f_tmp, a_tmp, b_tmp = theta
        # The row offsets must live on the same device as ap; adding a CPU
        # arange to a tensor on another device raises an error.
        offsets = t.arange(ap.size(0), device=ap.device) * self.act_shape
        idxs = offsets + ap.flatten()
        theta_spap = (
            f_tmp[:, idxs].squeeze(),
            a_tmp[:, idxs].squeeze(),
            b_tmp[:, idxs].squeeze(),
        )

        # Generate targets.
        spap_dist = BDR_dist(*theta_spap, fp=True)
        samples = spap_dist.sample(self.N_train, avg=True)
        targets = r + (1 - done) * self.gamma * samples

    # Evaluate the likelihood of the targets under the (s, a) distributions
    # of the original batch; this part is differentiated.
    batch_sa = t.cat((s, one_hot(a, self.act_shape)), 1)
    batch_sa = batch_sa.expand(self.S_train, -1, -1)
    theta_sa, logpq, _ = bf.propagate(self.net, batch_sa)

    ll = BDR_dist(*theta_sa).log_prob(targets).sum(-1).mean(-1)
    elbo = ll + logpq / (self.buffer.filled_buffer)

    self.opt.zero_grad()
    (-elbo.mean()).backward()
    self.opt.step()
def sample_test(net, x_loc, true_ys):
    """Plot the density predicted by ``net.f`` at ``x_loc`` against ``true_ys``.

    The parameters returned by ``net.f`` for the single input location are
    wrapped in a ``BDR_dist``. The figure shows the mean density with a
    +/- one standard deviation band on a 300-point grid over ``[-2, 5]``,
    the modes (red), the distribution mean (blue), a histogram of 1000
    draws (``avg=True``) and the observed ``true_ys`` scattered along the
    x axis. The shape of the mean is also printed.

    Relies on the module-level ``dtype``, ``device``, ``BDR_dist`` and
    ``plt``.

    Args:
        net: Object whose ``f`` method maps the input to the distribution
            parameters.
        x_loc: Input location; also used in the plot title.
        true_ys: Tensor of observed values to scatter on the plot.
    """
    with t.no_grad():
        num_pts = 300

        xs = t.tensor([x_loc]).unsqueeze(1).to(dtype=dtype, device=device)
        ys = t.linspace(-2, 5, num_pts).to(dtype=dtype, device=device)

        theta = net.f(xs)
        dist = BDR_dist(*theta, fp=True)

        # Density at each plotting point, reduced over the leading dimension.
        ss = dist.log_prob(ys).exp().detach().cpu()
        ss_mean = ss.mean(0)
        ss_std = ss.std(0)

        mean = dist.mean().detach().cpu()
        print("mean shape: ", mean.shape)

        # Host copies for matplotlib.
        ys_cpu = ys.detach().cpu()
        true_ys = true_ys.detach().cpu()

        _, ax1 = plt.subplots(1, 1, figsize=(10, 6))

        avg_modes = dist.mode_at(0, avg=True).detach().cpu()
        ax1.vlines(avg_modes, 0, 1, color='r', linewidths=2)

        # Individual modes, drawn faintly.
        all_modes = dist.modes()[0].detach().cpu()
        ax1.vlines(all_modes, 0, 1, color='r', linewidths=0.5, alpha=0.2)

        ax1.vlines(mean, 0, 1, color='b', linewidths=2)

        draws = dist.sample(1000, avg=True).detach().cpu()
        ax1.hist(draws, bins=100, density=True, color='grey', alpha=0.2)

        ax1.scatter(true_ys, t.zeros_like(true_ys))

        ax1.plot(ys_cpu, ss_mean[0])
        ax1.fill_between(ys_cpu, (ss_mean + ss_std)[0], (ss_mean - ss_std)[0], alpha=0.2)
        ax1.set_title(f"Distribution at x={x_loc}")
        ax1.set_ylabel("Density")
        ax1.set_xlabel("Y value")
        plt.show()
def train_step(self):
    """Run one training step of the distributional agent.

    Samples ``self.train_batch`` transitions from ``self.buffer``, picks
    for every next state the action with the largest expected return under
    ``self.net.f``, draws ``self.N_train`` samples from the return
    distribution of that (next state, action) pair to form the targets
    ``r + (1 - done) * self.gamma * samples``, and passes the
    (state, one-hot action) batch and the targets to ``self.net.train``.

    Raises:
        AssertionError: If the shapes of the sampled batch do not match
            ``self.train_batch``, ``self.obs_shape`` and ``self.act_shape``.
    """
    # Sample some transitions from the buffer.
    s, a, sp, r, done = self.buffer.random_batch(self.train_batch)

    # Find the optimal next actions and the return distributions for
    # (sp, ap); none of this needs gradients.
    with t.no_grad():
        # NOTE(review): only s is moved explicitly; sp, a, r and done are
        # assumed to already be on self.device -- confirm against the buffer.
        s = s.to(dtype=self.dtype, device=self.device)

        # Get all the possible state-action pairs: every next state is
        # repeated act_shape times and aligned with 0..act_shape-1.
        ap_h = t.arange(self.act_shape, dtype=self.dtype, device=self.device)
        ap_h = ap_h.repeat(sp.shape[0]).unsqueeze(1)
        sp = sp.repeat_interleave(self.act_shape, 0)
        batch_spap_h = t.cat((sp, one_hot(ap_h, self.act_shape)), 1)
        assert batch_spap_h.shape[-1] == self.obs_shape[0] + self.act_shape

        theta = self.net.f(batch_spap_h)
        sp_dist = BDR_dist(*theta, fp=True)

        # Select the action with the max expected value.
        means = sp_dist.mean().reshape(-1, self.act_shape)
        ap = means.argmax(-1, keepdim=True)

        # Locate the parameters of the corresponding (sp, ap) distributions
        # so that we can sample from these. Rows are laid out state-major,
        # so the pair (state i, action k) is row i * act_shape + k.
        f_tmp, a_tmp, b_tmp = theta
        # The row offsets must live on the same device as ap; adding a CPU
        # arange to a tensor on another device raises an error.
        offsets = t.arange(ap.size(0), device=ap.device) * self.act_shape
        idxs = offsets + ap.flatten()
        theta_spap = (
            f_tmp[:, idxs].squeeze(),
            a_tmp[:, idxs].squeeze(),
            b_tmp[:, idxs].squeeze(),
        )

        # Sanity check.
        assert s.shape == (self.train_batch, self.obs_shape[0])
        assert a.shape == (self.train_batch, 1)
        assert a.shape == ap.shape

        # Generate targets.
        spap_dist = BDR_dist(*theta_spap, fp=True)
        samples = spap_dist.sample(self.N_train, avg=True)
        targets = r + (1 - done) * self.gamma * samples

    # Fit the (s, a) distributions of the original batch to the targets.
    # NOTE(review): net.train is called with data and epochs, so it is
    # presumably a project-specific fitting routine rather than
    # torch.nn.Module.train(mode) -- confirm.
    batch_sa = t.cat((s, one_hot(a, self.act_shape)), 1)
    self.net.train(batch_sa, targets, epochs=2, batch_size=32)