def fit(self, X, U):
    """
    Fit time-varying linear-Gaussian dynamics to sampled trajectories.

    For every time step t < T - 1 the joint samples
    [x_t, u_t, x_{t+1}] are passed to gauss_fit_joint_prior together
    with the prior returned by self.prior.eval, and the result is
    stored at index t. The entries of the last time step stay zero.

    Args:
        X: State samples, indexed as (N, T, dX).
        U: Action samples, indexed as (N, T, dU).
    Returns:
        The tuple (self.Fm, self.fv, self.dyn_covar) with shapes
        (T, dX, dX+dU), (T, dX) and (T, dX, dX).
    Raises:
        ValueError: If only a single sample (N == 1) is supplied.
    """
    num_samples, horizon, dX = X.shape
    dU = U.shape[2]

    if num_samples == 1:
        raise ValueError("Cannot fit dynamics on 1 sample")

    dXU = dX + dU
    d_joint = dXU + dX

    self.Fm = np.zeros([horizon, dX, dXU])
    self.fv = np.zeros([horizon, dX])
    self.dyn_covar = np.zeros([horizon, dX, dX])

    # Every sample gets the same weight in the regression.
    weights = np.full(num_samples, 1.0 / num_samples)

    for t in range(horizon - 1):
        # One row per sample: current state, action, next state.
        joint = np.concatenate(
            [X[:, t, :], U[:, t, :], X[:, t + 1, :]], axis=1
        )

        # Normal-inverse-Wishart prior for this set of points.
        mu0, Phi, mm, n0 = self.prior.eval(dX, dU, joint)

        # Only the (state, action) block is regularized; the
        # hyperparameter is broadcast over that block.
        sig_reg = np.zeros((d_joint, d_joint))
        sig_reg[:dXU, :dXU] = self._hyperparams['regularization']

        Fm_t, fv_t, covar_t = gauss_fit_joint_prior(
            joint, mu0, Phi, mm, n0, weights, dXU, dX, sig_reg
        )
        self.Fm[t] = Fm_t
        self.fv[t] = fv_t
        self.dyn_covar[t] = covar_t

    return self.Fm, self.fv, self.dyn_covar
def _update_policy_fit(self, m, init=False):
    """
    Re-estimate the local policy linearization around the current
    samples of one condition.

    The policy means and covariances evaluated at the sampled
    observations are stored on the condition's pol_info, the policy
    prior is updated, and for every time step a linear-Gaussian fit
    (pol_K, pol_k, pol_S) together with the Cholesky factor of pol_S
    is written into pol_info.

    Args:
        m: Condition
        init: Whether this is the initial fitting of the policy. When
            true the current samples are passed to the policy prior
            update; otherwise an empty SampleList is passed.
    """
    dX, dU, T = self.dX, self.dU, self.T

    # Samples used for the fit.
    sample_list = self.cur[m].sample_list
    num_samples = len(sample_list)
    pol_info = self.cur[m].pol_info
    states = sample_list.get_X()

    obs = sample_list.get_obs().copy()
    pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
    pol_info.pol_mu = pol_mu
    pol_info.pol_sig = pol_sig

    # Update policy prior. Only the initial fit feeds it the current
    # samples.
    prior_samples = sample_list if init else SampleList([])
    pol_info.policy_prior.update(
        prior_samples, self.policy_opt,
        SampleList(pol_info.policy_samples)
    )

    # Average the policy covariances over samples. Not strictly
    # correct, but fine as long as the policy covariance does not
    # depend on state.
    mean_sig = np.mean(pol_sig, axis=0)

    dXU = dX + dU
    for t in range(T):
        # Uniform weights and the joint (state, policy mean) data.
        weights = np.full(num_samples, 1.0 / num_samples)
        states_t = states[:, t, :]
        means_t = pol_mu[:, t, :]
        joint = np.concatenate((states_t, means_t), axis=1)

        # Normal-inverse-Wishart prior.
        mu0, Phi, mm, n0 = pol_info.policy_prior.eval(states_t, means_t)

        # The state covariance is slightly regularized on the first
        # time step only.
        sig_reg = np.zeros((dXU, dXU))
        if t == 0:
            sig_reg[:dX, :dX] = 1e-8 * np.eye(dX)

        K_t, k_t, S_t = gauss_fit_joint_prior(
            joint, mu0, Phi, mm, n0, weights, dX, dU, sig_reg
        )
        S_t += mean_sig[t, :, :]

        pol_info.pol_K[t, :, :] = K_t
        pol_info.pol_k[t, :] = k_t
        # Factor before storing so a failed factorization leaves both
        # pol_S and chol_pol_S untouched for this step.
        chol_t = sp.linalg.cholesky(S_t)
        pol_info.pol_S[t, :, :] = S_t
        pol_info.chol_pol_S[t, :, :] = chol_t
def fit(self, X, pol_mu, pol_sig):
    """
    Fit policy linearization.

    For each time step the joint samples [x_t, pol_mu_t] are passed to
    gauss_fit_joint_prior together with the prior from self.eval. The
    diagonal of the state block is regularized with
    self._init_sig_reg on the first time step and with
    self._subsequent_sig_reg on all later ones.

    Args:
        X: Samples (N, T, dX)
        pol_mu: Policy means (N, T, dU)
        pol_sig: Policy covariances. Their mean over the sample axis
            is added to pol_S of shape (T, dU, dU), so (N, T, dU, dU)
            is expected -- TODO confirm against callers.
    Returns:
        pol_K (T, dU, dX), pol_k (T, dU) and pol_S (T, dU, dU).
    Raises:
        ValueError: If only a single sample (N == 1) is supplied.
    """
    N, T, dX = X.shape
    dU = pol_mu.shape[2]
    if N == 1:
        raise ValueError("Cannot fit policy linearization on 1 sample")

    # Collapse policy covariances. (This is only correct because
    # the policy covariance doesn't depend on state).
    pol_sig = np.mean(pol_sig, axis=0)

    # Allocate.
    pol_K = np.zeros([T, dU, dX])
    pol_k = np.zeros([T, dU])
    pol_S = np.zeros([T, dU, dU])

    # Fit policy linearization with least squares regression.
    dwts = (1.0 / N) * np.ones(N)
    for t in range(T):
        Ts = X[:, t, :]
        Ps = pol_mu[:, t, :]
        Ys = np.concatenate([Ts, Ps], axis=1)

        # Obtain Normal-inverse-Wishart prior.
        mu0, Phi, mm, n0 = self.eval(Ts, Ps)

        # Regularize only the diagonal of the state block; the first
        # time step has its own regularization value.
        sig_reg = np.zeros((dX + dU, dX + dU))
        diag_reg = (
            self._init_sig_reg if t == 0 else self._subsequent_sig_reg
        )
        np.fill_diagonal(sig_reg[:dX, :dX], diag_reg)

        pol_K[t, :, :], pol_k[t, :], pol_S[t, :, :] = \
            gauss_fit_joint_prior(Ys, mu0, Phi, mm, n0,
                                  dwts, dX, dU, sig_reg)

    # Add the sample-averaged policy covariance to every time step.
    pol_S += pol_sig
    return pol_K, pol_k, pol_S
def fit(self, X, pol_mu, pol_sig):
    """
    Fit policy linearization.

    For each time step the joint samples [x_t, pol_mu_t] are passed to
    gauss_fit_joint_prior together with the prior from self.eval. On
    the first time step the state covariance gets a small (1e-8)
    diagonal regularization.

    Args:
        X: Samples (N, T, dX)
        pol_mu: Policy means (N, T, dU)
        pol_sig: Policy covariances. Their mean over the sample axis
            is added to pol_S of shape (T, dU, dU), so (N, T, dU, dU)
            is expected -- TODO confirm against callers.
    Returns:
        pol_K (T, dU, dX), pol_k (T, dU) and pol_S (T, dU, dU).
    Raises:
        ValueError: If only a single sample (N == 1) is supplied.
    """
    N, T, dX = X.shape
    dU = pol_mu.shape[2]
    if N == 1:
        raise ValueError("Cannot fit policy linearization on 1 sample")

    # Collapse policy covariances. (This is only correct because
    # the policy covariance doesn't depend on state).
    pol_sig = np.mean(pol_sig, axis=0)

    # Allocate.
    pol_K = np.zeros([T, dU, dX])
    pol_k = np.zeros([T, dU])
    pol_S = np.zeros([T, dU, dU])

    # Fit policy linearization with least squares regression.
    dwts = (1.0 / N) * np.ones(N)
    for t in range(T):
        Ts = X[:, t, :]
        Ps = pol_mu[:, t, :]
        Ys = np.concatenate([Ts, Ps], axis=1)

        # Obtain Normal-inverse-Wishart prior.
        mu0, Phi, mm, n0 = self.eval(Ts, Ps)

        sig_reg = np.zeros((dX + dU, dX + dU))
        # Slightly regularize on first timestep. The term must sit on
        # the diagonal only: filling the whole block with a constant
        # yields a rank-one matrix rather than a ridge.
        if t == 0:
            sig_reg[:dX, :dX] = 1e-8 * np.eye(dX)

        pol_K[t, :, :], pol_k[t, :], pol_S[t, :, :] = \
            gauss_fit_joint_prior(Ys, mu0, Phi, mm, n0,
                                  dwts, dX, dU, sig_reg)

    # Add the sample-averaged policy covariance to every time step.
    pol_S += pol_sig
    return pol_K, pol_k, pol_S
def fit_delta(self, X, U):
    """
    Fit linear-Gaussian dynamics by regressing on state differences.

    Instead of predicting x_{t+1} directly, each time step fits the
    difference x_{t+1} - x_t from [x_t, u_t] and then adds the
    identity to the state part of the fitted matrix, so the stored
    self.Fm still maps [x_t, u_t] to x_{t+1}. The entries of the last
    time step stay zero.

    Args:
        X: State samples, indexed as (N, T, dX).
        U: Action samples, indexed as (N, T, dU).
    Returns:
        The tuple (self.Fm, self.fv, self.dyn_covar) with shapes
        (T, dX, dX+dU), (T, dX) and (T, dX, dX).
    Raises:
        ValueError: If only a single sample (N == 1) is supplied.
    """
    N, T, dX = X.shape
    dU = U.shape[2]

    if N == 1:
        raise ValueError("Cannot fit dynamics on 1 sample")

    # X_delta[:, t] = X[:, t] - X[:, t - 1]; index 0 has no
    # predecessor and stays zero.
    X_delta = np.zeros((N, T, dX))
    X_delta[:, 1:T, :] = X[:, 1:T, :] - X[:, 0:T - 1, :]

    self.Fm = np.zeros([T, dX, dX + dU])
    self.fv = np.zeros([T, dX])
    self.dyn_covar = np.zeros([T, dX, dX])

    # Identity on the state columns, zeros on the action columns:
    # turns the fitted delta model back into a next-state model.
    Fm_delta = np.eye(dX, dX + dU)

    it = slice(dX + dU)

    # Fit dynamics with least squares regression.
    dwts = (1.0 / N) * np.ones(N)
    for t in range(T - 1):
        Ys = np.c_[X[:, t, :], U[:, t, :], X_delta[:, t + 1, :]]

        # Obtain Normal-inverse-Wishart prior.
        mu0, Phi, mm, n0 = self.prior.eval(dX, dU, Ys)

        # Only the (state, action) block is regularized; the
        # hyperparameter is broadcast over that block.
        sig_reg = np.zeros((dX + dU + dX, dX + dU + dX))
        sig_reg[it, it] = self._hyperparams['regularization']

        Fm, fv, dyn_covar = gauss_fit_joint_prior(
            Ys, mu0, Phi, mm, n0, dwts, dX + dU, dX, sig_reg
        )
        self.Fm[t, :, :] = Fm + Fm_delta
        self.fv[t, :] = fv
        self.dyn_covar[t, :, :] = dyn_covar

    return self.Fm, self.fv, self.dyn_covar