def run(self, num_epochs=1, num_episodes=1):
    """Train for ``num_epochs`` epochs, each time on the weakest task.

    Every epoch scores the DQN's normalized expected reward on each
    task, then runs ``num_episodes`` episodes of training on the task
    with the lowest score.
    """
    for _ in range(num_epochs):
        # Evaluate the agent on every task to locate the weakest one.
        scores = np.zeros(self.num_tasks)
        for idx in range(self.num_tasks):
            scores[idx] = expected_reward_tabular_normalized(
                self.dqn, self.tasks[idx], tol=1e-4)
        weakest = np.argmin(scores)
        chosen = self.tasks[weakest]
        self.last_task_ti = weakest
        # (TODO) this breaks away the abstraction.
        self.deepQlearn.task = chosen
        self.dqn.task = chosen
        # Train on the selected task.
        self.deepQlearn.run(num_episodes, chosen)
def run(self, num_epochs=1, num_episodes=1):
    """Run GP-t curriculum training for ``num_epochs`` epochs.

    Each epoch:
      1. selects a task — uniformly at random when there is no history,
         otherwise by maximizing a GP-UCB score over tasks;
      2. trains on the selected task for ``num_episodes`` episodes;
      3. re-evaluates performance on all tasks and records the observed
         progress, growing the GP history (``self.K``, ``self.y``,
         ``self.examples``) incrementally.

    Task-selection complexity per epoch is
    max(#tasks * history, history ** 2.3).
    """
    # Squared-exponential covariance over (task, time) pairs.
    cov_func = lambda task1, task2, t1, t2: self.gpt_v * np.exp(
        -(self.dist(task1, task2) ** 2 * self.gpt_r
          + self.gpt_eta * (t1 - t2) ** 2))

    # NOTE: the inner history loop previously reused ``ei`` and shadowed
    # the epoch counter; the epoch counter is now anonymous and the
    # history index is ``hi``.
    for _ in range(num_epochs):
        # --- task selection ---
        if len(self.examples) == 0:
            # No prior experience: choose uniformly at random.
            task = prob.choice(self.tasks, 1)[0]
        else:
            # Cross-covariance between every candidate task (at the
            # current time self.t) and every past observation.
            vecs = np.zeros((self.num_tasks, self.t))
            for ti, cand in enumerate(self.tasks):
                for hi in range(self.t):
                    (t_hi, task_hi, _) = self.examples[hi]
                    vecs[ti, hi] = cov_func(cand, task_hi, self.t, t_hi)
            # Regularized kernel matrix. Solve once with stacked
            # right-hand sides instead of re-solving per task.
            K_reg = self.K + np.eye(self.t) * self.gpt_sigma ** 2
            Kinv_y = npla.solve(K_reg, self.y)
            Kinv_vecs = npla.solve(K_reg, vecs.T)  # shape (t, num_tasks)
            # GP posterior mean/variance and the UCB acquisition score.
            mu = vecs.dot(Kinv_y)
            sigma = (self.gpt_v + self.gpt_sigma ** 2
                     - np.sum(vecs * Kinv_vecs.T, axis=1))
            ucb = mu + self.gpt_kappa * sigma
            best_ti = np.argmax(ucb)
            task = self.tasks[best_ti]
            # Store information for diagnosis.
            self.mu = mu
            self.sigma = sigma
            self.ucb = ucb

        # --- run training ---
        self._run_task(task, num_episodes=num_episodes)

        # --- evaluate performance on all tasks ---
        self.last_task_performance = np.zeros(self.num_tasks)
        for ti in range(self.num_tasks):
            self.last_task_performance[ti] = \
                expected_reward_tabular_normalized(
                    self.dqn, self.tasks[ti], tol=1e-4)
        performance = np.mean(self.last_task_performance)
        progress = performance - self.last_performance

        # --- update GP statistics ---
        # The new example is stamped with the pre-increment time, so
        # after ``self.t += 1`` its timestamp is ``t - 1``.
        self.examples.append((self.t, task, progress))
        self.t += 1
        t = self.t
        new_K = np.zeros((t, t))
        new_y = np.zeros(t)
        if t > 1:
            # Copy the previous kernel matrix / targets into the
            # top-left corner; only the new row/column is computed.
            new_K[:t - 1, :t - 1] = self.K
            new_y[:t - 1] = self.y
        # Diagonal entry and target for the newest observation must be
        # set unconditionally — at t == 1 they are the whole matrix.
        new_K[t - 1, t - 1] = self.gpt_v
        new_y[t - 1] = progress
        for hi in range(t - 1):
            (t_hi, task_hi, _) = self.examples[hi]
            new_K[t - 1, hi] = cov_func(task_hi, task, t_hi, t - 1)
            new_K[hi, t - 1] = new_K[t - 1, hi]  # symmetric.
        self.K = new_K
        self.y = new_y
        self.last_performance = performance
        self.last_progress = progress
        self.last_task = task
        self.last_task_ti = self.tasks.index(task)