Example #1
    def run(self, num_epochs=1, num_episodes=1):
        for ei in range(num_epochs):
            # evaluate current performance on every task and pick the weakest one.
            task_performance = np.zeros(self.num_tasks)
            for ti in range(self.num_tasks):
                task_performance[ti] = expected_reward_tabular_normalized(self.dqn, self.tasks[ti], tol=1e-4)
            ti = np.argmin(task_performance)
            task = self.tasks[ti]
            self.last_task_ti = ti

            # TODO: this breaks the learner's abstraction by setting its task directly.
            self.deepQlearn.task = task
            self.dqn.task = task
            # run training.
            self.deepQlearn.run(num_episodes, task)
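
Example #1 above trains next on whichever task the agent currently performs worst at. A minimal standalone sketch of that selection rule, assuming a stand-in evaluate callable in place of expected_reward_tabular_normalized and made-up task names:

import numpy as np

def pick_weakest_task(tasks, evaluate):
    # evaluate(task) stands in for expected_reward_tabular_normalized(dqn, task).
    performance = np.array([evaluate(task) for task in tasks])
    return int(np.argmin(performance)), performance

# toy usage: "push" has the lowest score, so it would be trained next.
scores = {"lift": 0.8, "push": 0.2, "stack": 0.5}
ti, perf = pick_weakest_task(list(scores), scores.get)
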
Example #2
    def run(self, num_epochs=1, num_episodes=1):
        # squared-exponential covariance over task distance and elapsed time (GP-t kernel).
        cov_func = lambda task1, task2, t1, t2: self.gpt_v * np.exp(- (self.dist(task1, task2) ** 2 * self.gpt_r + self.gpt_eta * (t1 - t2) ** 2))
        for ei in range(num_epochs):
            # task selection; complexity is roughly max(#tasks * history, history ** 2.3) per epoch.
            if len(self.examples) == 0: # no prior experience, choose randomly.
                task = prob.choice(self.tasks, 1)[0]
            else:
                # GP-t.
                mu = np.zeros(self.num_tasks)
                sigma = np.zeros(self.num_tasks)
                ucb = np.zeros(self.num_tasks)
                # solve (K + sigma^2 I) x = y directly rather than forming the inverse.
                A = self.K + np.eye(self.t) * self.gpt_sigma ** 2
                Kinv_y = npla.solve(A, self.y)
                for ti, task in enumerate(self.tasks):
                    vec = np.zeros(self.t)
                    for i in range(self.t):
                        (t_i, task_i, _) = self.examples[i]
                        vec[i] = cov_func(task, task_i, self.t, t_i)
                    mu[ti] = np.dot(vec, Kinv_y)
                    Kinv_vec = npla.solve(A, vec)
                    # posterior variance of predicted progress, and the resulting UCB score.
                    sigma[ti] = self.gpt_v + self.gpt_sigma ** 2 - np.dot(vec, Kinv_vec)
                    ucb[ti] = mu[ti] + self.gpt_kappa * sigma[ti]
                # pick the task with the highest upper confidence bound on progress.
                best_ti = np.argmax(ucb)
                task = self.tasks[best_ti]
                # store information for diagnosis.
                self.mu = mu
                self.sigma = sigma
                self.ucb = ucb

            # run training.
            self._run_task(task, num_episodes=num_episodes)

            # evaluate performance.
            self.last_task_performance = np.zeros(self.num_tasks)
            for ti in range(self.num_tasks):
                self.last_task_performance[ti] = expected_reward_tabular_normalized(self.dqn, self.tasks[ti], tol=1e-4)
            performance = np.mean(self.last_task_performance)
            progress = performance - self.last_performance
            # update statistics.
            self.examples.append((self.t, task, progress))
            self.t += 1
            t = self.t

            # grow the covariance matrix and progress vector with the newest example.
            new_K = np.zeros((t, t))
            new_y = np.zeros(t)
            if t > 1:
                new_K[:t - 1, :t - 1] = self.K
                new_y[:t - 1] = self.y
            new_K[t - 1, t - 1] = self.gpt_v
            new_y[t - 1] = progress
            for i in range(t - 1):
                (t_i, task_i, _) = self.examples[i]
                new_K[t - 1, i] = cov_func(task_i, task, t_i, t - 1)
                new_K[i, t - 1] = new_K[t - 1, i] # symmetric.
            self.K = new_K
            self.y = new_y
            self.last_performance = performance
            self.last_progress = progress
            self.last_task = task
            self.last_task_ti = self.tasks.index(task)
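
To make the GP-t selection step in Example #2 concrete, below is a self-contained sketch of the posterior mean/variance and UCB computation with the same kernel form; the hyperparameter values, scalar task descriptors, and toy distance function are made up for illustration and are not part of the original class:

import numpy as np
import numpy.linalg as npla

# assumed hyperparameters and a toy 1-D task "distance" (illustrative only).
gpt_v, gpt_r, gpt_eta, gpt_sigma, gpt_kappa = 1.0, 1.0, 0.1, 0.1, 2.0
dist = lambda a, b: abs(a - b)
cov = lambda ta, tb, t1, t2: gpt_v * np.exp(-(dist(ta, tb) ** 2 * gpt_r + gpt_eta * (t1 - t2) ** 2))

tasks = [0.0, 0.5, 1.0]                      # toy task descriptors
examples = [(0, 0.0, 0.3), (1, 0.5, 0.1)]    # (time, task, observed progress)
t = len(examples)

# covariance matrix over observed (task, time) pairs and the observed progress vector.
K = np.array([[cov(a, b, ta, tb) for (tb, b, _) in examples] for (ta, a, _) in examples])
y = np.array([p for (_, _, p) in examples])
A = K + np.eye(t) * gpt_sigma ** 2

ucb = np.zeros(len(tasks))
for ti, task in enumerate(tasks):
    vec = np.array([cov(task, b, t, tb) for (tb, b, _) in examples])
    mu = vec @ npla.solve(A, y)                               # posterior mean of progress
    var = gpt_v + gpt_sigma ** 2 - vec @ npla.solve(A, vec)   # posterior variance
    ucb[ti] = mu + gpt_kappa * var
print(tasks[int(np.argmax(ucb))])  # task chosen for the next training epoch
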