Example #1
    def teach(self, num_timesteps=100):
        # iterate over training timesteps, with a tqdm progress bar
        for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
            # estimate the learning-curve slope of each subtask from its
            # recorded scores; default to 1 until at least two scores exist
            slopes = [
                estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                for timesteps, scores in zip(self.timesteps, self.scores)
            ]
            # turn the slopes into task-sampling probabilities and take a step
            p = self.policy(np.abs(slopes) if self.abs else slopes)
            r, train_done, val_reward, val_done = self.env.step(p)

            # record per-task validation rewards in the module-level buffer
            global VAL_REWARDS
            for i in range(len(val_reward)):
                VAL_REWARDS[i].append(val_reward[i])

            # stop early once the validation criterion is reached
            if val_done:
                return self.env.model.epochs

            # record the score and timestep of every task that was actually
            # sampled this step (unsampled tasks come back as NaN)
            for a, s in enumerate(r):
                if not np.isnan(s):
                    self.scores[a].append(s)
                    self.timesteps[a].append(t)

            if self.writer:
                for i in range(self.env.num_subtasks):
                    add_summary(self.writer, "slopes/task_%d" % (i + 1),
                                slopes[i], self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
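
The `estimate_slope` helper used above (and again in Example #4) is not shown in these examples. A minimal sketch, assuming it simply fits an ordinary least-squares line to the (timestep, score) pairs and returns its slope:

    import numpy as np

    def estimate_slope(timesteps, scores):
        """Least-squares slope of scores over timesteps (assumed helper)."""
        # np.polyfit with degree 1 returns [slope, intercept]
        slope, _ = np.polyfit(timesteps, scores, 1)
        return slope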
Example #2
    def teach(self, num_timesteps=100):
        # iterate over training timesteps, with a tqdm progress bar
        for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
            # estimate a slope for each task from the recorded score deltas
            if len(self.dscores) > 0:
                if isinstance(self.policy, ThompsonPolicy):
                    # Thompson-style: sample one observed delta per task
                    slopes = [
                        np.random.choice(drs)
                        for drs in np.array(self.dscores).T
                    ]
                else:
                    # otherwise use the mean delta per task
                    slopes = np.mean(self.dscores, axis=0)
            else:
                # no deltas recorded yet: treat all tasks equally
                slopes = np.ones(self.env.num_subtasks)
            p = self.policy(np.abs(slopes) if self.abs else slopes)
            r, train_done, val_reward, val_done = self.env.step(p)

            global VAL_REWARDS
            for i in range(len(val_reward)):
                VAL_REWARDS[i].append(val_reward[i])

            if val_done:
                return self.env.model.epochs

            # record the change in score (reward delta) since the previous step
            dr = r - self.prevr
            self.prevr = r
            self.dscores.append(dr)

            if self.writer:
                for i in range(self.env.num_subtasks):
                    add_summary(self.writer, "slopes/task_%d" % (i + 1),
                                slopes[i], self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
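
The policy objects (`ThompsonPolicy` and whatever is passed as `self.policy`) are not defined in these examples either. A minimal sketch of a Boltzmann-style policy, assuming all that `p = self.policy(...)` requires is a callable that maps a vector of slope or Q values to a probability distribution over subtasks:

    import numpy as np

    class BoltzmannPolicy:
        """Hypothetical softmax policy over slope/Q values
        (an assumption, not the original implementation)."""

        def __init__(self, temperature=1.0):
            self.temperature = temperature

        def __call__(self, values):
            v = np.asarray(values, dtype=float) / self.temperature
            v -= v.max()              # subtract max for numerical stability
            e = np.exp(v)
            return e / e.sum()        # sampling probabilities over subtasks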
Example #3
    def teach(self, num_timesteps=100):
        # iterate over training timesteps, with a tqdm progress bar
        for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
            # turn the current Q-value estimates into sampling probabilities
            p = self.policy(np.abs(self.Q) if self.abs else self.Q)
            r, train_done, val_reward, val_done = self.env.step(p)

            global VAL_REWARDS
            for i in range(len(val_reward)):
                VAL_REWARDS[i].append(val_reward[i])

            if val_done:
                return self.env.model.epochs

            # reward signal: change in score since the previous step
            s = r - self.prevr

            # safeguard against not sampling a particular action at all
            # (unsampled tasks return NaN, which would corrupt the update)
            s = np.nan_to_num(s)
            # exponentially weighted moving average of the score deltas
            self.Q += self.lr * (s - self.Q)
            self.prevr = r

            if self.writer:
                for i in range(self.env.num_subtasks):
                    add_summary(self.writer, "Q_values/task_%d" % (i + 1),
                                self.Q[i], self.env.model.epochs)
                    add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i],
                                self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
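
The update `self.Q += self.lr * (s - self.Q)` is an exponentially weighted moving average of the score deltas: `lr` controls how quickly older deltas are forgotten. A small numeric illustration (the delta values are made up):

    import numpy as np

    Q = np.zeros(3)           # one Q value per subtask
    lr = 0.1
    for s in ([0.5, 0.0, -0.2], [0.4, 0.1, -0.1]):
        Q += lr * (np.array(s) - Q)
    print(Q)                  # [ 0.085  0.01  -0.028]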
Example #4
    def teach(self, num_timesteps=100):
        # each outer iteration covers one window of environment steps
        for t in range(num_timesteps // self.window_size):
            p = self.policy(np.abs(self.Q) if self.abs else self.Q)
            # collect scores for each task over one window of steps
            scores = [[] for _ in range(len(self.Q))]
            for _ in range(self.window_size):
                r, train_done, val_reward, val_done = self.env.step(p)

                # record per-task validation rewards in the module-level buffer
                global VAL_REWARDS
                for i in range(len(val_reward)):
                    VAL_REWARDS[i].append(val_reward[i])

                if val_done:
                    return self.env.model.epochs

                # keep only the scores of tasks that were actually sampled
                for a, score in enumerate(r):
                    if not np.isnan(score):
                        scores[a].append(score)
            # estimate the slope of each task's score curve over the window;
            # default to 1 for tasks with fewer than two samples
            s = np.array([
                estimate_slope(list(range(len(sc))), sc) if len(sc) > 1 else 1
                for sc in scores
            ])
            # exponentially weighted moving average of the windowed slopes
            self.Q += self.lr * (s - self.Q)

            if self.writer:
                for i in range(self.env.num_subtasks):
                    add_summary(self.writer, "Q_values/task_%d" % (i + 1),
                                self.Q[i], self.env.model.epochs)
                    add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i],
                                self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
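
The `add_summary` helper used for logging in all four examples is also not shown. A minimal sketch, assuming a TF1-style `FileWriter` and scalar summaries (the actual helper may differ):

    import tensorflow as tf

    def add_summary(writer, tag, value, step):
        """Write one scalar to a TF1-style summary writer (assumed helper)."""
        summary = tf.compat.v1.Summary(
            value=[tf.compat.v1.Summary.Value(tag=tag,
                                              simple_value=float(value))])
        writer.add_summary(summary, step)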