def teach(self, num_timesteps=100):
    global VAL_REWARDS
    for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
        # estimate the learning-curve slope of each task from its score history
        slopes = [estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                  for timesteps, scores in zip(self.timesteps, self.scores)]
        p = self.policy(np.abs(slopes) if self.abs else slopes)
        r, train_done, val_reward, val_done = self.env.step(p)
        for i in range(len(val_reward)):
            VAL_REWARDS[i].append(val_reward[i])
        if val_done:
            return self.env.model.epochs
        # record scores only for tasks that were actually sampled this step
        for a, s in enumerate(r):
            if not np.isnan(s):
                self.scores[a].append(s)
                self.timesteps[a].append(t)
        if self.writer:
            for i in range(self.env.num_subtasks):
                add_summary(self.writer, "slopes/task_%d" % (i + 1), slopes[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
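
# `estimate_slope` is referenced above but not defined in this section. A
# minimal sketch, assuming it fits a least-squares line to the recorded
# (timestep, score) pairs and returns its slope as the learning-progress
# signal:
def estimate_slope(x, y):
    # slope of the least-squares fit y ~ slope * x + intercept
    assert len(x) == len(y)
    A = np.vstack([x, np.ones(len(x))]).T
    slope, _intercept = np.linalg.lstsq(A, y, rcond=None)[0]
    return slope
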
def teach(self, num_timesteps=100):
    global VAL_REWARDS
    for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
        # estimate per-task slopes from the history of score deltas
        if len(self.dscores) > 0:
            if isinstance(self.policy, ThompsonPolicy):
                # Thompson sampling: draw one past delta per task
                slopes = [np.random.choice(drs) for drs in np.array(self.dscores).T]
            else:
                slopes = np.mean(self.dscores, axis=0)
        else:
            slopes = np.ones(self.env.num_subtasks)
        p = self.policy(np.abs(slopes) if self.abs else slopes)
        r, train_done, val_reward, val_done = self.env.step(p)
        for i in range(len(val_reward)):
            VAL_REWARDS[i].append(val_reward[i])
        if val_done:
            return self.env.model.epochs
        # log the change in score since the previous step
        dr = r - self.prevr
        self.prevr = r
        self.dscores.append(dr)
        if self.writer:
            for i in range(self.env.num_subtasks):
                add_summary(self.writer, "slopes/task_%d" % (i + 1), slopes[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
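
# Illustrative only: the Thompson branch above transposes the delta history so
# that each row holds one task's deltas across steps, then draws one random
# delta per task. A toy demo with made-up numbers (not data from the paper):
def _thompson_slope_demo():
    dscores = [np.array([0.10, -0.20, 0.00]),   # step-1 deltas for 3 tasks
               np.array([0.30,  0.10, 0.05]),   # step-2 deltas
               np.array([0.20,  0.00, 0.10])]   # step-3 deltas
    per_task = np.array(dscores).T              # shape (num_tasks, num_steps)
    # one sampled delta per task, acting as that task's "slope" estimate
    return [np.random.choice(drs) for drs in per_task]
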
def teach(self, num_timesteps=100):
    global VAL_REWARDS
    for t in tqdm(range(num_timesteps), ascii=True, unit='timesteps'):
        p = self.policy(np.abs(self.Q) if self.abs else self.Q)
        r, train_done, val_reward, val_done = self.env.step(p)
        for i in range(len(val_reward)):
            VAL_REWARDS[i].append(val_reward[i])
        if val_done:
            return self.env.model.epochs
        # reward signal is the change in score since the previous step
        s = r - self.prevr
        # safeguard against not sampling a particular action at all
        s = np.nan_to_num(s)
        self.Q += self.lr * (s - self.Q)
        self.prevr = r
        if self.writer:
            for i in range(self.env.num_subtasks):
                add_summary(self.writer, "Q_values/task_%d" % (i + 1), self.Q[i], self.env.model.epochs)
                add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
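
# The update `Q += lr * (s - Q)` is an exponentially weighted moving average
# of the per-task score deltas: Q_t = (1 - lr) * Q_{t-1} + lr * s_t, so older
# deltas decay geometrically. A standalone toy run with hypothetical values,
# showing Q converging toward the mean delta of each task:
def _ema_q_demo(lr=0.1, steps=50):
    Q = np.zeros(3)
    s = np.array([1.0, 0.0, 0.5])   # constant per-task score deltas
    for _ in range(steps):
        Q += lr * (s - Q)           # same update rule as in teach() above
    return Q                        # approaches [1.0, 0.0, 0.5]
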
def teach(self, num_timesteps=100):
    global VAL_REWARDS
    for t in range(num_timesteps // self.window_size):
        p = self.policy(np.abs(self.Q) if self.abs else self.Q)
        scores = [[] for _ in range(len(self.Q))]
        # keep the sampling probabilities fixed for a whole window of steps
        for _ in range(self.window_size):
            r, train_done, val_reward, val_done = self.env.step(p)
            for i in range(len(val_reward)):
                VAL_REWARDS[i].append(val_reward[i])
            if val_done:
                return self.env.model.epochs
            for a, score in enumerate(r):
                if not np.isnan(score):
                    scores[a].append(score)
        # fit a slope to each task's scores collected within the window
        s = np.array([estimate_slope(list(range(len(sc))), sc) if len(sc) > 1 else 1
                      for sc in scores])
        self.Q += self.lr * (s - self.Q)
        if self.writer:
            for i in range(self.env.num_subtasks):
                add_summary(self.writer, "Q_values/task_%d" % (i + 1), self.Q[i], self.env.model.epochs)
                add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
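
# `add_summary` is used throughout but not defined in this section. A minimal
# sketch, assuming `writer` is a TF1-style tf.summary.FileWriter (an
# assumption; the actual logging backend may differ). Requires
# `import tensorflow as tf` (or tf.compat.v1 under TF2):
def add_summary(writer, tag, value, step):
    # write a single scalar value under `tag` at the given global step
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    writer.add_summary(summary, step)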