def create_option(self, subgoal):
    """Create a new option for the given subgoal, with a pseudo-reward
    function and a value-iteration policy."""
    # Options are keyed by consecutive integer ids, continuing after the
    # primitive-action options created in __init__.
    option_id = len(self.options)
    vi = ValueIteration(option_id, [self.fa.evaluate(subgoal.state)], self,
                        self.plan_iterations, alpha=self.alpha, gamma=self.gamma)
    policy = VIPolicy(self.num_actions, vi)
    self.options[option_id] = Option(option_id, self.fa, policy, self.eta,
                                     self.gamma, subgoal, self.num_actions)
    # One intrinsic-reward weight vector per option.
    self.intrinsic.append(np.zeros((self.fa.num_features, 1)))
def __init__(self, policy, fa, num_actions, alpha, gamma, eta, zeta, epsilon,
             plan_iter, sim_samples, sim_steps, retain_theta=True,
             subgoals=None, samples=None):
    self.policy = policy
    self.fa = fa
    self.num_actions = num_actions
    self.alpha = alpha
    self.gamma = gamma
    self.eta = eta
    self.zeta = zeta
    self.epsilon = epsilon
    self.plan_iterations = plan_iter
    self.sim_samples = sim_samples
    self.sim_steps = sim_steps
    # Avoid shared mutable default arguments: each agent gets its own lists.
    self.subgoals = subgoals if subgoals is not None else []
    self.samples = samples if samples is not None else []
    self.reached_subgoals = []
    self.extrinsic = None
    # One independent intrinsic-reward weight vector per primitive action
    # (a list comprehension avoids aliasing the same array num_actions times).
    self.intrinsic = [np.ones((self.fa.num_features, 1)) for _ in range(num_actions)]
    # Wrap each primitive action in an option with a fixed policy.
    self.options = {
        i: Option(i, fa, FixedPolicy(num_actions, i), eta, gamma, None, num_actions)
        for i in range(num_actions)
    }
    self.vi = ValueIteration(-1, self.intrinsic, self, plan_iter,
                             retain_theta=retain_theta, use_options=False,
                             alpha=alpha, gamma=gamma)
    self.vi_policy = VIPolicy(num_actions, self.vi)
    self.option_stack = []
    self.step = 0
    self.viz = None