def get_action(self, o_t):
    """Plan from observation ``o_t`` and return one sampled action.

    Resets the planner's working buffers (``self.betas``, ``self.alphas``
    and ``self.plan_param``), runs ``c_gradient_improve_plan`` on them and
    stores the first value it returns in ``self.stoch_plan``.  The plan
    value returned by the planner is printed to stdout.  The action is then
    selected by applying ``sampled_argmax`` to the first entry of the
    stored plan.

    Args:
        o_t: Current observation; it is written into the first row of
            ``self.betas``.

    Returns:
        The element of ``self.discrete_actions`` at the index chosen by
        ``sampled_argmax``.

    Raises:
        RuntimeError: If ``self.model`` is ``None``.
    """
    if self.model is None:
        raise RuntimeError('model was not initialized with the update_model method')

    # Only the last model component is used directly here (the planner gets
    # the whole tuple); the full unpacking is kept so that a model with the
    # wrong number of components still fails at this point.
    _Ua, _Sa, _Va, _wa, w_ter = self.model

    self.betas[0, :] = o_t

    # Fill alphas with draws from random_stream.normal(0, 1e-1, ...), then
    # overwrite the last row with w_ter.
    # NOTE(review): an earlier comment said this initialises to the immediate
    # reward function, but the code uses random draws -- confirm the intent.
    self.alphas[:, :] = self.random_stream.normal(0, 1e-1, self.alphas.shape)
    self.alphas[-1, :] = w_ter

    self.plan_param[:, :] = 0.0
    self.stoch_plan, plan_val, _values, _alphas, _betas = c_gradient_improve_plan(
        self.plan_param, self.alphas, self.betas, self.model, self.H,
        self.discount, learn_rate=self.learn_rate)

    # The parenthesised form prints the same text under Python 2 and is also
    # valid Python 3, unlike the bare print statement.
    print(plan_val)

    choice = sampled_argmax(self.stoch_plan[0], self.random_stream)
    return self.discrete_actions[choice]
def get_action(self, o_t):
    """Plan with the embedded models and return one sampled action.

    Re-initialises ``self.betas[0]``, ``self.alphas`` and
    ``self.stoch_plan`` in place, runs ``lr_improve_plan`` on them and keeps
    the first value it returns as the new ``self.stoch_plan``.  The action
    is selected by applying ``sampled_argmax`` to the first entry of that
    plan.

    Args:
        o_t: Current observation; it is contracted with the ``Va`` component
            of ``self.embedded_models`` (axis 1 of ``Va`` against axis 0 of
            ``o_t``) to produce ``self.betas[0]``.

    Returns:
        The element of ``self.discrete_actions`` at the index chosen by
        ``sampled_argmax``.

    Raises:
        RuntimeError: If ``self.embedded_models`` is ``None``.
    """
    models = self.embedded_models
    if models is None:
        raise RuntimeError('model was not initialized with the update_model method')

    horizon = self.H
    # Components not needed in this method are underscore-prefixed; the full
    # unpacking is kept so the expected seven-component layout is enforced.
    _Kab, Da, wa, _Ua, Va, _imp_a, w_ter = models

    self.betas[0] = np.tensordot(Va, o_t, (1, 0))

    # Every slice of alphas starts as a copy of wa; the last slice also gets
    # w_ter added, and the whole array is then scaled element-wise by Da.
    self.alphas[:, :, :] = wa[np.newaxis, :, :]
    self.alphas[-1, :, :] += w_ter
    self.alphas *= Da[np.newaxis, :, :]

    # Reset the plan so each row holds the same value, 1 / (number of columns).
    n_cols = self.stoch_plan.shape[1]
    self.stoch_plan[:, :] = 1.0 / n_cols

    outcome = lr_improve_plan(
        self.stoch_plan, self.alphas, self.betas, models, horizon,
        self.discount, start_temp=self.start_temp)
    self.stoch_plan, _plan_val, _alphas, _betas, _ = outcome

    actions = self.discrete_actions
    choice = sampled_argmax(self.stoch_plan[0], self.random_stream)
    return actions[choice]
def get_action(self, o_t):
    """Return an action for one observation, or a stack of actions for many.

    When ``o_t`` has more than one dimension, each element along its first
    axis is handled by a recursive call and the results are combined with
    ``np.vstack``.  Otherwise ``self.evaluate_actions(o_t)`` is passed to
    ``sampled_argmax`` together with ``self.random_stream`` and the
    resulting index selects an entry of ``self.discrete_actions``.

    Args:
        o_t: Array-like with an ``ndim`` attribute; either one observation
            or several observations along the first axis.

    Returns:
        A single entry of ``self.discrete_actions`` for a one-dimensional
        ``o_t``, or the ``np.vstack`` of one such entry per observation.
    """
    if o_t.ndim > 1:
        # np.vstack needs a real sequence: passing a generator expression is
        # deprecated in NumPy and rejected by newer releases, so build a
        # list first.
        return np.vstack([self.get_action(o) for o in o_t])

    qa = self.evaluate_actions(o_t)
    return self.discrete_actions[sampled_argmax(qa, self.random_stream)]
def get_action(self, o_t):
    """Score the actions for ``o_t`` and return one sampled action.

    Calls ``lr_evaluate_actions`` with the observation, ``self.lem`` and
    ``self.thetas``; the result is handed to ``sampled_argmax`` together
    with ``self.random_stream`` to obtain an index into
    ``self.discrete_actions``.

    Args:
        o_t: Current observation, forwarded unchanged to
            ``lr_evaluate_actions``.

    Returns:
        The element of ``self.discrete_actions`` at the sampled index.

    Raises:
        RuntimeError: If ``self.lem`` is ``None``.
    """
    model = self.lem
    if model is None:
        raise RuntimeError('model was not initialized with the update_model method')

    action_scores = lr_evaluate_actions(o_t, model, self.thetas)

    actions = self.discrete_actions
    choice = sampled_argmax(action_scores, self.random_stream)
    return actions[choice]
def get_action(self, o_t):
    """Run ``self.plan`` on ``o_t`` and return one sampled action.

    ``self.plan(o_t)`` must return four values; only the first is used.
    Its first entry is passed to ``sampled_argmax`` together with
    ``self.random_stream`` to obtain an index into
    ``self.discrete_actions``.

    Args:
        o_t: Current observation, forwarded unchanged to ``self.plan``.

    Returns:
        The element of ``self.discrete_actions`` at the sampled index.
    """
    # The remaining three outputs of plan() are not needed here; unpacking
    # all four keeps the expected return arity enforced.
    stoch_plan, _plan_val, _alphas, _betas = self.plan(o_t)

    actions = self.discrete_actions
    choice = sampled_argmax(stoch_plan[0], self.random_stream)
    return actions[choice]