import numpy as np

# regularize, solveLinear, eGreedy, clock, xrange, l_norm, hhmmss, and deltaT
# are assumed to be imported from RLPy's tools elsewhere in these modules.


def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    # compute basis functions
    phi_s = np.zeros((self.n))
    phi_ns = np.zeros((self.n))
    k = self.representation.features_num
    phi_s[:k] = self.representation.phi(s, False)
    phi_s[k:] = self.policy.dlogpi(s, a)
    phi_ns[:k] = self.representation.phi(ns, terminal)

    # update statistics
    self.z *= self.lambda_
    self.z += phi_s
    self.A += np.einsum("i,j", self.z,
                        phi_s - self.discount_factor * phi_ns,
                        out=self.buf_)
    self.b += self.z * r
    if terminal:
        self.z[:] = 0.
    self.steps_between_updates += 1
    self.logger.debug("Statistics updated")

    if self.steps_between_updates > self.min_steps_between_updates:
        A = regularize(self.A)
        param, time = solveLinear(A, self.b)
        # v = param[:k]  # parameters of the value function representation
        w = param[k:]  # natural gradient estimate

        if (self._gradient_sane(w)
                or self.steps_between_updates > self.max_steps_between_updates):
            # update policy
            self.policy.theta = self.policy.theta + self.learn_rate * w
            self.last_w = w
            self.logger.debug("Policy updated, norm of gradient {}".format(
                np.linalg.norm(w)))
            # forget statistics
            self.z *= 1. - self.forgetting_rate
            self.A *= 1. - self.forgetting_rate
            self.b *= 1. - self.forgetting_rate
            self.steps_between_updates = 0

    if terminal:
        self.episodeTerminated()
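# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not part of the agent): learn()
# above accumulates statistics so that solving A * param = b yields, in one
# shot, both the value-function weights (first k entries of param) and the
# natural-gradient estimate w (remaining entries) used to update
# policy.theta. The toy below reproduces that split with assumed sizes and
# random stand-in statistics; `k`, `m`, A, and b here are placeholders, not
# RLPy values.
# ---------------------------------------------------------------------------
def _natural_gradient_split_demo(k=3, m=2):
    import numpy as np
    n = k + m
    A = np.eye(n) + 0.1 * np.random.rand(n, n)  # stand-in for accumulated A
    b = np.random.rand(n)                       # stand-in for accumulated b
    param = np.linalg.solve(A, b)
    v = param[:k]   # value-function weights (diagnostic only)
    w = param[k:]   # natural gradient; the agent does theta += learn_rate * w
    return v, w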
def solveInMatrixFormat(self):
    # while delta_weight_vec > threshold:
    #   1. Gather data following an e-greedy policy
    #   2. Calculate A and b estimates
    #   3. Calculate new_weight_vec and delta_weight_vec
    # return policy greedy w.r.t. the last weight_vec
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et al. FTML 2012 paper.
    self.samples_num = 1000

    self.start_time = clock()  # used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.hasTime() and not converged:
        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collectSamples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.actions_num
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor
        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in xrange(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d)
            self.b += phi_s_a * R[i, 0]

        # 3. Calculate new_weight_vec and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                (iteration, solve_time))
        delta_weight_vec = l_norm(
            new_weight_vec - self.representation.weight_vec, np.inf)
        converged = delta_weight_vec < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
            (iteration, hhmmss(deltaT(self.start_time)), samples,
             delta_weight_vec, performance_return))
        if self.show:
            self.domain.show(S[-1], Actions[-1], self.representation)

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedPolicyIteration, self).solve()
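# ---------------------------------------------------------------------------
# Both methods above rely on regularize() and solveLinear(), which are defined
# elsewhere in RLPy. The sketches below are assumptions about their behavior,
# not the library's actual implementations: a small ridge term keeps the
# sampled A invertible, and the solver returns the solution together with the
# wall-clock time the solve took.
# ---------------------------------------------------------------------------
def _regularize_sketch(A, reg=1e-6):
    # Ridge regularization: A + reg * I stays well conditioned even when the
    # sampled A is rank deficient.
    import numpy as np
    return A + reg * np.eye(A.shape[0])


def _solveLinear_sketch(A, b):
    # Solve A x = b and report how long the solve took; clock() matches the
    # Python 2 style used above.
    import numpy as np
    from time import clock
    start = clock()
    x = np.linalg.solve(A, b)
    return x, clock() - start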