def representationExpansionLSPI(self):
    """Alternate LSPI runs with batch feature discovery.

    Each round calls ``self.LSTD()`` and ``self.policyIteration()``. When
    ``Tools.hasFunction`` reports that the representation has a
    ``batchDiscover`` attribute, the TD errors returned by policy iteration
    are passed to ``batchDiscover`` together with the first
    ``self.samples_count`` rows of ``self.all_phi_s`` and ``self.data_s``.
    Rounds repeat while ``batchDiscover`` returns a truthy value and
    ``re_iteration <= self.re_iterations``.  If the last round still added a
    feature, LSTD and policy iteration are run once more.

    Returns:
        None.  Returns immediately (after printing a notice) when the
        representation has zero features.
    """
    re_iteration = 0
    added_feature = True

    if self.representation.features_num == 0:
        # print() with a single argument behaves identically on Python 2
        # and Python 3; the bare print statement is a SyntaxError on 3.
        print("No features, hence no LSPI is necessary!")
        return

    self.logger.info(
        "============================\nRunning LSPI with %d Samples\n============================" %
        self.samples_count)
    while added_feature and re_iteration <= self.re_iterations:
        re_iteration += 1
        # Announce the round only when expansion is actually possible.
        if Tools.hasFunction(self.representation, 'batchDiscover'):
            self.logger.info(
                '-----------------\nRepresentation Expansion iteration #%d\n-----------------' %
                re_iteration)
        # Run LSTD for first solution
        self.LSTD()
        # Run Policy Iteration to change a_prime and recalculate weight_vec
        # in a loop
        td_errors = self.policyIteration()
        # Add new features
        if Tools.hasFunction(self.representation, 'batchDiscover'):
            added_feature = self.representation.batchDiscover(
                td_errors,
                self.all_phi_s[:self.samples_count, :],
                self.data_s[:self.samples_count, :])
        else:
            added_feature = False

    if added_feature:
        # Run LSPI one last time with the new features
        self.LSTD()
        self.policyIteration()
def representationExpansionLSPI(self):
    """Interleave LSPI with representation expansion until it stops growing.

    A round consists of ``self.LSTD()``, ``self.policyIteration()`` and, when
    ``Tools.hasFunction(self.representation, 'batchDiscover')`` is truthy, a
    call to ``batchDiscover`` with the TD errors and the first
    ``self.samples_count`` rows of ``self.all_phi_s`` / ``self.data_s``.
    Rounds stop once ``batchDiscover`` returns a falsy value (or is not
    available), or once the round counter exceeds ``self.re_iterations``.
    If the last round reported a new feature, one extra LSTD + policy
    iteration pass is run.

    Returns:
        None.  Nothing is run when the representation has zero features.
    """
    if self.representation.features_num == 0:
        print("No features, hence no LSPI is necessary!")
        return

    self.logger.info(
        "============================\nRunning LSPI with %d Samples\n============================"
        % self.samples_count)

    rounds_done = 0
    grew = True
    while grew and rounds_done <= self.re_iterations:
        rounds_done += 1

        # Round banner, only shown when the representation can expand.
        if Tools.hasFunction(self.representation, 'batchDiscover'):
            self.logger.info(
                '-----------------\nRepresentation Expansion iteration #%d\n-----------------'
                % rounds_done)

        # Initial LSTD solution, then policy iteration refines a_prime and
        # the weight vector.
        self.LSTD()
        errors = self.policyIteration()

        # Without batchDiscover there is nothing to expand: stop looping.
        if not Tools.hasFunction(self.representation, 'batchDiscover'):
            grew = False
            continue

        grew = self.representation.batchDiscover(
            errors,
            self.all_phi_s[:self.samples_count, :],
            self.data_s[:self.samples_count, :])

    if not grew:
        return

    # The last round added features: refit once on the expanded set.
    self.LSTD()
    self.policyIteration()
def SolveWeight(self):
    """Refit the representation's weights to sampled entries of ``self.Q``.

    Draws ``N = 10 * features_num`` action ids from ``[0, self.ActionNum)``
    and state ids from ``[0, self.StateNum)`` with replacement, builds the
    state-action feature matrix ``Phi`` for those pairs via
    ``batchPhi_s_a``, and solves ``A w = b`` with ``A = Phi^T Phi`` (passed
    through ``Tools.regularize``) and ``b = Phi^T Q``.

    ``self.representation.weight_vec`` is replaced by a copy of the solution
    only when the Euclidean distance between old and new weights exceeds
    ``self.tol_epsilon``.

    Returns:
        None.
    """
    # Sample s_id, a_id pairs: ten per feature.
    N = 10 * self.representation.features_num
    Q_vec = np.zeros((N, 1))
    # Actions are drawn before states; keep that order so the random
    # stream is consumed exactly as before.
    aid_vec = np.random.choice(
        np.arange(0, self.ActionNum), replace=True, size=(1, N))
    sid_vec = np.random.choice(
        np.arange(0, self.StateNum), replace=True, size=(1, N))

    data_s = np.zeros((N, self.representation.state_space_dims))
    data_a = np.zeros((N, 1), dtype=np.uint32)
    all_phi_s = np.zeros((N, self.representation.features_num))

    # range() exists on both Python 2 and 3; xrange() is Python 2 only.
    for i in range(N):
        sid = sid_vec[0][i]
        aid = aid_vec[0][i]
        data_s[i, :] = self.representation.stateID2state(sid)
        data_a[i] = aid
        all_phi_s[i, :] = self.representation.phi(data_s[i], False)
        Q_vec[i][0] = self.Q[sid][aid]

    # The feature matrix comes straight from batchPhi_s_a; no
    # pre-allocation is needed because the result is a fresh object.
    all_phi_s_a = self.representation.batchPhi_s_a(
        all_phi_s[:N, :], data_a[:N, :], False)

    # Normal equations of the least-squares fit.
    A = np.dot(all_phi_s_a.T, all_phi_s_a)
    b = np.dot(all_phi_s_a.T, Q_vec)
    A = Tools.regularize(A)
    new_weight_vec, _ = Tools.solveLinear(A, b)

    weight_diff = np.linalg.norm(
        self.representation.weight_vec - new_weight_vec)
    if weight_diff > self.tol_epsilon:
        self.representation.weight_vec = new_weight_vec.copy()
def LSTD(self):
    """Solve the LSTD system on the stored samples and set the weights.

    Unless ``self.fixedRep`` is truthy, the state features
    (``all_phi_s``, ``all_phi_ns``) and state-action features
    (``all_phi_s_a``, ``all_phi_ns_na``) are rebuilt for the first
    ``self.samples_count`` samples.  Then ``self.b = F1^T R`` and
    ``self.A = F1^T (F1 - gamma * F2)`` are stored, and
    ``self.representation.weight_vec`` is set to the solution returned by
    ``Tools.solveLinear`` for ``Tools.regularize(self.A)`` and ``self.b``.
    ``self.A`` itself keeps the value from before ``Tools.regularize``.

    The elapsed time is logged; the solve time is included only when it
    exceeds one second.
    """
    tic = Tools.clock()
    count = self.samples_count

    if not self.fixedRep:
        # State features for s and s' of every sample.
        shape = (count, self.representation.features_num)
        self.all_phi_s = np.empty(
            shape, dtype=self.representation.featureType())
        self.all_phi_ns = np.empty(
            shape, dtype=self.representation.featureType())
        for row in range(count):
            self.all_phi_s[row, :] = self.representation.phi(self.data_s[row])
            self.all_phi_ns[row, :] = self.representation.phi(
                self.data_ns[row])

        # State-action features derived from the state features above.
        self.all_phi_s_a = self.representation.batchPhi_s_a(
            self.all_phi_s[:count, :],
            self.data_a[:count, :],
            use_sparse=self.use_sparse)
        self.all_phi_ns_na = self.representation.batchPhi_s_a(
            self.all_phi_ns[:count, :],
            self.data_na[:count, :],
            use_sparse=self.use_sparse)

    # Assemble A and b for LSTD.
    phi_sa = self.all_phi_s_a[:count, :]
    phi_next = self.all_phi_ns_na[:count, :]
    rewards = self.data_r[:count, :]
    gamma = self.discount_factor

    if self.use_sparse:
        # '*' is used as the matrix product in the sparse branch.
        self.b = (phi_sa.T * rewards).reshape(-1, 1)
        self.A = phi_sa.T * (phi_sa - gamma * phi_next)
    else:
        self.b = np.dot(phi_sa.T, rewards).reshape(-1, 1)
        self.A = np.dot(phi_sa.T, phi_sa - gamma * phi_next)

    # Solve for the weights on the regularized system.
    regularized = Tools.regularize(self.A)
    self.representation.weight_vec, solve_time = Tools.solveLinear(
        regularized, self.b)

    # Mention the solve time only if it took more than one second.
    if solve_time > 1:
        message = 'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' % (
            Tools.deltaT(tic), solve_time)
    else:
        message = 'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(tic))
    self.logger.info(message)
def LSTD(self):
    """Compute the LSTD weights from the collected samples.

    Steps, all restricted to the first ``self.samples_count`` samples:

    1. If ``self.fixedRep`` is falsy, recompute ``all_phi_s`` /
       ``all_phi_ns`` with ``representation.phi`` and ``all_phi_s_a`` /
       ``all_phi_ns_na`` with ``representation.batchPhi_s_a``.
    2. Store ``self.b = F1^T R`` (as a column) and
       ``self.A = F1^T (F1 - discount_factor * F2)``.
    3. Set ``self.representation.weight_vec`` from
       ``Tools.solveLinear(Tools.regularize(self.A), self.b)``.
    4. Log the total time, plus the solve time when it exceeds one second.
    """
    started = Tools.clock()
    n_samples = self.samples_count

    if not self.fixedRep:
        # Build phi_s and phi_ns for every sample.
        n_features = self.representation.features_num
        self.all_phi_s = np.empty(
            (n_samples, n_features),
            dtype=self.representation.featureType())
        self.all_phi_ns = np.empty(
            (n_samples, n_features),
            dtype=self.representation.featureType())
        for idx in range(n_samples):
            self.all_phi_s[idx, :] = self.representation.phi(self.data_s[idx])
            self.all_phi_ns[idx, :] = self.representation.phi(
                self.data_ns[idx])

        # Build phi_s_a and phi_ns_na from phi_s and phi_ns.
        current_phi = self.all_phi_s[:n_samples, :]
        current_actions = self.data_a[:n_samples, :]
        self.all_phi_s_a = self.representation.batchPhi_s_a(
            current_phi, current_actions, use_sparse=self.use_sparse)

        next_phi = self.all_phi_ns[:n_samples, :]
        next_actions = self.data_na[:n_samples, :]
        self.all_phi_ns_na = self.representation.batchPhi_s_a(
            next_phi, next_actions, use_sparse=self.use_sparse)

    # A and b of the LSTD linear system.
    F1 = self.all_phi_s_a[:n_samples, :]
    F2 = self.all_phi_ns_na[:n_samples, :]
    R = self.data_r[:n_samples, :]
    gamma = self.discount_factor

    if not self.use_sparse:
        self.b = np.dot(F1.T, R).reshape(-1, 1)
        self.A = np.dot(F1.T, F1 - gamma * F2)
    else:
        # Sparse branch: '*' stands in for the matrix product.
        self.b = (F1.T * R).reshape(-1, 1)
        self.A = F1.T * (F1 - gamma * F2)

    # Solve on the regularized A; self.A keeps the value assigned above.
    solution = Tools.solveLinear(Tools.regularize(self.A), self.b)
    self.representation.weight_vec, solve_time = solution

    # Log the solve time only if it takes more than 1 second.
    if solve_time > 1:
        self.logger.info(
            'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' %
            (Tools.deltaT(started), solve_time))
        return
    self.logger.info(
        'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(started)))
def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):
    """Process one transition ``(s, a, r, ns, na)``.

    Computes the TD error of the transition under the current weight
    vector, forwards the transition to ``self.ModelBasedLearn`` and gives
    the representation the chance to expand through ``post_discover``.

    Args:
        s: State the action was taken in.
        p_actions: Not used by this method.
        a: Action taken in ``s``.
        r: Reward received.
        ns: Resulting next state.
        np_actions: Not used by this method.
        na: Action selected for ``ns``.
        terminal: Truthy when ``ns`` ends the episode; triggers
            ``self.episodeTerminated()``.

    Returns:
        None.
    """
    # The previous state could never be terminal
    # (otherwise the episode would have already terminated)
    prevStateTerminal = False

    # MUST call this at start of learn()
    self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)

    discount_factor = self.discount_factor  # 'gamma' in literature
    # Value function, expressed as feature weights.  Read before the
    # model-based update below, so the TD error uses the incoming weights.
    feat_weights = self.representation.weight_vec
    # Features of s and of the pair (s, a).
    features_s = self.representation.phi(s, prevStateTerminal)
    features = self.representation.phi_sa(s, prevStateTerminal, a, features_s)
    # Features of ns and of the pair (ns, na).
    features_prime_s = self.representation.phi(ns, terminal)
    features_prime = self.representation.phi_sa(
        ns, terminal, na, features_prime_s)

    # td_error = r + (gamma * phi(ns, na) - phi(s, a)) . w
    td_error = r + np.dot(
        discount_factor * features_prime - features, feat_weights)

    # Learn a model and plan on the current learned model.
    self.ModelBasedLearn(s, a, ns, r)

    # MUST call this at end of learn() - add new features to the
    # representation as required.
    self.representation.post_discover(
        s, prevStateTerminal, a, td_error, features_s)

    # MUST call this at end of learn() - handle episode termination cleanup
    # as required.
    if terminal:
        self.episodeTerminated()
def policyIteration(self):
    """Update the policy by recalculating A based on new next actions.

    Each iteration asks the representation for the best next action of
    every sample (``batchBestAction``), rebuilds
    ``A = F1^T (F1 - gamma * F2)`` from the resulting next-state-action
    features (``self.b`` is reused unchanged), solves for a new weight
    vector and adopts it when it differs from the current one by more than
    ``self.tol_epsilon``.  Iteration stops after ``self.lspi_iterations``
    rounds or once the weight change drops to ``self.tol_epsilon`` or below.

    Side effects: resets ``self.best_performance`` to ``-inf``, overwrites
    ``self.all_phi_ns_new_na`` and may overwrite
    ``self.representation.weight_vec``.

    Returns:
        The value of ``self.calculateTDErrors()`` from the last iteration.
        NOTE(review): inside the loop that call happens before the
        iteration's weight update is applied; confirm this is intended.
        If the loop body never runs, ``calculateTDErrors()`` is evaluated
        once on the current state instead of raising UnboundLocalError.
    """
    start_time = Tools.clock()
    weight_diff = self.tol_epsilon + 1  # So that the loop starts
    lspi_iteration = 0
    self.best_performance = -np.inf
    self.logger.info('Running Policy Iteration:')

    # We save action_mask on the first iteration (used for batchBestAction)
    # to reuse it and boost the speed.
    # action_mask is a matrix that shows which actions are available for
    # each state.
    action_mask = None
    discount_factor = self.discount_factor
    if self.use_sparse:
        F1 = sp.csr_matrix(self.all_phi_s_a[:self.samples_count, :])
    else:
        F1 = self.all_phi_s_a[:self.samples_count, :]

    while (lspi_iteration < self.lspi_iterations
           and weight_diff > self.tol_epsilon):
        # Find the best action for each state given the current value
        # function.  Notice if actions have the same value the first
        # action is selected in the batch mode.
        iteration_start_time = Tools.clock()
        _, self.all_phi_ns_new_na, action_mask = \
            self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :],
                self.all_phi_ns,
                action_mask,
                self.use_sparse)

        # Recalculate A matrix (b remains the same)
        if self.use_sparse:
            F2 = sp.csr_matrix(
                self.all_phi_ns_new_na[:self.samples_count, :])
            A = F1.T * (F1 - discount_factor * F2)
        else:
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            A = np.dot(F1.T, F1 - discount_factor * F2)
        A = Tools.regularize(A)

        # Solve for the new weight_vec
        new_weight_vec, _ = Tools.solveLinear(A, self.b)

        # Calculate TD_Errors
        td_errors = self.calculateTDErrors()

        # Calculate the weight difference.  If it is big enough update the
        # weight_vec.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features" %
            (lspi_iteration + 1,
             Tools.deltaT(iteration_start_time),
             weight_diff,
             Tools.sparsity(A),
             self.representation.features_num))
        lspi_iteration += 1

    if lspi_iteration == 0:
        # The loop body never ran (e.g. lspi_iterations <= 0), so td_errors
        # was never bound; evaluate it once so callers still get a result.
        td_errors = self.calculateTDErrors()

    self.logger.info(
        'Total Policy Iteration Time = %0.0f(s)' % Tools.deltaT(start_time))
    return td_errors
def policyIteration(self):
    """Update the policy by recalculating A based on new next actions.

    Repeats, for at most ``self.lspi_iterations`` rounds and while the
    weight change exceeds ``self.tol_epsilon``:

    1. ``representation.batchBestAction`` picks the next action for every
       sample and yields the matching features ``all_phi_ns_new_na``.
    2. ``A = F1^T (F1 - gamma * F2)`` is rebuilt and passed through
       ``Tools.regularize``; ``self.b`` is reused unchanged.
    3. ``Tools.solveLinear`` gives a candidate weight vector, which replaces
       ``self.representation.weight_vec`` when the Euclidean distance
       between the two exceeds ``self.tol_epsilon``.

    Side effects: resets ``self.best_performance`` to ``-inf``, overwrites
    ``self.all_phi_ns_new_na`` and may overwrite
    ``self.representation.weight_vec``.

    Returns:
        The value of ``self.calculateTDErrors()`` from the last round.
        NOTE(review): in each round that call precedes the weight update;
        confirm this ordering is intended.  When no round runs at all,
        ``calculateTDErrors()`` is evaluated once on the current state
        rather than failing with UnboundLocalError.
    """
    start_time = Tools.clock()
    weight_diff = self.tol_epsilon + 1  # So that the loop starts
    lspi_iteration = 0
    self.best_performance = -np.inf
    self.logger.info('Running Policy Iteration:')

    # We save action_mask on the first iteration (used for batchBestAction)
    # to reuse it and boost the speed.
    # action_mask is a matrix that shows which actions are available for
    # each state.
    action_mask = None
    discount_factor = self.discount_factor
    F1 = sp.csr_matrix(
        self.all_phi_s_a[:self.samples_count, :]
    ) if self.use_sparse else self.all_phi_s_a[:self.samples_count, :]

    while (lspi_iteration < self.lspi_iterations
           and weight_diff > self.tol_epsilon):
        # Find the best action for each state given the current value
        # function.  Notice if actions have the same value the first
        # action is selected in the batch mode.
        iteration_start_time = Tools.clock()
        best = self.representation.batchBestAction(
            self.data_ns[:self.samples_count, :],
            self.all_phi_ns,
            action_mask,
            self.use_sparse)
        # The first element (the chosen actions) is not needed here.
        _, self.all_phi_ns_new_na, action_mask = best

        # Recalculate A matrix (b remains the same)
        if self.use_sparse:
            F2 = sp.csr_matrix(
                self.all_phi_ns_new_na[:self.samples_count, :])
            A = F1.T * (F1 - discount_factor * F2)
        else:
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            A = np.dot(F1.T, F1 - discount_factor * F2)
        A = Tools.regularize(A)

        # Solve for the new weight_vec
        new_weight_vec, _ = Tools.solveLinear(A, self.b)

        # Calculate TD_Errors
        td_errors = self.calculateTDErrors()

        # Calculate the weight difference.  If it is big enough update the
        # weight_vec.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features" %
            (lspi_iteration + 1,
             Tools.deltaT(iteration_start_time),
             weight_diff,
             Tools.sparsity(A),
             self.representation.features_num))
        lspi_iteration += 1

    if lspi_iteration == 0:
        # No round ran (e.g. lspi_iterations <= 0): td_errors would be
        # unbound at the return below, so compute it once here.
        td_errors = self.calculateTDErrors()

    self.logger.info(
        'Total Policy Iteration Time = %0.0f(s)' % Tools.deltaT(start_time))
    return td_errors