def convert_send_qmatrix_msg(self):
    """Translate the numpy Q-matrix into a QMatrix message and publish it."""
    msg = QMatrix()
    for arr_row in self.q_matrix_arr:
        msg_row = QMatrixRow()
        # Message rows carry integers; cast the numpy row accordingly.
        msg_row.q_matrix_row = arr_row.astype(int)
        msg.q_matrix.append(msg_row)
    self.q_matrix_msg = msg
    # Broadcast the converted matrix on the Q-matrix topic.
    self.q_matrix_pub.publish(self.q_matrix_msg)
def init_q_matrix(self):
    """Build and return a QMatrix message with every entry set to zero."""
    zero_matrix = QMatrix()
    zero_matrix.q_matrix = []
    n_actions = len(self.actions)
    # One all-zero row per state, one column per action.
    for _state in self.states:
        zero_row = QMatrixRow()
        zero_row.q_matrix_row = [0] * n_actions
        zero_matrix.q_matrix.append(zero_row)
    return zero_matrix
def publish_qmatrix(self):
    """Publish the current Q-matrix (self.q) as a stamped QMatrix message.

    Generalized: iterates over the rows of ``self.q`` directly instead of
    assuming a hard-coded 64-row state space, so the method keeps working
    if the number of states changes.
    """
    matrix = QMatrix()
    matrix.q_matrix = []
    for row in self.q:
        matrixrow = QMatrixRow()
        matrixrow.q_matrix_row = row
        matrix.q_matrix.append(matrixrow)
    # Stamp the message so subscribers can order updates.
    matrix.header = Header(stamp=rospy.Time.now())
    self.matrix_pub.publish(matrix)
def publish_q_matrix(self):
    """Publish self.q_matrix (a numpy array) as a QMatrix message and save it to disk."""
    msg = QMatrix()
    msg.q_matrix = []
    for values in self.q_matrix:
        msg_row = QMatrixRow()
        # Message rows are lists of int16-compatible integers.
        msg_row.q_matrix_row = values.astype(np.int16).tolist()
        msg.q_matrix.append(msg_row)
    self.q_pub.publish(msg)
    # Persist a space-delimited text copy next to this script for later reuse.
    out_path = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
    np.savetxt(out_path, self.q_matrix, delimiter=" ")
def publish_qmat(self):
    """Convert self.qmat into a QMatrix() message and publish it to /q_learning/q_matrix."""
    msg = QMatrix()
    # Stamp the header so receivers can order matrix updates.
    msg.header = Header(stamp=rospy.Time.now())
    msg.q_matrix = []
    for values in self.qmat:
        msg_row = QMatrixRow()
        msg_row.q_matrix_row = list(values)
        msg.q_matrix.append(msg_row)
    self.qmat_pub.publish(msg)
def get_reward(self, reward): """ When a reward is recieved, update the self.q_matrix, and then check for convergence. If training has not converged, run another iteration. """ # The actual Q Learning algorithm self.q_matrix[self.current_state][self.current_action] = self.q_matrix[ self.current_state][self.current_action] + self.alpha * ( reward.reward + self.gamma * max(self.q_matrix[self.next_state]) - self.q_matrix[self.current_state][self.current_action]) self.previous_matrices.append(copy.deepcopy(self.q_matrix)) self.current_state = self.next_state # Publish the matrix for grading matrix_msg = QMatrix() matrix_msg.q_matrix = [ QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist() ] self.matrix_publisher.publish(matrix_msg) #check if q-matrix has converged #if not, train another iteration if not self.is_converged(reward.iteration_num): self.train_one_iteration() #if so, stop training, save q matrix to file, and exit else: rospy.logfatal("converged") rospy.loginfo(self.q_matrix) self.save_q_matrix() sys.exit(0)
def initialize_q_matrix(self):
    """Fill self.q_matrix with zeros: one row per state, one column per action."""
    n_actions = len(self.all_actions)
    for _state_idx in range(len(self.all_states)):
        zero_row = QMatrixRow()
        zero_row.q_matrix_row = [0] * n_actions
        self.q_matrix.q_matrix.append(zero_row)
def initialize_q_matrix(self):
    """Initialize self.actual_q_matrix to a 64x9 all-zero QMatrix and publish it.

    Bug fix: the original built the matrix as ``[temp] * 64``, which stored
    64 references to the *same* QMatrixRow object — any later in-place update
    of one row would silently update every row. Each row is now built as an
    independent object.
    """
    h = Header()
    h.stamp = rospy.Time.now()
    h.frame_id = "q_matrix"
    self.actual_q_matrix = QMatrix()
    self.actual_q_matrix.header = h
    # 64 different states = 64 rows, with 9 possible actions each (columns).
    # Build a distinct QMatrixRow per state so rows never alias each other.
    rows = []
    for _ in range(64):
        row = QMatrixRow()
        row.q_matrix_row = [0] * 9
        rows.append(row)
    self.actual_q_matrix.q_matrix = rows
    # This variable is used to determine whether the matrix has converged
    self.prev_q_matrix = copy.deepcopy(self.actual_q_matrix)
    # Publish the initial, empty q_matrix to the topic
    self.q_matrix_pub.publish(self.actual_q_matrix)
def initialize_q_matrix(self):
    """Initialize the Q-matrix with all 0s to start."""
    # One row per state; each row holds one zero per available action.
    for _ in self.states:
        fresh_row = QMatrixRow()
        fresh_row.q_matrix_row = [0 for _ in self.actions]
        self.q_matrix.q_matrix.append(fresh_row)
def initialize_q_matrix(self):
    """Zero-initialize the 64x9 Q-matrix message and publish it."""
    # 64 rows (states) of 9 zeros (actions) each.
    for _row_idx in range(64):
        fresh_row = QMatrixRow()
        fresh_row.q_matrix_row = [0] * 9
        self.q_matrix.q_matrix.append(fresh_row)
    # Announce the freshly zeroed matrix on the Q-matrix topic.
    self.q_matrix_pub.publish(self.q_matrix)
def get_reward(self, data): """ Calculate reward according to q learning algorithm. Check if q matrix has converged, and handle accordingly if so. """ # Calculate the max reward for next state next_reward = max(self.q_matrix[self.next_state]) # Calculate reward currently written in q matrix for our current state/action pair. current_reward = self.q_matrix[self.curr_state][self.curr_action] # Calculate a new reward using the q learning algorithm new_reward = current_reward + self.alpha * ( data.reward + self.gamma * next_reward - current_reward) # If newly calculated reward is the same as what's already written in the q matrix, # we count one instance of convergence. If the convergence count reaches 1000 instances, # we consider that our q matrix has converged and we save it to CSV. if current_reward == new_reward: self.convergence_count += 1 if self.convergence_count == self.convergence_max: self.save_q_matrix() return # If not converged, write the new reward in the q matrix and publish the updated q matrix. else: self.convergence_count = 0 self.q_matrix[self.curr_state][self.curr_action] = new_reward qm = QMatrix() temp = [] # potentially can cast rows as QMatrixRow more simply temp.append(row) for row in self.q_matrix: r = QMatrixRow() r.q_matrix_row = row temp.append(r) qm.header = Header(stamp=rospy.Time.now()) qm.q_matrix = temp self.matrix_pub.publish(qm) # Move to next state, and call another action to be taken. self.curr_state = self.next_state self.do_action()
def recieved_reward(self, data):
    # NOTE(review): "recieved" is a typo, but the name is a registered
    # callback, so renaming it would break the subscriber wiring.
    # Update the q-matrix with the standard Q-learning rule:
    #   Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    self.qmatrix[self.curr_state][self.curr_action] = self.qmatrix[self.curr_state][self.curr_action] + self.alpha * (data.reward + self.gamma * max(self.qmatrix[self.next_state]) - self.qmatrix[self.curr_state][self.curr_action])
    # Create a new q-matrix msg type and translate our q-matrix into it,
    # casting every value to int for the message rows.
    new_qm = QMatrix()
    temp_qm = []
    for row in self.qmatrix:
        temp_array = []
        for val in row:
            temp_array.append(int(val))
        temp_qm.append(QMatrixRow(temp_array))
    new_qm.q_matrix = temp_qm
    # Publish the q-matrix.
    self.qmatrix_pub.publish(new_qm)
    # Calculate the difference between the current and previous q-matrices.
    # Variables accumulating the total of each matrix.
    percent_delta = 0
    prev = 0
    curr = 0
    # Iterate through the prev and current q-matrices and sum their entries.
    # NOTE(review): for a 2-D numpy array, np.ndenumerate yields
    # ((row, col), scalar) pairs, so `i` is an index tuple and `row` a
    # scalar here — confirm the runtime type of self.qmatrix set by the
    # constructor, since this loop's meaning depends on it.
    for i, row in np.ndenumerate(self.qmatrix):
        for j, val in np.ndenumerate(row):
            prev += int(self.previous_matrix[i][j])
            curr += int(val)
    # If there is a meaningful diff, record it in our ring buffer of the
    # last 25 rewarded deltas.
    if curr != 0:
        percent_delta = curr - prev
        if(data.reward > 0):
            self.prevs[self.index % 25] = percent_delta
            self.index += 1
    # Store this q-matrix as previous_matrix for the next comparison.
    self.previous_matrix = copy.deepcopy(self.qmatrix)
    # Update our current state.
    self.curr_state = self.next_state
    # Average delta of the last 25 meaningful changes decides convergence.
    avg_delta = np.mean(self.prevs)
    # If the matrix hasn't converged (fewer than 300 iterations, or still
    # changing by more than 1 on average), compute another iteration.
    if (data.iteration_num < 300 or avg_delta > 1):
        self.run()
    # Save the matrix once it has converged.
    else:
        self.save_q_matrix()
def train_q_matrix(self):
    """Train the Q-matrix until successive snapshots differ by a total of 10 or less."""
    # Initialize q_matrix: len(states) rows x len(actions) columns of zeros.
    self.q_matrix = [[0 for i in range(len(self.actions))]
                     for j in range(len(self.states))]
    self.last_checked_q_matrix = [[0 for i in range(len(self.actions))]
                                  for j in range(len(self.states))]
    current_state_num = 0
    # Flags set by external subscriber callbacks; reward_update is polled
    # below to detect that a reward message arrived.
    self.reward_update = 0
    self.state_update = 0
    q_matrix_difference = 100  # seeded above the threshold so the loop starts
    iteration = 0
    while q_matrix_difference > 10:
        # Training process: gather every (action, next_state) pair reachable
        # from the current state (entries >= 0 in the action matrix).
        allowed_actions = []
        for state in range(len(self.states)):
            action_num = self.action_matrix[current_state_num][state]
            if action_num >= 0:
                allowed_actions.append((action_num, state))
        if len(allowed_actions) > 0:
            # Pick one allowed action uniformly at random and publish it.
            action_state_pair = allowed_actions[int(
                np.random.randint(0, len(allowed_actions)))]
            chosen_action_num = int(action_state_pair[0])
            action_dict = self.actions[chosen_action_num]
            action_msg = RobotMoveDBToBlock()
            action_msg.robot_db = action_dict['dumbbell']
            action_msg.block_id = action_dict['block']
            self.action_pub.publish(action_msg)
            # Busy-wait (1 s polls) until the reward callback sets
            # self.reward_update, then clear the flag for the next round.
            while self.reward_update == 0:
                time.sleep(1)
            # Reset check for reward callback.
            self.reward_update = 0
            next_state_num = action_state_pair[1]
            # Best Q-value achievable from the next state.
            best_q_value = 0
            for action_q_value in self.q_matrix[next_state_num]:
                if action_q_value >= best_q_value:
                    best_q_value = action_q_value
            # Q-learning update (self.reward is presumably set by the reward
            # callback alongside reward_update — confirm against the class).
            q_matrix_adjustment = self.reward + (
                self.discount_factor * best_q_value
            ) - self.q_matrix[current_state_num][chosen_action_num]
            self.q_matrix[current_state_num][chosen_action_num] += int(
                self.learning_rate * q_matrix_adjustment)
            # Publish the new q_matrix as a message.
            self.q_matrix_msg = QMatrix()
            for state_row in self.q_matrix:
                q_row = QMatrixRow()
                for q_value in state_row:
                    q_row.q_matrix_row.append(q_value)
                self.q_matrix_msg.q_matrix.append(q_row)
            self.q_matrix_pub.publish(self.q_matrix_msg)
            current_state_num = next_state_num
            iteration += 1
            print("Training iteration:", iteration)
            # Every states*actions iterations, measure the total drift since
            # the last snapshot; training stops once it is 10 or less.
            if (iteration % (len(self.states) * len(self.actions))) == 0:
                q_matrix_difference = 0
                for s_index, state in enumerate(self.q_matrix):
                    for a_index, value in enumerate(state):
                        q_matrix_difference += abs(
                            value - self.last_checked_q_matrix[s_index][a_index])
                self.last_checked_q_matrix = copy.deepcopy(self.q_matrix)
                print("New q_matrix_difference:", q_matrix_difference)
        else:
            # No actions left from this state: restart from state 0.
            current_state_num = 0