def get_reward(self, reward):
    """ When a reward is received, update self.q_matrix, and then check for
    convergence. If training has not converged, run another iteration. """
    # The actual Q-learning update
    self.q_matrix[self.current_state][self.current_action] = self.q_matrix[
        self.current_state][self.current_action] + self.alpha * (
            reward.reward + self.gamma * max(self.q_matrix[self.next_state]) -
            self.q_matrix[self.current_state][self.current_action])
    self.previous_matrices.append(copy.deepcopy(self.q_matrix))
    self.current_state = self.next_state

    # Publish the matrix for grading
    matrix_msg = QMatrix()
    matrix_msg.q_matrix = [
        QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist()
    ]
    self.matrix_publisher.publish(matrix_msg)

    # Check if the Q-matrix has converged; if not, train another iteration
    if not self.is_converged(reward.iteration_num):
        self.train_one_iteration()
    # If so, stop training, save the Q-matrix to file, and exit
    else:
        rospy.logfatal("converged")
        rospy.loginfo(self.q_matrix)
        self.save_q_matrix()
        sys.exit(0)
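The is_converged check called above is not shown in this snippet. A minimal sketch, assuming self.q_matrix and the entries of self.previous_matrices are numpy arrays; the window size and tolerance are illustrative assumptions, not the original values.

def is_converged(self, iteration_num, window=100, tol=0.5):
    # Hypothetical sketch: require a minimum number of iterations and enough
    # history before comparing matrices (window/tol are illustrative values)
    if iteration_num < window or len(self.previous_matrices) < window + 1:
        return False
    # Compare the current matrix against the one from `window` iterations ago
    old = self.previous_matrices[-(window + 1)]
    delta = np.abs(self.q_matrix - old).sum()
    return delta < tol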
def init_q_matrix(self):
    ''' Initializes the Q-matrix with all zeros '''
    m = QMatrix()
    m.q_matrix = []
    for _ in range(len(self.states)):
        row = QMatrixRow()
        row.q_matrix_row = [0] * len(self.actions)
        m.q_matrix.append(row)
    return m
def publish_qmatrix(self):
    matrix = QMatrix()
    matrix.q_matrix = []
    for x in range(0, 64):
        row = self.q[x]
        matrixrow = QMatrixRow()
        matrixrow.q_matrix_row = row
        matrix.q_matrix.append(matrixrow)
    matrix.header = Header(stamp=rospy.Time.now())
    self.matrix_pub.publish(matrix)
def publish_q_matrix(self):
    """ Converts self.q_matrix (a numpy array) into a QMatrix message,
    publishes it, and saves the matrix to a text file. """
    matrix = QMatrix()
    matrix.q_matrix = []
    for q_row in self.q_matrix:
        row = QMatrixRow()
        row.q_matrix_row = q_row.astype(np.int16).tolist()
        matrix.q_matrix.append(row)
    self.q_pub.publish(matrix)
    filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
    np.savetxt(filename, self.q_matrix, delimiter=" ")
def publish_qmat(self):
    """ Converts self.qmat into a QMatrix() object and publishes it to
    /q_learning/q_matrix """
    qmat_to_pub = QMatrix()
    qmat_to_pub.header = Header(stamp=rospy.Time.now())
    rows = []
    for row in self.qmat:
        qmat_row = QMatrixRow()
        qmat_row.q_matrix_row = list(row)
        rows.append(qmat_row)
    qmat_to_pub.q_matrix = rows
    self.qmat_pub.publish(qmat_to_pub)
def recieved_reward(self, data):
    # Update the Q-matrix with the Q-learning update rule
    self.qmatrix[self.curr_state][self.curr_action] = self.qmatrix[
        self.curr_state][self.curr_action] + self.alpha * (
            data.reward + self.gamma * max(self.qmatrix[self.next_state]) -
            self.qmatrix[self.curr_state][self.curr_action])

    # Create a new QMatrix msg and translate our Q-matrix into it
    new_qm = QMatrix()
    temp_qm = []
    for row in self.qmatrix:
        temp_array = []
        for val in row:
            temp_array.append(int(val))
        temp_qm.append(QMatrixRow(temp_array))
    new_qm.q_matrix = temp_qm

    # Publish the Q-matrix
    self.qmatrix_pub.publish(new_qm)

    # Calculate the difference between the current and previous Q-matrices.
    # Set variables to store the total of each matrix.
    percent_delta = 0
    prev = 0
    curr = 0

    # Iterate through the previous and current Q-matrices and sum their entries
    for i, row in enumerate(self.qmatrix):
        for j, val in enumerate(row):
            prev += int(self.previous_matrix[i][j])
            curr += int(val)

    # If there is a meaningful diff, record it in our prevs array
    if curr != 0:
        percent_delta = curr - prev
    if data.reward > 0:
        self.prevs[self.index % 25] = percent_delta
        self.index += 1

    # Store this Q-matrix in our previous_matrix variable
    self.previous_matrix = copy.deepcopy(self.qmatrix)

    # Update our current state
    self.curr_state = self.next_state

    # Calculate the average delta of the last 25 meaningful changes to decide
    # whether the matrix has converged
    avg_delta = np.mean(self.prevs)

    # If the matrix hasn't converged, compute another iteration
    if data.iteration_num < 300 or avg_delta > 1:
        self.run()
    # Save the matrix once it has converged
    else:
        self.save_q_matrix()
def convert_send_qmatrix_msg(self):
    # Converts the numpy array to a QMatrix msg
    self.q_matrix_msg = QMatrix()
    for i in range(len(self.q_matrix_arr)):
        row = QMatrixRow()
        row.q_matrix_row = self.q_matrix_arr[i].astype(int)
        self.q_matrix_msg.q_matrix.append(row)
    # Publish the Q-matrix message to the Q-matrix topic
    self.q_matrix_pub.publish(self.q_matrix_msg)
def get_reward(self, data):
    """ Calculate reward according to the Q-learning algorithm. Check if the
    Q-matrix has converged, and handle accordingly if so. """
    # Calculate the max reward for the next state
    next_reward = max(self.q_matrix[self.next_state])

    # Calculate the reward currently written in the Q-matrix for our current
    # state/action pair.
    current_reward = self.q_matrix[self.curr_state][self.curr_action]

    # Calculate a new reward using the Q-learning algorithm
    new_reward = current_reward + self.alpha * (
        data.reward + self.gamma * next_reward - current_reward)

    # If the newly calculated reward is the same as what's already written in
    # the Q-matrix, we count one instance of convergence. If the convergence
    # count reaches 1000 instances, we consider that our Q-matrix has
    # converged and we save it to CSV.
    if current_reward == new_reward:
        self.convergence_count += 1
        if self.convergence_count == self.convergence_max:
            self.save_q_matrix()
            return
    # If not converged, write the new reward into the Q-matrix and publish
    # the updated Q-matrix.
    else:
        self.convergence_count = 0
        self.q_matrix[self.curr_state][self.curr_action] = new_reward
        qm = QMatrix()
        temp = []
        # potentially can cast rows as QMatrixRow more simply: temp.append(row)
        for row in self.q_matrix:
            r = QMatrixRow()
            r.q_matrix_row = row
            temp.append(r)
        qm.header = Header(stamp=rospy.Time.now())
        qm.q_matrix = temp
        self.matrix_pub.publish(qm)

    # Move to the next state, and call another action to be taken.
    self.curr_state = self.next_state
    self.do_action()
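The save_q_matrix call above is not shown here. A minimal sketch, assuming self.q_matrix can be converted to a numpy array; the output filename is a hypothetical placeholder, not taken from the original project.

def save_q_matrix(self):
    # Hypothetical sketch: write the converged Q-matrix to a CSV file
    # ("q_matrix.csv" is an assumed path)
    np.savetxt("q_matrix.csv", np.array(self.q_matrix), delimiter=",")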
def __init__(self):
    self.initialized = False

    # Initialize this node
    rospy.init_node('q_learning', disable_signals=True)

    # ROS publish to robot action so that phantom movement can occur
    self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                            RobotMoveDBToBlock,
                                            queue_size=10)

    # ROS publish to the Gazebo topic to set the locations of the models
    self.model_states_pub = rospy.Publisher("/gazebo/set_model_state",
                                            ModelState,
                                            queue_size=10)

    # ROS publish to the Q-matrix topic to update the Q-learning matrix
    self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                        QMatrix,
                                        queue_size=10)

    # ROS subscribe to rewards to receive from the environment the reward
    # after each action taken
    rospy.Subscriber("/q_learning/reward", QLearningReward,
                     self.processReward)

    # Initialize the Q-matrix, reward, minimum number of iterations to do
    # before checking convergence, current iteration, and convergence flag
    self.q_matrix = QMatrix()
    self.initialize_q_matrix()
    self.reward = None
    self.min_iterations = 150
    self.num_iterations_eps = 0
    self.current_iteration = 1
    self.num_rewards = 0
    self.num_actions_pub = 0
    self.converged = False
    self.past_iteration = -1
    self.num_iterations = 0
    self.final_actions = []

    # Initialize the current state, next state, and action taken
    self.current_state = 0
    self.next_state = -1
    self.action = -1

    # Create the action matrix
    self.action_matrix = [[] for x in range(64)]
    for i in range(64):
        self.action_matrix[i] = [0 for x in range(64)]

    # Initialize the action matrix
    self.initialize_action_matrix()

    self.initialized = True
def __init__(self):
    # Initialize the node
    rospy.init_node('q_learning')

    # Publish to /q_learning/q_matrix with message type QMatrix each time the
    # Q-matrix is updated
    self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                        QMatrix,
                                        queue_size=10)

    # Publish to robot_action
    self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                            RobotMoveDBToBlock,
                                            queue_size=10)

    # Subscribe to /q_learning/reward
    rospy.Subscriber("/q_learning/reward", QLearningReward, self.get_reward)

    self.action = RobotMoveDBToBlock()

    # Get all possible actions and states
    self.all_actions = all_actions()  # a list
    self.all_states = all_states()  # a list

    # Create the action matrix and condense it
    self.action_matrix = create_action_matrix(self.all_states,
                                              self.all_actions)
    self.possible_actions = get_possible_actions(
        self.action_matrix)  # array of numpy arrays

    self.reward = 0
    self.initialized = False

    # Initialize the Q-matrix
    self.q_matrix = QMatrix()
    self.initialize_q_matrix()
    self.current_q_matrix = None
    self.converged = False
    self.initialized = True

    # Train until the Q-matrix converges, then extract the action sequence
    self.converge_q_matrix()
    self.converged = True
    self.action_seq = []
    self.get_action_sequence()
def initialize_q_matrix(self):
    h = Header()
    h.stamp = rospy.Time.now()
    h.frame_id = "q_matrix"

    self.actual_q_matrix = QMatrix()
    self.actual_q_matrix.header = h
    # 64 different states = 64 rows, with 9 possible actions each (columns).
    # Build a separate QMatrixRow for each state so that all rows don't alias
    # the same object.
    self.actual_q_matrix.q_matrix = []
    for _ in range(64):
        row = QMatrixRow()
        row.q_matrix_row = [0] * 9
        self.actual_q_matrix.q_matrix.append(row)

    # This variable is used to determine whether the matrix has converged
    self.prev_q_matrix = copy.deepcopy(self.actual_q_matrix)

    # Publish the initial, empty q_matrix to the topic
    self.q_matrix_pub.publish(self.actual_q_matrix)
def __init__(self):
    # Publish Q-matrix updates to the Q-matrix topic
    self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix",
                                        QMatrix,
                                        queue_size=10)

    # Publish to /robot_action for phantom robot movement
    self.q_learning = QLearning()
    self.action_pub = rospy.Publisher("/q_learning/robot_action",
                                      RobotMoveDBToBlock,
                                      queue_size=10)

    # Subscribe to the environment to receive reward updates
    self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward,
                                   self.update_q_matrix)
    rospy.sleep(2)

    # Initialize the Q-matrix message
    self.q_matrix_msg = QMatrix()

    # For ease of use in training, we'll store the Q-matrix in a numpy array
    self.q_matrix_arr = np.zeros((64, 9))

    # Keep track of the 5 most recently updated Q-values
    self.q_history = []

    # Keeps track of the current (starting) state, initialized to the 0th state
    self.current_state = 0

    # Get a new state and action from the starting state
    new_state, action = self.get_random_action(self.current_state)

    # The current action is the action from above (the action that takes the
    # current state to the new state)
    self.current_action = action

    # Keep track of the new state
    self.new_state = new_state

    # Move the robot according to the current action;
    # update_q_matrix() will be called as the callback
    self.move_DB(self.current_action)
def __init__(self):
    # Once everything is set up this will be set to true
    self.initialized = False

    # Initialize this node
    rospy.init_node("q_learning")

    # Set up publishers
    self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                        QMatrix,
                                        queue_size=10)
    self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                            RobotMoveDBToBlock,
                                            queue_size=10)
    self.cnt = 0

    # Set up subscriber
    rospy.Subscriber("/q_learning/reward", QLearningReward,
                     self.reward_received)

    # Fetch the pre-built action matrix. This is a 2d numpy array where row
    # indexes correspond to the starting state and column indexes are the
    # next states.
    #
    # A value of -1 indicates that it is not possible to get to the next
    # state from the starting state. Values 0-8 correspond to which of the 9
    # actions is needed to go to the next state.
    #
    # e.g. self.action_matrix[0][12] = 5
    self.action_matrix = np.loadtxt(path_prefix + "action_matrix.txt")

    # Fetch actions. These are the only 9 possible actions the system can
    # take. self.actions is an array of dictionaries where the row index
    # corresponds to the action number, and the value has the following form:
    # { dumbbell: "red", block: 1}
    colors = ["red", "green", "blue"]
    self.actions = np.loadtxt(path_prefix + "actions.txt")
    self.actions = list(
        map(lambda x: {
            "dumbbell": colors[int(x[0])],
            "block": int(x[1])
        }, self.actions))

    # Fetch states. There are 64 states. Each row index corresponds to the
    # state number, and the value is a list of 3 items indicating the
    # positions of the red, green, and blue dumbbells respectively.
    # e.g. [[0, 0, 0], [1, 0, 0], [2, 0, 0], ..., [3, 3, 3]]
    # e.g. [0, 1, 2] indicates that the red dumbbell is at the origin, the
    # green dumbbell is at block 1, and the blue dumbbell is at block 2.
    # A value of 0 corresponds to the origin. 1/2/3 corresponds to the block
    # number. Note that not all states are possible to get to.
    self.states = np.loadtxt(path_prefix + "states.txt")
    self.states = list(
        map(lambda x: list(map(lambda y: int(y), x)), self.states))

    # Initialize the current state and keep track of the next state
    self.curr_state = 0
    self.next_state = 0

    # Initialize the current action index
    self.curr_action = 0

    # Initialize variables to define static status and keep track of how many
    # iterations the Q-matrix has remained static
    self.epsilon = 1
    self.static_iter_threshold = 500
    self.static_tracker = 0

    # Initialize and publish the Q-matrix
    self.q_matrix = QMatrix()
    self.initialize_q_matrix()
    self.q_matrix_pub.publish(self.q_matrix)

    # Now everything is initialized; sleep for 1 second to make sure
    self.initialized = True
    rospy.sleep(1)

    # Start with a random action
    self.select_random_action()
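The select_random_action call at the end of this constructor is not shown here. A minimal sketch, mirroring the allowed-action sampling used elsewhere in this section; the exact bookkeeping of curr_action and next_state is an assumption.

def select_random_action(self):
    # Hypothetical sketch: actions that can be taken from the current state
    # are the columns of this action-matrix row that are not -1
    row = self.action_matrix[self.curr_state]
    candidates = [(int(a), s) for s, a in enumerate(row) if a >= 0]
    if not candidates:
        return
    # Pick one allowed (action, next_state) pair at random
    self.curr_action, self.next_state = candidates[
        np.random.randint(len(candidates))]
    # Publish the corresponding phantom robot movement
    action = self.actions[self.curr_action]
    msg = RobotMoveDBToBlock()
    msg.robot_db = action["dumbbell"]
    msg.block_id = action["block"]
    self.robot_action_pub.publish(msg)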
def publish_qmatrix(self):
    matrix = QMatrix()
    matrix.header = Header(stamp=rospy.Time.now())
    matrix.q_matrix = self.q
    self.matrix_pub.publish(matrix)
def train_q_matrix(self):
    # Initialize the Q-matrix and a copy used for convergence checking
    self.q_matrix = [[0 for i in range(len(self.actions))]
                     for j in range(len(self.states))]
    self.last_checked_q_matrix = [[0 for i in range(len(self.actions))]
                                  for j in range(len(self.states))]
    current_state_num = 0
    self.reward_update = 0
    self.state_update = 0
    q_matrix_difference = 100
    iteration = 0

    while q_matrix_difference > 10:
        # Training process: generate a random allowed action from the
        # current state
        allowed_actions = []
        for state in range(len(self.states)):
            action_num = self.action_matrix[current_state_num][state]
            if action_num >= 0:
                allowed_actions.append((action_num, state))
        if len(allowed_actions) > 0:
            action_state_pair = allowed_actions[int(
                np.random.randint(0, len(allowed_actions)))]
            chosen_action_num = int(action_state_pair[0])
            action_dict = self.actions[chosen_action_num]
            action_msg = RobotMoveDBToBlock()
            action_msg.robot_db = action_dict['dumbbell']
            action_msg.block_id = action_dict['block']
            self.action_pub.publish(action_msg)

            # Wait until we get a reward, then update accordingly
            while self.reward_update == 0:
                time.sleep(1)
            # Reset the check for the reward callback
            self.reward_update = 0

            # Q-learning update: use the best Q-value of the next state
            next_state_num = action_state_pair[1]
            best_q_value = 0
            for action_q_value in self.q_matrix[next_state_num]:
                if action_q_value >= best_q_value:
                    best_q_value = action_q_value
            q_matrix_adjustment = self.reward + (
                self.discount_factor * best_q_value
            ) - self.q_matrix[current_state_num][chosen_action_num]
            self.q_matrix[current_state_num][chosen_action_num] += int(
                self.learning_rate * q_matrix_adjustment)

            # Publish the new q_matrix
            self.q_matrix_msg = QMatrix()
            for state_row in self.q_matrix:
                q_row = QMatrixRow()
                for q_value in state_row:
                    q_row.q_matrix_row.append(q_value)
                self.q_matrix_msg.q_matrix.append(q_row)
            self.q_matrix_pub.publish(self.q_matrix_msg)

            current_state_num = next_state_num
            iteration += 1
            print("Training iteration:", iteration)

            # Periodically check how much the Q-matrix has changed since the
            # last check to decide whether it has converged
            if (iteration % (len(self.states) * len(self.actions))) == 0:
                q_matrix_difference = 0
                for s_index, state in enumerate(self.q_matrix):
                    for a_index, value in enumerate(state):
                        q_matrix_difference += abs(
                            value -
                            self.last_checked_q_matrix[s_index][a_index])
                self.last_checked_q_matrix = copy.deepcopy(self.q_matrix)
                print("New q_matrix_difference:", q_matrix_difference)
        else:
            # No actions left from this state; reset to the initial state
            current_state_num = 0
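The busy-wait loop above depends on self.reward and self.reward_update being set by a reward callback that is not shown in this snippet. A minimal sketch of such a callback, assuming the QLearningReward message carries a reward field; the callback name here is an assumption.

def get_reward(self, data):
    # Hypothetical sketch: record the latest reward from the environment
    self.reward = data.reward
    # Signal train_q_matrix that a new reward has arrived
    self.reward_update = 1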