def convert_send_qmatrix_msg(self):  # Converts numpy array to QMatrix msg
        self.q_matrix_msg = QMatrix()
        for i in range(len(self.q_matrix_arr)):
            row = QMatrixRow()
            # Convert the numpy row to a plain list of ints, matching the QMatrixRow field type
            row.q_matrix_row = self.q_matrix_arr[i].astype(int).tolist()
            self.q_matrix_msg.q_matrix.append(row)

        # Publish Q-matrix message to Q-matrix topic
        self.q_matrix_pub.publish(self.q_matrix_msg)
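
The convert_send_qmatrix_msg example above relies on node setup that is not shown. The following is a minimal sketch of what that setup might look like; the package name q_learning_project, the 64x9 matrix shape, and the latched publisher are illustrative assumptions, not part of the original example.

import rospy
import numpy as np
from q_learning_project.msg import QMatrix, QMatrixRow  # package name is an assumption

class QLearning(object):
    def __init__(self):
        rospy.init_node("q_learning")
        # Assumed shape: 64 states x 9 actions, initialized to zeros
        self.q_matrix_arr = np.zeros((64, 9))
        # Latched so a subscriber that starts late still receives the most recent matrix
        self.q_matrix_pub = rospy.Publisher(
            "/q_learning/q_matrix", QMatrix, queue_size=10, latch=True)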
Example #2
 def init_q_matrix(self):
     ''' initializes the q matrix with all zeros'''
     m = QMatrix()
     m.q_matrix = []
     for _ in range(len(self.states)):
         row = QMatrixRow()
         row.q_matrix_row = [0] * len(self.actions)
         m.q_matrix.append(row)
     return m
Example #3
 def publish_qmatrix(self):
     matrix = QMatrix()
     matrix.q_matrix = []
     for x in range(0, 64):
         row = self.q[x]
         matrixrow = QMatrixRow()
         matrixrow.q_matrix_row = row
         matrix.q_matrix.append(matrixrow)
     matrix.header = Header(stamp=rospy.Time.now())
     self.matrix_pub.publish(matrix)
Example #4
    def publish_q_matrix(self):
        """ Converts self.q_matrix (a numpy array)
            into the form of a QMatrix object
        """
        matrix = QMatrix()
        matrix.q_matrix = []
        for q_row in self.q_matrix:
            row = QMatrixRow()
            row.q_matrix_row = q_row.astype(np.int16).tolist()
            matrix.q_matrix.append(row)
        self.q_pub.publish(matrix)

        filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
        np.savetxt(filename, self.q_matrix, delimiter=" ")
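
For completeness, the matrix saved above can be read back with the matching NumPy call. This is a sketch (the helper name load_q_matrix is hypothetical), assuming the same path and whitespace delimiter as above:

    def load_q_matrix(self):
        # Reload the matrix written by publish_q_matrix(); np.loadtxt returns floats, cast if needed
        filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
        self.q_matrix = np.loadtxt(filename, delimiter=" ")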
Example #5
    def publish_qmat(self):
        """
        Converts self.qmat into a QMatrix() object and 
        publishes it to /q_learning/q_matrix
        """
        qmat_to_pub = QMatrix()
        qmat_to_pub.header = Header(stamp=rospy.Time.now())
        rows = []
        for row in self.qmat:
            qmat_row = QMatrixRow()
            qmat_row.q_matrix_row = list(row)
            rows.append(qmat_row)

        qmat_to_pub.q_matrix = rows
        self.qmat_pub.publish(qmat_to_pub)
Example #6
    def get_reward(self, reward):
        """
        When a reward is received, update self.q_matrix,
        and then check for convergence. If training has not
        converged, run another iteration.
        """

        # The actual Q Learning algorithm
        self.q_matrix[self.current_state][self.current_action] = self.q_matrix[
            self.current_state][self.current_action] + self.alpha * (
                reward.reward +
                self.gamma * max(self.q_matrix[self.next_state]) -
                self.q_matrix[self.current_state][self.current_action])
        self.previous_matrices.append(copy.deepcopy(self.q_matrix))
        self.current_state = self.next_state

        # Publish the matrix for grading
        matrix_msg = QMatrix()
        matrix_msg.q_matrix = [
            QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist()
        ]
        self.matrix_publisher.publish(matrix_msg)

        #check if q-matrix has converged

        #if not, train another iteration
        if not self.is_converged(reward.iteration_num):
            self.train_one_iteration()
        #if so, stop training, save q matrix to file, and exit
        else:
            rospy.logfatal("converged")
            rospy.loginfo(self.q_matrix)
            self.save_q_matrix()
            sys.exit(0)
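
For reference, the update at the top of get_reward implements the standard Q-learning rule, with learning rate \alpha = self.alpha and discount factor \gamma = self.gamma:

    Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \right]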
Example #7
    def initialize_q_matrix(self):
        for i in range(0, len(self.all_states)):
            qrow = QMatrixRow()

            for j in range(0, len(self.all_actions)):
                qrow.q_matrix_row.append(0)

            self.q_matrix.q_matrix.append(qrow)
Example #8
    def initialize_q_matrix(self):
        h = Header()
        h.stamp = rospy.Time.now()
        h.frame_id = "q_matrix"
        self.actual_q_matrix = QMatrix()
        self.actual_q_matrix.header = h
        # 64 different states = 64 rows, with 9 possible actions each (columns).
        # Build a fresh QMatrixRow per state so the rows do not all alias one object.
        self.actual_q_matrix.q_matrix = []
        for _ in range(64):
            temp = QMatrixRow()
            temp.q_matrix_row = [0] * 9
            self.actual_q_matrix.q_matrix.append(temp)

        # This variable is used to determine whether the matrix has converged
        self.prev_q_matrix = copy.deepcopy(self.actual_q_matrix)

        # Publish the initial, empty q_matrix to the topic
        self.q_matrix_pub.publish(self.actual_q_matrix)
Example #9
    def initialize_q_matrix(self):
        """ Initialize the Q-matrix with all 0s to start """

        # Loop over 64 rows and 9 columns to set up the matrix
        for i in range(len(self.states)):
            q_matrix_row = QMatrixRow()
            for j in range(len(self.actions)):
                q_matrix_row.q_matrix_row.append(0)
            self.q_matrix.q_matrix.append(q_matrix_row)
Example #10
 def initialize_q_matrix(self):
     # loop over 64 rows and 9 cols putting 0 in each
     for i in range(64):
         x = QMatrixRow()
         for j in range(9):
             x.q_matrix_row.append(0)
         self.q_matrix.q_matrix.append(x)
     # publish q matrix
     self.q_matrix_pub.publish(self.q_matrix)
Example #11
    def get_reward(self, data):
        """
        Calculate reward according to q learning algorithm. Check if q matrix has converged,
        and handle accordingly if so.
        """
        # Calculate the max reward for next state
        next_reward = max(self.q_matrix[self.next_state])

        # Calculate reward currently written in q matrix for our current state/action pair.
        current_reward = self.q_matrix[self.curr_state][self.curr_action]

        # Calculate a new reward using the q learning algorithm
        new_reward = current_reward + self.alpha * (
            data.reward + self.gamma * next_reward - current_reward)

        # If the newly calculated reward is the same as what's already written in the q matrix,
        # we count one instance of convergence. If the count reaches self.convergence_max
        # consecutive instances, we consider the q matrix converged and save it to CSV.
        if current_reward == new_reward:
            self.convergence_count += 1
            if self.convergence_count == self.convergence_max:
                self.save_q_matrix()
                return
        # If not converged, write the new reward in the q matrix and publish the updated q matrix.
        else:
            self.convergence_count = 0
            self.q_matrix[self.curr_state][self.curr_action] = new_reward

            qm = QMatrix()
            temp = []
            # could potentially build each QMatrixRow more directly, e.g. temp.append(QMatrixRow(row))
            for row in self.q_matrix:
                r = QMatrixRow()
                r.q_matrix_row = row
                temp.append(r)

            qm.header = Header(stamp=rospy.Time.now())
            qm.q_matrix = temp
            self.matrix_pub.publish(qm)

        # Move to next state, and call another action to be taken.
        self.curr_state = self.next_state
        self.do_action()
Example #12
    def recieved_reward(self, data):
        
        # update new qmatrix
        self.qmatrix[self.curr_state][self.curr_action] = (
            self.qmatrix[self.curr_state][self.curr_action]
            + self.alpha * (data.reward
                            + self.gamma * max(self.qmatrix[self.next_state])
                            - self.qmatrix[self.curr_state][self.curr_action]))
        
        # create a new qmatrix ms type and translate our qmatrix into it
        new_qm = QMatrix()
        temp_qm = []
        for row in self.qmatrix:
            temp_array = []
            for val in row:
                temp_array.append(int(val))
            temp_qm.append(QMatrixRow(temp_array))
        new_qm.q_matrix = temp_qm
        # publish qmatrix
        self.qmatrix_pub.publish(new_qm)

        # calculate difference between current and previous qmatrix
        # set variables to store total diff between matrices
        percent_delta = 0
        prev = 0
        curr = 0
        # iterate through prev and current qmatrices and calculate difference
        for i, row in enumerate(self.qmatrix):
            for j, val in enumerate(row):
                prev += int(self.previous_matrix[i][j])
                curr += int(val)
        # if there is a meaningful diff, update our prevs array
        if curr != 0:
            percent_delta = curr - prev
            if(data.reward > 0):
                self.prevs[self.index % 25] = percent_delta
                self.index += 1
                 
                # store this qmatrix in our previous_matrix variable
                self.previous_matrix = copy.deepcopy(self.qmatrix)

        # update our current state
        self.curr_state = self.next_state

        # calculate avg delta of the last 25 meaningful changes to decide whether matrix has converged or not
        avg_delta = np.mean(self.prevs)
        # if matrix hasn't converged, compute another iteration
        if (data.iteration_num < 300 or avg_delta > 1): 
            self.run()
        # save matrix once it has converged
        else:
            self.save_q_matrix()        
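
The convergence bookkeeping in recieved_reward depends on a few members that are initialized elsewhere. A minimal sketch of the assumed initialization follows; the starting values are assumptions chosen so the average delta stays above the threshold until real updates arrive:

        # Assumed to be set in __init__ before training starts:
        self.prevs = [float("inf")] * 25                    # rolling window of the last 25 deltas
        self.index = 0                                      # next write position into self.prevs
        self.previous_matrix = copy.deepcopy(self.qmatrix)  # snapshot used to compute deltas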
Example #13
    def train_q_matrix(self):
        # initialize q_matrix
        self.q_matrix = [[0 for i in range(len(self.actions))]
                         for j in range(len(self.states))]
        self.last_checked_q_matrix = [[0 for i in range(len(self.actions))]
                                      for j in range(len(self.states))]

        current_state_num = 0
        self.reward_update = 0
        self.state_update = 0
        q_matrix_difference = 100
        iteration = 0
        while q_matrix_difference > 10:
            # training process
            # generate a random action

            allowed_actions = []
            for state in range(len(self.states)):
                action_num = self.action_matrix[current_state_num][state]
                if action_num >= 0:
                    allowed_actions.append((action_num, state))
            if len(allowed_actions) > 0:
                action_state_pair = allowed_actions[int(
                    np.random.randint(0, len(allowed_actions)))]
                chosen_action_num = int(action_state_pair[0])
                action_dict = self.actions[chosen_action_num]
                action_msg = RobotMoveDBToBlock()
                action_msg.robot_db = action_dict['dumbbell']
                action_msg.block_id = action_dict['block']
                self.action_pub.publish(action_msg)
                # wait until we get reward and update accordingly
                while self.reward_update == 0:
                    time.sleep(1)
                # reset check for reward callback
                self.reward_update = 0

                next_state_num = action_state_pair[1]
                best_q_value = 0
                for action_q_value in self.q_matrix[next_state_num]:
                    if action_q_value >= best_q_value:
                        best_q_value = action_q_value
                q_matrix_adjustment = self.reward + (
                    self.discount_factor * best_q_value
                ) - self.q_matrix[current_state_num][chosen_action_num]
                self.q_matrix[current_state_num][chosen_action_num] += int(
                    self.learning_rate * q_matrix_adjustment)

                # publish new q_matrix
                self.q_matrix_msg = QMatrix()
                for state_row in self.q_matrix:
                    q_row = QMatrixRow()
                    for q_value in state_row:
                        q_row.q_matrix_row.append(q_value)
                    self.q_matrix_msg.q_matrix.append(q_row)
                self.q_matrix_pub.publish(self.q_matrix_msg)
                current_state_num = next_state_num

                iteration += 1
                print("Training iteration:", iteration)
                if (iteration % (len(self.states) * len(self.actions))) == 0:
                    q_matrix_difference = 0
                    for s_index, state in enumerate(self.q_matrix):
                        for a_index, value in enumerate(state):
                            q_matrix_difference += abs(
                                value -
                                self.last_checked_q_matrix[s_index][a_index])
                    self.last_checked_q_matrix = copy.deepcopy(self.q_matrix)
                    print("New q_matrix_difference:", q_matrix_difference)
            else:  # no actions left
                current_state_num = 0
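
Once the while loop exits (q_matrix_difference has dropped to 10 or below), the trained matrix could be written to disk in the same spirit as Example #4. A sketch, assuming os is imported and using an illustrative path:

        # after the training loop: persist the converged Q-matrix (path is illustrative)
        filename = os.path.dirname(__file__) + "/q_matrix.csv"
        np.savetxt(filename, np.array(self.q_matrix), delimiter=",")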