示例#1
0
    def classify(self, data_to_classify):
        """Label each row of ``data_to_classify`` by a vote among its nearest neighbours.

        Every row is copied into a new ``data_point`` (created with an empty
        string as placeholder class).  For each of those points the neighbours
        returned by ``self.get_k_nearest`` are consulted: the last element of
        each of the first ``self.k`` neighbours is cast to ``int`` and treated
        as a class label, and the label seen most often is stored in the
        point's ``class_type``.  When several labels are tied, the smallest
        one wins (``np.argmax`` returns the first maximum).

        Args:
            data_to_classify: iterable of sliceable rows.

        Returns:
            list: the newly created ``data_point`` objects, in input order,
            each with ``class_type`` set to the winning label.

        Raises:
            IndexError: if ``self.get_k_nearest`` yields fewer than ``self.k``
                neighbours.
            ValueError: if a neighbour carries a negative label (rejected by
                ``np.bincount``).
        """
        # Turn the input dataset into a list of (not yet classified) points.
        points_to_classify = [
            data_point(line[:], '') for line in data_to_classify
        ]

        for point in points_to_classify:
            nearest = self.get_k_nearest(point)
            # Labels are cast to int so they can be used as vote-bin indices.
            labels = np.asarray(
                [int(nearest[i][-1]) for i in range(self.k)], dtype=np.intp)
            # bincount sizes the tally from the largest label actually seen,
            # so there is no fixed upper bound on label values; minlength=1
            # keeps the k == 0 case returning label 0.
            votes = np.bincount(labels, minlength=1)
            point.class_type = np.argmax(votes)

        return points_to_classify
示例#2
0
 def find_average(self, points):
     """Return a new ``data_point`` placed at the rounded mean of ``points``.

     Each coordinate of the result is the mean of that coordinate over all
     ``points``, rounded to the nearest integer; the class of the result is
     the rounded mean of the points' ``class_type`` values.

     Args:
         points: non-empty sequence of objects exposing ``data`` (indexable
             coordinates) and ``class_type`` (numeric).

     Returns:
         A ``data_point`` built from an integer NumPy array of averaged
         coordinates and the integer averaged class.

     Raises:
         IndexError: if ``points`` is empty.
     """
     n_points = len(points)
     n_dims = len(points[0].data)
     new_data = np.full(n_dims, 0)  # integer array holding the final position
     for i in range(n_dims):
         # Sum outside the integer array: adding into new_data[i] directly
         # would truncate every float coordinate before the mean is taken.
         total = sum(p.data[i] for p in points)
         new_data[i] = int(round(total / n_points))
     new_class = int(round(sum(p.class_type for p in points) / n_points))
     return data_point(new_data, new_class)  # the new centroid
示例#3
0
文件: PAM.py 项目: asalois/csci_447
    def recompute(self):
        """Refine the medoids in ``self.pam_map`` by greedy medoid/point swaps.

        Runs at most ``int(0.1 * len(self.d_set))`` passes (tunable).  Each pass:

        1. assigns every point of ``self.d_map.points`` to the medoid with the
           smallest ``self.euclidian`` distance and sums those distances into
           the pass distortion;
        2. for every medoid and every assigned point, exchanges the two and
           keeps the exchange only if the estimated distortion gets strictly
           smaller, otherwise undoes it.

        The loop ends early when there are no medoids, when the distortion
        reaches zero, or when a pass lowers the distortion by less than 1%
        relative to the previous pass.  Afterwards ``self.d_map.points`` is
        replaced by ``self.pam_map``.

        NOTE: the cost of a candidate swap is an estimate.  Only the entries
        assigned to the replaced medoid are re-measured (against the
        candidate); all other entries keep the distance recorded for them.
        """
        # Max passes (if the medoids don't stop changing): 10% of the dataset.
        max_passes = int(0.1 * len(self.d_set))
        old_distortion = float('inf')

        while max_passes > 0 and len(self.pam_map) > 0:
            # --- assign points to medoids ---------------------------------
            # Each entry is [point, distance to its medoid, medoid].
            data_distort_med = []
            distortion = 0  # restarted every pass so it holds only this pass's sum
            for x in self.d_map.points:
                medoid_assigned_to = None
                shortest_distance = float('inf')
                for m in self.pam_map:  # find the closest medoid
                    dist = self.euclidian(x, m)
                    if dist < shortest_distance:
                        medoid_assigned_to = m
                        shortest_distance = dist
                data_distort_med.append(
                    [x, shortest_distance, medoid_assigned_to])
                distortion += shortest_distance

            # --- try every medoid/point exchange ----------------------------
            for m in range(len(self.pam_map)):
                for x in range(len(data_distort_med)):
                    # Tentatively swap the medoid with the candidate point.
                    old_medoid = self.pam_map[m]
                    candidate = data_distort_med[x][0]
                    self.pam_map[m] = candidate
                    data_distort_med[x][0] = old_medoid

                    # Estimate the distortion after the swap, starting from
                    # zero for every candidate.
                    distortion_prime = 0
                    reassigned = []  # (entry, new distance) for re-measured entries
                    for entry in data_distort_med:
                        if entry[-1] != old_medoid:
                            # Medoid untouched, so the recorded distance stands.
                            distortion_prime += entry[1]
                        else:
                            new_dist = self.euclidian(candidate, entry[0])
                            reassigned.append((entry, new_dist))
                            distortion_prime += new_dist

                    if distortion <= distortion_prime:
                        # No improvement: swap back.
                        self.pam_map[m] = old_medoid
                        data_distort_med[x][0] = candidate
                    else:
                        # Keep the swap and make the bookkeeping match it, so
                        # later candidates are compared against current data.
                        distortion = distortion_prime
                        for entry, new_dist in reassigned:
                            entry[1] = new_dist
                            entry[-1] = candidate

            if distortion == 0:
                break  # perfect fit; also avoids dividing by zero below
            if ((old_distortion - distortion) / distortion) < 0.01:
                break
            max_passes -= 1
            old_distortion = distortion
        self.d_map.points = self.pam_map
示例#4
0
    def regression(self, data_to_regress):
        """Predict a rounded value for every row of ``data_to_regress``.

        Each row is copied into a new ``data_point`` (created with an empty
        string as placeholder class).  For every such point the neighbours
        from ``self.get_k_nearest`` are ordered by their element at index 1,
        largest first; the last elements of the first ``self.k`` of them are
        averaged, and the average, rounded to an int, is stored in the
        point's ``class_type``.

        Args:
            data_to_regress: iterable of sliceable rows.

        Returns:
            list: the newly created ``data_point`` objects, in input order.
        """
        points_to_regress = [
            data_point(row[:], '') for row in data_to_regress
        ]

        for point in points_to_regress:
            ranked = sorted(
                self.get_k_nearest(point),
                key=lambda entry: entry[1],
                reverse=True,
            )
            # Sum the last element of the first k ranked neighbours.
            total = sum(ranked[idx][-1] for idx in range(self.k))
            point.class_type = int(round(total / self.k))

        return points_to_regress
示例#5
0
 def mini_gen(self, data_in):  # Makes a new list of points
     """Build a list of ``data_point`` objects from ``data_in``.

     For every row, all elements but the last are passed to ``data_point``
     as its first argument and the last element as its second.

     Args:
         data_in: iterable of sliceable rows.  When ``None``, the rows of
             ``self.d_set`` are used instead.

     Returns:
         list: one ``data_point`` per row, in row order.
     """
     # The argument used to be ignored in favour of self.d_set; honour it,
     # and keep self.d_set as the fallback for callers that pass None.
     rows = self.d_set if data_in is None else data_in
     return [data_point(line[:-1], line[-1]) for line in rows]
示例#6
0
 def generate(self):
     """Populate ``self.d_map`` with a ``point_map`` built from ``self.d_set``.

     Every row of ``self.d_set`` becomes one ``data_point``: all elements
     except the last are its first argument, the last element its second.
     """
     self.d_map = point_map(
         [data_point(row[:-1], row[-1]) for row in self.d_set])