예제 #1
0
    def cluster_more(self, n):
        """Grow the cluster by up to ``n`` emails and return the emails added.

        The strategy depends on ``self.opt``:

        * ``'frequency' in self.opt``: emails are taken one at a time from the
          front of ``self.dist_list``. After each pick the cluster word
          frequencies are updated and ``self.update_dist_list()`` is called
          before the next pick. ``n`` is capped at ``len(self.dist_list)``.
        * otherwise: the cluster is rebuilt from the ``self.size`` nearest
          entries of ``self.dist_list`` (a prefix slice when
          ``self.sort_first`` is true, ``quickselect.k_smallest`` when not).
          If ``self.size + n`` exceeds ``len(self.dist_list)`` the size is
          truncated to ``len(self.dist_list)``.

        Every added email is also put into ``self.ham`` (``train`` is 1 or 3)
        or ``self.spam`` (``train`` is 0 or 2); other ``train`` values are
        recorded in neither.

        Args:
            n: number of additional emails requested.

        Returns:
            list: the emails added by this call. In frequency mode they are in
            the order they were picked; otherwise the order is that of set
            iteration.

        Raises:
            AssertionError: if the cluster size bookkeeping is inconsistent
                after the update.
        """
        if 'frequency' in self.opt:
            n = min(n, len(self.dist_list))
            # Single-argument print works on Python 2 and 3; the doubled
            # spaces reproduce the separator the print statement inserted.
            print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
                  % (n, self.size, self.opt))
            self.size += n

            new_elements = []
            while len(new_elements) < n:
                nearest = self.dist_list[0][1]  # closest remaining email
                new_elements.append(nearest)
                self.added.append(nearest)  # remember insertion order
                self.cluster_set.add(nearest)
                self.cluster_word_frequency = helpers.update_word_frequencies(
                    self.cluster_word_frequency, nearest)
                # Drop the email just taken, then refresh the distance list
                # against the updated word frequencies.
                del self.dist_list[0]
                self.update_dist_list()

            assert len(new_elements) == n, str(len(new_elements)) + " " + str(n)
            assert len(self.cluster_set) == self.size, (
                str(len(self.cluster_set)) + " " + str(self.size))
        else:
            old_cluster_set = self.cluster_set
            if self.size + n <= len(self.dist_list):
                self.size += n
            else:
                print("\nTruncating cluster size...\n")
                # NOTE(review): with an empty dist_list the size is left
                # untouched while the rebuilt cluster below is empty, so the
                # assertion fails unless size is already 0 - confirm intent.
                if len(self.dist_list) > 0:
                    self.size = len(self.dist_list)

            if self.sort_first:
                nearest_pairs = self.dist_list[:self.size]
            else:
                nearest_pairs = quickselect.k_smallest(self.dist_list, self.size)
            new_cluster_set = set(pair[1] for pair in nearest_pairs)

            new_elements = [email for email in new_cluster_set
                            if email not in old_cluster_set]
            self.cluster_set = new_cluster_set

            assert len(self.cluster_set) == self.size, len(self.cluster_set)

        # Shared tail: file each new email under ham or spam by its label.
        for msg in new_elements:
            if msg.train in (1, 3):
                self.ham.add(msg)
            elif msg.train in (0, 2):
                self.spam.add(msg)

        return new_elements
예제 #2
0
    def make_cluster(self):
        """Build the initial cluster and return it as a set of emails.

        ``self.size`` is first truncated to ``len(self.dist_list)`` if it is
        larger. The cluster is then chosen in one of three ways:

        * ``self.sort_first`` false: the ``self.size`` entries returned by
          ``quickselect.k_smallest(self.dist_list, self.size)``.
        * ``self.sort_first`` true, no ``'frequency'`` in ``self.opt``: the
          first ``self.size`` entries of ``self.dist_list``.
        * ``self.sort_first`` true with ``'frequency'`` in ``self.opt``: start
          from ``self.clustroid``, drop the first ``dist_list`` entry carrying
          the clustroid's tag, then repeatedly take the head of
          ``self.dist_list``, update the cluster word frequencies and call
          ``self.update_dist_list()`` until ``self.size`` emails are held.
          Picked emails are appended to ``self.added`` and removed from
          ``self.dist_list``.

        Returns:
            set: the emails forming the initial cluster.

        Raises:
            AssertionError: in frequency mode, if the nearest remaining email
                has the same tag as the clustroid.
        """
        if self.size > len(self.dist_list):
            print("\nTruncating cluster size...\n")
            self.size = len(self.dist_list)

        if not self.sort_first:
            return set(item[1]
                       for item in quickselect.k_smallest(self.dist_list, self.size))

        if 'frequency' not in self.opt:
            return set(item[1] for item in self.dist_list[:self.size])

        emails = [self.clustroid]  # cluster members, clustroid first

        # Drop the clustroid's own entry so it cannot be picked again. Deleting
        # by index removes exactly the entry found (no second scan by value);
        # the immediate break keeps the mutation safe during iteration.
        for index, (_, email) in enumerate(self.dist_list):
            if email.tag == self.clustroid.tag:
                del self.dist_list[index]
                # Two spaces reproduce the print statement's item separator.
                print("-> removed duplicate clustroid  %s" % (email.tag,))
                break

        while len(emails) < self.size:
            nearest = self.dist_list[0][1]  # closest remaining email
            assert nearest.tag != self.clustroid.tag, (
                str(nearest.tag) + " " + str(self.clustroid.tag))
            emails.append(nearest)
            self.added.append(nearest)  # track the order emails are added in
            self.cluster_word_frequency = helpers.update_word_frequencies(
                self.cluster_word_frequency, nearest)
            del self.dist_list[0]  # so the email is not added twice
            # Word frequencies changed, so the distance list is refreshed.
            self.update_dist_list()

        print("-> cluster initialized with size %s" % (len(emails),))
        return set(emails)
예제 #3
0
    def make_cluster(self):
        """Construct the initial cluster of emails and return it as a set.

        If ``self.size`` exceeds ``len(self.dist_list)`` it is truncated to
        that length first. Selection then follows one of three paths:

        * ``self.sort_first`` is false: the result of
          ``quickselect.k_smallest(self.dist_list, self.size)``.
        * ``self.sort_first`` is true and ``'frequency'`` is not in
          ``self.opt``: the leading ``self.size`` entries of
          ``self.dist_list``.
        * ``self.sort_first`` is true and ``'frequency'`` is in ``self.opt``:
          seed the cluster with ``self.clustroid``, remove the first
          ``dist_list`` entry whose tag equals the clustroid's, then keep
          taking the head of ``self.dist_list`` - updating the cluster word
          frequencies and calling ``self.update_dist_list()`` after each pick
          - until the cluster holds ``self.size`` emails. Each pick is
          appended to ``self.added`` and deleted from ``self.dist_list``.

        Returns:
            set: the emails in the initial cluster.

        Raises:
            AssertionError: in frequency mode, if the head of the distance
                list carries the clustroid's tag.
        """
        if self.size > len(self.dist_list):
            print("\nTruncating cluster size...\n")
            self.size = len(self.dist_list)

        if not self.sort_first:
            k_nearest = quickselect.k_smallest(self.dist_list, self.size)
            return set(item[1] for item in k_nearest)

        if 'frequency' not in self.opt:
            return set(item[1] for item in self.dist_list[:self.size])

        emails = [self.clustroid]  # list of added emails

        # Remove the clustroid's duplicate entry from the distance list.
        # Index-based deletion targets the matched entry directly instead of
        # re-searching the list by value; we break right after mutating.
        for position, (_, candidate) in enumerate(self.dist_list):
            if candidate.tag == self.clustroid.tag:
                del self.dist_list[position]
                # Double space matches the separator the print statement
                # placed between its two items.
                print("-> removed duplicate clustroid  %s" % (candidate.tag,))
                break

        while len(emails) < self.size:
            nearest = self.dist_list[0][1]  # get nearest email
            assert nearest.tag != self.clustroid.tag, (
                str(nearest.tag) + " " + str(self.clustroid.tag))
            emails.append(nearest)
            self.added.append(nearest)  # track order in which emails are added
            self.cluster_word_frequency = helpers.update_word_frequencies(
                self.cluster_word_frequency, nearest)
            del self.dist_list[0]  # so we don't add the email twice
            # The frequencies changed, so the distance list is refreshed
            # before the next pick.
            self.update_dist_list()

        print("-> cluster initialized with size %s" % (len(emails),))
        return set(emails)
예제 #4
0
    def cluster_more(self, n):
        """Expand the cluster by up to ``n`` emails and return the new ones.

        Behaviour depends on ``self.opt``:

        * When ``'frequency'`` is in ``self.opt``, ``n`` is capped at
          ``len(self.dist_list)`` and emails are taken one by one from the
          head of ``self.dist_list``. Each pick is added to
          ``self.cluster_set`` and ``self.added``, the cluster word
          frequencies are updated, the entry is deleted from the list and
          ``self.update_dist_list()`` is called.
        * Otherwise ``self.size`` grows by ``n`` (or is truncated to
          ``len(self.dist_list)`` when not enough entries exist) and the
          cluster is rebuilt from the ``self.size`` nearest entries: a prefix
          slice if ``self.sort_first`` is true, else
          ``quickselect.k_smallest``.

        New emails whose ``train`` is 1 or 3 are added to ``self.ham``; those
        whose ``train`` is 0 or 2 are added to ``self.spam``.

        Args:
            n: number of additional emails requested.

        Returns:
            list: emails added by this call (pick order in frequency mode,
            set-iteration order otherwise).

        Raises:
            AssertionError: if the size bookkeeping does not match the
                resulting cluster.
        """
        if 'frequency' in self.opt:
            n = min(n, len(self.dist_list))
            # One pre-formatted argument keeps this valid on Python 2 and 3;
            # doubled spaces mirror the print statement's item separator.
            print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
                  % (n, self.size, self.opt))
            self.size += n

            new_elements = []
            while len(new_elements) < n:
                nearest = self.dist_list[0][1]  # get nearest email
                new_elements.append(nearest)
                self.added.append(nearest)
                self.cluster_set.add(nearest)  # add to original cluster set
                self.cluster_word_frequency = helpers.update_word_frequencies(
                    self.cluster_word_frequency, nearest)
                # Remove the picked entry, then refresh the distance list
                # now that the word frequencies have changed.
                del self.dist_list[0]
                self.update_dist_list()

            assert len(new_elements) == n, str(len(new_elements)) + " " + str(n)
            assert len(self.cluster_set) == self.size, (
                str(len(self.cluster_set)) + " " + str(self.size))
        else:
            previous_members = self.cluster_set
            if self.size + n <= len(self.dist_list):
                self.size += n
            else:
                print("\nTruncating cluster size...\n")
                # NOTE(review): an empty dist_list leaves size unchanged while
                # the rebuilt cluster is empty, which trips the assertion
                # below unless size is 0 - confirm this is intended.
                if len(self.dist_list) > 0:
                    self.size = len(self.dist_list)

            if self.sort_first:
                closest = self.dist_list[:self.size]
            else:
                closest = quickselect.k_smallest(self.dist_list, self.size)
            rebuilt = set(item[1] for item in closest)

            new_elements = [item for item in rebuilt
                            if item not in previous_members]
            self.cluster_set = rebuilt

            assert len(self.cluster_set) == self.size, len(self.cluster_set)

        # Common to both modes: record each new email as ham or spam.
        for msg in new_elements:
            if msg.train in (1, 3):
                self.ham.add(msg)
            elif msg.train in (0, 2):
                self.spam.add(msg)

        return new_elements