Exemplo n.º 1
0
    def cluster_more(self, n):
        """Grow the cluster by up to ``n`` emails and return the new ones.

        Only the 'frequency' method is handled. Emails are taken one at a
        time from the front of ``self.dist_list``; after each one the cluster
        word frequencies are updated and ``self.update_dist_list()`` is
        called before the next email is taken. Finally the new emails are
        passed to ``self.divide``.

        Args:
            n: Number of additional emails requested. A value larger than
                ``len(self.dist_list)`` is truncated to what is available;
                a negative value is treated as 0.

        Returns:
            List of the newly added email ids (keys into ``self.data_x``) in
            the order they were added, or ``None`` when 'frequency' is not in
            ``self.opt``.
        """
        if 'frequency' not in self.opt:
            # No other method is handled by this function.
            return None

        # Clamp the request to [0, available]. A negative n used to be added
        # to self.size and then trip the size assertion below.
        n = max(0, min(n, len(self.dist_list)))
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
              % (n, self.size, self.opt))
        self.size += n

        new_elements = []
        added = 0
        while added < n:
            _, i = self.dist_list.pop(0)  # get nearest email
            new_elements.append(i)  # add to new list
            self.added.append(i)  # track order in which emails are added
            self.cluster_set.add(i)  # add to original cluster set
            self.cluster_word_frequency = h.update_word_frequencies(
                self.cluster_word_frequency, self.data_x[i])
            # Word frequencies changed, so refresh the candidate list before
            # picking the next email.
            self.update_dist_list()
            added += 1
            if added % 10 == 0:
                print("%s / %s" % (added, n))

        # Internal invariants: exactly n emails were added and none of them
        # was already a member of the cluster.
        assert len(new_elements) == n, "%s %s" % (len(new_elements), n)
        assert len(self.cluster_set) == self.size, "%s %s" % (
            len(self.cluster_set), self.size)
        self.divide(new_elements)
        return new_elements
Exemplo n.º 2
0
    def make_cluster(self):
        """Construct the initial cluster of emails around ``self.clustroid``.

        Starting from the clustroid, emails are taken one at a time from the
        front of ``self.dist_list`` until the cluster holds ``self.size``
        emails. After each addition the cluster word frequencies are updated
        and ``self.update_dist_list()`` is called. If ``self.size`` exceeds
        ``len(self.dist_list)`` it is truncated to that length first.

        Returns:
            Set containing the clustroid and every email that was added.

        Raises:
            ValueError: If 'frequency' is not in ``self.opt``; no other
                method is implemented here.
        """
        if 'frequency' not in self.opt:
            # This case used to fall through to ``return set(emails)`` with
            # ``emails`` unbound and fail with an UnboundLocalError.
            raise ValueError(
                "make_cluster only supports the 'frequency' method, "
                "got opt=%r" % (self.opt,))

        # NOTE(review): with an empty dist_list this sets size to 0 while the
        # returned set still contains the clustroid - confirm that is intended.
        if self.size > len(self.dist_list):
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("\nTruncating cluster size...\n")
            self.size = len(self.dist_list)

        emails = [self.clustroid]  # list of added emails
        current_size = 1

        while current_size < self.size:
            _, i = self.dist_list.pop(0)  # get nearest email
            emails.append(i)  # add to list
            self.added.append(i)  # track order in which emails are added
            self.cluster_word_frequency = h.update_word_frequencies(
                self.cluster_word_frequency, self.data_x[i])
            # New cluster_word_frequency, so the closest emails need to be
            # re-sorted before the next pick.
            self.update_dist_list()
            if current_size % 10 == 0:
                print("%s / %s" % (current_size, self.size))
            current_size += 1

        print("-> cluster initialized with size %s" % len(emails))
        return set(emails)
Exemplo n.º 3
0
    def make_cluster(self):
        """Construct the initial cluster of emails around ``self.clustroid``.

        Starting from the clustroid, emails are taken one at a time from the
        front of ``self.dist_list`` until the cluster holds ``self.size``
        emails. After each addition the cluster word frequencies are updated
        and ``self.update_dist_list()`` is called. If ``self.size`` exceeds
        ``len(self.dist_list)`` it is truncated to that length first.

        Returns:
            Set containing the clustroid and every email that was added.

        Raises:
            ValueError: If 'frequency' is not in ``self.opt``; no other
                method is implemented here.
        """
        if 'frequency' not in self.opt:
            # This case used to fall through to ``return set(emails)`` with
            # ``emails`` unbound and fail with an UnboundLocalError.
            raise ValueError(
                "make_cluster only supports the 'frequency' method, "
                "got opt=%r" % (self.opt,))

        # NOTE(review): with an empty dist_list this sets size to 0 while the
        # returned set still contains the clustroid - confirm that is intended.
        if self.size > len(self.dist_list):
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("\nTruncating cluster size...\n")
            self.size = len(self.dist_list)

        emails = [self.clustroid]  # list of added emails
        current_size = 1

        while current_size < self.size:
            _, i = self.dist_list.pop(0)  # get nearest email
            emails.append(i)  # add to list
            self.added.append(i)  # track order in which emails are added
            self.cluster_word_frequency = h.update_word_frequencies(
                self.cluster_word_frequency, self.data_x[i])
            # New cluster_word_frequency, so the closest emails need to be
            # re-sorted before the next pick.
            self.update_dist_list()
            if current_size % 10 == 0:
                print("%s / %s" % (current_size, self.size))
            current_size += 1

        print("-> cluster initialized with size %s" % len(emails))
        return set(emails)
Exemplo n.º 4
0
    def cluster_more(self, n):
        """Grow the cluster by up to ``n`` emails and return the new ones.

        Only the 'frequency' method is handled. Emails are taken one at a
        time from the front of ``self.dist_list``; after each one the cluster
        word frequencies are updated and ``self.update_dist_list()`` is
        called before the next email is taken. Finally the new emails are
        passed to ``self.divide``.

        Args:
            n: Number of additional emails requested. A value larger than
                ``len(self.dist_list)`` is truncated to what is available;
                a negative value is treated as 0.

        Returns:
            List of the newly added email ids (keys into ``self.data_x``) in
            the order they were added, or ``None`` when 'frequency' is not in
            ``self.opt``.
        """
        if 'frequency' not in self.opt:
            # No other method is handled by this function.
            return None

        # Clamp the request to [0, available]. A negative n used to be added
        # to self.size and then trip the size assertion below.
        n = max(0, min(n, len(self.dist_list)))
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
              % (n, self.size, self.opt))
        self.size += n

        new_elements = []
        added = 0
        while added < n:
            _, i = self.dist_list.pop(0)  # get nearest email
            new_elements.append(i)  # add to new list
            self.added.append(i)  # track order in which emails are added
            self.cluster_set.add(i)  # add to original cluster set
            self.cluster_word_frequency = h.update_word_frequencies(
                self.cluster_word_frequency,
                self.data_x[i])  # update word frequencies
            # Word frequencies changed, so refresh the candidate list before
            # picking the next email.
            self.update_dist_list()
            added += 1
            if added % 10 == 0:
                print("%s / %s" % (added, n))

        # Internal invariants: exactly n emails were added and none of them
        # was already a member of the cluster.
        assert len(new_elements) == n, "%s %s" % (len(new_elements), n)
        assert len(self.cluster_set) == self.size, "%s %s" % (
            len(self.cluster_set), self.size)
        self.divide(new_elements)
        return new_elements