def unlearn(self, cluster):
        """Unlearns a cluster from the ActiveUnlearner."""
        print "unlearning cluster of size ", len(cluster.cluster_set), " from au"
        if len(cluster.ham) + len(cluster.spam) != cluster.size:
            print "\nUpdating cluster ham and spam sets...\n"
            cluster.divide()

        h.unlearn([self.train_y, self.train_x, self.pol_y, self.pol_x], cluster.cluster_set)
 def divide_new_elements(self, messages, unlearn, original=None):
     """Unlearn or relearn the given emails against this object's data sets.

     Param: messages contains indices of emails to learn/unlearn
     Param: unlearn selects the operation: truthy -> ``h.unlearn``,
            falsy -> ``h.relearn``
     Param: original is forwarded unchanged to ``h.relearn`` and is unused
            when unlearning (its meaning is defined by that helper)
     """
     training_sets = [self.train_y, self.train_x, self.pol_y, self.pol_x]
     if not unlearn:
         h.relearn(training_sets, original, messages)
         return
     h.unlearn(training_sets, messages)
Пример #3
0
 def divide_new_elements(self, messages, unlearn, original=None):
     """Dispatch ``messages`` to ``h.unlearn`` or ``h.relearn``.

     Param: messages contains indices of emails to learn/unlearn
     Param: unlearn chooses the helper: truthy unlearns, falsy relearns
     Param: original is only used on the relearn path, where it is passed
            straight through to ``h.relearn``
     """
     data_sets = [self.train_y, self.train_x, self.pol_y, self.pol_x]
     if not unlearn:
         # Relearn path: the helper additionally needs ``original``.
         h.relearn(data_sets, original, messages)
         return
     h.unlearn(data_sets, messages)
Пример #4
0
    def unlearn(self, cluster):
        """Unlearns a cluster from the ActiveUnlearner."""
        print "unlearning cluster of size ", len(
            cluster.cluster_set), " from au"
        if len(cluster.ham) + len(cluster.spam) != cluster.size:
            print "\nUpdating cluster ham and spam sets...\n"
            cluster.divide()

        h.unlearn([self.train_y, self.train_x, self.pol_y, self.pol_x],
                  cluster.cluster_set)
Пример #5
0
def cluster_au(au, gold=True):
    """Cluster the training space of an ActiveUnlearner and return the clusters.

    The function works on deep copies of ``au.train_y``, ``au.train_x``,
    ``au.pol_y`` and ``au.pol_x`` (the "working set"); each cluster found is
    removed from that working set via ``h.unlearn`` until
    ``len(h.strip(pol_y)) + len(h.strip(train_y))`` reaches zero.

    Side effects on ``au``: ``au.get_mislabeled(update=True)`` is called and
    ``au.mislabeled_chosen`` is reset to an empty list.  Whatever
    ``au.select_initial``, ``determine_cluster`` and ``cluster_remaining`` do
    to ``au`` also applies.

    Args:
        au: the ActiveUnlearner whose training space is clustered.
        gold: forwarded unchanged to ``determine_cluster`` as ``gold=``.

    Returns:
        A list of ``[net_rate_change, cluster]`` pairs, sorted ascending with
        ``list.sort()`` (i.e. primarily by ``net_rate_change``).

    Raises:
        SystemExit: with exit status 1 if a clustering step returns ``None``.
    """

    print("\n----------------------Beginning the Clustering Process-----------------------\n")
    cluster_list = []  # list of [net_rate_change, cluster] pairs
    train_y = copy.deepcopy(au.train_y)
    train_x = copy.deepcopy(au.train_x)
    pol_y = copy.deepcopy(au.pol_y)
    pol_x = copy.deepcopy(au.pol_x)

    training = [train_y, train_x, pol_y, pol_x]  # create the working set

    original_training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    print("\nResetting mislabeled...\n")
    # Per the original author: an array of all false positives / false negatives.
    mislabeled = au.get_mislabeled(update=True)
    au.mislabeled_chosen = []  # reset set of clustered mislabeled emails in this instance of au

    print("\n Clustering...\n")
    # Only referenced by the disabled sanity check further down.
    pre_cluster_rate = au.current_detection_rate
    training_size = len(h.strip(pol_y)) + len(h.strip(train_y))
    while training_size > 0:  # loop until all emails in phantom training space have been assigned
        print("\n-----------------------------------------------------\n")
        print("\n" + str(training_size) + " emails out of " + str(original_training_size) +
              " still unclustered.\n")

        # Ask the unlearner for a seed until it yields one.
        # NOTE(review): this spins forever if select_initial keeps returning
        # None as the seed -- confirm that cannot happen.
        current_seed = None
        label = None
        while current_seed is None:
            label, init_pos, current_seed = au.select_initial(mislabeled, "weighted", training)

        if str(current_seed) == 'NO_CENTROIDS':
            # Sentinel seed: hand everything left over to cluster_remaining.
            cluster_result = cluster_remaining(au, training)
        else:
            # Per the original author: with gold true, clusters are relearned
            # after being returned.
            cluster_result = determine_cluster(current_seed, au, label, init_pos,
                                               working_set=training, gold=gold)
        if cluster_result is None:
            print("!!!How did this happen?????")
            # sys.exit(None) would report success (status 0); this is an
            # error path, so exit with a non-zero status instead.
            sys.exit(1)

        net_rate_change, cluster = cluster_result
        # After getting the cluster and net_rate_change, you relearn the cluster in original dataset if impact=True

        post_cluster_rate = au.current_detection_rate

        # make sure the cluster was properly relearned
        # assert(post_cluster_rate == pre_cluster_rate), str(pre_cluster_rate) + " " + str(post_cluster_rate)
        # print "cluster relearned successfully: au detection rate back to ", post_cluster_rate

        cluster_list.append([net_rate_change, cluster])

        print("\nRemoving cluster from shuffled training set...\n")

        # Remove the cluster from the working copies (not from au itself) and
        # re-measure how much is left to cluster.
        h.unlearn(training, cluster.cluster_set)
        training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    # Pairs compare element-wise, so ties on net_rate_change fall back to
    # comparing the cluster objects themselves.
    cluster_list.sort()  # sorts by net_rate_change
    print("\nClustering process done and sorted.\n")
    return cluster_list