Exemplo n.º 1
0
    def __init__(self, msg, size, active_unlearner, distance_opt, working_set=None, sort_first=True, separate=True):
        """Seed a cluster with ``msg`` and populate it right away.

        Args:
            msg: email used as the clustroid; its ``train`` attribute is read.
            size: stored as ``self.size`` (original note: arbitrarily set to 100).
            active_unlearner: the calling active-unlearner instance.
            distance_opt: distance option, stored as ``self.opt``; when it
                contains 'frequency', word-frequency bookkeeping is set up.
            working_set: stored unchanged as ``self.working_set``.
            sort_first: stored unchanged as ``self.sort_first``.
            separate: stored as ``self.separate`` and forwarded to
                ``self.distance_array``.
        """
        self.clustroid = msg  # seed of the cluster

        # Per the original notes, train values 1/3 are the ham sets and 0/2 the
        # spam sets. For any other value self.train is left unset, as before.
        for group in ((1, 3), (0, 2)):
            if msg.train in group:
                self.train = list(group)
                break

        # Empty containers that start out blank for every cluster.
        self.common_features = []
        self.msg_index = {}
        self.ham, self.spam = set(), set()

        # Configuration handed in by the caller.
        self.separate = separate
        self.size = size
        self.active_unlearner = active_unlearner  # points to the calling au instance
        self.sort_first = sort_first
        self.opt = distance_opt
        self.working_set = working_set

        # Frequency-based options additionally need the clustroid's word
        # frequencies and a record of the order in which emails are added.
        uses_frequency = 'frequency' in self.opt
        if uses_frequency:
            self.cluster_word_frequency = helpers.get_word_frequencies(self.clustroid)
            self.added = []

        # The order of these three calls matters: each step relies on the
        # state produced by the one before it.
        # Original note: distances from all emails in phantom space to the clustroid.
        self.dist_list = self.distance_array(self.separate)
        # Original note: adds the closest emails to the cluster.
        self.cluster_set = self.make_cluster()
        # Original note: adds the cluster emails to ham and spam.
        self.divide()
    def weighted_initial(self, working_set, mislabeled):
        if mislabeled is None: # Note that mislabeled is sorted in descending order by fabs(.50-email.prob)
            mislabeled = self.get_mislabeled()
        t_e = self.driver.tester.train_examples

        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        possible_centroids = list(mislabeled - self.mislabeled_chosen)

        print len(possible_centroids), " mislabeled emails remaining as possible cluster centroids" 
        if len(possible_centroids) == 0: #No more centers to select
            return NO_CENTROIDS
        else:
            possible_centroids.sort(key=lambda x: fabs(.50-x.prob), reverse=True)

            mislabeled_point = possible_centroids[0] # Choose most potent mislabeled email
            self.mislabeled_chosen.add(mislabeled_point)

            print "Chose the mislabeled point: ", mislabeled_point.tag
            print "Probability: ", mislabeled_point.prob

            init_email = None

            training = chain(t_e[0], t_e[1], t_e[2], t_e[3]) if working_set is None else working_set
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                mislabeled_point_frequencies = helpers.get_word_frequencies(mislabeled_point)
                for email in training:
                    current_distance = distance(email, mislabeled_point_frequencies, self.distance_opt)
                    if current_distance < min_distance:
                        init_email = email
                        min_distance = current_distance
            elif self.distance_opt == "intersection":
                min_distance = -1
                for email in training: # select closest email to randomly selected mislabeled test email
                    current_distance = distance(email, mislabeled_point, self.distance_opt)
                    if current_distance > min_distance:
                        init_email = email
                        min_distance = current_distance
            else:
                min_distance = sys.maxint
                for email in training: # select closest email to randomly selected mislabeled test email
                    current_distance = distance(email, mislabeled_point, self.distance_opt)
                    if current_distance < min_distance:
                        init_email = email
                        min_distance = current_distance
            print type(init_email)
            
            if init_email is None:
                print "Training emails remaining: ", training
            else:
                print "-> selected ", init_email.tag, " as cluster centroid with distance of ", min_distance, " from mislabeled point"

            return init_email
Exemplo n.º 3
0
    def __init__(self,
                 msg,
                 size,
                 active_unlearner,
                 distance_opt,
                 working_set=None,
                 sort_first=True,
                 separate=True):
        """Create a cluster seeded by ``msg`` and fill it immediately.

        Args:
            msg: email used as the clustroid; its ``train`` attribute is read.
            size: stored as ``self.size`` (original note: arbitrarily set
                to 100).
            active_unlearner: the calling active-unlearner instance.
            distance_opt: distance option, stored as ``self.opt``; when it
                contains 'frequency', word-frequency bookkeeping is set up.
            working_set: stored unchanged as ``self.working_set``.
            sort_first: stored unchanged as ``self.sort_first``.
            separate: stored as ``self.separate`` and forwarded to
                ``self.distance_array``.
        """
        self.clustroid = msg  # seed of the cluster

        # Per the original notes: 1/3 are the ham sets, 0/2 the spam sets.
        # Any other value leaves self.train unset, exactly as before.
        if msg.train in (1, 3):
            self.train = [1, 3]
        elif msg.train in (0, 2):
            self.train = [0, 2]

        # Caller-supplied configuration.
        self.opt = distance_opt
        self.size = size
        self.separate = separate
        self.sort_first = sort_first
        self.working_set = working_set
        self.active_unlearner = active_unlearner  # calling au instance

        # Containers that start out empty for every cluster.
        self.common_features = []
        self.msg_index = {}
        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            # Word frequencies of the clustroid, plus the order in which
            # emails get added.
            frequencies = helpers.get_word_frequencies(self.clustroid)
            self.cluster_word_frequency = frequencies
            self.added = []

        # Keep this order: each step relies on state set by the previous one.
        # Original note: distances from all emails in phantom space to the
        # clustroid.
        self.dist_list = self.distance_array(self.separate)
        # Original note: adds the closest emails to the cluster.
        self.cluster_set = self.make_cluster()
        # Original note: adds the cluster emails to ham and spam.
        self.divide()
Exemplo n.º 4
0
    def weighted_initial(self, working_set, mislabeled):
        if mislabeled is None:  # Note that mislabeled is sorted in descending order by fabs(.50-email.prob)
            mislabeled = self.get_mislabeled()
        t_e = self.driver.tester.train_examples

        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        possible_centroids = list(mislabeled - self.mislabeled_chosen)

        print len(
            possible_centroids
        ), " mislabeled emails remaining as possible cluster centroids"
        if len(possible_centroids) == 0:  #No more centers to select
            return NO_CENTROIDS
        else:
            possible_centroids.sort(key=lambda x: fabs(.50 - x.prob),
                                    reverse=True)

            mislabeled_point = possible_centroids[
                0]  # Choose most potent mislabeled email
            self.mislabeled_chosen.add(mislabeled_point)

            print "Chose the mislabeled point: ", mislabeled_point.tag
            print "Probability: ", mislabeled_point.prob

            init_email = None

            training = chain(t_e[0], t_e[1], t_e[2],
                             t_e[3]) if working_set is None else working_set
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                mislabeled_point_frequencies = helpers.get_word_frequencies(
                    mislabeled_point)
                for email in training:
                    current_distance = distance(email,
                                                mislabeled_point_frequencies,
                                                self.distance_opt)
                    if current_distance < min_distance:
                        init_email = email
                        min_distance = current_distance
            elif self.distance_opt == "intersection":
                min_distance = -1
                for email in training:  # select closest email to randomly selected mislabeled test email
                    current_distance = distance(email, mislabeled_point,
                                                self.distance_opt)
                    if current_distance > min_distance:
                        init_email = email
                        min_distance = current_distance
            else:
                min_distance = sys.maxint
                for email in training:  # select closest email to randomly selected mislabeled test email
                    current_distance = distance(email, mislabeled_point,
                                                self.distance_opt)
                    if current_distance < min_distance:
                        init_email = email
                        min_distance = current_distance
            print type(init_email)

            if init_email is None:
                print "Training emails remaining: ", training
            else:
                print "-> selected ", init_email.tag, " as cluster centroid with distance of ", min_distance, " from mislabeled point"

            return init_email