예제 #1
0
    def __init__(self, msg, size, active_unlearner, label, distance_opt, 
                working_set=None, separate=True):
        # Clustroid specs
        self.clustroid = msg[1] # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size # arbitrarily set to 100
        self.active_unlearner = active_unlearner # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[0] # actual vector representation of msg
            self.added = [] # keeps track of order emails are added

        self.dist_list = self.distance_array(self.separate) # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster() # adds closest emails to cluster
        self.divide() # adds cluster emails to ham and spam
예제 #2
0
 def update_dist_list(self, t=False): 
     """Updates self.dist_list for the frequency method"""
     if t:
         time_1 = time.time()
     indices = [train[1] for train in self.dist_list] # get array of indices
     self.dist_list = [(distance(self.vec_data_x[i], self.cluster_word_frequency, self.opt), i) for i in indices]
     self.dist_list.sort()
     if t:
         time_2 = time.time()
         print 'update_dist_list took: ', h.sec_to_english(time_2 - time_1)
예제 #3
0
파일: cluster.py 프로젝트: lsabc/KARMA
 def update_dist_list(self, t=False):
     """Updates self.dist_list for the frequency method"""
     if t:
         time_1 = time.time()
     indices = [train[1]
                for train in self.dist_list]  # get array of indices
     self.dist_list = [(distance(self.vec_data_x[i],
                                 self.cluster_word_frequency, self.opt), i)
                       for i in indices]
     self.dist_list.sort()
     if t:
         time_2 = time.time()
         print 'update_dist_list took: ', h.sec_to_english(time_2 - time_1)
예제 #4
0
파일: cluster.py 프로젝트: lsabc/KARMA
    def __init__(self,
                 msg,
                 size,
                 active_unlearner,
                 label,
                 distance_opt,
                 working_set=None,
                 separate=True):
        # Clustroid specs
        self.clustroid = msg[1]  # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size  # arbitrarily set to 100
        self.active_unlearner = active_unlearner  # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() -
                                                            time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[
                0]  # actual vector representation of msg
            self.added = []  # keeps track of order emails are added

        self.dist_list = self.distance_array(
            self.separate
        )  # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster(
        )  # adds closest emails to cluster
        self.divide()  # adds cluster emails to ham and spam