def weighted_initial(self, working_set, mislabeled): print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) print len(mislabeled), " mislabeled emails remaining as possible cluster centroids" if len(mislabeled) == 0: #No more centers to select return (None, None, 'NO_CENTROIDS') else: prob, mislabeled_point = mislabeled.pop(0) # Choose most potent mislabeled email self.mislabeled_chosen.append(mislabeled_point) print "Chose the mislabeled point with z = ", prob data_y, data_x = h.compose_set(working_set) vec_data_x = vectorize_set(data_x) init_email = None init_pos = None label = None if "frequency" in self.distance_opt: min_distance = sys.maxint for i,email_indices in enumerate(vec_data_x): if None not in email_indices: # actual data current_distance = distance(email_indices, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = data_x[i] init_pos = i min_distance = current_distance if init_email is None: print "Training emails remaining: ", len(data_x) else: label = data_y[init_pos] print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point" return (label, init_pos, init_email)
def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True): # Clustroid specs self.clustroid = msg[1] # index of msg self.label = label self.common_features = [] self.separate = separate self.size = size # arbitrarily set to 100 self.active_unlearner = active_unlearner # point to calling au instance self.opt = distance_opt # The data self.working_set = working_set self.train_y = self.working_set[0] self.train_x = self.working_set[1] self.pol_y = self.working_set[2] self.pol_x = self.working_set[3] self.data_y, self.data_x = h.compose_set(self.working_set) time_1 = time.time() self.vec_data_x = vectorize_set(self.data_x) print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1) self.ham = set() self.spam = set() if 'frequency' in self.opt: self.cluster_word_frequency = msg[0] # actual vector representation of msg self.added = [] # keeps track of order emails are added self.dist_list = self.distance_array(self.separate) # returns list containing dist from all emails in phantom space to center clustroid self.cluster_set = self.make_cluster() # adds closest emails to cluster self.divide() # adds cluster emails to ham and spam
def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True): # Clustroid specs self.clustroid = msg[1] # index of msg self.label = label self.common_features = [] self.separate = separate self.size = size # arbitrarily set to 100 self.active_unlearner = active_unlearner # point to calling au instance self.opt = distance_opt # The data self.working_set = working_set self.train_y = self.working_set[0] self.train_x = self.working_set[1] self.pol_y = self.working_set[2] self.pol_x = self.working_set[3] self.data_y, self.data_x = h.compose_set(self.working_set) time_1 = time.time() self.vec_data_x = vectorize_set(self.data_x) print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1) self.ham = set() self.spam = set() if 'frequency' in self.opt: self.cluster_word_frequency = msg[ 0] # actual vector representation of msg self.added = [] # keeps track of order emails are added self.dist_list = self.distance_array( self.separate ) # returns list containing dist from all emails in phantom space to center clustroid self.cluster_set = self.make_cluster( ) # adds closest emails to cluster self.divide() # adds cluster emails to ham and spam
def weighted_initial(self, working_set, mislabeled): print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) print len( mislabeled ), " mislabeled emails remaining as possible cluster centroids" if len(mislabeled) == 0: #No more centers to select return (None, None, 'NO_CENTROIDS') else: prob, mislabeled_point = mislabeled.pop( 0) # Choose most potent mislabeled email self.mislabeled_chosen.append(mislabeled_point) print "Chose the mislabeled point with z = ", prob data_y, data_x = h.compose_set(working_set) vec_data_x = vectorize_set(data_x) init_email = None init_pos = None label = None if "frequency" in self.distance_opt: min_distance = sys.maxint for i, email_indices in enumerate(vec_data_x): if None not in email_indices: # actual data current_distance = distance(email_indices, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = data_x[i] init_pos = i min_distance = current_distance if init_email is None: print "Training emails remaining: ", len(data_x) else: label = data_y[init_pos] print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point" return (label, init_pos, init_email)