def pre_training_without_SMOTE(self, pre_training_ratio):
    """Relabel the most-confidently-negative unlabeled samples, without SMOTE.

    Fits an ElasticNet on the current (binarized) labels, scores every sample,
    and marks the `pre_training_ratio` fraction of lowest-scoring samples as
    negative (0), skipping samples that were originally positive.

    Args:
        pre_training_ratio: fraction of the unlabeled (-1) pool to relabel
            as negative, in [0, 1].

    Returns:
        The first `self.length` entries of the updated label array
        (labels are -1 unlabeled, 0 negative, 1 positive).
    """
    clf = ElasticNet(max_iter=10000)
    # Remember which samples are already known-positive so we never relabel them.
    # NOTE(review): assumes self.labels is a numpy array at this point.
    positive_index = np.where(self.labels == 1)
    positive_set = set(positive_index[0].tolist())  # O(1) membership in the loop below
    # Temporarily map unlabeled (-1) -> 0 so the regressor sees a {0, 1} target.
    self.labels = [0 if label == -1 else label for label in self.labels]
    # How many samples we are allowed to mark negative.
    label_negative_count = int(pre_training_ratio * np.bincount(self.labels)[0])
    # Fit the pre-training model on the provisional labels.
    clf.fit(self.samples, self.labels)
    # Continuous scores; for a fitted linear model predict(X) == X @ coef_ + intercept_,
    # which is what the private _decision_function used to return.
    distances = clf.predict(self.samples)
    distances_copy = distances.copy()
    distances_copy.sort()
    # Score threshold below which a sample is considered reliably negative.
    negative_threshold = distances_copy[label_negative_count]
    # Restore -1 for unlabeled before selectively assigning negatives.
    self.labels = np.array(
        [-1 if label == 0 else label for label in self.labels])
    label_negative_count_cur = 0
    for i in range(self.length):
        if label_negative_count_cur >= label_negative_count:
            break
        if distances[i] <= negative_threshold and i not in positive_set:
            self.labels[i] = 0
            label_negative_count_cur += 1
    print(Counter(self.labels))
    return self.labels[:self.length]
def pre_training(self, pre_training_ratio):
    """Bootstrap labels for self-training: SMOTE the positives, then mark the
    lowest-scoring fraction of samples as negative.

    After SMOTE oversampling of the positive class, an ElasticNet is fit on the
    binarized labels; samples scoring below the `pre_training_ratio` quantile
    threshold become negative (0), the rest revert to unlabeled (-1), and the
    original positives are restored to 1.

    Args:
        pre_training_ratio: fraction of the unlabeled (-1) pool to relabel
            as negative, in [0, 1].

    Returns:
        The first `self.length` entries of the updated label array
        (labels are -1 unlabeled, 0 negative, 1 positive).
    """
    clf = ElasticNet(max_iter=10000)
    # Oversample the positive class before fitting (sibling helper on this class).
    self.smote_for_positive()
    # Remember known positives so they can be restored after relabeling.
    positive_index = np.where(self.labels == 1)
    # Temporarily map unlabeled (-1) -> 0 so the regressor sees a {0, 1} target.
    self.labels = [0 if label == -1 else label for label in self.labels]
    # How many samples we are allowed to mark negative.
    label_negative_count = int(pre_training_ratio * np.bincount(self.labels)[0])
    # Fit the pre-training model on the provisional labels.
    clf.fit(self.samples, self.labels)
    # Continuous scores; predict() is the public equivalent of the removed
    # private _decision_function for linear regressors.
    distances = clf.predict(self.samples)
    distances_copy = distances.copy()
    distances_copy.sort()
    # Score threshold below which a sample is considered reliably negative.
    negative_threshold = distances_copy[label_negative_count]
    self.labels = np.array([
        0 if distance < negative_threshold else -1 for distance in distances
    ])
    # Restore the original positives last so the threshold pass cannot clobber them.
    self.labels[positive_index] = 1
    return self.labels[:self.length]
def add_reliable_samples(self,
                         class_prior,
                         speed,
                         add_ratio,
                         real_label,
                         model=None):
    """Iteratively promote the most reliable unlabeled samples to labeled.

    Each round fits two views on the currently-labeled set — a probabilistic
    classifier (random forest by default) and a linear ElasticNet — and only
    labels samples on which BOTH models agree: extreme-low on both scores ->
    negative (0); extreme-high on both -> labeled with its ground-truth value
    from `real_label`.

    Args:
        class_prior: estimated fraction of positives, in [0, 1]; splits each
            round's quota between positive and negative additions.
        speed: number of samples considered for labeling per round.
        add_ratio: stop once this fraction of `self.length` has been labeled.
        real_label: ground-truth labels, indexed like `self.samples`; used
            only for samples promoted as positive.
        model: optional probabilistic classifier with fit/predict_proba.
            Defaults to a fresh RandomForestClassifier(n_estimators=100).
            (Was a mutable default instance shared — and kept fitted —
            across calls; now constructed per call.)

    Returns:
        (labels[:self.length], positive_label_count) — the updated label
        array and how many positives were added.
    """
    # Build the default estimator per call: a default-argument instance would
    # be shared across every invocation and carry over its fitted state.
    if model is None:
        model = RandomForestClassifier(n_estimators=100)
    max_label_count = int(add_ratio * self.length)
    label_count = 0
    positive_label_count = 0
    # Per-round quotas, split by the class prior.
    positive_count_each_round = int(speed * class_prior)
    negative_count_each_round = int(speed * (1 - class_prior))
    while label_count < max_label_count:
        self.smote_for_positive()
        labeled_index = np.where(self.labels != -1)[0]
        unlabeled_index = np.where(self.labels == -1)[0]
        labeled_set = self.samples[labeled_index]
        unlabeled_set = self.samples[unlabeled_index]
        if len(unlabeled_set) == 0:
            break
        labeled_targets = self.labels[labeled_index]
        # View 1: probabilistic classifier (random forest by default).
        clf = model
        clf.fit(labeled_set, labeled_targets)
        prob_list = clf.predict_proba(unlabeled_set)
        # Keep only P(class == 1).
        # NOTE(review): assumes classes_ is [0, 1] so column 1 is the
        # positive class — confirm labeled_targets always contains both.
        prob_list = np.array([prob[1] for prob in prob_list])
        # View 2: linear model; predict() is the public equivalent of the
        # removed private _decision_function for linear regressors.
        clf_2 = ElasticNet(max_iter=10000)
        clf_2.fit(labeled_set, labeled_targets)
        dis_list = clf_2.predict(unlabeled_set)
        distance_list = prob_list.tolist()
        distance_list.sort()
        # Probability thresholds bounding this round's candidate sets.
        negative_threshold = distance_list[negative_count_each_round - 1]
        positive_threshold = distance_list[-positive_count_each_round]
        # Within each candidate set, require the linear model to agree.
        select_dis_list = dis_list[np.where(
            prob_list <= negative_threshold)].tolist()
        select_dis_list.sort()
        negative_dis_threshold = select_dis_list[negative_count_each_round - 1]
        select_dis_list = dis_list[np.where(
            prob_list >= positive_threshold)].tolist()
        select_dis_list.sort()
        positive_dis_threshold = select_dis_list[-positive_count_each_round]
        positive_count_cur = 0
        negative_count_cur = 0
        for i in range(len(prob_list)):
            if (prob_list[i] <= negative_threshold
                    and dis_list[i] <= negative_dis_threshold
                    and negative_count_cur < negative_count_each_round):
                label_count += 1
                negative_count_cur += 1
                self.labels[unlabeled_index[i]] = 0
            elif (prob_list[i] >= positive_threshold
                  and dis_list[i] >= positive_dis_threshold
                  and positive_count_cur < positive_count_each_round):
                label_count += 1
                positive_label_count += 1
                positive_count_cur += 1
                # Positives take their ground-truth label, not a hard 1.
                self.labels[unlabeled_index[i]] = real_label[
                    unlabeled_index[i]]
                print(real_label[unlabeled_index[i]])
            if label_count > max_label_count:
                break
        print(max(distance_list), min(distance_list))
        print(label_count, positive_label_count)
    print('finish add reliable samples')
    return self.labels[:self.length], positive_label_count