def label_test():
    """Rank labelers per test segment by how well a classifier predicts their
    agreement with the majority vote, then build a composite label vector.

    For each labeler, fits an ObjectClassifier on the training half of the map
    where the target is "this labeler agrees with the majority vote", predicts
    that agreement probability for every test segment, and picks, per segment,
    the labeler with the highest predicted agreement. Saves the probability
    matrix, the per-segment best-labeler indices, and the resulting composite
    labels to .npy files as a side effect.
    """
    from labeler import Labelers
    labelers = Labelers()
    # NOTE: here the LEFT half of the columns is the test region and the RIGHT
    # half is training — the opposite of main_haiti's split.
    # // keeps the Py2 integer-floor behavior explicit (and is Py3-safe).
    test = np.ix_(np.arange(4096 // 3, 4096), np.arange(4096 // 2))
    train = np.ix_(np.arange(4096 // 3, 4096), np.arange(4096 // 2, 4096))
    haiti_map = map_overlay.haiti_setup()
    train_map = haiti_map.sub_map(train)
    test_map = haiti_map.sub_map(test)
    # predictions[i, j] = P(labeler i agrees with majority on test segment j)
    predictions = np.zeros((labelers.labels.shape[0],
                            test_map.unique_segs(20).shape[0]))
    # Boolean matrix: labeler x training segment, True where the labeler's
    # label matches the majority vote.
    agreement = (labelers.labels == labelers.majority_vote())[:, train_map.unique_segs(20)]
    for i in range(labelers.labels.shape[0]):
        print(labelers.emails[i])
        # A fresh classifier per labeler, trained to predict agreement.
        new_model = ObjectClassifier(NZ=False)
        new_model.fit(train_map, agreement[i])
        probs = new_model.predict_proba_segs(test_map)
        predictions[i] = probs
    print(predictions)
    # Per test segment, index of the labeler most likely to agree with majority.
    best_labelers = np.argmax(predictions, axis=0)
    print(best_labelers)
    np.save('predictions.npy', predictions)
    np.save('best.npy', best_labelers)
    assert(best_labelers.shape[0] == test_map.unique_segs(20).shape[0])
    # Composite vote: each test segment labeled by its best-predicted labeler.
    model_labels = labelers.labels[best_labelers, test_map.unique_segs(20)]
    np.save('vote.npy', model_labels)
def __init__(self, postfix = '', random = False, update_method = 'donmez', unique_email = None, show = False):
    """Configure an active-learning run: load parameters, split the map,
    load the labelers, seed the initial training labels, and record a
    baseline evaluation.

    postfix       -- string appended to output filenames so parallel runs
                     do not overwrite each other
    random        -- when True, new samples are drawn at random instead of
                     by random-forest uncertainty
    update_method -- strategy used to label newly selected data
                     (e.g. donmez, donmez_1, majority, random, email, yan, xie)
    unique_email  -- labeler whose labels are used when update_method == 'email'
    show          -- when True, diagnostic figures are saved as the run proceeds
    """
    self.set_params()
    self.postfix = postfix
    self.update_method = update_method
    self.unique_email = unique_email
    self.show = show
    # Pick the acquisition function once, up front.
    if random:
        self.uncertainty = self.random_uncertainty
    else:
        self.uncertainty = self.rf_uncertainty
    self.setup_map_split()
    self.labelers = Labelers()
    # Seed the initial labeled pool from the majority vote over the
    # training map's segments.
    majority = self.labelers.majority_vote()
    seed_labels = majority[self.train_map.unique_segs(self.seg)]
    self.training_labels = self._gen_training_labels(seed_labels)
    self.test_progress()
def main_haiti():
    """Compare individual crowdsourced labelers against a trained classifier.

    Computes each labeler's (FPR, TPR) on the test half of the Haiti map using
    the majority vote as ground truth, trains a classifier on the training
    half, then plots every labeler as a point on the classifier's ROC curve
    and saves the figure to All_ROCs/Classifier_ROC.png.
    """
    from labeler import Labelers
    model = ObjectClassifier(0, 1)
    labelers = Labelers()
    # Majority vote over all labelers serves as ground truth.
    y = labelers.majority_vote()
    # Left half of the columns trains, right half tests.
    # // keeps the Py2 integer-floor behavior explicit (and is Py3-safe).
    train = np.ix_(np.arange(4096 // 3, 4096), np.arange(4096 // 2))
    test = np.ix_(np.arange(4096 // 3, 4096), np.arange(4096 // 2, 4096))
    haiti_map = map_overlay.haiti_setup()
    train_map = haiti_map.sub_map(train)
    test_map = haiti_map.sub_map(test)
    # Per-pixel ground truth for the test region.
    g_truth = y[test_map.segmentations[20]]
    FPRs = []
    TPRs = []
    for email in labelers.emails:
        print(email)
        labels = labelers.labeler(email)[test_map.segmentations[20]]
        FPR, TPR = analyze_results.confusion_analytics(g_truth.ravel(),
                                                       labels.ravel())
        FPRs.append(FPR)
        TPRs.append(TPR)
    # Classifier baseline: train on the training map, score the test map.
    probs = model.fit_and_predict(train_map, test_map,
                                  y[train_map.unique_segs(20)])
    print(analyze_results.FPR_from_FNR(g_truth.ravel(), probs.ravel(), TPR=.95))
    analyze_results.probability_heat_map(test_map, probs.ravel(), '')
    fig, _, _, _, _, _ = analyze_results.ROC(g_truth.ravel(), probs.ravel(),
                                             'Classifier')
    # Overlay each labeler's operating point on the classifier's ROC curve.
    plt.scatter(FPRs, TPRs)
    names = labelers.emails
    for i in range(len(FPRs)):
        plt.annotate(names[i], (FPRs[i], TPRs[i]))
    fig.savefig('All_ROCs/{}_ROC.png'.format('Classifier'), format='png')
    plt.show()
class al(object):
    """
    Responsible for setting up and running active learning experiments

    Parameters
    ----------
    postfix : String
        String added to the end of filenames when saving results. Important
        for being able to run multiple experiments at once without overwriting
    [random] : boolean
        New training data selected to be sampled is done so randomly if True,
        and based on the random forest probability if False
    [update_method] : String
        Describes how to assign labels to new training data based on a number
        of methods
        Examples include: donmez, donmez1, majority, random, email, yan, xie
    [unique_email] : String
        If update type is 'email', labels will be assigned according to
        unique_email's labels
    [show] : boolean
        If True, figures are generated at various stages to help illustrate
        progress

    Fields
    ------
    All Parameters are also set as fields with identical purposes
    start_n : int
        Number of initial training data to label
    batch_size : int
        Number of new training data to label for each iteration
    updates : int
        Number of iterations to run through
    verbose : int
        Prints various messages indicating progress if 1. Prints nothing if 0
    TPR : int
        TPR value to evaluate FPR at
    seg : int
        Segmentation level associated with data
    thresh : float
        Threshold around which to select most uncertain data for random
        forest method
    path : String
        Folder path to save all files in
    fprs : float list
        Stores all FPRs from evaluation as active learning run moves forward
    UIs : float list
        Stores all UI confidences generated from Donmez run if that is the
        labeling method
    uncertainty : void->ndarray
        Function used to select new training data to label
    train : ndarray
        index list indicating portion of map to set as training
        (see Px_Map.submap for details)
    test : ndarray
        index list indicating portion of map to set as testing
        (see Px_Map.submap for details)
    haiti_map : Px_Map
        Haiti map with all data for active learning run
    train_map : Px_Map
        Sub map of Haiti Map used for training as assigned by train
    test_map : Px_Map
        Sub map of Haiti Map used for testing as assigned by test
    labelers : Labelers
        Stores all crowdsourcing labelers who have labeled the image
    training_labels : ndarray
        Array with length equal to the number of segments in the training data
        Indices that have not yet been assigned a label have value -1
        Indices that have been labeled as non damage have value 0
        Indices that have been labeled as damage have value 1
    """

    def __init__(self, postfix = '', random = False, update_method = 'donmez', unique_email = None, show = False):
        # Order matters below: set_params() defines self.seg before it is
        # used, and setup_map_split()/Labelers() must run before
        # _gen_training_labels() reads self.train_map and self.labelers.
        self.set_params()
        self.show = show
        self.unique_email = unique_email
        self.update_method = update_method
        self.postfix = postfix
        # Acquisition function: random-forest uncertainty unless random
        # sampling was requested.
        self.uncertainty = self.rf_uncertainty if not random else self.random_uncertainty
        self.setup_map_split()
        self.labelers = Labelers()
        # Seed the labeled pool using the majority vote over the training
        # map's segments, then record a baseline evaluation.
        self.training_labels = self._gen_training_labels(self.labelers.majority_vote()[self.train_map.unique_segs(self.seg)])
        self.test_progress()

    def set_params(self):
        """Sets basic parameters"""
        self.start_n = 50
        self.batch_size = 50
        self.updates = 700
        self.verbose = 1
        self.TPR = .95
        self.seg = 20
        self.thresh = .06
        self.path = 'al_9/'
        self.fprs = []
        self.UIs = []

    def setup_map_split(self):
        """Splits haiti map into training portion and testing portion"""
        # Left half of the columns trains, right half tests; rows below
        # 4096/3 are excluded. NOTE(review): relies on Py2 integer division.
        self.train = np.ix_(np.arange(4096/3, 4096), np.arange(4096/2))
        self.test = np.ix_(np.arange(4096/3, 4096), np.arange(4096/2, 4096))
        self.haiti_map = map_overlay.haiti_setup()
        self.train_map = self.haiti_map.sub_map(self.train)
        self.test_map = self.haiti_map.sub_map(self.test)

    def _gen_training_labels(self, y_train):
        """
        Initializes training_labels with start_n labels
        Labels are 50% damage and 50% non damage to give a good starting
        distribution
        """
        #initialize training labels (-1 == unlabeled)
        training_labels = np.ones_like(y_train)*-1
        # Re-seed from OS entropy so parallel runs draw different samples.
        np.random.seed()
        #For each class value, choose start_n/2 random training examples to
        #label using majority vote
        for i in range(2):
            sub_samp = np.where(y_train==i)[0]
            train_indices = np.random.choice(sub_samp, self.start_n//2, replace = False)
            seg_indices = self.train_map.unique_segs(self.seg)[train_indices]
            # Record these labels with the labelers (Donmez bookkeeping).
            self.labelers.donmez_vote(seg_indices, .85, True)
            training_labels[train_indices] = i
        #If using Yan labeling model, also train models for each labeler to
        #learn performance
        if self.update_method == 'yan':
            indcs = np.where(training_labels>-1)[0]
            self.labelers.model_start(self.train_map, indcs)
        return training_labels

    def _uncertain_order(self, importance, valid_indices):
        """Returns valid_indices sorted by each index's value in importance"""
        # Sort ascending by importance, then reverse for descending order.
        order = valid_indices[np.argsort(importance[valid_indices])]
        order = order[::-1]
        return order

    def show_selected(self):
        """Generates and saves image masking all segments that have already
        been labeled"""
        lab_train_indcs = np.where(self.training_labels != -1)[0]
        lab_indcs = self.train_map.unique_segs(self.seg)[lab_train_indcs]
        img = self.train_map.mask_segments_by_indx(lab_indcs, self.seg, 1, True)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Number of labeled segments is embedded in the output filename.
        n_labeled = np.where(self.training_labels != -1)[0].shape[0]
        cv2.imwrite('{}selected_{}{}.png'.format(self.path, n_labeled,\
                    self.postfix), img)

    def rf_uncertainty(self):
        """
        Selects [self.batch_size] new segments to label based on their
        distance from [self.thresh]
        First trains classifier on all labeled data, then predicts
        probability of all other segments being damage and chooses segments
        closest to [self.thresh]

        Returns
        -------
        ndarray
            All indices of unlabeled data sorted in decreasing uncertainty
        """
        model = ObjectClassifier(NZ = 0, verbose = 0)
        #train and predict segs of classifier
        training_sample = model.sample(self.training_labels, EVEN = 2)
        model.fit(self.train_map, self.training_labels, training_sample)
        proba_segs = model.predict_proba_segs(self.train_map)
        #If show, save figures of heatmap from prediction
        if self.show:
            self.show_selected()
            fig = plt.figure()
            n_labeled = np.where(self.training_labels != -1)[0].shape[0]
            img = self.train_map.seg_convert(self.seg, proba_segs)
            plt.imshow(img, cmap = 'seismic', norm = plt.Normalize(0,1))
            fig.savefig('{}test_{}{}.png'.format(self.path, n_labeled, self.postfix), format='png')
            plt.close(fig)
        #choose indices whose predictions minus thresh were closest to zero
        unknown_indcs = np.where(self.training_labels == -1)[0]
        uncertainties = 1-np.abs(proba_segs-self.thresh)
        return self._uncertain_order(uncertainties.ravel(), unknown_indcs)

    def random_uncertainty(self):
        """Randomly selects [self.batch_size] new segments to be labeled"""
        if self.show:
            self.show_selected()
        # Unlabeled indices in random order; caller takes the first batch_size.
        return np.random.permutation(np.where(self.training_labels == -1)[0])

    def update_labels(self, new_training):
        """
        Assigns labels to the segment indices listed in new_training based on
        the update model

        Parameters
        ----------
        new_training : ndarray
            Array listing indices of segments that need labels. Note that
            these indices are in terms of the training indices. So if the
            smallest training segment index is 33000, then index 0 in
            new_training will mean segment index 33000. In order to get
            indices in terms of all indices, need to use:
                self.train_map.unique_segs(self.seg)[new_training]
        """
        train_segs = self.train_map.unique_segs(self.seg)
        if self.update_method == "donmez":
            #Based on the donmez algorithm
            new_labs = self.labelers.donmez_vote(train_segs[new_training], .85, True)
            self.UIs.append(self.labelers.UI())
            np.save('{}UIs{}.npy'.format(self.path, self.postfix), np.array(self.UIs))
        elif self.update_method == "donmez_1":
            #Variant of the donmez algorithm in which the algorithm may only
            #sample one labeler each time
            new_labs = self.labelers.donmez_pick_1(train_segs[new_training])
            self.UIs.append(self.labelers.UI())
            np.save('{}UIs{}.npy'.format(self.path, self.postfix), np.array(self.UIs))
        elif self.update_method == "majority":
            #Uses majority vote
            new_labs = self.labelers.majority_vote(train_segs[new_training])
        elif self.update_method == "random":
            #Randomly selects a labeler for each data point
            labelers = np.random.randint(0, len(self.labelers.labels), len(new_training))
            new_labs = self.labelers.labels[labelers, train_segs[new_training]]
        elif self.update_method == "email":
            #Uses labels that unique_email chose
            new_labs = self.labelers.labeler(self.unique_email)[train_segs[new_training]]
        elif self.update_method == "yan":
            #Uses the yan algorithm to pick new labels
            new_labs = self.labelers.model_vote(new_training)
        elif self.update_method == 'xie':
            #Uses the xie algorithm to pick new labels
            indcs = np.concatenate((np.where(self.training_labels>-1)[0], new_training))
            em = EM(self.train_map, self.labelers, train_segs[indcs])
            em.run()
            new_labs = (em.G[:,1]>0.5)[-self.batch_size:]
            # NOTE(review): debug comparison of EM labels vs majority vote;
            # placement inside the xie branch inferred from the original
            # (collapsed) formatting — confirm it should not run for every
            # update method.
            print zip(new_labs, self.labelers.majority_vote(train_segs[new_training]))
        self.training_labels[new_training] = new_labs

    def test_progress(self):
        """Evaluates progress of active learning run by looking at results
        tested on testing map"""
        model = ObjectClassifier(NZ = 0, verbose = 0)
        #Pulls all training data thats been labeled and samples evenly
        #between the classes
        training_sample = model.sample(self.training_labels, EVEN = 2)
        #Trains on training data and tests on test map
        model.fit(self.train_map, self.training_labels, training_sample)
        proba = model.predict_proba(self.test_map)
        #Uses majority vote as ground truth
        g_truth = self.labelers.majority_vote()[self.test_map.segmentations[self.seg]]
        n_labeled = np.where(self.training_labels > -1)[0].shape[0]
        #If show is true, saves the ROC curve
        if self.show:
            fig, AUC = analyze_results.ROC(g_truth.ravel(), proba.ravel(), 'Haiti Test')[:2]
            fig.savefig('{}ROC_{}{}.png'.format(self.path, n_labeled, self.postfix), format='png')
            plt.close(fig)
        #Evaluates progress by finding FPR at self.FNR and adding it to the
        #fprs list
        FPR, thresh = analyze_results.FPR_from_FNR(g_truth.ravel(), proba.ravel(), TPR = self.TPR)
        self.fprs.append(FPR)
        #saves all fprs to document every iteration so results are not lost
        #and progress can be seen mid-run
        np.save('{}fprs{}.npy'.format(self.path, self.postfix), self.fprs)

    def update(self):
        """Performs one active learning run by choosing uncertain data,
        assigning labels, and testing progress"""
        new_training = self.uncertainty()[:self.batch_size]
        self.update_labels(new_training)
        self.test_progress()

    def run(self):
        """Performs full active learning run by calling update repeatedly"""
        for i in range(self.updates):
            print 'Iteration {}'.format(i)
            self.update()