def make_line(self, x, n_images=50): """ Returns tuple of lambda function for query line and line segment (which has shape (n_images, n_features)) :param x: query point in scaled space :param n_images: how many images the line segment will consist of. :return: """ x = to_vector(x) # in scaled space x_p = project_point_on_decision_boundary(self.model.w, self.model.b, x) # in scaled space line = lambda t: x_p + (x - x_p) * t # in scaled space all_train_data_scaled = self.dataset.scaling_transformation.transform( self.dataset.data["features"]) # Get all data in scaled space all_data_center = to_vector(all_train_data_scaled.mean( axis=0)) # Mean in scaled space (probably, near zero) if self.r is None: # Compute r only the first time self.r = compute_radius_sphere(scaled_data=all_train_data_scaled) print "Radius", self.r del all_train_data_scaled if self.base_precision is not None: n_images = int( math.ceil(self.longest_line_possible / float(self.base_precision)) + 1) line_segment = make_line_segment(radius=self.r, mu=all_data_center, a=to_vector(x_p), b=to_vector(x), n_points_line=n_images) return line, line_segment, to_vector(x_p)
def label(self, sample, line=None, line_segment=None,
          sample_already_scaled=False, intersection_point_cdb=None):
    """
    Label a query line: return the label of the query point together with the
    intersection of the line with the decision boundary.

    :param sample: query point, (n_features, 1) in original data space.
    :param line: query line (lambda) in scaled data space.
    :param line_segment: line segment in scaled data space; can be used to
        convert to a human-understandable line query (unused here).
    :param sample_already_scaled: when True, sample is already in scaled space.
    :param intersection_point_cdb: unused, kept for interface compatibility.
    :return: (label, db_point) with db_point in the same space as line(t).
    """
    sample = to_vector(sample).T
    if self.ideal_labeler is None:
        if not sample_already_scaled:
            # Ground-truth data space is the scaled space; transform first.
            sample = self.dataset.scaling_transformation.transform(sample)
        label = self.predict(sample)
    else:
        label = self.ideal_labeler.label(sample.squeeze())

    # For line(t) = a + (b - a) * t: line(0) == a and line(1) == b.
    endpoint_a, endpoint_b = line(0), line(1)
    db_point = compute_intersection_line_decision_boundary(
        endpoint_a, endpoint_b, self.w, self.b0)
    assert db_point.shape == endpoint_a.shape
    return label, db_point
def label(self, sample, sample_already_scaled=False, *args):
    """
    Return the model's label for a sample.

    :param sample: query point; transformed from original data space to the
        ground-truth (scaled) space unless sample_already_scaled is True.
    :return: predicted label.
    """
    sample = to_vector(sample).T
    if not sample_already_scaled:
        # Ground-truth data space is the scaled space.
        sample = self.dataset.scaling_transformation.transform(sample)
    return self.predict(sample)
def make_query(self): try: unlabeled_train_data = self.dataset.get_unlabeled_train_data() except ValueError: raise IndexError("No more unlabeled train samples") unlabeled_entry_ids, X_pool = unlabeled_train_data[ "entry_ids"], unlabeled_train_data["features"] del unlabeled_train_data if len(X_pool) > 0: start_time = time.time() # Cluster centroids! if self.n_queries % self.batch_size == 0: # Cluster again! self.clustered = KMeans(n_clusters=self.batch_size).fit(X_pool) query_image = self.clustered.cluster_centers_[self.n_queries % self.batch_size] query_image_original_space = self.dataset._scaling_transformation.inverse_transform( to_vector(query_image).T).T # (n_features, 1) print "Found new query using %d unlabeled clustered samples in %.2f seconds" % ( X_pool.shape[0], time.time() - start_time) self.n_queries += 1 if self.save_path_queries is not None: self.save_query_to_hdf5_point(self.save_path_queries_hdf5, -1, query_image_original_space.T) return None, query_image else: raise IndexError("No more unlabeled train samples")
def save_query_to_hdf5_point(self, save_path_queries_hdf5, entry_id,
                             sample_original_space):
    """
    Append a point query (original data space) and its entry id to an HDF5
    file, creating the file and its resizable datasets on first use.

    :param save_path_queries_hdf5: path of the HDF5 file to create/append to.
    :param entry_id: entry id of the query point (-1 when not from the pool).
    :param sample_original_space: query point; stored as one row (1, n_features).
    """
    sample_original_space = to_vector(sample_original_space).T
    if os.path.isfile(save_path_queries_hdf5):
        # Append to the existing resizable datasets.
        with h5py.File(save_path_queries_hdf5, 'r+') as hf:
            points_dataset = hf.get('point_queries')
            already_in_points_ds = points_dataset.shape[0]
            points_dataset.resize(
                already_in_points_ds + sample_original_space.shape[0], axis=0)
            points_dataset[
                already_in_points_ds:already_in_points_ds +
                sample_original_space.shape[0], :] = sample_original_space
            entryids_dataset = hf.get('entry_ids')
            already_in_entryids_ds = entryids_dataset.len()
            entryids_dataset.resize(already_in_entryids_ds + 1, axis=0)
            entryids_dataset[already_in_entryids_ds:already_in_entryids_ds +
                             1] = entry_id
            # Keep the fuel split metadata in sync with the new sizes.
            split_dict = {
                "data": {
                    "point_queries":
                    (0, already_in_points_ds + sample_original_space.shape[0]),
                    "entry_ids": (0, already_in_entryids_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # HDF5 query save file does not exist yet: create it. Use a context
        # manager (was a bare open/flush/close) so the file handle is released
        # even if an exception occurs while writing.
        with h5py.File(save_path_queries_hdf5, "w") as f:
            points_dataset = f.create_dataset(
                'point_queries',
                sample_original_space.shape,
                maxshape=(None, sample_original_space.shape[1]),
                dtype="float32")
            points_dataset[...] = sample_original_space
            entryids_dataset = f.create_dataset('entry_ids', (1, ),
                                                maxshape=(None, ),
                                                dtype=int)
            entryids_dataset[...] = entry_id
            split_dict = {
                "data": {
                    "point_queries": (0, sample_original_space.shape[0]),
                    "entry_ids": (0, 1)
                }
            }
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
def generate_images_line_save(self, line_segment, query_id, image_original_space=None): """ ID of query point from which query line was generated is added to the filename of the saved line query. :param line_segment: :param query_id: :return: """ try: if image_original_space is not None: x = self.generative_model.decode(image_original_space.T) else: x = self.generative_model.decode( to_vector(self.dataset.data["features"][query_id]).T ) # comes from dataset.data["features"], so is already in original space in which ALI operates. save_path = os.path.join( self.save_path_queries, "pointquery_%d_%d.png" % (self.n_queries + 1, query_id)) if x.shape[1] == 1: plt.imsave(save_path, x[0, 0, :, :], cmap=cm.Greys) else: plt.imsave(save_path, x[0, :, :, :].transpose(1, 2, 0), cmap=cm.Greys_r) decoded_images = self.generative_model.decode( self.dataset.scaling_transformation.inverse_transform( line_segment) ) # Transform to original space, in which ALI operates. figure = plt.figure() grid = ImageGrid(figure, 111, (1, decoded_images.shape[0]), axes_pad=0.1) for image, axis in zip(decoded_images, grid): if image.shape[0] == 1: axis.imshow(image[0, :, :].squeeze(), cmap=cm.Greys, interpolation='nearest') else: axis.imshow(image.transpose(1, 2, 0).squeeze(), cmap=cm.Greys_r, interpolation='nearest') axis.set_yticklabels(['' for _ in range(image.shape[1])]) axis.set_xticklabels(['' for _ in range(image.shape[2])]) axis.axis('off') save_path = os.path.join( self.save_path_queries, "linequery_%d_%d.pdf" % (self.n_queries + 1, query_id)) plt.savefig(save_path, transparent=True, bbox_inches='tight') except Exception as e: print "EXCEPTION:", traceback.format_exc() raise e
def label(self, sample, sample_already_scaled=False, *args):
    """
    Ask the human oracle to label a decoded point query image.

    :param sample: query point; expected in original (ALI) data space unless
        sample_already_scaled is True, in which case it is transformed back.
    :return: the label chosen in the point-label interface.
    """
    sample = to_vector(sample)
    if sample_already_scaled:
        # The decoder operates in original space; undo the scaling first.
        sample = self.dataset.scaling_transformation.inverse_transform(
            sample.T).T
    point_query_image = self.ali.decode(sample.T)
    oracle = PointLabelInterface(
        point_query_image,
        list(self.dataset.classes),
        classes_dictionary=self.dataset.classes_dictionary)
    return oracle.label_point_query
def make_query(self, n_images=50): try: unlabeled_train_data = self.dataset.get_unlabeled_train_data() except ValueError: raise IndexError("No more unlabeled train samples") unlabeled_entry_ids, X_pool = unlabeled_train_data[ "entry_ids"], unlabeled_train_data["features"] del unlabeled_train_data if len(X_pool) > 0: start_time = time.time() # Cluster centroids! if self.n_queries % self.batch_size == 0: # Cluster again! self.clustered = KMeans(n_clusters=self.batch_size).fit(X_pool) query_image = self.clustered.cluster_centers_[self.n_queries % self.batch_size] query_image_original_space = self.dataset._scaling_transformation.inverse_transform( to_vector(query_image).T).T # (n_features, 1) print "Found new query using %d unlabeled clustered samples in %.2f seconds" % ( X_pool.shape[0], time.time() - start_time) # Make line. Project query_image on current decision boundary start_time = time.time() line, line_segment, intersection_point = self.make_line( query_image, n_images) # scaled space print "Made line from found query in %.2f seconds" % (time.time() - start_time) if not self.human_experiment and self.generative_model is not None: # Change 1 to higher number for faster algorithm (less generating and plotting) if self.n_queries % 1 == 0: start_time = time.time() self.generate_images_line_save( line_segment, None, image_original_space=query_image_original_space) print "Plotted query line in %.2f seconds" % (time.time() - start_time) else: self.save_query_to_hdf5( query_image_original_space.T, # original space -1, self.dataset.scaling_transformation.inverse_transform( line_segment), self.dataset.scaling_transformation.inverse_transform( intersection_point.T)) self.n_queries += 1 return None, line, line_segment, query_image, intersection_point else: raise IndexError("No more unlabeled train samples")
def update(self, entry_id, new_label, sample=None):
    """
    Update the entry identified by entry_id with the given label, or append a
    brand new (sample, label) pair when entry_id is None and sample is given.

    :param entry_id: entry id of the sample to update (None to append sample).
    :param new_label: label for the sample; a plain int is reshaped to (1, 1).
    :param sample: optional new sample to append to the feature matrix.
    """
    if isinstance(new_label, int):
        new_label = np.array(new_label).reshape(1, 1)
    appending = entry_id is None and sample is not None
    if appending:
        # Grow both the features and the targets with the new pair.
        self.data["features"] = np.concatenate(
            (self.data["features"], to_vector(sample).T), axis=0)
        self.data["targets"] = np.concatenate(
            (self.data["targets"], new_label), axis=0)
    else:
        self.data["targets"][entry_id] = new_label
    # Notify all registered observers about the change.
    for callback in self._update_callback:
        callback(entry_id, new_label)
def make_query(self, n_images=50): try: unlabeled_train_data = self.dataset.get_unlabeled_train_data() except ValueError: raise IndexError("No more unlabeled train samples") unlabeled_entry_ids, X_pool = unlabeled_train_data[ "entry_ids"], unlabeled_train_data["features"] del unlabeled_train_data if len(X_pool) > 0: # least confident and most representative of data start_time = time.time() uncertainties = np.max(self.model.predict_real(X_pool), axis=1) ask_id = self.get_most_uncertainty_dense(uncertainties, self.similarity_matrix, beta=1) self.delete_index_similarity_matrix(ask_id) print "Found new query amongst %d unlabeled samples in %.2f seconds" % ( X_pool.shape[0], time.time() - start_time) # Make line. Project query_image on current decision boundary start_time = time.time() line, line_segment, intersection_point = self.make_line( X_pool[ask_id], n_images) print "Made line from found query in %.2f seconds" % (time.time() - start_time) if not self.human_experiment and self.generative_model is not None: # Change 1 to higher number for faster algorithm (less generating and plotting) if self.n_queries % 1 == 0: start_time = time.time() self.generate_images_line_save(line_segment, unlabeled_entry_ids[ask_id]) print "Plotted query line in %.2f seconds" % (time.time() - start_time) else: self.save_query_to_hdf5( to_vector(self.dataset.data["features"][ unlabeled_entry_ids[ask_id]]).T, unlabeled_entry_ids[ask_id], self.dataset.scaling_transformation.inverse_transform( line_segment), self.dataset.scaling_transformation.inverse_transform( intersection_point.T)) self.n_queries += 1 return unlabeled_entry_ids[ ask_id], line, line_segment, None, intersection_point else: raise IndexError("No more unlabeled train samples")
def save_decision_boundary(self, w, b):
    """
    Append the decision boundary parameters (w, b) to the HDF5 file at
    self.save_path_boundaries, creating the file and its resizable datasets
    on first use.

    :param w: weight vector of the decision boundary; stored as one row.
    :param b: bias of the decision boundary.
    """
    w = to_vector(w).T
    if os.path.isfile(self.save_path_boundaries):
        # Append to the existing resizable datasets.
        with h5py.File(self.save_path_boundaries, 'r+') as hf:
            w_dataset = hf.get('w')
            already_in_w_ds = w_dataset.shape[0]
            w_dataset.resize(already_in_w_ds + w.shape[0], axis=0)
            w_dataset[already_in_w_ds:already_in_w_ds + w.shape[0], :] = w
            b_dataset = hf.get('b')
            already_in_b_ds = b_dataset.len()
            b_dataset.resize(already_in_b_ds + 1, axis=0)
            b_dataset[already_in_b_ds:already_in_b_ds + 1] = b
            # Keep the fuel split metadata in sync with the new sizes.
            split_dict = {
                "data": {
                    "w": (0, already_in_w_ds + w.shape[0]),
                    "b": (0, already_in_b_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # File does not exist yet: create it. Use a context manager (was a
        # bare open/flush/close) so the file handle is released even if an
        # exception occurs while writing.
        with h5py.File(self.save_path_boundaries, "w") as f:
            w_dataset = f.create_dataset('w',
                                         w.shape,
                                         maxshape=(None, w.shape[1]),
                                         dtype="float32")
            w_dataset[...] = w
            b_dataset = f.create_dataset('b', (1, ),
                                         maxshape=(None, ),
                                         dtype="float32")
            b_dataset[...] = b
            split_dict = {"data": {"w": (0, w.shape[0]), "b": (0, 1)}}
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
def make_query(self, n_images=50): try: unlabeled_train_data = self.dataset.get_unlabeled_train_data() except ValueError: raise IndexError("No more unlabeled train samples") unlabeled_entry_ids, X_pool = unlabeled_train_data[ "entry_ids"], unlabeled_train_data["features"] del unlabeled_train_data if len(X_pool) > 0: # least confident start_time = time.time() ask_id = np.random.randint(0, len(unlabeled_entry_ids)) print "Found new query amongst %d unlabeled samples in %.2f seconds" % ( X_pool.shape[0], time.time() - start_time) # Make line. Project query_image on current decision boundary start_time = time.time() line, line_segment, intersection_point = self.make_line( X_pool[ask_id], n_images) print "Made line from found query in %.2f seconds" % (time.time() - start_time) if not self.human_experiment and self.generative_model is not None: # Change 1 to higher number for faster algorithm (less generating and plotting) if self.n_queries % 1 == 0: start_time = time.time() self.generate_images_line_save(line_segment, unlabeled_entry_ids[ask_id]) print "Plotted query line in %.2f seconds" % (time.time() - start_time) else: self.save_query_to_hdf5( to_vector(self.dataset.data["features"][ unlabeled_entry_ids[ask_id]]).T, # Already in original space unlabeled_entry_ids[ask_id], self.dataset.scaling_transformation.inverse_transform( line_segment), # Transform to original space, self.dataset.scaling_transformation.inverse_transform( intersection_point.T)) self.n_queries += 1 return unlabeled_entry_ids[ ask_id], line, line_segment, None, intersection_point else: raise IndexError("No more unlabeled train samples")
def label(self, sample, line=None, line_segment=None,
          sample_already_scaled=False, intersection_point_cdb=None):
    """
    Ask the human oracle to label a query line via the interactive interface.

    NB ONLY HANDLES LINES FROM UNCERTAINTY STRATEGY (for clustercentroids,
    check if everything is in correct space!)

    :param sample: for uncertainty strategy in original ALI space
    :param line: lambda function in scaled space
    :param line_segment: scaled space
    :param sample_already_scaled: when True, sample is converted back to
        original space before decoding
    :param intersection_point_cdb: intersection with the current decision
        boundary, in scaled space
    :return: (label of the point query, chosen boundary point in scaled
        space or None when the oracle picked no point)
    """
    sample = to_vector(sample)  # original space if uncertainty strategy
    if sample_already_scaled:
        sample = self.dataset.scaling_transformation.inverse_transform(
            sample.T).T
    # Everything shown to the human must be in original (ALI) space.
    line_segment_original_space = self.dataset.scaling_transformation.inverse_transform(
        line_segment)
    line_images = self.generate_images(line_segment_original_space)
    point_query_image = self.generate_images(sample.T)
    intersection_point_cdb_original_space = self.dataset.scaling_transformation.inverse_transform(
        intersection_point_cdb.T).T
    oracle = LabelInterface(
        line_segment_original_space,
        line_images,
        point_query=sample,
        point_query_image=point_query_image,
        intersection_point_cdb=intersection_point_cdb_original_space,
        classes=list(self.dataset.classes),
        classes_dictionary=self.dataset.classes_dictionary)
    db_point_original_space = oracle.chosen_point
    # The oracle may decline to choose a boundary point; map any chosen
    # point back into scaled space for the caller.
    if db_point_original_space is not None:
        db_point_scaled_space = self.dataset.scaling_transformation.transform(
            db_point_original_space.T).T
    else:
        db_point_scaled_space = None
    label_point_query = oracle.label_point_query
    print "Chosen label", label_point_query
    return label_point_query, db_point_scaled_space