def get_bags_of_sifts(image_paths, vocab_filename):
    """Encode each image as an L2-normalised histogram of visual words.

    Dense SIFT descriptors are computed for every image, descriptors with
    zero norm are discarded, the rest are scaled to unit length and then
    assigned to a row of the vocabulary with ``kmeans_quantize``.

    Args:
        image_paths: sequence of image paths.
        vocab_filename: path to a pickled vocabulary array holding one visual
            word per row.

    Returns:
        A list with one float32 histogram of length ``vocab.shape[0]`` per
        image.
    """
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = vocab.shape[0]

    feats = []
    for i in tqdm.trange(len(image_paths), desc='getting bags of SIFT'):
        img = load_image_gray(image_paths[i]).astype(DTYPE)
        _, descriptors = vlfeat.sift.dsift(img, step=4, fast=True,
                                           float_descriptors=True)

        # Drop all-zero descriptors so the unit-length scaling below never
        # divides by zero.
        d_norm = np.linalg.norm(descriptors, axis=1)
        idx_nonzero = np.nonzero(d_norm)
        d_norm = d_norm[idx_nonzero].astype(DTYPE)
        descriptors = descriptors[idx_nonzero].astype(DTYPE)
        descriptors /= d_norm[:, None]

        assignments = vlfeat.kmeans.kmeans_quantize(descriptors, vocab)

        # Pin the bin edges to [0, vocab_size]. With an integer `bins` and no
        # `range`, np.histogram spans only min..max of the data, so bin k
        # would not correspond to visual word k whenever an image does not
        # use both the first and the last word.
        feat, _ = np.histogram(assignments, bins=vocab_size,
                               range=(0, vocab_size))
        feat = feat.astype('float32')

        # Leave an all-zero histogram untouched instead of producing NaNs.
        length = np.linalg.norm(feat)
        if length > 0:
            feat /= length
        feats.append(feat)

    return feats
def get_bags_of_sifts(image_paths, vocab_filename):
    """Build power-normalised bag-of-SIFT histograms for a list of images.

    For every image the dense SIFT descriptors are quantised against the
    pickled vocabulary, counted per visual word, divided by the L2 norm of
    the counts and finally raised to the power 0.3.

    Args:
        image_paths: list of image paths.
        vocab_filename: path to the pickled vocabulary (one word per row).

    Returns:
        Array of shape ``(len(image_paths), len(vocab))``.
    """
    with open(vocab_filename, 'rb') as handle:
        vocab = pickle.load(handle)

    n_words = len(vocab)
    histograms = []
    for image_path in image_paths:
        gray = load_image_gray(image_path)
        sift = np.float32(vlfeat.sift.dsift(gray, step=5, fast=True)[1])
        word_ids = vlfeat.kmeans.kmeans_quantize(sift, vocab)

        # Unit-wide bins with integer edges 0..n_words: one bin per word id.
        bin_edges = np.linspace(0, n_words, num=n_words + 1)
        counts = np.histogram(word_ids, bins=bin_edges)[0]

        histograms.append((counts / np.linalg.norm(counts)) ** (0.3))

    stacked = np.asarray(histograms)
    return np.reshape(stacked, (len(image_paths), n_words))
def get_fisher_encoding(image_paths, stat_filename):
    """Compute a Fisher-vector encoding of dense SIFT for every image.

    Args:
        image_paths: sequence of image paths.
        stat_filename: path to a pickled 2-D array of GMM statistics, one
            row per mixture component. Assumed column layout: 128 means,
            then 128 (diagonal) covariances, then the prior
            (TODO: confirm against the code that writes this file).

    Returns:
        ``np.array`` built from the per-image Fisher vectors.
    """
    with open(stat_filename, 'rb') as f:
        stats = pickle.load(f)

    means = stats[:, 0:128]
    covariances = stats[:, 128:256]
    # The prior follows the covariances directly, i.e. column 256. The
    # previous index 257 skipped a column and is out of bounds for a
    # 257-column array.
    priors = stats[:, 256]

    feats = []
    for path in image_paths:
        image = load_image_gray(path)
        # `size` is the spatial bin size keyword used by the other dsift
        # calls in this file; `bin` is not an accepted keyword.
        _, sift_features = vlfeat.sift.dsift(image.astype('float32'),
                                             fast=True, step=5, size=8)
        # Was `vlfeats.fisher.fisher(..., Improved=True)`: the module name
        # was misspelled and the keyword is lower-case.
        result = vlfeat.fisher.fisher(sift_features.astype('float32'),
                                      means, covariances, priors,
                                      improved=True)
        feats.append(result)

    return np.array(feats)
def build_vocabulary(image_paths, vocab_size):
    """Cluster dense SIFT descriptors from the given images into a vocabulary.

    Args:
        image_paths: non-empty sequence of image paths.
        vocab_size: number of k-means clusters (visual words).

    Returns:
        The cluster centers returned by ``vlfeat.kmeans.kmeans``.

    Raises:
        ValueError: if ``image_paths`` is empty.
    """
    if len(image_paths) == 0:
        raise ValueError('image_paths must contain at least one image')

    step = 20      # sampling step passed to dsift
    bin_size = 9   # spatial bin size passed to dsift

    # Collect per-image descriptors and stack once; stacking inside the loop
    # re-copies everything gathered so far on every iteration.
    per_image = []
    for path in image_paths:
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, step=step, size=bin_size,
                                           fast=True)
        per_image.append(descriptors)

    X = np.float32(np.vstack(per_image))
    vocab = vlfeat.kmeans.kmeans(X, vocab_size)
    return vocab
def build_vocabulary(image_paths, vocab_size):
    """
    Sample SIFT descriptors from the training images, cluster them with
    kmeans, and return the cluster centers.

    Args:
    -   image_paths: list of image paths.
    -   vocab_size: size of vocabulary

    Returns:
    -   vocab: the cluster centers returned by vlfeat.kmeans.kmeans; each row
            is a cluster center / visual word

    Raises:
    -   ValueError: if no descriptors could be collected (e.g. image_paths
            is empty).
    """
    samples_per_image = 400
    step_size = 10

    sampled = []
    for path in image_paths:
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, fast=1, step=step_size)

        n_descriptors = descriptors.shape[0]
        if n_descriptors == 0:
            continue

        # Pick random ROWS of the descriptor matrix. The previous code
        # overwrote the descriptors with a 400x128 matrix of random integers,
        # so k-means was clustering noise instead of SIFT features.
        keep = np.random.choice(n_descriptors,
                                size=min(samples_per_image, n_descriptors),
                                replace=False)
        sampled.append(descriptors[keep])

    if not sampled:
        raise ValueError('no SIFT descriptors were extracted from image_paths')

    # Single stack at the end; float64 as before.
    sift = np.vstack(sampled).astype(float)
    vocab = vlfeat.kmeans.kmeans(sift, vocab_size)
    return vocab
def parallel_func(i, image_paths, step, vocab, vocab_size):
    """Return the L2-normalised bag-of-SIFT histogram of one image.

    Args:
        i: index into ``image_paths`` of the image to process.
        image_paths: sequence of image paths.
        step: sampling step passed to dsift.
        vocab: vocabulary passed to ``kmeans_quantize``.
        vocab_size: number of histogram entries.

    Returns:
        Array of shape ``(1, vocab_size)``: word counts divided by their
        L2 norm.
    """
    gray = load_image_gray(image_paths[i])
    sift = vlfeat.sift.dsift(gray, fast=True, step=step)[1]
    word_ids = vlfeat.kmeans.kmeans_quantize(sift.astype('float64'), vocab)

    counts = np.zeros((1, vocab_size))
    for word in word_ids:
        counts[0, word] = counts[0, word] + 1

    norm = np.linalg.norm(counts)
    return counts / norm
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    Build a normalised visual-word histogram for every image.

    Args:
    -   image_paths: paths to N images
    -   vocab_filename: Path to the precomputed vocabulary.
            This function assumes that vocab_filename exists and contains a
            vocab_size x 128 ndarray 'vocab' where each row is a kmeans
            centroid or visual word.

    Returns:
    -   image_feats: N x d matrix, where d is the number of rows of the
            loaded vocabulary. Each row is a histogram normalised with
            density=True (unit-wide bins, so it sums to 1).
    """
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # Take the histogram length from the vocabulary itself; it used to be
    # hard-coded to 200, which silently mis-binned any other vocabulary size.
    vocab_size = vocab.shape[0]
    step_size = 3

    feats = np.zeros((len(image_paths), vocab_size))
    for i, path in enumerate(image_paths):
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, fast=1, step=step_size)
        assignments = vlfeat.kmeans.kmeans_quantize(descriptors.astype(float),
                                                    vocab)

        # Fix the bin range to [0, vocab_size] so bin k always counts word k.
        # Without `range`, np.histogram stretches the bins over min..max of
        # the assignments actually present in this image.
        hist, _ = np.histogram(assignments, bins=vocab_size,
                               range=(0, vocab_size), density=True)
        feats[i, :] = hist

    return feats
def get_tiny_images(image_paths):
    """
    Build "tiny image" features: each image is resized to 16x16, flattened,
    made zero mean and scaled to unit length.

    Inspired by: A. Torralba, R. Fergus, W. T. Freeman, "80 million tiny
    images: a large dataset for non-parametric object and scene recognition",
    IEEE TPAMI 30(11), 2008.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array of resized and then vectorized tiny images
    """
    feats = []
    for path in image_paths:
        image = load_image_gray(path)
        tiny = cv2.resize(image, (16, 16)).flatten()

        zero_mean = tiny - np.mean(tiny)

        # Unit length means dividing by the L2 norm. The previous code divided
        # by the largest absolute value, which bounds entries to [-1, 1] but
        # leaves vectors with differing lengths.
        length = np.linalg.norm(zero_mean)
        if length > 0:
            # A constant image has zero norm; keep it as all zeros rather
            # than dividing by zero.
            zero_mean = zero_mean / length

        feats.append(zero_mean)

    feats = np.array(feats)
    return feats
def transform_image(filepath, n_dim):
    """Load an image, rescale it and compute its Radon transform.

    The image is rescaled by ``n_dim / height`` (height = first array
    dimension), projection angles come from ``generate_theta(n_dim)``, and
    the rescaled image is passed to ``radon`` with ``circle=False``.
    ``rescale``, ``radon`` and ``generate_theta`` are defined outside this
    block (presumably scikit-image for the first two; verify the imports).

    Args:
        filepath: path of the image to load.
        n_dim: target size used for the scale factor and the angle set.

    Returns:
        Tuple ``(data, label, theta)``: the Radon transform, the rescaled
        image, and the projection angles.
    """
    source = load_image_gray(filepath)

    factor = n_dim / source.shape[0]
    rescaled = rescale(source, scale=factor, mode='reflect',
                       multichannel=False)

    angles = generate_theta(n_dim)
    sinogram = radon(rescaled, theta=angles, circle=False)

    return sinogram, rescaled, angles
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16, flattened and
    standardised to zero mean and unit standard deviation.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array, one standardised tiny image per row
    """
    side = 16
    feats = np.zeros((len(image_paths), side * side))

    for row, path in enumerate(image_paths):
        tiny = cv2.resize(load_image_gray(path), (side, side))
        vec = np.reshape(tiny, (1, 256))

        # Subtract the mean, then divide by the standard deviation.
        centered = vec - np.mean(vec)
        feats[row, :] = centered / np.std(vec)

    return feats
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16, divided by its
    mean intensity and flattened.

    The shape of the resulting feature matrix is printed before returning.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array of resized and then vectorized tiny images
    """
    feats = np.zeros((len(image_paths), 256))

    for row, path in enumerate(image_paths):
        tiny = cv2.resize(load_image_gray(path), (16, 16))
        # Scale by the mean intensity of the resized image.
        scaled = tiny / np.mean(tiny)
        feats[row, :] = scaled.flatten()

    print(feats.shape)
    return feats
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16, flattened and
    standardised (mean subtracted, divided by the standard deviation).

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array, one standardised tiny image per row
    """
    side = 16
    feats = np.zeros((len(image_paths), side * side))

    for row, image_path in enumerate(image_paths):
        gray = load_image_gray(image_path)
        vec = cv2.resize(gray, (16, 16)).flatten()

        centered = vec - np.mean(vec)
        feats[row, :] = centered / np.std(vec)

    return feats
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 24x24 with area
    interpolation, flattened, made zero mean and scaled to unit L2 norm.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: list of N one-dimensional arrays of length 576 (24 * 24)
    """
    def _tiny_feature(path):
        # Resize -> flatten -> zero mean -> unit length.
        small = cv2.resize(load_image_gray(path), (24, 24),
                           interpolation=cv2.INTER_AREA)
        vec = small.flatten()
        centered = vec - np.mean(vec)
        return centered / np.linalg.norm(centered)

    return [_tiny_feature(path) for path in image_paths]
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16, flattened,
    made zero mean and scaled to unit L2 norm.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array of resized and then vectorized tiny images
    """
    side = 16
    feats = np.zeros([len(image_paths), side * side])

    for row, path in enumerate(image_paths):
        tiny = cv2.resize(load_image_gray(path), (side, side))
        vec = np.reshape(tiny, (1, -1))

        # Both steps are applied in place on the resized pixels.
        vec -= np.average(vec)
        vec /= np.sum(vec ** 2, axis=None) ** 0.5

        feats[row, :] = vec

    return feats
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16 with bilinear
    interpolation, flattened, mean-centred and divided by its standard
    deviation.

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 numpy array, one standardised tiny image per row
    """
    side = 16
    n_pixels = side * side
    feats = np.zeros((len(image_paths), n_pixels))

    for row, path in enumerate(image_paths):
        tiny = cv2.resize(load_image_gray(path), (side, side),
                          interpolation=cv2.INTER_LINEAR)
        vec = np.reshape(tiny, (1, n_pixels))

        # Centre in place, then scale by the spread of the centred values.
        vec -= np.mean(vec)
        feats[row, :] = vec / np.std(vec)

    return feats
def get_tiny_images(image_paths):
    """Return zero-mean, unit-length 16x16 tiny-image features.

    Args:
        image_paths: list of image paths.

    Returns:
        Array of shape ``(len(image_paths), 256)``; each row is one resized,
        flattened and normalised image.
    """
    width, height = 16, 16
    n_pixels = width * height
    n_images = len(image_paths)

    rows = []
    for image_path in image_paths:
        tiny = cv2.resize(load_image_gray(image_path), (width, height))
        vec = np.reshape(tiny, (1, n_pixels))

        # In-place: subtract the mean, then divide by the L2 norm.
        vec -= np.mean(vec)
        vec /= np.linalg.norm(vec)
        rows.append(vec)

    # Each entry is (1, n_pixels); collapse the stack to (n_images, n_pixels).
    return np.asarray(rows).reshape((n_images, n_pixels))
def get_tiny_images(image_paths):
    """Return zero-mean, unit-length 24x24 tiny-image features.

    Each image is converted to float32, resized to 24x24 with area
    interpolation and flattened; the mean (accumulated in ``DTYPE``) is
    subtracted and the vector is divided by its L2 norm.

    Args:
        image_paths: iterable of image paths.

    Returns:
        List with one 1-D array of length 576 per image.
    """
    side = 24
    feats = []

    for path in image_paths:
        gray = load_image_gray(path).astype(np.float32)
        vec = cv2.resize(gray, (side, side),
                         interpolation=cv2.INTER_AREA).flatten()

        # In-place normalisation keeps the vector's dtype.
        vec -= np.mean(vec, dtype=DTYPE)
        vec /= np.linalg.norm(vec)

        feats.append(vec)

    return feats
def get_tiny_images(image_paths):
    """
    Tiny-image features (after Torralba, Fergus and Freeman, "80 million tiny
    images", IEEE TPAMI 2008): each image is resized to 16x16, flattened and
    divided by its L2 norm (no mean subtraction).

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x 256 float64 numpy array of resized and then vectorized
            tiny images
    """
    m = 16
    M = m ** 2

    # Gather the rows and stack once. The previous version grew the result
    # with np.append on every iteration (re-copying all earlier rows) and
    # needed a dummy first row that was sliced off at the end.
    rows = []
    for path in image_paths:
        im = cv2.resize(load_image_gray(path), (m, m)).reshape((1, M))
        rows.append(im / np.linalg.norm(im))

    if not rows:
        return np.empty((0, M))

    # float64 matches what appending onto the old np.ones seed produced.
    return np.vstack(rows).astype(np.float64)
def build_vocabulary(image_paths, vocab_size):
    """Cluster unit-length dense SIFT descriptors into a visual vocabulary.

    For each image, float SIFT descriptors are extracted with step 8,
    all-zero descriptors are removed and the rest are scaled to unit L2
    norm. All descriptors are then clustered with k-means (k-means++
    initialisation, L2 distance, Lloyd's algorithm).

    Args:
        image_paths: sequence of image paths.
        vocab_size: number of clusters.

    Returns:
        The cluster centers returned by ``vlfeat.kmeans.kmeans``.
    """
    per_image = []
    for idx in tqdm.trange(len(image_paths), desc='getting a vocab SIFT'):
        gray = load_image_gray(image_paths[idx]).astype(np.float32)
        raw = vlfeat.sift.dsift(gray, step=8, fast=True,
                                float_descriptors=True)[1]

        # Keep only descriptors with a non-zero norm.
        keep = np.nonzero(np.linalg.norm(raw, axis=1))
        kept = raw[keep].astype(DTYPE)

        # Norms are recomputed on the cast copy before scaling in place.
        kept /= np.linalg.norm(kept, axis=1)[:, None]
        per_image.append(kept)

    stacked = np.vstack(per_image)
    return vlfeat.kmeans.kmeans(
        np.asarray(stacked, dtype=DTYPE),
        vocab_size,
        initialization='PLUSPLUS',
        distance='l2',
        algorithm='LLOYD',
    )
def build_gaussian_gmm(image_paths, vocab_size):
    """Fit a GMM to dense SIFT descriptors taken from a 3-level image pyramid.

    Every image is low-pass filtered and resized by factors 2, 1 and 0.5
    (``0.5 ** (level - 1)`` for level 0..2). Dense SIFT descriptors from all
    levels of all images are stacked and a Gaussian mixture model is fitted
    to them.

    Args:
        image_paths: sequence of image paths.
        vocab_size: number of mixture components.

    Returns:
        ``[means, covariances, priors]`` as returned by ``vlfeat.gmm.gmm``.
    """
    levels = 3
    # NOTE(review): cv2.getGaussianKernel returns a ksize x 1 column vector,
    # so filter2D smooths along one image axis only. Kept as in the original;
    # confirm whether a 2-D Gaussian was intended.
    g_low = cv2.getGaussianKernel(9, 2)

    descriptors = []
    for path in image_paths:
        # Load and filter once per image; only the resize depends on level.
        image = load_image_gray(path)
        filtered_image = cv2.filter2D(image, -1, g_low)

        for level in range(levels):
            # `**` is exponentiation; the previous `^` is bitwise XOR and
            # raises TypeError on a float.
            scale = 0.5 ** (level - 1)
            # cv2.resize needs a target size or fx/fy factors, not a scalar.
            resized = cv2.resize(filtered_image, None, fx=scale, fy=scale)

            _, sift_features = vlfeat.sift.dsift(resized.astype('float32'),
                                                 fast=True, step=15, size=8)
            descriptors.append(sift_features.astype('float32'))

    # One row per descriptor. The previous code called np.hstack with two
    # positional arguments and then fitted the GMM on the descriptors of the
    # last processed image only.
    data = np.vstack(descriptors)

    # Take the first three outputs so this works whether gmm() returns just
    # (means, covariances, priors) or appends further diagnostics.
    means, covariances, priors = vlfeat.gmm.gmm(data, vocab_size)[:3]
    stats = [means, covariances, priors]
    return stats
def kernel_codebook_encoding(image_paths, vocab_filename, gamma = 1):
    """Soft-assignment (kernel codebook) encoding of dense SIFT descriptors.

    Every descriptor contributes to every visual word with weight
    ``exp(-gamma * d / 2)``, where ``d`` is the Euclidean distance to the
    word; weights are normalised per descriptor, summed per word and the
    resulting histogram is scaled to unit L2 norm.

    Args:
        image_paths: iterable of image paths.
        vocab_filename: path to the pickled vocabulary (one word per row).
        gamma: kernel sharpness; larger values concentrate weight on the
            nearest words.

    Returns:
        float64 array with one histogram per image.
    """
    with open(vocab_filename, 'rb') as handle:
        vocab = pickle.load(handle)

    feats = []
    for img_path in image_paths:
        gray = load_image_gray(img_path)
        sift = vlfeat.sift.dsift(gray, fast = True, step = 10)[1]

        # (n_descriptors, vocab_size) Euclidean distances to every word.
        dists = sklearn_pairwise.pairwise_distances(
            sift.astype('float64'), vocab.astype('float64'),
            metric = 'euclidean')

        # Kernel responses, normalised so each descriptor's weights sum to 1.
        weights = np.exp(-gamma*0.5*dists)
        weights = weights/np.sum(weights, axis = 1)[:, None]

        # Accumulate over descriptors and scale to unit length.
        hist = np.sum(weights, axis = 0)
        feats.append(hist/np.linalg.norm(hist))

    return np.array(feats).astype('float64')
def build_vocabulary(image_paths, vocab_size):
    """
    Sample dense SIFT descriptors from the training images, cluster them with
    kmeans (k-means++ initialisation) and return the cluster centers. The
    clustering time is printed.

    Args:
    -   image_paths: list of image paths.
    -   vocab_size: size of vocabulary

    Returns:
    -   vocab: the cluster centers returned by kmeans. Each row is a cluster
            center / visual word
    """
    def _descriptors(path):
        # float32 image -> dense SIFT on a 10x10 grid, fast approximation.
        img = np.asarray(load_image_gray(path), dtype='float32')
        return dsift(img, step=[10, 10], fast=True)[1]

    per_image = [_descriptors(path) for path in image_paths]
    bag_of_features = np.concatenate(per_image, axis=0).astype('float32')

    print("Compute vocab")
    started = time.time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    finished = time.time()
    print("It takes ", (finished - started), " to compute vocab.")

    return vocab
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    Build a bag-of-SIFT histogram for every image: dense SIFT descriptors are
    assigned to their nearest vocabulary row (Euclidean distance) and the
    per-word counts are normalised to sum to 1. The total time is printed.

    Args:
    -   image_paths: paths to N images
    -   vocab_filename: Path to the precomputed, pickled vocabulary in which
            each row is a kmeans centroid or visual word.

    Returns:
    -   image_feats: N x d matrix, where d is the number of rows of the
            vocabulary.
    """
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    feats = []

    start_time = time.time()
    print("Construct bags of sifts...")

    for path in image_paths:
        img = np.asarray(load_image_gray(path), dtype='float32')
        frames, descriptors = dsift(img, step=[5, 5], fast=True)

        dist = distance.cdist(descriptors, vocab, metric='euclidean')
        # Only the nearest word is needed, so argmin replaces the former
        # full argsort of every row.
        closest_vocab = np.argmin(dist, axis=1)

        # Per-word counts in one pass.
        histogram = np.bincount(closest_vocab,
                                minlength=vocab_size).astype(float)

        # Normalise with a single sum; the old list comprehension recomputed
        # sum(histogram) for every entry.
        histogram = histogram / histogram.sum()
        feats.append(histogram)

    feats = np.asarray(feats)

    end_time = time.time()
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")

    return feats
def build_vocabulary(image_paths, vocab_size):
    """
    Sample SIFT descriptors from the training images, cluster them with
    kmeans, and return the cluster centers.

    Up to 20 randomly chosen dense SIFT descriptors (step 15, fast mode) are
    taken from each image.

    Args:
    -   image_paths: list of image paths.
    -   vocab_size: size of vocabulary

    Returns:
    -   vocab: the cluster centers returned by vlfeat.kmeans.kmeans. Each row
            is a cluster center / visual word
    """
    dim = 128  # length of the SIFT descriptors
    samples_per_image = 20

    sampled = []
    for path in image_paths:
        image = load_image_gray(path).astype('float32')
        _, sift_features = vlfeat.sift.dsift(image, fast=True, step=15)

        # Slicing the permutation takes at most `samples_per_image` rows, so
        # an image that yields fewer descriptors no longer raises IndexError
        # and no longer leaves zero rows in the sample matrix.
        order = np.random.permutation(sift_features.shape[0])
        sampled.append(sift_features[order[:samples_per_image]])

    if sampled:
        total_sift_features = np.vstack(sampled)
    else:
        total_sift_features = np.zeros((0, dim))

    vocab = vlfeat.kmeans.kmeans(total_sift_features.astype('float32'),
                                 vocab_size)
    return vocab
def _spyramid_fisher_regions(image, max_level=2):
    """Yield the spatial-pyramid sub-images of `image`, coarsest level first.

    Level `l` splits the image into a 2**l x 2**l grid whose cells are
    yielded in row-major order. With max_level=2 that is the whole image,
    then the 4 quadrants, then the 16 cells of the 4 x 4 grid (21 regions).
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    for level in range(max_level + 1):
        grid = 2 ** level
        # Integer cell boundaries; the last edge is always the image border.
        row_edges = [k * n_rows // grid for k in range(grid + 1)]
        col_edges = [k * n_cols // grid for k in range(grid + 1)]
        for r in range(grid):
            for c in range(grid):
                yield image[row_edges[r]:row_edges[r + 1],
                            col_edges[c]:col_edges[c + 1]]


def get_spyramid_fisher_encoding(image_paths, stat_filename):
    """
    Encode each image as a spatial pyramid of Fisher vectors.

    For every image a Fisher vector is computed for the whole image, for
    each quadrant of a 2 x 2 split and for each cell of a 4 x 4 split
    (21 regions). The 21 vectors are concatenated, in that order, into one
    feature row per image.

    Fixes relative to the previous version, which could not run:
    - `vlfeats`, `int16`, `img` and `feats_L1_5` were undefined names.
    - cell 4 of the 4 x 4 grid was cut from the second row band instead of
      the first, duplicating cell 8.
    - `np.append` was called with 23 positional arguments.

    Args:
    - image_paths: paths to N images.
    - stat_filename: path to a pickle holding the GMM statistics, indexed
      here as a 2-D array (means in columns 0:128, covariances in 128:256).

    Returns:
    - feats: array with one row per image, each row the concatenation of
      the 21 per-region Fisher vectors.
    """
    with open(stat_filename, 'rb') as f:
        stats = pickle.load(f)
    means = stats[:, 0:128]
    covariances = stats[:, 128:256]
    # NOTE(review): column 256 is skipped here. Kept as-is because the stats
    # layout is not visible from this function -- confirm whether the priors
    # really live in column 257 rather than 256.
    priors = stats[:, 257]

    feats = []
    for path in image_paths:
        image = load_image_gray(path)
        region_vectors = []
        for region in _spyramid_fisher_regions(image):
            # NOTE(review): the `bin=` and `Improved=` keywords are kept from
            # the original calls; verify them against the installed vlfeat
            # wrapper's dsift()/fisher() signatures.
            _, descriptors = vlfeat.sift.dsift(region.astype('float32'),
                                               fast=True, step=5, bin=8)
            encoding = vlfeat.fisher.fisher(descriptors.astype('float32'),
                                            means, covariances, priors,
                                            Improved=True)
            region_vectors.append(np.ravel(encoding))
        feats.append(np.concatenate(region_vectors))

    feats = np.array(feats)
    return feats
def _spyramid_gmm_regions(image, max_level=2):
    """Yield the spatial-pyramid sub-images of `image`, coarsest level first.

    Level `l` splits the image into a 2**l x 2**l grid whose cells are
    yielded in row-major order. With max_level=2 that is the whole image,
    then the 4 quadrants, then the 16 cells of the 4 x 4 grid (21 regions).
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    for level in range(max_level + 1):
        grid = 2 ** level
        # Integer cell boundaries; the last edge is always the image border.
        row_edges = [k * n_rows // grid for k in range(grid + 1)]
        col_edges = [k * n_cols // grid for k in range(grid + 1)]
        for r in range(grid):
            for c in range(grid):
                yield image[row_edges[r]:row_edges[r + 1],
                            col_edges[c]:col_edges[c + 1]]


def build_spyramid_gmm(image_paths, vocab_size):
    """
    Fit a Gaussian mixture model to SIFT descriptors pooled over a pyramid.

    Dense SIFT descriptors are computed on every image, on each quadrant of
    a 2 x 2 split and on each cell of a 4 x 4 split. All descriptors are
    stacked row-wise and handed to vlfeat.gmm.gmm.

    Fixes relative to the previous version, which could not run:
    - `np.hstack(data, a, b, ...)` passed the arrays as separate arguments
      and stacked along the wrong axis; descriptors are now collected in a
      list and stacked vertically once.
    - `int16`, `img` and `SIFT_features` were undefined names; the GMM is
      now fitted on the pooled descriptors.
    - cell 4 of the 4 x 4 grid was cut from the second row band instead of
      the first, duplicating cell 8.

    Args:
    - image_paths: list of image paths.
    - vocab_size: number of mixture components.

    Returns:
    - stats: the list [means, covariances, priors].
      NOTE(review): the Fisher encoders in this file index the pickled
      stats as a 2-D array (`stats[:, 0:128]`), which a plain list does not
      support -- confirm how these stats are serialized before use.

    Raises:
    - ValueError: if image_paths is empty, so there is nothing to fit.
    """
    collected = []
    for path in image_paths:
        image = load_image_gray(path)
        for region in _spyramid_gmm_regions(image):
            # NOTE(review): `bin=` is kept from the original calls; verify
            # it against the installed vlfeat wrapper's dsift() signature.
            _, descriptors = vlfeat.sift.dsift(region.astype('float32'),
                                               fast=True, step=15, bin=8)
            collected.append(descriptors)

    if not collected:
        raise ValueError('build_spyramid_gmm needs at least one image path')

    # float32 matches what the encoders feed to the Fisher step.
    data = np.vstack(collected).astype('float32')

    # Take only the first three results so this works whether gmm() returns
    # exactly (means, covariances, priors) or appends extra diagnostics.
    fitted = vlfeat.gmm.gmm(data, vocab_size)
    means, covariances, priors = fitted[:3]
    stats = [means, covariances, priors]
    return stats
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    Represent each image as a normalized histogram of visual words.

    Dense SIFT descriptors are computed per image (fast mode, step 10),
    each descriptor is assigned to its nearest vocabulary row, and the
    per-word counts are divided by their L2 norm so that images with
    different numbers of descriptors remain comparable.

    Args:
    - image_paths: paths to N images.
    - vocab_filename: path to the pickled vocabulary, an array with one
      visual word (k-means centroid) per row.

    Returns:
    - feats: N x vocab_size array; row i is the normalized histogram of
      image i.
    """
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)
    n_words = vocab.shape[0]

    histograms = []
    for path in image_paths:
        gray = load_image_gray(path).astype('float32')
        _, descriptors = vlfeat.sift.dsift(gray, fast=True, step=10)
        descriptors = descriptors.astype('float32')

        # Row r of `distances` holds the distance from descriptor r to every
        # visual word; its argmin is the word that descriptor votes for.
        distances = sklearn_pairwise.pairwise_distances(descriptors, vocab)
        nearest_word = np.argmin(distances, axis=1)

        counts = np.bincount(nearest_word, minlength=n_words).astype(np.float64)
        histograms.append(counts / np.linalg.norm(counts))

    return np.array(histograms)
def build_vocabulary(image_paths, vocab_size):
    """
    Build a visual-word vocabulary from dense SIFT descriptors.

    Every image contributes all descriptors produced by the fast dense SIFT
    at step 15. The pooled descriptors are converted to float32 and
    clustered with k-means; the cluster centers are the vocabulary.

    The descriptors are gathered in a list and stacked once. The previous
    version re-stacked the whole matrix on every iteration, which copies
    all earlier rows each time, and needed a dummy zero row that was then
    sliced off again.

    Args:
    - image_paths: list of image paths.
    - vocab_size: number of clusters (visual words) to compute.

    Returns:
    - vocab: the cluster centers returned by vlfeat.kmeans.kmeans; per the
      assignment notes this is a vocab_size x 128 array, one row per
      visual word.
    """
    dim = 128  # length of a SIFT descriptor

    per_image = []
    for path in image_paths:
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, step=15, fast=True)
        per_image.append(descriptors)

    if per_image:
        pooled = np.vstack(per_image)
    else:
        # Same empty (0, dim) matrix the old code ended up with when no
        # images were supplied.
        pooled = np.zeros((0, dim))

    vocab = vlfeat.kmeans.kmeans(np.float32(pooled), vocab_size)
    # Kept from the original: reports the shape of the computed vocabulary.
    print(np.shape(vocab))
    return vocab
def build_vocabulary(image_paths, vocab_size):
    """
    Build a visual-word vocabulary by clustering sampled SIFT descriptors.

    Roughly 20,000 descriptors are sampled in total, split evenly across
    the images. Each image's dense SIFT descriptors (fast mode, step 20)
    are randomly subsampled, the samples are pooled, and the pool is
    clustered with scikit-learn's KMeans (random_state=0).

    Fixes relative to the previous version:
    - samples are joined with np.concatenate, so images that yield fewer
      descriptors than the per-image quota no longer produce a ragged list
      that cannot be reshaped.
    - the per-image quota is at least 1, so more than 20,000 images no
      longer results in an empty sample.
    - an empty image list raises a descriptive ValueError instead of a bare
      ZeroDivisionError.

    Args:
    - image_paths: list of image paths.
    - vocab_size: number of clusters (visual words) to compute.

    Returns:
    - vocab: vocab_size x 128 array of cluster centers, one row per visual
      word.

    Raises:
    - ValueError: if image_paths is empty.
    """
    dim = 128  # length of a SIFT descriptor

    n_images = len(image_paths)
    if n_images == 0:
        raise ValueError('build_vocabulary needs at least one image path')

    # Aim for about 20,000 SIFT features in total, but never fewer than one
    # per image.
    sample_size = max(1, 20000 // n_images)

    sampled = []
    for img_path in image_paths:
        img = load_image_gray(img_path)
        _, descriptors = vlfeat.sift.dsift(img, fast=True, step=20)
        # Slicing the permutation clamps automatically when the image has
        # fewer than sample_size descriptors.
        keep = np.random.permutation(len(descriptors))[:sample_size]
        sampled.append(descriptors[keep])

    pooled = np.concatenate(sampled, axis=0).reshape(-1, dim)

    # K-means clustering; the centers are the visual words.
    kmeans = KMeans(n_clusters=vocab_size, random_state=0).fit(pooled)
    vocab = kmeans.cluster_centers_
    return vocab
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    Represent each image as a normalized histogram of visual words.

    Dense SIFT descriptors are computed per image (fast mode, step 10),
    each descriptor is labelled with its nearest vocabulary row under
    Euclidean distance, and the label counts are divided by their L2 norm.

    Args:
    - image_paths: paths to N images.
    - vocab_filename: path to the pickled vocabulary, an array with one
      visual word (k-means centroid) per row.

    Returns:
    - image_feats: N x vocab_size float64 array; row i is the normalized
      histogram of image i.
    """
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    centers = vocab.astype('float64')
    n_words = len(vocab)

    def encode(path):
        """Return the normalized visual-word histogram of one image."""
        gray = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(gray, fast=True, step=10)

        # (n_descriptors, n_words) matrix of descriptor-to-word distances.
        distances = sklearn_pairwise.pairwise_distances(
            descriptors.astype('float64'), centers, metric='euclidean')

        word_ids = distances.argmin(axis=1)
        counts = np.bincount(word_ids, minlength=n_words)
        return counts / np.linalg.norm(counts)

    histograms = [encode(path) for path in image_paths]
    return np.array(histograms).astype('float64')