import os

import cv2
import numpy as np

import utils

# joblib is optional; fall back to the serial implementation if it is missing
try:
    from joblib import Parallel, delayed
    has_joblib = True
except ImportError:
    has_joblib = False

# `args` is expected to be a module-level argparse.Namespace
# (njobs, k, codebook, scale, featurestats, savefeaturestats, ...)


def createCodebook(sources, output):
    """
    Create a codebook for a bag-of-visual-words representation
    using k-means clustering.
    """
    global has_joblib
    out_path = str(output)
    # delete the output file if it already exists
    if os.path.exists(out_path):
        os.remove(out_path)
    # first, list the source files
    fpaths_src, fnames_src = utils.listFiles(directory=os.path.abspath(sources), ext='png')
    n_imgs = len(fpaths_src)
    all_features_list = []
    # parallel implementation (default, if joblib is available)
    if has_joblib:
        image_features = Parallel(n_jobs=args.njobs, verbose=5)(
            delayed(processImage)(fpaths_src, fnames_src, img_idx)
            for img_idx in range(n_imgs))
        # stack the individual images
        image_features = np.concatenate(image_features, axis=0)
        all_features_list.append(image_features)
    else:
        for img_idx in range(n_imgs):
            image_features = processImage(fpaths_src, fnames_src, img_idx)
            image_features = np.concatenate(image_features, axis=0)
            all_features_list.append(image_features)

    # create k clusters from all features
    print("Clustering (k=%s)" % str(args.k))
    feat_matrix = np.concatenate(all_features_list, axis=0).astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    flags = cv2.KMEANS_RANDOM_CENTERS
    # OpenCV 3+ expects a bestLabels argument; passing None lets OpenCV allocate it
    _, labels, codebook = cv2.kmeans(feat_matrix, args.k, None, criteria, 10, flags)

    # write the codebook to the file using savetxt() on the numpy array
    np.savetxt(out_path, codebook, delimiter=' ',
               header=('Codebook, %s words, %s dimensions'
                       % (str(args.k), str(feat_matrix.shape[1]))))
    return 0
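# processImage() is defined elsewhere in the project. As a point of reference,
# here is a minimal sketch of what it might look like on the codebook path:
# load one image and extract local ORB descriptors with OpenCV. The descriptor
# choice and the name `processImageSketch` are assumptions for illustration,
# not the project's actual implementation.
def processImageSketch(fpaths_src, fnames_src, img_idx):
    """Return an (n_descriptors, n_dims) float32 matrix for one image (sketch)."""
    img = cv2.imread(fpaths_src[img_idx], cv2.IMREAD_GRAYSCALE)
    orb = cv2.ORB_create()  # any local descriptor extractor would do here
    _, descriptors = orb.detectAndCompute(img, None)
    if descriptors is None:  # guard against images without keypoints
        return np.zeros((0, 32), dtype=np.float32)
    return descriptors.astype(np.float32)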
def createDataset(sources, output, labels, sparse):
    """
    Create a dataset by vectorizing the images and writing them line by line
    to a txt-file. Each pixel is a feature and is thus stored in libsvm format:

    [label] [index0:value0] [index1:value1] ... [indexN:valueN]
    """
    global has_joblib
    out_path = str(output)
    # delete the output file if it already exists
    if os.path.exists(os.path.abspath(out_path)):
        os.remove(os.path.abspath(out_path))
    # first, list the source files
    fpaths_src, fnames_src = utils.listFiles(directory=os.path.abspath(sources), ext='png')

    label_map = {}
    # read the label file
    if labels is not None:
        label_map = utils.readLabelMap(labels)
        # check that the numbers match (the label map contains one header entry)
        print("Number of images in label map : %s" % str(len(label_map.keys()) - 1))
        print("Number of images in source dir: %s" % str(len(fpaths_src)))
        assert len(label_map.keys()) - 1 == len(fpaths_src)

    # generate the KNN classifier
    if args.codebook not in (None, 'None'):
        args.knn = getKNNClassifier()
    else:
        args.knn = None

    # precompute the number of images
    n_imgs = len(fpaths_src)

    # list of per-image feature matrices
    # (with augmentation, each image yields (9*4+1) samples)
    all_features_list = []

    # parallel implementation (default, if joblib is available)
    if has_joblib:
        image_features = Parallel(n_jobs=args.njobs, verbose=5)(
            delayed(processImage)(fpaths_src, label_map, fnames_src, img_idx)
            for img_idx in range(n_imgs))
        # collect all images into a single matrix
        image_features = np.concatenate(image_features, axis=0)
        all_features_list.append(image_features)
    else:
        for img_idx in range(n_imgs):
            image_features = processImage(fpaths_src, label_map, fnames_src, img_idx)
            all_features_list.append(image_features)

    # make a 2D matrix from the list of features (stack all images vertically)
    feat_matrix = np.concatenate(all_features_list, axis=0).astype(np.float32)

    # scale each feature dimension
    if args.scale != 0:
        print("Scaling data...")

        # set the labels aside; only the features are scaled
        label_vec = feat_matrix[:, 0]
        feat_matrix = np.delete(feat_matrix, 0, 1)

        featurestats = np.zeros((2, feat_matrix.shape[1]))

        # soft-normalization (zero-mean, unit-variance whitening)
        if args.scale == 1:
            # if featurestats from a training set were specified, use them
            if args.featurestats is not None:
                # load the statistics
                featurestats = loadFeatureStats()
                # featurestats contains 2 rows (first row = mean, second row = std)
                # and n feature dimensions
                assert feat_matrix.shape[1] == featurestats.shape[1]
        # hard-normalization
        elif args.scale == 2:
            # if featurestats from a training set were specified, use them
            if args.featurestats is not None:
                # load the statistics
                featurestats = loadFeatureStats()
                # featurestats contains 2 rows (first row = min, second row = max)
                # and n feature dimensions
                assert feat_matrix.shape[1] == featurestats.shape[1]

        # normalize each feature dimension
        for feat_idx in range(feat_matrix.shape[1]):
            feat_vec = feat_matrix[:, feat_idx]
            # soft-normalization (zero mean, approx. unit variance)
            if args.scale == 1:
                # if feature statistics were specified, use them
                if args.featurestats is not None:
                    feat_mean = featurestats[0, feat_idx]
                    feat_std = featurestats[1, feat_idx]
                else:
                    # compute them from the data
                    feat_mean = feat_vec.mean()
                    feat_std = feat_vec.std() + 1e-10
                    # store them
                    featurestats[0, feat_idx] = feat_mean
                    featurestats[1, feat_idx] = feat_std
                # shift to zero mean and (approx.) unit variance
                feat_vec_scaled = (feat_vec - feat_mean) / feat_std
            # hard-normalization (min/max borders estimated from the (training) dataset)
            elif args.scale == 2:
                if args.featurestats is not None:
                    feat_min = featurestats[0, feat_idx]
                    feat_max = featurestats[1, feat_idx]
                else:
                    # compute them from the data
                    feat_min = np.min(feat_vec)
                    feat_max = np.max(feat_vec)
                    # store them
                    featurestats[0, feat_idx] = feat_min
                    featurestats[1, feat_idx] = feat_max
                # normalize to [0, 1] ...
                feat_vec_std = (feat_vec - feat_min) / (feat_max - feat_min + 1e-10)
                # ... then linearly scale to [-1, 1]
                feat_vec_scaled = feat_vec_std * 2.0 - 1.0
            # write the scaled column back into the matrix
            feat_matrix[:, feat_idx] = feat_vec_scaled

        # finally, prepend the label vector again
        feat_matrix = np.concatenate(
            (np.reshape(label_vec, (feat_matrix.shape[0], 1)), feat_matrix), axis=1)
        print("Done.")
    else:
        print("Data may not be properly scaled; use the 'svm-scale' implementation of libsvm.")

    # feature statistics only exist when scaling was enabled
    if args.savefeaturestats is not None and args.scale != 0:
        saveFeatureStats(featurestats)

    # open the output file (text mode, since writeLine() returns strings)
    output_file = open(os.path.abspath(out_path), 'w')

    # run through the feature matrix
    print("Writing %s rows and %s cols to file..." % feat_matrix.shape)
    # parallel implementation (default, if joblib is available)
    if has_joblib:
        lines = Parallel(n_jobs=args.njobs, verbose=5)(
            delayed(writeLine)(i, feat_matrix)
            for i in range(feat_matrix.shape[0]))
        output_file.writelines(lines)
    else:
        for i in range(feat_matrix.shape[0]):
            line = writeLine(i, feat_matrix)
            output_file.write(line)

    output_file.close()
    return 0
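# writeLine() is likewise defined elsewhere. A minimal sketch of what it might
# look like is given below, assuming each row of feat_matrix is
# [label, feat0, feat1, ...] as assembled above. The 1-based indices and the
# omission of zero-valued features in sparse mode follow the libsvm convention
# quoted in the docstring; the name `writeLineSketch` and the `sparse` flag
# handling are assumptions for illustration.
def writeLineSketch(i, feat_matrix, sparse=True):
    """Format row i of feat_matrix as one libsvm line: 'label idx:val ...'."""
    label = int(feat_matrix[i, 0])
    pairs = []
    for j, val in enumerate(feat_matrix[i, 1:], start=1):
        if sparse and val == 0:  # libsvm readers treat missing indices as zero
            continue
        pairs.append("%d:%.6f" % (j, val))
    return "%d %s\n" % (label, " ".join(pairs))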