def project_features(self, tool, extractor, indices=None, force=False):
    """Projects the features for all files of the database."""
    # load the projector file
    if tool.performs_projection:
        tool.load_projector(str(self.m_file_selector.projector_file))

    feature_files = self.m_file_selector.feature_list()
    projected_files = self.m_file_selector.projected_list()

    # select a subset of indices to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Projection: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(feature_files))

    utils.ensure_dir(self.m_file_selector.projected_directory)
    utils.info("- Projection: projecting %d features from directory '%s' to directory '%s'" % (len(index_range), self.m_file_selector.features_directory, self.m_file_selector.projected_directory))

    # project the features
    for i in index_range:
        feature_file = feature_files[i]
        projected_file = projected_files[i]

        if not self.__check_file__(projected_file, force):
            # load feature
            feature = extractor.read_feature(str(feature_file))
            # project feature
            projected = tool.project(feature)
            # write it
            utils.ensure_dir(os.path.dirname(projected_file))
            tool.save_feature(projected, str(projected_file))
def __train_kmeans__(self, feature_space):
    """Computes a KMeans clustering of the data"""
    utils.info("  -> Training KMeans")

    # Form the feature space for training KMeans.
    data_list = []
    for client in feature_space:
        for feature in client:
            data_list.append(feature)
    data = numpy.vstack(data_list)
    del data_list

    # Compute the number of clusters of KMeans; a float is interpreted as a
    # fraction of the number of training samples.
    if isinstance(self.m_kmeans_means, float):
        self.m_kmeans_means = numpy.uint32(data.shape[0] * self.m_kmeans_means)

    # Machine.
    dimension = feature_space[0].shape[1]
    kmeans = bob.machine.KMeansMachine(self.m_kmeans_means, dimension)

    # Training.
    t = bob.trainer.KMeansTrainer()
    t.max_iterations = self.m_kmeans_training_iterations
    t.convergence_threshold = self.m_kmeans_training_threshold
    t.train(kmeans, data)
    del data

    # Return machine.
    return kmeans
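# A quick sanity check of the fractional cluster-count rule used above: a
# float m_kmeans_means is read as a fraction of the number of stacked
# training samples, while an integer is used verbatim.  This is a
# self-contained sketch with made-up numbers, independent of the bob API.
import numpy

def resolve_kmeans_means(requested, n_samples):
    """Returns the absolute number of KMeans clusters."""
    if isinstance(requested, float):
        return int(numpy.uint32(n_samples * requested))
    return requested

assert resolve_kmeans_means(0.01, 5000) == 50   # fraction of the data
assert resolve_kmeans_means(256, 5000) == 256   # absolute count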
def train_enroller(self, tool, extractor, force=False):
    """Trains the model enroller using the extracted or projected features, depending on your setup of the base class Tool."""
    reader = tool if tool.use_projected_features_for_enrollment else extractor
    if tool.requires_enroller_training:
        enroller_file = self.m_file_selector.enroller_file

        if self.__check_file__(enroller_file, force, 1000):
            utils.info("- Enrollment: enroller '%s' already exists." % enroller_file)
        else:
            utils.ensure_dir(os.path.dirname(enroller_file))
            # first, load the projector
            tool.load_projector(str(self.m_file_selector.projector_file))

            # training models
            train_files = self.m_file_selector.training_list('projected' if tool.use_projected_features_for_enrollment else 'features', 'train_enroller', arrange_by_client=True)
            train_features = self.__read_features_by_client__(train_files, reader)

            # perform training
            utils.info("- Enrollment: training enroller '%s' using %d identities" % (enroller_file, len(train_features)))
            tool.train_enroller(train_features, str(enroller_file))
def extract_features(self, extractor, preprocessor, indices=None, force=False):
    """Extracts the features from the preprocessed data using the given extractor."""
    extractor.load(str(self.m_file_selector.extractor_file))
    data_files = self.m_file_selector.preprocessed_data_list()
    feature_files = self.m_file_selector.feature_list()

    # select a subset of indices to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Extraction: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(data_files))

    utils.ensure_dir(self.m_file_selector.features_directory)
    utils.info("- Extraction: extracting %d features from directory '%s' to directory '%s'" % (len(index_range), self.m_file_selector.preprocessed_directory, self.m_file_selector.features_directory))

    for i in index_range:
        data_file = data_files[i]
        feature_file = feature_files[i]

        if not self.__check_file__(feature_file, force):
            # load data
            data = preprocessor.read_data(str(data_file))
            # extract feature
            feature = extractor(data, data_file)
            # save feature
            utils.ensure_dir(os.path.dirname(feature_file))
            extractor.save_feature(feature, str(feature_file))
def _plot_scores(figsize, args, totalModels, ids_group, group, title):
    figure = mpl.figure(figsize=figsize)
    eer_mean = []
    thr_mean = []
    print("List of the non-zero EER models:")
    utils.info("Plotting Fauna graph to file '%s'" % args.pdf)
    for model in range(len(totalModels)):
        scores = ids_group[ids_group[:, 0] == totalModels[model], :]
        scoresTarget = scores[scores[:, 4] == '1', 3].astype(numpy.float64)
        scoresNonTarget = scores[scores[:, 4] == '0', 3].astype(numpy.float64)

        # compute the threshold for this model
        threshold = {'EER': bob.measure.eer_threshold, 'HTER': bob.measure.min_hter_threshold}[args.criterion](scoresNonTarget, scoresTarget)
        thr_mean.append(threshold)

        # apply the threshold
        far, frr = bob.measure.farfrr(scoresNonTarget, scoresTarget, threshold)
        eer = 100. * bob.measure.eer_rocch(scoresNonTarget, scoresTarget)
        eer_mean.append(eer)
        if eer != 0:
            print("Model %s - The %s of the %s set is %2.3f%%" % (totalModels[model], args.criterion, group, eer))

        # plot target scores, non-target scores, and the model threshold
        blue_dot, = mpl.plot(numpy.ones(len(scoresTarget)) * model, scoresTarget, 'bo')
        red_cross, = mpl.plot(numpy.ones(len(scoresNonTarget)) * model, scoresNonTarget, 'rx')
        black_th, = mpl.plot(model, threshold, 'k*', markersize=16)

    # finalize plot
    offset = 0.01
    mpl.ylabel('Score norm')
    if args.norm == 'none':
        offset = numpy.mean(ids_group[:, 3].astype(numpy.float64)) / 100
        mpl.ylabel('Score')
    mpl.axis([-1, len(totalModels), min(ids_group[:, 3].astype(numpy.float64)) - offset, max(ids_group[:, 3].astype(numpy.float64)) + offset])
    mpl.xticks(range(0, len(totalModels) + 1, 5))
    mpl.xlabel('User model')
    mpl.grid(True, color=(0.6, 0.6, 0.6))
    mpl.legend([blue_dot, red_cross, black_th], ["Target scores", "NonTarget scores", "Model threshold"], numpoints=1, loc=5)
    mpl.title(title)
    return eer_mean, figure
def calibrate_scores(self, norms=['nonorm', 'ztnorm'], groups=['dev', 'eval'], prior=0.5):
    """Calibrates the score files by learning a linear calibration from the dev files (first element of the groups) and applying it to all groups, separately for all given norms."""
    # read score files of the first group
    for norm in norms:
        if norm == 'nonorm':
            training_score_file = self.m_file_selector.no_norm_result_file(groups[0])
        elif norm == 'ztnorm':
            training_score_file = self.m_file_selector.zt_norm_result_file(groups[0])
        else:
            training_score_file = None

        # create a LLR trainer
        utils.info(" - Calibration: training calibration for type %s from group %s" % (norm, groups[0]))
        llr_trainer = bob.trainer.CGLogRegTrainer(prior, 1e-16, 100000)

        training_scores = list(bob.measure.load.split_four_column(training_score_file))
        for i in (0, 1):
            h = numpy.array(training_scores[i])
            h.shape = (len(training_scores[i]), 1)
            training_scores[i] = h
        # train the LLR
        llr_machine = llr_trainer.train(training_scores[0], training_scores[1])
        del training_scores
        utils.debug(" ... Resulting calibration parameters: shift = %f, scale = %f" % (llr_machine.biases[0], llr_machine.weights[0, 0]))

        # now, apply it to all groups
        for group in groups:
            if norm == 'nonorm':
                score_file = self.m_file_selector.no_norm_result_file(group)
            else:
                score_file = self.m_file_selector.zt_norm_result_file(group)
            calibrated_file = self.m_file_selector.calibrated_score_file(group, norm == 'ztnorm')

            utils.info(" - Calibration: calibrating scores from '%s' to '%s'" % (score_file, calibrated_file))

            # iterate through the score file and calibrate scores
            scores = bob.measure.load.four_column(score_file)
            with open(calibrated_file, 'w') as f:
                for line in scores:
                    assert len(line) == 4
                    calibrated_score = llr_machine([line[3]])
                    f.write('%s %s %s ' % line[0:3] + str(calibrated_score[0]) + "\n")
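# What the learned calibration does, in isolation: CGLogRegTrainer fits a
# one-dimensional logistic regression, so the resulting machine reduces to
# an affine map of the raw score -- exactly the shift/scale pair printed by
# the debug message above.  A minimal numpy sketch; the values below are
# made up for illustration.
import numpy

def apply_calibration(raw_scores, scale, shift):
    """Maps raw scores to calibrated log-likelihood-ratio scores."""
    return scale * numpy.asarray(raw_scores, dtype=numpy.float64) + shift

# e.g. scale = llr_machine.weights[0, 0], shift = llr_machine.biases[0]
print(apply_calibration([0.2, 1.5, -0.7], scale=2.1, shift=-0.3))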
def __scores_d__(self, t_model_ids, group, force, preload_probes):
    """Computes D scores."""
    # probe files:
    z_probe_objects = self.m_file_selector.z_probe_objects(group)
    z_probe_files = self.m_file_selector.get_paths(z_probe_objects, 'projected' if self.m_use_projected_dir else 'features')

    # preload the probe files for a faster access (and fewer network load)
    if preload_probes:
        utils.info("- Scoring: preloading Z-probe files of group '%s'" % group)
        # read all probe files into memory
        if self.m_file_selector.uses_probe_file_sets():
            preloaded_z_probes = [[self.m_tool.read_probe(str(z_probe_file)) for z_probe_file in file_set] for file_set in z_probe_files]
        else:
            preloaded_z_probes = [self.m_tool.read_probe(str(z_probe_file)) for z_probe_file in z_probe_files]

    utils.info("- Scoring: computing score matrix D for group '%s'" % group)

    # gets the Z-norm impostor samples
    z_probe_ids = []
    for z_probe_object in z_probe_objects:
        z_probe_ids.append(z_probe_object.client_id)

    # loads the T-norm models
    for t_model_id in t_model_ids:
        # test if the file is already there
        score_file = self.m_file_selector.d_same_value_file(t_model_id, group)
        if self.__check_file__(score_file, force):
            utils.warn("score file '%s' already exists." % score_file)
        else:
            t_model = self.m_tool.read_model(self.m_file_selector.t_model_file(t_model_id, group))
            if preload_probes:
                d = self.__scores_preloaded__(t_model, preloaded_z_probes)
            else:
                d = self.__scores__(t_model, z_probe_files)
            bob.io.save(d, self.m_file_selector.d_file(t_model_id, group))

            t_client_id = [self.m_file_selector.client_id(t_model_id)]
            d_same_value_tm = bob.machine.ztnorm_same_value(t_client_id, z_probe_ids)
            bob.io.save(d_same_value_tm, score_file)
def gmm_mstep(self, counts, force=False):
    """Performs a single M-step of the GMM training (non-parallel)"""
    old_machine_file = self.m_configuration.gmm_intermediate_file % self.m_args.iteration
    new_machine_file = self.m_configuration.gmm_intermediate_file % (self.m_args.iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping GMM M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()

        # try if there is one file containing all data
        if os.path.exists(self.m_configuration.gmm_stats_file % (self.m_args.iteration, 0, len(training_list))):
            stats_file = self.m_configuration.gmm_stats_file % (self.m_args.iteration, 0, len(training_list))
            # load stats file
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_file))
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [(counts * job_id, min(counts * (job_id + 1), len(training_list))) for job_id in job_ids]
            stats_files = [self.m_configuration.gmm_stats_file % (self.m_args.iteration, indices[0], indices[1]) for indices in job_indices]

            # read and accumulate all stats files
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_files[0]))
            for stats_file in stats_files[1:]:
                gmm_stats += bob.machine.GMMStats(bob.io.HDF5File(stats_file))

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # load the old gmm machine
        gmm_machine = bob.machine.GMMMachine(bob.io.HDF5File(old_machine_file))
        # initialize the trainer
        gmm_trainer = bob.trainer.ML_GMMTrainer(self.m_tool.m_update_means, self.m_tool.m_update_variances, self.m_tool.m_update_weights)
        gmm_trainer.responsibilities_threshold = self.m_tool.m_responsibility_threshold
        gmm_trainer.initialize(gmm_machine, data)
        gmm_trainer.gmm_statistics = gmm_stats

        # calls the M-step
        gmm_trainer.m_step(gmm_machine, data)

        # saves the new GMM machine to file
        utils.ensure_dir(os.path.dirname(new_machine_file))
        gmm_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        import shutil
        shutil.copy(new_machine_file, self.m_configuration.projector_file)

    if self.m_args.clean_intermediate and self.m_args.iteration > 0:
        old_file = self.m_configuration.gmm_intermediate_file % (self.m_args.iteration - 1)
        utils.info("Removing old intermediate directory '%s'" % os.path.dirname(old_file))
        shutil.rmtree(os.path.dirname(old_file))
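# The M-step above delegates to bob's ML_GMMTrainer, but the underlying
# maximum-likelihood update from accumulated E-step statistics is compact
# enough to sketch.  Assuming per-component zeroth order counts n (K,),
# first order sums sum_px (K, D) and second order sums sum_pxx (K, D) --
# the quantities a GMMStats object accumulates -- the update is:
import numpy

def ml_gmm_mstep(n, sum_px, sum_pxx):
    """Returns updated weights, means and diagonal variances."""
    weights = n / n.sum()
    means = sum_px / n[:, None]
    variances = sum_pxx / n[:, None] - means ** 2
    return weights, means, variances
# (Variance flooring, as configured via set_variance_thresholds elsewhere
# in this file, would be applied on top of these raw estimates.)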
def kmeans_mstep(self, counts, force=False):
    """Performs a single M-step of the K-Means algorithm (non-parallel)"""
    old_machine_file = self.m_configuration.kmeans_intermediate_file % self.m_args.iteration
    new_machine_file = self.m_configuration.kmeans_intermediate_file % (self.m_args.iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()

        # try if there is one file containing all data
        if os.path.exists(self.m_configuration.kmeans_stats_file % (self.m_args.iteration, 0, len(training_list))):
            stats_file = self.m_configuration.kmeans_stats_file % (self.m_args.iteration, 0, len(training_list))
            # load stats file
            zeroeth, first, nsamples, dist = self.read_stats(stats_file)
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [(counts * job_id, min(counts * (job_id + 1), len(training_list))) for job_id in job_ids]
            stats_files = [self.m_configuration.kmeans_stats_file % (self.m_args.iteration, indices[0], indices[1]) for indices in job_indices]

            # read and accumulate all stats files
            zeroeth, first, nsamples, dist = self.read_stats(stats_files[0])
            for stats_file in stats_files[1:]:
                zeroeth_, first_, nsamples_, dist_ = self.read_stats(stats_file)
                zeroeth += zeroeth_
                first += first_
                nsamples += nsamples_
                dist += dist_

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # creates the KMeansTrainer
        kmeans_trainer = bob.trainer.KMeansTrainer()
        # creates the KMeansMachine
        kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(old_machine_file))
        kmeans_trainer.initialize(kmeans_machine, data)

        kmeans_trainer.zeroeth_order_statistics = zeroeth
        kmeans_trainer.first_order_statistics = first
        kmeans_trainer.average_min_distance = dist

        # performs the M-step
        kmeans_trainer.m_step(kmeans_machine, data)  # data is not used in the M-step
        utils.info("UBM training: Performed M-step %d with result %f" % (self.m_args.iteration, dist / nsamples))

        # save the K-Means model
        utils.ensure_dir(os.path.dirname(new_machine_file))
        kmeans_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        shutil.copy(new_machine_file, self.m_configuration.kmeans_file)
        utils.info("UBM training: Wrote new KMeans machine '%s'" % new_machine_file)

    if self.m_args.clean_intermediate and self.m_args.iteration > 0:
        old_file = self.m_configuration.kmeans_intermediate_file % (self.m_args.iteration - 1)
        utils.info("Removing old intermediate directory '%s'" % os.path.dirname(old_file))
        shutil.rmtree(os.path.dirname(old_file))
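# The parallel scheme above simply sums per-job sufficient statistics
# before the M-step: per-cluster counts (zeroth order), per-cluster
# coordinate sums (first order), sample counts, and the sample-weighted
# average minimum distance.  A numpy sketch of the merge and the resulting
# mean update, independent of the bob API:
import numpy

def merge_kmeans_stats(stats):
    """stats: list of (zeroeth, first, nsamples, dist) tuples, one per job."""
    zeroeth = sum(s[0] for s in stats)    # per-cluster sample counts, shape (K,)
    first = sum(s[1] for s in stats)      # per-cluster coordinate sums, shape (K, D)
    nsamples = sum(s[2] for s in stats)
    dist = sum(s[3] for s in stats)       # already weighted by nsamples per job
    new_means = first / zeroeth[:, None]  # the actual M-step update
    return new_means, dist / nsamples     # new means and average min distance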
def concatenate(self, compute_zt_norm, groups=['dev', 'eval']):
    """Concatenates all results into one (or two) score files per group."""
    for group in groups:
        utils.info("- Scoring: concatenating score files for group '%s'" % group)
        # (sorted) list of models
        model_ids = self.m_file_selector.model_ids(group)

        with open(self.m_file_selector.no_norm_result_file(group), 'w') as f:
            # concatenates the scores
            for model_id in model_ids:
                model_file = self.m_file_selector.no_norm_file(model_id, group)
                if not os.path.exists(model_file):
                    f.close()
                    os.remove(self.m_file_selector.no_norm_result_file(group))
                    raise IOError("The score file '%s' cannot be found. Aborting!" % model_file)
                with open(model_file, 'r') as res_file:
                    f.write(res_file.read())

        if compute_zt_norm:
            with open(self.m_file_selector.zt_norm_result_file(group), 'w') as f:
                # concatenates the scores
                for model_id in model_ids:
                    model_file = self.m_file_selector.zt_norm_file(model_id, group)
                    if not os.path.exists(model_file):
                        f.close()
                        os.remove(self.m_file_selector.zt_norm_result_file(group))
                        raise IOError("The score file '%s' cannot be found. Aborting!" % model_file)
                    with open(model_file, 'r') as res_file:
                        f.write(res_file.read())
def feature_normalization(self, indices, force=False):
    """Normalizes the list of features to have zero mean and unit variance (parallel)"""
    # input features and output locations; both are assumed to be resolved
    # by the file selector's training feature listing
    training_list = self.m_file_selector.training_feature_list()
    normalized_list = self.m_file_selector.training_feature_list()

    utils.info("UBM training: normalizing features from range(%d, %d)" % indices)

    # iterate through the files and normalize the features
    for index in range(indices[0], indices[1]):
        if self.m_tool_chain.__check_file__(normalized_list[index], force):
            utils.debug("Skipping file '%s'" % normalized_list[index])
        else:
            feature = bob.io.load(str(training_list[index]))
            mean, std = self.m_tool.__normalize_std_array__(feature)

            utils.ensure_dir(os.path.dirname(normalized_list[index]))
            f = bob.io.HDF5File(str(normalized_list[index]), 'w')
            f.set('mean', mean)
            f.set('std', std)
            utils.debug("Saved normalized feature '%s'" % str(normalized_list[index]))
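# What __normalize_std_array__ presumably computes (only its mean/std
# return values are used above): per-dimension statistics that map each
# feature dimension to zero mean and unit variance.  A self-contained numpy
# sketch; the function name and the zero-std guard are illustrative only.
import numpy

def normalize_std_array(feature):
    """Returns (mean, std) of a 2D feature array plus its normalized copy."""
    mean = numpy.mean(feature, axis=0)
    std = numpy.std(feature, axis=0)
    std[std == 0.] = 1.  # guard constant dimensions against division by zero
    return mean, std, (feature - mean) / std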
def __scores_c__(self, t_model_ids, group, force, preload_probes):
    """Computes C scores."""
    # probe files:
    probe_objects = self.m_file_selector.probe_objects(group)
    probe_files = self.m_file_selector.get_paths(probe_objects, 'projected' if self.m_use_projected_dir else 'features')

    # preload the probe files for a faster access (and fewer network load)
    if preload_probes:
        utils.info("- Scoring: preloading probe files of group '%s'" % group)
        # read all probe files into memory
        if self.m_file_selector.uses_probe_file_sets():
            preloaded_probes = [[self.m_tool.read_probe(str(probe_file)) for probe_file in file_set] for file_set in probe_files]
        else:
            preloaded_probes = [self.m_tool.read_probe(str(probe_file)) for probe_file in probe_files]

    utils.info("- Scoring: computing score matrix C for group '%s'" % group)

    # computes the raw scores for the T-norm models
    for t_model_id in t_model_ids:
        # test if the file is already there
        score_file = self.m_file_selector.c_file(t_model_id, group)
        if self.__check_file__(score_file, force):
            utils.warn("score file '%s' already exists." % score_file)
        else:
            t_model = self.m_tool.read_model(self.m_file_selector.t_model_file(t_model_id, group))
            if preload_probes:
                c = self.__scores_preloaded__(t_model, preloaded_probes)
            else:
                c = self.__scores__(t_model, probe_files)
            bob.io.save(c, score_file)
def kmeans_estep(self, indices, force=False):
    """Performs a single E-step of the K-Means algorithm (parallel)"""
    stats_file = self.m_configuration.kmeans_stats_file % (self.m_args.iteration, indices[0], indices[1])

    if self.m_tool_chain.__check_file__(stats_file, force, 1000):
        utils.info("UBM training: Skipping KMeans E-Step since the file '%s' already exists" % stats_file)
    else:
        training_list = self.m_file_selector.training_feature_list()
        machine_file = self.m_configuration.kmeans_intermediate_file % self.m_args.iteration
        kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(machine_file))

        utils.info("UBM training: KMeans E-Step from range(%d, %d)" % indices)

        # read data
        data = numpy.vstack([bob.io.load(str(training_list[index])) for index in range(indices[0], indices[1])])

        kmeans_trainer = bob.trainer.KMeansTrainer()
        # temporary KMeans machine required for trainer initialization
        t = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])
        kmeans_trainer.initialize(t, data)

        # performs the E-step
        kmeans_trainer.e_step(kmeans_machine, data)

        # write results to file
        dist = numpy.array([kmeans_trainer.average_min_distance])
        nsamples = numpy.array([indices[1] - indices[0]], dtype=numpy.float64)

        utils.ensure_dir(os.path.dirname(stats_file))
        f = bob.io.HDF5File(stats_file, 'w')
        f.set('zeros', kmeans_trainer.zeroeth_order_statistics)
        f.set('first', kmeans_trainer.first_order_statistics)
        f.set('dist', dist * nsamples)
        f.set('nsamples', nsamples)
        utils.info("UBM training: Wrote stats file '%s'" % stats_file)
def gmm_initialize(self, force=False):
    """Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel). This might require a lot of memory."""
    output_file = self.m_configuration.gmm_intermediate_file % 0

    if self.m_tool_chain.__check_file__(output_file, force, 800):
        utils.info("UBM Training: Skipping GMM initialization since '%s' already exists" % output_file)
    else:
        training_list = self.m_file_selector.training_feature_list()
        utils.info("UBM Training: Initializing GMM")

        # load KMeans machine
        kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(self.m_configuration.kmeans_file))

        # read features
        data = numpy.vstack([bob.io.load(str(training_list[index])) for index in utils.quasi_random_indices(len(training_list), self.m_args.limit_training_examples)])

        # create initial GMM machine
        gmm_machine = bob.machine.GMMMachine(self.m_tool.m_gaussians, data.shape[1])
        [variances, weights] = kmeans_machine.get_variances_and_weights_for_each_cluster(data)

        # initializes the GMM
        gmm_machine.means = kmeans_machine.means
        gmm_machine.variances = variances
        gmm_machine.weights = weights
        gmm_machine.set_variance_thresholds(self.m_tool.m_variance_threshold)

        utils.ensure_dir(os.path.dirname(output_file))
        gmm_machine.save(bob.io.HDF5File(output_file, 'w'))
        utils.info("UBM Training: Wrote GMM file '%s'" % output_file)
def kmeans_initialize(self, force=False):
    """Initializes the K-Means training (non-parallel)."""
    output_file = self.m_configuration.kmeans_intermediate_file % 0

    if self.m_tool_chain.__check_file__(output_file, force, 1000):
        utils.info("UBM training: Skipping KMeans initialization since the file '%s' already exists" % output_file)
    else:
        # read data
        utils.info("UBM training: initializing kmeans")
        training_list = self.m_file_selector.training_feature_list()
        data = numpy.vstack([bob.io.load(str(training_list[index])) for index in utils.quasi_random_indices(len(training_list), self.m_args.limit_training_examples)])

        # perform KMeans initialization
        kmeans_machine = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])

        # creates the KMeansTrainer and calls the initialization procedure
        kmeans_trainer = bob.trainer.KMeansTrainer()
        kmeans_trainer.initialize(kmeans_machine, data)
        utils.ensure_dir(os.path.dirname(output_file))
        kmeans_machine.save(bob.io.HDF5File(output_file, 'w'))
        utils.info("UBM training: Saved initial KMeans machine to '%s'" % output_file)
def preprocess_data(self, preprocessor, indices=None, force=False):
    """Preprocesses the original data with the given preprocessor."""
    # get the file lists
    data_files = self.m_file_selector.original_data_list()
    preprocessed_data_files = self.m_file_selector.preprocessed_data_list()

    # select a subset of keys to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Preprocessing: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(data_files))

    utils.ensure_dir(self.m_file_selector.preprocessed_directory)
    utils.info("- Preprocessing: processing %d data files from directory '%s' to directory '%s'" % (len(index_range), self.m_file_selector.m_database.original_directory, self.m_file_selector.preprocessed_directory))

    # read annotation files
    annotation_list = self.m_file_selector.annotation_list()

    for i in index_range:
        preprocessed_data_file = preprocessed_data_files[i]

        if not self.__check_file__(preprocessed_data_file, force):
            data = preprocessor.read_original_data(str(data_files[i]))

            # get the annotations; might be None
            annotations = self.m_file_selector.get_annotations(annotation_list[i])

            # call the preprocessor
            preprocessed_data = preprocessor(data, annotations)
            utils.ensure_dir(os.path.dirname(preprocessed_data_file))
            preprocessor.save_data(preprocessed_data, str(preprocessed_data_file))
def __scores_b__(self, model_ids, group, force, preload_probes):
    """Computes B scores."""
    # probe files:
    z_probe_objects = self.m_file_selector.z_probe_objects(group)
    z_probe_files = self.m_file_selector.get_paths(z_probe_objects, 'projected' if self.m_use_projected_dir else 'features')

    # preload the probe files for a faster access (and fewer network load)
    if preload_probes:
        utils.info("- Scoring: preloading Z-probe files of group '%s'" % group)
        # read all probe files into memory
        if self.m_file_selector.uses_probe_file_sets():
            preloaded_z_probes = [[self.m_tool.read_probe(str(z_probe_file)) for z_probe_file in file_set] for file_set in z_probe_files]
        else:
            preloaded_z_probes = [self.m_tool.read_probe(str(z_probe_file)) for z_probe_file in z_probe_files]

    utils.info("- Scoring: computing score matrix B for group '%s'" % group)

    # loads the models
    for model_id in model_ids:
        # test if the file is already there
        score_file = self.m_file_selector.b_file(model_id, group)
        if self.__check_file__(score_file, force):
            utils.warn("score file '%s' already exists." % score_file)
        else:
            model = self.m_tool.read_model(self.m_file_selector.model_file(model_id, group))
            if preload_probes:
                b = self.__scores_preloaded__(model, preloaded_z_probes)
            else:
                b = self.__scores__(model, z_probe_files)
            bob.io.save(b, score_file)
def train_extractor(self, extractor, preprocessor, force=False):
    """Trains the feature extractor using preprocessed data of the 'world' set, if the feature extractor requires training."""
    if extractor.requires_training:
        extractor_file = self.m_file_selector.extractor_file

        if self.__check_file__(extractor_file, force, 1000):
            utils.info("- Extraction: extractor '%s' already exists." % extractor_file)
        else:
            utils.ensure_dir(os.path.dirname(extractor_file))
            # read training files
            if extractor.split_training_data_by_client:
                train_files = self.m_file_selector.training_list('preprocessed', 'train_extractor', arrange_by_client=True)
                train_data = self.__read_data_by_client__(train_files, preprocessor)
                utils.info("- Extraction: training extractor '%s' using %d identities" % (extractor_file, len(train_files)))
            else:
                train_files = self.m_file_selector.training_list('preprocessed', 'train_extractor')
                train_data = self.__read_data__(train_files, preprocessor)
                utils.info("- Extraction: training extractor '%s' using %d training files" % (extractor_file, len(train_files)))
            # train the extractor
            extractor.train(train_data, extractor_file, train_files)
def gmm_estep(self, indices, force=False):
    """Performs a single E-step of the GMM training (parallel)."""
    stats_file = self.m_configuration.gmm_stats_file % (self.m_args.iteration, indices[0], indices[1])

    if self.m_tool_chain.__check_file__(stats_file, force, 1000):
        utils.info("UBM training: Skipping GMM E-Step since the file '%s' already exists" % stats_file)
    else:
        training_list = self.m_file_selector.training_feature_list()
        machine_file = self.m_configuration.gmm_intermediate_file % self.m_args.iteration
        gmm_machine = bob.machine.GMMMachine(bob.io.HDF5File(machine_file))

        utils.info("UBM training: GMM E-Step from range(%d, %d)" % indices)

        # read data
        data = numpy.vstack([bob.io.load(str(training_list[index])) for index in range(indices[0], indices[1])])

        gmm_trainer = bob.trainer.ML_GMMTrainer(self.m_tool.m_update_means, self.m_tool.m_update_variances, self.m_tool.m_update_weights)
        gmm_trainer.responsibilities_threshold = self.m_tool.m_responsibility_threshold
        gmm_trainer.initialize(gmm_machine, data)

        # calls the E-step and extracts the GMM statistics
        gmm_trainer.e_step(gmm_machine, data)
        gmm_stats = gmm_trainer.gmm_statistics

        # saves the GMM statistics to file
        utils.ensure_dir(os.path.dirname(stats_file))
        gmm_stats.save(bob.io.HDF5File(stats_file, 'w'))
        utils.info("UBM training: Wrote GMM stats '%s'" % stats_file)
def train_projector(self, tool, extractor, force=False):
    """Trains the feature projector with the extracted features of the world group."""
    if tool.requires_projector_training:
        projector_file = self.m_file_selector.projector_file

        if self.__check_file__(projector_file, force, 1000):
            utils.info("- Projection: projector '%s' already exists." % projector_file)
        else:
            utils.ensure_dir(os.path.dirname(projector_file))
            # train projector
            if tool.split_training_features_by_client:
                train_files = self.m_file_selector.training_list('features', 'train_projector', arrange_by_client=True)
                train_features = self.__read_features_by_client__(train_files, extractor)
                utils.info("- Projection: training projector '%s' using %d identities" % (projector_file, len(train_files)))
            else:
                train_files = self.m_file_selector.training_list('features', 'train_projector')
                train_features = self.__read_features__(train_files, extractor)
                utils.info("- Projection: training projector '%s' using %d training files" % (projector_file, len(train_files)))
            # perform training
            tool.train_projector(train_features, str(projector_file))
def zt_norm(self, groups=['dev', 'eval']):
    """Computes ZT-Norm using the previously generated A, B, C, and D files"""
    for group in groups:
        utils.info("- Scoring: computing ZT-norm for group '%s'" % group)
        # list of models
        model_ids = self.m_file_selector.model_ids(group)
        t_model_ids = self.m_file_selector.t_model_ids(group)

        # first, normalize the C and D scores
        self.__scores_c_normalize__(model_ids, t_model_ids, group)
        self.__scores_d_normalize__(t_model_ids, group)

        # load the D matrices only once
        d = bob.io.load(self.m_file_selector.d_matrix_file(group))
        d_same_value = bob.io.load(self.m_file_selector.d_same_value_matrix_file(group)).astype(bool)

        # loops over the model ids
        for model_id in model_ids:
            # loads probe files to get information about the type of access
            probe_objects = self.m_file_selector.probe_objects_for_model(model_id, group)

            # loads A, B, and C matrices for the current model id
            a = bob.io.load(self.m_file_selector.a_file(model_id, group))
            b = bob.io.load(self.m_file_selector.b_file(model_id, group))
            c = bob.io.load(self.m_file_selector.c_file_for_model(model_id, group))

            # compute ZT scores
            zt_scores = bob.machine.ztnorm(a, b, c, d, d_same_value)

            # saves to text file
            self.__save_scores__(self.m_file_selector.zt_norm_file(model_id, group), zt_scores, probe_objects, self.m_file_selector.client_id(model_id))
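# For reference, the score normalization that bob.machine.ztnorm performs,
# in simplified numpy form: Z-norm scales each model's raw scores (A) by
# that model's impostor statistics (B); T-norm then scales each probe
# column by cohort statistics derived from C, which are themselves
# Z-normalized with D.  This sketch ignores the d_same_value mask that the
# real implementation uses to exclude same-client Z-probes.
import numpy

def zt_norm_sketch(a, b, c, d):
    """a: (models, probes), b: (models, z-probes), c: (t-models, probes), d: (t-models, z-probes)."""
    z = (a - b.mean(axis=1, keepdims=True)) / b.std(axis=1, keepdims=True)
    c_norm = (c - d.mean(axis=1, keepdims=True)) / d.std(axis=1, keepdims=True)
    return (z - c_norm.mean(axis=0)) / c_norm.std(axis=0)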
def __train_pca__(self, feature_space):
    """Generates the PCA projection matrix"""
    # Initializes the data to apply PCA on.
    data_list = []
    for client in feature_space:
        for feature in client:
            data_list.append(feature)
    data = numpy.vstack(data_list)
    del data_list

    utils.info("  -> Training LinearMachine using PCA")

    # Training.
    t = bob.trainer.PCATrainer()
    machine, variances = t.train(data)
    del data

    # Compute variance percentage, if desired; a float subspace dimension is
    # interpreted as the fraction of total variance to keep.
    if isinstance(self.m_subspace_dim, float):
        cummulated = numpy.cumsum(variances) / numpy.sum(variances)
        for index in range(len(cummulated)):
            if cummulated[index] > self.m_subspace_dim:
                break
        self.m_subspace_dim = index
        del cummulated
        utils.info("  ... Keeping %d PCA dimensions" % self.m_subspace_dim)

    # Re-shape machine.
    machine.resize(machine.shape[0], self.m_subspace_dim)
    variances.resize(self.m_subspace_dim)

    # Return machine.
    return machine, variances
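# How the float subspace dimension above is resolved: keep the smallest
# number of leading PCA components whose cumulative variance ratio exceeds
# the requested fraction.  A self-contained numpy sketch of the same rule:
import numpy

def pca_dim_for_variance(variances, fraction):
    """variances: PCA eigenvalues sorted in decreasing order."""
    cumulated = numpy.cumsum(variances) / numpy.sum(variances)
    for index in range(len(cumulated)):
        if cumulated[index] > fraction:
            return index
    return len(variances)

# cumulative ratios are [0.533, 0.8, 0.933, 1.0], so 0.933 > 0.8 at index 2
print(pca_dim_for_variance(numpy.array([4., 2., 1., 0.5]), 0.8))  # -> 2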
def face_verify(args, command_line_parameters, external_dependencies=[], external_fake_job_id=0):
    """This is the main entry point for computing face verification experiments. You just have to specify configurations for any of the steps of the toolchain, which are:
    -- the database
    -- the preprocessing
    -- feature extraction
    -- the recognition tool
    -- and the grid configuration (in case the function should be executed in the grid).
    Additionally, you can skip parts of the toolchain by selecting proper --skip-... parameters. If your probe files are not too big, you can also specify the --preload-probes switch to speed up the score computation. If files should be re-generated, please specify the --force option (might be combined with the --skip-... options)."""

    # generate tool chain executor
    executor = ToolChainExecutorZT(args)

    # as the main entry point, check whether the grid option was given
    if not args.grid:
        if args.timer is not None and not len(args.timer):
            args.timer = ('real', 'system', 'user')
        # not in a grid, use default tool chain sequentially
        if args.timer:
            utils.info("- Timer: Starting timer")
            start_time = os.times()

        executor.write_info(command_line_parameters)
        executor.execute_tool_chain()

        if args.timer:
            end_time = os.times()
            utils.info("- Timer: Stopped timer")
            for t in args.timer:
                index = {'real': 4, 'system': 1, 'user': 0}[t]
                print("Elapsed %s time: %f seconds" % (t, end_time[index] - start_time[index]))
        return {}

    elif args.sub_task:
        # execute the desired sub-task
        executor.execute_grid_job()
        return {}
    else:
        # no other parameter given, so deploy new jobs

        # get the name of this file
        this_file = __file__
        if this_file[-1] == 'c':
            this_file = this_file[0:-1]

        executor.write_info(command_line_parameters)

        # initialize the executor to submit the jobs to the grid
        executor.set_common_parameters(calling_file=this_file, parameters=command_line_parameters, fake_job_id=external_fake_job_id)

        # add the jobs
        job_ids = executor.add_jobs_to_grid(external_dependencies)
        if executor.m_grid.is_local():
            # start the local jman daemon
            executor.execute_local_deamon()
            return {}
        else:
            return job_ids
def compute_scores(self, tool, compute_zt_norm, force=False, indices=None, groups=['dev', 'eval'], types=['A', 'B', 'C', 'D'], preload_probes=False):
    """Computes the scores for the given groups (by default 'dev' and 'eval')."""
    # save tool for internal use
    self.m_tool = tool
    self.m_use_projected_dir = hasattr(tool, 'project')

    # load the projector and the enroller, if needed
    tool.load_projector(self.m_file_selector.projector_file)
    tool.load_enroller(self.m_file_selector.enroller_file)

    for group in groups:
        # get model ids
        model_ids = self.m_file_selector.model_ids(group)
        if compute_zt_norm:
            t_model_ids = self.m_file_selector.t_model_ids(group)

        # compute A scores
        if 'A' in types:
            if indices is not None:
                model_ids_short = model_ids[indices[0]:indices[1]]
                utils.info("- Scoring: splitting of index range %s" % str(indices))
            else:
                model_ids_short = model_ids

            # we need to time this
            timer = ('real', 'system', 'user')
            utils.info("- Timer: Starting timer")
            start_time = os.times()

            # use the inverted index if requested by the algorithm
            if self.m_tool.requires_inverted_indexing:
                self.__scores_inverted__(model_ids_short, group, compute_zt_norm, force, preload_probes)
            else:
                self.__scores_a__(model_ids_short, group, compute_zt_norm, force, preload_probes)

            end_time = os.times()
            utils.info("- Timer: Stopped timer")
            for t in timer:
                index = {'real': 4, 'system': 1, 'user': 0}[t]
                print("Elapsed %s time: %f seconds" % (t, end_time[index] - start_time[index]))

        if compute_zt_norm:
            # compute B scores
            if 'B' in types:
                if indices is not None:
                    model_ids_short = model_ids[indices[0]:indices[1]]
                    utils.info("- Scoring: splitting of index range %s" % str(indices))
                else:
                    model_ids_short = model_ids
                self.__scores_b__(model_ids_short, group, force, preload_probes)

            # compute C scores
            if 'C' in types:
                if indices is not None:
                    t_model_ids_short = t_model_ids[indices[0]:indices[1]]
                    utils.info("- Scoring: splitting of index range %s" % str(indices))
                else:
                    t_model_ids_short = t_model_ids
                self.__scores_c__(t_model_ids_short, group, force, preload_probes)

            # compute D scores
            if 'D' in types:
                if indices is not None:
                    t_model_ids_short = t_model_ids[indices[0]:indices[1]]
                    utils.info("- Scoring: splitting of index range %s" % str(indices))
                else:
                    t_model_ids_short = t_model_ids
                self.__scores_d__(t_model_ids_short, group, force, preload_probes)
def main(command_line_parameters=None):
    """Reads score files, computes error measures and plots curves."""
    args = command_line_arguments(command_line_parameters)

    # get some colors for plotting
    cmap = mpl.cm.get_cmap(name='hsv')
    colors = [cmap(i) for i in numpy.linspace(0, 1.0, len(args.dev_files) + 1)]

    score_parser = {'4column': bob.measure.load.split_four_column, '5column': bob.measure.load.split_five_column}[args.parser]
    ids_parser = {'4column': bob.measure.load.four_column, '5column': bob.measure.load.five_column}[args.parser]

    # First, read the score files
    utils.info("Loading %d score files of the development set" % len(args.dev_files))
    scores_dev = [score_parser(os.path.join(args.directory, f)) for f in args.dev_files]
    ids_dev = [ids_parser(os.path.join(args.directory, f)) for f in args.dev_files]

    # append a target (1) / non-target (0) flag to each score line
    id_dev = []
    for i in ids_dev[0]:
        if i[0] == i[1]:
            id_dev.append(i + (1,))
        else:
            id_dev.append(i + (0,))
    ids_dev = numpy.array(id_dev)
    if args.norm == 'norm':
        ids_dev[:, 3] = ids_dev[:, 3].astype(numpy.float64) / max(ids_dev[:, 3].astype(numpy.float64))

    if args.eval_files:
        utils.info("Loading %d score files of the evaluation set" % len(args.eval_files))
        scores_eval = [score_parser(os.path.join(args.directory, f)) for f in args.eval_files]
        ids_eval = [ids_parser(os.path.join(args.directory, f)) for f in args.eval_files]
        id_eval = []
        for i in ids_eval[0]:
            if i[0] == i[1]:
                id_eval.append(i + (1,))
            else:
                id_eval.append(i + (0,))
        ids_eval = numpy.array(id_eval)
        if args.norm == 'norm':
            ids_eval[:, 3] = ids_eval[:, 3].astype(numpy.float64) / max(ids_eval[:, 3].astype(numpy.float64))

    if args.criterion:
        utils.info("Computing %s on the development " % args.criterion + ("and HTER on the evaluation set" if args.eval_files else "set"))

    pdf = PdfPages(args.pdf)
    for i in range(len(scores_dev)):
        totalModels = numpy.unique(ids_dev[:, 1])
        eer_mean, figure = _plot_scores((25, 10), args, totalModels, ids_dev, 'development', "Fauna graph for development set")
        pdf.savefig(figure)
        # Plot the EER per model
        bob.io.base.save(numpy.array(eer_mean), 'eer_per_model.mat')
        pdf.savefig(_plot_eer((25, 10), eer_mean, "EER per model curve for development set"))
        # Plot the scores histogram
        scoresTarget = ids_dev[ids_dev[:, 4] == '1', 3].astype(numpy.float64)
        scoresNonTarget = ids_dev[ids_dev[:, 4] == '0', 3].astype(numpy.float64)
        pdf.savefig(_plot_scores_hist((25, 10), scoresTarget, scoresNonTarget, "Scores histogram for development set"))

    if args.eval_files:
        for i in range(len(scores_eval)):
            totalModels = numpy.unique(ids_eval[:, 1])
            eer_mean, figure = _plot_scores((25, 10), args, totalModels, ids_eval, 'evaluation', "Fauna graph for evaluation set")
            pdf.savefig(figure)
            # Plot the EER per model
            pdf.savefig(_plot_eer((25, 10), eer_mean, "EER per model curve for evaluation set"))
            # Plot the scores histogram
            scoresTarget = ids_eval[ids_eval[:, 4] == '1', 3].astype(numpy.float64)
            scoresNonTarget = ids_eval[ids_eval[:, 4] == '0', 3].astype(numpy.float64)
            pdf.savefig(_plot_scores_hist((25, 10), scoresTarget, scoresNonTarget, "Scores histogram for evaluation set"))

    pdf.close()
def main(command_line_parameters=None):
    """Reads score files, computes error measures and plots curves."""
    args = command_line_arguments(command_line_parameters)

    # get some colors for plotting
    cmap = mpl.cm.get_cmap(name='hsv')
    colors = [cmap(i) for i in numpy.linspace(0, 1.0, len(args.dev_files) + 1)]

    if args.criterion or args.roc or args.det or args.cllr or args.mindcf:
        score_parser = {'4column': bob.measure.load.split_four_column, '5column': bob.measure.load.split_five_column}[args.parser]

        # First, read the score files
        utils.info("Loading %d score files of the development set" % len(args.dev_files))
        scores_dev = [score_parser(os.path.join(args.directory, f)) for f in args.dev_files]

        if args.eval_files:
            utils.info("Loading %d score files of the evaluation set" % len(args.eval_files))
            scores_eval = [score_parser(os.path.join(args.directory, f)) for f in args.eval_files]

        if args.criterion:
            utils.info("Computing %s on the development " % args.criterion + ("and HTER on the evaluation set" if args.eval_files else "set"))
            for i in range(len(scores_dev)):
                # compute threshold on development set
                threshold = {'EER': bob.measure.eer_threshold, 'HTER': bob.measure.min_hter_threshold}[args.criterion](scores_dev[i][0], scores_dev[i][1])
                # apply threshold to development set
                far, frr = bob.measure.farfrr(scores_dev[i][0], scores_dev[i][1], threshold)
                print("The %s of the development set of '%s' is %2.3f%%" % (args.criterion, args.legends[i] if args.legends else args.dev_files[i], (far + frr) * 50.))  # (far + frr) / 2 * 100%
                if args.eval_files:
                    # apply threshold to evaluation set
                    far, frr = bob.measure.farfrr(scores_eval[i][0], scores_eval[i][1], threshold)
                    print("The HTER of the evaluation set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.dev_files[i], (far + frr) * 50.))  # (far + frr) / 2 * 100%

        if args.mindcf:
            utils.info("Computing minDCF on the development " + ("and on the evaluation set" if args.eval_files else "set"))
            for i in range(len(scores_dev)):
                # compute threshold on development set
                threshold = bob.measure.min_weighted_error_rate_threshold(scores_dev[i][0], scores_dev[i][1], args.cost)
                # apply threshold to development set
                far, frr = bob.measure.farfrr(scores_dev[i][0], scores_dev[i][1], threshold)
                print("The minDCF of the development set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.dev_files[i], (args.cost * far + (1 - args.cost) * frr) * 100.))
                if args.eval_files:
                    # compute threshold on evaluation set
                    threshold = bob.measure.min_weighted_error_rate_threshold(scores_eval[i][0], scores_eval[i][1], args.cost)
                    # apply threshold to evaluation set
                    far, frr = bob.measure.farfrr(scores_eval[i][0], scores_eval[i][1], threshold)
                    print("The minDCF of the evaluation set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.eval_files[i], (args.cost * far + (1 - args.cost) * frr) * 100.))

        if args.cllr:
            utils.info("Computing Cllr and minCllr on the development " + ("and on the evaluation set" if args.eval_files else "set"))
            for i in range(len(scores_dev)):
                cllr = bob.measure.calibration.cllr(scores_dev[i][0], scores_dev[i][1])
                min_cllr = bob.measure.calibration.min_cllr(scores_dev[i][0], scores_dev[i][1])
                print("Calibration performance on development set of '%s' is Cllr %1.5f and minCllr %1.5f" % (args.legends[i], cllr, min_cllr))
                if args.eval_files:
                    cllr = bob.measure.calibration.cllr(scores_eval[i][0], scores_eval[i][1])
                    min_cllr = bob.measure.calibration.min_cllr(scores_eval[i][0], scores_eval[i][1])
                    print("Calibration performance on evaluation set of '%s' is Cllr %1.5f and minCllr %1.5f" % (args.legends[i], cllr, min_cllr))

        if args.roc:
            utils.info("Computing CAR curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
            fars = [math.pow(10., i * 0.25) for i in range(-16, 0)] + [1.]
            frrs_dev = [bob.measure.roc_for_far(scores[0], scores[1], fars) for scores in scores_dev]
            if args.eval_files:
                frrs_eval = [bob.measure.roc_for_far(scores[0], scores[1], fars) for scores in scores_eval]

            utils.info("Plotting ROC curves to file '%s'" % args.roc)
            # create a multi-page PDF for the ROC curve
            pdf = PdfPages(args.roc)
            # create a separate figure for dev and eval
            pdf.savefig(_plot_roc(frrs_dev, colors, args.legends if args.legends else args.dev_files, "ROC curve for development set"))
            del frrs_dev
            if args.eval_files:
                pdf.savefig(_plot_roc(frrs_eval, colors, args.legends if args.legends else args.eval_files, "ROC curve for evaluation set"))
                del frrs_eval
            pdf.close()

        if args.det:
            utils.info("Computing DET curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
            dets_dev = [bob.measure.det(scores[0], scores[1], 1000) for scores in scores_dev]
            if args.eval_files:
                dets_eval = [bob.measure.det(scores[0], scores[1], 1000) for scores in scores_eval]

            utils.info("Plotting DET curves to file '%s'" % args.det)
            # create a multi-page PDF for the DET plot
            pdf = PdfPages(args.det)
            # create a separate figure for dev and eval
            pdf.savefig(_plot_det(dets_dev, colors, args.legends if args.legends else args.dev_files, "DET plot for development set"))
            del dets_dev
            if args.eval_files:
                pdf.savefig(_plot_det(dets_eval, colors, args.legends if args.legends else args.eval_files, "DET plot for evaluation set"))
                del dets_eval
            pdf.close()

    if args.cmc:
        utils.info("Computing CMC curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
        cmc_parser = {'4column': bob.measure.load.cmc_four_column, '5column': bob.measure.load.cmc_five_column}[args.parser]
        cmcs_dev = [cmc_parser(os.path.join(args.directory, f)) for f in args.dev_files]
        if args.eval_files:
            cmcs_eval = [cmc_parser(os.path.join(args.directory, f)) for f in args.eval_files]

        utils.info("Plotting CMC curves to file '%s'" % args.cmc)
        # create a multi-page PDF for the CMC curve
        pdf = PdfPages(args.cmc)
        # create a separate figure for dev and eval
        pdf.savefig(_plot_cmc(cmcs_dev, colors, args.legends if args.legends else args.dev_files, "CMC curve for development set"))
        if args.eval_files:
            pdf.savefig(_plot_cmc(cmcs_eval, colors, args.legends if args.legends else args.eval_files, "CMC curve for evaluation set"))
        pdf.close()
def enroll_models(self, tool, extractor, compute_zt_norm, indices=None, groups=['dev', 'eval'], types=['N', 'T'], force=False):
    """Enrolls the models for the 'dev' and 'eval' groups, for both models and T-norm models. This function uses the extracted or projected features to compute the models, depending on your setup of the base class Tool."""
    # read the projector file, if needed
    tool.load_projector(self.m_file_selector.projector_file)
    # read the model enrollment file
    tool.load_enroller(self.m_file_selector.enroller_file)

    # which tool to use to read the features...
    reader = tool if tool.use_projected_features_for_enrollment else extractor

    # create models
    if 'N' in types:
        for group in groups:
            model_ids = self.m_file_selector.model_ids(group)

            if indices is not None:
                model_ids = model_ids[indices[0]:indices[1]]
                utils.info("- Enrollment: splitting of index range %s" % str(indices))

            utils.info("- Enrollment: enrolling models of group '%s'" % group)
            for model_id in model_ids:
                # path to the model
                model_file = self.m_file_selector.model_file(model_id, group)

                # skip the model if it already exists (or remove it, if forced)
                if not self.__check_file__(model_file, force):
                    enroll_files = self.m_file_selector.enroll_files(model_id, group, 'projected' if tool.use_projected_features_for_enrollment else 'features')

                    # load all files into memory
                    enroll_features = [reader.read_feature(str(enroll_file)) for enroll_file in enroll_files]
                    model = tool.enroll(enroll_features)

                    # save the model
                    utils.ensure_dir(os.path.dirname(model_file))
                    tool.save_model(model, str(model_file))

    # T-norm models
    if 'T' in types and compute_zt_norm:
        for group in groups:
            t_model_ids = self.m_file_selector.t_model_ids(group)

            if indices is not None:
                t_model_ids = t_model_ids[indices[0]:indices[1]]
                utils.info("- Enrollment: splitting of index range %s" % str(indices))

            utils.info("- Enrollment: enrolling T-models of group '%s'" % group)
            for t_model_id in t_model_ids:
                # path to the model
                t_model_file = self.m_file_selector.t_model_file(t_model_id, group)

                # skip the model if it already exists (or remove it, if forced)
                if not self.__check_file__(t_model_file, force):
                    t_enroll_files = self.m_file_selector.t_enroll_files(t_model_id, group, 'projected' if tool.use_projected_features_for_enrollment else 'features')

                    # load all files into memory
                    t_enroll_features = [reader.read_feature(str(t_enroll_file)) for t_enroll_file in t_enroll_files]
                    t_model = tool.enroll(t_enroll_features)

                    # save model
                    utils.ensure_dir(os.path.dirname(t_model_file))
                    tool.save_model(t_model, str(t_model_file))
def average_results(self):
    """Iterates over all the folds of the current view and computes the average result"""
    utils.info(" - Scoring: Averaging results of views %s" % self.m_args.views)
    if not self.m_args.dry_run:
        result_file = open(self.m_configuration.result_file, 'w')

    if 'view1' in self.m_args.views:
        if self.m_args.dry_run:
            print("Would have averaged the results from view1 ...")
        else:
            # process the single result of view 1
            # HACK... overwrite the score directory of the file selector to get the right result file
            self.m_file_selector.score_directories = (self.__scores_directory__('view1'),)
            res_file = self.m_file_selector.no_norm_result_file('dev')

            negatives, positives = bob.measure.load.split_four_column(res_file)
            threshold = bob.measure.eer_threshold(negatives, positives)

            far, frr = bob.measure.farfrr(negatives, positives, threshold)
            hter = (far + frr) / 2.0

            result_file.write("On view1 (dev set only):\n\nFAR = %.3f;\tFRR = %.3f;\tHTER = %.3f;\tthreshold = %.3f\n" % (far, frr, hter, threshold))
            result_file.write("Classification success: %.2f%%\n\n" % (self.__classification_result__(negatives, positives, threshold) * 100.))

    if 'view2' in self.m_args.views:
        if self.m_args.dry_run:
            print("Would have averaged the results from view2 ...")
        else:
            result_file.write("On view2 (eval set only):\n\n")
            # iterate over all folds of view 2
            errors = numpy.ndarray((10,), numpy.float64)
            for f in range(1, 11):
                # HACK... overwrite the score directory of the file selector to get the right result file
                self.m_file_selector.score_directories = (self.__scores_directory__('fold%d' % f),)
                dev_res_file = self.m_file_selector.no_norm_result_file('dev')
                eval_res_file = self.m_file_selector.no_norm_result_file('eval')

                # compute threshold on dev data
                dev_negatives, dev_positives = bob.measure.load.split_four_column(dev_res_file)
                threshold = bob.measure.eer_threshold(dev_negatives, dev_positives)

                # compute FAR and FRR for eval data
                eval_negatives, eval_positives = bob.measure.load.split_four_column(eval_res_file)
                far, frr = bob.measure.farfrr(eval_negatives, eval_positives, threshold)
                hter = (far + frr) / 2.0

                result_file.write("On fold%d:\n\nFAR = %.3f;\tFRR = %.3f;\tHTER = %.3f;\tthreshold = %.3f\n" % (f, far, frr, hter, threshold))
                result = self.__classification_result__(eval_negatives, eval_positives, threshold)
                result_file.write("Classification success: %.2f%%\n\n" % (result * 100.))
                errors[f - 1] = result

            # compute mean and std error
            mean = numpy.mean(errors)
            std = numpy.std(errors)
            result_file.write("\nOverall classification success: %f (with standard deviation %f)\n" % (mean, std))

    if not self.m_args.dry_run:
        result_file.close()
def __scores_a__(self, model_ids, group, compute_zt_norm, force, preload_probes):
  """Computes the A scores. For non-ZT-norm, these are the only scores that are actually computed."""
  # preload the probe files for faster access (and lower network load)
  if preload_probes:
    utils.info("- Scoring: preloading probe files of group '%s'" % group)
    all_probe_objects = self.m_file_selector.probe_objects(group)
    all_probe_files = self.m_file_selector.get_paths(all_probe_objects, 'projected' if self.m_use_projected_dir else 'features')
    # read all probe files into memory
    if self.m_file_selector.uses_probe_file_sets():
      all_preloaded_probes = [[self.m_tool.read_probe(str(probe_file)) for probe_file in file_set] for file_set in all_probe_files]
    else:
      all_preloaded_probes = [self.m_tool.read_probe(str(probe_file)) for probe_file in all_probe_files]

  if compute_zt_norm:
    utils.info("- Scoring: computing score matrix A for group '%s'" % group)
  else:
    utils.info("- Scoring: computing scores for group '%s'" % group)

  # compute the raw scores for each model
  for model_id in model_ids:
    # test if the score file already exists
    score_file = self.m_file_selector.a_file(model_id, group) if compute_zt_norm else self.m_file_selector.no_norm_file(model_id, group)
    if self.__check_file__(score_file, force):
      utils.warn("score file '%s' already exists." % score_file)
    else:
      # get the probe split for this model
      current_probe_objects = self.m_file_selector.probe_objects_for_model(model_id, group)
      model = self.m_tool.read_model(self.m_file_selector.model_file(model_id, group))
      if preload_probes:
        # select the probes for this model from all preloaded probes
        current_preloaded_probes = self.__probe_split__(current_probe_objects, all_probe_objects, all_preloaded_probes)
        # compute the A matrix
        a = self.__scores_preloaded__(model, current_preloaded_probes)
      else:
        current_probe_files = self.m_file_selector.get_paths(current_probe_objects, 'projected' if self.m_use_projected_dir else 'features')
        a = self.__scores__(model, current_probe_files)

      if compute_zt_norm:
        # write the A matrix only if the ZT norm is computed afterwards
        bob.io.save(a, self.m_file_selector.a_file(model_id, group))

      # save the scores to a text file
      self.__save_scores__(self.m_file_selector.no_norm_file(model_id, group), a, current_probe_objects, self.m_file_selector.client_id(model_id))
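# The raw scoring loop itself (__scores__) is not shown in this section; the
# following is only a sketch of what it plausibly does, assuming the tool
# exposes a score(model, probe) method alongside the read_model/read_probe
# methods used above:
def __scores_sketch__(tool, model, probe_files):
  """Returns a 1 x N matrix of scores of one model against each probe file."""
  a = numpy.ndarray((1, len(probe_files)), numpy.float64)
  for i, probe_file in enumerate(probe_files):
    probe = tool.read_probe(str(probe_file))
    a[0, i] = tool.score(model, probe)
  return a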
def main(command_line_parameters=None):
  """Reads score files, computes error measures and plots curves."""
  args = command_line_arguments(command_line_parameters)

  # get some colors for plotting
  cmap = mpl.cm.get_cmap(name='hsv')
  colors = [cmap(i) for i in numpy.linspace(0, 1.0, len(args.dev_files) + 1)]

  if args.criterion or args.roc or args.det or args.cllr or args.mindcf:
    score_parser = {'4column': bob.measure.load.split_four_column, '5column': bob.measure.load.split_five_column}[args.parser]

    # first, read the score files
    utils.info("Loading %d score files of the development set" % len(args.dev_files))
    scores_dev = [score_parser(os.path.join(args.directory, f)) for f in args.dev_files]

    if args.eval_files:
      utils.info("Loading %d score files of the evaluation set" % len(args.eval_files))
      scores_eval = [score_parser(os.path.join(args.directory, f)) for f in args.eval_files]

    if args.criterion:
      utils.info("Computing %s on the development " % args.criterion + ("and HTER on the evaluation set" if args.eval_files else "set"))
      for i in range(len(scores_dev)):
        # compute the threshold on the development set
        threshold = {'EER': bob.measure.eer_threshold, 'HTER': bob.measure.min_hter_threshold}[args.criterion](scores_dev[i][0], scores_dev[i][1])
        # apply the threshold to the development set
        far, frr = bob.measure.farfrr(scores_dev[i][0], scores_dev[i][1], threshold)
        print("The %s of the development set of '%s' is %2.3f%%" % (args.criterion, args.legends[i] if args.legends else args.dev_files[i], (far + frr) * 50.))  # (far + frr) / 2 * 100%
        if args.eval_files:
          # apply the same threshold to the evaluation set
          far, frr = bob.measure.farfrr(scores_eval[i][0], scores_eval[i][1], threshold)
          print("The HTER of the evaluation set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.dev_files[i], (far + frr) * 50.))  # (far + frr) / 2 * 100%

    if args.mindcf:
      utils.info("Computing minDCF on the development " + ("and on the evaluation set" if args.eval_files else "set"))
      for i in range(len(scores_dev)):
        # compute the threshold on the development set
        threshold = bob.measure.min_weighted_error_rate_threshold(scores_dev[i][0], scores_dev[i][1], args.cost)
        # apply the threshold to the development set
        far, frr = bob.measure.farfrr(scores_dev[i][0], scores_dev[i][1], threshold)
        print("The minDCF of the development set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.dev_files[i], (args.cost * far + (1 - args.cost) * frr) * 100.))
        if args.eval_files:
          # compute the threshold on the evaluation set
          threshold = bob.measure.min_weighted_error_rate_threshold(scores_eval[i][0], scores_eval[i][1], args.cost)
          # apply the threshold to the evaluation set
          far, frr = bob.measure.farfrr(scores_eval[i][0], scores_eval[i][1], threshold)
          print("The minDCF of the evaluation set of '%s' is %2.3f%%" % (args.legends[i] if args.legends else args.eval_files[i], (args.cost * far + (1 - args.cost) * frr) * 100.))

    if args.cllr:
      utils.info("Computing Cllr and minCllr on the development " + ("and on the evaluation set" if args.eval_files else "set"))
      for i in range(len(scores_dev)):
        cllr = bob.measure.calibration.cllr(scores_dev[i][0], scores_dev[i][1])
        min_cllr = bob.measure.calibration.min_cllr(scores_dev[i][0], scores_dev[i][1])
        print("Calibration performance on the development set of '%s' is Cllr %1.5f and minCllr %1.5f" % (args.legends[i] if args.legends else args.dev_files[i], cllr, min_cllr))
        if args.eval_files:
          cllr = bob.measure.calibration.cllr(scores_eval[i][0], scores_eval[i][1])
          min_cllr = bob.measure.calibration.min_cllr(scores_eval[i][0], scores_eval[i][1])
          print("Calibration performance on the evaluation set of '%s' is Cllr %1.5f and minCllr %1.5f" % (args.legends[i] if args.legends else args.eval_files[i], cllr, min_cllr))

    if args.roc:
      utils.info("Computing CAR curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
      fars = [math.pow(10., i * 0.25) for i in range(-16, 0)] + [1.]
      frrs_dev = [bob.measure.roc_for_far(scores[0], scores[1], fars) for scores in scores_dev]
      if args.eval_files:
        frrs_eval = [bob.measure.roc_for_far(scores[0], scores[1], fars) for scores in scores_eval]

      utils.info("Plotting ROC curves to file '%s'" % args.roc)
      # create a multi-page PDF for the ROC curves
      pdf = PdfPages(args.roc)
      # create a separate figure for dev and eval
      pdf.savefig(_plot_roc(frrs_dev, colors, args.legends if args.legends else args.dev_files, "ROC curve for development set"))
      del frrs_dev
      if args.eval_files:
        pdf.savefig(_plot_roc(frrs_eval, colors, args.legends if args.legends else args.eval_files, "ROC curve for evaluation set"))
        del frrs_eval
      pdf.close()

    if args.det:
      utils.info("Computing DET curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
      dets_dev = [bob.measure.det(scores[0], scores[1], 1000) for scores in scores_dev]
      if args.eval_files:
        dets_eval = [bob.measure.det(scores[0], scores[1], 1000) for scores in scores_eval]

      utils.info("Plotting DET curves to file '%s'" % args.det)
      # create a multi-page PDF for the DET curves
      pdf = PdfPages(args.det)
      # create a separate figure for dev and eval
      pdf.savefig(_plot_det(dets_dev, colors, args.legends if args.legends else args.dev_files, "DET plot for development set"))
      del dets_dev
      if args.eval_files:
        pdf.savefig(_plot_det(dets_eval, colors, args.legends if args.legends else args.eval_files, "DET plot for evaluation set"))
        del dets_eval
      pdf.close()

  if args.cmc:
    utils.info("Computing CMC curves on the development " + ("and on the evaluation set" if args.eval_files else "set"))
    cmc_parser = {'4column': bob.measure.load.cmc_four_column, '5column': bob.measure.load.cmc_five_column}[args.parser]
    cmcs_dev = [cmc_parser(os.path.join(args.directory, f)) for f in args.dev_files]
    if args.eval_files:
      cmcs_eval = [cmc_parser(os.path.join(args.directory, f)) for f in args.eval_files]

    utils.info("Plotting CMC curves to file '%s'" % args.cmc)
    # create a multi-page PDF for the CMC curves
    pdf = PdfPages(args.cmc)
    # create a separate figure for dev and eval
    pdf.savefig(_plot_cmc(cmcs_dev, colors, args.legends if args.legends else args.dev_files, "CMC curve for development set"))
    if args.eval_files:
      pdf.savefig(_plot_cmc(cmcs_eval, colors, args.legends if args.legends else args.eval_files, "CMC curve for evaluation set"))
    pdf.close()
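# A minimal sketch of the multi-page PDF pattern used for the ROC, DET and CMC
# plots above; the curve data here is made up purely for illustration, and a
# non-interactive backend is selected explicitly so the script also runs
# without a display:
import matplotlib
matplotlib.use('pdf')
from matplotlib import pyplot
from matplotlib.backends.backend_pdf import PdfPages

pdf = PdfPages('curves.pdf')
figure = pyplot.figure()
pyplot.semilogx([1e-4, 1e-3, 1e-2, 1e-1, 1.], [.4, .25, .12, .05, 0.], color='b', label='example system')
pyplot.xlabel('FAR')
pyplot.ylabel('FRR')
pyplot.legend()
pyplot.title('ROC curve for development set')
pdf.savefig(figure)  # one page per figure: the dev figure first, then eval
pdf.close()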
def gmm_mstep(self, counts, force=False):
  """Performs a single M-step of the GMM training (non-parallel)."""
  import shutil

  old_machine_file = self.m_configuration.gmm_intermediate_file % self.m_args.iteration
  new_machine_file = self.m_configuration.gmm_intermediate_file % (self.m_args.iteration + 1)

  if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
    utils.info("UBM training: Skipping GMM M-Step since the file '%s' already exists" % new_machine_file)
  else:
    # get the files from the E-step
    training_list = self.m_file_selector.training_feature_list()

    # check whether there is a single file containing all statistics
    if os.path.exists(self.m_configuration.gmm_stats_file % (self.m_args.iteration, 0, len(training_list))):
      stats_file = self.m_configuration.gmm_stats_file % (self.m_args.iteration, 0, len(training_list))
      # load the single stats file
      gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_file))
    else:
      # load and accumulate the per-job stats files
      job_ids = range(self.__generate_job_array__(training_list, counts)[1])
      job_indices = [(counts * job_id, min(counts * (job_id + 1), len(training_list))) for job_id in job_ids]
      stats_files = [self.m_configuration.gmm_stats_file % (self.m_args.iteration, indices[0], indices[1]) for indices in job_indices]

      # read all stats files and accumulate them
      gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_files[0]))
      for stats_file in stats_files[1:]:
        gmm_stats += bob.machine.GMMStats(bob.io.HDF5File(stats_file))

    # load one feature file (the trainer interface requires data, though the M-step itself only uses the accumulated statistics)
    data = numpy.array(bob.io.load(str(training_list[0])))

    # load the old GMM machine
    gmm_machine = bob.machine.GMMMachine(bob.io.HDF5File(old_machine_file))
    # initialize the trainer
    gmm_trainer = bob.trainer.ML_GMMTrainer(self.m_tool.m_update_means, self.m_tool.m_update_variances, self.m_tool.m_update_weights)
    gmm_trainer.responsibilities_threshold = self.m_tool.m_responsibility_threshold
    gmm_trainer.initialize(gmm_machine, data)
    gmm_trainer.gmm_statistics = gmm_stats

    # perform the M-step
    gmm_trainer.m_step(gmm_machine, data)

    # save the updated GMM machine to file
    utils.ensure_dir(os.path.dirname(new_machine_file))
    gmm_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
    shutil.copy(new_machine_file, self.m_configuration.projector_file)

  if self.m_args.clean_intermediate and self.m_args.iteration > 0:
    old_file = self.m_configuration.gmm_intermediate_file % (self.m_args.iteration - 1)
    utils.info("Removing old intermediate directory '%s'" % os.path.dirname(old_file))
    shutil.rmtree(os.path.dirname(old_file))
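# For context, a sketch of the E-step that produces the per-job statistics
# files accumulated above; the exact trainer calls are an assumption inferred
# from the M-step code, and 'features' / 'stats_path' are placeholder names:
def gmm_estep_sketch(gmm_machine, features, stats_path):
  """Accumulates GMM sufficient statistics over one chunk of training data."""
  trainer = bob.trainer.ML_GMMTrainer()
  trainer.initialize(gmm_machine, features)
  # the E-step fills trainer.gmm_statistics with the sufficient statistics
  trainer.e_step(gmm_machine, features)
  # write the statistics for the (possibly parallel) M-step to collect
  trainer.gmm_statistics.save(bob.io.HDF5File(str(stats_path), 'w'))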