def kmeans_initialize(self, force=False):
    """Initializes the K-Means training (non-parallel).

    Builds the initial K-Means machine from a quasi-random selection of the
    training feature files and saves it as intermediate file number 0.
    Nothing is computed when ``__check_file__`` reports that file as present.
    """
    initial_file = self.m_configuration.kmeans_intermediate_file % 0

    if self.m_tool_chain.__check_file__(initial_file, force, 1000):
        utils.info("UBM training: Skipping KMeans initialization since the file '%s' already exists" % initial_file)
        return

    utils.info("UBM training: initializing kmeans")

    # stack the selected training feature files into a single array
    feature_files = self.m_file_selector.training_feature_list()
    selection = utils.quasi_random_indices(len(feature_files), self.m_args.limit_training_examples)
    data = numpy.vstack([bob.io.load(str(feature_files[i])) for i in selection])

    # machine sized by the configured number of gaussians and the data's column count
    machine = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])

    # only the trainer's initialization procedure is run here
    trainer = bob.trainer.KMeansTrainer()
    trainer.initialize(machine, data)

    utils.ensure_dir(os.path.dirname(initial_file))
    machine.save(bob.io.HDF5File(initial_file, 'w'))
    utils.info("UBM training: saved initial KMeans machine to '%s'" % initial_file)
def gmm(data):
    """Initializes the GMM with the result of the K-Means algorithm and trains it (non-parallel).

    This might require a lot of memory.

    The K-Means machine is read from ``KMeans_HDF5``; the trained GMM is
    written to ``GMM_HDF5``.  ``data.shape[1]`` is used as the input
    dimension of the GMM machine.
    """
    output_file = GMM_HDF5
    # single-argument parenthesised print: same output with the Python 2
    # print statement and the Python 3 print function
    print("UBM Training - Step 2: Initializing GMM....")

    # load KMeans machine
    kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(KMeans_HDF5))

    # Create initial GMM Machine
    gmm_machine = bob.machine.GMMMachine(gaussians, data.shape[1])
    variances, weights = kmeans_machine.get_variances_and_weights_for_each_cluster(data)

    # Initializes the GMM from the K-Means clusters
    gmm_machine.means = kmeans_machine.means
    gmm_machine.variances = variances
    gmm_machine.weights = weights
    gmm_machine.set_variance_thresholds(variance_threshold)

    # Creates the GMMTrainer and trains the GMM
    gmm_trainer = bob.trainer.ML_GMMTrainer(True, True, True)
    gmm_trainer.max_iterations = max_iterations
    gmm_trainer.rng = bob.core.random.mt19937(INIT_SEED)
    gmm_trainer.train(gmm_machine, data)

    utils.ensure_dir(os.path.dirname(output_file))
    gmm_machine.save(bob.io.HDF5File(output_file, 'w'))
    print("UBM Training - Step 2: Wrote GMM file '%s'" % output_file)
def extract_features(self, extractor, preprocessor, indices=None, force=False):
    """Extracts the features from the preprocessed data using the given extractor.

    extractor: loaded from the selector's extractor file, then called as
        ``extractor(data, data_file)`` and used to save each feature.
    preprocessor: used to read the preprocessed data files.
    indices: optional pair ``(start, stop)`` limiting the files to
        ``range(start, stop)``; all files are handled when ``None``.
    force: passed on to ``__check_file__`` for every feature file.
    """
    extractor.load(str(self.m_file_selector.extractor_file))
    data_files = self.m_file_selector.preprocessed_data_list()
    feature_files = self.m_file_selector.feature_list()

    # select a subset of indices to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Extraction: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(data_files))

    utils.ensure_dir(self.m_file_selector.features_directory)
    utils.info(
        "- Extraction: extracting %d features from directory '%s' to directory '%s'"
        % (len(index_range), self.m_file_selector.preprocessed_directory,
           self.m_file_selector.features_directory))

    for i in index_range:
        data_file = data_files[i]
        feature_file = feature_files[i]

        # files for which __check_file__ returns a true value are left alone
        if self.__check_file__(feature_file, force):
            continue

        # load data
        data = preprocessor.read_data(str(data_file))
        # extract feature
        feature = extractor(data, data_file)
        # Save feature
        utils.ensure_dir(os.path.dirname(feature_file))
        extractor.save_feature(feature, str(feature_file))
def project_features(self, tool, extractor, indices=None, force=False):
    """Projects the features for all files of the database.

    tool: provides ``performs_projection``, ``load_projector``, ``project``
        and ``save_feature``; nothing is done when it performs no projection.
    extractor: used to read the extracted feature files.
    indices: optional pair ``(start, stop)`` limiting the files to
        ``range(start, stop)``; all files are handled when ``None``.
    force: passed on to ``__check_file__`` for every projected file.
    """
    if not tool.performs_projection:
        return

    # load the projector file
    tool.load_projector(str(self.m_file_selector.projector_file))

    feature_files = self.m_file_selector.feature_list()
    projected_files = self.m_file_selector.projected_list()

    # select a subset of indices to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Projection: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(feature_files))

    utils.ensure_dir(self.m_file_selector.projected_directory)
    utils.info(
        "- Projection: projecting %d features from directory '%s' to directory '%s'"
        % (len(index_range), self.m_file_selector.features_directory,
           self.m_file_selector.projected_directory))

    for i in index_range:
        feature_file = feature_files[i]
        projected_file = projected_files[i]

        # files for which __check_file__ returns a true value are left alone
        if self.__check_file__(projected_file, force):
            continue

        # load feature
        feature = extractor.read_feature(str(feature_file))
        # project feature
        projected = tool.project(feature)
        # write it
        utils.ensure_dir(os.path.dirname(projected_file))
        tool.save_feature(projected, str(projected_file))
def train_projector(self, tool, extractor, force=False):
    """Train the feature projector with the extracted features of the world group.

    Does nothing when the tool requires no projector training, and only
    logs a message when ``__check_file__`` reports the projector file as
    present.
    """
    if not tool.requires_projector_training:
        return

    target = self.m_file_selector.projector_file
    if self.__check_file__(target, force, 1000):
        utils.info("- Projection: projector '%s' already exists." % target)
        return

    utils.ensure_dir(os.path.dirname(target))

    # collect the training features, either grouped by client or as a flat list
    if tool.split_training_features_by_client:
        files = self.m_file_selector.training_list('features', 'train_projector', arrange_by_client=True)
        features = self.__read_features_by_client__(files, extractor)
        message = "- Projection: training projector '%s' using %d identities: "
    else:
        files = self.m_file_selector.training_list('features', 'train_projector')
        features = self.__read_features__(files, extractor)
        message = "- Projection: training projector '%s' using %d training files: "
    utils.info(message % (target, len(files)))

    # perform training
    tool.train_projector(features, str(target))
def kmeans_estep(self, indices, force=False):
    """Performs a single E-step of the K-Means algorithm (parallel)

    ``indices`` is a ``(start, stop)`` tuple selecting the training feature
    files ``range(start, stop)``.  The statistics of the current iteration
    for that range are written to one HDF5 stats file.
    """
    iteration = self.m_args.iteration
    first_index, last_index = indices[0], indices[1]
    stats_path = self.m_configuration.kmeans_stats_file % (iteration, first_index, last_index)

    if self.m_tool_chain.__check_file__(stats_path, force, 1000):
        utils.info("UBM training: Skipping KMeans E-Step since the file '%s' already exists" % stats_path)
        return

    feature_files = self.m_file_selector.training_feature_list()
    machine = bob.machine.KMeansMachine(
        bob.io.HDF5File(self.m_configuration.kmeans_intermediate_file % self.m_args.iteration))

    utils.info("UBM training: KMeans E-Step from range(%d, %d)" % indices)

    # read data
    loaded = [bob.io.load(str(feature_files[i])) for i in range(first_index, last_index)]
    data = numpy.vstack(loaded)

    trainer = bob.trainer.KMeansTrainer()
    # Temporary Kmeans machine required for trainer initialization
    scratch_machine = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])
    trainer.initialize(scratch_machine, data)

    # Performs the E-step
    trainer.e_step(machine, data)

    # write results to file; the distance is stored multiplied by the size of the index range
    distance = numpy.array([trainer.average_min_distance])
    count = numpy.array([last_index - first_index], dtype=numpy.float64)

    utils.ensure_dir(os.path.dirname(stats_path))
    hdf5 = bob.io.HDF5File(stats_path, 'w')
    hdf5.set('zeros', trainer.zeroeth_order_statistics)
    hdf5.set('first', trainer.first_order_statistics)
    hdf5.set('dist', distance * count)
    hdf5.set('nsamples', count)
    utils.info("UBM training: Wrote Stats file '%s'" % stats_path)
def train_enroller(self, tool, extractor, force=False):
    """Trains the model enroller using the extracted or projected features, depending on your setup of the base class Tool.

    Does nothing when the tool requires no enroller training, and only logs
    a message when ``__check_file__`` reports the enroller file as present.
    """
    # the tool itself reads projected features; otherwise the extractor reads the features
    if tool.use_projected_features_for_enrollment:
        reader = tool
    else:
        reader = extractor

    if not tool.requires_enroller_training:
        return

    target = self.m_file_selector.enroller_file
    if self.__check_file__(target, force, 1000):
        utils.info("- Enrollment: enroller '%s' already exists." % target)
        return

    utils.ensure_dir(os.path.dirname(target))

    # first, load the projector
    tool.load_projector(str(self.m_file_selector.projector_file))

    # training files, arranged by client
    if tool.use_projected_features_for_enrollment:
        directory_type = 'projected'
    else:
        directory_type = 'features'
    files_by_client = self.m_file_selector.training_list(
        directory_type, 'train_enroller', arrange_by_client=True)
    features_by_client = self.__read_features_by_client__(files_by_client, reader)

    # perform training
    utils.info("- Enrollment: training enroller '%s' using %d identities: " % (target, len(features_by_client)))
    tool.train_enroller(features_by_client, str(target))
def gmm_estep(self, indices, force=False):
    """Performs a single E-step of the GMM training (parallel).

    ``indices`` is a ``(start, stop)`` tuple selecting the training feature
    files ``range(start, stop)``.  The GMM statistics of the current
    iteration for that range are saved to one HDF5 stats file.
    """
    first_index, last_index = indices[0], indices[1]
    stats_path = self.m_configuration.gmm_stats_file % (self.m_args.iteration, first_index, last_index)

    if self.m_tool_chain.__check_file__(stats_path, force, 1000):
        utils.info("UBM training: Skipping GMM E-Step since the file '%s' already exists" % stats_path)
        return

    feature_files = self.m_file_selector.training_feature_list()
    machine = bob.machine.GMMMachine(
        bob.io.HDF5File(self.m_configuration.gmm_intermediate_file % self.m_args.iteration))

    utils.info("UBM training: GMM E-Step from range(%d, %d)" % indices)

    # read data
    loaded = [bob.io.load(str(feature_files[i])) for i in range(first_index, last_index)]
    data = numpy.vstack(loaded)

    tool = self.m_tool
    trainer = bob.trainer.ML_GMMTrainer(tool.m_update_means, tool.m_update_variances, tool.m_update_weights)
    trainer.responsibilities_threshold = tool.m_responsibility_threshold
    trainer.initialize(machine, data)

    # Calls the E-step and extracts the GMM statistics
    trainer.e_step(machine, data)
    statistics = trainer.gmm_statistics

    # Saves the GMM statistics to the file
    utils.ensure_dir(os.path.dirname(stats_path))
    statistics.save(bob.io.HDF5File(stats_path, 'w'))
    utils.info("UBM training: Wrote GMM stats '%s'" % (stats_path))
def train_extractor(self, extractor, preprocessor, force=False):
    """Trains the feature extractor using preprocessed data of the 'world' set, if the feature extractor requires training.

    Only logs a message when ``__check_file__`` reports the extractor file
    as present.
    """
    if not extractor.requires_training:
        return

    target = self.m_file_selector.extractor_file
    if self.__check_file__(target, force, 1000):
        utils.info("- Extraction: extractor '%s' already exists." % target)
        return

    utils.ensure_dir(os.path.dirname(target))

    # read training files, either grouped by client or as a flat list
    if extractor.split_training_data_by_client:
        files = self.m_file_selector.training_list('preprocessed', 'train_extractor', arrange_by_client=True)
        training_data = self.__read_data_by_client__(files, preprocessor)
        message = "- Extraction: training extractor '%s' using %d identities: "
    else:
        files = self.m_file_selector.training_list('preprocessed', 'train_extractor')
        training_data = self.__read_data__(files, preprocessor)
        message = "- Extraction: training extractor '%s' using %d training files: "
    utils.info(message % (target, len(files)))

    # train model
    extractor.train(training_data, target, files)
def gmm_initialize(self, force=False):
    """Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel).

    This might require a lot of memory.

    The GMM means are copied from the stored K-Means machine; variances and
    weights are computed per cluster on a quasi-random selection of the
    training features.  The result is saved as GMM intermediate file 0.
    """
    initial_file = self.m_configuration.gmm_intermediate_file % 0

    if self.m_tool_chain.__check_file__(initial_file, force, 800):
        utils.info("UBM Training: Skipping GMM initialization since '%s' already exists" % initial_file)
        return

    feature_files = self.m_file_selector.training_feature_list()
    utils.info("UBM Training: Initializing GMM")

    # load KMeans machine
    kmeans = bob.machine.KMeansMachine(bob.io.HDF5File(self.m_configuration.kmeans_file))

    # read features
    selection = utils.quasi_random_indices(len(feature_files), self.m_args.limit_training_examples)
    data = numpy.vstack([bob.io.load(str(feature_files[i])) for i in selection])

    # Create initial GMM Machine
    gmm = bob.machine.GMMMachine(self.m_tool.m_gaussians, data.shape[1])
    cluster_stats = kmeans.get_variances_and_weights_for_each_cluster(data)
    [cluster_variances, cluster_weights] = cluster_stats

    # Initializes the GMM
    gmm.means = kmeans.means
    gmm.variances = cluster_variances
    gmm.weights = cluster_weights
    gmm.set_variance_thresholds(self.m_tool.m_variance_threshold)

    utils.ensure_dir(os.path.dirname(initial_file))
    gmm.save(bob.io.HDF5File(os.path.join(initial_file), 'w'))
    utils.info("UBM Training: Wrote GMM file '%s'" % initial_file)
def kmeans_mstep(self, counts, force=False):
    """Performs a single M-step of the K-Means algorithm (non-parallel)

    counts: number of training files handled by each E-step job; used to
        reconstruct the names of the per-job stats files.
    force: passed on to ``__check_file__`` for the new machine file.

    The statistics written by the E-step(s) of the current iteration are
    read (and summed when there are several files), the M-step is run on the
    machine of the current iteration, and the result is saved as the machine
    of the next iteration and copied to the final K-Means file.
    """
    # function-scope import, so this block does not rely on a module-level one
    import shutil

    iteration = self.m_args.iteration
    old_machine_file = self.m_configuration.kmeans_intermediate_file % iteration
    new_machine_file = self.m_configuration.kmeans_intermediate_file % (iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()
        n_files = len(training_list)

        # try if there is one file containing all data
        single_stats_file = self.m_configuration.kmeans_stats_file % (iteration, 0, n_files)
        if os.path.exists(single_stats_file):
            # load stats file
            zeroeth, first, nsamples, dist = self.read_stats(single_stats_file)
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [(counts * job_id, min(counts * (job_id + 1), n_files)) for job_id in job_ids]
            stats_files = [self.m_configuration.kmeans_stats_file % (iteration, start, stop)
                           for start, stop in job_indices]

            # read all stats files and accumulate them into the first one
            zeroeth, first, nsamples, dist = self.read_stats(stats_files[0])
            for stats_file in stats_files[1:]:
                zeroeth_, first_, nsamples_, dist_ = self.read_stats(stats_file)
                zeroeth += zeroeth_
                first += first_
                nsamples += nsamples_
                dist += dist_

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # Creates the KMeansTrainer
        kmeans_trainer = bob.trainer.KMeansTrainer()
        # Creates the KMeansMachine
        kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(old_machine_file))
        kmeans_trainer.initialize(kmeans_machine, data)

        # hand the accumulated statistics to the trainer
        kmeans_trainer.zeroeth_order_statistics = zeroeth
        kmeans_trainer.first_order_statistics = first
        kmeans_trainer.average_min_distance = dist

        # Performs the M-step
        kmeans_trainer.m_step(kmeans_machine, data)  # data is not used in M-step
        utils.info("UBM training: Performed M step %d with result %f" % (iteration, dist / nsamples))

        # Save the K-Means model
        utils.ensure_dir(os.path.dirname(new_machine_file))
        kmeans_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        shutil.copy(new_machine_file, self.m_configuration.kmeans_file)
        utils.info("UBM training: Wrote new KMeans machine '%s'" % new_machine_file)

    # remove the directory holding the intermediate file of the previous iteration
    if self.m_args.clean_intermediate and iteration > 0:
        old_file = self.m_configuration.kmeans_intermediate_file % (iteration - 1)
        utils.info("Removing old intermediate directory '%s'" % os.path.dirname(old_file))
        shutil.rmtree(os.path.dirname(old_file))
def gmm_mstep(self, counts, force=False):
    """Performs a single M-step of the GMM training (non-parallel)

    counts: number of training files handled by each E-step job; used to
        reconstruct the names of the per-job stats files.
    force: passed on to ``__check_file__`` for the new machine file.

    The GMM statistics written by the E-step(s) of the current iteration are
    read (and summed when there are several files), the M-step is run on the
    machine of the current iteration, and the result is saved as the machine
    of the next iteration and copied to the projector file.
    """
    # imported at the top of the function: the name is local to the function
    # and is needed by both the copy and the cleanup below
    import shutil

    iteration = self.m_args.iteration
    old_machine_file = self.m_configuration.gmm_intermediate_file % iteration
    new_machine_file = self.m_configuration.gmm_intermediate_file % (iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping GMM M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()
        n_files = len(training_list)

        # try if there is one file containing all data
        single_stats_file = self.m_configuration.gmm_stats_file % (iteration, 0, n_files)
        if os.path.exists(single_stats_file):
            # load stats file
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(single_stats_file))
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [(counts * job_id, min(counts * (job_id + 1), n_files)) for job_id in job_ids]
            stats_files = [self.m_configuration.gmm_stats_file % (iteration, start, stop)
                           for start, stop in job_indices]

            # read all stats files and accumulate them into the first one
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_files[0]))
            for stats_file in stats_files[1:]:
                gmm_stats += bob.machine.GMMStats(bob.io.HDF5File(stats_file))

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # load the old gmm machine
        gmm_machine = bob.machine.GMMMachine(bob.io.HDF5File(old_machine_file))

        # initialize the trainer
        gmm_trainer = bob.trainer.ML_GMMTrainer(
            self.m_tool.m_update_means, self.m_tool.m_update_variances, self.m_tool.m_update_weights)
        gmm_trainer.responsibilities_threshold = self.m_tool.m_responsibility_threshold
        gmm_trainer.initialize(gmm_machine, data)
        gmm_trainer.gmm_statistics = gmm_stats

        # Calls M-step
        gmm_trainer.m_step(gmm_machine, data)

        # Saves the new GMM machine to the file
        utils.ensure_dir(os.path.dirname(new_machine_file))
        gmm_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        shutil.copy(new_machine_file, self.m_configuration.projector_file)

    # remove the directory holding the intermediate file of the previous iteration
    if self.m_args.clean_intermediate and iteration > 0:
        old_file = self.m_configuration.gmm_intermediate_file % (iteration - 1)
        utils.info("Removing old intermediate directory '%s'" % os.path.dirname(old_file))
        shutil.rmtree(os.path.dirname(old_file))
def save_feature(self, feature, feature_file):
    """Saves the given *extracted* feature to a file with the given name.

    In this base class implementation:

    - If the given feature has a 'save' attribute, it calls
      feature.save(bob.io.HDF5File(feature_file, 'w')).
    - Otherwise, it uses bob.io.save to do that.

    The directory of feature_file is created first via utils.ensure_dir, so
    feature_file is handled as a file name here.

    If you have a different format, please overwrite this function.
    """
    target_dir = os.path.dirname(feature_file)
    utils.ensure_dir(target_dir)

    if not hasattr(feature, 'save'):
        bob.io.save(feature, feature_file)
        return

    # this is some class that supports saving itself
    hdf5 = bob.io.HDF5File(feature_file, "w")
    feature.save(hdf5)
def kmeans_estep(self, indices, force=False):
    """Performs a single E-step of the K-Means algorithm (parallel)

    ``indices`` is a ``(start, stop)`` tuple selecting the training feature
    files ``range(start, stop)``.  The statistics of the current iteration
    for that range are written to one HDF5 stats file.
    """
    iteration = self.m_args.iteration
    first_index, last_index = indices[0], indices[1]
    stats_path = self.m_configuration.kmeans_stats_file % (iteration, first_index, last_index)

    if self.m_tool_chain.__check_file__(stats_path, force, 1000):
        utils.info("UBM training: Skipping KMeans E-Step since the file '%s' already exists" % stats_path)
        return

    feature_files = self.m_file_selector.training_feature_list()
    machine = bob.machine.KMeansMachine(
        bob.io.HDF5File(self.m_configuration.kmeans_intermediate_file % self.m_args.iteration))

    utils.info("UBM training: KMeans E-Step from range(%d, %d)" % indices)

    # read data
    loaded = [bob.io.load(str(feature_files[i])) for i in range(first_index, last_index)]
    data = numpy.vstack(loaded)

    trainer = bob.trainer.KMeansTrainer()
    # Temporary Kmeans machine required for trainer initialization
    scratch_machine = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])
    trainer.initialize(scratch_machine, data)

    # Performs the E-step
    trainer.e_step(machine, data)

    # write results to file; the distance is stored multiplied by the size of the index range
    distance = numpy.array([trainer.average_min_distance])
    count = numpy.array([last_index - first_index], dtype=numpy.float64)

    utils.ensure_dir(os.path.dirname(stats_path))
    hdf5 = bob.io.HDF5File(stats_path, 'w')
    hdf5.set('zeros', trainer.zeroeth_order_statistics)
    hdf5.set('first', trainer.first_order_statistics)
    hdf5.set('dist', distance * count)
    hdf5.set('nsamples', count)
    utils.info("UBM training: Wrote Stats file '%s'" % stats_path)
def kmeans(data):
    """Runs the K-Means training and saves the resulting machine.

    The machine is created with ``gaussians`` means and ``data.shape[1]``
    inputs, trained on ``data`` and written to ``KMeans_HDF5``.
    """
    # single-argument parenthesised print: same output with the Python 2
    # print statement and the Python 3 print function
    print("UBM Training - Step 1: initializing kmeans")
    output_file = KMeans_HDF5

    # Perform KMeans initialization
    kmeans_machine = bob.machine.KMeansMachine(gaussians, data.shape[1])

    # Creates the KMeansTrainer and trains the Kmeans
    kmeans_trainer = bob.trainer.KMeansTrainer()
    kmeans_trainer.initialization_method = kmeans_trainer.initialization_method_type.RANDOM_NO_DUPLICATE
    kmeans_trainer.max_iterations = max_iterations
    # NOTE(review): the convergence threshold is taken from variance_threshold - confirm this is intended
    kmeans_trainer.convergence_threshold = variance_threshold
    kmeans_trainer.rng = bob.core.random.mt19937(INIT_SEED)
    kmeans_trainer.train(kmeans_machine, data)

    utils.ensure_dir(os.path.dirname(output_file))
    kmeans_machine.save(bob.io.HDF5File(output_file, 'w'))
    print("UBM Training - Step 1: Saved KMeans machine to '%s'" % output_file)
def gmm_initialize(self, force=False):
    """Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel).

    This might require a lot of memory.

    The GMM means are copied from the stored K-Means machine; variances and
    weights are computed per cluster on a quasi-random selection of the
    training features.  The result is saved as GMM intermediate file 0.
    """
    initial_file = self.m_configuration.gmm_intermediate_file % 0

    if self.m_tool_chain.__check_file__(initial_file, force, 800):
        utils.info("UBM Training: Skipping GMM initialization since '%s' already exists" % initial_file)
        return

    feature_files = self.m_file_selector.training_feature_list()
    utils.info("UBM Training: Initializing GMM")

    # load KMeans machine
    kmeans = bob.machine.KMeansMachine(bob.io.HDF5File(self.m_configuration.kmeans_file))

    # read features
    selection = utils.quasi_random_indices(len(feature_files), self.m_args.limit_training_examples)
    data = numpy.vstack([bob.io.load(str(feature_files[i])) for i in selection])

    # Create initial GMM Machine
    gmm = bob.machine.GMMMachine(self.m_tool.m_gaussians, data.shape[1])
    cluster_stats = kmeans.get_variances_and_weights_for_each_cluster(data)
    [cluster_variances, cluster_weights] = cluster_stats

    # Initializes the GMM
    gmm.means = kmeans.means
    gmm.variances = cluster_variances
    gmm.weights = cluster_weights
    gmm.set_variance_thresholds(self.m_tool.m_variance_threshold)

    utils.ensure_dir(os.path.dirname(initial_file))
    gmm.save(bob.io.HDF5File(os.path.join(initial_file), 'w'))
    utils.info("UBM Training: Wrote GMM file '%s'" % initial_file)
def kmeans_initialize(self, force=False):
    """Initializes the K-Means training (non-parallel).

    Builds the initial K-Means machine from a quasi-random selection of the
    training feature files and saves it as intermediate file number 0.
    Nothing is computed when ``__check_file__`` reports that file as present.
    """
    initial_file = self.m_configuration.kmeans_intermediate_file % 0

    if self.m_tool_chain.__check_file__(initial_file, force, 1000):
        utils.info("UBM training: Skipping KMeans initialization since the file '%s' already exists" % initial_file)
        return

    utils.info("UBM training: initializing kmeans")

    # stack the selected training feature files into a single array
    feature_files = self.m_file_selector.training_feature_list()
    selection = utils.quasi_random_indices(len(feature_files), self.m_args.limit_training_examples)
    data = numpy.vstack([bob.io.load(str(feature_files[i])) for i in selection])

    # machine sized by the configured number of gaussians and the data's column count
    machine = bob.machine.KMeansMachine(self.m_tool.m_gaussians, data.shape[1])

    # only the trainer's initialization procedure is run here
    trainer = bob.trainer.KMeansTrainer()
    trainer.initialize(machine, data)

    utils.ensure_dir(os.path.dirname(initial_file))
    machine.save(bob.io.HDF5File(initial_file, 'w'))
    utils.info("UBM training: saved initial KMeans machine to '%s'" % initial_file)
def feature_normalization(self, indices, force=False):
    """Normalizes the list of features to have zero mean and unit variance (parallel)

    indices: ``(start, stop)`` tuple; the files ``range(start, stop)`` of the
        training feature list are handled.
    force: passed on to ``__check_file__`` for every output file.

    For each file the feature is loaded, ``__normalize_std_array__`` of the
    tool is applied, and the returned mean and std are written to an HDF5
    file under the keys 'mean' and 'std'.
    """
    normalized_list = self.m_file_selector.training_feature_list()
    # NOTE(review): ``training_list`` was read below without ever being
    # defined in this function.  It is bound to the same selector list here;
    # confirm whether a separate list of input files was intended.
    training_list = normalized_list

    utils.info("UBM training: normalizing features from range(%d, %d)" % indices)

    # iterate through the files and normalize the features
    for index in range(indices[0], indices[1]):
        # the feature is loaded before the output file is checked, as before
        feature = bob.io.load(str(training_list[index]))
        mean, std = self.m_tool.__normalize_std_array__(feature)

        output_file = normalized_list[index]
        if self.m_tool_chain.__check_file__(output_file, force):
            utils.debug("Skipping file '%s'" % output_file)
        else:
            utils.ensure_dir(os.path.dirname(output_file))
            f = bob.io.HDF5File(str(output_file), 'w')
            f.set('mean', mean)
            f.set('std', std)
            utils.debug("Saved normalized feature %s" % str(output_file))
def train_ivector(train_features, input_ubm_file):
    """Trains the IVector machine on the GMM statistics of the training files.

    train_features: passed to ``load_gmm_stats_list`` together with the UBM file.
    input_ubm_file: HDF5 file the UBM (a GMM machine) is loaded from.

    The trained machine is saved to 'model/enroller_ivector.hdf5'.
    """
    # load UBM
    ubm = bob.machine.GMMMachine(bob.io.HDF5File(input_ubm_file))

    # load GMM stats from training files
    gmm_stats = load_gmm_stats_list(input_ubm_file, train_features)

    # Training IVector enroller
    output_file = 'model/enroller_ivector.hdf5'

    # single-argument parenthesised print: same output with the Python 2
    # print statement and the Python 3 print function
    print("IVector training")

    # Perform IVector initialization
    ivector_machine = bob.machine.IVectorMachine(ubm, subspace_dimension_of_t)
    ivector_machine.variance_threshold = variance_threshold

    # Creates the IVectorTrainer and trains the ivector machine.
    # ``max_iterations`` is the setting the K-Means and GMM trainers use;
    # the previous ``max_iterationss`` was a misspelling of it.
    ivector_trainer = bob.trainer.IVectorTrainer(
        update_sigma=True,
        convergence_threshold=variance_threshold,
        max_iterations=max_iterations)
    ivector_trainer.train(ivector_machine, gmm_stats)

    utils.ensure_dir(os.path.dirname(output_file))
    ivector_machine.save(bob.io.HDF5File(output_file, 'w'))
    print("IVector training: saved enroller's IVector machine base to '%s'" % output_file)
def preprocess_data(self, preprocessor, indices=None, force=False):
    """Preprocesses the original data with the given preprocessor.

    preprocessor: reads the original data, is called as
        ``preprocessor(data, annotations)`` and saves the result.
    indices: optional pair ``(start, stop)`` limiting the files to
        ``range(start, stop)``; all files are handled when ``None``.
    force: passed on to ``__check_file__`` for every preprocessed file.
    """
    # get the file lists
    data_files = self.m_file_selector.original_data_list()
    preprocessed_data_files = self.m_file_selector.preprocessed_data_list()

    # select a subset of keys to iterate
    if indices is not None:
        index_range = range(indices[0], indices[1])
        utils.info("- Preprocessing: splitting of index range %s" % str(indices))
    else:
        index_range = range(len(data_files))

    utils.ensure_dir(self.m_file_selector.preprocessed_directory)
    utils.info(
        "- Preprocessing: processing %d data files from directory '%s' to directory '%s'"
        % (len(index_range), self.m_file_selector.m_database.original_directory,
           self.m_file_selector.preprocessed_directory))

    # read annotation files
    annotation_list = self.m_file_selector.annotation_list()

    for i in index_range:
        preprocessed_data_file = preprocessed_data_files[i]

        # files for which __check_file__ returns a true value are left alone
        if self.__check_file__(preprocessed_data_file, force):
            continue

        data = preprocessor.read_original_data(str(data_files[i]))

        # get the annotations; might be None
        annotations = self.m_file_selector.get_annotations(annotation_list[i])

        # call the preprocessor
        preprocessed_data = preprocessor(data, annotations)

        utils.ensure_dir(os.path.dirname(preprocessed_data_file))
        preprocessor.save_data(preprocessed_data, str(preprocessed_data_file))
def save_feature(self, feature, feature_file):
    """Save the descriptor part of the extracted SIFT features.

    Each entry of ``feature`` starts with the 4 keypoint values, followed by
    the 128 descriptor values.  Only the descriptors are written to
    ``feature_file`` (with ``bob.io.save``), one row per entry; the
    keypoints are not stored.
    """
    utils.ensure_dir(os.path.dirname(feature_file))

    l_key = 4  # Length of SIFT keypoint.
    l_desc = 128  # Length of the SIFT descriptors.

    # For this implementation, only descriptors are needed, so no keypoint
    # array is built; the dtype is taken from the first entry.
    sift_descriptor = numpy.empty(shape=(len(feature), l_desc), dtype=feature[0].dtype)
    for k, val in enumerate(feature):
        sift_descriptor[k] = val[l_key:]

    bob.io.save(sift_descriptor, feature_file)
def feature_normalization(self, indices, force=False):
    """Normalizes the list of features to have zero mean and unit variance (parallel)

    indices: ``(start, stop)`` tuple; the files ``range(start, stop)`` of the
        training feature list are handled.
    force: passed on to ``__check_file__`` for every output file.

    For each file the feature is loaded, ``__normalize_std_array__`` of the
    tool is applied, and the returned mean and std are written to an HDF5
    file under the keys 'mean' and 'std'.
    """
    normalized_list = self.m_file_selector.training_feature_list()
    # NOTE(review): ``training_list`` was read below without ever being
    # defined in this function.  It is bound to the same selector list here;
    # confirm whether a separate list of input files was intended.
    training_list = normalized_list

    utils.info("UBM training: normalizing features from range(%d, %d)" % indices)

    # iterate through the files and normalize the features
    for index in range(indices[0], indices[1]):
        # the feature is loaded before the output file is checked, as before
        feature = bob.io.load(str(training_list[index]))
        mean, std = self.m_tool.__normalize_std_array__(feature)

        output_file = normalized_list[index]
        if self.m_tool_chain.__check_file__(output_file, force):
            utils.debug("Skipping file '%s'" % output_file)
        else:
            utils.ensure_dir(os.path.dirname(output_file))
            f = bob.io.HDF5File(str(output_file), 'w')
            f.set('mean', mean)
            f.set('std', std)
            utils.debug("Saved normalized feature %s" % str(output_file))
def gmm_estep(self, indices, force=False):
    """Performs a single E-step of the GMM training (parallel).

    ``indices`` is a ``(start, stop)`` tuple selecting the training feature
    files ``range(start, stop)``.  The GMM statistics of the current
    iteration for that range are saved to one HDF5 stats file.
    """
    first_index, last_index = indices[0], indices[1]
    stats_path = self.m_configuration.gmm_stats_file % (self.m_args.iteration, first_index, last_index)

    if self.m_tool_chain.__check_file__(stats_path, force, 1000):
        utils.info("UBM training: Skipping GMM E-Step since the file '%s' already exists" % stats_path)
        return

    feature_files = self.m_file_selector.training_feature_list()
    machine = bob.machine.GMMMachine(
        bob.io.HDF5File(self.m_configuration.gmm_intermediate_file % self.m_args.iteration))

    utils.info("UBM training: GMM E-Step from range(%d, %d)" % indices)

    # read data
    loaded = [bob.io.load(str(feature_files[i])) for i in range(first_index, last_index)]
    data = numpy.vstack(loaded)

    tool = self.m_tool
    trainer = bob.trainer.ML_GMMTrainer(tool.m_update_means, tool.m_update_variances, tool.m_update_weights)
    trainer.responsibilities_threshold = tool.m_responsibility_threshold
    trainer.initialize(machine, data)

    # Calls the E-step and extracts the GMM statistics
    trainer.e_step(machine, data)
    statistics = trainer.gmm_statistics

    # Saves the GMM statistics to the file
    utils.ensure_dir(os.path.dirname(stats_path))
    statistics.save(bob.io.HDF5File(stats_path, 'w'))
    utils.info("UBM training: Wrote GMM stats '%s'" % (stats_path))
def train_ivector(train_features, input_ubm_file):
    """Trains the IVector machine on the GMM statistics of the training files.

    train_features: passed to ``load_gmm_stats_list`` together with the UBM file.
    input_ubm_file: HDF5 file the UBM (a GMM machine) is loaded from.

    The trained machine is saved to 'model/enroller_ivector.hdf5'.
    """
    # load UBM
    ubm = bob.machine.GMMMachine(bob.io.HDF5File(input_ubm_file))

    # load GMM stats from training files
    gmm_stats = load_gmm_stats_list(input_ubm_file, train_features)

    # Training IVector enroller
    output_file = 'model/enroller_ivector.hdf5'

    # single-argument parenthesised print: same output with the Python 2
    # print statement and the Python 3 print function
    print("IVector training")

    # Perform IVector initialization
    ivector_machine = bob.machine.IVectorMachine(ubm, subspace_dimension_of_t)
    ivector_machine.variance_threshold = variance_threshold

    # Creates the IVectorTrainer and trains the ivector machine.
    # ``max_iterations`` is the setting the K-Means and GMM trainers use;
    # the previous ``max_iterationss`` was a misspelling of it.
    ivector_trainer = bob.trainer.IVectorTrainer(
        update_sigma=True,
        convergence_threshold=variance_threshold,
        max_iterations=max_iterations)
    ivector_trainer.train(ivector_machine, gmm_stats)

    utils.ensure_dir(os.path.dirname(output_file))
    ivector_machine.save(bob.io.HDF5File(output_file, 'w'))
    print("IVector training: saved enroller's IVector machine base to '%s'" % output_file)
def zt_norm_file(self, model_id, group):
    """Returns the score text file after ZT-normalization for the given model id of the given group.

    The file lives in the sub-directory ``group`` of the second score
    directory; that sub-directory is created via ``utils.ensure_dir``.
    """
    group_dir = os.path.join(self.score_directories[1], group)
    utils.ensure_dir(group_dir)
    file_name = str(model_id) + ".txt"
    return os.path.join(group_dir, file_name)
def no_norm_result_file(self, group):
    """Returns the resulting score text file for the given group.

    The file lives in the first score directory, which is created via
    ``utils.ensure_dir``.
    """
    score_dir = self.score_directories[0]
    utils.ensure_dir(score_dir)
    file_name = "scores-" + group
    return os.path.join(score_dir, file_name)
def enroll_models(self, tool, extractor, compute_zt_norm, indices=None, groups=['dev', 'eval'], types=['N', 'T'], force=False):
    """Enroll the models for 'dev' and 'eval' groups, for both models and T-Norm-models.

    This function uses the extracted or projected features to compute the
    models, depending on your setup of the base class Tool.

    Keyword parameters:

    tool
      The tool that loads projector and enroller, enrolls the models and
      saves them; it also reads the features when
      ``tool.use_projected_features_for_enrollment`` is set.

    extractor
      Reads the features when projected features are NOT used for enrollment.

    compute_zt_norm
      T-Norm models are enrolled only when this is true (and 'T' is in ``types``).

    indices
      Optional pair ``(begin, end)``; when given, only the model ids in that
      slice of each id list are enrolled.

    groups
      The groups to enroll models for (only iterated, never modified).

    types
      'N' selects the regular models, 'T' the T-Norm models (only read).

    force
      Forwarded to ``self.__check_file__``; a model is (re-)enrolled only
      when that check fails for its model file.
    """
    # read the projector file, if needed
    tool.load_projector(self.m_file_selector.projector_file)
    # read the model enrollment file
    tool.load_enroller(self.m_file_selector.enroller_file)

    # which tool to use to read the features, and which kind of feature
    # files to ask for; both are loop-invariant, so they are resolved once
    use_projected = tool.use_projected_features_for_enrollment
    reader = tool if use_projected else extractor
    feature_type = 'projected' if use_projected else 'features'

    # Create Models
    if 'N' in types:
        for group in groups:
            model_ids = self.m_file_selector.model_ids(group)

            # identity test: `!= None` would be evaluated element-wise for
            # array-like index containers
            if indices is not None:
                model_ids = model_ids[indices[0]:indices[1]]
                utils.info("- Enrollment: splitting of index range %s" % str(indices))

            utils.info("- Enrollment: enrolling models of group '%s'" % group)
            for model_id in model_ids:
                # Path to the model
                model_file = self.m_file_selector.model_file(model_id, group)

                # Skip models whose file passes the check
                # (presumably __check_file__ also removes the old file when
                # required -- confirm against its implementation)
                if not self.__check_file__(model_file, force):
                    enroll_files = self.m_file_selector.enroll_files(model_id, group, feature_type)

                    # load all files into memory
                    enroll_features = [
                        reader.read_feature(str(enroll_file))
                        for enroll_file in enroll_files
                    ]

                    model = tool.enroll(enroll_features)
                    # save the model
                    utils.ensure_dir(os.path.dirname(model_file))
                    tool.save_model(model, str(model_file))

    # T-Norm-Models
    if 'T' in types and compute_zt_norm:
        for group in groups:
            t_model_ids = self.m_file_selector.t_model_ids(group)

            if indices is not None:
                t_model_ids = t_model_ids[indices[0]:indices[1]]
                utils.info("- Enrollment: splitting of index range %s" % str(indices))

            utils.info("- Enrollment: enrolling T-models of group '%s'" % group)
            for t_model_id in t_model_ids:
                # Path to the model
                t_model_file = self.m_file_selector.t_model_file(t_model_id, group)

                # Skip T-models whose file passes the check (see above)
                if not self.__check_file__(t_model_file, force):
                    t_enroll_files = self.m_file_selector.t_enroll_files(t_model_id, group, feature_type)

                    # load all files into memory
                    t_enroll_features = [
                        reader.read_feature(str(t_enroll_file))
                        for t_enroll_file in t_enroll_files
                    ]

                    t_model = tool.enroll(t_enroll_features)
                    # save model
                    utils.ensure_dir(os.path.dirname(t_model_file))
                    tool.save_model(t_model, str(t_model_file))
def c_file_for_model(self, model_id, group):
    """Returns the C-file of the given model id, as used for computing ZT normalization.

    The file is placed in the sub-directory ``group`` of
    ``self.zt_score_directories[2]`` (created on demand) and carries the
    default extension.
    """
    group_dir = os.path.join(self.zt_score_directories[2], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = str(model_id) + self.default_extension
    return os.path.join(group_dir, file_name)
def d_file(self, t_model_id, group):
    """Returns the D-file of the given T-model id, as used for computing ZT normalization.

    The file is placed in the sub-directory ``group`` of
    ``self.zt_score_directories[3]`` (created on demand) and carries the
    default extension.
    """
    group_dir = os.path.join(self.zt_score_directories[3], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = str(t_model_id) + self.default_extension
    return os.path.join(group_dir, file_name)
def d_matrix_file(self, group):
    """Returns the D-file that stores all scores for pairs of T-models and Z-probes.

    The file is called ``D`` plus the default extension and is placed in the
    sub-directory ``group`` of ``self.zt_score_directories[3]`` (created on
    demand).
    """
    group_dir = os.path.join(self.zt_score_directories[3], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = "D" + self.default_extension
    return os.path.join(group_dir, file_name)
def d_same_value_file(self, t_model_id, group):
    """Returns the D-file flagging which pairs of the given T-model id and all
    Z-probes are intrapersonal or extrapersonal.

    The file is placed in the sub-directory ``group`` of
    ``self.zt_score_directories[4]`` (created on demand) and carries the
    default extension.
    """
    group_dir = os.path.join(self.zt_score_directories[4], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = str(t_model_id) + self.default_extension
    return os.path.join(group_dir, file_name)
def d_same_value_matrix_file(self, group):
    """Returns the D-file flagging which pairs of T-models and Z-probes are
    intrapersonal or extrapersonal.

    The file is called ``D_sameValue`` plus the default extension and is
    placed in the sub-directory ``group`` of ``self.zt_score_directories[4]``
    (created on demand).
    """
    group_dir = os.path.join(self.zt_score_directories[4], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = "D_sameValue" + self.default_extension
    return os.path.join(group_dir, file_name)
def calibrated_score_file(self, group, zt_norm=False):
    """Returns the path of the calibrated score file of the given group.

    The file is located in the ZT-norm score directory
    (``self.score_directories[1]``) when ``zt_norm`` is true, and in the
    first score directory (``self.score_directories[0]``) otherwise; the
    directory is created on demand.
    """
    if zt_norm:
        directory_index = 1
    else:
        directory_index = 0
    score_dir = self.score_directories[directory_index]
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(score_dir)
    file_name = "calibrated-" + group
    return os.path.join(score_dir, file_name)
def kmeans_mstep(self, counts, force=False):
    """Performs a single M-step of the K-Means algorithm (non-parallel)

    The statistics written by the E-step jobs of the current iteration
    (``self.m_args.iteration``) are read -- from one file covering the whole
    training list if it exists, otherwise from the per-job files -- and
    summed up.  The M-step is applied to the K-Means machine of the current
    iteration, and the result is saved as the machine of the next iteration
    and copied to ``self.m_configuration.kmeans_file``.

    Keyword parameters:

    counts
      Number of training files per E-step job; used to reconstruct the index
      ranges (and hence the names) of the per-job statistics files.

    force
      Forwarded to ``__check_file__`` of the tool chain; when that check
      succeeds the M-step is skipped because the new machine already exists.
    """
    # Function-scope import: the copy and the cleanup below must not depend
    # on a module-level import being present.
    import shutil

    iteration = self.m_args.iteration
    old_machine_file = self.m_configuration.kmeans_intermediate_file % iteration
    new_machine_file = self.m_configuration.kmeans_intermediate_file % (iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()

        # try if there is one file containing all data
        single_stats_file = self.m_configuration.kmeans_stats_file % (iteration, 0, len(training_list))
        if os.path.exists(single_stats_file):
            # load stats file
            zeroeth, first, nsamples, dist = self.read_stats(single_stats_file)
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [
                (counts * job_id, min(counts * (job_id + 1), len(training_list)))
                for job_id in job_ids
            ]
            stats_files = [
                self.m_configuration.kmeans_stats_file % (iteration, indices[0], indices[1])
                for indices in job_indices
            ]

            # read all stats files and accumulate them
            zeroeth, first, nsamples, dist = self.read_stats(stats_files[0])
            for stats_file in stats_files[1:]:
                zeroeth_, first_, nsamples_, dist_ = self.read_stats(stats_file)
                zeroeth += zeroeth_
                first += first_
                nsamples += nsamples_
                dist += dist_

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # Creates the KMeansTrainer
        kmeans_trainer = bob.trainer.KMeansTrainer()
        # Creates the KMeansMachine
        kmeans_machine = bob.machine.KMeansMachine(bob.io.HDF5File(old_machine_file))
        kmeans_trainer.initialize(kmeans_machine, data)

        kmeans_trainer.zeroeth_order_statistics = zeroeth
        kmeans_trainer.first_order_statistics = first
        kmeans_trainer.average_min_distance = dist

        # Performs the M-step
        kmeans_trainer.m_step(kmeans_machine, data)  # data is not used in M-step
        utils.info("UBM training: Performed M step %d with result %f" % (iteration, dist / nsamples))

        # Save the K-Means model
        utils.ensure_dir(os.path.dirname(new_machine_file))
        kmeans_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        shutil.copy(new_machine_file, self.m_configuration.kmeans_file)
        utils.info("UBM training: Wrote new KMeans machine '%s'" % new_machine_file)

    if self.m_args.clean_intermediate and iteration > 0:
        old_file = self.m_configuration.kmeans_intermediate_file % (iteration - 1)
        old_dir = os.path.dirname(old_file)
        # Only remove what is still there, so that re-running this step after
        # the directory has already been cleaned does not fail.
        if os.path.isdir(old_dir):
            utils.info("Removing old intermediate directory '%s'" % old_dir)
            shutil.rmtree(old_dir)
def gmm_mstep(self, counts, force=False):
    """Performs a single M-step of the GMM training (non-parallel)

    The GMM statistics written by the E-step jobs of the current iteration
    (``self.m_args.iteration``) are read -- from one file covering the whole
    training list if it exists, otherwise from the per-job files -- and
    summed up.  The M-step is applied to the GMM machine of the current
    iteration, and the result is saved as the machine of the next iteration
    and copied to ``self.m_configuration.projector_file``.

    Keyword parameters:

    counts
      Number of training files per E-step job; used to reconstruct the index
      ranges (and hence the names) of the per-job statistics files.

    force
      Forwarded to ``__check_file__`` of the tool chain; when that check
      succeeds the M-step is skipped because the new machine already exists.
    """
    # Imported at the top of the function: a function-scope import makes
    # `shutil` a local name, so it has to be bound on EVERY path that reaches
    # the cleanup at the end, including the "skip" branch.
    import shutil

    iteration = self.m_args.iteration
    old_machine_file = self.m_configuration.gmm_intermediate_file % iteration
    new_machine_file = self.m_configuration.gmm_intermediate_file % (iteration + 1)

    if self.m_tool_chain.__check_file__(new_machine_file, force, 1000):
        utils.info("UBM training: Skipping GMM M-Step since the file '%s' already exists" % new_machine_file)
    else:
        # get the files from e-step
        training_list = self.m_file_selector.training_feature_list()

        # try if there is one file containing all data
        single_stats_file = self.m_configuration.gmm_stats_file % (iteration, 0, len(training_list))
        if os.path.exists(single_stats_file):
            # load stats file
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(single_stats_file))
        else:
            # load several files
            job_ids = range(self.__generate_job_array__(training_list, counts)[1])
            job_indices = [
                (counts * job_id, min(counts * (job_id + 1), len(training_list)))
                for job_id in job_ids
            ]
            stats_files = [
                self.m_configuration.gmm_stats_file % (iteration, indices[0], indices[1])
                for indices in job_indices
            ]

            # read all stats files and accumulate them
            gmm_stats = bob.machine.GMMStats(bob.io.HDF5File(stats_files[0]))
            for stats_file in stats_files[1:]:
                gmm_stats += bob.machine.GMMStats(bob.io.HDF5File(stats_file))

        # read some features (needed for computation, but not really required)
        data = numpy.array(bob.io.load(str(training_list[0])))

        # load the old gmm machine
        gmm_machine = bob.machine.GMMMachine(bob.io.HDF5File(old_machine_file))

        # initialize the trainer
        gmm_trainer = bob.trainer.ML_GMMTrainer(
            self.m_tool.m_update_means,
            self.m_tool.m_update_variances,
            self.m_tool.m_update_weights)
        gmm_trainer.responsibilities_threshold = self.m_tool.m_responsibility_threshold
        gmm_trainer.initialize(gmm_machine, data)
        gmm_trainer.gmm_statistics = gmm_stats

        # Calls M-step
        gmm_trainer.m_step(gmm_machine, data)

        # Saves the updated GMM machine and publishes it as the projector file
        utils.ensure_dir(os.path.dirname(new_machine_file))
        gmm_machine.save(bob.io.HDF5File(new_machine_file, 'w'))
        shutil.copy(new_machine_file, self.m_configuration.projector_file)

    if self.m_args.clean_intermediate and iteration > 0:
        old_file = self.m_configuration.gmm_intermediate_file % (iteration - 1)
        old_dir = os.path.dirname(old_file)
        # Only remove what is still there, so that re-running this step after
        # the directory has already been cleaned does not fail.
        if os.path.isdir(old_dir):
            utils.info("Removing old intermediate directory '%s'" % old_dir)
            shutil.rmtree(old_dir)
def no_norm_file(self, model_id, group):
    """Returns the (non-normalized) score text file of one model.

    The file is located in the sub-directory ``group`` of the first score
    directory (``self.score_directories[0]``) and is named after the model
    id; the directory is created on demand.
    """
    group_dir = os.path.join(self.score_directories[0], group)
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(group_dir)
    file_name = str(model_id) + ".txt"
    return os.path.join(group_dir, file_name)
def zt_norm_result_file(self, group):
    """Returns the combined ZT-normalized score text file of the given group.

    The file lives directly in the second score directory
    (``self.score_directories[1]``), which is created on demand.
    """
    result_dir = self.score_directories[1]
    # make sure that the directory exists before handing out a path into it
    utils.ensure_dir(result_dir)
    file_name = "scores-" + group
    return os.path.join(result_dir, file_name)