    def total_variability(self,
                          stat_server_filename,
                          ubm,
                          tv_rank,
                          nb_iter=20,
                          min_div=True,
                          tv_init=None,
                          batch_size=300,
                          save_init=False,
                          output_file_name=None,
                          num_thread=1):
        """
        Train a total variability model using multiple processes on a single node.
        This method is the recommended one to train a Total Variability matrix.

        Optimization:
            Only half of each symmetric matrix is stored.
            Sessions are processed per batch in order to control the memory footprint.
            Batches are processed by a pool of workers running in different processes.
            The implementation is based on a multiple-producers / single-consumer approach.

        :param stat_server_filename: a list of StatServer file names to process
        :param ubm: a Mixture object
        :param tv_rank: rank of the total variability model
        :param nb_iter: number of EM iterations
        :param min_div: boolean, if True, apply minimum divergence re-estimation
        :param tv_init: initial matrix to start the EM iterations with
        :param batch_size: size of batch to load in memory for each worker
        :param save_init: boolean, if True, save the initial matrix
        :param output_file_name: name of the file where to save the matrix
        :param num_thread: number of processes to run in parallel
        """
        if not isinstance(stat_server_filename, list):
            stat_server_filename = [stat_server_filename]

        assert isinstance(ubm, Mixture) and ubm.validate(), "Second argument must be a proper Mixture"
        assert isinstance(nb_iter, int) and nb_iter > 0, "nb_iter must be a positive integer"

        gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

        # Set useful variables
        with h5py.File(stat_server_filename[0], 'r') as fh:  # open the first StatServer to get sizes
            _, sv_size = fh['stat1'].shape
            feature_size = fh['stat1'].shape[1] // fh['stat0'].shape[1]
            distrib_nb = fh['stat0'].shape[1]

        upper_triangle_indices = numpy.triu_indices(tv_rank)

        # mean and Sigma are initialized at ZERO as statistics are centered
        self.mean = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE)
        if tv_init is None:
            self.F = serialize(numpy.random.randn(sv_size, tv_rank).astype(STAT_TYPE))
        else:
            self.F = tv_init
        self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE)

        # Save init if required
        if output_file_name is None:
            output_file_name = "temporary_factor_analyser"
        if save_init:
            self.write(output_file_name + "_init.h5")

        # Estimate TV iteratively
        for it in range(nb_iter):

            # Create serialized accumulators for the list of models to process
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', RuntimeWarning)
                _A = serialize(numpy.zeros((distrib_nb, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE))
                _C = serialize(numpy.zeros((tv_rank, sv_size), dtype=STAT_TYPE))
                _R = serialize(numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE))

            total_session_nb = 0

            # E-step
            # Accumulate statistics for each StatServer from the list
            for stat_server_file in stat_server_filename:

                # Get info from the current StatServer
                with h5py.File(stat_server_file, 'r') as fh:
                    nb_sessions = fh["modelset"].shape[0]
                    total_session_nb += nb_sessions
                    batch_nb = int(numpy.ceil(nb_sessions / float(batch_size)))
                    batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)

                    manager = multiprocessing.Manager()
                    q = manager.Queue()
                    pool = multiprocessing.Pool(num_thread + 2)

                    # Put the single consumer to work first
                    watcher = pool.apply_async(e_gather, ((_A, _C, _R), q))

                    # Fire off workers; load data per batch to reduce the memory footprint
                    jobs = []
                    for batch_idx in tqdm(batch_indices, desc="Iteration# {}".format(it + 1)):
                        # Create the argument tuple for a worker process
                        arg = fh["stat0"][batch_idx, :], fh["stat1"][batch_idx, :], ubm, self.F
                        job = pool.apply_async(e_worker, (arg, q))
                        jobs.append(job)

                    # Collect results from the workers through the pool result queue
                    for job in jobs:
                        job.get()

                    # Now we are done: stop the consumer and close the pool
                    q.put((None, None, None, None))
                    pool.close()

                    _A, _C, _R = watcher.get()

            _R /= total_session_nb

            # M-step
            _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
            for c in range(distrib_nb):
                distrib_idx = range(c * feature_size, (c + 1) * feature_size)
                # Rebuild the full symmetric matrix from its upper triangle
                _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = _A[c, :]
                self.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, _C[:, distrib_idx]).T

            # Minimum divergence re-estimation
            if min_div:
                _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
                _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = _R
                ch = scipy.linalg.cholesky(_R_tmp)
                self.F = self.F.dot(ch)

            # Save the current FactorAnalyser
            if output_file_name is not None:
                if it < nb_iter - 1:
                    self.write(output_file_name + "_it-{}.h5".format(it))
                else:
                    self.write(output_file_name + ".h5")
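    # Usage sketch for total_variability() above. Everything here (file names,
    # the UBM, the rank) is an assumption for illustration, not part of this
    # module:
    #
    #     fa = FactorAnalyser()
    #     fa.total_variability(["stats_train.h5"],        # hypothetical StatServer file(s)
    #                          ubm,                        # a trained Mixture
    #                          tv_rank=400,
    #                          nb_iter=10,
    #                          batch_size=300,
    #                          output_file_name="TV_matrix",  # writes TV_matrix_it-*.h5, then TV_matrix.h5
    #                          num_thread=4)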
    def extract_ivectors(self,
                         ubm,
                         stat_server_filename,
                         prefix='',
                         batch_size=300,
                         uncertainty=False,
                         num_thread=1):
        """
        Parallel extraction of i-vectors using the multiprocessing module.

        :param ubm: Mixture object (the UBM)
        :param stat_server_filename: name of the file from which the input StatServer is read
        :param prefix: prefix used to store the StatServer in its file
        :param batch_size: number of sessions to process in a batch
        :param uncertainty: a boolean, if True, return the diagonal of the uncertainty matrices
        :param num_thread: number of processes to run in parallel

        :return: a StatServer with i-vectors in the stat1 attribute
            and a matrix of uncertainty matrices (optional)
        """
        assert isinstance(ubm, Mixture) and ubm.validate(), "Second argument must be a proper Mixture"

        tv_rank = self.F.shape[1]

        # Set useful variables
        with h5py.File(stat_server_filename, 'r') as fh:  # open the StatServer to get sizes
            _, sv_size = fh[prefix + 'stat1'].shape
            nb_sessions = fh[prefix + "modelset"].shape[0]

            iv_server = StatServer()
            iv_server.modelset = fh[prefix + 'modelset'][()]
            iv_server.segset = fh[prefix + 'segset'][()]

            tmpstart = fh[prefix + "start"][()]
            tmpstop = fh[prefix + "stop"][()]
            iv_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
            iv_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]

            iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', RuntimeWarning)
                iv_server.stat1 = serialize(numpy.zeros((nb_sessions, tv_rank)))
                iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))

            nb_sessions = iv_server.modelset.shape[0]
            batch_nb = int(numpy.ceil(nb_sessions / float(batch_size)))
            batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)

            manager = multiprocessing.Manager()
            q = manager.Queue()
            pool = multiprocessing.Pool(num_thread + 2)

            # Put the listener (single consumer) to work first
            watcher = pool.apply_async(iv_collect, ((iv_server.stat1, iv_sigma), q))

            # Fire off workers; load data per batch to reduce the memory footprint
            jobs = []
            for batch_idx in batch_indices:
                # Create the argument tuple for a worker process
                arg = batch_idx, fh[prefix + "stat0"][batch_idx, :], fh[prefix + "stat1"][batch_idx, :], ubm, self.F
                job = pool.apply_async(iv_extract_on_batch, (arg, q))
                jobs.append(job)

            # Collect results from the workers through the pool result queue
            for job in jobs:
                job.get()

            # Now we are done: stop the listener and close the pool
            q.put((None, None, None))
            pool.close()

            iv_server.stat1, iv_sigma = watcher.get()

        if uncertainty:
            return iv_server, iv_sigma
        else:
            return iv_server
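    # Usage sketch for extract_ivectors() above (the file name is an assumption
    # for illustration). The returned StatServer holds one i-vector per session
    # in its stat1 attribute:
    #
    #     iv = fa.extract_ivectors(ubm, "stats_test.h5", num_thread=4)
    #     ivectors = iv.stat1                  # (nb_sessions, tv_rank) array
    #
    #     # with per-dimension uncertainty:
    #     iv, iv_sigma = fa.extract_ivectors(ubm, "stats_test.h5", uncertainty=True)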
def total_variability(stat_server_file_name,
                      ubm,
                      tv_rank,
                      nb_iter=20,
                      min_div=True,
                      tv_init=None,
                      save_init=False,
                      output_file_name=None):
    """
    Train a total variability model using multiple processes on multiple nodes with MPI.

    Example of how to train a total variability matrix using MPI.
    Here is what your script should look like:
    ----------------------------------------------------------------
    import sidekit

    total_variability("/lium/spk1/larcher/expe/MPI_TV/data/statserver.h5",
                      ubm,
                      tv_rank,
                      nb_iter=tv_iteration,
                      min_div=True,
                      tv_init=tv_new_init2,
                      output_file_name="data/TV_mpi")
    ----------------------------------------------------------------

    This script should be run using the mpirun command (see the MPI4PY website
    for more information about how to use it: http://pythonhosted.org/mpi4py/ )

        mpirun --hostfile hostfile ./my_script.py

    :param stat_server_file_name: name of the StatServer file to load (make sure you
        provide an absolute path and that the file is accessible from all your nodes)
    :param ubm: a Mixture object
    :param tv_rank: rank of the total variability model
    :param nb_iter: number of EM iterations
    :param min_div: boolean, if True, apply minimum divergence re-estimation
    :param tv_init: initial matrix to start the EM iterations with
    :param save_init: boolean, if True, save the initial matrix
    :param output_file_name: name of the file where to save the matrix
    """
    comm = MPI.COMM_WORLD
    comm.Barrier()

    # This allows processing of a single StatServer or a list of StatServers
    if not isinstance(stat_server_file_name, list):
        stat_server_file_name = [stat_server_file_name]

    # Initialize useful variables
    sv_size = ubm.get_mean_super_vector().shape[0]
    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
    nb_distrib, feature_size = ubm.mu.shape
    upper_triangle_indices = numpy.triu_indices(tv_rank)

    # Initialize the FactorAnalyser; mean and Sigma are initialized at ZERO
    # as statistics are centered
    factor_analyser = FactorAnalyser()
    factor_analyser.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
    if tv_init is None:
        factor_analyser.F = serialize(numpy.random.randn(sv_size, tv_rank).astype(data_type))
    else:
        factor_analyser.F = tv_init
    factor_analyser.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)

    # Save init if required
    if comm.rank == 0:
        if output_file_name is None:
            output_file_name = "temporary_factor_analyser"
        if save_init:
            factor_analyser.write(output_file_name + "_init.h5")

    # Iterative training of the FactorAnalyser
    for it in range(nb_iter):
        if comm.rank == 0:
            logging.critical("Start it {}".format(it))

        _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2), dtype=data_type)
        _C = numpy.zeros((tv_rank, sv_size), dtype=data_type)
        _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=data_type)

        if comm.rank == 0:
            total_session_nb = 0

        # E-step
        for stat_server_file in stat_server_file_name:
            with h5py.File(stat_server_file, 'r') as fh:
                nb_sessions = fh["segset"].shape[0]
                if comm.rank == 0:
                    total_session_nb += nb_sessions

                comm.Barrier()
                if comm.rank == 0:
                    logging.critical("Process file: {}".format(stat_server_file))

                # Allocate a slice of the sessions to each node
                local_session_idx = numpy.array_split(range(nb_sessions), comm.size)
                stat0 = fh['stat0'][local_session_idx[comm.rank], :]
                stat1 = fh['stat1'][local_session_idx[comm.rank], :]
                e_h, e_hh = e_on_batch(stat0, stat1, ubm, factor_analyser.F)
                _A += stat0.T.dot(e_hh)
                _C += e_h.T.dot(stat1)
                _R += numpy.sum(e_hh, axis=0)

            comm.Barrier()
        comm.Barrier()

        # Sum all statistics on the root node
        if comm.rank == 0:
            # Only processor 0 will actually get the data
            total_A = numpy.zeros_like(_A)
            total_C = numpy.zeros_like(_C)
            total_R = numpy.zeros_like(_R)
        else:
            total_A = [None] * _A.shape[0]
            total_C = None
            total_R = None

        # Accumulate _A row by row in order to work around a limitation of MPI
        # (matrices bigger than 4GB cannot be reduced in a single call)
        for ii in range(_A.shape[0]):
            _tmp = copy.deepcopy(_A[ii])
            if comm.rank == 0:
                _total_A = numpy.zeros_like(total_A[ii])
            else:
                _total_A = None
            comm.Reduce([_tmp, MPI.FLOAT], [_total_A, MPI.FLOAT], op=MPI.SUM, root=0)
            if comm.rank == 0:
                total_A[ii] = copy.deepcopy(_total_A)

        comm.Reduce([_C, MPI.FLOAT], [total_C, MPI.FLOAT], op=MPI.SUM, root=0)
        comm.Reduce([_R, MPI.FLOAT], [total_R, MPI.FLOAT], op=MPI.SUM, root=0)
        comm.Barrier()

        # M-step
        if comm.rank == 0:
            total_R /= total_session_nb

            _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
            for c in range(nb_distrib):
                distrib_idx = range(c * feature_size, (c + 1) * feature_size)
                # Rebuild the full symmetric matrix from its upper triangle
                _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = total_A[c, :]
                factor_analyser.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, total_C[:, distrib_idx]).T

            # Minimum divergence re-estimation
            if min_div:
                _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
                _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = total_R
                ch = scipy.linalg.cholesky(_R_tmp)
                factor_analyser.F = factor_analyser.F.dot(ch)

            # Save the current FactorAnalyser
            if output_file_name is not None:
                if it < nb_iter - 1:
                    factor_analyser.write(output_file_name + "_it-{}.h5".format(it))
                else:
                    factor_analyser.write(output_file_name + ".h5")

        # Broadcast the updated matrix to all nodes before the next iteration
        factor_analyser.F = comm.bcast(factor_analyser.F, root=0)
        comm.Barrier()
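# A minimal launch sketch for the MPI trainer above. Everything below (paths,
# UBM file, rank, iteration count) is an assumption for illustration, not part
# of the library. Save it as train_tv_mpi.py and run it under MPI, e.g.:
#     mpirun -np 4 python train_tv_mpi.py
if __name__ == "__main__":
    import sidekit

    ubm = sidekit.Mixture("ubm.h5")             # hypothetical UBM file
    total_variability("/data/statserver.h5",    # must be reachable from every node
                      ubm,
                      tv_rank=400,
                      nb_iter=10,
                      min_div=True,
                      output_file_name="TV_mpi")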