def total_variability(self,
                          stat_server_filename,
                          ubm,
                          tv_rank,
                          nb_iter=20,
                          min_div=True,
                          tv_init=None,
                          batch_size=300,
                          save_init=False,
                          output_file_name=None,
                          num_thread=1):
        """
        Train a total variability model using multiple processes on a single node.
        This method is the recommended way to train a total variability matrix.

        Optimization:
            Only half of each symmetric matrix is stored.
            Sessions are processed per batch in order to control the memory footprint.
            Batches are processed by a pool of workers running in different processes.
            The implementation is based on a multiple-producers / single-consumer approach.

        :param stat_server_filename: a StatServer file name, or a list of file names, to process
        :param ubm: a Mixture object
        :param tv_rank: rank of the total variability model
        :param nb_iter: number of EM iterations
        :param min_div: boolean, if True, apply minimum divergence re-estimation
        :param tv_init: initial matrix to start the EM iterations with
        :param batch_size: number of sessions to load in memory for each worker
        :param save_init: boolean, if True, save the initial matrix
        :param output_file_name: name of the file where to save the matrix
        :param num_thread: number of processes to run in parallel
        """
        if not isinstance(stat_server_filename, list):
            stat_server_filename = [stat_server_filename]

        assert (isinstance(ubm, Mixture)
                and ubm.validate()), "Second argument must be a proper Mixture"
        assert (isinstance(nb_iter, int)
                and (0 < nb_iter)), "nb_iter must be a positive integer"

        gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

        # Set useful variables
        with h5py.File(stat_server_filename[0],
                       'r') as fh:  # open the first StatServer to get size
            _, sv_size = fh['stat1'].shape
            feature_size = fh['stat1'].shape[1] // fh['stat0'].shape[1]
            distrib_nb = fh['stat0'].shape[1]

        upper_triangle_indices = numpy.triu_indices(tv_rank)
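        # Symmetric (tv_rank x tv_rank) accumulators are stored as flat
        # vectors holding only their upper triangle
        # (tv_rank * (tv_rank + 1) // 2 values) and are unpacked with
        # these indices in the M-step below.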

        # Mean and Sigma are initialized to zero since the statistics are centered
        self.mean = numpy.zeros(ubm.get_mean_super_vector().shape,
                                dtype=STAT_TYPE)
        if tv_init is None:
            self.F = numpy.random.randn(sv_size, tv_rank).astype(STAT_TYPE)
        else:
            self.F = tv_init
        self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape,
                                 dtype=STAT_TYPE)

        # Save init if required
        if output_file_name is None:
            output_file_name = "temporary_factor_analyser"
        if save_init:
            self.write(output_file_name + "_init.h5")

        # Estimate TV iteratively
        for it in range(nb_iter):

            # Create serialized accumulators for the list of models to process
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', RuntimeWarning)
                _A = serialize(
                    numpy.zeros((distrib_nb, tv_rank * (tv_rank + 1) // 2),
                                dtype=STAT_TYPE))
                _C = serialize(numpy.zeros((tv_rank, sv_size),
                                           dtype=STAT_TYPE))
                _R = serialize(
                    numpy.zeros((tv_rank * (tv_rank + 1) // 2),
                                dtype=STAT_TYPE))

            total_session_nb = 0

            # E-step
            # Accumulate statistics for each StatServer from the list
            for stat_server_file in stat_server_filename:

                # get info from the current StatServer
                with h5py.File(stat_server_file, 'r') as fh:
                    nb_sessions = fh["modelset"].shape[0]
                    total_session_nb += nb_sessions
                    batch_nb = int(
                        numpy.ceil(nb_sessions / float(batch_size)))
                    batch_indices = numpy.array_split(
                        numpy.arange(nb_sessions), batch_nb)

                    manager = multiprocessing.Manager()
                    q = manager.Queue()
                    pool = multiprocessing.Pool(num_thread + 2)

                    # Put the consumer to work first
                    watcher = pool.apply_async(e_gather, ((_A, _C, _R), q))
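                    # e_gather runs as the single consumer: it sums the partial
                    # accumulators posted by the workers into (_A, _C, _R)
                    # until it receives the sentinel pushed after all jobs finish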
                    # fire off workers
                    jobs = []

                    # Load data per batch to reduce the memory footprint
                    for batch_idx in tqdm(batch_indices,
                                          desc="Iteration# {}".format(it + 1)):

                        # Create the argument list for a worker process
                        arg = fh["stat0"][batch_idx, :], fh["stat1"][
                            batch_idx, :], ubm, self.F
                        job = pool.apply_async(e_worker, (arg, q))
                        jobs.append(job)

                    # collect results from the workers through the pool result queue
                    for job in jobs:
                        job.get()
                    # now we are done, kill the consumer
                    q.put((None, None, None, None))
                    pool.close()

                    _A, _C, _R = watcher.get()

            _R /= total_session_nb

            # M-step
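            # For each distribution c, unpack the packed accumulator _A[c]
            # into a full symmetric matrix and solve it against the matching
            # columns of _C to update the corresponding rows of F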
            _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
            for c in range(distrib_nb):
                distrib_idx = range(c * feature_size, (c + 1) * feature_size)
                _A_tmp[upper_triangle_indices] = _A_tmp.T[
                    upper_triangle_indices] = _A[c, :]
                self.F[distrib_idx, :] = scipy.linalg.solve(
                    _A_tmp, _C[:, distrib_idx]).T

            # Minimum divergence
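            # (rescaling F by the Cholesky factor of the average posterior
            # second moment _R whitens the estimated latent distribution
            # back towards a standard normal prior)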
            if min_div:
                _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
                _R_tmp[upper_triangle_indices] = _R_tmp.T[
                    upper_triangle_indices] = _R
                ch = scipy.linalg.cholesky(_R_tmp)
                self.F = self.F.dot(ch)

            # Save the current FactorAnalyser
            if output_file_name is not None:
                if it < nb_iter - 1:
                    self.write(output_file_name + "_it-{}.h5".format(it))
                else:
                    self.write(output_file_name + ".h5")

    def extract_ivectors(self,
                         ubm,
                         stat_server_filename,
                         prefix='',
                         batch_size=300,
                         uncertainty=False,
                         num_thread=1):
        """
        Parallel extraction of i-vectors using the multiprocessing module.

        :param ubm: Mixture object (the UBM)
        :param stat_server_filename: name of the file from which the input StatServer is read
        :param prefix: prefix used to store the StatServer in its file
        :param batch_size: number of sessions to process in a batch
        :param uncertainty: a boolean, if True, also return the diagonals of the uncertainty matrices
        :param num_thread: number of processes to run in parallel
        :return: a StatServer with i-vectors in the stat1 attribute and, if uncertainty is True,
            a matrix holding the diagonals of the per-session uncertainty matrices
        """
        assert (isinstance(ubm, Mixture)
                and ubm.validate()), "Second argument must be a proper Mixture"

        tv_rank = self.F.shape[1]

        # Set useful variables
        with h5py.File(stat_server_filename,
                       'r') as fh:  # open the StatServer to get sizes
            _, sv_size = fh[prefix + 'stat1'].shape
            nb_sessions = fh[prefix + "modelset"].shape[0]

            iv_server = StatServer()
            iv_server.modelset = fh[prefix + 'modelset'][()]
            iv_server.segset = fh[prefix + 'segset'][()]

            tmpstart = fh[prefix + "start"][()]
            tmpstop = fh[prefix + "stop"][()]
            iv_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
            iv_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]

            iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', RuntimeWarning)
                iv_server.stat1 = serialize(numpy.zeros(
                    (nb_sessions, tv_rank)))
                iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))

            nb_sessions = iv_server.modelset.shape[0]
            batch_nb = int(numpy.ceil(nb_sessions / float(batch_size)))
            batch_indices = numpy.array_split(numpy.arange(nb_sessions),
                                              batch_nb)

            manager = multiprocessing.Manager()
            q = manager.Queue()
            pool = multiprocessing.Pool(num_thread + 2)

            # Put the listener to work first
            watcher = pool.apply_async(iv_collect,
                                       ((iv_server.stat1, iv_sigma), q))
            # fire off workers
            jobs = []

            # Load data per batch to reduce the memory footprint
            for batch_idx in batch_indices:

                # Create the argument list for a worker process
                arg = batch_idx, fh[prefix + "stat0"][batch_idx, :], fh[
                    prefix + "stat1"][batch_idx, :], ubm, self.F
                job = pool.apply_async(iv_extract_on_batch, (arg, q))
                jobs.append(job)

            # collect results from the workers through the pool result queue
            for job in jobs:
                job.get()

            # now we are done, kill the listener
            q.put((None, None, None))
            pool.close()

            iv_server.stat1, iv_sigma = watcher.get()
        if uncertainty:
            return iv_server, iv_sigma
        else:
            return iv_server
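# A hedged usage sketch for the two methods above; the file names, rank and
# thread count are hypothetical placeholders, and loading the UBM through the
# Mixture constructor is assumed:
#
#     import sidekit
#
#     ubm = sidekit.Mixture("ubm.h5")          # hypothetical UBM file
#     fa = sidekit.FactorAnalyser()
#     fa.total_variability("stats.h5",         # hypothetical StatServer file
#                          ubm,
#                          tv_rank=400,
#                          nb_iter=10,
#                          output_file_name="TV",
#                          num_thread=4)
#     iv_server = fa.extract_ivectors(ubm, "stats.h5", num_thread=4)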
Example #3
def total_variability(stat_server_file_name,
                      ubm,
                      tv_rank,
                      nb_iter=20,
                      min_div=True,
                      tv_init=None,
                      save_init=False,
                      output_file_name=None):
    """
    Train a total variability model using multiple processes on multiple nodes with MPI.

    Example of how to train a total variability matrix using MPI.
    Here is what your script should look like:

    ----------------------------------------------------------------

    import sidekit

    total_variability("/lium/spk1/larcher/expe/MPI_TV/data/statserver.h5",
                      ubm,
                      tv_rank,
                      nb_iter=tv_iteration,
                      min_div=True,
                      tv_init=tv_new_init2,
                      output_file_name="data/TV_mpi")

    ----------------------------------------------------------------

    This script should be run with the mpirun command (see the mpi4py website,
    http://pythonhosted.org/mpi4py/, for more information on how to use it):

        mpirun --hostfile hostfile ./my_script.py

    :param stat_server_file_name: name of the StatServer file to load (make sure you provide an absolute path
        and that it is accessible from all your nodes)
    :param ubm: a Mixture object
    :param tv_rank: rank of the total variability model
    :param nb_iter: number of EM iterations
    :param min_div: boolean, if True, apply minimum divergence re-estimation
    :param tv_init: initial matrix to start the EM iterations with
    :param save_init: boolean, if True, save the initial matrix
    :param output_file_name: name of the file where to save the matrix
    """
    comm = MPI.COMM_WORLD

    comm.Barrier()

    # This line allows processing of a single StatServer or a list of StatServers
    if not isinstance(stat_server_file_name, list):
        stat_server_file_name = [stat_server_file_name]

    # Initialize useful variables
    sv_size = ubm.get_mean_super_vector().shape[0]
    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
    nb_distrib, feature_size = ubm.mu.shape
    upper_triangle_indices = numpy.triu_indices(tv_rank)

    # Initialize the FactorAnalyser; mean and Sigma are initialized to zero since the statistics are centered
    factor_analyser = FactorAnalyser()
    factor_analyser.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
    if tv_init is None:
        factor_analyser.F = numpy.random.randn(sv_size,
                                               tv_rank).astype(data_type)
    else:
        factor_analyser.F = tv_init
    factor_analyser.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)

    # Save init if required
    if comm.rank == 0:
        if output_file_name is None:
            output_file_name = "temporary_factor_analyser"
        if save_init:
            factor_analyser.write(output_file_name + "_init.h5")

    # Iterative training of the FactorAnalyser
    for it in range(nb_iter):
        if comm.rank == 0:
            logging.critical("Start it {}".format(it))

        _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2),
                         dtype=data_type)
        _C = numpy.zeros((tv_rank, sv_size), dtype=data_type)
        _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=data_type)

        if comm.rank == 0:
            total_session_nb = 0

        # E-step
        for stat_server_file in stat_server_file_name:

            with h5py.File(stat_server_file, 'r') as fh:
                nb_sessions = fh["segset"].shape[0]

                if comm.rank == 0:
                    total_session_nb += nb_sessions

                comm.Barrier()
                if comm.rank == 0:
                    logging.critical(
                        "Process file: {}".format(stat_server_file))

                # Allocate a list of sessions to process to each node
                local_session_idx = numpy.array_split(numpy.arange(nb_sessions),
                                                      comm.size)
                stat0 = fh['stat0'][local_session_idx[comm.rank], :]
                stat1 = fh['stat1'][local_session_idx[comm.rank], :]
                e_h, e_hh = e_on_batch(stat0, stat1, ubm, factor_analyser.F)
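                # e_h holds the posterior means E[w] per session and e_hh the
                # packed upper triangles of E[ww^T] (see the sketch at the end
                # of this file)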

                _A += stat0.T.dot(e_hh)
                _C += e_h.T.dot(stat1)
                _R += numpy.sum(e_hh, axis=0)

            comm.Barrier()

        comm.Barrier()

        # Sum all statistics
        if comm.rank == 0:
            # only processor 0 will actually get the data
            total_A = numpy.zeros_like(_A)
            total_C = numpy.zeros_like(_C)
            total_R = numpy.zeros_like(_R)
        else:
            total_A = [None] * _A.shape[0]
            total_C = None
            total_R = None

        # Accumulate _A row by row, using a list, to work around the MPI limitation
        # that buffers larger than 4GB cannot be reduced in a single call
        for ii in range(_A.shape[0]):
            _tmp = copy.deepcopy(_A[ii])
            if comm.rank == 0:
                _total_A = numpy.zeros_like(total_A[ii])
            else:
                _total_A = None

            comm.Reduce([_tmp, MPI.FLOAT], [_total_A, MPI.FLOAT],
                        op=MPI.SUM,
                        root=0)
            if comm.rank == 0:
                total_A[ii] = copy.deepcopy(_total_A)

        comm.Reduce([_C, MPI.FLOAT], [total_C, MPI.FLOAT], op=MPI.SUM, root=0)

        comm.Reduce([_R, MPI.FLOAT], [total_R, MPI.FLOAT], op=MPI.SUM, root=0)

        comm.Barrier()

        # M-step
        if comm.rank == 0:

            total_R /= total_session_nb
            _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
            for c in range(nb_distrib):
                distrib_idx = range(c * feature_size, (c + 1) * feature_size)
                _A_tmp[upper_triangle_indices] = _A_tmp.T[
                    upper_triangle_indices] = total_A[c, :]
                factor_analyser.F[distrib_idx, :] = scipy.linalg.solve(
                    _A_tmp, total_C[:, distrib_idx]).T

            # Minimum divergence
            if min_div:
                _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
                _R_tmp[upper_triangle_indices] = _R_tmp.T[
                    upper_triangle_indices] = total_R
                ch = scipy.linalg.cholesky(_R_tmp)
                factor_analyser.F = factor_analyser.F.dot(ch)

            # Save the current FactorAnalyser
            if output_file_name is not None:
                if it < nb_iter - 1:
                    factor_analyser.write(output_file_name +
                                          "_it-{}.h5".format(it))
                else:
                    factor_analyser.write(output_file_name + ".h5")
        factor_analyser.F = comm.bcast(factor_analyser.F, root=0)
        comm.Barrier()
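
# A minimal sketch of the per-session E-step that the e_on_batch helper used
# above could compute. This is an illustration only: the real helper takes the
# ubm object and handles centering and whitening itself, so the signature below
# is an assumption. For a session with zero-order statistics N and centered
# first-order statistics s, the posterior of the latent factor w is Gaussian:
#     Cov(w) = (I + F^T diag(N_sv) Sigma^{-1} F)^{-1}
#     E[w]   = Cov(w) F^T Sigma^{-1} s
# where N_sv expands N to the supervector dimension.
import numpy


def e_on_batch_sketch(stat0, stat1, inv_sigma, F):
    """Posterior means E[w] and packed second moments E[ww^T] for a batch.

    stat0:     (n_sessions, n_distrib) zero-order statistics
    stat1:     (n_sessions, sv_size) centered first-order statistics
    inv_sigma: (sv_size,) diagonal of the inverse UBM covariance (assumed diagonal)
    F:         (sv_size, tv_rank) total variability matrix
    """
    n_sessions, sv_size = stat1.shape
    tv_rank = F.shape[1]
    feature_size = sv_size // stat0.shape[1]
    idx = numpy.triu_indices(tv_rank)

    # Whiten F once per batch: rows are scaled by Sigma^{-1/2}
    f_white = F * numpy.sqrt(inv_sigma)[:, None]

    e_h = numpy.zeros((n_sessions, tv_rank))
    e_hh = numpy.zeros((n_sessions, tv_rank * (tv_rank + 1) // 2))
    for sess in range(n_sessions):
        # Expand per-distribution occupations to the supervector dimension
        n_sv = numpy.repeat(stat0[sess], feature_size)
        inv_lambda = numpy.linalg.inv(
            numpy.eye(tv_rank) + (f_white * n_sv[:, None]).T.dot(f_white))
        e_h[sess] = inv_lambda.dot(
            f_white.T.dot(stat1[sess] * numpy.sqrt(inv_sigma)))
        # Store only the upper triangle, matching the packed _A and _R accumulators
        e_hh[sess] = (inv_lambda + numpy.outer(e_h[sess], e_h[sess]))[idx]
    return e_h, e_hh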