Пример #1
0
 def __init__(self):
     """Set up the utterance store, the training-state flags and the model."""
     # Store of utterances, created with its default arguments.
     self.utterances = Utterances()
     # Training-state flags; all start out cleared.
     self.ongoing_training = False
     # NOTE(review): the attribute name is misspelled ("schedulued"); it is
     # kept as is because code outside this block may read it.
     self.schedulued_training = False
     self.repeat_training = False
     # The model is whatever `self.load()` returns.
     self.model = self.load()
     self.training_stack = []
Пример #2
0
    def __init__(self,
                 K_max,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 init_assignments="rand",
                 wip=0):
        """
        Initialise the utterance bookkeeping and the K-means acoustic model.

        Parameters
        ----------
        K_max : int
            Number of K-means components; passed to `KMeans` and used as the
            exclusive upper bound of the initial component labels.
        embedding_mats, vec_ids_dict : dict
            Handed to `process_embeddings`, which returns the stacked
            embeddings, the per-utterance vector IDs and the utterance labels.
        durations_dict, landmarks_dict : dict
            Looked up per utterance label and forwarded to `Utterances`.
        n_slices_min, n_slices_max, min_duration, p_boundary_init
            Forwarded unchanged to `Utterances`.
        init_assignments : str
            - "rand": every initially segmented embedding gets a uniformly
              random component in `[0, K_max)`.
            - "spread": the labels `0..K_max-1` are repeated until there is
              one per initial embedding and then shuffled, so the components
              start out (almost) equally sized.
            Any other value leaves every embedding unassigned (-1).
        wip
            Stored on the instance as `self.wip`.
        """

        # Attributes from parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list
        # (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats,
            vec_ids_dict
        )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Embeddings in the initial segmentation; -1 marks "no embedding"
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        print("No. initial embeddings: {}".format(init_embeds.shape[0]))

        # Initialize the K-means components; -1 means "unassigned"
        assignments = -1 * np.ones(N, dtype=int)
        if init_assignments == "rand":
            assignments[init_embeds] = np.random.randint(
                0, K_max, len(init_embeds))
        elif init_assignments == "spread":
            n_init_embeds = len(init_embeds)
            n_repeats = int(np.ceil(float(n_init_embeds) / K_max))
            # `list(...)` is required on Python 3, where a `range` object
            # supports neither repetition with `*` nor in-place shuffling.
            assignment_list = (list(range(K_max)) * n_repeats)[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list)
        self.acoustic_model = KMeans(embeddings, K_max, assignments)
Пример #3
0
def parse_excel(path_to_data, sheet_index):
    """
    Parse the hand-made Excel sheet into utterance documents and save them.

    The sheet is read with pandas, reshaped into (intent, utterance) rows,
    normalised (lower-cased, transliterated to ASCII, punctuation replaced by
    spaces) and handed to `df_to_doc`; the result is stored through
    `Utterances().save_utterances`.

    Parameters
    ----------
    path_to_data
        Location of the Excel file, passed to `pandas.read_excel`.
    sheet_index
        Sheet selector, passed as `sheet_name` to `pandas.read_excel`.

    Returns
    -------
    None
    """
    df = pd.read_excel(path_to_data, sheet_name=sheet_index)
    # Fill missing intents from the closest preceding row. `.ffill()` is the
    # supported spelling; `fillna(method="ffill")` is deprecated in pandas.
    df["intent"] = df["intent"].ffill()
    df = df.drop(["téma", "podkategorie"], axis=1)

    # Melt the data to two columns: [intent, utterance]. The helper column
    # holding the original column names is not needed afterwards.
    df = pd.melt(df, id_vars="intent", value_name="utterance", var_name="drop")
    df.drop("drop", axis=1, inplace=True)

    df["utterance"] = df["utterance"].str.lower()
    df["intent"] = df["intent"].str.lower()

    # `unidecode` only accepts strings, so missing values are temporarily
    # represented by empty strings.
    df.replace(np.nan, '', regex=True, inplace=True)

    df["utterance"] = df["utterance"].apply(unidecode.unidecode)
    df["intent"] = df["intent"].apply(unidecode.unidecode)
    df["intent"] = df["intent"].str.replace(" ", "_", regex=False)
    df["utterance"] = df["utterance"].str.replace(r"[^A-Za-z0-9 ]+",
                                                  " ",
                                                  regex=True)

    # Turn the empty placeholders back into NaN and drop rows that have no
    # utterance text.
    df.replace('', np.nan, regex=True, inplace=True)
    df.dropna(subset=["utterance"], inplace=True)

    docs = df_to_doc(df)

    Utterances().save_utterances(docs)
class BigramAcousticWordseg(object):
    """
    Bigram word segmentation of speech using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. Blocked
    Gibbs sampling is used for inference. In the member functions, the index
    `i` generally refers to the index of an utterance.

    Parameters
    ----------
    am_K : int
        Acoustic model parameter. May be None when `seed_assignments_dict` is
        given, in which case it is derived from the seed clusters.
        NOTE(review): the language model is constructed with `K=am_K` before
        that derivation happens -- confirm that None is acceptable there.
    am_param_prior : e.g. instance of `FixedVarPrior`
        The acoustic model prior on the mean and covariance parameters.
    lm_params : dict
        A dictionary with at least an entry for "type". The constructor only
        sets up a language model for the type "smooth", which additionally
        requires the entries "intrp_lambda", "a" and "b".
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    seed_boundaries_dict : dict of list
        Seed boundaries for every utterance; forwarded to `Utterances`. If not
        given, no seeding is used.
        NOTE(review): this parameter used to be documented twice with
        different formats -- as tuples of start (inclusive) and end
        (exclusive) embedding slice indices of a seed token, and as boundary
        positions in 10 ms units (same format as `landmarks_dict`). Confirm
        the expected format against `Utterances`.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used. Requires
        `seed_boundaries_dict` to be given as well.
    covariance_type : str
        Forwarded to the acoustic model.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    beta_sent_boundary : float
        The symmetric Beta prior on the end of sentence probability; if this is
        set to -1, sentence boundary probabilities are not taken into account.
    lms : float
        Language model scaling factor.
    wip : float
        Word insertion penalty.
    fb_type : str
        The type of forward-backward algorithm to use:
        - "unigram": In this case, segmentation is carried out as it is done in
          the unigram case; i.e. only assignments are sampled using the bigram
          model.
        - "bigram": Sample assignments using the bigram language model.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined (ignored when `seed_assignments_dict` is given):
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model. This option is currently not implemented; the constructor
          fails an assertion when it is selected.
    time_power_term : float
        Scaling the per-frame scaling; with 1.2 instead of 1, we get fewer
        words (prefer longer words).

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : BigramFBGMM
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    lm : BigramSmoothLM
        The language model. It holds the count statistics:
        - `unigram_counts`: counts for each of the K components.
        - `bigram_counts`: element (j, i) is the count N_i_given_j of the
          component i following the component j.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """

    def __init__(self, am_K, am_param_prior, lm_params,
            embedding_mats, vec_ids_dict, durations_dict, landmarks_dict,
            seed_boundaries_dict=None, seed_assignments_dict=None,
            covariance_type="fixed", n_slices_min=0,
            n_slices_max=20, min_duration=0, p_boundary_init=0.5,
            beta_sent_boundary=2.0, lms=1., wip=0., fb_type="bigram",
            init_am_assignments="rand",
            time_power_term=1.):
        """
        Build the utterance store, the language model and the acoustic model.

        The steps are: store the scalar settings, stack the embeddings with
        `process_embeddings`, create `utterances` (which provides the initial
        segmentation), create the smoothed bigram language model, choose the
        initial component assignment of every initially segmented embedding
        (from the seeds when given, otherwise at random), create the acoustic
        model and finally collect the language model counts.

        Raises
        ------
        ValueError
            If `lm_params["type"]` is not "smooth", the only language model
            this constructor can set up.
        AssertionError
            If seed assignments are given without seed boundaries, if `am_K`
            is smaller than the number of seed clusters, or if
            `init_am_assignments` is "one-by-one" (not implemented) or
            unknown.
        """

        logger.info("Initializing")

        # Check parameters
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.lms = lms
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks, seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
            n_slices_max=n_slices_max, min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation
        # (-1 marks "no embedding")
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]

        # Setup language model. Fail right here for an unsupported type
        # instead of leaving `self.lm` unset and failing later with an
        # unrelated error.
        lm_type = lm_params["type"]
        if lm_type != "smooth":
            raise ValueError("unsupported language model type: " + str(lm_type))
        self.lm = BigramSmoothLM(
            lm_params["intrp_lambda"], lm_params["a"], lm_params["b"], am_K
            )

        # Provide the initial acoustic model assignments and initialize the model accordingly
        assignments = -1*np.ones(N, dtype=int)
        if seed_assignments_dict is not None:

            # Use seed assignments if provided
            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_init_embeds = np.array(self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                utt_init_assignments = np.array(seed_assignments_dict[utt][:])

                # Keep only the positions that hold a real embedding
                keep = np.where(utt_init_embeds != -1)
                utt_init_assignments = utt_init_assignments[keep]
                utt_init_embeds = utt_init_embeds[keep]

                # Integer seeds are used directly as cluster indices; any
                # other seed label gets the next running cluster index
                for seed in utt_init_assignments:
                    if seed not in self.seed_to_cluster:
                        if isinstance(seed, (int, long)):
                            self.seed_to_cluster[seed] = seed
                        else:
                            self.seed_to_cluster[seed] = i_cluster
                            i_cluster += 1
                assignments[utt_init_embeds] = [
                    self.seed_to_cluster[seed] for seed in utt_init_assignments
                    ]
            if am_K is None:
                am_K = max(self.seed_to_cluster.values()) + 1
            else:
                assert am_K >= max(self.seed_to_cluster.values()) + 1

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(
                embeddings, am_param_prior, am_K, assignments,
                covariance_type=covariance_type, lms=lms, lm=self.lm
                )

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))
            # Make sure we have consecutive values: whenever label `k` is
            # unused, every larger label is shifted down by one
            for k in xrange(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[np.where(init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(
                embeddings, am_param_prior, am_K, assignments,
                covariance_type=covariance_type, lms=lms, lm=self.lm
                )

        elif init_am_assignments == "one-by-one":
            # Not implemented
            assert False

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

        # Setup initial language model counts
        self.set_lm_counts()

    def set_fb_type(self, fb_type):
        self.fb_type = fb_type

        # Assign forward-backward function
        if fb_type == "bigram":
            self.fb_func = forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_bigram
        elif fb_type == "unigram":
            self.fb_func = unigram_acoustic_wordseg.forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_unigram
        else:
            assert False, "invalid `fb_type`: " + fb_type

    def set_lm_counts(self):
        """
        Add the current unsupervised transcript of every utterance to the
        language model counts.
        """
        n_utterances = self.utterances.D
        for utt_index in xrange(n_utterances):
            transcript = self.get_unsup_transcript_i(utt_index)
            self.lm.counts_from_utterance(transcript)

    def log_prob_z(self):
        """
        Return the log marginal probability of component assignment P(z).

        A fresh language model with the same hyperparameters as `self.lm` is
        used. The transcripts are visited token by token; each token is scored
        against the counts accumulated so far, after which the counts are
        updated. The first token of an utterance is scored with the unigram
        probability, every following token with the bigram probability given
        its predecessor.
        """
        lm_tmp = BigramSmoothLM(
            intrp_lambda=self.lm.intrp_lambda, a=self.lm.a, b=self.lm.b,
            K=self.lm.K
            )
        log_prob_z = 0.
        for i_utt in xrange(self.utterances.D):
            j_prev = None
            for i_cur in self.get_unsup_transcript_i(i_utt):
                if j_prev is not None:
                    log_prob_z += np.log(lm_tmp.prob_i_given_j(i_cur, j_prev))
                    lm_tmp.bigram_counts[j_prev, i_cur] += 1
                else:
                    log_prob_z += np.log(lm_tmp.prob_i(i_cur))
                lm_tmp.unigram_counts[i_cur] += 1
                # Remember the predecessor; without this the bigram branch
                # above is never taken.
                j_prev = i_cur
        return log_prob_z

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""
        log_prob_z = self.log_prob_z()
        log_prob_X_given_z = self.acoustic_model.log_prob_X_given_z()
        return log_prob_z + log_prob_X_given_z

    # @profile
    def log_marg_i_embed_unigram(self, i_embed):
        """
        Return the unigram log marginal of the i'th data vector: p(x_i).

        The scaled unigram language model log probabilities are combined with
        the acoustic scores of `i_embed` and summed out over all components.
        """
        assert i_embed != -1

        components = self.acoustic_model.components

        # Log probability of `X[i]` belonging to each component;
        # (24.26) in Murphy, p. 843
        component_scores = self.lms * self.lm.log_prob_vec_i()

        # Active components get the posterior predictive, (24.23) in Murphy,
        # p. 842; the remaining empty (unactive) ones get the prior
        n_active = components.K
        component_scores[:n_active] += components.log_post_pred(i_embed)
        component_scores[n_active:] += components.log_prior(i_embed)

        return _cython_utils.logsumexp(component_scores)

    # @profile
    def gibbs_sample_inside_loop_i_embed(self, i_embed, j_prev_assignment=None, anneal_temp=1, i_utt=None):
        """
        Perform the inside loop of Gibbs sampling for data vector `i_embed`.

        A component is drawn from the (optionally annealed) posterior over
        components, `i_embed` is added to that component of the acoustic
        model, and the component index is returned. `j_prev_assignment` is the
        component of the preceding token (None for the first token, in which
        case the unigram probabilities are used). `i_utt` is only used to
        decide whether the debug trace is written.
        """
        components = self.acoustic_model.components
        trace = i_utt is not None and i_utt == i_debug_monitor

        # Language model term: log probability of each component; this is the
        # bigram version of (24.26) in Murphy, p. 843.
        if j_prev_assignment is None:
            log_prob_z = self.lm.log_prob_vec_i()
        else:
            log_prob_z = np.log(self.lm.prob_vec_given_j(j_prev_assignment))

        # Scale with language model scaling factor
        log_prob_z *= self.lms
        if trace:
            logger.debug("lms * log(P(z=i|z_prev=j)): " + str(log_prob_z))
            logger.debug("log(p(x|z=i)): " + str(components.log_post_pred(i_embed)))

        # Acoustic term, bigram version of (24.23) in Murphy, p. 842: active
        # components use the posterior predictive, empty (unactive) ones the
        # prior
        n_active = components.K
        log_prob_z[:n_active] += components.log_post_pred(i_embed)
        log_prob_z[n_active:] += components.log_prior(i_embed)

        # Normalise, annealing the normalised distribution when requested
        if anneal_temp == 1:
            prob_z = np.exp(log_prob_z - _cython_utils.logsumexp(log_prob_z))
        else:
            log_prob_z = log_prob_z - _cython_utils.logsumexp(log_prob_z)
            tempered = 1./anneal_temp * log_prob_z
            prob_z = np.exp(tempered - _cython_utils.logsumexp(tempered))
        assert not np.isnan(np.sum(prob_z))

        if trace:
            logger.debug("P(z=i|x): " + str(prob_z))

        # Sample the new component assignment for `X[i]`
        k = utils.draw(prob_z)

        # There could be several empty, unactive components at the end; all of
        # them map to the first one
        first_empty = components.K
        if k > first_empty:
            k = first_empty

        if trace:
            logger.debug("Adding item " + str(i_embed) + " to acoustic model component " + str(k))
        components.add_item(i_embed, k)

        return k

    def _log_segmentation_debug_stats(self, i):
        """Write the per-segment statistics of utterance `i` to the debug log."""
        if self.fb_type == "unigram":
            log_margs = [
                self.log_marg_i_embed_unigram(j) for j in
                self.utterances.get_segmented_embeds_i(i) if j != -1
                ]
        else:
            assert False, "to-do"
        lengths = self.utterances.get_segmented_durations_i(i)
        logger.debug("Embeddings: " + str(self.utterances.get_segmented_embeds_i(i)))
        logger.debug("Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i)))
        logger.debug("Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i)))
        logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i)))
        logger.debug("log_margs: " + str(log_margs))
        logger.debug("sum(log_margs*lengths): " + str(np.sum(log_margs*np.array(lengths))))
        logger.debug("log p(X): " + str(self.log_marg()))

    def gibbs_sample_i(self, i, anneal_temp=1, anneal_gibbs_am=False,
            assignments_only=False):
        """
        Block Gibbs sample new boundaries and embedding assignments for
        utterance `i`.

        The utterance is first removed from the language model counts and the
        acoustic model. Unless `assignments_only` is set, new boundaries are
        then drawn with the forward-backward function. Finally every embedding
        of the (new) segmentation is assigned to a component and the language
        model counts are restored from the new transcript.

        Parameters
        ----------
        i : int
            Index of the utterance.
        anneal_temp : float
            Temperature passed to the forward-backward function; it is also
            used for the component assignments when `anneal_gibbs_am` is set.
        anneal_gibbs_am : bool
            If False, the component assignments are sampled at temperature 1.
        assignments_only : bool
            If True, the segmentation is kept and only the component
            assignments are resampled.

        Return
        ------
        log_prob : float
            The value returned by the forward-backward function, or 0. when
            `assignments_only` is set.
        """
        monitor = i == i_debug_monitor

        # Debug trace
        logger.debug("Gibbs sampling utterance: " + str(i))
        if monitor:
            logger.debug("-"*39)
            logger.debug("log p(X) before sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript before sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts before sampling: " + str(self.lm.unigram_counts))
            logger.debug("Bigram counts before sampling: " + str(self.lm.bigram_counts))

        # Remove counts from the `lm`
        self.lm.remove_counts_from_utterance(self.get_unsup_transcript_i(i))

        # Remove embeddings from utterance `i` from the `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)

        # Sample segmentation
        if not assignments_only:

            # Get the log probabilities of the embeddings. Floor division
            # keeps the slice bound an integer.
            N = self.utterances.lengths[i]
            n_embeds = (N**2 + N)//2
            vec_embed_log_probs = self.get_vec_embed_log_probs(
                self.utterances.vec_ids[i, :n_embeds],
                self.utterances.durations[i, :n_embeds]
                )

            # Debug trace
            if monitor:
                logger.debug("Statistics before sampling, but after removing, is given below")
                self._log_segmentation_debug_stats(i)

            # Draw new boundaries for utterance `i`
            log_p_continue = math.log(self.calc_p_continue())
            log_prob, self.utterances.boundaries[i, :N] = self.fb_func(
                vec_embed_log_probs, log_p_continue, N, self.n_slices_min, self.n_slices_max, i, anneal_temp
                )

            # Debug trace
            if monitor:
                logger.debug("Statistics after sampling, but before adding new embeddings to `acoustic_model`")
                self._log_segmentation_debug_stats(i)

        # Assign new embeddings to components in `acoustic_model`; the
        # assignments are only annealed when explicitly requested
        if monitor:
            logger.debug("Sampling component assignments")
        am_anneal_temp = anneal_temp if anneal_gibbs_am else 1
        j_prev_assignment = None
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # This only happens because of backtracking in the forward-backward functions
                continue  # don't assign a non-embedding (accidently the last embedding)
            j_prev_assignment = self.gibbs_sample_inside_loop_i_embed(
                i_embed, j_prev_assignment, anneal_temp=am_anneal_temp, i_utt=i
                )

        # Update `lm` counts with the newly sampled transcript
        self.lm.counts_from_utterance(self.get_unsup_transcript_i(i))

        # Debug trace
        if monitor:
            logger.debug("log p(X) after sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript after sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts after sampling: " + str(self.lm.unigram_counts))
            logger.debug("Bigram counts after sampling: " + str(self.lm.bigram_counts))
            logger.debug("-"*39)

        if assignments_only:
            # Segmentation is not performed, so there is no forward-backward
            # value to report
            return 0.
        return log_prob

    def gibbs_sample(self, n_iter, am_n_iter=0, anneal_schedule=None,
            anneal_start_temp_inv=0.1, anneal_end_temp_inv=1,
            n_anneal_steps=-1, anneal_gibbs_am=False, assignments_only=False):
        """
        Perform blocked Gibbs sampling on all utterances.

        Parameters
        ----------
        n_iter : int
            Number of Gibbs sampling iterations of segmentation.
        am_n_iter : int
            Number of acoustic model Gibbs sampling iterations inbetween
            segmentation sampling iterations. Values above 0 are not supported
            yet and fail an assertion.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant temperature of `anneal_end_temp_inv` is used
              throughout; if `anneal_end_temp_inv` is left at default (1), then
              this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).
        anneal_start_temp_inv, anneal_end_temp_inv : float
            Inverse temperatures at the start and the end of the schedule.
            NOTE(review): the scheduled values are temperatures (1/inverse
            temperature), but once the schedule is exhausted, or when no
            schedule is given, `anneal_end_temp_inv` itself is used as the
            temperature. The two agree only for the default value of 1;
            confirm that this is intended.
        n_anneal_steps : int
            Number of steps of the schedule; see `anneal_schedule`.
        anneal_gibbs_am : bool
            Whether the component assignments are annealed as well.
        assignments_only : bool
            Whether only component assignments should be sampled, or whether
            both component assignment and segmentation should be performed.

        Return
        ------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.

        Raises
        ------
        ValueError
            If `anneal_schedule` is not one of the supported values.
        """

        logger.info("Gibbs sampling for " + str(n_iter) + " iterations")
        logger.debug(
            "Monitoring utterance " + self.ids_to_utterance_labels[i_debug_monitor]
            + " (index=" + str(i_debug_monitor) + ")"
            )

        # Setup annealing iterator; it yields temperatures
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1./np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert not n_anneal_steps == -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
                )
            n_iter_per_step = int(round(float(n_iter)/n_anneal_steps))
            anneal_list = np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1./anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)
        else:
            # Fail here instead of with an unbound variable inside the loop
            raise ValueError("invalid `anneal_schedule`: " + str(anneal_schedule))

        # Setup record dictionary
        record_keys = (
            "sample_time", "log_marg", "log_marg*length", "log_prob_z",
            "log_prob_X_given_z", "anneal_temp", "components", "n_tokens"
            )
        record_dict = {key: [] for key in record_keys}

        # Loop over sampling iterations
        for i_iter in xrange(n_iter):

            start_time = time.time()

            # Perform intermediate acoustic model re-sampling
            if am_n_iter > 0:
                assert False, "to-do"
                self.acoustic_model.gibbs_sample(
                    am_n_iter, consider_unassigned=False
                    )

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, anneal_end_temp_inv)

            # Loop over utterances in random order; `list(...)` is needed
            # because a Python 3 `range` object cannot be shuffled in place
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if debug_gibbs_only:
                utt_order = [i_debug_monitor]
            log_prob = 0
            for i_utt in utt_order:
                log_prob += self.gibbs_sample_i(i_utt, anneal_temp, anneal_gibbs_am, assignments_only)

            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["log_marg"].append(self.log_marg())
            record_dict["log_marg*length"].append(log_prob)
            record_dict["log_prob_z"].append(self.log_prob_z())
            record_dict["log_prob_X_given_z"].append(self.acoustic_model.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(self.acoustic_model.get_n_assigned())

            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

            logger.debug("Unigram counts after inference: " + str(self.lm.unigram_counts))
            logger.debug("Bigram counts after inference: " + str(self.lm.bigram_counts))

        return record_dict

    # @profile
    def get_vec_embed_log_probs_unigram(self, vec_ids, durations):
        """
        Compute duration-scaled unigram log marginals for the given embeddings.

        Parameters
        ----------
        vec_ids : sequence of int
            Embedding IDs; an entry of -1 is skipped and keeps a score of
            minus infinity.
        durations : sequence of float
            Indexed in parallel with `vec_ids`. A NaN duration also results in
            a score of minus infinity.

        Return
        ------
        log_probs : vector of float
            For every valid entry, the unigram log marginal multiplied by
            `duration**self.time_power_term`; `self.wip` is added to every
            entry of the vector.
        """
        scaled_log_probs = np.full(len(vec_ids), -np.inf)

        for pos, embed_id in enumerate(vec_ids):
            if embed_id != -1:
                log_marg = self.log_marg_i_embed_unigram(embed_id)
                n_frames = durations[pos]
                if not np.isnan(n_frames):
                    # Scale the log marginal by the (power-adjusted) duration
                    scaled_log_probs[pos] = log_marg
                    scaled_log_probs[pos] *= n_frames**self.time_power_term

        return scaled_log_probs + self.wip

    def get_vec_embed_log_probs_bigram(self, vec_ids, durations):
        """Placeholder: ignores its arguments, does nothing and returns None."""
        return None

    def calc_p_continue(self):
        """
        Return the probability of continuing, i.e. of no utterance break.

        When `beta_sent_boundary` is -1, sentence boundaries are not taken
        into account and 1.0 is returned. The other branch is still marked as
        unchecked and fails on an assertion; the estimate that follows it
        uses one less than the total number of utterances, since the current
        utterance is excluded from the calculation.
        """
        beta = self.beta_sent_boundary
        if beta == -1:
            return 1.0

        assert False, "to check"
        n_tokens = sum(self.acoustic_model.components.counts)  # number of assigned tokens
        n_sentences = self.utterances.D - 1
        n_continue = n_tokens - n_sentences
        return (n_continue + beta / 2.0) / (n_tokens + beta)

    def get_unsup_transcript_i(self, i):
        """
        Return, as a list, the component assignments of the embeddings in the
        current segmentation of utterance `i`.
        """
        get_assignments = self.acoustic_model.components.get_assignments
        segmented_embeds = self.utterances.get_segmented_embeds_i(i)
        return list(get_assignments(segmented_embeds))
    def __init__(self, am_K, am_param_prior, lm_params,
            embedding_mats, vec_ids_dict, durations_dict, landmarks_dict,
            seed_boundaries_dict=None, seed_assignments_dict=None,
            covariance_type="fixed", n_slices_min=0,
            n_slices_max=20, min_duration=0, p_boundary_init=0.5,
            beta_sent_boundary=2.0, lms=1., wip=0., fb_type="bigram",
            init_am_assignments="rand",
            time_power_term=1.):
        """
        Set up the utterances, the language model and the acoustic model.

        The steps are, in order: store the simple parameters, flatten the
        embeddings with `process_embeddings`, build `self.utterances`, create
        the language model (only for `lm_params["type"] == "smooth"`), pick
        the initial acoustic model assignments and build
        `self.acoustic_model`, and finally call `self.set_lm_counts()`.

        Initial assignments come from `seed_assignments_dict` when it is
        given (this requires `seed_boundaries_dict`); otherwise
        `init_am_assignments` selects the strategy. Only "rand" is available
        here: "one-by-one" and any other value fail on an assertion.
        """
        logger.info("Initializing")

        # Seed assignments can only be used together with seed boundaries
        assert seed_boundaries_dict is not None or seed_assignments_dict is None

        # Plain attributes taken directly from the parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.lms = lms
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # One matrix with all embeddings; `vec_ids` has an entry per utterance
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Gather the per-utterance information in utterance-ID order
        seed_boundaries = None
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[label] for label in ids_to_utterance_labels]
        landmarks = [landmarks_dict[label] for label in ids_to_utterance_labels]
        lengths = [len(utt_landmarks) for utt_landmarks in landmarks]
        durations = [durations_dict[label] for label in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks,
            seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init,
            n_slices_min=n_slices_min,
            n_slices_max=n_slices_max,
            min_duration=min_duration,
            )

        # Embeddings used by the initial segmentation (-1 marks a non-embedding)
        init_embeds = np.array(
            [
                embed
                for i_utt in range(self.utterances.D)
                for embed in self.utterances.get_segmented_embeds_i(i_utt)
            ],
            dtype=int,
            )
        init_embeds = init_embeds[init_embeds != -1]

        # Language model; note that it receives `am_K` as passed in, before
        # any seed-based override further down
        if lm_params["type"] == "smooth":
            self.lm = BigramSmoothLM(
                lm_params["intrp_lambda"], lm_params["a"], lm_params["b"], am_K
                )

        def build_acoustic_model():
            # Evaluated lazily so that the final `am_K` and `assignments` are used
            return BigramFBGMM(
                embeddings, am_param_prior, am_K, assignments,
                covariance_type=covariance_type, lms=lms, lm=self.lm
                )

        # Every embedding starts out unassigned
        assignments = np.full(N, -1, dtype=int)

        if seed_assignments_dict is not None:

            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_embeds = np.array(self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                keep = np.where(utt_embeds != -1)
                utt_seeds = np.array(seed_assignments_dict[utt][:])[keep]
                utt_embeds = utt_embeds[keep]

                # Integer seeds map to themselves; other seeds get the next free cluster
                for seed in utt_seeds:
                    if seed in self.seed_to_cluster:
                        continue
                    if isinstance(seed, (int, long)):
                        self.seed_to_cluster[seed] = seed
                    else:
                        self.seed_to_cluster[seed] = i_cluster
                        i_cluster += 1
                assignments[utt_embeds] = [self.seed_to_cluster[seed] for seed in utt_seeds]

            n_seed_clusters = max(self.seed_to_cluster.values()) + 1
            if am_K is None:
                am_K = n_seed_clusters
            else:
                assert am_K >= n_seed_clusters

            self.acoustic_model = build_acoustic_model()

        elif init_am_assignments == "rand":

            # Draw a random cluster in [0, am_K) for every initial embedding
            logger.info("Using random initial component assignments")
            rand_assignments = np.random.randint(0, am_K, len(init_embeds))

            # Close gaps so that the cluster indices in use are consecutive
            for k in xrange(rand_assignments.max()):
                while np.nonzero(rand_assignments == k)[0].size == 0:
                    rand_assignments[rand_assignments > k] -= 1
                if rand_assignments.max() == k:
                    break
            assignments[init_embeds] = rand_assignments

            self.acoustic_model = build_acoustic_model()

        elif init_am_assignments == "one-by-one":
            # One-by-one initialization is not available for this model
            assert False

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

        # Setup initial language model counts
        self.set_lm_counts()
Пример #6
0
class BigBrain:
    """
    Intent classifier built around a fastText supervised model.

    Utterances are collected through `train`, which schedules a delayed
    (re)training run on a `Timer`. Each trained model is written to disk
    under a random file name and recorded in ``meta.json``; at most
    `N_BACKUP` model files are kept, oldest first out.
    """

    def __init__(self):
        self.utterances = Utterances()
        self.ongoing_training = False
        self.schedulued_training = False
        self.repeat_training = False
        # Most recent saved model, or None when nothing has been trained yet
        self.model = self.load()
        self.training_stack = []

    def schedulue_training(self):
        """
        Schedule a training run `TRAIN_OFFSET` seconds from now.

        Returns True when a new timer was started. If a run is already
        scheduled, only flags that training must be repeated afterwards and
        returns False.
        """
        if self.schedulued_training:
            self.repeat_training = True
            return False
        self.schedulued_training = True
        t = Timer(TRAIN_OFFSET, self.gym)
        t.start()
        return True

    def process_utterance_text(self, text):
        """Return `text` transliterated with `unidecode` and lower-cased."""
        return unidecode.unidecode(text).lower()

    def predict(self, utterance):
        """
        Classify `utterance` and return an ``(intent, confidence)`` pair.

        ``(None, 0)`` is returned when no model is loaded or when the model
        response holds no usable label.
        """
        if self.model is None:
            print("Model is not initialized yet")
            return None, 0
        processed = self.process_utterance_text(utterance)
        response = self.model.predict(processed)
        # Guard against a response without labels before indexing into it
        if len(response) < 2 or len(response[0]) == 0:
            return None, 0
        confidence = response[1][0]
        intent = label_to_intent(response[0][0])
        return (intent, confidence)

    def gym(self):
        """
        Train a new model on all stored utterances and save it.

        If a run is already in progress the request is handed back to
        `schedulue_training`. The state flags are reset even when training
        raises, so a failed run cannot block every later one.
        """
        if self.ongoing_training:
            # reschedule training
            self.schedulue_training()
            return

        self.ongoing_training = True
        try:
            train_path, _ = self.utterances.generate_train_file(eval_count=0)
            MODEL["input"] = train_path
            MODEL["loss"] = "hs"
            self.model = fasttext.train_supervised(**MODEL)
        finally:
            self.schedulued_training = False
            self.ongoing_training = False

        self.save()

        if self.repeat_training:
            self.repeat_training = False
            self.gym()

    def path(self, name):
        """Return `name` prefixed with `DATA_PATH`."""
        return DATA_PATH + name

    def create_metadata(self):
        """Write a fresh ``meta.json`` with an empty model list and return it."""
        save = {"models": []}
        with open(self.path("meta.json"), 'w', encoding="utf-8") as f:
            json.dump(save, f)
        return save

    def load_metadata(self):
        """
        Return the parsed content of ``meta.json``.

        A missing file is created with an empty model list, so that a first
        run without any saved state does not fail.
        """
        try:
            with open(self.path("meta.json"), encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return self.create_metadata()

    def push_model(self, filename):
        """
        Record `filename` as the newest model in ``meta.json``.

        The oldest model files are deleted until fewer than `N_BACKUP`
        remain, which makes room for the new entry.
        """
        meta = self.load_metadata()
        models = meta["models"]
        while models and len(models) >= N_BACKUP:
            oldest = models.pop(0)
            try:
                os.remove(self.path(oldest))
            except FileNotFoundError:
                # Already gone from disk; dropping the metadata entry is
                # all that is left to do.
                pass
        models.append(filename)
        with open(self.path("meta.json"), 'w', encoding="utf-8") as f:
            json.dump(meta, f)

    def get_intents(self):
        """Return the intents known to the model (empty without a model)."""
        if self.model is None:
            return []
        return [label_to_intent(label) for label in self.model.get_labels()]

    def scoop_model_params(self):
        """Return a dict with the training parameters read from the model."""
        train_parameters = [
            'lr', 'dim', 'ws', 'epoch', 'minCount', 'minCountLabel', 'minn',
            'maxn', 'neg', 'wordNgrams', 'bucket', 'lrUpdateRate', 't'
        ]

        args_getter = self.model.f.getArgs()

        parameters = {}
        for param in train_parameters:
            attr = getattr(args_getter, param)
            # 'loss' is not in the list above, so this conversion is
            # currently never applied.
            if param == 'loss':
                attr = attr.name
            parameters[param] = attr

        return parameters

    def print_prop(self):
        """Print the non-dunder attribute names of the model's args object."""
        args = self.model.f.getArgs()
        keys2 = [a for a in dir(args) if not a.startswith('__')]
        print(keys2)

    def get_models(self):
        """Return the list of saved model file names, oldest first."""
        return self.load_metadata()["models"]

    def save(self):
        """Save the current model under a random name and register it."""
        filename = "model-" + random_string(8) + ".bin"
        self.model.save_model(self.path(filename))
        self.push_model(filename)

    def meta_train(self):
        """Train with fastText autotuning on an evaluation split and save."""
        train_path, eval_path = self.utterances.generate_train_file(
            eval_count=EVAL_COUNT)
        self.model = fasttext.train_supervised(
            input=train_path,
            autotuneValidationFile=eval_path,
            autotuneModelSize="{}M".format(MAX_MODEL_SIZE))
        self.save()

    def load(self):
        """Load and return the newest saved model, or None if there is none."""
        models = self.get_models()
        if not models:
            return None
        return fasttext.load_model(self.path(models[-1]))

    def check_intent(self, intent):
        """
        Return True if `intent` consists only of ``a-z``, ``0-9`` and ``_``.

        The whole string must match; in particular a trailing newline is
        rejected (``$`` with `re.search` would have accepted it).
        """
        return re.fullmatch(r"[a-z0-9_]+", intent) is not None

    def train(self, utterance):
        """
        Normalize and store `utterance`, then make sure training is scheduled.

        `utterance` is a dict with at least the keys ``"utterance"`` and
        ``"intent"``; it is modified in place (an ``"index"`` is added, text
        and intent are normalized). Returns ``(TRAIN.BAD_INTENT,
        TRAIN.NO_TRAIN)`` for an invalid intent name and ``(TRAIN.TRAIN_OK,
        TRAIN_OFFSET)`` otherwise.
        """
        utterance["index"] = random_string(INDEX_LENGTH)
        utterance["utterance"] = self.process_utterance_text(
            utterance["utterance"])
        print(utterance)
        intent = utterance["intent"].lower()
        if not self.check_intent(intent):
            return (TRAIN.BAD_INTENT, TRAIN.NO_TRAIN)
        utterance["intent"] = intent
        self.utterances.save_utterance(utterance)
        if not self.schedulued_training:
            self.schedulue_training()
        return (TRAIN.TRAIN_OK, TRAIN_OFFSET)

    def just_train(self):
        """Run an autotuned training immediately (see `meta_train`)."""
        self.meta_train()
Пример #7
0
class UnigramAcousticWordseg(object):
    """
    Unigram word segmentation of speech using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. Blocked
    Gibbs sampling is used for inference. In the member functions, the index
    `i` generally refers to the index of an utterance.

    Parameters
    ----------
    am_class : e.g. `FBGMM`
    am_alpha : float
        Acoustic model parameter.
    am_K : int
        Acoustic model parameter.
    am_param_prior : e.g. instance of `FixedVarPrior`
        The acoustic model prior on the mean and covariance parameters.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    seed_boundaries_dict : dict of list of tuple
        Every tuple is the start (inclusive) and end (exclusive) embedding
        slice index of a seed token, giving its boundaries. If not given, no
        seeding is used.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used.
    seed_boundaries_dict : dict of list of int
        For every utterance, seed boundaries in 10 ms units (same format as
        `landmarks_dict`). If not given, no seeding is used.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    beta_sent_boundary : float
        The symmetric Beta prior on the end of sentence probability; if this is
        set to -1, sentence boundary probabilities are not taken into account.
    lms : float
        Language model scaling factor.
    wip : float
        Word insertion penalty.
    fb_type : str
        The type of forward-backward algorithm to use:
        - "standard": The normal forward filtering backward sampling algorithm.
        - "viterbi": The Viterbi version of the forward backward algorithm,
          using MAP assignments instead of sampling segmentation of embedding
          component assignments.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model.
    time_power_term : float
        Scaling the per-frame scaling; with 1.2 instead of 1, we get less words
        (prefer longer words).

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : FBGMM or IGMM
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """
    def __init__(self,
                 am_class,
                 am_alpha,
                 am_K,
                 am_param_prior,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 seed_boundaries_dict=None,
                 seed_assignments_dict=None,
                 covariance_type="fixed",
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 beta_sent_boundary=2.0,
                 lms=1.,
                 wip=0.,
                 fb_type="standard",
                 init_am_assignments="rand",
                 time_power_term=1.):
        """
        Set up `utterances` and the initial `acoustic_model`.

        The steps are, in order: store the simple parameters, flatten the
        embeddings with `process_embeddings`, build `self.utterances`, choose
        the initial acoustic model assignments and construct the acoustic
        model with `am_class`.

        Initial assignments come from `seed_assignments_dict` when it is
        given (this requires `seed_boundaries_dict`); otherwise
        `init_am_assignments` selects "rand" or "one-by-one". Any other value
        fails on an assertion.
        """
        logger.info("Initializing")

        # Seed assignments can only be used together with seed boundaries
        assert seed_boundaries_dict is not None or seed_assignments_dict is None

        # Plain attributes taken directly from the parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # One matrix with all embeddings; `vec_ids` has an entry per utterance
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Gather the per-utterance information in utterance-ID order
        seed_boundaries = None
        if seed_boundaries_dict is not None:
            seed_boundaries = [
                seed_boundaries_dict[label]
                for label in ids_to_utterance_labels
            ]
        landmarks = [landmarks_dict[label] for label in ids_to_utterance_labels]
        lengths = [len(utt_landmarks) for utt_landmarks in landmarks]
        durations = [durations_dict[label] for label in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths,
            vec_ids,
            durations,
            landmarks,
            seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init,
            n_slices_min=n_slices_min,
            n_slices_max=n_slices_max,
            min_duration=min_duration,
        )

        # Embeddings used by the initial segmentation (-1 marks a non-embedding)
        init_embeds = np.array(
            [
                embed
                for i_utt in range(self.utterances.D)
                for embed in self.utterances.get_segmented_embeds_i(i_utt)
            ],
            dtype=int,
        )
        init_embeds = init_embeds[init_embeds != -1]

        def build_acoustic_model():
            # Evaluated lazily so that the final `am_K` and `assignments` are used
            return am_class(embeddings,
                            am_param_prior,
                            am_alpha,
                            am_K,
                            assignments,
                            covariance_type=covariance_type,
                            lms=lms)

        # Every embedding starts out unassigned
        assignments = np.full(N, -1, dtype=int)

        if seed_assignments_dict is not None:
            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_embeds = np.array(
                    self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                keep = np.where(utt_embeds != -1)
                utt_seeds = np.array(seed_assignments_dict[utt][:])[keep]
                utt_embeds = utt_embeds[keep]

                # Integer seeds map to themselves; other seeds get the next
                # free cluster index
                for seed in utt_seeds:
                    if seed in self.seed_to_cluster:
                        continue
                    if isinstance(seed, (int, long)):
                        self.seed_to_cluster[seed] = seed
                    else:
                        self.seed_to_cluster[seed] = i_cluster
                        i_cluster += 1
                assignments[utt_embeds] = [
                    self.seed_to_cluster[seed] for seed in utt_seeds
                ]

            n_seed_clusters = max(self.seed_to_cluster.values()) + 1
            if am_K is None:
                am_K = n_seed_clusters
            else:
                assert am_K >= n_seed_clusters

            self.acoustic_model = build_acoustic_model()

        elif init_am_assignments == "rand":
            # Draw a random cluster in [0, am_K) for every initial embedding
            logger.info("Using random initial component assignments")
            rand_assignments = np.random.randint(0, am_K, len(init_embeds))

            # Close gaps so that the cluster indices in use are consecutive
            for k in xrange(rand_assignments.max()):
                while np.nonzero(rand_assignments == k)[0].size == 0:
                    rand_assignments[rand_assignments > k] -= 1
                if rand_assignments.max() == k:
                    break
            assignments[init_embeds] = rand_assignments

            self.acoustic_model = build_acoustic_model()

        elif init_am_assignments == "one-by-one":
            # Start from a model without assignments and let it place the
            # initial embeddings one at a time
            logger.info("Using a one-by-one initial assignment")
            self.acoustic_model = build_acoustic_model()
            for i_embed in init_embeds:
                self.acoustic_model.gibbs_sample_inside_loop_i(i_embed)

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

    def set_fb_type(self, fb_type):
        """
        Store `fb_type` and select the matching forward-backward function.

        "standard" selects `forward_backward` and "viterbi" selects
        `forward_backward_viterbi`; any other value fails on an assertion
        (after `self.fb_type` has already been set).
        """
        self.fb_type = fb_type

        if fb_type == "standard":
            self.fb_func = forward_backward
            return
        if fb_type == "viterbi":
            self.fb_func = forward_backward_viterbi
            return
        assert False, "invalid `fb_type`: " + fb_type

    def _log_debug_stats_i(self, i, header):
        """
        Log, at debug level, the segmentation and acoustic model statistics of
        utterance `i`, preceded by the line `header`.
        """
        logger.debug(header)
        embeds = self.utterances.get_segmented_embeds_i(i)
        log_margs = [
            self.acoustic_model.log_marg_i(j) for j in embeds if j != -1
        ]
        lengths = self.utterances.get_segmented_durations_i(i)
        logger.debug("Embeddings: " + str(embeds))
        logger.debug(
            "Utterance embeddings: " +
            str(self.utterances.get_original_segmented_embeds_i(i)))
        logger.debug(
            "Landmark indices: " +
            str(self.utterances.get_segmented_landmark_indices(i)))
        logger.debug("Durations: " + str(lengths))
        logger.debug("log_margs: " + str(log_margs))
        logger.debug("sum(log_margs*lengths): " +
                     str(np.sum(log_margs * np.array(lengths))))
        logger.debug("log p(X): " + str(self.acoustic_model.log_marg()))

    def gibbs_sample_i(self, i, anneal_temp=1, anneal_gibbs_am=False):
        """
        Block Gibbs sample new boundaries and embedding assignments for
        utterance `i`.

        The embeddings of the current segmentation are first removed from the
        acoustic model, new boundaries are then drawn with `self.fb_func`, and
        the embeddings of the new segmentation are finally assigned to
        components again.

        Parameters
        ----------
        i : int
            Index of the utterance.
        anneal_temp : float
            Passed on to the forward-backward function; also used for the
            acoustic model sampling when `anneal_gibbs_am` is set.
        anneal_gibbs_am : bool
            If False, the acoustic model is sampled with `anneal_temp=1`.

        Return
        ------
        log_prob : float
            The first value returned by the forward-backward function.
        """
        is_monitored = (i == i_debug_monitor)

        # Debug trace
        logger.debug("Gibbs sampling utterance: " + str(i))
        if is_monitored:
            logger.debug("-" * 39)
            logger.debug("log p(X) before sampling: " +
                         str(self.acoustic_model.log_marg()))
            logger.debug("Unsupervised transcript before sampling: " +
                         str(self.get_unsup_transcript_i(i)))

        # Remove embeddings from utterance `i` from the `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidentally remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)

        # Get the log probabilities of the embeddings. Floor division keeps
        # the slice bound an integer regardless of the division semantics in
        # effect; N**2 + N is always even, so the value is exact.
        N = self.utterances.lengths[i]
        n_embeds = (N**2 + N) // 2
        vec_embed_log_probs = self.get_vec_embed_log_probs(
            self.utterances.vec_ids[i, :n_embeds],
            self.utterances.durations[i, :n_embeds])

        # Debug trace
        if is_monitored:
            self._log_debug_stats_i(
                i,
                "Statistics before sampling, but after removing, is given below"
            )

        # Draw new boundaries for utterance `i`
        log_p_continue = math.log(self.calc_p_continue())
        log_prob, self.utterances.boundaries[i, :N] = self.fb_func(
            vec_embed_log_probs, log_p_continue, N, self.n_slices_min,
            self.n_slices_max, i, anneal_temp)

        # Debug trace
        if is_monitored:
            self._log_debug_stats_i(
                i,
                "Statistics after sampling, but before adding new embeddings to `acoustic_model`"
            )

        # Assign new embeddings to components in `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # This only happens because of backtracking in the forward-backward functions
                continue  # don't assign a non-embedding (accidentally the last embedding)
            if self.fb_type == "standard":
                if anneal_gibbs_am:
                    self.acoustic_model.gibbs_sample_inside_loop_i(
                        i_embed, anneal_temp)
                else:
                    self.acoustic_model.gibbs_sample_inside_loop_i(
                        i_embed, anneal_temp=1)
            elif self.fb_type == "viterbi":
                self.acoustic_model.map_assign_i(i_embed)

        # Debug trace
        if is_monitored:
            logger.debug("log p(X) after sampling: " +
                         str(self.acoustic_model.log_marg()))
            logger.debug("Unsupervised transcript after sampling: " +
                         str(self.get_unsup_transcript_i(i)))
            logger.debug("-" * 39)

        return log_prob

    def gibbs_sample(self,
                     n_iter,
                     am_n_iter=0,
                     anneal_schedule=None,
                     anneal_start_temp_inv=0.1,
                     anneal_end_temp_inv=1,
                     n_anneal_steps=-1,
                     anneal_gibbs_am=False):
        """
        Perform blocked Gibbs sampling on all utterances.

        Parameters
        ----------
        n_iter : int
            Number of Gibbs sampling iterations of segmentation.
        am_n_iter : int
            Number of acoustic model Gibbs sampling iterations inbetween
            segmentation sampling iterations.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant inverse temperature of `anneal_end_temp_inv`
              is used throughout; if `anneal_end_temp_inv` is left at default
              (1), then this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).
        anneal_start_temp_inv : float
            Inverse temperature at the start of the schedule.
        anneal_end_temp_inv : float
            Inverse temperature at the end of the schedule; also used for
            every iteration that falls after the schedule has run out.
        n_anneal_steps : int
            Number of schedule steps (see `anneal_schedule`).
        anneal_gibbs_am : bool
            Passed on unchanged to `gibbs_sample_i` for every utterance.

        Return
        ------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.

        Raises
        ------
        ValueError
            If `anneal_schedule` is not None, "linear" or "step".
        """

        logger.info("Gibbs sampling for " + str(n_iter) + " iterations")
        logger.debug("Monitoring utterance " +
                     self.ids_to_utterance_labels[i_debug_monitor] +
                     " (index=" + str(i_debug_monitor) + ")")

        # Setup annealing iterator; every value it yields is a temperature,
        # i.e. the reciprocal of an inverse temperature.
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1. / np.linspace(anneal_start_temp_inv,
                                           anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert not n_anneal_steps == -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
            )
            n_iter_per_step = int(round(float(n_iter) / n_anneal_steps))
            anneal_list = np.linspace(anneal_start_temp_inv,
                                      anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1. / anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)
        else:
            raise ValueError("invalid value for `anneal_schedule`: " +
                             str(anneal_schedule))

        # Temperature used once the schedule is exhausted. It must be the
        # reciprocal of the end inverse temperature so that it is on the same
        # scale as the scheduled values above.
        final_anneal_temp = 1. / anneal_end_temp_inv

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        record_dict["log_marg"] = []
        record_dict["log_marg*length"] = []
        record_dict["log_prob_z"] = []
        record_dict["log_prob_X_given_z"] = []
        record_dict["anneal_temp"] = []
        record_dict["components"] = []
        record_dict["n_tokens"] = []

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Perform intermediate acoustic model re-sampling
            if am_n_iter > 0:
                self.acoustic_model.gibbs_sample(am_n_iter,
                                                 consider_unassigned=False)

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, final_anneal_temp)

            # Loop over utterances in random order; a real list is needed
            # because `random.shuffle` works in place.
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if debug_gibbs_only:
                utt_order = [i_debug_monitor]
            log_prob = 0
            for i_utt in utt_order:
                log_prob += self.gibbs_sample_i(i_utt, anneal_temp,
                                                anneal_gibbs_am)

            # Only the utterance sweep (plus the optional acoustic model
            # re-sampling) is timed; the statistics below are not.
            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["log_marg"].append(self.acoustic_model.log_marg())
            record_dict["log_marg*length"].append(log_prob)
            record_dict["log_prob_z"].append(self.acoustic_model.log_prob_z())
            record_dict["log_prob_X_given_z"].append(
                self.acoustic_model.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(
                self.acoustic_model.get_n_assigned())

            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

        return record_dict

    def get_vec_embed_log_probs(self, vec_ids, durations):
        """
        Score every candidate embedding in `vec_ids`.

        Each valid entry gets its log marginal under the acoustic model,
        multiplied by its duration raised to `self.time_power_term`. Entries
        with ID -1 or a NaN duration score `-inf`. The word insertion penalty
        `self.wip` is added to every entry before returning.
        """
        scores = np.ones(len(vec_ids)) * -np.inf

        for pos, embed_id in enumerate(vec_ids):
            if embed_id != -1:
                scores[pos] = self.acoustic_model.log_marg_i(embed_id)

                # Weight the log marginal by the (powered) segment duration;
                # a NaN duration rules the segment out altogether.
                if not np.isnan(durations[pos]):
                    scores[pos] *= durations[pos]**self.time_power_term
                else:
                    scores[pos] = -np.inf

        return scores + self.wip

    def calc_p_continue(self):
        """
        Return the probability of not having an utterance break.

        A `beta_sent_boundary` of -1 switches the estimate off, giving 1.0.
        For any other value the estimate would use one utterance fewer than
        the total, since the current utterance is excluded; that path is
        still guarded by an assertion because it has not been checked.
        """
        if self.beta_sent_boundary == -1:
            return 1.0

        assert False, "to check"
        # Number of assigned tokens over all components
        n_tokens = sum(self.acoustic_model.components.counts)
        n_continue = n_tokens - (self.utterances.D - 1)
        numerator = n_continue + self.beta_sent_boundary / 2.0
        denominator = n_tokens + self.beta_sent_boundary
        return numerator / denominator

    def get_unsup_transcript_i(self, i):
        """Return the component assignments, as a list, of the segments of `i`."""
        segment_embeds = self.utterances.get_segmented_embeds_i(i)
        components = self.acoustic_model.components
        return list(components.get_assignments(segment_embeds))

    def get_log_margs_i(self, i):
        """
        Get the log marginals for the current segmentation of utterance `i`.

        The segments from utterance `i` are removed from the acoustic model,
        scored, and then added back with their previous assignments. This
        function is used for monitoring and post-processing.

        Return
        ------
        log_margs : list
            One log marginal per segment, in segment order; -1 placeholder
            entries in the segmentation are left out.
        """

        # Remove embeddings from utterance `i` from the `acoustic_model`
        segmented_embeds = self.utterances.get_segmented_embeds_i(i)
        assignments = self.acoustic_model.components.get_assignments(
            segmented_embeds)
        for i_embed in segmented_embeds:
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)

        log_margs = [
            self.acoustic_model.log_marg_i(j) for j in segmented_embeds
            if j != -1
        ]

        # Add the embeddings back into the model. Placeholders were never
        # removed above, so they must not be added here either: index -1
        # would address the last embedding instead.
        for embed, assignment in zip(segmented_embeds, assignments):
            if embed == -1:
                continue
            self.acoustic_model.components.add_item(embed, assignment)

        return log_margs
Пример #8
0
    def __init__(self,
                 am_class,
                 am_alpha,
                 am_K,
                 am_param_prior,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 seed_boundaries_dict=None,
                 seed_assignments_dict=None,
                 covariance_type="fixed",
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 beta_sent_boundary=2.0,
                 lms=1.,
                 wip=0.,
                 fb_type="standard",
                 init_am_assignments="rand",
                 time_power_term=1.):
        """
        Set up the utterance bookkeeping and the acoustic model.

        Parameters
        ----------
        am_class : callable
            Called as `am_class(embeddings, am_param_prior, am_alpha, am_K,
            assignments, covariance_type=..., lms=...)` to build
            `self.acoustic_model`.
        am_alpha, am_param_prior, covariance_type, lms
            Forwarded unchanged to `am_class`.
        am_K : int or None
            Forwarded to `am_class`. With seed assignments it may be None, in
            which case it is set to the largest seed cluster index plus one;
            with "rand" initialisation it is the exclusive upper bound of the
            random component indices.
        embedding_mats, vec_ids_dict : dict
            Passed to `process_embeddings`.
        durations_dict, landmarks_dict : dict
            Looked up per utterance label and passed to `Utterances`; the
            length of an utterance is its number of landmarks.
        seed_boundaries_dict : dict, optional
            Per-utterance seed boundaries, passed to `Utterances`.
        seed_assignments_dict : dict, optional
            Per-utterance seed assignments; requires `seed_boundaries_dict`.
            When given, `init_am_assignments` is not consulted.
        n_slices_min, n_slices_max : int
            Stored on the instance and passed to `Utterances`.
        min_duration, p_boundary_init
            Passed to `Utterances`.
        beta_sent_boundary, wip, time_power_term
            Stored on the instance.
        fb_type : str
            Passed to `self.set_fb_type`.
        init_am_assignments : str
            "rand" or "one-by-one"; any other value fails an assertion.
        """

        logger.info("Initializing")

        # Check parameters: seed assignments are only meaningful together
        # with seed boundaries
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        # self.lms = lms
        self.wip = wip
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats,
            vec_ids_dict  #, n_slices_min=n_slices_min
        )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        # Number of rows in the embedding matrix
        N = embeddings.shape[0]

        # lengths = [
        #     int(-1 + np.sqrt(1 + 4 * 2 * i)) / 2 for i in
        #     [len(vec_ids_dict[j]) for j in ids_to_utterance_labels]
        #     ]

        # Initialize `utterances`; all per-utterance lists follow the order
        # of `ids_to_utterance_labels`
        if seed_boundaries_dict is not None:
            seed_boundaries = [
                seed_boundaries_dict[i] for i in ids_to_utterance_labels
            ]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     seed_boundaries=seed_boundaries,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Find all the embeddings that are in the initial segmentation
        # (-1 entries are placeholders and are dropped)
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]

        # Provide the initial acoustic model assignments and initialize the model accordingly
        # (-1 marks an embedding without a component)
        assignments = -1 * np.ones(N, dtype=int)
        if seed_assignments_dict is not None:
            # Use seed assignments if provided
            logger.info("Using seed assignments")
            # Maps every seed label to a cluster index
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_init_embeds = np.array(
                    self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                utt_init_assignments = np.array(seed_assignments_dict[utt][:])
                # Drop the seed labels and embeddings at placeholder positions
                utt_init_assignments = utt_init_assignments[np.where(
                    utt_init_embeds != -1)]
                utt_init_embeds = utt_init_embeds[np.where(
                    utt_init_embeds != -1)]
                for seed in utt_init_assignments:
                    if not seed in self.seed_to_cluster:
                        # Integer seeds are used directly as cluster indices;
                        # other seeds get the next free running index.
                        # NOTE(review): `long` exists only in Python 2.
                        if isinstance(seed, (int, long)):
                            self.seed_to_cluster[seed] = seed
                        else:
                            self.seed_to_cluster[seed] = i_cluster
                            i_cluster += 1
                utt_init_assignments = [
                    self.seed_to_cluster[i] for i in utt_init_assignments
                ]
                assignments[utt_init_embeds] = utt_init_assignments
            # The model must have room for the largest seed cluster index
            if am_K is None:
                am_K = max(self.seed_to_cluster.values()) + 1
            else:
                assert am_K >= max(self.seed_to_cluster.values()) + 1

            # Initialize `acoustic_model`
            self.acoustic_model = am_class(embeddings,
                                           am_param_prior,
                                           am_alpha,
                                           am_K,
                                           assignments,
                                           covariance_type=covariance_type,
                                           lms=lms)

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(
                0, am_K, len(init_embeds))
            # Make sure we have consecutive values: whenever index `k` is
            # unused, shift every larger index down by one until it is used.
            # NOTE(review): `xrange` exists only in Python 2.
            for k in xrange(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[np.where(
                        init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = am_class(embeddings,
                                           am_param_prior,
                                           am_alpha,
                                           am_K,
                                           assignments,
                                           covariance_type=covariance_type,
                                           lms=lms)

        elif init_am_assignments == "one-by-one":
            # Initialize `acoustic_model` with every embedding still at -1
            logger.info("Using a one-by-one initial assignment")
            self.acoustic_model = am_class(embeddings,
                                           am_param_prior,
                                           am_alpha,
                                           am_K,
                                           assignments,
                                           covariance_type=covariance_type,
                                           lms=lms)

            # Assign the embeddings one-by-one
            for i_embed in init_embeds:
                # print i_embed
                self.acoustic_model.gibbs_sample_inside_loop_i(i_embed)

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments
Пример #9
0
class ESKmeans(object):
    """
    Embedded segmental K-means.

    Segmentation and clustering are carried out using this class. Variables
    related to the segmentation are stored in the `utterances` attribute, which
    deals with all utterance-level information but knows nothing about the
    acoustics. The `kmeans` attribute deals with all the acoustic embedding
    operations. In member functions, index `i` generally refers to the index of
    an utterance.

    Parameters
    ----------
    K_max : int
        Maximum number of components.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).     READ!!!!
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    wip : float
        Word insertion penalty.
    p_boundary_init : float
        See `Utterances`.
    init_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined: "rand" assigns data vectors randomly; "each-in-own" assigns
        each data point to a component of its own; and "spread" makes an
        attempt to spread data vectors evenly over the components.

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.X`.
    acoustic_model : KMeans
        Knows nothing about utterance-level information. All embeddings are
        stored in this class in its `X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """
    def __init__(self,
                 K_max,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 init_assignments="rand",
                 wip=0):
        """
        Build the utterance bookkeeping and the initial K-means model.

        `embedding_mats` and `vec_ids_dict` go through `process_embeddings`;
        the landmark and duration dicts are read per utterance label and
        handed to `Utterances` together with the slicing, duration and
        boundary settings. `init_assignments` selects how the embeddings of
        the initial segmentation are spread over the `K_max` components
        ("rand" or "spread").
        """

        # Attributes from parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats,
            vec_ids_dict  #, n_slices_min=n_slices_min
        )

        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`; every list follows the order of
        # `ids_to_utterance_labels`
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]

        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Embeddings in the initial segmentation (-1 placeholders dropped)
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        print("No. initial embeddings: {}".format(init_embeds.shape[0]))
        print(init_embeds)

        # Initialize the K-means components; -1 marks "no component".
        # NOTE(review): any value other than "rand" or "spread" (including
        # the "each-in-own" option named in the class docstring) leaves every
        # entry at -1.
        assignments = -1 * np.ones(N, dtype=int)
        if init_assignments == "rand":
            assignments[init_embeds] = np.random.randint(
                0, K_max, len(init_embeds))
        elif init_assignments == "spread":
            n_init_embeds = len(init_embeds)
            # Repeat 0..K_max-1 often enough to cover all embeddings, then
            # cut to size. The range is turned into a list first because a
            # Python 3 `range` object cannot be multiplied.
            n_repeats = int(np.ceil(float(n_init_embeds) / K_max))
            assignment_list = (list(range(K_max)) *
                               n_repeats)[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list)
        self.acoustic_model = KMeans(embeddings, K_max, assignments)

    def save(self, f):
        """Save the acoustic model to `f`; the utterances are not saved."""
        model = self.acoustic_model
        model.save(f)

    def load(self, f):
        """Load the acoustic model and then the utterances from `f`."""
        # NOTE(review): `save` writes only the acoustic model, while this
        # also reads the utterances from `f` -- confirm that is intended.
        for part in (self.acoustic_model, self.utterances):
            part.load(f)

    def segment_i(self, i):
        """
        Segment new boundaries and cluster new segments for utterance `i`.

        The candidate embeddings of the utterance are scored, new boundaries
        are found with `forward_backward_kmeans_viterbi` and written to
        `self.utterances.boundaries`, and the acoustic model is updated by
        removing the old segment embeddings and adding the new ones.

        Return
        ------
        sum_neg_len_sqrd_norm : float
            The length-weighted K-means objective for this utterance.
        """

        # Debug traces are only printed in full for the monitored utterance
        monitoring = DEBUG > 0 and i == I_DEBUG_MONITOR

        # Debug trace
        if DEBUG > 0:
            print("Segmenting utterance: " + str(i))
            if i == I_DEBUG_MONITOR:
                print("-" * 79)
                print("Statistics before sampling")
                print("sum_neg_sqrd_norm before sampling: " +
                      str(self.acoustic_model.sum_neg_sqrd_norm()))
                print("Unsupervised transcript: " +
                      str(self.get_unsup_transcript_i(i)))
                print("Unsupervised max transcript: " +
                      str(self.get_max_unsup_transcript_i(i)))

        # The embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings; only the first (N**2 + N) // 2
        # entries of the utterance's rows are used
        N = self.utterances.lengths[i]
        n_vecs = (N**2 + N) // 2
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :n_vecs],
            self.utterances.durations[i, :n_vecs])

        # Debug trace
        if monitoring:
            print("vec_embed_neg_len_sqrd_norms: " +
                  str(vec_embed_neg_len_sqrd_norms))
            self._print_segment_debug_stats_i(i)

        # Draw new boundaries for utterance i
        sum_neg_len_sqrd_norm, new_boundaries = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min,
            self.n_slices_max, i)
        self.utterances.boundaries[i, :N] = new_boundaries

        # Debug trace
        if monitoring:
            print(
                "Statistics after sampling, but before adding new embeddings to acoustic model"
            )
            self._print_segment_debug_stats_i(i)

        # Remove old embeddings and add new ones; this is equivalent to
        # assigning the new embeddings and updating the means.
        new_embeds = self.utterances.get_segmented_embeds_i(i)
        new_k = self.get_max_unsup_transcript_i(i)

        for i_embed in old_embeds:
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            self.acoustic_model.del_item(i_embed)
        # NOTE(review): unlike the removal loop above, -1 entries are not
        # skipped here -- confirm `new_embeds` cannot contain -1 at this point.
        for i_embed, k in zip(new_embeds, new_k):
            self.acoustic_model.add_item(i_embed, k)
        self.acoustic_model.clean_components()

        # Debug trace
        if monitoring:
            print("sum_neg_sqrd_norm after sampling: " +
                  str(self.acoustic_model.sum_neg_sqrd_norm()))
            print("Unsupervised transcript after sampling: " +
                  str(self.get_unsup_transcript_i(i)))
            print("-" * 79)

        return sum_neg_len_sqrd_norm  # technically, this is with the old means (before updating, above)

    def _print_segment_debug_stats_i(self, i):
        """Print the per-segment debug statistics for the current segmentation of `i`."""
        neg_sqrd_norms = [
            self.acoustic_model.max_neg_sqrd_norm_i(j)
            for j in self.utterances.get_segmented_embeds_i(i) if j != -1
        ]
        lengths = self.utterances.get_segmented_durations_i(i)
        print(lengths)
        print("Embeddings: " +
              str(self.utterances.get_segmented_embeds_i(i)))
        print("Utterance embeddings: " +
              str(self.utterances.get_original_segmented_embeds_i(i)))
        print("Landmark indices: " +
              str(self.utterances.get_segmented_landmark_indices(i)))
        print("Durations: " +
              str(self.utterances.get_segmented_durations_i(i)))
        print("neg_sqrd_norms: " + str(neg_sqrd_norms))
        # Element-wise product of per-segment scores and durations
        neg_len_sqrd_norms = neg_sqrd_norms * np.array(lengths)
        print("neg_len_sqrd_norms: " + str(neg_len_sqrd_norms))
        print("sum_neg_len_sqrd_norms: " + str(np.sum(neg_len_sqrd_norms)))

    def segment(self, n_iter, n_iter_inbetween_kmeans=0):
        """
        Run `n_iter` segmentation sweeps over all utterances.

        Every sweep visits the utterances in a freshly shuffled order and
        calls `segment_i` on each, records statistics, and optionally refits
        the K-means model afterwards.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of K-means iterations inbetween segmentation iterations.

        Return
        ------
        record_dict : dict
            Maps each statistic name to a list with one entry per iteration.
        """

        # Debug trace
        print("Segmenting for {} iterations".format(n_iter))
        if DEBUG > 0:
            print("Monitoring utterance {} (index={:d})".format(
                self.ids_to_utterance_labels[I_DEBUG_MONITOR],
                I_DEBUG_MONITOR))

        # One list of per-iteration values for every statistic
        record_dict = {
            "sum_neg_sqrd_norm": [],
            "sum_neg_len_sqrd_norm": [],
            "components": [],
            "sample_time": [],
            "n_tokens": [],
        }

        for i_iter in range(n_iter):
            sweep_start = time.time()

            # Visit the utterances in random order, or only the monitored
            # one when debugging segmentation
            visit_order = list(range(self.utterances.D))
            random.shuffle(visit_order)
            if SEGMENT_DEBUG_ONLY:
                visit_order = [I_DEBUG_MONITOR]
            sweep_objective = sum(
                self.segment_i(i_utt) for i_utt in visit_order)

            # Record the statistics of this sweep
            record_dict["sample_time"].append(time.time() - sweep_start)
            record_dict["sum_neg_sqrd_norm"].append(
                self.acoustic_model.sum_neg_sqrd_norm())
            record_dict["sum_neg_len_sqrd_norm"].append(sweep_objective)
            record_dict["components"].append(self.acoustic_model.K)
            record_dict["n_tokens"].append(
                self.acoustic_model.get_n_assigned())

            # Report the latest value of every statistic, keys sorted
            fields = ["Iteration: " + str(i_iter)]
            fields.extend(key + ": " + str(record_dict[key][-1])
                          for key in sorted(record_dict))
            print(", ".join(fields))

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(n_iter_inbetween_kmeans,
                                        consider_unassigned=False)

        return record_dict

    def segment_only_i(self, i):
        """
        Segment new boundaries for utterance `i`, without cluster assignment.

        Although cluster assignments are not updated, the cluster assignments
        are determined and returned (but the `acoustic_model` is not updated).

        Return
        ------
        i, sum_neg_len_sqrd_norm, new_boundaries, old_embeds, new_embeds,
                new_k : (int, vector, float, list, list, list)
            The utterance index; the length-weighted K-means objective for this
            utterance; newly segmented boundaries; embeddings before
            segmentation; new embeddings after segmentation; new embedding
            assignments.
        """

        # Debug trace
        if DEBUG > 0:
            print("Segmenting utterance: " + str(i))
            if i == I_DEBUG_MONITOR:
                print("-" * 79)
                print("Statistics before sampling")
                print("sum_neg_sqrd_norm before sampling: " +
                      str(self.acoustic_model.sum_neg_sqrd_norm()))
                print("Unsupervised transcript: " +
                      str(self.get_unsup_transcript_i(i)))
                print("Unsupervised max transcript: " +
                      str(self.get_max_unsup_transcript_i(i)))

        # The embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings
        N = self.utterances.lengths[i]
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :(N**2 + N) / 2],
            self.utterances.durations[i, :(N**2 + N) / 2])

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print("vec_embed_neg_len_sqrd_norms: " +
                  str(vec_embed_neg_len_sqrd_norms))
            neg_sqrd_norms = [
                self.acoustic_model.max_neg_sqrd_norm_i(j)
                for j in self.utterances.get_segmented_embeds_i(i) if j != -1
            ]
            where_bounds = np.where(self.utterances.boundaries[i, :N])[0]
            embeddings = self.utterances.get_segmented_embeds_i(i)
            lengths = self.utterances.get_segmented_durations_i(i)
            print("Embeddings: " + str(embeddings))
            print("Utterance embeddings: " +
                  str(self.utterances.get_original_segmented_embeds_i(i)))
            print("Landmark indices: " +
                  str(self.utterances.get_segmented_landmark_indices(i)))
            print("Durations: " +
                  str(self.utterances.get_segmented_durations_i(i)))
            print("neg_sqrd_norms: " + str(neg_sqrd_norms))
            print("neg_len_sqrd_norms: " +
                  str(neg_sqrd_norms * np.array(lengths)))
            print("sum_neg_len_sqrd_norms: " +
                  str(np.sum(neg_sqrd_norms * np.array(lengths))))

        # Draw new boundaries for utterance i
        sum_neg_len_sqrd_norm, new_boundaries = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min,
            self.n_slices_max, i)
        # sum_neg_len_sqrd_norm, self.utterances.boundaries[i, :N] = forward_backward_kmeans_viterbi(
        #     vec_embed_neg_len_sqrd_norms, N, self.n_slices_min, self.n_slices_max, i
        #     )
        # new_boundaries = self.utterances.boundaries[i, :N]

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print(
                "Statistics after sampling, but before adding new embeddings to acoustic model"
            )
            neg_sqrd_norms = [
                self.acoustic_model.max_neg_sqrd_norm_i(j)
                for j in self.utterances.get_segmented_embeds_i(i) if j != -1
            ]
            where_bounds = np.where(self.utterances.boundaries[i, :N])[0]
            embeddings = self.utterances.get_segmented_embeds_i(i)
            lengths = self.utterances.get_segmented_durations_i(i)
            print("Embeddings: " + str(embeddings))
            print("Utterance embeddings: " +
                  str(self.utterances.get_original_segmented_embeds_i(i)))
            print("Landmark indices: " +
                  str(self.utterances.get_segmented_landmark_indices(i)))
            print("Durations: " +
                  str(self.utterances.get_segmented_durations_i(i)))
            print("neg_sqrd_norms: " + str(neg_sqrd_norms))
            print("neg_len_sqrd_norms: " +
                  str(neg_sqrd_norms * np.array(lengths)))
            print("sum_neg_len_sqrd_norms: " +
                  str(np.sum(neg_sqrd_norms * np.array(lengths))))

        # Remove old embeddings and add new ones; this is equivalent to
        # assigning the new embeddings and updating the means.
        # new_embeds = self.utterances.get_segmented_embeds_i(i)
        # new_k = self.get_max_unsup_transcript_i(i)

        new_embeds = self.utterances.get_segmented_embeds_i_bounds(
            i, new_boundaries)
        new_k = self.get_max_unsup_transcript_i_embeds(i, new_embeds)

        # for i_embed in old_embeds:
        #     if i_embed == -1:
        #         continue  # don't remove a non-embedding (would accidently remove the last embedding)
        #     self.acoustic_model.del_item(i_embed)
        # for i_embed, k in zip(new_embeds, new_k):
        #     self.acoustic_model.add_item(i_embed, k)
        # self.acoustic_model.clean_components()

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print("sum_neg_sqrd_norm after sampling: " +
                  str(self.acoustic_model.sum_neg_sqrd_norm()))
            print("Unsupervised transcript after sampling: " +
                  str(self.get_unsup_transcript_i(i)))
            print("-" * 79)

        return i, sum_neg_len_sqrd_norm, new_boundaries, old_embeds, new_embeds, new_k

    def segment_parallel(self,
                         n_iter,
                         n_iter_inbetween_kmeans=0,
                         n_cpus=1,
                         n_batches=1):
        """
        Perform segmentation of all utterances and update the K-means model.

        In every iteration the utterance indices are shuffled and split into
        `n_batches` batches. Each batch is distributed over `n_cpus` parallel
        jobs, which return per-utterance updates without touching the model.
        The updates of a batch are then applied here: the new boundaries are
        stored in `utterances` and the acoustic model has the old embeddings
        removed and the new ones added.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of K-means iterations inbetween segmentation iterations.
        n_cpus : int
            Number of parallel processes.
        n_batches : int
            Over each batch, an update is made.

        Returns
        -------
        record_dict : dict
            Contains several fields describing the optimization iterations.
            Each field is described by its key and statistics are given in a
            list covering the iterations.
        """

        # Debug trace
        print("Segmenting for {} iterations".format(n_iter))
        if DEBUG > 0:
            print("Monitoring utterance {} (index={:d})".format(
                self.ids_to_utterance_labels[I_DEBUG_MONITOR],
                I_DEBUG_MONITOR))

        # Setup record dictionary
        record_dict = {}
        record_dict["sum_neg_sqrd_norm"] = []
        record_dict["sum_neg_len_sqrd_norm"] = []
        record_dict["components"] = []
        record_dict["sample_time"] = []
        record_dict["n_tokens"] = []

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Determine utterance order. `random.shuffle` works in place and
            # needs a mutable sequence, so the range is materialised as a
            # list first (a bare `range` object cannot be shuffled).
            utt_global_order = list(range(self.utterances.D))
            random.shuffle(utt_global_order)
            n_batch_size = int(
                np.ceil(len(utt_global_order) / float(n_batches)))

            # Perform segmentation over batches
            sum_neg_len_sqrd_norm = 0
            for i_batch in range(n_batches):
                utt_order = utt_global_order[n_batch_size *
                                             i_batch:n_batch_size *
                                             (i_batch + 1)]

                # Segment in parallel: job `i` gets every `n_cpus`-th
                # utterance of the batch, starting at offset `i`
                utt_batches = [utt_order[i::n_cpus] for i in range(n_cpus)]
                updates = Parallel(n_jobs=n_cpus)(
                    delayed(local_segment_only_utts)(self, utts)
                    for utts in utt_batches)

                # Aggregate updates
                updates = [item for sublist in updates
                           for item in sublist]  # flatten
                old_embeds = []
                new_embeds = []
                new_k = []
                for (i_utt, cur_sum_neg_len_sqrd_norm, cur_new_bounds,
                     cur_old_embeds, cur_new_embeds, cur_new_k) in updates:
                    sum_neg_len_sqrd_norm += cur_sum_neg_len_sqrd_norm
                    old_embeds.extend(cur_old_embeds)
                    new_embeds.extend(cur_new_embeds)
                    new_k.extend(cur_new_k)

                    # Store the new segmentation of this utterance
                    N = self.utterances.lengths[i_utt]
                    self.utterances.boundaries[i_utt, :N] = cur_new_bounds

                # Remove old embeddings and add new ones; this is equivalent to
                # assigning the new embeddings and updating the means.
                for i_embed in old_embeds:
                    if i_embed == -1:
                        continue  # don't remove a non-embedding (would accidentally remove the last embedding)
                    self.acoustic_model.del_item(i_embed)
                for i_embed, k in zip(new_embeds, new_k):
                    self.acoustic_model.add_item(i_embed, k)
                self.acoustic_model.clean_components()

            # Record the statistics of this iteration
            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["sum_neg_sqrd_norm"].append(
                self.acoustic_model.sum_neg_sqrd_norm())
            record_dict["sum_neg_len_sqrd_norm"].append(sum_neg_len_sqrd_norm)
            record_dict["components"].append(self.acoustic_model.K)
            record_dict["n_tokens"].append(
                self.acoustic_model.get_n_assigned())

            info = "Iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            print(info)

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(n_iter_inbetween_kmeans,
                                        consider_unassigned=False)

        return record_dict

    def get_vec_embed_neg_len_sqrd_norms(self, vec_ids, durations):
        """
        Return the duration-weighted score of every embedding in `vec_ids`.

        Each valid entry is the acoustic model's `max_neg_sqrd_norm_i` value
        for that embedding multiplied by its duration. Entries whose
        embedding ID or duration is -1 are scored as negative infinity.
        `self.wip` is added to every entry of the result.
        """
        n_vecs = len(vec_ids)

        # Everything starts out as "impossible"
        scores = np.ones(n_vecs) * -np.inf

        for idx in range(n_vecs):
            embed_id = vec_ids[idx]
            if embed_id != -1:
                scores[idx] = self.acoustic_model.max_neg_sqrd_norm_i(embed_id)

                # Scale the score by the number of frames; a duration of -1
                # marks an entry that should not be used
                if durations[idx] == -1:
                    scores[idx] = -np.inf
                else:
                    scores[idx] *= durations[idx]

        return scores + self.wip

    def get_unsup_transcript_i(self, i):
        """
        Return the current component assignments, as a list, of the
        embeddings in the current segmentation of utterance `i`.
        """
        segmented_embeds = self.utterances.get_segmented_embeds_i(i)
        assignments = self.acoustic_model.get_assignments(segmented_embeds)
        return list(assignments)

    def get_max_unsup_transcript_i(self, i):
        """
        Return the acoustic model's `get_max_assignments` result for the
        embeddings in the current segmentation of utterance `i`.
        """
        segmented_embeds = self.utterances.get_segmented_embeds_i(i)
        return self.acoustic_model.get_max_assignments(segmented_embeds)

    def get_max_unsup_transcript_i_embeds(self, i, embeddings):
        """
        Return the acoustic model's `get_max_assignments` result for the
        given `embeddings`.

        The utterance index `i` is accepted but not used by this method.
        """
        best_components = self.acoustic_model.get_max_assignments(embeddings)
        return best_components
Пример #10
0
class BigramAcousticWordseg(object):
    """
    Bigram word segmentation of speech using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. Blocked
    Gibbs sampling is used for inference. In the member functions, the index
    `i` generally refers to the index of an utterance.

    Parameters
    ----------
    am_K : int or None
        Number of acoustic model components. May be None when
        `seed_assignments_dict` is given, in which case it is set to the
        largest seed cluster index plus one.
    am_param_prior : e.g. instance of `FixedVarPrior`
        The acoustic model prior on the mean and covariance parameters.
    lm_params : dict
        A dictionary with an entry for "type". The constructor only builds a
        language model for the type "smooth", for which the entries
        "intrp_lambda", "a" and "b" are also read and passed to
        `BigramSmoothLM`.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    seed_boundaries_dict : dict of list
        Seed boundaries for every utterance, passed on to `Utterances`. If
        not given, no seeding is used.
        NOTE(review): earlier documentation described two different formats
        for the entries: (start inclusive, end exclusive) embedding slice
        index tuples of the seed tokens, or boundaries in 10 ms units in the
        same format as `landmarks_dict`. Confirm against `Utterances`.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used. Requires
        `seed_boundaries_dict` to be given as well.
    covariance_type : str
        Passed on to the acoustic model (`BigramFBGMM`).
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    beta_sent_boundary : float
        The symmetric Beta prior on the end of sentence probability; if this is
        set to -1, sentence boundary probabilities are not taken into account.
    lms : float
        Language model scaling factor.
    wip : float
        Word insertion penalty.
    fb_type : str
        The type of forward-backward algorithm to use:
        - "unigram": In this case, segmentation is carried out as it is done in
          the unigram case; i.e. only assignments are sampled using the bigram
          model.
        - "bigram": Sample assignments using the bigram language model.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined when no seed assignments are given:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model. This option is currently disabled in the constructor.
    time_power_term : float
        Scaling the per-frame scaling; with 1.2 instead of 1, we get fewer
        words (prefer longer words).

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : BigramFBGMM
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    lm : BigramSmoothLM
        The bigram language model over the components. It holds the
        `unigram_counts` (a count for each component) and the `bigram_counts`
        (element (j, i) is the count of component i following component j).
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    seed_to_cluster : dict
        Maps every seed label to its cluster index. Only set when
        `seed_assignments_dict` is given.
    """
    def __init__(self,
                 am_K,
                 am_param_prior,
                 lm_params,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 seed_boundaries_dict=None,
                 seed_assignments_dict=None,
                 covariance_type="fixed",
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 beta_sent_boundary=2.0,
                 lms=1.,
                 wip=0.,
                 fb_type="bigram",
                 init_am_assignments="rand",
                 time_power_term=1.):
        """
        Set up the utterances, the language model and the acoustic model.

        The embeddings in the initial segmentation are assigned to acoustic
        model components either from the seed assignments (when given) or
        according to `init_am_assignments`, after which the language model
        counts are initialized from the resulting transcripts.

        Raises
        ------
        ValueError
            If `lm_params["type"]` is not "smooth", which is the only
            language model type that is built here.
        AssertionError
            If seed assignments are given without seed boundaries, if `am_K`
            is too small for the seed clusters, or if `init_am_assignments`
            is "one-by-one" (disabled) or not a known value.
        """
        # Imported locally so that this block does not rely on the file-level
        # imports; used to recognise Python as well as NumPy integers.
        import numbers

        logger.info("Initializing")

        # Check parameters
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.lms = lms
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [
                seed_boundaries_dict[i] for i in ids_to_utterance_labels
            ]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     seed_boundaries=seed_boundaries,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Find all the embeddings that are in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]

        # Setup language model. Only the "smooth" type can be built; fail
        # here with a clear message instead of later on a missing `self.lm`.
        # NOTE(review): `am_K` may still be None at this point when it is to
        # be derived from the seed assignments below -- confirm that
        # `BigramSmoothLM` copes with `K=None`.
        lm_type = lm_params["type"]
        if lm_type != "smooth":
            raise ValueError(
                "unsupported language model type: {!r}".format(lm_type))
        intrp_lambda = lm_params["intrp_lambda"]
        a = lm_params["a"]
        b = lm_params["b"]
        K = am_K
        self.lm = BigramSmoothLM(intrp_lambda, a, b, K)

        # Provide the initial acoustic model assignments and initialize the model accordingly
        assignments = -1 * np.ones(N, dtype=int)
        if seed_assignments_dict is not None:

            # Use seed assignments if provided
            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_init_embeds = np.array(
                    self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                utt_init_assignments = np.array(seed_assignments_dict[utt][:])

                # Drop the positions that do not hold an embedding
                i_valid = np.where(utt_init_embeds != -1)
                utt_init_assignments = utt_init_assignments[i_valid]
                utt_init_embeds = utt_init_embeds[i_valid]

                for seed in utt_init_assignments:
                    if seed not in self.seed_to_cluster:
                        # Integer seeds are used as cluster indices directly;
                        # any other seed label gets the next free index.
                        # `numbers.Integral` also matches NumPy integers,
                        # which is what iterating over the array yields.
                        if isinstance(seed, numbers.Integral):
                            self.seed_to_cluster[seed] = seed
                        else:
                            self.seed_to_cluster[seed] = i_cluster
                            i_cluster += 1
                utt_init_assignments = [
                    self.seed_to_cluster[i] for i in utt_init_assignments
                ]
                assignments[utt_init_embeds] = utt_init_assignments
            if am_K is None:
                am_K = max(self.seed_to_cluster.values()) + 1
            else:
                assert am_K >= max(self.seed_to_cluster.values()) + 1

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(embeddings,
                                              am_param_prior,
                                              am_K,
                                              assignments,
                                              covariance_type=covariance_type,
                                              lms=lms,
                                              lm=self.lm)

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(
                0, am_K, len(init_embeds))
            # Make sure we have consecutive values: whenever no embedding is
            # assigned to `k`, shift all the higher assignments down by one
            for k in range(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[np.where(
                        init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(embeddings,
                                              am_param_prior,
                                              am_K,
                                              assignments,
                                              covariance_type=covariance_type,
                                              lms=lms,
                                              lm=self.lm)

        elif init_am_assignments == "one-by-one":
            # One-by-one initial assignment is disabled
            assert False

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

        # Setup initial language model counts
        self.set_lm_counts()

    def set_fb_type(self, fb_type):
        """
        Store `fb_type` and select the matching forward-backward function
        (`fb_func`) and embedding scoring method (`get_vec_embed_log_probs`).

        `fb_type` is either "bigram" or "unigram"; any other value fails an
        assertion after `fb_type` has been stored.
        """
        self.fb_type = fb_type

        if fb_type == "bigram":
            # Forward-backward function of this module
            self.fb_func = forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_bigram
            return

        if fb_type == "unigram":
            # Forward-backward function of the unigram segmentation module
            self.fb_func = unigram_acoustic_wordseg.forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_unigram
            return

        assert False, "invalid `fb_type`: " + fb_type

    def set_lm_counts(self):
        """
        Add the current unsupervised transcript of every utterance to the
        counts of the language model `lm`.
        """
        # `range` instead of the Python 2-only `xrange`, as is used for the
        # other loops over the utterances
        for i_utt in range(self.utterances.D):
            self.lm.counts_from_utterance(self.get_unsup_transcript_i(i_utt))

    def log_prob_z(self):
        """
        Return the log marginal probability of component assignment P(z).

        The probability is accumulated token by token using a fresh
        language model with the same hyperparameters as `lm`, whose counts
        are updated after every token. The first token of an utterance is
        scored with its unigram probability and every following token with
        its bigram probability given the preceding token.
        """
        lm_tmp = BigramSmoothLM(intrp_lambda=self.lm.intrp_lambda,
                                a=self.lm.a,
                                b=self.lm.b,
                                K=self.lm.K)
        log_prob_z = 0.
        for i_utt in range(self.utterances.D):
            j_prev = None
            for i_cur in self.get_unsup_transcript_i(i_utt):
                if j_prev is not None:
                    log_prob_z += np.log(lm_tmp.prob_i_given_j(i_cur, j_prev))
                    lm_tmp.bigram_counts[j_prev, i_cur] += 1
                else:
                    log_prob_z += np.log(lm_tmp.prob_i(i_cur))
                lm_tmp.unigram_counts[i_cur] += 1

                # Advance the bigram context; without this the bigram branch
                # above is never reached and every token is scored as if it
                # were the first of its utterance
                j_prev = i_cur
        return log_prob_z

    def log_marg(self):
        """
        Return the log marginal of data and component assignments, p(X, z),
        as the sum of log P(z) and the acoustic model's log p(X|z).
        """
        assignment_term = self.log_prob_z()
        data_term = self.acoustic_model.log_prob_X_given_z()
        return assignment_term + data_term

    # @profile
    def log_marg_i_embed_unigram(self, i_embed):
        """
        Return the unigram log marginal of the i'th data vector: p(x_i).

        For every component the scaled unigram language model log
        probability is combined with the acoustic log score of `i_embed`,
        and the result is summed over the components in the log domain.
        """
        assert i_embed != -1

        # Language model term for each component, scaled by `lms`;
        # (24.26) in Murphy, p. 843
        log_joint = self.lms * self.lm.log_prob_vec_i()

        components = self.acoustic_model.components

        # Active components use the posterior predictive;
        # (24.23) in Murphy, p. 842
        log_joint[:components.K] += components.log_post_pred(i_embed)

        # Empty (inactive) components use the prior
        log_joint[components.K:] += components.log_prior(i_embed)

        return _cython_utils.logsumexp(log_joint)

    # @profile
    def gibbs_sample_inside_loop_i_embed(self,
                                         i_embed,
                                         j_prev_assignment=None,
                                         anneal_temp=1,
                                         i_utt=None):
        """
        Perform the inside loop of Gibbs sampling for data vector `i_embed`.

        A component is drawn for `i_embed` from the normalized combination
        of the scaled language model log probabilities and the acoustic log
        scores, and the embedding is added to that component.

        Parameters
        ----------
        i_embed : int
            Index of the embedding to assign.
        j_prev_assignment : int or None
            Component of the preceding embedding; if None, the unigram
            language model probabilities are used instead of the bigram ones.
        anneal_temp : float
            Annealing temperature; 1 means no annealing.
        i_utt : int or None
            Utterance index, only used to decide whether to log debug output.

        Return
        ------
        k : int
            The component that `i_embed` was added to.
        """

        # Language model term for each component; this is the bigram version
        # of (24.26) in Murphy, p. 843.
        if j_prev_assignment is None:
            log_prob_z = self.lm.log_prob_vec_i()
        else:
            log_prob_z = np.log(self.lm.prob_vec_given_j(j_prev_assignment))

        # Scale with language model scaling factor
        log_prob_z *= self.lms

        is_monitored = i_utt is not None and i_utt == i_debug_monitor
        components = self.acoustic_model.components
        if is_monitored:
            logger.debug("lms * log(P(z=i|z_prev=j)): " + str(log_prob_z))
            logger.debug("log(p(x|z=i)): " +
                         str(components.log_post_pred(i_embed)))

        # Acoustic term; bigram version of (24.23) in Murphy, p. 842
        log_prob_z[:components.K] += components.log_post_pred(i_embed)
        # Empty (inactive) components
        log_prob_z[components.K:] += components.log_prior(i_embed)

        # Normalize, annealing first if requested
        if anneal_temp == 1:
            prob_z = np.exp(log_prob_z - _cython_utils.logsumexp(log_prob_z))
        else:
            log_prob_z = log_prob_z - _cython_utils.logsumexp(log_prob_z)
            log_prob_z_scaled = 1. / anneal_temp * log_prob_z
            prob_z = np.exp(log_prob_z_scaled -
                            _cython_utils.logsumexp(log_prob_z_scaled))
        assert not np.isnan(np.sum(prob_z))

        if is_monitored:
            logger.debug("P(z=i|x): " + str(prob_z))

        # Sample the new component assignment for `X[i]`
        k = utils.draw(prob_z)

        # There could be several empty, inactive components at the end; all
        # of them map to the first one
        if k > components.K:
            k = components.K

        if is_monitored:
            logger.debug("Adding item " + str(i_embed) +
                         " to acoustic model component " + str(k))
        components.add_item(i_embed, k)

        return k

    def gibbs_sample_i(self,
                       i,
                       anneal_temp=1,
                       anneal_gibbs_am=False,
                       assignments_only=False):
        """
        Block Gibbs sample new boundaries and embedding assignments for
        utterance `i`.

        Return
        ------
        log_prob : float
        """

        # # Temp
        # print i, self.ids_to_utterance_labels[i], str(self.get_unsup_transcript_i(i))

        # Debug trace
        logger.debug("Gibbs sampling utterance: " + str(i))
        if i == i_debug_monitor:
            logger.debug("-" * 39)
            logger.debug("log p(X) before sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript before sampling: " +
                         str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts before sampling: " +
                         str(self.lm.unigram_counts))
            logger.debug("Bigram counts before sampling: " +
                         str(self.lm.bigram_counts))

        # Remove counts from the `lm`
        self.lm.remove_counts_from_utterance(self.get_unsup_transcript_i(i))

        # Remove embeddings from utterance `i` from the `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)

        # Sample segmentation
        if not assignments_only:

            # Get the log probabilities of the embeddings
            N = self.utterances.lengths[i]
            vec_embed_log_probs = self.get_vec_embed_log_probs(
                self.utterances.vec_ids[i, :(N**2 + N) / 2],
                self.utterances.durations[i, :(N**2 + N) / 2])
            # assert False, "vec_embed_log_probs should be calculated differently based on unigram or bigram segmentation"

            # Debug trace
            if i == i_debug_monitor:
                logger.debug(
                    "Statistics before sampling, but after removing, is given below"
                )
                if self.fb_type == "unigram":
                    log_margs = [
                        self.log_marg_i_embed_unigram(j)
                        for j in self.utterances.get_segmented_embeds_i(i)
                        if j != -1
                    ]
                else:
                    assert False, "to-do"
                embeddings = self.utterances.get_segmented_embeds_i(i)
                lengths = self.utterances.get_segmented_durations_i(i)
                logger.debug("Embeddings: " + str(embeddings))
                logger.debug(
                    "Utterance embeddings: " +
                    str(self.utterances.get_original_segmented_embeds_i(i)))
                logger.debug(
                    "Landmark indices: " +
                    str(self.utterances.get_segmented_landmark_indices(i)))
                logger.debug("Durations: " +
                             str(self.utterances.get_segmented_durations_i(i)))
                logger.debug("log_margs: " + str(log_margs))
                logger.debug("sum(log_margs*lengths): " +
                             str(np.sum(log_margs * np.array(lengths))))
                logger.debug("log p(X): " + str(self.log_marg()))

            # Draw new boundaries for utterance `i`
            log_p_continue = math.log(self.calc_p_continue())
            log_prob, self.utterances.boundaries[i, :N] = self.fb_func(
                vec_embed_log_probs, log_p_continue, N, self.n_slices_min,
                self.n_slices_max, i, anneal_temp)

            # Debug trace
            if i == i_debug_monitor:
                logger.debug(
                    "Statistics after sampling, but before adding new embeddings to `acoustic_model`"
                )
                if self.fb_type == "unigram":
                    log_margs = [
                        self.log_marg_i_embed_unigram(j)
                        for j in self.utterances.get_segmented_embeds_i(i)
                        if j != -1
                    ]
                else:
                    assert False, "to-do"
                lengths = self.utterances.get_segmented_durations_i(i)
                logger.debug("Embeddings: " +
                             str(self.utterances.get_segmented_embeds_i(i)))
                logger.debug(
                    "Utterance embeddings: " +
                    str(self.utterances.get_original_segmented_embeds_i(i)))
                logger.debug(
                    "Landmark indices: " +
                    str(self.utterances.get_segmented_landmark_indices(i)))
                logger.debug("Durations: " +
                             str(self.utterances.get_segmented_durations_i(i)))
                logger.debug("log_margs: " + str(log_margs))
                logger.debug("sum(log_margs*lengths): " +
                             str(np.sum(log_margs * np.array(lengths))))
                logger.debug("log p(X): " + str(self.log_marg()))

        # # Temp
        # print self.lm.unigram_counts
        # print self.lm.bigram_counts
        # print

        # Assign new embeddings to components in `acoustic_model`
        if i == i_debug_monitor:
            logger.debug("Sampling component assignments")
        j_prev_assignment = None
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # This only happens because of backtracking in the forward-backward functions
                continue  # don't assign a non-embedding (accidently the last embedding)
            if anneal_gibbs_am:
                anneal_temp = anneal_temp
            else:
                anneal_temp = 1

            j_prev_assignment = self.gibbs_sample_inside_loop_i_embed(
                i_embed, j_prev_assignment, anneal_temp=anneal_temp, i_utt=i)

        self.lm.counts_from_utterance(self.get_unsup_transcript_i(i))

        # logger.info("!!!")
        # logger.info(str(self.lm.unigram_counts))
        # logger.info(str(self.acoustic_model.components.counts))
        # logger.info(str(self.lm.bigram_counts))
        # logger.info("!!!")

        # print "!!!", self.lm.unigram_counts
        # print self.acoustic_model.components.counts
        # print "bigram_counts", self.lm.bigram_counts

        # npt.assert_equal(self.acoustic_model.components.counts, self.lm.unigram_counts)

        # import copy
        # lm = copy.copy(self.lm)
        # lm.unigram_counts.fill(0.0)
        # lm.bigram_counts.fill(0.0)
        # for i_utt in xrange(self.utterances.D):
        #     lm.counts_from_utterance(self.get_unsup_transcript_i(i_utt))
        # npt.assert_equal(lm.unigram_counts, self.lm.unigram_counts)
        # npt.assert_equal(lm.bigram_counts, self.lm.bigram_counts)
        # assert False

        # print self.lm.unigram_counts
        # print self.acoustic_model.components.lm.unigram_counts
        # print self.acoustic_model.components.counts
        # print self.lm.bigram_counts
        # assert False

        # Temp
        # print self.utterances.get_segmented_embeds_i(i)
        # print self.get_unsup_transcript_i(i)

        # Update `lm` counts
        # self.lm.counts_from_utterance(self.get_unsup_transcript_i(i))
        # assert False

        # # # Temp
        # print self.lm.unigram_counts
        # print self.lm.bigram_counts
        # print self.acoustic_model.components.lm.unigram_counts

        # Debug trace
        if i == i_debug_monitor:
            logger.debug("log p(X) after sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript after sampling: " +
                         str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts after sampling: " +
                         str(self.lm.unigram_counts))
            logger.debug("Bigram counts after sampling: " +
                         str(self.lm.bigram_counts))
            logger.debug("-" * 39)

        if assignments_only:
            # Segmentation is not performed, so frame-scaled marginals does not make gibbs_sample_inside_loop_i_embed
            return 0.
        else:
            return log_prob

    def gibbs_sample(self,
                     n_iter,
                     am_n_iter=0,
                     anneal_schedule=None,
                     anneal_start_temp_inv=0.1,
                     anneal_end_temp_inv=1,
                     n_anneal_steps=-1,
                     anneal_gibbs_am=False,
                     assignments_only=False):
        """
        Perform blocked Gibbs sampling on all utterances.

        Parameters
        ----------
        n_iter : int
            Number of Gibbs sampling iterations of segmentation.
        am_n_iter : int
            Number of acoustic model Gibbs sampling iterations inbetween
            segmentation sampling iterations. Values above 0 are not supported
            yet (an assertion fails).
        anneal_schedule : str
            Can be one of the following:
            - None: A constant inverse temperature of `anneal_end_temp_inv` is
              used throughout; if `anneal_end_temp_inv` is left at default (1),
              then this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).
        anneal_start_temp_inv : float
            Inverse temperature at the start of the annealing schedule.
        anneal_end_temp_inv : float
            Inverse temperature at the end of the annealing schedule. Its
            reciprocal is also the temperature used for every iteration that
            falls after the end of the schedule.
        n_anneal_steps : int
            Number of points in the annealing schedule (see `anneal_schedule`).
        anneal_gibbs_am : bool
            Passed on to `gibbs_sample_i` for every utterance.
        assignments_only : bool
            Whether only component assignments should be sampled, or whether
            both component assignment and segmentation should be performed.

        Return
        ------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.

        Raises
        ------
        ValueError
            If `anneal_schedule` is not one of None, "linear" or "step".
        """

        logger.info("Gibbs sampling for " + str(n_iter) + " iterations")
        logger.debug("Monitoring utterance " +
                     self.ids_to_utterance_labels[i_debug_monitor] +
                     " (index=" + str(i_debug_monitor) + ")")

        # Build the list of temperatures (one per iteration) for the schedule.
        # The schedules are specified as inverse temperatures, so they are
        # inverted here.
        if anneal_schedule is None:
            anneal_temps = []
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_temps = 1. / np.linspace(
                anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
        elif anneal_schedule == "step":
            assert not n_anneal_steps == -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
            )
            n_iter_per_step = int(round(float(n_iter) / n_anneal_steps))
            step_temps = 1. / np.linspace(
                anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_temps = np.repeat(step_temps, n_iter_per_step)
        else:
            # Previously an unknown schedule only surfaced later as a NameError
            raise ValueError("invalid value for `anneal_schedule`: " +
                             repr(anneal_schedule))
        get_anneal_temp = iter(anneal_temps)

        # Temperature for iterations beyond the end of the schedule. The
        # schedule holds temperatures, so the end point has to be inverted as
        # well (np.float64 keeps the numpy divide-by-zero behaviour of the
        # schedules above).
        final_anneal_temp = 1. / np.float64(anneal_end_temp_inv)

        # Setup record dictionary
        record_keys = ("sample_time", "log_marg", "log_marg*length",
                       "log_prob_z", "log_prob_X_given_z", "anneal_temp",
                       "components", "n_tokens")
        record_dict = dict((key, []) for key in record_keys)

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Perform intermediate acoustic model re-sampling
            if am_n_iter > 0:
                assert False, "to-do"
                self.acoustic_model.gibbs_sample(am_n_iter,
                                                 consider_unassigned=False)

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, final_anneal_temp)

            # Visit the utterances in random order; `list(...)` is needed since
            # `random.shuffle` requires a mutable sequence
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if debug_gibbs_only:
                utt_order = [i_debug_monitor]
            log_prob = 0
            for i_utt in utt_order:
                log_prob += self.gibbs_sample_i(i_utt, anneal_temp,
                                                anneal_gibbs_am,
                                                assignments_only)

            # Record the statistics of this iteration
            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["log_marg"].append(self.log_marg())
            record_dict["log_marg*length"].append(log_prob)
            record_dict["log_prob_z"].append(self.log_prob_z())
            record_dict["log_prob_X_given_z"].append(
                self.acoustic_model.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(
                self.acoustic_model.get_n_assigned())

            info = ["iteration: " + str(i_iter)]
            info.extend(key + ": " + str(record_dict[key][-1])
                        for key in sorted(record_dict))
            logger.info(", ".join(info))

            logger.debug("Unigram counts after inference: " +
                         str(self.lm.unigram_counts))
            logger.debug("Bigram counts after inference: " +
                         str(self.lm.bigram_counts))

        return record_dict

    # @profile
    def get_vec_embed_log_probs_unigram(self, vec_ids, durations):
        """
        Return the duration-scaled unigram log marginals of the embeddings.

        Every entry of `vec_ids` that is not -1 is scored with
        `log_marg_i_embed_unigram` and multiplied by its duration raised to
        the power `time_power_term`. Entries with ID -1 or with a NaN
        duration get a score of -inf. The word insertion penalty `wip` is
        added to every score before returning.
        """

        # Start with every candidate marked as impossible
        scores = np.full(len(vec_ids), -np.inf)

        for idx, embed_id in enumerate(vec_ids):
            if embed_id != -1:
                scores[idx] = self.log_marg_i_embed_unigram(embed_id)

                # Scale the log marginal by the (powered) number of frames
                n_frames = durations[idx]
                if not np.isnan(n_frames):
                    scores[idx] *= n_frames**self.time_power_term
                else:
                    scores[idx] = -np.inf

        return scores + self.wip

    def get_vec_embed_log_probs_bigram(self, vec_ids, durations):
        """
        Placeholder for the bigram counterpart of
        `get_vec_embed_log_probs_unigram`; not implemented, returns None.
        """
        return None

    def calc_p_continue(self):
        """
        Return the probability of not having an utterance break.

        When `beta_sent_boundary` is -1 the probability is fixed at 1.0. The
        smoothed estimate for other values is still marked as unchecked and is
        guarded by a failing assertion. In that estimate the number of
        utterances is taken as one less than the total number, since the
        current utterance is excluded from the calculation.
        """
        if self.beta_sent_boundary == -1:
            return 1.0

        assert False, "to check"
        n_assigned = sum(self.acoustic_model.components.counts)  # assigned tokens
        n_breaks = self.utterances.D - 1
        smoothing = self.beta_sent_boundary
        return ((n_assigned - n_breaks + smoothing / 2.0) /
                (n_assigned + smoothing))

    def get_unsup_transcript_i(self, i):
        """
        Return, as a list, the component assignments of the embeddings in the
        current segmentation of utterance `i`.
        """
        lookup_assignments = self.acoustic_model.components.get_assignments
        segment_embeds = self.utterances.get_segmented_embeds_i(i)
        return list(lookup_assignments(segment_embeds))
class SegmentalKMeansWordseg(object):
    """
    Segmental k-means word segmentation using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. In the
    member functions, the index `i` generally refers to the index of an
    utterance.

    Parameters
    ----------
    am_K : int
        Acoustic model parameter.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    seed_boundaries_dict : dict of list of tuple
        Every tuple is the start (inclusive) and end (exclusive) embedding
        slice index of a seed token, giving its boundaries. If not given, no
        seeding is used.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used.
    seed_boundaries_dict : dict of list of int
        For every utterance, seed boundaries in 10 ms units (same format as
        `landmarks_dict`). If not given, no seeding is used.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    wip : float
        Word insertion penalty.
    p_boundary_init : float
        See `Utterances`.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model.
        - "spread": Vectors are also randomly assigned, but here an attempt is
          made to spread the items over the different components.

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : KMeans
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """

    def __init__(self, am_K, embedding_mats, vec_ids_dict, durations_dict,
            landmarks_dict, seed_boundaries_dict=None,
            seed_assignments_dict=None, n_slices_min=0, n_slices_max=20,
            min_duration=0, p_boundary_init=0.5, init_am_assignments="rand",
            wip=0):
        """
        Set up `utterances` and the k-means `acoustic_model`.

        The embeddings are collected into a single matrix, the initial
        segmentation is created through `Utterances`, and the embeddings in
        that initial segmentation are assigned to components according to
        `init_am_assignments` ("rand" or "spread"; seed assignments and
        "one-by-one" are not implemented yet and fail an assertion).
        """

        logger.info("Initializing")

        # Check parameters: seed assignments require seed boundaries
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict
            )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`; everything is ordered by utterance index
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
        else:
            seed_boundaries = None
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        lengths = [len(utt_landmarks) for utt_landmarks in landmarks]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks, seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
            n_slices_max=n_slices_max, min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[init_embeds != -1]  # -1 marks a non-embedding
        logger.info("No. initial embeddings: " + str(init_embeds.shape[0]))

        # Provide the initial acoustic model assignments and initialize the
        # model accordingly; -1 marks an unassigned embedding
        assignments = -1*np.ones(N, dtype=int)
        if seed_assignments_dict is not None:
            # TODO: seed assignments (mapping seed labels to cluster indices
            # and initializing the acoustic model from them) are not
            # implemented for the k-means model yet
            assert False, "to-do"

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))

            # Make sure we have consecutive values: relabel the components that
            # were drawn as 0, 1, 2, ... in sorted order (this also copes with
            # an empty initial segmentation)
            _, init_embeds_assignments = np.unique(
                init_embeds_assignments, return_inverse=True
                )
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)

        elif init_am_assignments == "spread":

            logger.info("Spreading component assignments")
            n_init_embeds = len(init_embeds)

            # Cycle through the components often enough to cover all the
            # embeddings, then shuffle which embedding gets which component
            n_repeats = int(np.ceil(float(n_init_embeds)/am_K))
            assignment_list = (list(range(am_K))*n_repeats)[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list, dtype=int)

            # Initialize `acoustic_model`
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)

        elif init_am_assignments == "one-by-one":
            # TODO: adding the embeddings one at a time to the acoustic model
            # is not implemented for the k-means model yet
            assert False, "to-do"

        else:
            assert False, "invalid value for `init_am_assignments`: " + str(init_am_assignments)

    def segment_i(self, i):
        """
        Segment new boundaries for utterance `i`.

        The candidate embeddings of the utterance are scored under the current
        acoustic model, new boundaries are obtained from
        `forward_backward_kmeans_viterbi`, and the acoustic model is then
        updated by removing the embeddings of the old segmentation and adding
        those of the new one.

        Parameters
        ----------
        i : int
            Index of the utterance to segment.

        Return
        ------
        sum_neg_len_sqrd_norm : float
            The length-weighted k-means objective for this utterance, as
            returned by the segmentation step (i.e. before the acoustic model
            is updated with the new segmentation).
        """

        monitor = i == i_debug_monitor

        # Debug trace
        logger.debug("Segmeting utterance: " + str(i))
        if monitor:
            logger.debug("-"*39)
            logger.debug("Statistics before sampling")
            logger.debug(
                "sum_neg_sqrd_norm before sampling: " +
                str(self.acoustic_model.components.sum_neg_sqrd_norm())
                )
            logger.debug("Unsupervised transcript: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unsupervised max transcript: " + str(self.get_max_unsup_transcript_i(i)))

        # Note the embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings; an utterance with N landmarks has
        # N(N + 1)/2 candidate segments (floor division keeps the slice bound
        # an integer)
        N = self.utterances.lengths[i]
        n_candidates = (N**2 + N)//2
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :n_candidates],
            self.utterances.durations[i, :n_candidates]
            )

        # Debug trace
        if monitor:
            logger.debug("vec_embed_neg_len_sqrd_norms: " + str(vec_embed_neg_len_sqrd_norms))
            self._log_segmentation_stats_i(i)

        # Draw new boundaries for utterance i
        sum_neg_len_sqrd_norm, self.utterances.boundaries[i, :N] = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min, self.n_slices_max, i
            )

        # Debug trace
        if monitor:
            logger.debug("Statistics after sampling, but before adding new embeddings to acoustic model")
            self._log_segmentation_stats_i(i)

        # Remove old embeddings and add new ones; this is equivalent to
        # assigning the new embeddings and updating the means.
        new_embeds = self.utterances.get_segmented_embeds_i(i)
        new_k = self.get_max_unsup_transcript_i(i)
        components = self.acoustic_model.components
        for i_embed in old_embeds:
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            components.del_item(i_embed)
        for i_embed, k in zip(new_embeds, new_k):
            if i_embed == -1:
                continue  # likewise, don't add a non-embedding (would accidently add the last embedding)
            components.add_item(i_embed, k)
        components.clean_components()

        # Debug trace
        if monitor:
            logger.debug(
                "sum_neg_sqrd_norm after sampling: " +
                str(self.acoustic_model.components.sum_neg_sqrd_norm())
                )
            logger.debug("Unsupervised transcript after sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("-"*39)

        return sum_neg_len_sqrd_norm  # technically, this is with the old means (before updating, above)

    def _log_segmentation_stats_i(self, i):
        """
        Log, at debug level, the embeddings, landmark indices, durations and
        scores of the current segmentation of utterance `i`.
        """
        neg_sqrd_norms = [
            self.acoustic_model.components.max_neg_sqrd_norm_i(j) for j in
            self.utterances.get_segmented_embeds_i(i) if j != -1
            ]
        embeddings = self.utterances.get_segmented_embeds_i(i)
        lengths = self.utterances.get_segmented_durations_i(i)
        logger.debug("Embeddings: " + str(embeddings))
        logger.debug("Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i)))
        logger.debug("Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i)))
        logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i)))
        logger.debug("neg_sqrd_norms: " + str(neg_sqrd_norms))
        neg_len_sqrd_norms = neg_sqrd_norms*np.array(lengths)
        logger.debug("neg_len_sqrd_norms: " + str(neg_len_sqrd_norms))
        logger.debug("sum_neg_len_sqrd_norms: " + str(np.sum(neg_len_sqrd_norms)))

    def get_vec_embed_neg_len_sqrd_norms(self, vec_ids, durations):
        """
        Return the duration-scaled k-means scores of the `vec_ids` embeddings.

        Every entry of `vec_ids` that is not -1 is scored with the acoustic
        model's `max_neg_sqrd_norm_i` and multiplied by its duration. Entries
        with ID -1 or with a NaN duration get a score of -inf. The word
        insertion penalty `wip` is added to every score before returning.
        """

        # Start with every candidate marked as impossible
        scores = np.full(len(vec_ids), -np.inf)

        for idx, embed_id in enumerate(vec_ids):
            if embed_id != -1:
                scores[idx] = self.acoustic_model.components.max_neg_sqrd_norm_i(embed_id)

                # Scale the score by the number of frames
                n_frames = durations[idx]
                if not np.isnan(n_frames):
                    scores[idx] *= n_frames
                else:
                    scores[idx] = -np.inf

        return scores + self.wip

    def segment(self, n_iter, n_iter_inbetween_kmeans=0):
        """
        Perform segmentation of all utterances and update the k-means model.

        In every iteration the utterances are visited in random order and
        re-segmented with `segment_i`; afterwards the statistics of the
        iteration are recorded and logged.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of k-means iterations inbetween segmentation iterations.

        Return
        ------
        record_dict : dict
            Contains several fields describing the optimization iterations.
            Each field is described by its key and statistics are given in a
            list covering the iterations.
        """

        logger.info("Segmenting for " + str(n_iter) + " iterations")
        logger.debug(
            "Monitoring utterance " + self.ids_to_utterance_labels[i_debug_monitor]
            + " (index=" + str(i_debug_monitor) + ")"
            )

        # Setup record dictionary
        record_keys = (
            "sum_neg_sqrd_norm", "sum_neg_len_sqrd_norm", "components",
            "sample_time", "n_tokens"
            )
        record_dict = dict((key, []) for key in record_keys)

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Visit the utterances in random order; `list(...)` is needed since
            # `random.shuffle` requires a mutable sequence
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if segment_debug_only:
                utt_order = [i_debug_monitor]
            sum_neg_len_sqrd_norm = 0
            for i_utt in utt_order:
                sum_neg_len_sqrd_norm += self.segment_i(i_utt)

            # Record the statistics of this iteration
            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["sum_neg_sqrd_norm"].append(self.acoustic_model.components.sum_neg_sqrd_norm())
            record_dict["sum_neg_len_sqrd_norm"].append(sum_neg_len_sqrd_norm)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(self.acoustic_model.get_n_assigned())

            info = ["iteration: " + str(i_iter)]
            info.extend(
                key + ": " + str(record_dict[key][-1]) for key in sorted(record_dict)
                )
            logger.info(", ".join(info))

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(
                    n_iter_inbetween_kmeans, consider_unassigned=False
                    )

        return record_dict

    def get_unsup_transcript_i(self, i):
        """
        Return, as a list, the current component assignments of the embeddings
        in the current segmentation of utterance `i`.
        """
        lookup_assignments = self.acoustic_model.components.get_assignments
        segment_embeds = self.utterances.get_segmented_embeds_i(i)
        return list(lookup_assignments(segment_embeds))

    def get_max_unsup_transcript_i(self, i):
        """
        Return the best components, as given by the acoustic model's
        `get_max_assignments`, for the embeddings in the current segmentation
        of utterance `i`.
        """
        lookup_best = self.acoustic_model.components.get_max_assignments
        segment_embeds = self.utterances.get_segmented_embeds_i(i)
        return lookup_best(segment_embeds)
    def __init__(self, am_K, embedding_mats, vec_ids_dict, durations_dict,
            landmarks_dict, seed_boundaries_dict=None,
            seed_assignments_dict=None, n_slices_min=0, n_slices_max=20,
            min_duration=0, p_boundary_init=0.5, init_am_assignments="rand",
            wip=0):

        logger.info("Initializing")

        # Check parameters
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict#, n_slices_min=n_slices_min
            )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks, seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
            n_slices_max=n_slices_max, min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        logger.info("No. initial embeddings: " + str(init_embeds.shape[0]))

        # Provide the initial acoustic model assignments and initialize the model accordingly
        assignments = -1*np.ones(N, dtype=int)
        if seed_assignments_dict is not None:
            assert False, "to-do"
    #         # Use seed assignments if provided
    #         logger.info("Using seed assignments")
    #         self.seed_to_cluster = {}
    #         i_cluster = 0
    #         for i_utt, utt in enumerate(ids_to_utterance_labels):
    #             utt_init_embeds = np.array(self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
    #             utt_init_assignments = np.array(seed_assignments_dict[utt][:])
    #             utt_init_assignments = utt_init_assignments[np.where(utt_init_embeds != -1)]
    #             utt_init_embeds = utt_init_embeds[np.where(utt_init_embeds != -1)]
    #             for seed in utt_init_assignments:
    #                 if not seed in self.seed_to_cluster:
    #                     if isinstance(seed, (int, long)):
    #                         self.seed_to_cluster[seed] = seed
    #                     else:
    #                         self.seed_to_cluster[seed] = i_cluster
    #                         i_cluster += 1
    #             utt_init_assignments = [self.seed_to_cluster[i] for i in utt_init_assignments]
    #             assignments[utt_init_embeds] = utt_init_assignments
    #         if am_K is None:
    #             am_K = max(self.seed_to_cluster.values()) + 1
    #         else:
    #             assert am_K >= max(self.seed_to_cluster.values()) + 1

    #         # Initialize `acoustic_model`
    #         self.acoustic_model = kmeans.KMeans(
    #             embeddings, am_param_prior, am_alpha, am_K, assignments,
    #             covariance_type=covariance_type, lms=lms
    #             )                

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))
            
            # Make sure we have consecutive values
            for k in xrange(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[np.where(init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)

        elif init_am_assignments == "spread":

            logger.info("Spreading component assignments")
            n_init_embeds = len(init_embeds)
            assignment_list = (range(am_K)*int(np.ceil(float(n_init_embeds)/am_K)))[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list)

            # Initialize `acoustic_model`
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)

        elif init_am_assignments == "one-by-one":
            assert False, "to-do"

    #         # Initialize `acoustic_model`
    #         logger.info("Using a one-by-one initial assignment")
    #         self.acoustic_model = kmeans.KMeans(
    #             embeddings, am_param_prior, am_alpha, am_K, assignments,
    #             covariance_type=covariance_type, lms=lms
    #             )

    #         # Assign the embeddings one-by-one
    #         for i_embed in init_embeds:
    #             # print i_embed
    #             self.acoustic_model.gibbs_sample_inside_loop_i(i_embed)

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments