def __init__(self):
    """Create the utterance store, reset the training flags and load a model.

    The model is whatever `self.load()` returns; the training stack starts
    out empty.
    """
    self.utterances = Utterances()

    # No training is running, queued, or waiting to be repeated yet.
    self.ongoing_training = self.schedulued_training = self.repeat_training = False

    self.model = self.load()
    self.training_stack = list()
def __init__(self, K_max, embedding_mats, vec_ids_dict, durations_dict,
        landmarks_dict, n_slices_min=0, n_slices_max=20, min_duration=0,
        p_boundary_init=0.5, init_assignments="rand", wip=0):
    """
    Set up the utterances and the K-means acoustic model.

    Parameters
    ----------
    K_max : int
        Number of K-means components handed to `KMeans`.
    embedding_mats : dict
        Per-utterance embedding matrices; flattened by `process_embeddings`.
    vec_ids_dict : dict
        Per-utterance vector IDs; processed together with `embedding_mats`.
    durations_dict : dict
        Per-utterance durations, looked up by utterance label.
    landmarks_dict : dict
        Per-utterance landmarks, looked up by utterance label; the number of
        landmarks gives the utterance length.
    n_slices_min, n_slices_max, min_duration, p_boundary_init
        Stored where applicable and passed through to `Utterances`.
    init_assignments : str
        How the embeddings of the initial segmentation get a component:
        - "rand": uniformly at random in `[0, K_max)`.
        - "spread": components `0..K_max-1` repeated as often as needed,
          truncated to the number of embeddings, then shuffled.
        Any other value leaves every embedding unassigned (-1).
    wip : float
        Stored on the instance as `self.wip`.
    """

    # Attributes from parameters
    self.n_slices_min = n_slices_min
    self.n_slices_max = n_slices_max
    self.wip = wip

    # Process embeddings into a single matrix, and vec_ids into a list (entry
    # for each utterance)
    embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
        embedding_mats, vec_ids_dict
        )
    self.ids_to_utterance_labels = ids_to_utterance_labels
    N = embeddings.shape[0]

    # Initialize `utterances`
    lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
    landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
    durations = [durations_dict[i] for i in ids_to_utterance_labels]
    self.utterances = Utterances(
        lengths, vec_ids, durations, landmarks,
        p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
        n_slices_max=n_slices_max, min_duration=min_duration
        )

    # Embeddings in the initial segmentation (-1 marks "no embedding")
    init_embeds = []
    for i in range(self.utterances.D):
        init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
    init_embeds = np.array(init_embeds, dtype=int)
    init_embeds = init_embeds[np.where(init_embeds != -1)]
    print("No. initial embeddings: {}".format(init_embeds.shape[0]))

    # Initialize the K-means components
    assignments = -1 * np.ones(N, dtype=int)
    if init_assignments == "rand":
        assignments[init_embeds] = np.random.randint(
            0, K_max, len(init_embeds))
    elif init_assignments == "spread":
        n_init_embeds = len(init_embeds)
        n_repeats = int(np.ceil(float(n_init_embeds) / K_max))
        # `range` must be turned into a list first: on Python 3 a range object
        # can neither be repeated with `*` nor shuffled in place.
        assignment_list = (list(range(K_max)) * n_repeats)[:n_init_embeds]
        random.shuffle(assignment_list)
        assignments[init_embeds] = np.array(assignment_list, dtype=int)
    self.acoustic_model = KMeans(embeddings, K_max, assignments)
def parse_excel(path_to_data, sheet_index):
    """
    Parse a hand-made Excel sheet into utterance documents and save them.

    The sheet is expected to have an "intent" column plus the columns "téma"
    and "podkategorie" (both dropped); every remaining column holds
    utterances for the intent of that row.

    Parameters
    ----------
    path_to_data : str
        Path of the Excel file, passed to `pd.read_excel`.
    sheet_index : int or str
        Sheet selector, passed to `pd.read_excel` as `sheet_name`.

    Returns
    -------
    None
        The resulting documents are stored with `Utterances().save_utterances`.
    """
    df = pd.read_excel(path_to_data, sheet_name=sheet_index)

    # Fill missing intents from the closest filled row above.
    # `Series.ffill()` replaces the deprecated `fillna(method="ffill")`.
    df["intent"] = df["intent"].ffill()
    df = df.drop("téma", axis=1)
    df = df.drop("podkategorie", axis=1)

    # Melt the data to two columns: [intent, utterance]
    df = pd.melt(df, id_vars="intent", value_name="utterance", var_name="drop")
    df.drop("drop", axis=1, inplace=True)

    df["utterance"] = df["utterance"].str.lower()
    df["intent"] = df["intent"].str.lower()

    # `unidecode` needs strings, so missing values become '' for the
    # normalisation steps and are turned back into NaN afterwards.
    df.replace(np.nan, '', regex=True, inplace=True)
    df["utterance"] = df["utterance"].apply(unidecode.unidecode)
    df["intent"] = df["intent"].apply(unidecode.unidecode)
    df["intent"] = df["intent"].str.replace(" ", "_", regex=False)
    df["utterance"] = df["utterance"].str.replace("[^A-Za-z0-9 ]+", " ", regex=True)
    df.replace('', np.nan, regex=True, inplace=True)

    # Rows without an utterance carry no training signal
    df.dropna(subset=["utterance"], inplace=True)

    docs = df_to_doc(df)
    Utterances().save_utterances(docs)
class BigramAcousticWordseg(object):
    """
    Bigram word segmentation of speech using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. Blocked
    Gibbs sampling is used for inference. In the member functions, the index
    `i` generally refers to the index of an utterance.

    Parameters
    ----------
    am_K : int
        Acoustic model parameter. May be None when seed assignments are given,
        in which case it is derived from the seed clusters.
    am_param_prior : e.g. instance of `FixedVarPrior`
        The acoustic model prior on the mean and covariance parameters.
    lm_params : dict
        A dictionary with at least an entry for "type". This constructor only
        builds a language model for type "smooth", for which the entries
        "intrp_lambda", "a" and "b" are also required.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    seed_boundaries_dict : dict of list
        Seed boundaries for every utterance, passed on to `Utterances`. If not
        given, no seeding is used.
        NOTE(review): earlier documentation described the entries both as
        (start, end) embedding slice tuples and as boundaries in 10 ms units
        (same format as `landmarks_dict`); confirm against `Utterances`.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used. Requires
        `seed_boundaries_dict`.
    covariance_type : str
        Passed on to `BigramFBGMM`.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    beta_sent_boundary : float
        The symmetric Beta prior on the end of sentence probability; if this is
        set to -1, sentence boundary probabilities are not taken into account.
    lms : float
        Language model scaling factor.
    wip : float
        Word insertion penalty.
    fb_type : str
        The type of forward-backward algorithm to use:
        - "unigram": In this case, segmentation is carried out as it is done in
          the unigram case; i.e. only assignments are sampled using the bigram
          model.
        - "bigram": Sample assignments using the bigram language model.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model (not implemented; triggers an assertion).
    time_power_term : float
        Scaling the per-frame scaling; with 1.2 instead of 1, we get less words
        (prefer longer words).

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : BigramFBGMM
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    lm : BigramSmoothLM
        The language model; holds `unigram_counts` and `bigram_counts`, where
        `bigram_counts[j, i]` is indexed by previous component `j` and current
        component `i`.
    """

    def __init__(self, am_K, am_param_prior, lm_params, embedding_mats,
            vec_ids_dict, durations_dict, landmarks_dict,
            seed_boundaries_dict=None, seed_assignments_dict=None,
            covariance_type="fixed", n_slices_min=0, n_slices_max=20,
            min_duration=0, p_boundary_init=0.5, beta_sent_boundary=2.0,
            lms=1., wip=0., fb_type="bigram", init_am_assignments="rand",
            time_power_term=1.):

        logger.info("Initializing")

        # Check parameters: seed assignments need seed boundaries
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.lms = lms
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # Process embeddings into a single matrix, and vec_ids into a list
        # (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks,
            seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
            n_slices_max=n_slices_max, min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]

        # Setup language model
        if lm_params["type"] == "smooth":
            intrp_lambda = lm_params["intrp_lambda"]
            a = lm_params["a"]
            b = lm_params["b"]
            K = am_K
            self.lm = BigramSmoothLM(intrp_lambda, a, b, K)

        # Provide the initial acoustic model assignments and initialize the
        # model accordingly
        assignments = -1*np.ones(N, dtype=int)
        if seed_assignments_dict is not None:

            # Use seed assignments if provided
            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_init_embeds = np.array(self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                utt_init_assignments = np.array(seed_assignments_dict[utt][:])
                utt_init_assignments = utt_init_assignments[np.where(utt_init_embeds != -1)]
                utt_init_embeds = utt_init_embeds[np.where(utt_init_embeds != -1)]
                for seed in utt_init_assignments:
                    if seed not in self.seed_to_cluster:
                        # Integer seeds are used as cluster IDs directly;
                        # anything else gets the next free cluster index
                        if isinstance(seed, (int, long)):
                            self.seed_to_cluster[seed] = seed
                        else:
                            self.seed_to_cluster[seed] = i_cluster
                            i_cluster += 1
                utt_init_assignments = [self.seed_to_cluster[i] for i in utt_init_assignments]
                assignments[utt_init_embeds] = utt_init_assignments
            if am_K is None:
                am_K = max(self.seed_to_cluster.values()) + 1
            else:
                assert am_K >= max(self.seed_to_cluster.values()) + 1

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(
                embeddings, am_param_prior, am_K, assignments,
                covariance_type=covariance_type, lms=lms, lm=self.lm
                )

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the
            # `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))

            # Make sure we have consecutive values: whenever component `k` is
            # empty, shift all higher component indices down by one
            for k in xrange(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[np.where(init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break

            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = BigramFBGMM(
                embeddings, am_param_prior, am_K, assignments,
                covariance_type=covariance_type, lms=lms, lm=self.lm
                )

        elif init_am_assignments == "one-by-one":
            # One-by-one initial assignment is not implemented for this model
            assert False

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

        # Setup initial language model counts
        self.set_lm_counts()

    def set_fb_type(self, fb_type):
        """Select the forward-backward and log-probability functions for `fb_type`."""
        self.fb_type = fb_type

        # Assign forward-backward function
        if fb_type == "bigram":
            self.fb_func = forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_bigram
        elif fb_type == "unigram":
            self.fb_func = unigram_acoustic_wordseg.forward_backward
            self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_unigram
        else:
            assert False, "invalid `fb_type`: " + fb_type

    def set_lm_counts(self):
        """Add the counts of every utterance's current transcript to `lm`."""
        for i_utt in xrange(self.utterances.D):
            self.lm.counts_from_utterance(self.get_unsup_transcript_i(i_utt))

    def log_prob_z(self):
        """
        Return the log marginal probability of component assignment P(z).

        The probability is accumulated token by token with a fresh language
        model (same hyperparameters as `lm`) whose counts are updated as the
        transcripts are traversed. The first token of every utterance is
        scored with the unigram probability, every later token with the
        bigram probability given its predecessor.
        """
        lm_tmp = BigramSmoothLM(
            intrp_lambda=self.lm.intrp_lambda, a=self.lm.a, b=self.lm.b,
            K=self.lm.K
            )
        log_prob_z = 0.
        for i_utt in xrange(self.utterances.D):
            j_prev = None
            for i_cur in self.get_unsup_transcript_i(i_utt):
                if j_prev is not None:
                    log_prob_z += np.log(lm_tmp.prob_i_given_j(i_cur, j_prev))
                    lm_tmp.bigram_counts[j_prev, i_cur] += 1
                else:
                    log_prob_z += np.log(lm_tmp.prob_i(i_cur))
                lm_tmp.unigram_counts[i_cur] += 1
                # Advance the history; without this the bigram branch above is
                # never reached and only unigram terms are accumulated
                j_prev = i_cur
        return log_prob_z

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""
        log_prob_z = self.log_prob_z()
        log_prob_X_given_z = self.acoustic_model.log_prob_X_given_z()
        return log_prob_z + log_prob_X_given_z

    # @profile
    def log_marg_i_embed_unigram(self, i_embed):
        """Return the unigram log marginal of the i'th data vector: p(x_i)"""
        assert i_embed != -1

        # Compute log probability of `X[i]` belonging to each component
        # (24.26) in Murphy, p. 843
        log_prob_z = self.lms * self.lm.log_prob_vec_i()

        # (24.23) in Murphy, p. 842
        log_prob_z[:self.acoustic_model.components.K] += self.acoustic_model.components.log_post_pred(
            i_embed
            )
        # Empty (unactive) components
        log_prob_z[self.acoustic_model.components.K:] += self.acoustic_model.components.log_prior(i_embed)
        return _cython_utils.logsumexp(log_prob_z)

    # @profile
    def gibbs_sample_inside_loop_i_embed(self, i_embed, j_prev_assignment=None,
            anneal_temp=1, i_utt=None):
        """
        Perform the inside loop of Gibbs sampling for data vector `i_embed`.

        Parameters
        ----------
        i_embed : int
            Index of the embedding to assign.
        j_prev_assignment : int or None
            Component of the preceding token; if None the unigram language
            model probabilities are used.
        anneal_temp : float
            Annealing temperature; 1 means no annealing.
        i_utt : int or None
            Utterance index, only used to decide whether to emit debug output.

        Return
        ------
        k : int
            The component that `i_embed` was added to.
        """

        # Compute log probability of `X[i]` belonging to each component; this
        # is the bigram version of (24.26) in Murphy, p. 843.
        if j_prev_assignment is not None:
            log_prob_z = np.log(self.lm.prob_vec_given_j(j_prev_assignment))
        else:
            log_prob_z = self.lm.log_prob_vec_i()

        # Scale with language model scaling factor
        log_prob_z *= self.lms
        if i_utt is not None and i_utt == i_debug_monitor:
            logger.debug("lms * log(P(z=i|z_prev=j)): " + str(log_prob_z))
            logger.debug("log(p(x|z=i)): " + str(self.acoustic_model.components.log_post_pred(i_embed)))

        # Bigram version of (24.23) in Murphy, p. 842
        log_prob_z[:self.acoustic_model.components.K] += self.acoustic_model.components.log_post_pred(i_embed)
        # Empty (unactive) components
        log_prob_z[self.acoustic_model.components.K:] += self.acoustic_model.components.log_prior(i_embed)
        if anneal_temp != 1:
            # Normalize first, then temper and renormalize
            log_prob_z = log_prob_z - _cython_utils.logsumexp(log_prob_z)
            log_prob_z_anneal = 1./anneal_temp * log_prob_z - _cython_utils.logsumexp(1./anneal_temp * log_prob_z)
            prob_z = np.exp(log_prob_z_anneal)
        else:
            prob_z = np.exp(log_prob_z - _cython_utils.logsumexp(log_prob_z))
        assert not np.isnan(np.sum(prob_z))

        if i_utt is not None and i_utt == i_debug_monitor:
            logger.debug("P(z=i|x): " + str(prob_z))

        # Sample the new component assignment for `X[i]`
        k = utils.draw(prob_z)

        # There could be several empty, unactive components at the end; they
        # all map onto the first unused component index
        if k > self.acoustic_model.components.K:
            k = self.acoustic_model.components.K

        if i_utt is not None and i_utt == i_debug_monitor:
            logger.debug("Adding item " + str(i_embed) + " to acoustic model component " + str(k))
        self.acoustic_model.components.add_item(i_embed, k)

        return k

    def gibbs_sample_i(self, i, anneal_temp=1, anneal_gibbs_am=False,
            assignments_only=False):
        """
        Block Gibbs sample new boundaries and embedding assignments for
        utterance `i`.

        Parameters
        ----------
        i : int
            Index of the utterance.
        anneal_temp : float
            Annealing temperature used for the segmentation step.
        anneal_gibbs_am : bool
            If True, `anneal_temp` is also used when sampling the component
            assignments; otherwise a temperature of 1 is used there.
        assignments_only : bool
            If True, the segmentation is left untouched and only the
            component assignments are resampled.

        Return
        ------
        log_prob : float
            The value returned by the forward-backward function, or 0. when
            `assignments_only` is set.
        """

        # Debug trace
        logger.debug("Gibbs sampling utterance: " + str(i))
        if i == i_debug_monitor:
            logger.debug("-"*39)
            logger.debug("log p(X) before sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript before sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts before sampling: " + str(self.lm.unigram_counts))
            logger.debug("Bigram counts before sampling: " + str(self.lm.bigram_counts))

        # Remove counts from the `lm`
        self.lm.remove_counts_from_utterance(self.get_unsup_transcript_i(i))

        # Remove embeddings from utterance `i` from the `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidently remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)

        # Sample segmentation
        if not assignments_only:

            # Get the log probabilities of the embeddings. Floor division
            # keeps the slice bound an integer irrespective of the division
            # semantics in effect.
            N = self.utterances.lengths[i]
            n_vecs = (N**2 + N)//2
            vec_embed_log_probs = self.get_vec_embed_log_probs(
                self.utterances.vec_ids[i, :n_vecs],
                self.utterances.durations[i, :n_vecs]
                )

            # Debug trace
            if i == i_debug_monitor:
                logger.debug("Statistics before sampling, but after removing, is given below")
                if self.fb_type == "unigram":
                    log_margs = [
                        self.log_marg_i_embed_unigram(j) for j in
                        self.utterances.get_segmented_embeds_i(i) if j != -1
                        ]
                else:
                    assert False, "to-do"
                embeddings = self.utterances.get_segmented_embeds_i(i)
                lengths = self.utterances.get_segmented_durations_i(i)
                logger.debug("Embeddings: " + str(embeddings))
                logger.debug("Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i)))
                logger.debug("Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i)))
                logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i)))
                logger.debug("log_margs: " + str(log_margs))
                logger.debug("sum(log_margs*lengths): " + str(np.sum(log_margs*np.array(lengths))))
                logger.debug("log p(X): " + str(self.log_marg()))

            # Draw new boundaries for utterance `i`
            log_p_continue = math.log(self.calc_p_continue())
            log_prob, self.utterances.boundaries[i, :N] = self.fb_func(
                vec_embed_log_probs, log_p_continue, N, self.n_slices_min, self.n_slices_max, i, anneal_temp
                )

            # Debug trace
            if i == i_debug_monitor:
                logger.debug("Statistics after sampling, but before adding new embeddings to `acoustic_model`")
                if self.fb_type == "unigram":
                    log_margs = [
                        self.log_marg_i_embed_unigram(j) for j in
                        self.utterances.get_segmented_embeds_i(i) if j != -1
                        ]
                else:
                    assert False, "to-do"
                lengths = self.utterances.get_segmented_durations_i(i)
                logger.debug("Embeddings: " + str(self.utterances.get_segmented_embeds_i(i)))
                logger.debug("Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i)))
                logger.debug("Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i)))
                logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i)))
                logger.debug("log_margs: " + str(log_margs))
                logger.debug("sum(log_margs*lengths): " + str(np.sum(log_margs*np.array(lengths))))
                logger.debug("log p(X): " + str(self.log_marg()))

        # Assign new embeddings to components in `acoustic_model`
        if i == i_debug_monitor:
            logger.debug("Sampling component assignments")
        # The assignment step is only annealed when explicitly requested
        am_anneal_temp = anneal_temp if anneal_gibbs_am else 1
        j_prev_assignment = None
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # This only happens because of backtracking in the
                # forward-backward functions
                continue  # don't assign a non-embedding (accidently the last embedding)
            j_prev_assignment = self.gibbs_sample_inside_loop_i_embed(
                i_embed, j_prev_assignment, anneal_temp=am_anneal_temp, i_utt=i
                )

        # Update `lm` counts with the newly sampled transcript
        self.lm.counts_from_utterance(self.get_unsup_transcript_i(i))

        # Debug trace
        if i == i_debug_monitor:
            logger.debug("log p(X) after sampling: " + str(self.log_marg()))
            logger.debug("Unsupervised transcript after sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unigram counts after sampling: " + str(self.lm.unigram_counts))
            logger.debug("Bigram counts after sampling: " + str(self.lm.bigram_counts))
            logger.debug("-"*39)

        if assignments_only:
            # Segmentation is not performed, so there is no forward-backward
            # value to report
            return 0.
        else:
            return log_prob

    def gibbs_sample(self, n_iter, am_n_iter=0, anneal_schedule=None,
            anneal_start_temp_inv=0.1, anneal_end_temp_inv=1,
            n_anneal_steps=-1, anneal_gibbs_am=False, assignments_only=False):
        """
        Perform blocked Gibbs sampling on all utterances.

        Parameters
        ----------
        n_iter : int
            Number of Gibbs sampling iterations of segmentation.
        am_n_iter : int
            Number of acoustic model Gibbs sampling iterations inbetween
            segmentation sampling iterations. Values above 0 are not supported
            yet and trigger an assertion.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant temperature of `anneal_end_temp_inv` is used
              throughout; if `anneal_end_temp_inv` is left at default (1), then
              this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).
        anneal_gibbs_am : bool
            Whether the component assignment step is annealed as well.
        assignments_only : bool
            Whether only component assignments should be sampled, or whether
            both component assignment and segmentation should be performed.

        Return
        ------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.

        Raises
        ------
        ValueError
            If `anneal_schedule` is not one of the supported values.
        """

        logger.info("Gibbs sampling for " + str(n_iter) + " iterations")
        logger.debug(
            "Monitoring utterance " + self.ids_to_utterance_labels[i_debug_monitor]
            + " (index=" + str(i_debug_monitor) + ")"
            )

        # Setup annealing iterator; once it is exhausted the sampler falls
        # back to `anneal_end_temp_inv`
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1./np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert not n_anneal_steps == -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
                )
            n_iter_per_step = int(round(float(n_iter)/n_anneal_steps))
            anneal_list = np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1./anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)
        else:
            # Fail here with a clear message instead of a NameError on
            # `get_anneal_temp` inside the sampling loop
            raise ValueError("invalid value for `anneal_schedule`: " + str(anneal_schedule))

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        record_dict["log_marg"] = []
        record_dict["log_marg*length"] = []
        record_dict["log_prob_z"] = []
        record_dict["log_prob_X_given_z"] = []
        record_dict["anneal_temp"] = []
        record_dict["components"] = []
        record_dict["n_tokens"] = []

        # Loop over sampling iterations
        for i_iter in xrange(n_iter):

            start_time = time.time()

            # Perform intermediate acoustic model re-sampling
            if am_n_iter > 0:
                assert False, "to-do"
                self.acoustic_model.gibbs_sample(
                    am_n_iter, consider_unassigned=False
                    )

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, anneal_end_temp_inv)

            # Loop over utterances
            utt_order = range(self.utterances.D)
            random.shuffle(utt_order)
            if debug_gibbs_only:
                utt_order = [i_debug_monitor]
            log_prob = 0
            for i_utt in utt_order:
                log_prob += self.gibbs_sample_i(i_utt, anneal_temp, anneal_gibbs_am, assignments_only)

            record_dict["sample_time"].append(time.time() - start_time)
            start_time = time.time()
            record_dict["log_marg"].append(self.log_marg())
            record_dict["log_marg*length"].append(log_prob)
            record_dict["log_prob_z"].append(self.log_prob_z())
            record_dict["log_prob_X_given_z"].append(self.acoustic_model.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(self.acoustic_model.get_n_assigned())

            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

        logger.debug("Unigram counts after inference: " + str(self.lm.unigram_counts))
        logger.debug("Bigram counts after inference: " + str(self.lm.bigram_counts))

        return record_dict

    # @profile
    def get_vec_embed_log_probs_unigram(self, vec_ids, durations):
        """
        Return the unigram log marginal probs of the `vec_ids` embeddings,
        scaled by the given `durations`.

        Entries for which the vector ID is -1 or the duration is NaN are set
        to minus infinity. The word insertion penalty is added to all entries.
        """

        # Get marginals
        vec_embed_log_probs = -np.inf*np.ones(len(vec_ids))
        for i, embed_id in enumerate(vec_ids):
            if embed_id == -1:
                continue
            vec_embed_log_probs[i] = self.log_marg_i_embed_unigram(embed_id)

            # Scale log marginals by number of frames
            if np.isnan(durations[i]):
                vec_embed_log_probs[i] = -np.inf
            else:
                vec_embed_log_probs[i] *= durations[i]**self.time_power_term

        return vec_embed_log_probs + self.wip

    def get_vec_embed_log_probs_bigram(self, vec_ids, durations):
        """
        Placeholder for the bigram counterpart of
        `get_vec_embed_log_probs_unigram`; not implemented, returns None.
        """
        return None

    def calc_p_continue(self):
        """
        Return the probability of not having an utterance break.

        It is assumed that the number of utterances are one less than the total
        number, since the current utterance is excluded from the calculation.
        If `beta_sent_boundary` is -1, sentence boundaries are ignored and 1.0
        is returned.
        """
        if self.beta_sent_boundary != -1:
            assert False, "to check"
            n_tokens = sum(self.acoustic_model.components.counts)  # number of assigned tokens
            n_sentences = self.utterances.D - 1
            n_continue = n_tokens - n_sentences
            p_continue = (
                (n_continue + self.beta_sent_boundary / 2.0) /
                (n_tokens + self.beta_sent_boundary)
                )
        else:
            p_continue = 1.0
        return p_continue

    def get_unsup_transcript_i(self, i):
        """Return a list of the components for current segmentation of `i`."""
        return list(
            self.acoustic_model.components.get_assignments(self.utterances.get_segmented_embeds_i(i))
            )
def __init__(self, am_K, am_param_prior, lm_params, embedding_mats,
        vec_ids_dict, durations_dict, landmarks_dict,
        seed_boundaries_dict=None, seed_assignments_dict=None,
        covariance_type="fixed", n_slices_min=0, n_slices_max=20,
        min_duration=0, p_boundary_init=0.5, beta_sent_boundary=2.0,
        lms=1., wip=0., fb_type="bigram", init_am_assignments="rand",
        time_power_term=1.):
    """
    Build the utterances, the language model and the acoustic model.

    The embeddings are flattened with `process_embeddings`, an `Utterances`
    instance provides the initial segmentation, and the embeddings in that
    segmentation receive their initial component either from the seed
    assignments (when given) or at random (`init_am_assignments="rand"`).
    Finally the language model counts are initialised from the resulting
    transcripts.
    """
    logger.info("Initializing")

    # Seed assignments cannot be used without seed boundaries
    assert seed_assignments_dict is None or seed_boundaries_dict is not None

    # Plain attributes copied from the arguments
    self.n_slices_min = n_slices_min
    self.n_slices_max = n_slices_max
    self.beta_sent_boundary = beta_sent_boundary
    self.wip = wip
    self.lms = lms
    self.time_power_term = time_power_term
    self.set_fb_type(fb_type)

    # One embedding matrix for all utterances, one vec_ids entry per utterance
    embeddings, vec_ids, utt_labels = process_embeddings(embedding_mats, vec_ids_dict)
    self.ids_to_utterance_labels = utt_labels
    n_embeddings = embeddings.shape[0]

    # Per-utterance inputs for `Utterances`, ordered like `utt_labels`
    seed_boundaries = (
        None if seed_boundaries_dict is None
        else [seed_boundaries_dict[label] for label in utt_labels]
        )
    landmarks = [landmarks_dict[label] for label in utt_labels]
    lengths = [len(utt_landmarks) for utt_landmarks in landmarks]
    durations = [durations_dict[label] for label in utt_labels]
    self.utterances = Utterances(
        lengths, vec_ids, durations, landmarks,
        seed_boundaries=seed_boundaries,
        p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
        n_slices_max=n_slices_max, min_duration=min_duration
        )

    # Embeddings used by the initial segmentation; -1 entries are fillers
    init_embeds = np.array(
        [
            embed for utt_index in range(self.utterances.D)
            for embed in self.utterances.get_segmented_embeds_i(utt_index)
            ],
        dtype=int
        )
    init_embeds = init_embeds[init_embeds != -1]

    # Language model
    if lm_params["type"] == "smooth":
        self.lm = BigramSmoothLM(
            lm_params["intrp_lambda"], lm_params["a"], lm_params["b"], am_K
            )

    # Initial component of every embedding; -1 means unassigned
    assignments = -np.ones(n_embeddings, dtype=int)
    if seed_assignments_dict is not None:

        logger.info("Using seed assignments")
        seed_to_cluster = {}
        self.seed_to_cluster = seed_to_cluster
        next_cluster = 0
        for utt_index, label in enumerate(utt_labels):
            utt_embeds = np.array(self.utterances.get_segmented_embeds_i(utt_index), dtype=int)
            utt_seeds = np.array(seed_assignments_dict[label][:])
            used = np.where(utt_embeds != -1)
            utt_seeds = utt_seeds[used]
            utt_embeds = utt_embeds[used]
            for seed in utt_seeds:
                if seed in seed_to_cluster:
                    continue
                if isinstance(seed, (int, long)):
                    # Integer seeds double as cluster indices
                    seed_to_cluster[seed] = seed
                else:
                    seed_to_cluster[seed] = next_cluster
                    next_cluster += 1
            assignments[utt_embeds] = [seed_to_cluster[seed] for seed in utt_seeds]
        if am_K is None:
            am_K = max(seed_to_cluster.values()) + 1
        else:
            assert am_K >= max(seed_to_cluster.values()) + 1

        self.acoustic_model = BigramFBGMM(
            embeddings, am_param_prior, am_K, assignments,
            covariance_type=covariance_type, lms=lms, lm=self.lm
            )

    elif init_am_assignments == "rand":

        logger.info("Using random initial component assignments")
        random_components = np.random.randint(0, am_K, len(init_embeds))

        # Close gaps so that the used component indices are consecutive
        for k in xrange(random_components.max()):
            while not np.any(random_components == k):
                random_components[random_components > k] -= 1
            if random_components.max() == k:
                break

        assignments[init_embeds] = random_components

        self.acoustic_model = BigramFBGMM(
            embeddings, am_param_prior, am_K, assignments,
            covariance_type=covariance_type, lms=lms, lm=self.lm
            )

    elif init_am_assignments == "one-by-one":
        # Adding the embeddings one at a time is not supported here
        assert False

    else:
        assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

    # Initial language model counts
    self.set_lm_counts()
class BigBrain:
    """Intent classifier built around a fastText supervised model.

    Training utterances are kept in an `Utterances` store. Adding an
    utterance schedules a delayed background training run; every trained
    model is saved under `DATA_PATH` and tracked in ``meta.json``, where at
    most `N_BACKUP` model files are retained.
    """

    def __init__(self):
        self.utterances = Utterances()
        self.ongoing_training = False
        # NOTE: the misspelled names (`schedulued`, `schedulue`) are part of
        # the public interface and are kept for compatibility.
        self.schedulued_training = False
        self.repeat_training = False
        self.model = self.load()
        self.training_stack = []

    def schedulue_training(self):
        """Schedule a training run `TRAIN_OFFSET` from now.

        Returns True when a new timer was started. If a run is already
        scheduled, a repeat run is requested instead and False is returned.
        """
        if self.schedulued_training:
            self.repeat_training = True
            return False
        self.schedulued_training = True
        t = Timer(TRAIN_OFFSET, self.gym)
        t.start()
        return True

    def process_utterance_text(self, text):
        """Return `text` transliterated with `unidecode` and lower-cased."""
        processed = unidecode.unidecode(text)
        processed = processed.lower()
        return processed

    def predict(self, utterance):
        """Return ``(intent, confidence)`` for `utterance`.

        ``(None, 0)`` is returned when no model is loaded or when the model
        yields no prediction.
        """
        if self.model is None:
            print("Model is not initialized yet")
            return None, 0
        processed = self.process_utterance_text(utterance)
        response = self.model.predict(processed)
        # Guard against an empty label list as well, otherwise the indexing
        # below would raise instead of reporting "no prediction".
        if len(response) < 2 or len(response[0]) == 0:
            return None, 0
        confidence = response[1][0]
        intent = label_to_intent(response[0][0])
        return (intent, confidence)

    def gym(self):
        """Train a new model on all stored utterances and save it.

        If a training run is already in progress, another run is requested
        and this call returns immediately.
        """
        if self.ongoing_training:
            # reschedule training
            self.schedulue_training()
            return
        self.ongoing_training = True
        try:
            train_path, _ = self.utterances.generate_train_file(eval_count=0)
            MODEL["input"] = train_path
            MODEL["loss"] = "hs"
            self.model = fasttext.train_supervised(**MODEL)
        finally:
            # Always clear the flags: if training raised and they stayed set,
            # no further training could ever be scheduled or started.
            self.schedulued_training = False
            self.ongoing_training = False
        self.save()
        if self.repeat_training:
            self.repeat_training = False
            self.gym()

    def path(self, name):
        """Return the location of `name` inside `DATA_PATH`."""
        return DATA_PATH + name

    def create_metadata(self):
        """Write an empty ``meta.json`` and return its content."""
        save = {"models": []}
        with open(self.path("meta.json"), 'w+', encoding="utf-8") as f:
            json.dump(save, f)
        return save

    def load_metadata(self):
        """Return the content of ``meta.json``, creating the file if missing."""
        try:
            with open(self.path("meta.json"), encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            # First run: there is no metadata yet, start with an empty list
            # so that `load()` can report "no model" instead of crashing.
            return self.create_metadata()

    def push_model(self, filename):
        """Register `filename` as the newest model, dropping the oldest ones.

        Model files are deleted from disk until fewer than `N_BACKUP` remain,
        then `filename` is appended and the metadata is rewritten.
        """
        meta = self.load_metadata()
        models = meta["models"]
        # `>=` (rather than `==`) also trims a list that is already too long,
        # e.g. after `N_BACKUP` was lowered.
        while models and len(models) >= N_BACKUP:
            to_delete = models.pop(0)
            try:
                os.remove(self.path(to_delete))
            except FileNotFoundError:
                # The backup is already gone; nothing left to clean up.
                pass
        models.append(filename)
        with open(self.path("meta.json"), 'w+', encoding="utf-8") as f:
            json.dump(meta, f)

    def get_intents(self):
        """Return the intents corresponding to the labels of the model."""
        return [label_to_intent(label) for label in self.model.get_labels()]

    def scoop_model_params(self):
        """Return a dict with the training parameters of the loaded model."""
        train_parameters = [
            'lr', 'dim', 'ws', 'epoch', 'minCount', 'minCountLabel', 'minn',
            'maxn', 'neg', 'wordNgrams', 'bucket', 'lrUpdateRate', 't'
        ]
        args_getter = self.model.f.getArgs()
        parameters = {}
        for param in train_parameters:
            attr = getattr(args_getter, param)
            # NOTE(review): 'loss' is not listed in `train_parameters`, so
            # this conversion is currently never applied.
            if param == 'loss':
                attr = attr.name
            parameters[param] = attr
        return parameters

    def print_prop(self):
        """Print the names of the public attributes of the model arguments."""
        args = self.model.f.getArgs()
        keys2 = [a for a in dir(args) if not a.startswith('__')]
        print(keys2)

    def get_models(self):
        """Return the list of saved model file names, oldest first."""
        meta = self.load_metadata()
        return meta["models"]

    def save(self):
        """Save the current model under a random name and register it."""
        filename = "model-" + random_string(8) + ".bin"
        self.model.save_model(self.path(filename))
        self.push_model(filename)

    def meta_train(self):
        """Train with autotuning against a held-out evaluation file and save."""
        train_path, eval_path = self.utterances.generate_train_file(
            eval_count=EVAL_COUNT)
        self.model = fasttext.train_supervised(
            input=train_path,
            autotuneValidationFile=eval_path,
            autotuneModelSize="{}M".format(MAX_MODEL_SIZE))
        self.save()

    def load(self):
        """Load and return the most recently saved model, or None if none."""
        models = self.get_models()
        if len(models) == 0:
            return None
        return fasttext.load_model(self.path(models[-1]))

    def check_intent(self, intent):
        """Return True if `intent` only contains ``a-z``, ``0-9`` and ``_``."""
        # `fullmatch` rejects a trailing newline, which `$` would accept.
        return re.fullmatch(r"[a-z0-9_]+", intent) is not None

    def train(self, utterance):
        """Store `utterance` and make sure a training run is scheduled.

        `utterance` is a dict with at least the keys ``"utterance"`` and
        ``"intent"``; it is normalised in place and gets an ``"index"``.
        Returns ``(TRAIN.BAD_INTENT, TRAIN.NO_TRAIN)`` for an invalid intent
        name and ``(TRAIN.TRAIN_OK, TRAIN_OFFSET)`` otherwise.
        """
        utterance["index"] = random_string(INDEX_LENGTH)
        utterance["utterance"] = self.process_utterance_text(
            utterance["utterance"])
        print(utterance)
        intent = utterance["intent"].lower()
        if not self.check_intent(intent):
            return (TRAIN.BAD_INTENT, TRAIN.NO_TRAIN)
        utterance["intent"] = intent
        self.utterances.save_utterance(utterance)
        if not self.schedulued_training:
            self.schedulue_training()
        return (TRAIN.TRAIN_OK, TRAIN_OFFSET)

    def just_train(self):
        """Run an autotuned training immediately (see `meta_train`)."""
        self.meta_train()
class UnigramAcousticWordseg(object):
    """
    Unigram word segmentation of speech using acoustic word embeddings.

    Segmentation and sampling operations are carried out in this class.
    Segmentation results are mainly stored in `utterances`, which deals with
    all utterance-level information, but knows nothing about the acoustics.
    The `acoustic_model` deals with all the acoustic embedding operations.
    Blocked Gibbs sampling is used for inference. In the member functions, the
    index `i` generally refers to the index of an utterance.

    Parameters
    ----------
    am_class : e.g. `FBGMM`
    am_alpha : float
        Acoustic model parameter.
    am_K : int
        Acoustic model parameter. When seed assignments are given, this may
        be None, in which case it is derived from the seed clusters.
    am_param_prior : e.g. instance of `FixedVarPrior`
        The acoustic model prior on the mean and covariance parameters.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    seed_boundaries_dict : dict of list
        For every utterance, the seed boundaries, passed on to `Utterances`.
        If not given, no seeding is used.
        NOTE(review): the original documentation described this parameter
        twice, once as a list of (start inclusive, end exclusive) embedding
        slice index tuples of the seed tokens and once as a list of int giving
        seed boundaries in 10 ms units (same format as `landmarks_dict`);
        confirm the expected format against `Utterances`.
    seed_assignments_dict : dict of list of int
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. If not given, no seeding is used. Requires
        `seed_boundaries_dict`.
    covariance_type : str
        Passed on to `am_class`.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    beta_sent_boundary : float
        The symmetric Beta prior on the end of sentence probability; if this
        is set to -1, sentence boundary probabilities are not taken into
        account.
    lms : float
        Language model scaling factor (passed on to `am_class`).
    wip : float
        Word insertion penalty.
    fb_type : str
        The type of forward-backward algorithm to use:
        - "standard": The normal forward filtering backward sampling
          algorithm.
        - "viterbi": The Viterbi version of the forward backward algorithm,
          using MAP assignments instead of sampling segmentation of embedding
          component assignments.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model.
    time_power_term : float
        Scaling the per-frame scaling; with 1.2 instead of 1, we get less
        words (prefer longer words).

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : FBGMM or IGMM
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """

    def __init__(self, am_class, am_alpha, am_K, am_param_prior,
                 embedding_mats, vec_ids_dict, durations_dict, landmarks_dict,
                 seed_boundaries_dict=None, seed_assignments_dict=None,
                 covariance_type="fixed", n_slices_min=0, n_slices_max=20,
                 min_duration=0, p_boundary_init=0.5, beta_sent_boundary=2.0,
                 lms=1., wip=0., fb_type="standard",
                 init_am_assignments="rand", time_power_term=1.):
        # Local import: used to recognise integer seed labels on both
        # Python 2 and 3 (`long` only exists on Python 2).
        import numbers

        logger.info("Initializing")

        # Check parameters
        assert seed_assignments_dict is None or seed_boundaries_dict is not None

        # Initialize simple attributes (`lms` is only handed to `am_class`)
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.beta_sent_boundary = beta_sent_boundary
        self.wip = wip
        self.time_power_term = time_power_term
        self.set_fb_type(fb_type)

        # Process embeddings into a single matrix, and vec_ids into a list
        # (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict
            )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [
                seed_boundaries_dict[i] for i in ids_to_utterance_labels
                ]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks,
            seed_boundaries=seed_boundaries,
            p_boundary_init=p_boundary_init,
            n_slices_min=n_slices_min,
            n_slices_max=n_slices_max,
            min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]

        # Provide the initial acoustic model assignments and initialize the
        # model accordingly
        assignments = -1 * np.ones(N, dtype=int)
        if seed_assignments_dict is not None:

            # Use seed assignments if provided
            logger.info("Using seed assignments")
            self.seed_to_cluster = {}
            i_cluster = 0
            for i_utt, utt in enumerate(ids_to_utterance_labels):
                utt_init_embeds = np.array(
                    self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
                utt_init_assignments = np.array(seed_assignments_dict[utt][:])
                utt_init_assignments = utt_init_assignments[
                    np.where(utt_init_embeds != -1)]
                utt_init_embeds = utt_init_embeds[
                    np.where(utt_init_embeds != -1)]
                for seed in utt_init_assignments:
                    if seed not in self.seed_to_cluster:
                        if isinstance(seed, numbers.Integral):
                            # Integer seeds are used as cluster IDs directly
                            self.seed_to_cluster[seed] = seed
                        else:
                            # Other labels get the next free running index
                            self.seed_to_cluster[seed] = i_cluster
                            i_cluster += 1
                utt_init_assignments = [
                    self.seed_to_cluster[i] for i in utt_init_assignments
                    ]
                assignments[utt_init_embeds] = utt_init_assignments
            if am_K is None:
                am_K = max(self.seed_to_cluster.values()) + 1
            else:
                assert am_K >= max(self.seed_to_cluster.values()) + 1

            # Initialize `acoustic_model`
            self.acoustic_model = am_class(
                embeddings, am_param_prior, am_alpha, am_K, assignments,
                covariance_type=covariance_type, lms=lms
                )

        elif init_am_assignments == "rand":

            # Assign each of the above embeddings randomly to one of the
            # `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(
                0, am_K, len(init_embeds))

            # Make sure we have consecutive values
            for k in range(init_embeds_assignments.max()):
                while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                    init_embeds_assignments[
                        np.where(init_embeds_assignments > k)] -= 1
                if init_embeds_assignments.max() == k:
                    break
            assignments[init_embeds] = init_embeds_assignments

            # Initialize `acoustic_model`
            self.acoustic_model = am_class(
                embeddings, am_param_prior, am_alpha, am_K, assignments,
                covariance_type=covariance_type, lms=lms
                )

        elif init_am_assignments == "one-by-one":

            # Initialize `acoustic_model` with everything unassigned
            logger.info("Using a one-by-one initial assignment")
            self.acoustic_model = am_class(
                embeddings, am_param_prior, am_alpha, am_K, assignments,
                covariance_type=covariance_type, lms=lms
                )

            # Assign the embeddings one-by-one
            for i_embed in init_embeds:
                self.acoustic_model.gibbs_sample_inside_loop_i(i_embed)

        else:
            assert False, "invalid value for `init_am_assignments`: " + init_am_assignments

    def set_fb_type(self, fb_type):
        """Set `fb_type` and the matching forward-backward function."""
        self.fb_type = fb_type

        # Assign forward-backward function
        if fb_type == "standard":
            self.fb_func = forward_backward
        elif fb_type == "viterbi":
            self.fb_func = forward_backward_viterbi
        else:
            assert False, "invalid `fb_type`: " + fb_type

    def _log_debug_statistics(self, i, header):
        """Log `header` and the segmentation statistics of utterance `i`."""
        logger.debug(header)
        log_margs = [
            self.acoustic_model.log_marg_i(j)
            for j in self.utterances.get_segmented_embeds_i(i) if j != -1
            ]
        embeddings = self.utterances.get_segmented_embeds_i(i)
        lengths = self.utterances.get_segmented_durations_i(i)
        logger.debug("Embeddings: " + str(embeddings))
        logger.debug(
            "Utterance embeddings: " +
            str(self.utterances.get_original_segmented_embeds_i(i)))
        logger.debug(
            "Landmark indices: " +
            str(self.utterances.get_segmented_landmark_indices(i)))
        logger.debug("Durations: " + str(lengths))
        logger.debug("log_margs: " + str(log_margs))
        logger.debug("sum(log_margs*lengths): " +
                     str(np.sum(log_margs * np.array(lengths))))
        logger.debug("log p(X): " + str(self.acoustic_model.log_marg()))

    def gibbs_sample_i(self, i, anneal_temp=1, anneal_gibbs_am=False):
        """
        Block Gibbs sample new boundaries and embedding assignments for
        utterance `i`.

        Parameters
        ----------
        i : int
            Index of the utterance.
        anneal_temp : float
            Passed on to the forward-backward function.
        anneal_gibbs_am : bool
            If True, `anneal_temp` is also used when sampling the component
            assignments; otherwise a temperature of 1 is used there.

        Return
        ------
        log_prob : float
        """

        # Debug trace
        logger.debug("Gibbs sampling utterance: " + str(i))
        if i == i_debug_monitor:
            logger.debug("-" * 39)
            logger.debug("log p(X) before sampling: " +
                         str(self.acoustic_model.log_marg()))
            logger.debug("Unsupervised transcript before sampling: " +
                         str(self.get_unsup_transcript_i(i)))

        # Remove embeddings from utterance `i` from the `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # Don't remove a non-embedding (would accidentally remove the
                # last embedding)
                continue
            self.acoustic_model.components.del_item(i_embed)

        # Get the log probabilities of the embeddings. Floor division keeps
        # the slice bound an integer.
        N = self.utterances.lengths[i]
        n_vecs = (N**2 + N) // 2
        vec_embed_log_probs = self.get_vec_embed_log_probs(
            self.utterances.vec_ids[i, :n_vecs],
            self.utterances.durations[i, :n_vecs]
            )

        # Debug trace
        if i == i_debug_monitor:
            self._log_debug_statistics(
                i,
                "Statistics before sampling, but after removing, is given below"
                )

        # Draw new boundaries for utterance `i`
        log_p_continue = math.log(self.calc_p_continue())
        log_prob, self.utterances.boundaries[i, :N] = self.fb_func(
            vec_embed_log_probs, log_p_continue, N, self.n_slices_min,
            self.n_slices_max, i, anneal_temp
            )

        # Debug trace
        if i == i_debug_monitor:
            self._log_debug_statistics(
                i,
                "Statistics after sampling, but before adding new embeddings to `acoustic_model`"
                )

        # Assign new embeddings to components in `acoustic_model`
        for i_embed in self.utterances.get_segmented_embeds_i(i):
            if i_embed == -1:
                # This only happens because of backtracking in the
                # forward-backward functions; don't assign a non-embedding
                continue
            if self.fb_type == "standard":
                if anneal_gibbs_am:
                    self.acoustic_model.gibbs_sample_inside_loop_i(
                        i_embed, anneal_temp)
                else:
                    self.acoustic_model.gibbs_sample_inside_loop_i(
                        i_embed, anneal_temp=1)
            elif self.fb_type == "viterbi":
                self.acoustic_model.map_assign_i(i_embed)

        # Debug trace
        if i == i_debug_monitor:
            logger.debug("log p(X) after sampling: " +
                         str(self.acoustic_model.log_marg()))
            logger.debug("Unsupervised transcript after sampling: " +
                         str(self.get_unsup_transcript_i(i)))
            logger.debug("-" * 39)

        return log_prob

    def gibbs_sample(self, n_iter, am_n_iter=0, anneal_schedule=None,
                     anneal_start_temp_inv=0.1, anneal_end_temp_inv=1,
                     n_anneal_steps=-1, anneal_gibbs_am=False):
        """
        Perform blocked Gibbs sampling on all utterances.

        Parameters
        ----------
        n_iter : int
            Number of Gibbs sampling iterations of segmentation.
        am_n_iter : int
            Number of acoustic model Gibbs sampling iterations inbetween
            segmentation sampling iterations.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant temperature of `anneal_end_temp_inv` is used
              throughout; if `anneal_end_temp_inv` is left at default (1),
              then this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop
              early).
        anneal_gibbs_am : bool
            See `gibbs_sample_i`.

        Return
        ------
        record_dict : dict
            Contains several fields describing the sampling process. Each
            field is described by its key and statistics are given in a list
            which covers the Gibbs sampling iterations.
        """

        logger.info("Gibbs sampling for " + str(n_iter) + " iterations")
        logger.debug("Monitoring utterance " +
                     self.ids_to_utterance_labels[i_debug_monitor] +
                     " (index=" + str(i_debug_monitor) + ")")

        # Setup annealing iterator
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1. / np.linspace(
                anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert not n_anneal_steps == -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
                )
            n_iter_per_step = int(round(float(n_iter) / n_anneal_steps))
            anneal_list = np.linspace(
                anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1. / anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)
        else:
            # Fail here instead of with a NameError in the sampling loop
            assert False, "invalid `anneal_schedule`: " + str(anneal_schedule)

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        record_dict["log_marg"] = []
        record_dict["log_marg*length"] = []
        record_dict["log_prob_z"] = []
        record_dict["log_prob_X_given_z"] = []
        record_dict["anneal_temp"] = []
        record_dict["components"] = []
        record_dict["n_tokens"] = []

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Perform intermediate acoustic model re-sampling
            if am_n_iter > 0:
                self.acoustic_model.gibbs_sample(
                    am_n_iter, consider_unassigned=False)

            # Get anneal temperature.
            # NOTE(review): once the schedule is exhausted (or without a
            # schedule) `anneal_end_temp_inv` itself is used as temperature,
            # while the scheduled values are 1/inverse-temperature; both
            # agree only for the default of 1. Confirm this is intended.
            anneal_temp = next(get_anneal_temp, anneal_end_temp_inv)

            # Loop over utterances in random order. An explicit list is
            # needed because `random.shuffle` works in place.
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if debug_gibbs_only:
                utt_order = [i_debug_monitor]
            log_prob = 0
            for i_utt in utt_order:
                log_prob += self.gibbs_sample_i(
                    i_utt, anneal_temp, anneal_gibbs_am)

            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["log_marg"].append(self.acoustic_model.log_marg())
            record_dict["log_marg*length"].append(log_prob)
            record_dict["log_prob_z"].append(self.acoustic_model.log_prob_z())
            record_dict["log_prob_X_given_z"].append(
                self.acoustic_model.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(
                self.acoustic_model.get_n_assigned())

            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

        return record_dict

    def get_vec_embed_log_probs(self, vec_ids, durations):
        """
        Return the log marginal probs of the `vec_ids` embeddings, scaled by
        the given `durations`.

        Entries with a vector ID of -1 or a NaN duration get a log
        probability of -inf. The word insertion penalty `wip` is added to
        every entry.
        """

        # Get marginals
        vec_embed_log_probs = -np.inf * np.ones(len(vec_ids))
        for i, embed_id in enumerate(vec_ids):
            if embed_id == -1:
                continue
            vec_embed_log_probs[i] = self.acoustic_model.log_marg_i(embed_id)

            # Scale log marginals by number of frames
            if np.isnan(durations[i]):
                vec_embed_log_probs[i] = -np.inf
            else:
                vec_embed_log_probs[i] *= durations[i]**self.time_power_term

        return vec_embed_log_probs + self.wip

    def calc_p_continue(self):
        """
        Return the probability of not having an utterance break.

        It is assumed that the number of utterances are one less than the
        total number, since the current utterance is excluded from the
        calculation.

        NOTE: the branch for `beta_sent_boundary != -1` is guarded by an
        `assert False` ("to check"), so at the moment only a value of -1
        (giving a probability of 1.0) can be used.
        """
        if self.beta_sent_boundary != -1:
            assert False, "to check"
            # Number of assigned tokens
            n_tokens = sum(self.acoustic_model.components.counts)
            n_sentences = self.utterances.D - 1
            n_continue = n_tokens - n_sentences
            p_continue = (
                (n_continue + self.beta_sent_boundary / 2.0) /
                (n_tokens + self.beta_sent_boundary)
                )
        else:
            p_continue = 1.0
        return p_continue

    def get_unsup_transcript_i(self, i):
        """Return a list of the components for current segmentation of `i`."""
        return list(
            self.acoustic_model.components.get_assignments(
                self.utterances.get_segmented_embeds_i(i))
            )

    def get_log_margs_i(self, i):
        """
        Get the log marginals for the current segmentation of utterance `i`.

        The segments from utterance `i` is removed and then added back in.
        This function is used for monitoring and post-processing.
        """

        # Remove embeddings from utterance `i` from the `acoustic_model`
        segmented_embeds = self.utterances.get_segmented_embeds_i(i)
        assignments = self.acoustic_model.components.get_assignments(
            segmented_embeds)
        for i_embed in segmented_embeds:
            if i_embed == -1:
                # Don't remove a non-embedding (would accidentally remove the
                # last embedding)
                continue
            self.acoustic_model.components.del_item(i_embed)

        log_margs = [
            self.acoustic_model.log_marg_i(j)
            for j in segmented_embeds if j != -1
            ]

        # Add the embeddings back into the model. Non-embeddings (-1) were
        # never removed above and must not be added either.
        for embed, assignment in zip(segmented_embeds, assignments):
            if embed == -1:
                continue
            self.acoustic_model.components.add_item(embed, assignment)

        return log_margs
def __init__(self, am_class, am_alpha, am_K, am_param_prior,
             embedding_mats, vec_ids_dict, durations_dict, landmarks_dict,
             seed_boundaries_dict=None, seed_assignments_dict=None,
             covariance_type="fixed", n_slices_min=0, n_slices_max=20,
             min_duration=0, p_boundary_init=0.5, beta_sent_boundary=2.0,
             lms=1., wip=0., fb_type="standard",
             init_am_assignments="rand", time_power_term=1.):
    """
    Set up the utterances and the acoustic model of the segmenter.

    The embeddings are combined by `process_embeddings`, `utterances` is
    created from the landmarks, durations and (optional) seed boundaries,
    and `acoustic_model` is created as an `am_class` instance using one of
    three initial assignments: seed assignments (when
    `seed_assignments_dict` is given, which requires `seed_boundaries_dict`),
    random assignments (`init_am_assignments == "rand"`) or one-by-one
    sampling (`init_am_assignments == "one-by-one"`).

    With seed assignments `am_K` may be None, in which case it is derived
    from the seed clusters.
    """
    # Local import: used to recognise integer seed labels on both Python 2
    # and 3 (`long` only exists on Python 2).
    import numbers

    logger.info("Initializing")

    # Check parameters
    assert seed_assignments_dict is None or seed_boundaries_dict is not None

    # Initialize simple attributes (`lms` is only handed to `am_class`)
    self.n_slices_min = n_slices_min
    self.n_slices_max = n_slices_max
    self.beta_sent_boundary = beta_sent_boundary
    self.wip = wip
    self.time_power_term = time_power_term
    self.set_fb_type(fb_type)

    # Process embeddings into a single matrix, and vec_ids into a list
    # (entry for each utterance)
    embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
        embedding_mats, vec_ids_dict
        )
    self.ids_to_utterance_labels = ids_to_utterance_labels
    N = embeddings.shape[0]

    # Initialize `utterances`
    if seed_boundaries_dict is not None:
        seed_boundaries = [
            seed_boundaries_dict[i] for i in ids_to_utterance_labels
            ]
    else:
        seed_boundaries = None
    lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
    landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
    durations = [durations_dict[i] for i in ids_to_utterance_labels]
    self.utterances = Utterances(
        lengths, vec_ids, durations, landmarks,
        seed_boundaries=seed_boundaries,
        p_boundary_init=p_boundary_init,
        n_slices_min=n_slices_min,
        n_slices_max=n_slices_max,
        min_duration=min_duration
        )

    # Find all the embeddings that are in the initial segmentation
    init_embeds = []
    for i in range(self.utterances.D):
        init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
    init_embeds = np.array(init_embeds, dtype=int)
    init_embeds = init_embeds[np.where(init_embeds != -1)]

    # Provide the initial acoustic model assignments and initialize the
    # model accordingly
    assignments = -1 * np.ones(N, dtype=int)
    if seed_assignments_dict is not None:

        # Use seed assignments if provided
        logger.info("Using seed assignments")
        self.seed_to_cluster = {}
        i_cluster = 0
        for i_utt, utt in enumerate(ids_to_utterance_labels):
            utt_init_embeds = np.array(
                self.utterances.get_segmented_embeds_i(i_utt), dtype=int)
            utt_init_assignments = np.array(seed_assignments_dict[utt][:])
            utt_init_assignments = utt_init_assignments[
                np.where(utt_init_embeds != -1)]
            utt_init_embeds = utt_init_embeds[
                np.where(utt_init_embeds != -1)]
            for seed in utt_init_assignments:
                if seed not in self.seed_to_cluster:
                    if isinstance(seed, numbers.Integral):
                        # Integer seeds are used as cluster IDs directly
                        self.seed_to_cluster[seed] = seed
                    else:
                        # Other labels get the next free running index
                        self.seed_to_cluster[seed] = i_cluster
                        i_cluster += 1
            utt_init_assignments = [
                self.seed_to_cluster[i] for i in utt_init_assignments
                ]
            assignments[utt_init_embeds] = utt_init_assignments
        if am_K is None:
            am_K = max(self.seed_to_cluster.values()) + 1
        else:
            assert am_K >= max(self.seed_to_cluster.values()) + 1

        # Initialize `acoustic_model`
        self.acoustic_model = am_class(
            embeddings, am_param_prior, am_alpha, am_K, assignments,
            covariance_type=covariance_type, lms=lms
            )

    elif init_am_assignments == "rand":

        # Assign each of the above embeddings randomly to one of the `am_K`
        # clusters
        logger.info("Using random initial component assignments")
        init_embeds_assignments = np.random.randint(
            0, am_K, len(init_embeds))

        # Make sure we have consecutive values
        for k in range(init_embeds_assignments.max()):
            while len(np.nonzero(init_embeds_assignments == k)[0]) == 0:
                init_embeds_assignments[
                    np.where(init_embeds_assignments > k)] -= 1
            if init_embeds_assignments.max() == k:
                break
        assignments[init_embeds] = init_embeds_assignments

        # Initialize `acoustic_model`
        self.acoustic_model = am_class(
            embeddings, am_param_prior, am_alpha, am_K, assignments,
            covariance_type=covariance_type, lms=lms
            )

    elif init_am_assignments == "one-by-one":

        # Initialize `acoustic_model` with everything unassigned
        logger.info("Using a one-by-one initial assignment")
        self.acoustic_model = am_class(
            embeddings, am_param_prior, am_alpha, am_K, assignments,
            covariance_type=covariance_type, lms=lms
            )

        # Assign the embeddings one-by-one
        for i_embed in init_embeds:
            self.acoustic_model.gibbs_sample_inside_loop_i(i_embed)

    else:
        assert False, "invalid value for `init_am_assignments`: " + init_am_assignments
class ESKmeans(object):
    """
    Embedded segmental K-means.

    Segmentation and clustering are carried out using this class. Variables
    related to the segmentation are stored in the `utterances` attribute, which
    deals with all utterance-level information but knows nothing about the
    acoustics. The `acoustic_model` attribute deals with all the acoustic
    embedding operations. In member functions, index `i` generally refers to
    the index of an utterance.

    Parameters
    ----------
    K_max : int
        Maximum number of components.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    init_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined: "rand" assigns data vectors randomly and "spread" makes an
        attempt to spread data vectors evenly over the components. Any other
        value (including "each-in-own", which is not implemented here) leaves
        every embedding unassigned (-1).
    wip : float
        Word insertion penalty.

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.X`.
    acoustic_model : KMeans
        Knows nothing about utterance-level information. All embeddings are
        stored in this class in its `X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """

    def __init__(self, K_max, embedding_mats, vec_ids_dict, durations_dict,
                 landmarks_dict, n_slices_min=0, n_slices_max=20,
                 min_duration=0, p_boundary_init=0.5,
                 init_assignments="rand", wip=0):

        # Attributes from parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list
        # (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict)
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Embeddings in the initial segmentation (-1 marks a non-embedding)
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        print("No. initial embeddings: {}".format(init_embeds.shape[0]))
        print(init_embeds)

        # Initialize the K-means components; -1 means "unassigned"
        assignments = -1 * np.ones(N, dtype=int)
        if init_assignments == "rand":
            assignments[init_embeds] = np.random.randint(
                0, K_max, len(init_embeds))
        elif init_assignments == "spread":
            n_init_embeds = len(init_embeds)
            n_repeats = int(np.ceil(float(n_init_embeds) / K_max))
            # `range` must be materialised: a Python 3 range object supports
            # neither repetition with `*` nor in-place shuffling.
            assignment_list = (list(range(K_max)) * n_repeats)[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list)
        # NOTE(review): any other `init_assignments` value falls through
        # silently and the model starts with nothing assigned -- confirm
        # whether that is intended or should be an error.
        self.acoustic_model = KMeans(embeddings, K_max, assignments)

    def save(self, f):
        """Save the acoustic model by delegating to `acoustic_model.save(f)`."""
        # NOTE(review): `utterances` is not saved here although `load` calls
        # `self.utterances.load(f)` -- confirm this asymmetry is intended.
        self.acoustic_model.save(f)

    def load(self, f):
        """Load the acoustic model and then the utterances from `f`."""
        self.acoustic_model.load(f)
        self.utterances.load(f)

    # ------------------------------------------------------------------
    # Private debug/record helpers
    # ------------------------------------------------------------------

    def _print_debug_before_i(self, i):
        """Print the pre-sampling debug header for utterance `i`."""
        print("-" * 79)
        print("Statistics before sampling")
        print("sum_neg_sqrd_norm before sampling: " +
              str(self.acoustic_model.sum_neg_sqrd_norm()))
        print("Unsupervised transcript: " +
              str(self.get_unsup_transcript_i(i)))
        print("Unsupervised max transcript: " +
              str(self.get_max_unsup_transcript_i(i)))

    def _print_debug_stats_i(self, i, print_lengths=False):
        """Print per-segment statistics for the stored segmentation of `i`.

        If `print_lengths` is set, the raw segment durations are echoed on
        their own line first (this is what `segment_i` has always printed).
        """
        neg_sqrd_norms = [
            self.acoustic_model.max_neg_sqrd_norm_i(j)
            for j in self.utterances.get_segmented_embeds_i(i) if j != -1
        ]
        embeddings = self.utterances.get_segmented_embeds_i(i)
        lengths = self.utterances.get_segmented_durations_i(i)
        if print_lengths:
            print(lengths)
        print("Embeddings: " + str(embeddings))
        print("Utterance embeddings: " +
              str(self.utterances.get_original_segmented_embeds_i(i)))
        print("Landmark indices: " +
              str(self.utterances.get_segmented_landmark_indices(i)))
        print("Durations: " +
              str(self.utterances.get_segmented_durations_i(i)))
        print("neg_sqrd_norms: " + str(neg_sqrd_norms))
        print("neg_len_sqrd_norms: " +
              str(neg_sqrd_norms * np.array(lengths)))
        print("sum_neg_len_sqrd_norms: " +
              str(np.sum(neg_sqrd_norms * np.array(lengths))))

    def _print_debug_after_i(self, i):
        """Print the post-sampling debug footer for utterance `i`."""
        print("sum_neg_sqrd_norm after sampling: " +
              str(self.acoustic_model.sum_neg_sqrd_norm()))
        print("Unsupervised transcript after sampling: " +
              str(self.get_unsup_transcript_i(i)))
        print("-" * 79)

    @staticmethod
    def _new_record_dict():
        """Return an empty record dictionary with one list per statistic."""
        record_dict = {}
        record_dict["sum_neg_sqrd_norm"] = []
        record_dict["sum_neg_len_sqrd_norm"] = []
        record_dict["components"] = []
        record_dict["sample_time"] = []
        record_dict["n_tokens"] = []
        return record_dict

    def _record_iteration(self, record_dict, i_iter, sample_time,
                          sum_neg_len_sqrd_norm):
        """Append this iteration's statistics to `record_dict` and print them."""
        record_dict["sample_time"].append(sample_time)
        record_dict["sum_neg_sqrd_norm"].append(
            self.acoustic_model.sum_neg_sqrd_norm())
        record_dict["sum_neg_len_sqrd_norm"].append(sum_neg_len_sqrd_norm)
        record_dict["components"].append(self.acoustic_model.K)
        record_dict["n_tokens"].append(self.acoustic_model.get_n_assigned())

        info = "Iteration: " + str(i_iter)
        for key in sorted(record_dict):
            info += ", " + key + ": " + str(record_dict[key][-1])
        print(info)

    # ------------------------------------------------------------------
    # Segmentation
    # ------------------------------------------------------------------

    def segment_i(self, i):
        """
        Segment new boundaries and cluster new segments for utterance `i`.

        The boundaries stored in `utterances` and the assignments in
        `acoustic_model` are both updated.

        Return
        ------
        sum_neg_len_sqrd_norm : float
            The length-weighted K-means objective for this utterance.
        """

        # Debug trace
        if DEBUG > 0:
            print("Segmenting utterance: " + str(i))
            if i == I_DEBUG_MONITOR:
                self._print_debug_before_i(i)

        # The embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings; only the first N(N + 1)/2 entries
        # of the utterance's `vec_ids`/`durations` rows are used
        N = self.utterances.lengths[i]
        n_vecs = (N**2 + N) // 2
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :n_vecs],
            self.utterances.durations[i, :n_vecs])

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print("vec_embed_neg_len_sqrd_norms: " +
                  str(vec_embed_neg_len_sqrd_norms))
            self._print_debug_stats_i(i, print_lengths=True)

        # Draw new boundaries for utterance i
        sum_neg_len_sqrd_norm, new_boundaries = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min,
            self.n_slices_max, i)
        self.utterances.boundaries[i, :N] = new_boundaries

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print(
                "Statistics after sampling, but before adding new embeddings to acoustic model"
            )
            self._print_debug_stats_i(i, print_lengths=True)

        # Remove old embeddings and add new ones; this is equivalent to
        # assigning the new embeddings and updating the means.
        new_embeds = self.utterances.get_segmented_embeds_i(i)
        new_k = self.get_max_unsup_transcript_i(i)
        for i_embed in old_embeds:
            if i_embed == -1:
                # don't remove a non-embedding (would accidentally remove the
                # last embedding)
                continue
            self.acoustic_model.del_item(i_embed)
        for i_embed, k in zip(new_embeds, new_k):
            self.acoustic_model.add_item(i_embed, k)
        self.acoustic_model.clean_components()

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            self._print_debug_after_i(i)

        # technically, this is with the old means (before updating, above)
        return sum_neg_len_sqrd_norm

    def segment(self, n_iter, n_iter_inbetween_kmeans=0):
        """
        Perform segmentation of all utterances and update the K-means model.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of K-means iterations inbetween segmentation iterations.

        Return
        ------
        record_dict : dict
            Contains several fields describing the optimization iterations.
            Each field is described by its key and statistics are given in a
            list covering the iterations.
        """

        # Debug trace
        print("Segmenting for {} iterations".format(n_iter))
        if DEBUG > 0:
            print("Monitoring utterance {} (index={:d})".format(
                self.ids_to_utterance_labels[I_DEBUG_MONITOR],
                I_DEBUG_MONITOR))

        # Setup record dictionary
        record_dict = self._new_record_dict()

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Loop over utterances in random order
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if SEGMENT_DEBUG_ONLY:
                utt_order = [I_DEBUG_MONITOR]
            sum_neg_len_sqrd_norm = 0
            for i_utt in utt_order:
                sum_neg_len_sqrd_norm += self.segment_i(i_utt)

            self._record_iteration(record_dict, i_iter,
                                   time.time() - start_time,
                                   sum_neg_len_sqrd_norm)

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(n_iter_inbetween_kmeans,
                                        consider_unassigned=False)

        return record_dict

    def segment_only_i(self, i):
        """
        Segment new boundaries for utterance `i`, without cluster assignment.

        Although cluster assignments are not updated, the cluster assignments
        are determined and returned (but the `acoustic_model` is not updated).
        The boundaries stored in `utterances` are not modified either; the
        caller is expected to apply the returned update.

        Return
        ------
        i, sum_neg_len_sqrd_norm, new_boundaries, old_embeds, new_embeds, new_k : (int, float, vector, list, list, list)
            The utterance index; the length-weighted K-means objective for
            this utterance; newly segmented boundaries; embeddings before
            segmentation; new embeddings after segmentation; new embedding
            assignments.
        """

        # Debug trace
        if DEBUG > 0:
            print("Segmenting utterance: " + str(i))
            if i == I_DEBUG_MONITOR:
                self._print_debug_before_i(i)

        # The embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings. Floor division keeps the slice
        # bound an integer (true division would give a float on Python 3).
        N = self.utterances.lengths[i]
        n_vecs = (N**2 + N) // 2
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :n_vecs],
            self.utterances.durations[i, :n_vecs])

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print("vec_embed_neg_len_sqrd_norms: " +
                  str(vec_embed_neg_len_sqrd_norms))
            self._print_debug_stats_i(i)

        # Draw new boundaries for utterance i (not written to `utterances`)
        sum_neg_len_sqrd_norm, new_boundaries = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min,
            self.n_slices_max, i)

        # Debug trace. The stored boundaries are unchanged at this point, so
        # these statistics still describe the previous segmentation.
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            print(
                "Statistics after sampling, but before adding new embeddings to acoustic model"
            )
            self._print_debug_stats_i(i)

        # Determine the embeddings and best components under the new
        # boundaries, leaving the acoustic model untouched
        new_embeds = self.utterances.get_segmented_embeds_i_bounds(
            i, new_boundaries)
        new_k = self.get_max_unsup_transcript_i_embeds(i, new_embeds)

        # Debug trace
        if DEBUG > 0 and i == I_DEBUG_MONITOR:
            self._print_debug_after_i(i)

        return (i, sum_neg_len_sqrd_norm, new_boundaries, old_embeds,
                new_embeds, new_k)

    def segment_parallel(self, n_iter, n_iter_inbetween_kmeans=0, n_cpus=1,
                         n_batches=1):
        """
        Perform segmentation of all utterances and update the K-means model.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of K-means iterations inbetween segmentation iterations.
        n_cpus : int
            Number of parallel processes.
        n_batches : int
            Over each batch, an update is made.

        Return
        ------
        record_dict : dict
            Contains several fields describing the optimization iterations.
            Each field is described by its key and statistics are given in a
            list covering the iterations.
        """

        # Debug trace
        print("Segmenting for {} iterations".format(n_iter))
        if DEBUG > 0:
            print("Monitoring utterance {} (index={:d})".format(
                self.ids_to_utterance_labels[I_DEBUG_MONITOR],
                I_DEBUG_MONITOR))

        # Setup record dictionary
        record_dict = self._new_record_dict()

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Determine utterance order; a list is needed because
            # `random.shuffle` works in place on a mutable sequence
            utt_global_order = list(range(self.utterances.D))
            random.shuffle(utt_global_order)
            n_batch_size = int(
                np.ceil(len(utt_global_order) / float(n_batches)))

            # Perform segmentation over batches
            sum_neg_len_sqrd_norm = 0
            for i_batch in range(n_batches):
                batch_start = n_batch_size * i_batch
                batch_end = n_batch_size * (i_batch + 1)
                utt_order = utt_global_order[batch_start:batch_end]

                # Segment in parallel: every worker gets an interleaved slice
                utt_batches = [utt_order[i::n_cpus] for i in range(n_cpus)]
                updates = Parallel(n_jobs=n_cpus)(
                    delayed(local_segment_only_utts)(self, utts)
                    for utts in utt_batches)

                # Aggregate updates
                updates = [item for sublist in updates
                           for item in sublist]  # flatten
                old_embeds = []
                new_embeds = []
                new_k = []
                for (i_utt, cur_sum_neg_len_sqrd_norm, cur_new_bounds,
                     cur_old_embeds, cur_new_embeds, cur_new_k) in updates:
                    sum_neg_len_sqrd_norm += cur_sum_neg_len_sqrd_norm
                    old_embeds.extend(cur_old_embeds)
                    new_embeds.extend(cur_new_embeds)
                    new_k.extend(cur_new_k)
                    N = self.utterances.lengths[i_utt]
                    self.utterances.boundaries[i_utt, :N] = cur_new_bounds

                # Remove old embeddings and add new ones; this is equivalent
                # to assigning the new embeddings and updating the means.
                for i_embed in old_embeds:
                    if i_embed == -1:
                        # don't remove a non-embedding (would accidentally
                        # remove the last embedding)
                        continue
                    self.acoustic_model.del_item(i_embed)
                for i_embed, k in zip(new_embeds, new_k):
                    self.acoustic_model.add_item(i_embed, k)
                self.acoustic_model.clean_components()

            self._record_iteration(record_dict, i_iter,
                                   time.time() - start_time,
                                   sum_neg_len_sqrd_norm)

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(n_iter_inbetween_kmeans,
                                        consider_unassigned=False)

        return record_dict

    def get_vec_embed_neg_len_sqrd_norms(self, vec_ids, durations):
        """
        Return the duration-scaled score of every embedding in `vec_ids`.

        For each valid embedding the score is
        `acoustic_model.max_neg_sqrd_norm_i(embed_id) * duration`. Entries
        whose ID or duration is -1 score `-inf`. The word insertion penalty
        `wip` is added to every entry before returning.
        """

        # Get scores; invalid entries stay at -inf
        vec_embed_neg_len_sqrd_norms = -np.inf * np.ones(len(vec_ids))
        for i, embed_id in enumerate(vec_ids):
            if embed_id == -1:
                continue
            vec_embed_neg_len_sqrd_norms[i] = (
                self.acoustic_model.max_neg_sqrd_norm_i(embed_id))

            # Scale the score by the number of frames
            if durations[i] == -1:
                vec_embed_neg_len_sqrd_norms[i] = -np.inf
            else:
                vec_embed_neg_len_sqrd_norms[i] *= durations[i]

        return vec_embed_neg_len_sqrd_norms + self.wip

    def get_unsup_transcript_i(self, i):
        """
        Return a list of the current component assignments for the current
        segmentation of `i`.
        """
        return list(
            self.acoustic_model.get_assignments(
                self.utterances.get_segmented_embeds_i(i)))

    def get_max_unsup_transcript_i(self, i):
        """
        Return a list of the best components for current segmentation of `i`.
        """
        return self.acoustic_model.get_max_assignments(
            self.utterances.get_segmented_embeds_i(i))

    def get_max_unsup_transcript_i_embeds(self, i, embeddings):
        """
        Return a list of the best components for the given embeddings of `i`.
        """
        return self.acoustic_model.get_max_assignments(embeddings)
class BigramAcousticWordseg(object): """ Unigram word segmentation of speech using acoustic word embeddings. Segmentation and sampling operations are carried out in this class. Segmentation results are mainly stored in `utterances`, which deals with all utterance-level information, but knows nothing about the acoustics. The `acoustic_model` deals with all the acoustic embedding operations. Blocked Gibbs sampling is used for inference. In the member functions, the index `i` generally refers to the index of an utterance. Parameters ---------- am_K : int Acoustic model parameter. am_param_prior : e.g. instance of `FixedVarPrior` The acoustic model prior on the mean and covariance parameters. lm_params : dict A dictionary with at least an entry for "type", which can be "maxlikelihood", and the other entries giving the hyperparameters for that particular kind of language model. embedding_mats : dict of matrix The matrices of embeddings for every utterance. vec_ids_dict : dict of vector of int For every utterance, the vector IDs (see `Utterances`). landmarks_dict : dict of list of int For every utterance, the landmark points at which word boundaries are considered, given in the number of frames (10 ms units) from the start of each utterance. There is an implicit landmark at the start of every utterance. durations_dict : dict of vector of int The shape of this dict is the same as that of `vec_ids_dict`, but here the duration (in frames) of each of the embeddings are given. seed_boundaries_dict : dict of list of tuple Every tuple is the start (inclusive) and end (exclusive) embedding slice index of a seed token, giving its boundaries. If not given, no seeding is used. seed_assignments_dict : dict of list of int Every int is a cluster assignment for the corresponding seed token in `seed_boundaries_dict`. If not given, no seeding is used. seed_boundaries_dict : dict of list of int For every utterance, seed boundaries in 10 ms units (same format as `landmarks_dict`). 
If not given, no seeding is used. seed_assignments_dict : dict of list of int Every int is a cluster assignment for the corresponding seed token in `seed_boundaries_dict`. If not given, no seeding is used. n_slices_min : int The minimum number of landmarks over which an embedding can be calculated. n_slices_max : int The maximum number of landmarks over which an embedding can be calculated. min_duration : int Minimum duration of a segment. p_boundary_init : float See `Utterances`. beta_sent_boundary : float The symmetric Beta prior on the end of sentence probability; if this is set to -1, sentence boundary probabilities are not taken into account. lms : float Language model scaling factor. wip : float Word insertion penalty. fb_type : str The type of forward-backward algorithm to use: - "unigram": In this case, segmentation is carried out as it is done in the unigram case; i.e. only assignments are sampled using the bigram model. - "bigram": Sample assignments using the bigram language model. init_am_assignments : str This setting determines how the initial acoustic model assignments are determined: - "rand": Randomly assigned. - "one-by-one": Data vectors are added one at a time to the acoustic model. time_power_term : float Scaling the per-frame scaling; with 1.2 instead of 1, we get less words (prefer longer words). Attributes ---------- utterances : Utterances Knows nothing about the acoustics. The indices in the `vec_ids` attribute refers to the embedding at the corresponding row in `acoustic_model.components.X`. acoustic_model : BigramFBGMM Knows nothing about utterance-level information. All embeddings are stored in this class as the data `components.X` attribute. ids_to_utterance_labels : list of str Keeps track of utterance labels for a specific utterance ID. unigram_counts : Kx1 vector of int Counts for each of the K components. bigram_counts : KxK matrix of int Element (j, i) is the count N_i_given_j of the component i following the component j. 
""" def __init__(self, am_K, am_param_prior, lm_params, embedding_mats, vec_ids_dict, durations_dict, landmarks_dict, seed_boundaries_dict=None, seed_assignments_dict=None, covariance_type="fixed", n_slices_min=0, n_slices_max=20, min_duration=0, p_boundary_init=0.5, beta_sent_boundary=2.0, lms=1., wip=0., fb_type="bigram", init_am_assignments="rand", time_power_term=1.): logger.info("Initializing") # Check parameters assert seed_assignments_dict is None or seed_boundaries_dict is not None # Initialize simple attributes self.n_slices_min = n_slices_min self.n_slices_max = n_slices_max self.beta_sent_boundary = beta_sent_boundary self.wip = wip self.lms = lms self.time_power_term = time_power_term self.set_fb_type(fb_type) # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance) embeddings, vec_ids, ids_to_utterance_labels = process_embeddings( embedding_mats, vec_ids_dict) self.ids_to_utterance_labels = ids_to_utterance_labels N = embeddings.shape[0] # Initialize `utterances` if seed_boundaries_dict is not None: seed_boundaries = [ seed_boundaries_dict[i] for i in ids_to_utterance_labels ] else: seed_boundaries = None lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels] landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels] durations = [durations_dict[i] for i in ids_to_utterance_labels] self.utterances = Utterances(lengths, vec_ids, durations, landmarks, seed_boundaries=seed_boundaries, p_boundary_init=p_boundary_init, n_slices_min=n_slices_min, n_slices_max=n_slices_max, min_duration=min_duration) # Find all the embeddings that are in the initial segmentation init_embeds = [] for i in range(self.utterances.D): init_embeds.extend(self.utterances.get_segmented_embeds_i(i)) init_embeds = np.array(init_embeds, dtype=int) init_embeds = init_embeds[np.where(init_embeds != -1)] # Setup language model if lm_params["type"] == "smooth": intrp_lambda = lm_params["intrp_lambda"] a = lm_params["a"] b = 
lm_params["b"] K = am_K self.lm = BigramSmoothLM(intrp_lambda, a, b, K) # Provide the initial acoustic model assignments and initialize the model accordingly assignments = -1 * np.ones(N, dtype=int) if seed_assignments_dict is not None: # Use seed assignments if provided logger.info("Using seed assignments") self.seed_to_cluster = {} i_cluster = 0 for i_utt, utt in enumerate(ids_to_utterance_labels): utt_init_embeds = np.array( self.utterances.get_segmented_embeds_i(i_utt), dtype=int) utt_init_assignments = np.array(seed_assignments_dict[utt][:]) utt_init_assignments = utt_init_assignments[np.where( utt_init_embeds != -1)] utt_init_embeds = utt_init_embeds[np.where( utt_init_embeds != -1)] for seed in utt_init_assignments: if not seed in self.seed_to_cluster: if isinstance(seed, (int, long)): self.seed_to_cluster[seed] = seed else: self.seed_to_cluster[seed] = i_cluster i_cluster += 1 utt_init_assignments = [ self.seed_to_cluster[i] for i in utt_init_assignments ] assignments[utt_init_embeds] = utt_init_assignments if am_K is None: am_K = max(self.seed_to_cluster.values()) + 1 else: assert am_K >= max(self.seed_to_cluster.values()) + 1 # Initialize `acoustic_model` self.acoustic_model = BigramFBGMM(embeddings, am_param_prior, am_K, assignments, covariance_type=covariance_type, lms=lms, lm=self.lm) elif init_am_assignments == "rand": # Assign each of the above embeddings randomly to one of the `am_K` clusters logger.info("Using random initial component assignments") init_embeds_assignments = np.random.randint( 0, am_K, len(init_embeds)) # Make sure we have consecutive values for k in xrange(init_embeds_assignments.max()): while len(np.nonzero(init_embeds_assignments == k)[0]) == 0: init_embeds_assignments[np.where( init_embeds_assignments > k)] -= 1 if init_embeds_assignments.max() == k: break assignments[init_embeds] = init_embeds_assignments # Initialize `acoustic_model` self.acoustic_model = BigramFBGMM(embeddings, am_param_prior, am_K, assignments, 
covariance_type=covariance_type, lms=lms, lm=self.lm) elif init_am_assignments == "one-by-one": assert False # # Initialize `acoustic_model` # logger.info("Using a one-by-one initial assignment") # self.acoustic_model = am_class( # embeddings, am_param_prior, am_alpha, am_K, assignments, # covariance_type=covariance_type, lms=lms # ) # # Assign the embeddings one-by-one # for i_embed in init_embeds: # # print i_embed # self.acoustic_model.gibbs_sample_inside_loop_i(i_embed) else: assert False, "invalid value for `init_am_assignments`: " + init_am_assignments # Setup initial language model counts self.set_lm_counts() def set_fb_type(self, fb_type): self.fb_type = fb_type # Assign forward-backward function if fb_type == "bigram": self.fb_func = forward_backward self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_bigram elif fb_type == "unigram": self.fb_func = unigram_acoustic_wordseg.forward_backward self.get_vec_embed_log_probs = self.get_vec_embed_log_probs_unigram else: assert False, "invalid `fb_type`: " + fb_type def set_lm_counts(self): # K = self.acoustic_model.components.K_max # unigram_counts = np.zeros(K, np.int) # bigram_counts = np.zeros((K, K), np.int) for i_utt in xrange(self.utterances.D): self.lm.counts_from_utterance(self.get_unsup_transcript_i(i_utt)) # print # print i_utt, "-"*5, self.get_unsup_transcript_i(i_utt) # j_prev = None # for i_cur in self.get_unsup_transcript_i(i_utt): # self.lm.unigram_counts[i_cur] += 1 # if j_prev is not None: # self.lm.bigram_counts[j_prev, i_cur] += 1 # j_prev = i_cur # npt.assert_equal(self.acoustic_model.components.counts, self.lm.unigram_counts) def log_prob_z(self): """ Return the log marginal probability of component assignment P(z). """ lm_tmp = BigramSmoothLM(intrp_lambda=self.lm.intrp_lambda, a=self.lm.a, b=self.lm.b, K=self.lm.K) log_prob_z = 0. 
for i_utt in xrange(self.utterances.D): j_prev = None for i_cur in self.get_unsup_transcript_i(i_utt): if j_prev is not None: log_prob_z += np.log(lm_tmp.prob_i_given_j(i_cur, j_prev)) lm_tmp.bigram_counts[j_prev, i_cur] += 1 else: log_prob_z += np.log(lm_tmp.prob_i(i_cur)) lm_tmp.unigram_counts[i_cur] += 1 return log_prob_z def log_marg(self): """Return log marginal of data and component assignments: p(X, z)""" log_prob_z = self.log_prob_z() log_prob_X_given_z = self.acoustic_model.log_prob_X_given_z() return log_prob_z + log_prob_X_given_z # @profile def log_marg_i_embed_unigram(self, i_embed): """Return the unigram log marginal of the i'th data vector: p(x_i)""" assert i_embed != -1 # Compute log probability of `X[i]` belonging to each component # (24.26) in Murphy, p. 843 log_prob_z = self.lms * self.lm.log_prob_vec_i() # logger.info("log_prob_z: " + str(log_prob_z)) # (24.23) in Murphy, p. 842` log_prob_z[:self.acoustic_model.components. K] += self.acoustic_model.components.log_post_pred(i_embed) # Empty (unactive) components log_prob_z[self.acoustic_model.components. K:] += self.acoustic_model.components.log_prior(i_embed) return _cython_utils.logsumexp(log_prob_z) # @profile def gibbs_sample_inside_loop_i_embed(self, i_embed, j_prev_assignment=None, anneal_temp=1, i_utt=None): """ Perform the inside loop of Gibbs sampling for data vector `i_embed`. """ # Temp # print "j_prev_assignment", j_prev_assignment # print self.lm.unigram_counts # print self.lm.bigram_counts # print # Compute log probability of `X[i]` belonging to each component; this # is the bigram version of (24.26) in Murphy, p. 843. 
if j_prev_assignment is not None: log_prob_z = np.log(self.lm.prob_vec_given_j(j_prev_assignment)) else: log_prob_z = self.lm.log_prob_vec_i() # print log_prob_z # Scale with language model scaling factor log_prob_z *= self.lms # print log_prob_z if i_utt is not None and i_utt == i_debug_monitor: logger.debug("lms * log(P(z=i|z_prev=j)): " + str(log_prob_z)) logger.debug( "log(p(x|z=i)): " + str(self.acoustic_model.components.log_post_pred(i_embed))) # Bigram version of (24.23) in Murphy, p. 842 log_prob_z[:self.acoustic_model.components. K] += self.acoustic_model.components.log_post_pred(i_embed) # Empty (unactive) components log_prob_z[self.acoustic_model.components. K:] += self.acoustic_model.components.log_prior(i_embed) if anneal_temp != 1: log_prob_z = log_prob_z - _cython_utils.logsumexp(log_prob_z) log_prob_z_anneal = 1. / anneal_temp * log_prob_z - _cython_utils.logsumexp( 1. / anneal_temp * log_prob_z) prob_z = np.exp(log_prob_z_anneal) else: prob_z = np.exp(log_prob_z - _cython_utils.logsumexp(log_prob_z)) assert not np.isnan(np.sum(prob_z)) if i_utt is not None and i_utt == i_debug_monitor: logger.debug("P(z=i|x): " + str(prob_z)) # Sample the new component assignment for `X[i]` k = utils.draw(prob_z) # There could be several empty, unactive components at the end if k > self.acoustic_model.components.K: k = self.acoustic_model.components.K if i_utt is not None and i_utt == i_debug_monitor: logger.debug("Adding item " + str(i_embed) + " to acoustic model component " + str(k)) self.acoustic_model.components.add_item(i_embed, k) return k def gibbs_sample_i(self, i, anneal_temp=1, anneal_gibbs_am=False, assignments_only=False): """ Block Gibbs sample new boundaries and embedding assignments for utterance `i`. 
Return ------ log_prob : float """ # # Temp # print i, self.ids_to_utterance_labels[i], str(self.get_unsup_transcript_i(i)) # Debug trace logger.debug("Gibbs sampling utterance: " + str(i)) if i == i_debug_monitor: logger.debug("-" * 39) logger.debug("log p(X) before sampling: " + str(self.log_marg())) logger.debug("Unsupervised transcript before sampling: " + str(self.get_unsup_transcript_i(i))) logger.debug("Unigram counts before sampling: " + str(self.lm.unigram_counts)) logger.debug("Bigram counts before sampling: " + str(self.lm.bigram_counts)) # Remove counts from the `lm` self.lm.remove_counts_from_utterance(self.get_unsup_transcript_i(i)) # Remove embeddings from utterance `i` from the `acoustic_model` for i_embed in self.utterances.get_segmented_embeds_i(i): if i_embed == -1: continue # don't remove a non-embedding (would accidently remove the last embedding) self.acoustic_model.components.del_item(i_embed) # Sample segmentation if not assignments_only: # Get the log probabilities of the embeddings N = self.utterances.lengths[i] vec_embed_log_probs = self.get_vec_embed_log_probs( self.utterances.vec_ids[i, :(N**2 + N) / 2], self.utterances.durations[i, :(N**2 + N) / 2]) # assert False, "vec_embed_log_probs should be calculated differently based on unigram or bigram segmentation" # Debug trace if i == i_debug_monitor: logger.debug( "Statistics before sampling, but after removing, is given below" ) if self.fb_type == "unigram": log_margs = [ self.log_marg_i_embed_unigram(j) for j in self.utterances.get_segmented_embeds_i(i) if j != -1 ] else: assert False, "to-do" embeddings = self.utterances.get_segmented_embeds_i(i) lengths = self.utterances.get_segmented_durations_i(i) logger.debug("Embeddings: " + str(embeddings)) logger.debug( "Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i))) logger.debug( "Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i))) logger.debug("Durations: " + 
str(self.utterances.get_segmented_durations_i(i))) logger.debug("log_margs: " + str(log_margs)) logger.debug("sum(log_margs*lengths): " + str(np.sum(log_margs * np.array(lengths)))) logger.debug("log p(X): " + str(self.log_marg())) # Draw new boundaries for utterance `i` log_p_continue = math.log(self.calc_p_continue()) log_prob, self.utterances.boundaries[i, :N] = self.fb_func( vec_embed_log_probs, log_p_continue, N, self.n_slices_min, self.n_slices_max, i, anneal_temp) # Debug trace if i == i_debug_monitor: logger.debug( "Statistics after sampling, but before adding new embeddings to `acoustic_model`" ) if self.fb_type == "unigram": log_margs = [ self.log_marg_i_embed_unigram(j) for j in self.utterances.get_segmented_embeds_i(i) if j != -1 ] else: assert False, "to-do" lengths = self.utterances.get_segmented_durations_i(i) logger.debug("Embeddings: " + str(self.utterances.get_segmented_embeds_i(i))) logger.debug( "Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i))) logger.debug( "Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i))) logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i))) logger.debug("log_margs: " + str(log_margs)) logger.debug("sum(log_margs*lengths): " + str(np.sum(log_margs * np.array(lengths)))) logger.debug("log p(X): " + str(self.log_marg())) # # Temp # print self.lm.unigram_counts # print self.lm.bigram_counts # print # Assign new embeddings to components in `acoustic_model` if i == i_debug_monitor: logger.debug("Sampling component assignments") j_prev_assignment = None for i_embed in self.utterances.get_segmented_embeds_i(i): if i_embed == -1: # This only happens because of backtracking in the forward-backward functions continue # don't assign a non-embedding (accidently the last embedding) if anneal_gibbs_am: anneal_temp = anneal_temp else: anneal_temp = 1 j_prev_assignment = self.gibbs_sample_inside_loop_i_embed( i_embed, j_prev_assignment, 
anneal_temp=anneal_temp, i_utt=i) self.lm.counts_from_utterance(self.get_unsup_transcript_i(i)) # logger.info("!!!") # logger.info(str(self.lm.unigram_counts)) # logger.info(str(self.acoustic_model.components.counts)) # logger.info(str(self.lm.bigram_counts)) # logger.info("!!!") # print "!!!", self.lm.unigram_counts # print self.acoustic_model.components.counts # print "bigram_counts", self.lm.bigram_counts # npt.assert_equal(self.acoustic_model.components.counts, self.lm.unigram_counts) # import copy # lm = copy.copy(self.lm) # lm.unigram_counts.fill(0.0) # lm.bigram_counts.fill(0.0) # for i_utt in xrange(self.utterances.D): # lm.counts_from_utterance(self.get_unsup_transcript_i(i_utt)) # npt.assert_equal(lm.unigram_counts, self.lm.unigram_counts) # npt.assert_equal(lm.bigram_counts, self.lm.bigram_counts) # assert False # print self.lm.unigram_counts # print self.acoustic_model.components.lm.unigram_counts # print self.acoustic_model.components.counts # print self.lm.bigram_counts # assert False # Temp # print self.utterances.get_segmented_embeds_i(i) # print self.get_unsup_transcript_i(i) # Update `lm` counts # self.lm.counts_from_utterance(self.get_unsup_transcript_i(i)) # assert False # # # Temp # print self.lm.unigram_counts # print self.lm.bigram_counts # print self.acoustic_model.components.lm.unigram_counts # Debug trace if i == i_debug_monitor: logger.debug("log p(X) after sampling: " + str(self.log_marg())) logger.debug("Unsupervised transcript after sampling: " + str(self.get_unsup_transcript_i(i))) logger.debug("Unigram counts after sampling: " + str(self.lm.unigram_counts)) logger.debug("Bigram counts after sampling: " + str(self.lm.bigram_counts)) logger.debug("-" * 39) if assignments_only: # Segmentation is not performed, so frame-scaled marginals does not make gibbs_sample_inside_loop_i_embed return 0. 
else: return log_prob def gibbs_sample(self, n_iter, am_n_iter=0, anneal_schedule=None, anneal_start_temp_inv=0.1, anneal_end_temp_inv=1, n_anneal_steps=-1, anneal_gibbs_am=False, assignments_only=False): """ Perform blocked Gibbs sampling on all utterances. Parameters ---------- n_iter : int Number of Gibbs sampling iterations of segmentation. am_n_iter : int Number of acoustic model Gibbs sampling iterations inbetween segmentation sampling iterations. anneal_schedule : str Can be one of the following: - None: A constant temperature of `anneal_end_temp_inv` is used throughout; if `anneal_end_temp_inv` is left at default (1), then this is equivalent to not performing annealing. - "linear": Linearly take the inverse temperature from `anneal_start_temp_inv` to `anneal_end_temp_inv` in `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule, annealing is performed over all `n_iter` iterations. - "step": Piecewise schedule in which the inverse temperature is taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in `n_anneal_steps` steps (annealing will be performed over all `n_iter` iterations; it might be worth adding an additional variable for this case to allow the step schedule to stop early). assignments_only : bool Whether only component assignments should be sampled, or whether both component assignment and segmentation should be performed. Return ------ record_dict : dict Contains several fields describing the sampling process. Each field is described by its key and statistics are given in a list which covers the Gibbs sampling iterations. """ logger.info("Gibbs sampling for " + str(n_iter) + " iterations") logger.debug("Monitoring utterance " + self.ids_to_utterance_labels[i_debug_monitor] + " (index=" + str(i_debug_monitor) + ")") # Setup annealing iterator if anneal_schedule is None: get_anneal_temp = iter([]) elif anneal_schedule == "linear": if n_anneal_steps == -1: n_anneal_steps = n_iter anneal_list = 1. 
/ np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps) get_anneal_temp = iter(anneal_list) elif anneal_schedule == "step": assert not n_anneal_steps == -1, ( "`n_anneal_steps` of -1 not allowed for step annealing schedule" ) n_iter_per_step = int(round(float(n_iter) / n_anneal_steps)) anneal_list = np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps) anneal_list = 1. / anneal_list # anneal_list = [100.0, 10.0, 3.0, 1.0, 0.1] anneal_list = np.repeat(anneal_list, n_iter_per_step) get_anneal_temp = iter(anneal_list) # Setup record dictionary record_dict = {} record_dict["sample_time"] = [] record_dict["log_marg"] = [] record_dict["log_marg*length"] = [] record_dict["log_prob_z"] = [] record_dict["log_prob_X_given_z"] = [] record_dict["anneal_temp"] = [] record_dict["components"] = [] record_dict["n_tokens"] = [] # Loop over sampling iterations for i_iter in xrange(n_iter): start_time = time.time() # Perform intermediate acoustic model re-sampling if am_n_iter > 0: assert False, "to-do" self.acoustic_model.gibbs_sample(am_n_iter, consider_unassigned=False) # Get anneal temperature anneal_temp = next(get_anneal_temp, anneal_end_temp_inv) # Loop over utterances utt_order = range(self.utterances.D) random.shuffle(utt_order) if debug_gibbs_only: utt_order = [i_debug_monitor] log_prob = 0 for i_utt in utt_order: log_prob += self.gibbs_sample_i(i_utt, anneal_temp, anneal_gibbs_am, assignments_only) record_dict["sample_time"].append(time.time() - start_time) start_time = time.time() record_dict["log_marg"].append(self.log_marg()) record_dict["log_marg*length"].append(log_prob) record_dict["log_prob_z"].append(self.log_prob_z()) record_dict["log_prob_X_given_z"].append( self.acoustic_model.log_prob_X_given_z()) record_dict["anneal_temp"].append(anneal_temp) record_dict["components"].append(self.acoustic_model.components.K) record_dict["n_tokens"].append( self.acoustic_model.get_n_assigned()) info = "iteration: " + str(i_iter) for key in 
sorted(record_dict): info += ", " + key + ": " + str(record_dict[key][-1]) logger.info(info) logger.debug("Unigram counts after inference: " + str(self.lm.unigram_counts)) logger.debug("Bigram counts after inference: " + str(self.lm.bigram_counts)) return record_dict # @profile def get_vec_embed_log_probs_unigram(self, vec_ids, durations): """ Return the unigram log marginal probs of the `vec_ids` embeddings, scaled by the given `durations`. """ # Get marginals vec_embed_log_probs = -np.inf * np.ones(len(vec_ids)) for i, embed_id in enumerate(vec_ids): if embed_id == -1: continue vec_embed_log_probs[i] = self.log_marg_i_embed_unigram(embed_id) # Scale log marginals by number of frames if np.isnan(durations[i]): vec_embed_log_probs[i] = -np.inf else: vec_embed_log_probs[i] *= durations[i]**self.time_power_term return vec_embed_log_probs + self.wip def get_vec_embed_log_probs_bigram(self, vec_ids, durations): pass def calc_p_continue(self): """ Return the probability of not having an utterance break. It is assumed that the number of utterances are one less than the total number, since the current utterance is excluded from the calculation. """ if self.beta_sent_boundary != -1: assert False, "to check" n_tokens = sum(self.acoustic_model.components.counts ) # number of assigned tokens n_sentences = self.utterances.D - 1 n_continue = n_tokens - n_sentences p_continue = ((n_continue + self.beta_sent_boundary / 2.0) / (n_tokens + self.beta_sent_boundary)) else: p_continue = 1.0 return p_continue def get_unsup_transcript_i(self, i): """Return a list of the components for current segmentation of `i`.""" return list( self.acoustic_model.components.get_assignments( self.utterances.get_segmented_embeds_i(i)))
class SegmentalKMeansWordseg(object):
    """
    Segmental k-means word segmentation using acoustic word embeddings.

    Segmentation operations are carried out in this class. Segmentation
    results are mainly stored in `utterances`, which deals with all
    utterance-level information, but knows nothing about the acoustics. The
    `acoustic_model` deals with all the acoustic embedding operations. In the
    member functions, the index `i` generally refers to the index of an
    utterance.

    Parameters
    ----------
    am_K : int
        Number of k-means components: it is the exclusive upper bound on the
        initial component labels and is passed on to `kmeans.KMeans`.
    embedding_mats : dict of matrix
        The matrices of embeddings for every utterance.
    vec_ids_dict : dict of vector of int
        For every utterance, the vector IDs (see `Utterances`).
    durations_dict : dict of vector of int
        The shape of this dict is the same as that of `vec_ids_dict`, but here
        the duration (in frames) of each of the embeddings are given.
    landmarks_dict : dict of list of int
        For every utterance, the landmark points at which word boundaries are
        considered, given in the number of frames (10 ms units) from the start
        of each utterance. There is an implicit landmark at the start of every
        utterance.
    seed_boundaries_dict : dict of list, optional
        Per-utterance seed boundaries, passed to `Utterances` as
        `seed_boundaries`. If not given, no seeding is used.
        NOTE(review): earlier documentation described the entries both as
        (start, end) embedding-slice tuples and as boundaries in 10 ms units
        (same format as `landmarks_dict`); confirm against `Utterances`.
    seed_assignments_dict : dict of list of int, optional
        Every int is a cluster assignment for the corresponding seed token in
        `seed_boundaries_dict`. Requires `seed_boundaries_dict`. Seeded
        assignments are not implemented yet: passing this raises.
    n_slices_min : int
        The minimum number of landmarks over which an embedding can be
        calculated.
    n_slices_max : int
        The maximum number of landmarks over which an embedding can be
        calculated.
    min_duration : int
        Minimum duration of a segment.
    p_boundary_init : float
        See `Utterances`.
    init_am_assignments : str
        This setting determines how the initial acoustic model assignments are
        determined:
        - "rand": Randomly assigned.
        - "one-by-one": Data vectors are added one at a time to the acoustic
          model (not implemented yet: raises).
        - "spread": Vectors are also randomly assigned, but here an attempt is
          made to spread the items over the different components.
    wip : float
        Word insertion penalty.

    Attributes
    ----------
    utterances : Utterances
        Knows nothing about the acoustics. The indices in the `vec_ids`
        attribute refers to the embedding at the corresponding row in
        `acoustic_model.components.X`.
    acoustic_model : KMeans
        Knows nothing about utterance-level information. All embeddings are
        stored in this class as the data `components.X` attribute.
    ids_to_utterance_labels : list of str
        Keeps track of utterance labels for a specific utterance ID.
    """

    def __init__(self, am_K, embedding_mats, vec_ids_dict, durations_dict,
            landmarks_dict, seed_boundaries_dict=None, seed_assignments_dict=None,
            n_slices_min=0, n_slices_max=20, min_duration=0, p_boundary_init=0.5,
            init_am_assignments="rand", wip=0):

        logger.info("Initializing")

        # Check parameters. An explicit raise (rather than `assert`) keeps the
        # check alive under `python -O`; the exception type is unchanged.
        if seed_assignments_dict is not None and seed_boundaries_dict is None:
            raise AssertionError(
                "`seed_assignments_dict` requires `seed_boundaries_dict`"
                )

        # Initialize simple attributes
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list
        # (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats, vec_ids_dict
            )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        if seed_boundaries_dict is not None:
            seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
        else:
            seed_boundaries = None
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(
            lengths, vec_ids, durations, landmarks,
            seed_boundaries=seed_boundaries, p_boundary_init=p_boundary_init,
            n_slices_min=n_slices_min, n_slices_max=n_slices_max,
            min_duration=min_duration
            )

        # Find all the embeddings that are in the initial segmentation;
        # -1 entries are placeholders and not real embeddings.
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        logger.info("No. initial embeddings: " + str(init_embeds.shape[0]))

        # Provide the initial acoustic model assignments and initialize the
        # model accordingly; -1 marks an unassigned embedding.
        assignments = -1*np.ones(N, dtype=int)
        if seed_assignments_dict is not None:
            # TODO: map the seed labels to cluster indices, assign them to the
            # seeded embeddings and derive/validate `am_K` from them.
            raise AssertionError("to-do")
        elif init_am_assignments == "rand":
            # Assign each of the above embeddings randomly to one of the
            # `am_K` clusters
            logger.info("Using random initial component assignments")
            init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))

            # Make sure we have consecutive values
            init_embeds_assignments = self._consecutive_labels(init_embeds_assignments)

            assignments[init_embeds] = init_embeds_assignments
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)
        elif init_am_assignments == "spread":
            logger.info("Spreading component assignments")
            n_init_embeds = len(init_embeds)
            n_repeats = int(np.ceil(float(n_init_embeds)/am_K))
            # `list(range(...))` so that the repetition also works when
            # `range` is lazy (Python 3).
            assignment_list = (list(range(am_K))*n_repeats)[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list, dtype=int)
            self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)
        elif init_am_assignments == "one-by-one":
            # TODO: initialize an empty model and add the initial embeddings
            # to it one at a time.
            raise AssertionError("to-do")
        else:
            raise AssertionError(
                "invalid value for `init_am_assignments`: " + str(init_am_assignments)
                )

    @staticmethod
    def _consecutive_labels(labels):
        """
        Relabel `labels` so the used values are exactly 0, 1, ..., n - 1.

        The relative order of the labels is preserved (the smallest label
        becomes 0, the next smallest 1, and so on). An empty input gives an
        empty output.
        """
        _, relabelled = np.unique(labels, return_inverse=True)
        return relabelled

    def _log_segmentation_stats(self, i):
        """Write debug statistics for the current segmentation of utterance `i`."""
        neg_sqrd_norms = [
            self.acoustic_model.components.max_neg_sqrd_norm_i(j) for j in
            self.utterances.get_segmented_embeds_i(i) if j != -1
            ]
        embeddings = self.utterances.get_segmented_embeds_i(i)
        lengths = self.utterances.get_segmented_durations_i(i)
        # NOTE(review): `neg_sqrd_norms` drops -1 placeholders while `lengths`
        # is used as returned; presumably the two line up -- confirm.
        neg_len_sqrd_norms = np.array(neg_sqrd_norms)*np.array(lengths)
        logger.debug("Embeddings: " + str(embeddings))
        logger.debug("Utterance embeddings: " + str(self.utterances.get_original_segmented_embeds_i(i)))
        logger.debug("Landmark indices: " + str(self.utterances.get_segmented_landmark_indices(i)))
        logger.debug("Durations: " + str(self.utterances.get_segmented_durations_i(i)))
        logger.debug("neg_sqrd_norms: " + str(neg_sqrd_norms))
        logger.debug("neg_len_sqrd_norms: " + str(neg_len_sqrd_norms))
        logger.debug("sum_neg_len_sqrd_norms: " + str(np.sum(neg_len_sqrd_norms)))

    def segment_i(self, i):
        """
        Segment new boundaries for utterance `i`.

        The embeddings of the old segmentation are removed from the acoustic
        model and those of the new segmentation are added to their best
        components.

        Return
        ------
        sum_neg_len_sqrd_norm : float
            The length-weighted k-means objective for this utterance.
        """

        # Debug trace
        logger.debug("Segmeting utterance: " + str(i))
        is_monitored = i == i_debug_monitor
        if is_monitored:
            logger.debug("-"*39)
            logger.debug("Statistics before sampling")
            logger.debug(
                "sum_neg_sqrd_norm before sampling: " +
                str(self.acoustic_model.components.sum_neg_sqrd_norm())
                )
            logger.debug("Unsupervised transcript: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("Unsupervised max transcript: " + str(self.get_max_unsup_transcript_i(i)))

        # Note the embeddings before segmentation
        old_embeds = self.utterances.get_segmented_embeds_i(i)

        # Get the scores of the embeddings. Floor division keeps the slice
        # bound an integer on both Python 2 and 3.
        N = self.utterances.lengths[i]
        n_candidates = (N**2 + N)//2
        vec_embed_neg_len_sqrd_norms = self.get_vec_embed_neg_len_sqrd_norms(
            self.utterances.vec_ids[i, :n_candidates],
            self.utterances.durations[i, :n_candidates]
            )

        # Debug trace
        if is_monitored:
            logger.debug("vec_embed_neg_len_sqrd_norms: " + str(vec_embed_neg_len_sqrd_norms))
            self._log_segmentation_stats(i)

        # Draw new boundaries for utterance i
        sum_neg_len_sqrd_norm, self.utterances.boundaries[i, :N] = forward_backward_kmeans_viterbi(
            vec_embed_neg_len_sqrd_norms, N, self.n_slices_min, self.n_slices_max, i
            )

        # Debug trace
        if is_monitored:
            logger.debug("Statistics after sampling, but before adding new embeddings to acoustic model")
            self._log_segmentation_stats(i)

        # Remove old embeddings and add new ones; this is equivalent to
        # assigning the new embeddings and updating the means.
        new_embeds = self.utterances.get_segmented_embeds_i(i)
        new_k = self.get_max_unsup_transcript_i(i)

        for i_embed in old_embeds:
            if i_embed == -1:
                continue  # don't remove a non-embedding (would accidentally remove the last embedding)
            self.acoustic_model.components.del_item(i_embed)
        for i_embed, k in zip(new_embeds, new_k):
            if i_embed == -1:
                continue  # don't add a non-embedding (would accidentally add the last embedding)
            self.acoustic_model.components.add_item(i_embed, k)
        self.acoustic_model.components.clean_components()

        # Debug trace
        if is_monitored:
            logger.debug(
                "sum_neg_sqrd_norm after sampling: " +
                str(self.acoustic_model.components.sum_neg_sqrd_norm())
                )
            logger.debug("Unsupervised transcript after sampling: " + str(self.get_unsup_transcript_i(i)))
            logger.debug("-"*39)

        return sum_neg_len_sqrd_norm  # technically, this is with the old means (before updating, above)

    def get_vec_embed_neg_len_sqrd_norms(self, vec_ids, durations):
        """
        Return the duration-scaled scores of the `vec_ids` embeddings.

        Each valid embedding gets its `max_neg_sqrd_norm_i` score multiplied
        by its duration; `wip` is then added to every entry. Entries with an
        ID of -1 or with a NaN duration are scored as minus infinity.
        """

        # Get scores; minus infinity is the default for invalid entries
        vec_embed_neg_len_sqrd_norms = np.full(len(vec_ids), -np.inf)
        for i, embed_id in enumerate(vec_ids):
            if embed_id == -1:
                continue
            vec_embed_neg_len_sqrd_norms[i] = self.acoustic_model.components.max_neg_sqrd_norm_i(
                embed_id
                )

            # Scale the score by the number of frames
            if np.isnan(durations[i]):
                vec_embed_neg_len_sqrd_norms[i] = -np.inf
            else:
                vec_embed_neg_len_sqrd_norms[i] *= durations[i]

        return vec_embed_neg_len_sqrd_norms + self.wip

    def segment(self, n_iter, n_iter_inbetween_kmeans=0):
        """
        Perform segmentation of all utterances and update the k-means model.

        Parameters
        ----------
        n_iter : int
            Number of iterations of segmentation.
        n_iter_inbetween_kmeans : int
            Number of k-means iterations inbetween segmentation iterations.

        Return
        ------
        record_dict : dict
            Contains several fields describing the optimization iterations.
            Each field is described by its key and statistics are given in a
            list covering the iterations.
        """

        logger.info("Segmenting for " + str(n_iter) + " iterations")
        logger.debug(
            "Monitoring utterance " + self.ids_to_utterance_labels[i_debug_monitor]
            + " (index=" + str(i_debug_monitor) + ")"
            )

        # Setup record dictionary
        record_dict = {
            "sum_neg_sqrd_norm": [],
            "sum_neg_len_sqrd_norm": [],
            "components": [],
            "sample_time": [],
            "n_tokens": [],
            }

        # Loop over sampling iterations
        for i_iter in range(n_iter):

            start_time = time.time()

            # Loop over utterances in random order
            utt_order = list(range(self.utterances.D))
            random.shuffle(utt_order)
            if segment_debug_only:
                utt_order = [i_debug_monitor]
            sum_neg_len_sqrd_norm = 0
            for i_utt in utt_order:
                sum_neg_len_sqrd_norm += self.segment_i(i_utt)

            record_dict["sample_time"].append(time.time() - start_time)
            record_dict["sum_neg_sqrd_norm"].append(self.acoustic_model.components.sum_neg_sqrd_norm())
            record_dict["sum_neg_len_sqrd_norm"].append(sum_neg_len_sqrd_norm)
            record_dict["components"].append(self.acoustic_model.components.K)
            record_dict["n_tokens"].append(self.acoustic_model.get_n_assigned())

            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

            # Perform intermediate acoustic model re-sampling
            if n_iter_inbetween_kmeans > 0:
                self.acoustic_model.fit(
                    n_iter_inbetween_kmeans, consider_unassigned=False
                    )

        return record_dict

    def get_unsup_transcript_i(self, i):
        """
        Return a list of the current component assignments for current
        segmentation of `i`.
        """
        return list(
            self.acoustic_model.components.get_assignments(self.utterances.get_segmented_embeds_i(i))
            )

    def get_max_unsup_transcript_i(self, i):
        """
        Return a list of the best components for current segmentation of `i`.
        """
        return self.acoustic_model.components.get_max_assignments(
            self.utterances.get_segmented_embeds_i(i)
            )
def __init__(self, am_K, embedding_mats, vec_ids_dict, durations_dict,
        landmarks_dict, seed_boundaries_dict=None, seed_assignments_dict=None,
        n_slices_min=0, n_slices_max=20, min_duration=0, p_boundary_init=0.5,
        init_am_assignments="rand", wip=0):
    """
    Set up the utterances and the initial k-means acoustic model.

    The embeddings are flattened into one matrix, an `Utterances` object is
    built from the per-utterance landmarks and durations, and every embedding
    in the initial segmentation is given an initial component according to
    `init_am_assignments` ("rand" or "spread"). Seeded assignments
    (`seed_assignments_dict`) and the "one-by-one" mode are not implemented
    yet and raise an `AssertionError`, as does an unknown mode.
    """

    logger.info("Initializing")

    # Check parameters. An explicit raise (rather than `assert`) keeps the
    # check alive under `python -O`; the exception type is unchanged.
    if seed_assignments_dict is not None and seed_boundaries_dict is None:
        raise AssertionError(
            "`seed_assignments_dict` requires `seed_boundaries_dict`"
            )

    # Initialize simple attributes
    self.n_slices_min = n_slices_min
    self.n_slices_max = n_slices_max
    self.wip = wip

    # Process embeddings into a single matrix, and vec_ids into a list
    # (entry for each utterance)
    embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
        embedding_mats, vec_ids_dict
        )
    self.ids_to_utterance_labels = ids_to_utterance_labels
    N = embeddings.shape[0]

    # Initialize `utterances`
    if seed_boundaries_dict is not None:
        seed_boundaries = [seed_boundaries_dict[i] for i in ids_to_utterance_labels]
    else:
        seed_boundaries = None
    lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
    landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
    durations = [durations_dict[i] for i in ids_to_utterance_labels]
    self.utterances = Utterances(
        lengths, vec_ids, durations, landmarks,
        seed_boundaries=seed_boundaries, p_boundary_init=p_boundary_init,
        n_slices_min=n_slices_min, n_slices_max=n_slices_max,
        min_duration=min_duration
        )

    # Find all the embeddings that are in the initial segmentation;
    # -1 entries are placeholders and not real embeddings.
    init_embeds = []
    for i in range(self.utterances.D):
        init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
    init_embeds = np.array(init_embeds, dtype=int)
    init_embeds = init_embeds[np.where(init_embeds != -1)]
    logger.info("No. initial embeddings: " + str(init_embeds.shape[0]))

    # Provide the initial acoustic model assignments and initialize the model
    # accordingly; -1 marks an unassigned embedding.
    assignments = -1*np.ones(N, dtype=int)
    if seed_assignments_dict is not None:
        # TODO: map the seed labels to cluster indices, assign them to the
        # seeded embeddings and derive/validate `am_K` from them.
        raise AssertionError("to-do")
    elif init_am_assignments == "rand":
        # Assign each of the above embeddings randomly to one of the `am_K`
        # clusters
        logger.info("Using random initial component assignments")
        init_embeds_assignments = np.random.randint(0, am_K, len(init_embeds))

        # Make sure we have consecutive values: the inverse indices of
        # `np.unique` relabel the used values to 0, 1, ..., n - 1 while
        # preserving their order, and an empty array is handled as well.
        _, init_embeds_assignments = np.unique(
            init_embeds_assignments, return_inverse=True
            )

        assignments[init_embeds] = init_embeds_assignments
        self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)
    elif init_am_assignments == "spread":
        logger.info("Spreading component assignments")
        n_init_embeds = len(init_embeds)
        n_repeats = int(np.ceil(float(n_init_embeds)/am_K))
        # `list(range(...))` so that the repetition also works when `range`
        # is lazy (Python 3).
        assignment_list = (list(range(am_K))*n_repeats)[:n_init_embeds]
        random.shuffle(assignment_list)
        assignments[init_embeds] = np.array(assignment_list, dtype=int)
        self.acoustic_model = kmeans.KMeans(embeddings, am_K, assignments)
    elif init_am_assignments == "one-by-one":
        # TODO: initialize an empty model and add the initial embeddings to
        # it one at a time.
        raise AssertionError("to-do")
    else:
        raise AssertionError(
            "invalid value for `init_am_assignments`: " + str(init_am_assignments)
            )