def train(self, corpus):
    """Train on segmented utterances.

    Reads counts from ``corpus`` and stores the following on ``self``:

    * ``phoneme_counts`` -- ``corpus.phoneme_counts`` as given.
    * ``phoneme_freqs`` -- ``counter_freqs`` of the phoneme counts.
    * ``diphone_freqs`` -- ``counter_freqs`` of ``corpus.diphone_counts``.
    * ``boundary_diphone_probs`` -- ``{True: P(xy|#), False: P(xy|no #)}``.
    * ``diphone_boundary_probs`` -- P(#|xy) for every observed diphone.
    * ``p_boundary`` -- P(#), the fraction of diphone tokens labeled as a
      boundary.
    * ``initial_freqs`` / ``final_freqs`` -- P(x|initial) and P(x|final),
      from the pair ``corpus.outside_phoneme_counts``.

    Args:
        corpus: Object exposing ``phoneme_counts``, ``diphone_counts``,
            ``diphone_boundaries`` (an iterable of iterables of
            ``(diphone, label)`` pairs whose labels compare equal to
            ``True``/``False``) and ``outside_phoneme_counts`` (an
            ``(initial, final)`` pair of counts).

    Raises:
        AssertionError: If a computed probability is outside [0, 1] or a
            P(xy|label) distribution does not sum to approximately 1.
    """
    # Get phoneme and boundary information
    self.phoneme_counts = corpus.phoneme_counts
    self.phoneme_freqs = counter_freqs(self.phoneme_counts)

    # Get diphone information
    self.diphone_freqs = counter_freqs(corpus.diphone_counts)

    ## Compute P(xy|#) and P(#|xy)
    # For P(xy|#), track all diphones with each label
    boundary_diphone_counts = {True: Counter(), False: Counter()}
    # For P(#|xy), count every token of each diphone regardless of label.
    # Counters keep memory proportional to the number of diphone types
    # rather than storing one outcome per token.
    diphone_totals = Counter()

    # Count it all up
    for diphone, label in chain.from_iterable(corpus.diphone_boundaries):
        boundary_diphone_counts[label][diphone] += 1
        diphone_totals[diphone] += 1
    true_counts = boundary_diphone_counts[True]
    total_boundaries = sum(true_counts.values())
    total_diphones = sum(diphone_totals.values())

    # P(xy|#)
    self.boundary_diphone_probs = {
        label: counter_freqs(counts)
        for label, counts in boundary_diphone_counts.items()}
    assert all(0 <= prob <= 1.0
               for diphone_probs in self.boundary_diphone_probs.values()
               for prob in diphone_probs.values())
    # If this assertion fails but the value is very close to 1.0, this just
    # means there is a lot of accumulated rounding error.
    assert all(.999 < sum(diphone_probs.values()) < 1.001
               for diphone_probs in self.boundary_diphone_probs.values())

    # P(#|xy)
    # float() forces true division even where `/` on two ints would
    # truncate (Python 2 without `from __future__ import division`).
    # Looking up a diphone that was never a boundary yields 0 from Counter.
    self.diphone_boundary_probs = {
        diphone: float(true_counts[diphone]) / n_tokens
        for diphone, n_tokens in diphone_totals.items()}
    assert all(0 <= prob <= 1.0
               for prob in self.diphone_boundary_probs.values())

    # P(#)
    self.p_boundary = float(total_boundaries) / total_diphones
    assert 0 <= self.p_boundary <= 1.0

    ## Get phrase initial/final counts
    initial_counts, final_counts = corpus.outside_phoneme_counts
    # P(x|initial)
    self.initial_freqs = counter_freqs(initial_counts)
    # P(x|final)
    self.final_freqs = counter_freqs(final_counts)
def phoneme_features(corpus, out_csv):
    """Write each phoneme's probability and rank to a CSV.

    A header row ``('phoneme', 'prob', 'rank')`` is written first, followed
    by one row per phoneme ordered from highest to lowest probability.
    Ranks start at 1; phonemes with equal probability keep the iteration
    order of the mapping returned by ``counter_freqs``.

    Args:
        corpus: Object exposing ``phoneme_counts``.
        out_csv: Writer object providing ``writerow``.
    """
    # Probabilities derived from the raw counts
    probs = counter_freqs(corpus.phoneme_counts)

    # Header
    out_csv.writerow(('phoneme', 'prob', 'rank'))

    # Data, most probable phoneme first
    ranked = sorted(probs.items(), key=itemgetter(1), reverse=True)
    for rank, (phoneme, prob) in enumerate(ranked, 1):
        out_csv.writerow((phoneme, prob, rank))
def diphone_features(corpus, out_csv):
    """Write one CSV row per diphone token in the corpus.

    A header row ``('diphone', 'prob', BOUNDARY_HEADER)`` is written first.
    Each following row holds the diphone joined into a single string, the
    value ``counter_freqs`` assigns to that diphone, and the token's
    boundary label passed through ``convert_r_bool``.

    Args:
        corpus: Object exposing ``diphone_boundaries`` (an iterable of
            iterables of ``(diphone, label)`` pairs) and ``diphone_counts``.
        out_csv: Writer object providing ``writerow``.
    """
    # Flatten the nested boundary sequences into one lazy stream of pairs
    labeled_diphones = chain.from_iterable(corpus.diphone_boundaries)

    # Probabilities derived from the raw diphone counts
    probs = counter_freqs(corpus.diphone_counts)

    # Header
    header = ('diphone', 'prob', BOUNDARY_HEADER)
    out_csv.writerow(header)

    # Data
    for pair, is_boundary in labeled_diphones:
        row = (''.join(pair), probs[pair], convert_r_bool(is_boundary))
        out_csv.writerow(row)
def dibs_features(corpus, out_csv):
    """Write information for the DiBS segmentation model to a CSV.

    After a header row, one row is written per diphone token ``xy`` in
    ``corpus.diphone_boundaries`` with these columns:

    * ``diphone`` -- the diphone joined into a single string.
    * ``prob.true`` -- observed P(#|xy) in this corpus.
    * ``prob.dibs`` -- P(x|final) * P(y|initial) * 0.28 / P(xy).
    * ``prob.est1`` -- P(x|final) * P(y|initial) * 0.28 / (P(x) * P(y)).
    * ``prob.est2`` -- P(x) * P(final|x) * P(y|initial) / P(xy).
    * ``score`` -- 2 * P(x|final) * P(y|initial) / P(xy).
    * ``BOUNDARY_HEADER`` -- the token's label via ``convert_r_bool``.

    Args:
        corpus: Object exposing ``phoneme_counts``, ``diphone_counts``,
            ``diphone_boundaries`` (an iterable of iterables of
            ``(diphone, label)`` pairs whose labels compare equal to
            ``True``/``False``) and ``outside_phoneme_counts`` (an
            ``(initial, final)`` pair of counts).
        out_csv: Writer object providing ``writerow``.

    Raises:
        AssertionError: If a computed probability is outside [0, 1] or a
            P(xy|label) distribution does not sum to approximately 1.
    """
    # P(#) assumed when computing Daland's estimates: .28, the mean value
    # explored in their study.
    assumed_p_boundary = 0.28

    # Get phoneme and boundary information
    phoneme_counts = corpus.phoneme_counts
    phoneme_freq = counter_freqs(phoneme_counts)

    # Get diphone information
    diphone_freq = counter_freqs(corpus.diphone_counts)

    ## Compute P(xy|#) and P(#|xy)
    # For P(xy|#), track all diphones with each label
    boundary_diphone_counts = {True: Counter(), False: Counter()}
    # For P(#|xy), count every token of each diphone regardless of label.
    # Counters keep memory proportional to the number of diphone types
    # rather than storing one outcome per token.
    diphone_totals = Counter()

    # Count it all up
    for diphone, label in chain.from_iterable(corpus.diphone_boundaries):
        boundary_diphone_counts[label][diphone] += 1
        diphone_totals[diphone] += 1
    true_counts = boundary_diphone_counts[True]
    total_boundaries = sum(true_counts.values())
    total_diphones = sum(diphone_totals.values())

    # P(xy|#)
    boundary_diphone_probs = {
        label: counter_freqs(counts)
        for label, counts in boundary_diphone_counts.items()}
    assert all(0 <= prob <= 1.0
               for diphone_probs in boundary_diphone_probs.values()
               for prob in diphone_probs.values())
    # If this assertion fails but the value is very close to 1.0, this just
    # means there is a lot of accumulated rounding error.
    assert all(.99 < sum(diphone_probs.values()) < 1.01
               for diphone_probs in boundary_diphone_probs.values())

    # P(#|xy)
    # float() forces true division even where `/` on two ints would
    # truncate (Python 2 without `from __future__ import division`).
    # Looking up a diphone that was never a boundary yields 0 from Counter.
    diphone_boundary_probs = {
        diphone: float(true_counts[diphone]) / n_tokens
        for diphone, n_tokens in diphone_totals.items()}
    assert all(0 <= prob <= 1.0 for prob in diphone_boundary_probs.values())

    # P(#); not written out, kept as a sanity check on the counts
    p_boundary = float(total_boundaries) / total_diphones
    assert 0 <= p_boundary <= 1.0

    ## Get phrase initial/final counts
    initial_counts, final_counts = corpus.outside_phoneme_counts
    # P(x|initial)
    initial_freq = counter_freqs(initial_counts)
    # P(x|final)
    final_freq = counter_freqs(final_counts)

    # Output information for each boundary
    out_csv.writerow(('diphone', 'prob.true', 'prob.dibs', 'prob.est1',
                      'prob.est2', 'score', BOUNDARY_HEADER))
    for diphone, label in chain.from_iterable(corpus.diphone_boundaries):
        phone1, phone2 = diphone

        # Estimate P(x|final) and P(y|initial) for a diphone xy; a phoneme
        # never seen in that position contributes probability 0.
        p_phone1_final = final_freq.get(phone1, 0.0)
        p_phone2_init = initial_freq.get(phone2, 0.0)

        # P(xy)
        p_diphone = diphone_freq[diphone]
        assert 1.0 >= p_diphone >= 0.0

        # Compute the DiBS score
        dibs_score = (2.0 * p_phone1_final * p_phone2_init) / p_diphone

        # True P(#|xy)
        # If you want to do it by Bayes' rule, it would be:
        # P(xy|#) * P(#) / P(xy)
        # (boundary_diphone_probs[True][diphone] * p_boundary) / p_diphone
        true_prob = diphone_boundary_probs[diphone]

        # Compute Daland's estimated P(#|xy) under the assumed P(#)
        dibs_prob = ((p_phone1_final * p_phone2_init * assumed_p_boundary) /
                     p_diphone)

        # A way to estimate with a more normal normalization in the
        # denominator
        est1_prob = ((p_phone1_final * p_phone2_init * assumed_p_boundary) /
                     (phoneme_freq[phone1] * phoneme_freq[phone2]))

        # Another way of estimating:
        # P(x) * P(#|x) * P(y|#) / P(xy)
        # P(x) * P(final|x) * P(y|initial) / P(xy)
        # .get covers both plain dicts and Counters; the zero-count guard
        # avoids dividing by zero for a phoneme with no recorded count.
        phone1_count = phoneme_counts.get(phone1, 0)
        if phone1_count:
            p_final_given_phone1 = (
                float(final_counts.get(phone1, 0)) / phone1_count)
        else:
            p_final_given_phone1 = 0.0
        est2_prob = ((phoneme_freq[phone1] * p_final_given_phone1 *
                      p_phone2_init) / p_diphone)

        out_csv.writerow((''.join(diphone), true_prob, dibs_prob, est1_prob,
                          est2_prob, dibs_score, convert_r_bool(label)))