def transition_log_probability(self, *states):
    """
    Log probability of moving into the first of the given states, given
    the remaining ones.

    @param states: the state being transitioned to, followed by the
        states preceding it. Each state is a (point, function) pair;
        C{None} may be given in place of a state (it is treated as the
        pair C{(None, None)}).
    @return: log probability of the transition

    """
    # Swap missing states for an empty pair so every state unzips alike
    padded = [(None, None) if st is None else st for st in states]

    if self.order == 1:
        # Unigram model: every transition is equally likely
        return -logprob(len(self.label_dom))

    points, functions = zip(*padded)
    current_point = points[0]
    current_fn = functions[0]

    if current_point is None:
        # Final state: only the function distribution is consulted.
        # NOTE(review): the context used here is functions[:1] (the
        # current state's own function), whereas the general case below
        # conditions on functions[1:] -- confirm this is intended
        final_context = tuple(functions[:1])
        return self.fn_transition_dist[final_context].logprob(None)

    if all(pt is None for pt in points[1:]):
        # Initial state: nothing but padding precedes it.
        # Only points in the (0,0) enharmonic space are permitted
        if current_point[0] != 0 or current_point[1] != 0:
            return float('-inf')
        # Function prob comes from the empty-context distribution and
        # the point is chosen uniformly from 12 possibilities
        initial_fn_prob = self.fn_transition_dist[tuple()].logprob(current_fn)
        return initial_fn_prob - logprob(12)

    # General case: the function is conditioned on all previous functions
    history = tuple(functions[1:])
    fn_prob = self.fn_transition_dist[history].logprob(current_fn)
    # The movement between the previous and current points is
    # conditioned on the current function
    movement = vector(points[1], current_point)
    vector_prob = self.point_transition_dist[current_fn].logprob(movement)
    # Adding logs: the product of the vector and function probabilities
    return vector_prob + fn_prob
def prob_adder(start, end, signtup, words):
    """
    Compute the lexical probability of a sign and store the resulting
    log probabilities on it.

    This is a closure: C{self} is not a parameter and is taken from the
    enclosing scope.

    @param start: start of the span covered by the sign
    @param end: end of the span covered by the sign
    @param signtup: (sign, tag, tag_prob) triple
    @param words: the input covered by the span: a single word or a list
        of words (only the first of which is used)
    @raise ParseError: if the model is asked for a lexical probability
        but C{words} is neither a list nor a string

    """
    sign, tag, tag_prob = signtup

    if self.use_tagger_probs:
        # Use the tagger to get lexical probabilities
        lex_prob = self.tagger.lexical_probability(start, end, tag)
    else:
        # We might get multiple words here: use the first.
        # This is not really a satisfactory solution: better would be
        # to get the tagger to tell us which word to use
        if isinstance(words, list):
            word = words[0]
        elif not isinstance(words, basestring):
            # Not a string: we shouldn't be trying to get a probability.
            # The call form of raise is valid on both Python 2 and 3
            raise ParseError(
                "PCFG model is trying to assign lexical "
                "probabilities to words, but the words aren't strings. "
                "Maybe you should have disabled lexical probs with "
                "parser option 'nolex'")
        else:
            word = words
        # Consult the model to get the lexical probability of this sign
        lex_prob = self.model.inside_probability('leaf', sign, word)

    # Triangular number: nodes in the tree for multiword categories.
    # This has the effect of penalizing multiword categories
    # proportionally to the number of tree nodes deriving the
    # categories they're competing with derived from single-word cats
    tree_size = comb(end - start + 1, 2)
    lex_prob = lex_prob ** tree_size

    # Add the probabilities to the category
    sign.inside_probability = logprob(lex_prob)
    sign.probability = logprob(self.model.outside_probability(sign)) \
        + sign.inside_probability
def update_model(self, arrays, array_ids):
    """
    Overwrite the transition and emission distributions of C{self.model}
    with add-small-smoothed probabilities computed from the accumulated
    update arrays. The model's distributions must support C{update()}.

    @param arrays: tuple (trans, ems, trans_denom, ems_denom) of
        numerators and denominators
    @param array_ids: tuple (state_ids, em_ids) mapping states and
        emissions to their indices in the arrays

    """
    trans, ems, trans_denom, ems_denom = arrays
    state_ids, em_ids = array_ids
    num_states = len(self.model.label_dom)
    num_emissions = len(self.model.emission_dom)

    for state in self.model.label_dom:
        row = state_ids[state]
        # Smoothed transition denominator for leaving this state
        trans_total = trans_denom[row] + num_states * ADD_SMALL

        # Transitions out of this state
        for next_state in self.model.label_dom:
            col = state_ids[next_state]
            numerator = logprob(trans[row][col] + ADD_SMALL)
            denominator = logprob(trans_total)
            self.model.label_dist[(state,)].update(
                next_state, numerator - denominator)

        # Emissions from this state
        for emission in self.model.emission_dom:
            numerator = logprob(ems[row][em_ids[emission]] + ADD_SMALL)
            denominator = logprob(ems_denom[row] + num_emissions * ADD_SMALL)
            self.model.emission_dist[state].update(
                emission, numerator - denominator)
def update_model(self, arrays, array_ids):
    """
    Push newly estimated probabilities into the saved model.

    Every transition and emission probability of C{self.model} is
    replaced by a value derived from the given update arrays, using
    add-small smoothing. The model's distributions are modified in place
    through their C{update()} method.

    @param arrays: tuple (trans, ems, trans_denom, ems_denom)
    @param array_ids: tuple (state_ids, em_ids) giving the array index
        of each state and of each emission

    """
    trans, ems, trans_denom, ems_denom = arrays
    state_ids, em_ids = array_ids
    num_states = len(self.model.label_dom)
    num_emissions = len(self.model.emission_dom)

    for state in self.model.label_dom:
        from_i = state_ids[state]
        # Denominator shared by all transitions out of this state
        smoothed_trans_denom = trans_denom[from_i] + num_states * ADD_SMALL

        for next_state in self.model.label_dom:
            to_i = state_ids[next_state]
            # Update the probability of this transition
            log_num = logprob(trans[from_i][to_i] + ADD_SMALL)
            log_den = logprob(smoothed_trans_denom)
            self.model.label_dist[(state,)].update(next_state,
                                                   log_num - log_den)

        for emission in self.model.emission_dom:
            # Update the probability of this emission
            log_num = logprob(ems[from_i][em_ids[emission]] + ADD_SMALL)
            log_den = logprob(ems_denom[from_i] + num_emissions * ADD_SMALL)
            self.model.emission_dist[state].update(emission,
                                                   log_num - log_den)
def prob_adder(start, end, signtup, words):
    """
    Work out the lexical probability of a sign and attach the resulting
    log probabilities to it.

    C{self} is not a parameter: this function is a closure over the
    enclosing scope.

    @param start: start of the span covered by the sign
    @param end: end of the span covered by the sign
    @param signtup: (sign, tag, tag_prob) triple
    @param words: a single word, or a list of words of which only the
        first is used
    @raise ParseError: if the model's lexical probability is needed but
        C{words} is neither a list nor a string

    """
    sign, tag, tag_prob = signtup

    if self.use_tagger_probs:
        # Use the tagger to get lexical probabilities
        lex_prob = self.tagger.lexical_probability(start, end, tag)
    else:
        # We might get multiple words here: use the first.
        # This is not really a satisfactory solution: better would be
        # to get the tagger to tell us which word to use
        if isinstance(words, list):
            word = words[0]
        elif not isinstance(words, basestring):
            # Check the word is a string: if not, we probably shouldn't
            # be trying to get a probability. raise uses the call form,
            # which parses on both Python 2 and Python 3
            raise ParseError(
                "PCFG model is trying to assign lexical "
                "probabilities to words, but the words aren't strings. "
                "Maybe you should have disabled lexical probs with "
                "parser option 'nolex'")
        else:
            word = words
        # Consult the model to get the lexical probability of this sign
        lex_prob = self.model.inside_probability('leaf', sign, word)

    # Triangular number: nodes in the tree for multiword categories.
    # This has the effect of penalizing multiword categories
    # proportionally to the number of tree nodes deriving the
    # categories they're competing with derived from single-word cats
    tree_size = comb(end - start + 1, 2)
    lex_prob = lex_prob ** tree_size

    # Add the probabilities to the category
    sign.inside_probability = logprob(lex_prob)
    sign.probability = logprob(self.model.outside_probability(sign)) \
        + sign.inside_probability
def _res_mod(result, sign):
    """
    Add probabilities to a result produced from an input sign by a unary
    rule.

    C{model} is taken from the enclosing scope.

    @param result: the sign produced by the rule; its
        C{inside_probability} and C{probability} attributes are set
    @param sign: the input sign the result was derived from; its
        C{inside_probability} is read

    """
    # Inside prob: the unary expansion combined with the daughter's
    # inside prob (log domain, hence addition)
    inside_prob = logprob(model.inside_probability(
                        'unary', result, sign)) + \
                    sign.inside_probability
    outside_prob = logprob(model.outside_probability(result))
    result.inside_probability = inside_prob
    # The original referenced the undefined name "inside_probability"
    # here, which raised a NameError whenever this function was called
    result.probability = outside_prob + inside_prob
def transition_log_probability(self, state, *previous_states):
    """
    Log probability of a transition to C{state} given the preceding
    states, combining a schema transition with a root transition.

    @param state: (root, schema) pair being transitioned to, or C{None}
        for the final state
    @param previous_states: the preceding (root, schema) pairs; C{None}
        entries are treated as C{(None, None)}
    @return: sum of the root and schema transition log probabilities

    """
    context = [(None, None) if prev is None else prev
               for prev in previous_states]
    is_unigram = self.order == 1

    if is_unigram:
        roots, schemata = [], ()
    else:
        roots, schemata = zip(*context)
        schemata = tuple(schemata)

    # The schema part is delegated to its own method
    schema_prob = self.schema_transition_log_probability(state, *context)

    # Now the root part, which is ignored for unigram models
    if is_unigram or all(schema is None for schema in schemata):
        # All roots equiprobable
        root_prob = -logprob(12)
    elif state is None:
        # Final state: no root transition, prob comes from state dist
        root_prob = 0
    else:
        # Root movement from the previous chord, conditioned on the
        # *previous* schema
        root_change = (state[0] - roots[0]) % 12
        previous_schema = schemata[0]
        root_prob = self.root_transition_dist[previous_schema].logprob(
            root_change)

    # Adding logs: product of the schema transition and root change probs
    return root_prob + schema_prob
def transition_log_probability(self, state, *previous_states):
    """
    Compute the log probability of moving to C{state} from the given
    history, as the sum of a schema transition log probability and a
    root transition log probability.

    @param state: (root, schema) pair, or C{None} for the final state
    @param previous_states: preceding (root, schema) pairs; a C{None}
        entry is replaced by C{(None, None)}
    @return: the transition log probability

    """
    history = []
    for previous in previous_states:
        history.append(previous if previous is not None else (None, None))

    unigram_model = (self.order == 1)
    if unigram_model:
        roots = []
        schemata = tuple()
    else:
        roots, schemata = zip(*history)
        schemata = tuple(schemata)

    # Use the separate method to get the probability from the schema dist
    schema_prob = self.schema_transition_log_probability(state, *history)

    # Then the root transition distribution
    no_schema_context = unigram_model or \
        all(schema is None for schema in schemata)
    if no_schema_context:
        # Unigram model or nothing to condition on: roots equiprobable
        root_prob = -logprob(12)
    elif state is None:
        # Final state: no root transition, prob comes from state dist
        root_prob = 0
    else:
        # The root change from the previous chord is conditioned on the
        # *previous* schema
        root_change = (state[0] - roots[0]) % 12
        root_dist = self.root_transition_dist[schemata[0]]
        root_prob = root_dist.logprob(root_change)

    return root_prob + schema_prob
def emission_log_probability(self, emission, state):
    """
    Log probability of an emission given a state.

    A single emission is a (root, label) pair defining a chord and the
    state is a (root, schema) pair. If the two roots differ the result
    is negative infinity; otherwise the label's log probability is read
    from the emission distribution of the state's schema.

    As a special case, C{emission} may be a list of (prob, em) pairs,
    interpreted as a I{distribution} over emissions: each I{em} is an
    ordinary emission and I{prob} its weight. The result is then the
    weighted sum of the individual emission probabilities.

    @param emission: (root, label) pair, or a list of (prob, emission)
        pairs
    @param state: (root, schema) pair
    @return: the log probability of the emission

    """
    if type(emission) is list:
        # Weighted sum over the alternatives, computed in the log domain
        weighted = [
            logprob(weight) + self.emission_log_probability(option, state)
            for (weight, option) in emission
        ]
        return sum_logs(weighted)

    # Single chord label
    state_root, schema = state
    chord_root, label = emission
    if state_root == chord_root:
        return self.emission_dist[schema].logprob(label)
    # Mismatched roots: probability 0
    return float('-inf')
def _get_transition_backoff_scaler(self, context):
    """
    Return the (log) scaler applied to the backoff model's schema
    transition probabilities in the given context, computing and caching
    it in C{self._discount_cache} on first use.

    @param context: tuple of conditioning schemata
    @return: the cached scaler for this context

    """
    # This is just for the schema distribution
    if context in self._discount_cache:
        return self._discount_cache[context]

    all_labels = self.schemata + [None]
    context_counts = self.schema_transition_counts[context]
    seen_labels = set(lab for lab in all_labels if context_counts[lab] > 0)

    if len(seen_labels) == 0:
        # Not seen anything in this context. All mass is discounted!
        self._discount_cache[context] = 0.0
    else:
        unseen_labels = set(all_labels) - seen_labels
        # The discounting model spreads the reserved mass evenly over
        # unseen events, so the total is the probability of one event
        # that can't have been seen, times the number of unseen events
        one_unseen_prob = self.schema_transition_dist[context].prob(
            "%%% UNSEEN LABEL %%%")
        discounted_mass = one_unseen_prob * len(unseen_labels)
        # How much probability the n-1 order model assigns to things
        # unseen by this model
        backoff_context = context[:-1]
        backoff_logprobs = [
            self.backoff_model.schema_transition_log_probability_schemata(
                lab, *backoff_context)
            for lab in unseen_labels
        ]
        backoff_seen_mass = sum_logs(backoff_logprobs)
        self._discount_cache[context] = \
            logprob(discounted_mass) - backoff_seen_mass

    return self._discount_cache[context]
def emission_log_probability(self, emission, state):
    """
    Gives the log probability of an emission from a state.

    Normally C{emission} is a (root, label) pair, together defining a
    chord, and C{state} is a (root, schema) pair. The probability is
    zero (log: negative infinity) unless the roots agree, in which case
    it is the probability of the label under the schema's emission
    distribution.

    If C{emission} is a list, it is taken to be a I{distribution} over
    emissions, given as (prob, em) pairs. The probabilities of the
    possible emissions are summed, each weighted by its I{prob} value.

    @param emission: (root, label) pair or list of (prob, emission) pairs
    @param state: (root, schema) pair
    @return: log probability of the emission

    """
    if type(emission) is list:
        # Recurse on each alternative, weighting it by its prob
        log_terms = []
        for weight, alternative in emission:
            alternative_logprob = self.emission_log_probability(
                alternative, state)
            log_terms.append(logprob(weight) + alternative_logprob)
        return sum_logs(log_terms)

    # Single chord label
    state_root, schema = state
    chord_root, label = emission
    roots_match = (state_root == chord_root)
    if not roots_match:
        # Probability is 0 if the roots don't match
        return float('-inf')
    return self.emission_dist[schema].logprob(label)
def _get_transition_backoff_scaler(self, context):
    """
    Look up, computing if necessary, the backoff scaler for the schema
    transition distribution in C{context}. Results are memoized in
    C{self._discount_cache}.

    @param context: tuple of conditioning schemata
    @return: the scaler stored in the cache for this context

    """
    # This is just for the schema distribution
    if context not in self._discount_cache:
        candidate_labels = self.schemata + [None]
        counts = self.schema_transition_counts[context]
        seen_labels = set()
        for lab in candidate_labels:
            if counts[lab] > 0:
                seen_labels.add(lab)

        if not len(seen_labels) == 0:
            unseen_labels = set(candidate_labels) - seen_labels
            # The prob mass reserved for unseen events is spread evenly
            # over them by the discounting model, so query some event
            # that won't have been seen and multiply up
            discounted_mass = self.schema_transition_dist[context].prob(
                "%%% UNSEEN LABEL %%%") * len(unseen_labels)
            # Mass the n-1 order model gives to the events this model
            # has not seen
            backoff_context = context[:-1]
            backoff_model = self.backoff_model
            backoff_seen_mass = sum_logs([
                backoff_model.schema_transition_log_probability_schemata(
                    lab, *backoff_context)
                for lab in unseen_labels
            ])
            scaler = logprob(discounted_mass) - backoff_seen_mass
        else:
            # Not seen anything in this context. All mass is discounted!
            scaler = 0.0
        self._discount_cache[context] = scaler

    return self._discount_cache[context]
def _binary_expansion_probability(self, sign_pair, result):
    """
    Compute the probabilities of a binary expansion. Shared by
    L{_apply_binary_rule} and L{_apply_binary_rule_semantics}, which
    both need exactly this computation.

    @param sign_pair: (left, right) pair of daughter signs; their
        C{inside_probability} attributes are read
    @param result: the parent sign produced by the rule
    @return: tuple of (probability, inside probability), where the first
        is the inside probability plus the parent's outside log
        probability

    """
    left, right = sign_pair

    # Log probabilities from the model; the expansion type passed to the
    # model is always 'right'
    expansion_logprob = logprob(
        self.model.inside_probability('right', result, left, right))
    outside_logprob = logprob(self.model.outside_probability(result))

    # Combine with the daughters' inside probs to get the inside prob
    inside_logprob = expansion_logprob + left.inside_probability + \
        right.inside_probability
    return (inside_logprob + outside_logprob, inside_logprob)
def normal_forward_probabilities(self, sequence):
    """
    Compute the normalized matrix of forward probabilities using plain
    (non-log) probabilities. Each timestep's column is divided by its
    total so that it sums to one, which avoids summing logs.

    @param sequence: the sequence of emissions
    @return: (matrix, normalizing vector, log prob): the (time x state)
        matrix of normalized forward probabilities, the total each
        timestep was divided by, and the log probability of the whole
        sequence

    """
    num_steps = len(sequence)
    num_states = len(self.label_dom)

    alpha = numpy.zeros((num_steps, num_states), numpy.float64)
    scale = numpy.zeros(num_steps, numpy.float64)

    # First timestep: transition from the start (None) and emission
    for i, state in enumerate(self.label_dom):
        alpha[0, i] = self.transition_probability(state, None) * \
            self.emission_probability(sequence[0], state)
    # Normalize the column and remember what it was divided by
    total = sum(alpha[0, :])
    alpha[0, :] /= total
    scale[0] = total

    # Remaining timesteps
    for t in range(1, num_steps):
        previous = alpha[t - 1]
        for j, sj in enumerate(self.label_dom):
            # Probability of arriving in sj from any previous state
            arrival = 0.0
            for i, si in enumerate(self.label_dom):
                arrival = arrival + \
                    previous[i] * self.transition_probability(sj, si)
            # Multiply in the emission probability
            alpha[t, j] = arrival * \
                self.emission_probability(sequence[t], sj)
        # Normalize the column and remember what it was divided by
        total = sum(alpha[t, :])
        alpha[t, :] /= total
        scale[t] = total

    # The product of the per-timestep totals is the sequence
    # probability: add up their logs
    log_prob = 0.0
    for step_total in scale:
        log_prob = log_prob + logprob(step_total)
    return alpha, scale, log_prob
def normal_forward_probabilities(self, sequence):
    """
    Forward algorithm with per-timestep normalization, working with
    ordinary probabilities rather than logs (faster, since no logs need
    to be summed).

    @param sequence: the sequence of emissions
    @return: (matrix, normalizing vector, log prob). The matrix holds
        the normalized forward probabilities (timesteps by states), the
        vector holds the total probability each timestep was divided by,
        and the last value is the total log probability of the sequence

    """
    T = len(sequence)
    N = len(self.label_dom)
    forward = numpy.zeros((T, N), numpy.float64)
    normalizers = numpy.zeros(T, numpy.float64)

    # Fill the first column: probs of all states in the first timestep
    for i, state in enumerate(self.label_dom):
        start_prob = self.transition_probability(state, None)
        forward[0, i] = start_prob * \
            self.emission_probability(sequence[0], state)
    column_total = sum(forward[0, :])
    forward[0, :] /= column_total
    normalizers[0] = column_total

    # Fill the other columns from the one before
    for t in range(1, T):
        for j, sj in enumerate(self.label_dom):
            # Sum over previous states of (previous prob x transition)
            incoming = sum(
                (forward[t - 1, i] * self.transition_probability(sj, si)
                 for i, si in enumerate(self.label_dom)), 0.0)
            forward[t, j] = incoming * \
                self.emission_probability(sequence[t], sj)
        column_total = sum(forward[t, :])
        forward[t, :] /= column_total
        normalizers[t] = column_total

    # Whole-sequence log prob: the sum of the logs of the normalizers
    sequence_logprob = sum((logprob(value) for value in normalizers), 0.0)
    return forward, normalizers, sequence_logprob
def clear_cache(self):
    """
    Reset the probability caches and recompute the schema probability
    scalers. Call this whenever the distributions are changed or
    updated.

    """
    # Whole emission-state identity
    self._emission_cache = {}
    # Class-dependent emission identity
    self._emission_class_cache = {}
    # Whole transition identity
    self._transition_cache = {}

    # Total probability each schema assigns to illegal transitions,
    # which are treated as having probability 0
    illegal_prob = {}
    for label in self.schemata:
        illegal_prob[label] = 0.0
    for label0, label1 in self.illegal_transitions:
        illegal_prob[label0] += \
            self.schema_transition_dist[label0].prob(label1)

    # What the remaining probabilities need to be scaled by (as a log)
    scalers = {}
    self._schema_prob_scalers = scalers
    for label in self.schemata:
        legal_mass = 1.0 - illegal_prob[label]
        scalers[label] = -logprob(legal_mass)
def _apply_beam(self):
    """
    Applies a beam, using the already given threshold, to the set,
    pruning out any signs with a probability lower than the given ratio
    of the most probable sign. If C{self.maxsize} is non-zero and more
    than that many signs remain, only the C{maxsize} most probable signs
    are kept.

    Does nothing if the beam has already been applied
    (C{self._beamed}).

    """
    if self._beamed:
        return

    # Probabilities are logs, so the ratio threshold becomes an offset
    max_prob = self._max_probability()
    cutoff = max_prob + logprob(self.threshold)
    to_remove = [sign for sign in self.values()
                 if sign.probability < cutoff]
    for sign in to_remove:
        self.remove(sign)
    logger.debug("Beam removed %d signs (max %s, min %s)" %
                 (len(to_remove), max_prob, cutoff))

    # Beam is now applied: check the remaining size
    if self.maxsize != 0 and len(self) > self.maxsize:
        # Too many signs: apply a hard cutoff
        logger.debug("Hard beam removed %d signs" %
                     (len(self) - self.maxsize))
        # Most probable first, so that the slice beyond maxsize holds
        # the least probable signs (an ascending sort would throw away
        # the best ones)
        ranked = sorted(self.values(), key=lambda s: s.probability,
                        reverse=True)
        for sign in ranked[self.maxsize:]:
            self.remove(sign)

    # Don't apply the beam again until something changes
    self._beamed = True
def emission_log_probability(self, emission, state):
    """
    Log probability of an emission given a state.

    A single emission is a (root, label) pair defining a chord and the
    state is a (point, function) pair, where the point has four
    components. The emission is generated in two steps: a chord
    substitution given the function, then the label given the
    substitution and the function.

    As a special case, C{emission} may be a list of (prob, em) pairs,
    treated as a I{distribution} over emissions: the probabilities of
    the individual emissions are summed, weighted by the I{prob} values.

    @param emission: (root, label) pair or list of (prob, emission) pairs
    @param state: (point, function) pair
    @return: log probability of the emission

    """
    if type(emission) is list:
        # Weighted sum over the possible emissions, in the log domain
        weighted = [
            logprob(weight) + self.emission_log_probability(option, state)
            for (weight, option) in emission
        ]
        return sum_logs(weighted)

    # Single chord label
    point, function = state
    chord_root, label = emission
    # Only the last two components of the point are needed here
    _, _, x, y = point

    # Work out the chord substitution: the interval between the chord
    # root and the point's pitch class
    subst = (chord_root - coordinate_to_et_2d((x, y))) % 12
    # Generate the substitution given the chord function
    subst_prob = self.subst_emission_dist[function].logprob(subst)
    # Generate the label given the subst and chord function
    label_prob = self.type_emission_dist[(subst, function)].logprob(label)
    return subst_prob + label_prob
def __init__(self, *args, **kwargs):
    """
    Initialize the model as the superclass does, then precompute the
    uniform transition probability over C{self.label_dom}, both as a
    plain probability and as a log.

    """
    super(RaphstoHmmUnigram, self).__init__(*args, **kwargs)
    # Every label is equally likely as the target of a transition
    num_labels = len(self.label_dom)
    self._uniform_transition = 1.0 / num_labels
    self._uniform_transition_log = -logprob(num_labels)
def train(self, emissions, max_iterations=None,
          convergence_logprob=None, logger=None, processes=1,
          save=True, save_intermediate=False):
    """
    Performs unsupervised training using Baum-Welch EM.

    This is an instance method, because it is performed on a model
    that has already been initialized. You might, for example,
    create such a model using C{initialize_chord_types}.

    This is based on the training procedure in NLTK for HMMs:
    C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.

    Only the emission distribution is re-estimated. Every non-empty
    sequence in C{emissions} contributes to each iteration, whether the
    updates are computed in a process pool or serially.

    @type emissions: list of lists of emissions
    @param emissions: training data. Each element is a list of
        emissions representing a sequence in the training data.
        Each emission is an emission like those used for
        L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability},
        i.e. a list of note observations
    @type max_iterations: int
    @param max_iterations: maximum number of iterations to allow
        for EM (default 100). Overrides the corresponding
        module option
    @type convergence_logprob: float
    @param convergence_logprob: maximum change in log probability
        to consider convergence to have been reached (default 1e-3).
        Overrides the corresponding module option
    @type logger: logging.Logger
    @param logger: a logger to send progress logging to
    @type processes: int
    @param processes: number processes to spawn. A pool of this
        many processes will be used to compute distribution updates
        for sequences in parallel during each iteration. With a value
        of 1 (or a single sequence) no pool is created and the
        sequences are processed one after another
    @type save: bool
    @param save: save the model at the end of training
    @type save_intermediate: bool
    @param save_intermediate: save the model after each iteration.
        Implies C{save}

    """
    from . import raphsto_d
    if logger is None:
        from jazzparser.utils.loggers import create_dummy_logger
        logger = create_dummy_logger()

    if save_intermediate:
        save = True

    # No point in creating more processes than there are sequences
    if processes > len(emissions):
        processes = len(emissions)

    self.model.add_history("Beginning Baum-Welch unigram training on %s" %
                           get_host_info_string())
    self.model.add_history("Training on %d sequences (with %s chords)" %
                           (len(emissions),
                            ", ".join("%d" % len(seq) for seq in emissions)))

    # Use kwargs if given, otherwise module options
    if max_iterations is None:
        max_iterations = self.options['max_iterations']
    if convergence_logprob is None:
        convergence_logprob = self.options['convergence_logprob']

    # Enumerate the states
    state_ids = dict((state, num) for (num, state) in
                     enumerate(self.model.label_dom))

    # Enumerate the beat values (they're probably consecutive ints, but
    # let's not rely on it)
    beat_ids = dict((beat, num) for (num, beat) in
                    enumerate(self.model.beat_dom))
    num_beats = len(beat_ids)

    # Enumerate the d-values (d-function's domain)
    d_ids = dict((d, num) for (num, d) in
                 enumerate(self.model.emission_dist_dom))
    num_ds = len(d_ids)

    # Make a mutable distribution for the emission distribution we'll
    # be updating
    emission_mdist = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(self.model.emission_dist[s],
                                 self.model.emission_dist_dom))
             for s in self.model.emission_dist.conditions()))
    # Create dummy distributions to fill the places of the transition
    # distribution components
    key_mdist = DictionaryConditionalProbDist({})
    chord_mdist = DictionaryConditionalProbDist({})
    chord_uni_mdist = MutableProbDist({}, [])

    # Construct a model using these mutable distributions so we can
    # evaluate using them
    model = self.model_cls(key_mdist,
                           chord_mdist,
                           emission_mdist,
                           chord_uni_mdist,
                           chord_set=self.model.chord_set)

    iteration = 0
    last_logprob = None
    while iteration < max_iterations:
        logger.info("Beginning iteration %d" % iteration)
        current_logprob = 0.0

        # ems contains the new emission numerator probabilities
        # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
        #                  alpha(x_n).beta(x_n) /
        #                    Sum_{x'_n} (alpha(x'_n).beta(x'_n))
        ems = zeros((num_beats, num_ds), float64)
        # And these are the denominators
        ems_denom = zeros(num_beats, float64)

        def _training_callback(result):
            """
            Callback for the _sequence_updates processes that takes
            the updates from a single sequence and adds them onto
            the global update accumulators.

            """
            # _sequence_updates() returns all of this as a tuple
            (ems_local, ems_denom_local, seq_logprob) = result

            # Add these probabilities from this sequence to the
            # global matrices
            # Emission numerator
            array_add(ems, ems_local, ems)
            # Denominators
            array_add(ems_denom, ems_denom_local, ems_denom)
        ## End of _training_callback

        # Only use a process pool if more than one process is wanted
        if processes > 1:
            # Create a process pool to use for training
            logger.info("Creating a pool of %d processes" % processes)
            pool = Pool(processes=processes)

            async_results = []
            for seq_i, sequence in enumerate(emissions):
                logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                if len(sequence) == 0:
                    continue

                # Fire off a new call to the process pool for every
                # sequence
                async_results.append(
                    pool.apply_async(_sequence_updates_uni,
                                     (sequence, model,
                                      self.model.label_dom,
                                      state_ids,
                                      beat_ids, d_ids, raphsto_d),
                                     callback=_training_callback))
            pool.close()
            # Wait for all the workers to complete
            pool.join()

            # Call get() on every AsyncResult so that any exceptions in
            # workers get raised
            for res in async_results:
                # If there was an exception in _sequence_update, it
                # will get raised here
                res_tuple = res.get()
                # Add this sequence's logprob into the total for all
                # sequences
                current_logprob += res_tuple[2]
        else:
            # A single process must still cover every sequence: using
            # only emissions[0] here would silently discard the rest of
            # the training data (and fail on an empty data set)
            logger.info("Not using a process pool: "
                        "processing sequences serially")
            for seq_i, sequence in enumerate(emissions):
                if len(sequence) == 0:
                    continue
                logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                updates = _sequence_updates_uni(
                    sequence, model,
                    self.model.label_dom,
                    state_ids,
                    beat_ids, d_ids, raphsto_d)
                _training_callback(updates)
                # Add this sequence's logprob into the total
                current_logprob += updates[2]

        # Update the model's probabilities from the accumulated values
        for beat in self.model.beat_dom:
            denom = ems_denom[beat_ids[beat]]
            for d in self.model.emission_dist_dom:
                if denom == 0.0:
                    # Zero denominator: fall back to a uniform prob
                    prob = - logprob(len(d_ids))
                else:
                    prob = logprob(ems[beat_ids[beat]][d_ids[d]] + ADD_SMALL) \
                        - logprob(denom + len(d_ids)*ADD_SMALL)
                model.emission_dist[beat].update(d, prob)

        # Clear the model's cache so we get the new probabilities
        model.clear_cache()

        logger.info("Training data log prob: %s" % current_logprob)
        if last_logprob is not None and current_logprob < last_logprob:
            logger.error("Log probability dropped by %s" %
                         (last_logprob - current_logprob))
        if last_logprob is not None:
            logger.info("Log prob change: %s" %
                        (current_logprob - last_logprob))
        # Check whether the log probability has converged
        if iteration > 0 and \
                abs(current_logprob - last_logprob) < convergence_logprob:
            # Don't iterate any more
            logger.info("Distribution has converged: ceasing training")
            break

        iteration += 1
        last_logprob = current_logprob

        # Update the main model
        # Only save if we've been asked to save between iterations
        self.update_model(model, save=save_intermediate)

    self.model.add_history("Completed Baum-Welch unigram training")
    # Update the distribution's parameters with those we've trained
    self.update_model(model, save=save)
    return
def normal_forward_probabilities(self, sequence, seq_prob=False,
                                 decomposed=False):
    """
    Specialized, faster computation of the normalized forward
    probabilities using whole-array operations.

    @note: verified that this gets identical results to the superclass
    @param sequence: the sequence of emissions
    @param seq_prob: return the log probability of the whole
        sequence as well as the array (tuple of (array,logprob)).
    @param decomposed: if true, leave the array with its four
        dimensions (time, key, root, label) instead of flattening the
        last three into a single state dimension
    @return: Numpy array of forward probabilities. Unless C{decomposed}
        is set it is 2D: the first dimension represents timesteps, the
        second the states.

    """
    num_steps = len(sequence)
    chord_types = self.chord_types
    num_chord_types = len(chord_types)

    # Prepare the transition and emission matrices
    ems = self.get_small_emission_matrix(sequence)
    trans = self.get_small_transition_matrix()

    # The dims of the matrix are (time, key, root, label)
    forward_matrix = numpy.zeros((num_steps, 12, 12, num_chord_types),
                                 numpy.float64)
    # One log normalizer per timestep
    coefficients = numpy.zeros((num_steps,), numpy.float64)

    # First timestep: (None-padded) transition probabilities
    for root in range(12):
        for c, chord in enumerate(chord_types):
            for key in range(12):
                forward_matrix[0, key, root, c] = \
                    self.transition_probability((key, root, chord), None)
    # Multiply in the emission probabilities (relies on broadcasting)
    forward_matrix[0] = forward_matrix[0] * ems[0]
    # Normalize, recording the log of the total
    step_total = numpy.sum(forward_matrix[0])
    coefficients[0] = logprob(step_total)
    forward_matrix[0] /= step_total

    for time in range(1, num_steps):
        # Multiply in the transition matrix to get the new state probs
        # DIMS (per the original author's note):
        #  key, root, label, key[-1], root[-1], label[-1]
        trans_step = forward_matrix[time - 1] * trans
        # Sum over previous states: collapse the last axis three times
        for _ in range(3):
            trans_step = numpy.sum(trans_step, axis=-1)
        # Multiply in the emission probabilities (relies on broadcasting)
        forward_matrix[time] = trans_step * ems[time]
        # Normalize the timestep, recording the log of the total
        step_total = numpy.sum(forward_matrix[time])
        coefficients[time] = logprob(step_total)
        forward_matrix[time] /= step_total

    if not decomposed:
        # Flatten (key, root, label) into a single state dimension. The
        # dimensions are ordered in the same way as the components of
        # the labels, so a plain reshape is enough
        forward_matrix = forward_matrix.reshape(
            num_steps, 12 * 12 * num_chord_types)

    if seq_prob:
        return forward_matrix, numpy.sum(coefficients)
    return forward_matrix
def __init__(self, *args, **kwargs):
    """
    Set the model up through the superclass constructor, then cache the
    uniform transition probability (one over the number of labels in
    C{self.label_dom}) together with its log.

    """
    super(RaphstoHmmUnigram, self).__init__(*args, **kwargs)
    # Precompute the uniform transition probability
    label_count = len(self.label_dom)
    self._uniform_transition = 1.0 / label_count
    self._uniform_transition_log = -logprob(label_count)
def update_model(self, arrays, array_ids):
    """
    Replaces the distributions of the saved model with the
    probabilities taken from the arrays of updates. self.model is
    expected to be made up of mutable distributions when this is
    called.

    Every probability is smoothed by adding C{ADD_SMALL} to the
    numerator and C{ADD_SMALL} times the number of outcomes of the
    distribution to the denominator.

    @param arrays: 10-tuple of update arrays: the numerators
        (initial_keys, initial_chords, key_trans, chord_trans, ems)
        followed by the corresponding denominators
    @param array_ids: tuple (chord_ids, chord_type_ids) mapping chords
        (including C{None}) and chord types to array indices

    """
    (
        initial_keys,
        initial_chords,
        key_trans,
        chord_trans,
        ems,
        initial_keys_denom,
        initial_chords_denom,
        key_trans_denom,
        chord_trans_denom,
        ems_denom,
    ) = arrays
    chord_ids, chord_type_ids = array_ids

    num_chords = len(self.model.chord_dom)
    num_emissions = len(self.model.emission_dom)

    # Initial keys distribution
    # Only update this distribution if asked to: often we should leave it
    if self.options["initkey"]:
        for key in range(12):
            prob = logprob(initial_keys[key] + ADD_SMALL) - \
                logprob(initial_keys_denom[0] + ADD_SMALL * 12)
            self.model.initial_key_dist.update(key, prob)

    # Initial chords distribution
    for chord in self.model.chord_dom:
        chordi = chord_ids[chord]
        prob = logprob(initial_chords[chordi] + ADD_SMALL) - \
            logprob(initial_chords_denom[0] + ADD_SMALL * num_chords)
        self.model.initial_chord_dist.update(chord, prob)

    # Key transition distribution
    for key in range(12):
        prob = logprob(key_trans[key] + ADD_SMALL) - \
            logprob(key_trans_denom[0] + ADD_SMALL * 12)
        self.model.key_transition_dist.update(key, prob)

    # Chord transition distribution
    # The outcomes are every chord plus None, so the denominator must
    # receive one share of smoothing mass for each of them
    next_chords = self.model.chord_dom + [None]
    num_next_chords = num_chords + 1
    for chord0 in self.model.chord_dom:
        chordi = chord_ids[chord0]
        for chord1 in next_chords:
            chordj = chord_ids[chord1]
            prob = logprob(chord_trans[chordi][chordj] + ADD_SMALL) - \
                logprob(chord_trans_denom[chordi] +
                        ADD_SMALL * num_next_chords)
            self.model.chord_transition_dist[chord0].update(chord1, prob)

    # Emission distribution
    # The smoothing mass in the denominator is per emission outcome, not
    # per chord type.
    # NOTE(review): assumes emission_dom is exactly the 12 pitch classes
    # iterated over below -- confirm
    for label in self.model.chord_vocab:
        labeli = chord_type_ids[label]
        for pitch in range(12):
            prob = logprob(ems[labeli][pitch] + ADD_SMALL) - \
                logprob(ems_denom[labeli] + ADD_SMALL * num_emissions)
            self.model.emission_dist[label].update(pitch, prob)
def train_transition_distribution(self, inputs, grammar, contprob=0.3):
    """
    Train the transition distribution parameters in a supervised manner,
    using chord corpus input.

    This is used as an initialization step to set transition parameters
    before running EM on unannotated data.

    The transition model is factorized into a schema transition
    distribution (next label, or None for the end of the sequence, given
    the previous label) and a root transition distribution (root change
    mod 12, given the previous and next labels). Both are estimated from
    add-one smoothed counts. On completion C{self.schema_transition_dist},
    C{self.root_transition_dist} and C{self.initial_state_dist} are
    replaced and the cache is cleared.

    Empty training sequences contribute nothing to the counts.

    @type inputs: L{jazzparser.data.input.AnnotatedDbBulkInput}
    @param inputs: annotated chord training data
    @param grammar: grammar whose C{equiv_map} gives, for some
        categories, an equivalent target category and a root alteration
        to use in place of the annotated category
    @type contprob: float or string
    @param contprob: probability mass to reserve for staying on the same
        state (self transitions). Use special value 'learn' to learn
        the probabilities from the durations

    """
    self.add_history(
        "Training transition probabilities using %d annotated chord "\
        "sequences" % len(inputs))
    learn_cont = contprob == "learn"

    # Prepare the label sequences that we'll train on
    if learn_cont:
        # Repeat each (chord, category) pair once per unit of duration,
        # so that self-transitions show up in the counts
        sequences = [
            [(chord, cat)
                for chord, cat in zip(seq, seq.categories)
                for _ in range(chord.duration)]
            for seq in inputs]
    else:
        sequences = [list(zip(seq, seq.categories)) for seq in inputs]

    # Prepare a list of transformations to apply to the categories
    label_transform = {}
    # First include all the categories we want to keep as they were
    for schema in self.schemata:
        label_transform[schema] = (schema, 0)
    # Then include any transformations the grammar defines
    for pos, mapping in grammar.equiv_map.items():
        label_transform[pos] = (mapping.target.pos, mapping.root)

    # Apply the transformation to all the training data
    training_samples = []
    for chord_cats in sequences:
        seq_samples = []
        for chord, cat in chord_cats:
            # Transform the label if it has a transformation
            use_cat, alter_root = label_transform.get(cat, (cat, 0))
            root = (chord.root + alter_root) % 12
            seq_samples.append((str(use_cat), root))
        training_samples.append(seq_samples)

    # One (label, next label, root change) triple per adjacent pair
    training_data = [
        (cat0, cat1, (root1 - root0) % 12)
        for seq_samples in training_samples
        for ((cat0, root0), (cat1, root1)) in group_pairs(seq_samples)]

    # Count up the observations
    schema_transition_counts = ConditionalFreqDist()
    root_transition_counts = ConditionalFreqDist()
    for (label0, label1, root_change) in training_data:
        # Only use counts for categories the model's looking for
        if label0 in self.schemata and label1 in self.schemata:
            schema_transition_counts[label0].inc(label1)
            root_transition_counts[(label0, label1)].inc(root_change)

    # Transition probability to final state (end of sequence)
    for sequence in training_samples:
        # Nothing to count for an empty sequence
        if sequence:
            # Inc the count of going from the label the sequence ends on
            # to the final state
            schema_transition_counts[sequence[-1][0]].inc(None)

    # Use Laplace (plus one) smoothing
    # We don't use the laplace_estimator because we want the conversion
    # to a dict prob dist to get all the labels, not just to discount
    # the ones it's seen
    for label0 in self.schemata:
        for label1 in self.schemata:
            for root_change in range(12):
                # Exclude self-transition for now, unless we're learning it
                if learn_cont or not (label0 == label1 and root_change == 0):
                    schema_transition_counts[label0].inc(label1)
                    root_transition_counts[(label0, label1)].inc(root_change)
        # We don't add a count for going to the final state: we don't
        # want to initialize it with too much weight

    # Estimate distribution from this frequency distribution
    schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
        ConditionalProbDist(schema_transition_counts, mle_estimator, None), \
        mutable=True, samples=self.schemata+[None])
    root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
        ConditionalProbDist(root_transition_counts, mle_estimator, None), \
        mutable=True, samples=range(12))

    if not learn_cont:
        # Give the requested probability mass to self-transitions and
        # discount everything else to make room for it
        self._reserve_self_transition_mass(
            schema_trans_dist, root_trans_dist, contprob)

    # Recreate the dict prob dist so it's not mutable any more
    schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(schema_trans_dist)
    root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(root_trans_dist)

    ## Now for the initial distribution
    # Count up the observations
    initial_counts = FreqDist()
    for sequence in training_samples:
        if sequence:
            initial_counts.inc(sequence[0][0])
    # Estimate distribution from this frequency distribution
    # (no smoothing is applied to the initial counts)
    initial_dist = prob_dist_to_dictionary_prob_dist(\
        mle_estimator(initial_counts, None), samples=self.schemata)

    # Replace the model's transition distributions
    self.schema_transition_dist = schema_trans_dist
    self.root_transition_dist = root_trans_dist
    self.initial_state_dist = initial_dist
    # Invalidate the cache
    self.clear_cache()

def _reserve_self_transition_mass(self, schema_trans_dist, root_trans_dist, contprob):
    """
    Rescales the mutable transition distributions in place so that, for
    every label in C{self.schemata}, the probability of a self-transition
    (same label, root change 0) is exactly C{contprob} and every other
    transition (including that to the final state, None) keeps its share
    of the remaining C{1 - contprob}.

    A self-transition's probability is the product of the schema
    transition probability label->label and the root transition
    probability of a root change of 0 given (label, label), so both
    factors have to be adjusted together to keep each distribution
    summing to 1.

    Assumes the distributions provide NLTK's C{prob} and C{logprob}
    lookups as well as C{update(sample, logprob)} - TODO confirm.

    @param schema_trans_dist: mutable conditional distribution over
        next labels (and None), conditioned on the previous label
    @param root_trans_dist: mutable conditional distribution over root
        changes, conditioned on (previous label, next label)
    @type contprob: float
    @param contprob: probability to give to each self-transition

    """
    log_cont = logprob(contprob)
    for label in self.schemata:
        schema_dist = schema_trans_dist[label]
        same_label_roots = root_trans_dist[(label, label)]

        # Probability of keeping the label, and of keeping the root
        # given that the label was kept, as estimated from the counts
        stay = schema_dist.prob(label)
        hold = same_label_roots.prob(0)
        log_stay = schema_dist.logprob(label)
        # Every non-self transition shrinks by the same factor, so that
        # together they take up exactly 1 - contprob
        # The smoothing counts guarantee that stay*hold is below 1
        log_scale = logprob((1.0 - contprob) / (1.0 - stay * hold))
        # New probability of keeping the label: the self-transition
        # plus the discounted same-label transitions that move the root
        log_new_stay = logprob(
            contprob + (1.0 - contprob) * stay * (1.0 - hold) / \
                (1.0 - stay * hold))

        # Root changes given that the label is kept: these are
        # renormalized by the new probability of keeping the label
        for root_change in range(1, 12):
            same_label_roots.update(root_change, \
                log_scale + log_stay + \
                same_label_roots.logprob(root_change) - log_new_stay)
        same_label_roots.update(0, log_cont - log_new_stay)

        # Discount all transitions to other labels and to the final
        # state. Their root distributions don't need to change
        for other in self.schemata + [None]:
            if other != label:
                schema_dist.update(other, \
                    schema_dist.logprob(other) + log_scale)
        schema_dist.update(label, log_new_stay)
def normal_forward_probabilities(self, sequence, array=False):
    """
    If you want the normalized matrix of forward probabilities, it's ok
    to use normal (non-log) probabilities and these can be computed more
    quickly, since you don't need to sum logs (which is time consuming).

    Returns the matrix, and also the vector of values that each timestep
    was divided by to normalize (i.e. total probability of each timestep
    over all states).
    Also returns the total log probability of the sequence.

    The state-to-state transition probabilities do not depend on the
    timestep, so they are looked up once and reused for every timestep.
    For an empty sequence, the matrix and normalizing vector are empty
    and the log probability is 0.0.

    @type sequence: sequence of observations
    @param sequence: the observations to compute the probabilities for
    @type array: bool
    @param array: if True, returns a numpy 2d array instead of a list
        of dicts.
    @return: (matrix,normalizing vector,log prob)

    """
    labels = self.label_dom
    T = len(sequence)
    N = len(labels)
    alpha = numpy.zeros((T, N), numpy.float64)
    scale = numpy.zeros(T, numpy.float64)

    if T > 1:
        # transitions[j,i] is the probability of moving to state j from
        # state i. Only needed if there's more than one timestep
        transitions = numpy.array(
            [[self.transition_probability(sj, si) for si in labels] \
                for sj in labels], numpy.float64).reshape(N, N)

    for t in range(T):
        if t == 0:
            # First timestep: probability of starting in each state
            incoming = numpy.array(
                [self.transition_probability(state, None) \
                    for state in labels], numpy.float64)
        else:
            # Multiply each previous state's prob by the transition prob
            # to each state and sum them all together
            incoming = numpy.dot(transitions, alpha[t-1, :])
        # Also multiply this by the emission probability
        emissions = numpy.array(
            [self.emission_probability(sequence[t], state) \
                for state in labels], numpy.float64)
        alpha[t, :] = incoming * emissions
        # Normalize by dividing all values by the total probability
        total = array_sum(alpha[t, :])
        alpha[t, :] /= total
        scale[t] = total

    # Multiply together the probability of each timestep to get the whole
    # probability of the sequence
    # This gets the same result as if we did:
    #  alpha = model.forward_log_probabilities(sequence, normalize=False, array=True)
    #  log_prob = sum_logs(alpha[T-1,:])
    log_prob = sum((logprob(total) for total in scale), 0.0)

    if array:
        return alpha, scale, log_prob
    # Convert this into a list of dicts, one per timestep, keyed by label
    matrix = [dict(zip(labels, alpha[t, :])) for t in range(T)]
    return matrix, scale, log_prob