Exemplo n.º 1
0
 def transition_log_probability(self, *states):
     """
     Gives the log probability of a transition to C{states[0]}, given the 
     states it is conditioned on, C{states[1:]}.
     
     Each state is a (point,function) pair. C{None} may be given in place 
     of a state: in first position it stands for the final state; if all 
     of the conditioning states are C{None}, the transition is treated as 
     one into an initial state.
     
     @param states: the state transitioned to, followed by the states it 
         is conditioned on
     @return: log probability of the transition
     
     """
     # A missing state has neither a point nor a function
     states = [s if s is not None else (None, None) for s in states]
     
     if self.order == 1:
         # Unigram model: transitions are all equiprobable
         return -logprob(len(self.label_dom))
     
     points, functions = zip(*states)
     
     if points[0] is None:
         # Final state: the probability comes from the function transition 
         #  distribution alone.
         # It must be conditioned on the preceding functions, exactly as 
         #  for any other transition (see below). functions[0] is always 
         #  None in this branch, so using it as the context would ignore 
         #  the history altogether
         fn_context = tuple(functions[1:])
         return self.fn_transition_dist[fn_context].logprob(None)
     
     if all(p is None for p in points[1:]):
         # Initial states: all points equiprobable
         # Only permit points in the (0,0) enharmonic space
         if points[0][0] != 0 or points[0][1] != 0:
             return float('-inf')
         # Get fn prob from initial dist (the one with an empty context)
         return self.fn_transition_dist[tuple()].logprob(functions[0]) \
                     - logprob(12)
     
     # The function is conditioned on all previous functions
     fn_context = tuple(functions[1:])
     fn_prob = self.fn_transition_dist[fn_context].logprob(functions[0])
     
     # The vector from the previous point to the new one is conditioned 
     #  on the function of the new state
     vect = vector(points[1], points[0])
     vector_prob = self.point_transition_dist[functions[0]].logprob(vect)
     
     # These are logs: adding them multiplies together the vector and 
     #  function probs
     return vector_prob + fn_prob
Exemplo n.º 2
0
 def prob_adder(start, end, signtup, words):
     """
     Computes the lexical probability of a sign and stores the resulting 
     inside probability and overall probability on the sign.
     
     C{self} is not a parameter: it is taken from the surrounding scope.
     
     @param start: start of the span covered by the sign
     @param end: end of the span covered by the sign
     @type signtup: tuple
     @param signtup: (sign, tag, tag probability) triple
     @param words: the word the sign covers (a string), or a list of 
         words, of which only the first is used
     @raise ParseError: if the probability has to come from the model, 
         but C{words} is neither a list nor a string
     
     """
     sign, tag, tag_prob = signtup
     if self.use_tagger_probs:
         # Use the tagger to get lexical probabilities
         lex_prob = self.tagger.lexical_probability(start, end, tag)
     else:
         # We might get multiple words here: use the first
         # This is not really a satisfactory solution: better would be 
         #  to get the tagger to tell us which word to use
         if isinstance(words, list):
             word = words[0]
         elif isinstance(words, basestring):
             word = words
         else:
             # The word isn't a string, so we probably shouldn't be 
             #  trying to get a probability for it.
             # The exception is raised in call form, which is valid in 
             #  both Python 2 and Python 3
             raise ParseError("PCFG model is trying to assign lexical "\
                 "probabilities to words, but the words aren't strings. "\
                 "Maybe you should have disabled lexical probs wtih "\
                 "parser option 'nolex'")
         # Consult the model to get the lexical probability of this sign
         lex_prob = self.model.inside_probability('leaf', sign, word)
     # Triangular number: nodes in the tree for multiword categories
     # This has the effect of penalizing multiword categories 
     #  proportionally to the number of tree nodes deriving the 
     #  categories they're competing with derived from single-word cats
     tree_size = comb(end - start + 1, 2)
     lex_prob = lex_prob ** tree_size
     # Add the probabilities to the category
     sign.inside_probability = logprob(lex_prob)
     sign.probability = logprob(self.model.outside_probability(sign)) \
                         + sign.inside_probability
Exemplo n.º 3
0
    def update_model(self, arrays, array_ids):
        """
        Replaces the distributions of the saved model with the probabilities 
        taken from the arrays of updates. self.model is expected to be 
        made up of mutable distributions when this is called.
        
        Every probability is smoothed by adding C{ADD_SMALL} to its 
        numerator and, once per possible outcome, to its denominator.
        
        @type arrays: tuple
        @param arrays: (trans, ems, trans_denom, ems_denom). C{trans} is 
            indexed [state][next state] and C{ems} [state][emission]; the 
            denominators are indexed by state
        @type array_ids: tuple
        @param array_ids: (state_ids, em_ids): mappings from states and 
            from emissions to their indices in the arrays
        
        """
        trans, ems, trans_denom, ems_denom = arrays
        state_ids, em_ids = array_ids
        num_states = len(self.model.label_dom)
        num_emissions = len(self.model.emission_dom)

        for state in self.model.label_dom:
            state_i = state_ids[state]
            # The smoothed denominators depend only on the state we're 
            #  coming from, so take their logs once per state instead of 
            #  once per outcome
            log_trans_denom = logprob(
                        trans_denom[state_i] + num_states*ADD_SMALL)
            log_ems_denom = logprob(
                        ems_denom[state_i] + num_emissions*ADD_SMALL)

            for next_state in self.model.label_dom:
                state_j = state_ids[next_state]
                # Update the probability of this transition
                prob = logprob(trans[state_i][state_j] + ADD_SMALL) - \
                        log_trans_denom
                self.model.label_dist[(state, )].update(next_state, prob)

            for emission in self.model.emission_dom:
                # Update the probability of this emission
                prob = logprob(ems[state_i][em_ids[emission]] + ADD_SMALL) - \
                        log_ems_denom
                self.model.emission_dist[state].update(emission, prob)
Exemplo n.º 4
0
 def update_model(self, arrays, array_ids):
     """
     Replaces the distributions of the saved model with the probabilities 
     taken from the arrays of updates. self.model is expected to be 
     made up of mutable distributions when this is called.
     
     Every probability is smoothed by adding C{ADD_SMALL} to its 
     numerator and, once per possible outcome, to its denominator.
     
     @type arrays: tuple
     @param arrays: (trans, ems, trans_denom, ems_denom). C{trans} is 
         indexed [state][next state] and C{ems} [state][emission]; the 
         denominators are indexed by state
     @type array_ids: tuple
     @param array_ids: (state_ids, em_ids): mappings from states and 
         from emissions to their indices in the arrays
     
     """
     trans, ems, trans_denom, ems_denom = arrays
     state_ids, em_ids = array_ids
     num_states = len(self.model.label_dom)
     num_emissions = len(self.model.emission_dom)
     
     for state in self.model.label_dom:
         state_i = state_ids[state]
         # The smoothed denominators depend only on the state we're 
         #  coming from, so take their logs once per state instead of 
         #  once per outcome
         log_trans_denom = logprob(
                     trans_denom[state_i] + num_states*ADD_SMALL)
         log_ems_denom = logprob(
                     ems_denom[state_i] + num_emissions*ADD_SMALL)
         
         for next_state in self.model.label_dom:
             state_j = state_ids[next_state]
             # Update the probability of this transition
             prob = logprob(trans[state_i][state_j] + ADD_SMALL) - \
                     log_trans_denom
             self.model.label_dist[(state,)].update(next_state, prob)
         
         for emission in self.model.emission_dom:
             # Update the probability of this emission
             prob = logprob(ems[state_i][em_ids[emission]] + ADD_SMALL) - \
                     log_ems_denom
             self.model.emission_dist[state].update(emission, prob)
Exemplo n.º 5
0
 def prob_adder(start, end, signtup, words):
     """
     Computes the lexical probability of a sign and stores the resulting 
     inside probability and overall probability on the sign.
     
     C{self} is not a parameter: it is taken from the surrounding scope.
     
     @param start: start of the span covered by the sign
     @param end: end of the span covered by the sign
     @type signtup: tuple
     @param signtup: (sign, tag, tag probability) triple
     @param words: the word the sign covers (a string), or a list of 
         words, of which only the first is used
     @raise ParseError: if the probability has to come from the model, 
         but C{words} is neither a list nor a string
     
     """
     sign, tag, tag_prob = signtup
     if self.use_tagger_probs:
         # Use the tagger to get lexical probabilities
         lex_prob = self.tagger.lexical_probability(start, end, tag)
     else:
         # We might get multiple words here: use the first
         # This is not really a satisfactory solution: better would be
         #  to get the tagger to tell us which word to use
         if isinstance(words, list):
             word = words[0]
         elif isinstance(words, basestring):
             word = words
         else:
             # The word isn't a string, so we probably shouldn't be
             #  trying to get a probability for it.
             # The exception is raised in call form, which is valid in
             #  both Python 2 and Python 3
             raise ParseError("PCFG model is trying to assign lexical "\
                 "probabilities to words, but the words aren't strings. "\
                 "Maybe you should have disabled lexical probs wtih "\
                 "parser option 'nolex'")
         # Consult the model to get the lexical probability of this sign
         lex_prob = self.model.inside_probability('leaf', sign, word)
     # Triangular number: nodes in the tree for multiword categories
     # This has the effect of penalizing multiword categories
     #  proportionally to the number of tree nodes deriving the
     #  categories they're competing with derived from single-word cats
     tree_size = comb(end - start + 1, 2)
     lex_prob = lex_prob**tree_size
     # Add the probabilities to the category
     sign.inside_probability = logprob(lex_prob)
     sign.probability = logprob(self.model.outside_probability(sign)) \
                         + sign.inside_probability
Exemplo n.º 6
0
 def _res_mod(result, sign):
     """
     Adds probabilities to a result produced from the input C{sign} by a 
     unary expansion.
     
     C{model} is not a parameter: it is defined outside this function.
     
     @param result: the result to set C{inside_probability} and 
         C{probability} on
     @param sign: the input sign that the result was derived from
     
     """
     # Use the model to get the probabilities
     # Inside prob: the 'unary' expansion prob combined with the inside 
     #  prob of the input (these are logs, so they're added)
     inside_prob = logprob(model.inside_probability(
                                         'unary', result, sign)) + \
                                 sign.inside_probability
     outside_prob = logprob(model.outside_probability(result))
     result.inside_probability = inside_prob
     # This must use the local inside_prob computed above: there is no 
     #  variable called inside_probability in this function
     result.probability = outside_prob + inside_prob
Exemplo n.º 7
0
    def transition_log_probability(self, state, *previous_states):
        """
        Gives the log probability of a transition to C{state}, given the 
        states it is conditioned on.
        
        States are (root,schema) pairs. C{state} may be C{None}, standing 
        for the final state. Any of the previous states may be C{None}, 
        in which case it is treated as having no root and no schema.
        
        @param state: the state transitioned to
        @param previous_states: the states the transition is conditioned on
        @return: sum of the schema transition log probability and the 
            root transition log probability
        
        """
        # Missing states have neither a root nor a schema
        history = [
            (None, None) if prev is None else prev for prev in previous_states
        ]
        unigram = self.order == 1
        if unigram:
            prev_roots, prev_schemata = [], ()
        else:
            prev_roots, prev_schemata = zip(*history)
            prev_schemata = tuple(prev_schemata)

        # The schema part is computed by its own method
        schema_logprob = self.schema_transition_log_probability(
            state, *history)

        # Now the root part
        if unigram or all(schema is None for schema in prev_schemata):
            # Unigram model, or nothing to condition on: the 12 roots are
            #  all equiprobable
            root_logprob = -logprob(12)
        elif state is None:
            # Final state: there's no root to move to, so everything comes
            #  from the schema distribution
            root_logprob = 0
        else:
            # Root movement relative to the first of the previous states,
            #  conditioned on that state's schema
            interval = (state[0] - prev_roots[0]) % 12
            root_dist = self.root_transition_dist[prev_schemata[0]]
            root_logprob = root_dist.logprob(interval)

        # Adding the logs multiplies the two probabilities
        return root_logprob + schema_logprob
Exemplo n.º 8
0
 def transition_log_probability(self, state, *previous_states):
     """
     Gives the log probability of a transition to C{state}, given the 
     states it is conditioned on.
     
     States are (root,schema) pairs. C{state} may be C{None}, standing 
     for the final state. Any of the previous states may be C{None}, 
     in which case it is treated as having no root and no schema.
     
     @param state: the state transitioned to
     @param previous_states: the states the transition is conditioned on
     @return: sum of the schema transition log probability and the 
         root transition log probability
     
     """
     # Missing states have neither a root nor a schema
     history = [
         (None, None) if prev is None else prev for prev in previous_states
     ]
     unigram = self.order == 1
     if unigram:
         prev_roots, prev_schemata = [], ()
     else:
         prev_roots, prev_schemata = zip(*history)
         prev_schemata = tuple(prev_schemata)
     
     # The schema part is computed by its own method
     schema_logprob = self.schema_transition_log_probability(state, *history)
     
     # Now the root part
     if unigram or all(schema is None for schema in prev_schemata):
         # Unigram model, or nothing to condition on: the 12 roots are 
         #  all equiprobable
         root_logprob = -logprob(12)
     elif state is None:
         # Final state: there's no root to move to, so everything comes 
         #  from the schema distribution
         root_logprob = 0
     else:
         # Root movement relative to the first of the previous states, 
         #  conditioned on that state's schema
         interval = (state[0] - prev_roots[0]) % 12
         root_dist = self.root_transition_dist[prev_schemata[0]]
         root_logprob = root_dist.logprob(interval)
     
     # Adding the logs multiplies the two probabilities
     return root_logprob + schema_logprob
Exemplo n.º 9
0
 def emission_log_probability(self, emission, state):
     """
     Gives the log probability of an emission given a state, 
     P(emission | state).
     
     An emission is normally a (root,label) pair defining a chord, and 
     the state a (root,schema) pair.
     
     Alternatively, the emission may be a list (exactly a C{list}, not a 
     subclass), which is taken to be a I{distribution} over emissions. 
     It should contain (prob,em) pairs, I{em} being an emission of the 
     normal kind and I{prob} the weight given to it. The result is then 
     the weighted sum of the probabilities of the possible emissions.
     
     @param emission: (root,label) pair, or list of (prob,em) pairs
     @param state: (root,schema) pair
     @return: log probability of the emission
     
     """
     if type(emission) is list:
         # Distribution over emissions: weight the probability of each 
         #  possibility and sum them
         weighted = [
             logprob(weight) + self.emission_log_probability(candidate, state)
             for (weight, candidate) in emission
         ]
         return sum_logs(weighted)
     
     # Single chord label
     state_root, schema = state
     chord_root, label = emission
     if state_root == chord_root:
         # The label depends only on the state's schema
         return self.emission_dist[schema].logprob(label)
     # A state can't emit a chord with a different root
     return float('-inf')
Exemplo n.º 10
0
 def _get_transition_backoff_scaler(self, context):
     # This is just for the schema distribution
     if context not in self._discount_cache:
         # The prob mass reserved for unseen events can be computed by 
         #  summing probabilities over all seen events and subtracting 
         #  from 1.
         # Our discounting model distributes this probability evenly over 
         #  the unseen events, so we can compute the discounted mass by 
         #  getting the probability of one unseen event and multiplying it.
         seen_labels = set([lab for lab in self.schemata+[None] if 
                                     self.schema_transition_counts[context][lab] > 0])
         if len(seen_labels) == 0:
             # Not seen anything in this context. All mass is discounted!
             self._discount_cache[context] = 0.0
         else:
             unseen_labels = set(self.schemata+[None]) - seen_labels
             # Try getting some event that won't have been seen
             # Compute how much mass is reserved for unseen events
             discounted_mass = self.schema_transition_dist[context].prob(
                                                 "%%% UNSEEN LABEL %%%") \
                                             * len(unseen_labels)
             # Compute how much probability the n-1 order model assigns to 
             #  things unseen by this model
             backoff_context = context[:-1]
             backoff_seen_mass = sum_logs([
                 self.backoff_model.schema_transition_log_probability_schemata(lab, 
                                                     *backoff_context) 
                                                 for lab in unseen_labels])
             self._discount_cache[context] = logprob(discounted_mass) - \
                                                     backoff_seen_mass
     return self._discount_cache[context]
Exemplo n.º 11
0
    def emission_log_probability(self, emission, state):
        """
        Gives the log probability of an emission given a state, 
        P(emission | state).
        
        An emission is normally a (root,label) pair defining a chord, and 
        the state a (root,schema) pair.
        
        Alternatively, the emission may be a list (exactly a C{list}, not a 
        subclass), which is taken to be a I{distribution} over emissions. 
        It should contain (prob,em) pairs, I{em} being an emission of the 
        normal kind and I{prob} the weight given to it. The result is then 
        the weighted sum of the probabilities of the possible emissions.
        
        @param emission: (root,label) pair, or list of (prob,em) pairs
        @param state: (root,schema) pair
        @return: log probability of the emission
        
        """
        if type(emission) is list:
            # Distribution over emissions: weight the probability of each
            #  possibility and sum them
            weighted = [
                logprob(weight) + \
                    self.emission_log_probability(candidate, state)
                for (weight, candidate) in emission
            ]
            return sum_logs(weighted)

        # Single chord label
        state_root, schema = state
        chord_root, label = emission
        if state_root == chord_root:
            # The label depends only on the state's schema
            return self.emission_dist[schema].logprob(label)
        # A state can't emit a chord with a different root
        return float('-inf')
Exemplo n.º 12
0
 def _get_transition_backoff_scaler(self, context):
     # This is just for the schema distribution
     if context not in self._discount_cache:
         # The prob mass reserved for unseen events can be computed by
         #  summing probabilities over all seen events and subtracting
         #  from 1.
         # Our discounting model distributes this probability evenly over
         #  the unseen events, so we can compute the discounted mass by
         #  getting the probability of one unseen event and multiplying it.
         seen_labels = set([
             lab for lab in self.schemata + [None]
             if self.schema_transition_counts[context][lab] > 0
         ])
         if len(seen_labels) == 0:
             # Not seen anything in this context. All mass is discounted!
             self._discount_cache[context] = 0.0
         else:
             unseen_labels = set(self.schemata + [None]) - seen_labels
             # Try getting some event that won't have been seen
             # Compute how much mass is reserved for unseen events
             discounted_mass = self.schema_transition_dist[context].prob(
                                                 "%%% UNSEEN LABEL %%%") \
                                             * len(unseen_labels)
             # Compute how much probability the n-1 order model assigns to
             #  things unseen by this model
             backoff_context = context[:-1]
             backoff_seen_mass = sum_logs([
                 self.backoff_model.
                 schema_transition_log_probability_schemata(
                     lab, *backoff_context) for lab in unseen_labels
             ])
             self._discount_cache[context] = logprob(discounted_mass) - \
                                                     backoff_seen_mass
     return self._discount_cache[context]
Exemplo n.º 13
0
 def _binary_expansion_probability(self, sign_pair, result):
     """
     Computes the probabilities for the result of a binary expansion. 
     Shared by L{_apply_binary_rule} and L{_apply_binary_rule_semantics}, 
     which both need exactly this computation.
     
     @type sign_pair: pair
     @param sign_pair: the (left, right) daughter signs
     @param result: the parent produced from the two daughters
     @return: tuple of the probability and the inside probability (both 
         logs)
     
     """
     left_daughter, right_daughter = sign_pair
     model = self.model
     # The expansion is always looked up in the model as type 'right'
     expansion_logprob = logprob(model.inside_probability(
                         'right', result, left_daughter, right_daughter))
     outside_logprob = logprob(model.outside_probability(result))
     # The inside prob also takes in the inside probs of both daughters 
     #  (added, since these are logs)
     inside_logprob = expansion_logprob + \
                         left_daughter.inside_probability + \
                         right_daughter.inside_probability
     return (inside_logprob + outside_logprob, inside_logprob)
Exemplo n.º 14
0
    def normal_forward_probabilities(self, sequence):
        """
        Computes the normalized matrix of forward probabilities using 
        normal (non-log) probabilities. That's fine when every timestep is 
        normalized, and it's quicker, because no logs need to be summed 
        (which is time consuming).
        
        As well as the matrix, the result includes the vector of values 
        that each timestep was divided by to normalize it (i.e. the total 
        probability of the timestep over all states) and the total log 
        probability of the sequence.
        
        @param sequence: the emissions, one for each timestep
        @return: (matrix,normalizing vector,log prob)
        
        """
        num_steps = len(sequence)
        num_states = len(self.label_dom)
        alpha = numpy.zeros((num_steps, num_states), numpy.float64)
        scale = numpy.zeros(num_steps, numpy.float64)

        # First timestep: each state's probability of starting the sequence
        #  and emitting the first emission
        for i, state in enumerate(self.label_dom):
            start_prob = self.transition_probability(state, None)
            alpha[0, i] = start_prob * \
                            self.emission_probability(sequence[0], state)
        # Normalize the timestep, keeping hold of what we divided by
        total = sum(alpha[0, :])
        alpha[0, :] /= total
        scale[0] = total

        # The remaining timesteps
        for t in range(1, num_steps):
            for j, target in enumerate(self.label_dom):
                # Sum over the previous states: the state's (normalized)
                #  forward prob times the transition prob into this state
                incoming = 0.0
                for i, source in enumerate(self.label_dom):
                    incoming += alpha[t-1, i] * \
                                self.transition_probability(target, source)
                # Then multiply in the emission probability
                alpha[t, j] = incoming * \
                                self.emission_probability(sequence[t], target)
            # Normalize the timestep, keeping hold of what we divided by
            total = sum(alpha[t, :])
            alpha[t, :] /= total
            scale[t] = total

        # The probability of the whole sequence is the product of the
        #  timestep totals: a sum in the log domain
        sequence_logprob = 0.0
        for step_total in scale:
            sequence_logprob += logprob(step_total)
        return alpha, scale, sequence_logprob
Exemplo n.º 15
0
 def normal_forward_probabilities(self, sequence):
     """
     Computes the normalized matrix of forward probabilities using 
     normal (non-log) probabilities. That's fine when every timestep is 
     normalized, and it's quicker, because no logs need to be summed 
     (which is time consuming).
     
     As well as the matrix, the result includes the vector of values 
     that each timestep was divided by to normalize it (i.e. the total 
     probability of the timestep over all states) and the total log 
     probability of the sequence.
     
     @param sequence: the emissions, one for each timestep
     @return: (matrix,normalizing vector,log prob)
     
     """
     num_steps = len(sequence)
     num_states = len(self.label_dom)
     alpha = numpy.zeros((num_steps, num_states), numpy.float64)
     scale = numpy.zeros(num_steps, numpy.float64)
     
     # First timestep: each state's probability of starting the sequence 
     #  and emitting the first emission
     for i,state in enumerate(self.label_dom):
         start_prob = self.transition_probability(state, None)
         alpha[0,i] = start_prob * \
                         self.emission_probability(sequence[0], state)
     # Normalize the timestep, keeping hold of what we divided by
     total = sum(alpha[0,:])
     alpha[0,:] /= total
     scale[0] = total
     
     # The remaining timesteps
     for t in range(1, num_steps):
         for j,target in enumerate(self.label_dom):
             # Sum over the previous states: the state's (normalized) 
             #  forward prob times the transition prob into this state
             incoming = 0.0
             for i,source in enumerate(self.label_dom):
                 incoming += alpha[t-1, i] * \
                             self.transition_probability(target, source)
             # Then multiply in the emission probability
             alpha[t, j] = incoming * \
                             self.emission_probability(sequence[t], target)
         # Normalize the timestep, keeping hold of what we divided by
         total = sum(alpha[t,:])
         alpha[t,:] /= total
         scale[t] = total
     
     # The probability of the whole sequence is the product of the 
     #  timestep totals: a sum in the log domain
     sequence_logprob = 0.0
     for step_total in scale:
         sequence_logprob += logprob(step_total)
     return alpha,scale,sequence_logprob
Exemplo n.º 16
0
 def clear_cache(self):
     """
     Resets the probability caches to empty (creating them if they don't 
     exist yet) and recomputes the schema probability scalers.
     
     This needs to be called whenever the distributions are changed or 
     updated.
     
     """
     # Caches keyed on: whole emission-state identity, class-dependent 
     #  emission identity and whole transition identity
     self._emission_cache = {}
     self._emission_class_cache = {}
     self._transition_cache = {}
     
     # Add up, for each schema, the probability its distribution gives 
     #  to illegal transitions, which will be treated as 0
     illegal_mass = dict((label, 0.0) for label in self.schemata)
     for source, target in self.illegal_transitions:
         illegal_mass[source] += \
                     self.schema_transition_dist[source].prob(target)
     
     # From that, work out what the other probabilities have to be 
     #  scaled by (as a log)
     scalers = self._schema_prob_scalers = {}
     for label in self.schemata:
         scalers[label] = -logprob(1.0 - illegal_mass[label])
Exemplo n.º 17
0
 def _apply_beam(self):
     """
     Applies a beam, using the already given threshold, to the set,
     pruning out any signs with a probability lower than the 
     given ratio of the most probable sign.
     
     If C{maxsize} is non-zero and more than C{maxsize} signs survive 
     the beam, a hard cutoff is applied as well, keeping only the 
     C{maxsize} most probable signs.
     
     Does nothing if the beam has already been applied (C{_beamed} is 
     set).
     
     """
     if self._beamed:
         return
     
     max_prob = self._max_probability()
     # Sign probabilities are combined with logprob values by addition, 
     #  so the threshold ratio is added on as a log
     cutoff = max_prob + logprob(self.threshold)
     # Collect first, then remove, so that we're not removing from the 
     #  set while going through its values
     to_remove = [sign for sign in self.values() if sign.probability < cutoff]
     for sign in to_remove:
         self.remove(sign)
     logger.debug("Beam removed %d signs (max %s, min %s)" % \
                     (len(to_remove), max_prob, cutoff))
     
     # Beam is now applied: check the remaining size
     if self.maxsize != 0 and len(self) > self.maxsize:
         # Too many signs: apply a hard cutoff
         logger.debug("Hard beam removed %d signs" % \
                         (len(self) - self.maxsize))
         # Most probable first, so that it's the least probable signs 
         #  that lie beyond the first maxsize and get removed
         ordered = sorted(self.values(), key=lambda s: s.probability, 
                          reverse=True)
         for sign in ordered[self.maxsize:]:
             self.remove(sign)
     
     # Don't apply the beam again until something changes
     self._beamed = True
Exemplo n.º 18
0
 def emission_log_probability(self, emission, state):
     """
     Gives the log probability of an emission given a state, 
     P(emission | state).
     
     An emission is normally a (root,label) pair defining a chord. The 
     state is a (point,function) pair, where the point has four 
     coordinates, (X,Y,x,y). Only x and y are used here.
     
     Alternatively, the emission may be a list (exactly a C{list}, not a 
     subclass), which is taken to be a I{distribution} over emissions. 
     It should contain (prob,em) pairs, I{em} being an emission of the 
     normal kind and I{prob} the weight given to it. The result is then 
     the weighted sum of the probabilities of the possible emissions.
     
     @param emission: (root,label) pair, or list of (prob,em) pairs
     @param state: (point,function) pair
     @return: log probability of the emission
     
     """
     if type(emission) is list:
         # Distribution over emissions: weight the probability of each 
         #  possibility and sum them
         weighted = [
             logprob(weight) + self.emission_log_probability(candidate, state)
             for (weight, candidate) in emission
         ]
         return sum_logs(weighted)
     
     # Single chord label
     point, function = state
     chord_root, label = emission
     _, _, x, y = point
     # The chord substitution is the chord's root relative to the pitch 
     #  class that coordinate_to_et_2d gives for (x,y)
     subst = (chord_root - coordinate_to_et_2d((x, y))) % 12
     
     # First the substitution is generated given the chord function...
     subst_logprob = self.subst_emission_dist[function].logprob(subst)
     # ...then the label given the substitution and the chord function
     label_dist = self.type_emission_dist[(subst, function)]
     label_logprob = label_dist.logprob(label)
     return subst_logprob + label_logprob
Exemplo n.º 19
0
 def __init__(self, *args, **kwargs):
     """
     Initializes the model by passing all arguments on to the superclass, 
     then stores the uniform transition probability over the labels in 
     C{label_dom}, both as a normal probability and as a log.
     
     """
     super(RaphstoHmmUnigram, self).__init__(*args, **kwargs)
     # Every label is equally likely, so these never change: work them 
     #  out once here
     num_labels = len(self.label_dom)
     self._uniform_transition = 1.0 / num_labels
     self._uniform_transition_log = -logprob(num_labels)
Exemplo n.º 20
0
 def train(self, emissions, max_iterations=None, \
                 convergence_logprob=None, logger=None, processes=1,
                 save=True, save_intermediate=False):
     """
     Performs unsupervised training using Baum-Welch EM.
     
     This is an instance method, because it is performed on a model 
     that has already been initialized. You might, for example, 
     create such a model using C{initialize_chord_types}.
     
     This is based on the training procedure in NLTK for HMMs:
     C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
     
     @type emissions: list of lists of emissions
     @param emissions: training data. Each element is a list of 
         emissions representing a sequence in the training data.
         Each emission is an emission like those used for 
         L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability}, 
         i.e. a list of note 
         observations
     @type max_iterations: int
     @param max_iterations: maximum number of iterations to allow 
         for EM (default 100). Overrides the corresponding 
         module option
     @type convergence_logprob: float
     @param convergence_logprob: maximum change in log probability 
         to consider convergence to have been reached (default 1e-3). 
         Overrides the corresponding module option
     @type logger: logging.Logger
     @param logger: a logger to send progress logging to
     @type processes: int
     @param processes: number processes to spawn. A pool of this 
         many processes will be used to compute distribution updates 
         for sequences in parallel during each iteration.
     @type save: bool
     @param save: save the model at the end of training
     @type save_intermediate: bool
     @param save_intermediate: save the model after each iteration. Implies 
         C{save}
     
     """
     from . import raphsto_d
     if logger is None:
         from jazzparser.utils.loggers import create_dummy_logger
         logger = create_dummy_logger()
     
     if save_intermediate:
         save = True
         
     # No point in creating more processes than there are sequences
     if processes > len(emissions):
         processes = len(emissions)
     
     self.model.add_history("Beginning Baum-Welch unigram training on %s" % get_host_info_string())
     self.model.add_history("Training on %d sequences (with %s chords)" % \
         (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
     
     # Use kwargs if given, otherwise module options
     if max_iterations is None:
         max_iterations = self.options['max_iterations']
     if convergence_logprob is None:
         convergence_logprob = self.options['convergence_logprob']
     
     # Enumerate the states
     state_ids = dict((state,num) for (num,state) in \
                                 enumerate(self.model.label_dom))
     
     # Enumerate the beat values (they're probably consecutive ints, but 
     #  let's not rely on it)
     beat_ids = dict((beat,num) for (num,beat) in \
                                 enumerate(self.model.beat_dom))
     num_beats = len(beat_ids)
     # Enumerate the d-values (d-function's domain)
     d_ids = dict((d,num) for (num,d) in \
                                 enumerate(self.model.emission_dist_dom))
     num_ds = len(d_ids)
     
     # Make a mutable distribution for the emission distribution we'll 
     #  be updating
     emission_mdist = DictionaryConditionalProbDist(
                 dict((s, MutableProbDist(self.model.emission_dist[s], 
                                          self.model.emission_dist_dom))
                     for s in self.model.emission_dist.conditions()))
     # Create dummy distributions to fill the places of the transition 
     #  distribution components
     key_mdist = DictionaryConditionalProbDist({})
     chord_mdist = DictionaryConditionalProbDist({})
     chord_uni_mdist = MutableProbDist({}, [])
     
     # Construct a model using these mutable distributions so we can 
     #  evaluate using them
     model = self.model_cls(key_mdist, 
                            chord_mdist,
                            emission_mdist, 
                            chord_uni_mdist,
                            chord_set=self.model.chord_set)
     
     iteration = 0
     last_logprob = None
     while iteration < max_iterations:
         logger.info("Beginning iteration %d" % iteration)
         current_logprob = 0.0
         
         # ems contains the new emission numerator probabilities
         # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
         #                  alpha(x_n).beta(x_n) / 
         #                    Sum_{x'_n} (alpha(x'_n).beta(x'_n))
         ems = zeros((num_beats,num_ds), float64)
         # And these are the denominators
         ems_denom = zeros(num_beats, float64)
         
         def _training_callback(result):
             """
             Fold the updates worked out for one sequence into the 
             accumulators shared by the whole iteration.
             
             C{result} is the tuple of (emission numerators, emission 
             denominators, sequence log probability). The log probability 
             is not used here: it gets added up separately.
             
             """
             # Unpack all three values, even though only two are needed, so 
             #  that a result of the wrong size still gets noticed
             (seq_ems, seq_ems_denom, _seq_logprob) = result
             
             # Add this sequence's values onto the global arrays, putting 
             #  the result back into the global array
             # Numerators first, then denominators
             for total, update in ((ems, seq_ems), (ems_denom, seq_ems_denom)):
                 array_add(total, update, total)
         ## End of _training_callback
         
         
         # Only use a process pool if there's more than one sequence
         if processes > 1:
             # Create a process pool to use for training
             logger.info("Creating a pool of %d processes" % processes)
             pool = Pool(processes=processes)
             
             async_results = []
             for seq_i,sequence in enumerate(emissions):
                 logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                 T = len(sequence)
                 if T == 0:
                     continue
                 
                 # Fire off a new call to the process pool for every sequence
                 async_results.append(
                         pool.apply_async(_sequence_updates_uni, 
                                             (sequence, model, 
                                                 self.model.label_dom, 
                                                 state_ids, 
                                                 beat_ids, d_ids, raphsto_d), 
                                             callback=_training_callback) )
             pool.close()
             # Wait for all the workers to complete
             pool.join()
             
             # Call get() on every AsyncResult so that any exceptions in 
             #  workers get raised
             for res in async_results:
                 # If there was an exception in _sequence_update, it 
                 #  will get raised here
                 res_tuple = res.get()
                 # Add this sequence's logprob into the total for all sequences
                 current_logprob += res_tuple[2]
         else:
             logger.info("One sequence: not using a process pool")
             sequence = emissions[0]
             
             if len(sequence) > 0:
                 updates = _sequence_updates_uni(
                                     sequence, model,
                                     self.model.label_dom,
                                     state_ids, 
                                     beat_ids, d_ids, raphsto_d)
                 _training_callback(updates)
                 # Update the overall logprob
                 current_logprob = updates[2]
         
         # Update the model's probabilities from the accumulated values
         for beat in self.model.beat_dom:
             denom = ems_denom[beat_ids[beat]]
             for d in self.model.emission_dist_dom:
                 if denom == 0.0:
                     # Zero denominator
                     prob = - logprob(len(d_ids))
                 else:
                     prob = logprob(ems[beat_ids[beat]][d_ids[d]] + ADD_SMALL) - logprob(denom + len(d_ids)*ADD_SMALL)
                 model.emission_dist[beat].update(d, prob)
         
         # Clear the model's cache so we get the new probabilities
         model.clear_cache()
         
         logger.info("Training data log prob: %s" % current_logprob)
         if last_logprob is not None and current_logprob < last_logprob:
             logger.error("Log probability dropped by %s" % \
                             (last_logprob - current_logprob))
         if last_logprob is not None:
             logger.info("Log prob change: %s" % \
                             (current_logprob - last_logprob))
         # Check whether the log probability has converged
         if iteration > 0 and \
                 abs(current_logprob - last_logprob) < convergence_logprob:
             # Don't iterate any more
             logger.info("Distribution has converged: ceasing training")
             break
         
         iteration += 1
         last_logprob = current_logprob
         
         # Update the main model
         # Only save if we've been asked to save between iterations
         self.update_model(model, save=save_intermediate)
     
     self.model.add_history("Completed Baum-Welch unigram training")
     # Update the distribution's parameters with those we've trained
     self.update_model(model, save=save)
     return
Exemplo n.º 21
0
 def normal_forward_probabilities(self, sequence, seq_prob=False, decomposed=False):
     """
     Specialized version of this to make it faster.
     
     Works with ordinary (non-log) probabilities. Every timestep is 
     normalized to sum to one as soon as it has been computed and the log 
     of the value it was divided by is kept; the sum of these logs is the 
     log probability of the whole sequence.
     
     @note: verified that this gets identical results to the superclass
     
     @param sequence: the observations, one per timestep.
     @param seq_prob: return the log probability of the whole sequence 
         as well as the array (tuple of (array,logprob)).
     @param decomposed: leave the states split up into their components, 
         so that the array has the dimensions (time, key, root, label), 
         instead of flattening them into a single dimension.
     @return: 2D Numpy array (4D if C{decomposed} is given).
         The first dimension represents timesteps, the second the states.
         For an empty sequence the array has no timesteps and the log 
         probability is 0.
     
     """
     N = len(sequence)
     chords = self.chord_types
     C = len(chords)
     
     # Initialize an empty matrix
     # The dims of the matrix are (time, key, root, label)
     forward_matrix = numpy.zeros((N,12,12,C), numpy.float64)
     # Create an array for the log of each timestep's normalizer
     coefficients = numpy.zeros((N,), numpy.float64)
     
     # With no timesteps there is nothing to fill in: the empty arrays 
     #  are already the answer
     if N > 0:
         # Prepare the transition and emission matrices
         ems = self.get_small_emission_matrix(sequence)
         trans = self.get_small_transition_matrix()
         
         # First fill in the first columns with transitions from None
         for root in range(12):
             for c,chord in enumerate(chords):
                 for key in range(12):
                     # Fill in with the (None-padded) transition probability
                     forward_matrix[0,key,root,c] = self.transition_probability(
                                                     (key,root,chord), None)
         # Multiply in the emission probabilities
         # This broadcasts over keys, since emissions don't care about key
         forward_matrix[0] = forward_matrix[0] * ems[0]
         # Normalize, working out the total only once
         total = numpy.sum(forward_matrix[0])
         coefficients[0] = logprob(total)
         forward_matrix[0] /= total
         
         for time in range(1, N):
             # Multiply in the transition matrix to get the new state probabilities
             trans_step = forward_matrix[time-1] * trans
             # DIMS: key, root, label, key[-1], root[-1], label[-1]
             # Sum over previous states: one sum for each of the three 
             #  trailing dimensions, which are done one at a time so the 
             #  order of the additions stays as it always was
             for _ in range(3):
                 trans_step = numpy.sum(trans_step, axis=-1)
             # Multiply in the emission probabilities
             # This broadcasts over keys, since emissions don't care about key
             forward_matrix[time] = trans_step * ems[time]
             # Normalize the timestep
             total = numpy.sum(forward_matrix[time])
             coefficients[time] = logprob(total)
             forward_matrix[time] /= total
     
     if not decomposed:
         # Reshape the array so it has only two dimensions
         # The dimensions are ordered in the same way as the components of the 
         #  labels, so we just reshape
         forward_matrix = forward_matrix.reshape(N, 12*12*C)
     
     if seq_prob:
         return forward_matrix, numpy.sum(coefficients)
     else:
         return forward_matrix
Exemplo n.º 22
0
 def __init__(self, *args, **kwargs):
     """
     Set the model up just as the superclass does, then store the 
     probability that is shared by every transition.
     
     """
     super(RaphstoHmmUnigram, self).__init__(*args, **kwargs)
     # The uniform transition probability, and its log, only depend on 
     #  the number of labels, so work them out once here
     num_labels = len(self.label_dom)
     self._uniform_transition = 1.0 / num_labels
     self._uniform_transition_log = - logprob(num_labels)
    def update_model(self, arrays, array_ids):
        """
        Replaces the distributions of the saved model with the probabilities 
        taken from the arrays of updates. self.model is expected to be 
        made up of mutable distributions when this is called.

        Every value stored is the log of an additively smoothed ratio::

            (numerator + ADD_SMALL) / (denominator + ADD_SMALL * outcomes)

        where C{outcomes} is the number of values that the distribution 
        being updated ranges over.

        @param arrays: tuple of the ten accumulated arrays: the numerators 
            for the initial key, initial chord, key transition, chord 
            transition and emission distributions, followed by their 
            denominators in the same order.
        @param array_ids: pair (chord_ids, chord_type_ids) of mappings that 
            give the array index of each chord (and of None, the extra 
            outcome of the chord transitions) and of each chord type.

        """
        (
            initial_keys,
            initial_chords,
            key_trans,
            chord_trans,
            ems,
            initial_keys_denom,
            initial_chords_denom,
            key_trans_denom,
            chord_trans_denom,
            ems_denom,
        ) = arrays
        chord_ids, chord_type_ids = array_ids

        def smoothed(numerator, denominator, outcomes):
            # Log of the smoothed ratio described in the docstring
            return logprob(numerator + ADD_SMALL) - logprob(denominator + ADD_SMALL * outcomes)

        num_keys = 12
        num_pitches = 12
        chord_dom = self.model.chord_dom
        num_chords = len(chord_dom)
        # A chord can also be followed by None, so the chord transitions have
        # one more outcome than there are chords
        chord_targets = chord_dom + [None]
        num_chord_targets = len(chord_targets)

        # Initial keys distribution
        # Only update this distribution if asked to: often we should leave it
        if self.options["initkey"]:
            for key in range(num_keys):
                prob = smoothed(initial_keys[key], initial_keys_denom[0], num_keys)
                self.model.initial_key_dist.update(key, prob)

        # Initial chords distribution
        for chord in chord_dom:
            prob = smoothed(initial_chords[chord_ids[chord]], initial_chords_denom[0], num_chords)
            self.model.initial_chord_dist.update(chord, prob)

        # Key transition distribution
        for key in range(num_keys):
            prob = smoothed(key_trans[key], key_trans_denom[0], num_keys)
            self.model.key_transition_dist.update(key, prob)

        # Chord transition distribution
        # The smoothing mass is added once for every target, None included.
        # NOTE(review): this assumes chord_trans_denom[i] is the total of row
        # chord_trans[i], as the smoothing of the other distributions implies
        for chord0 in chord_dom:
            chordi = chord_ids[chord0]

            for chord1 in chord_targets:
                chordj = chord_ids[chord1]

                prob = smoothed(chord_trans[chordi][chordj], chord_trans_denom[chordi], num_chord_targets)
                self.model.chord_transition_dist[chord0].update(chord1, prob)

        # Emission distribution
        # Each chord type has a distribution over the 12 pitches, so that is
        # the number of outcomes to smooth over, not the number of chord types.
        # NOTE(review): assumes ems_denom[i] is the total of row ems[i]
        for label in self.model.chord_vocab:
            labeli = chord_type_ids[label]

            for pitch in range(num_pitches):
                prob = smoothed(ems[labeli][pitch], ems_denom[labeli], num_pitches)
                self.model.emission_dist[label].update(pitch, prob)
Exemplo n.º 24
0
 def train_transition_distribution(self, inputs, grammar, contprob=0.3):
     """
     Train the transition distribution parameters in a supervised manner, 
     using chord corpus input.
     
     This is used as an initialization step to set transition parameters 
     before running EM on unannotated data.
     
     The transitions are stored as two distributions: the next schema 
     given the current one (with None for the end of the sequence) and the 
     change of root given both of the schemata. The initial state 
     distribution is estimated from the first label of each sequence.
     
     @type inputs: L{jazzparser.data.input.AnnotatedDbBulkInput}
     @param inputs: annotated chord training data
     @param grammar: grammar whose C{equiv_map} says which categories 
         should be replaced by another category, with the root shifted, 
         before anything is counted
     @type contprob: float or string
     @param contprob: probability mass to reserve for staying on the 
         same state (self transitions). Use special value 'learn' to 
         learn the probabilities from the durations
     
     """
     self.add_history(
             "Training transition probabilities using %d annotated chord "\
             "sequences" % len(inputs))
     learn_cont = contprob == "learn"
     # Everything a schema can be followed by: None is the final state
     schema_samples = self.schemata+[None]
     
     # Prepare the label sequences that we'll train on
     if learn_cont:
         # Repeat values with a duration > 1: each chord goes in once for 
         #  every unit of its duration
         sequences = [
             [(chord,cat) for chord,cat in zip(seq, seq.categories) \
                             for _ in range(chord.duration)] \
                 for seq in inputs]
     else:
         sequences = [list(zip(sequence, sequence.categories)) for \
                                 sequence in inputs]
     
     # Prepare a list of transformations to apply to the categories
     label_transform = {}
     # First include all the categories we want to keep as they were
     for schema in self.schemata:
         label_transform[schema] = (schema, 0)
     # Then include any transformations the grammar defines
     for pos,mapping in grammar.equiv_map.items():
         label_transform[pos] = (mapping.target.pos, mapping.root)
     
     # Apply the transformation to all the training data
     training_samples = []
     for chord_cats in sequences:
         seq_samples = []
         for chord,cat in chord_cats:
             # Transform the label if it has a transformation
             if cat in label_transform:
                 use_cat, alter_root = label_transform[cat]
             else:
                 use_cat, alter_root = cat, 0
             root = (chord.root + alter_root) % 12
             seq_samples.append((str(use_cat), root))
         training_samples.append(seq_samples)
     
     # Turn every pair of samples given by group_pairs into a transition: 
     #  the two labels and the change of root between them
     # Built as a single flat list, rather than by adding lists together
     training_data = [
         (cat0, cat1, (root1 - root0) % 12)
             for seq_samples in training_samples
                 for ((cat0,root0),(cat1,root1)) in group_pairs(seq_samples)]
     
     # Count up the observations
     schema_transition_counts = ConditionalFreqDist()
     root_transition_counts = ConditionalFreqDist()
     for (label0, label1, root_change) in training_data:
         # Only use counts for categories the model's looking for
         if label0 in self.schemata and label1 in self.schemata:
             schema_transition_counts[label0].inc(label1)
             root_transition_counts[(label0,label1)].inc(root_change)
     
     # Transition probability to final state (end of sequence)
     # NOTE(review): unlike the counts above, this also counts labels that 
     #  aren't in self.schemata - confirm that's intended
     for sequence in training_samples:
         # An empty sequence doesn't end on any label
         if sequence:
             # Inc the count of going from the label the sequence ends on 
             #  to the final state
             schema_transition_counts[sequence[-1][0]].inc(None)
         
     # Use Laplace (plus one) smoothing
     # We don't use the laplace_estimator because we want the conversion 
     #  to a dict prob dist to get all the labels, not just to discount 
     #  the ones it's seen
     for label0 in self.schemata:
         for label1 in self.schemata:
             for root_change in range(12):
                 # Exclude self-transition for now, unless we're learning it
                 if learn_cont or not (label0 == label1 and root_change == 0):
                     schema_transition_counts[label0].inc(label1)
                     root_transition_counts[(label0,label1)].inc(root_change)
             # We don't add a count for going to the final state: we don't 
             #  want to initialize it with too much weight
     
     # Estimate distribution from this frequency distribution
     schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
             ConditionalProbDist(schema_transition_counts, mle_estimator, None), \
                 mutable=True, samples=schema_samples)
     root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
             ConditionalProbDist(root_transition_counts, mle_estimator, None), \
                 mutable=True, samples=range(12))
     
     if not learn_cont:
         # Give the self-transition (same schema, no change of root) exactly 
         #  contprob and scale down all other transitions to allow for it
         # The transition probability is split over two distributions, so 
         #  the mass has to be put into both: the schema distribution and 
         #  the root distribution for staying on the same schema
         # NOTE(review): relies on the mutable distributions having prob() 
         #  as well as logprob(), like NLTK's prob dists - confirm
         for label0 in self.schemata:
             schema_dist = schema_trans_dist[label0]
             same_root_dist = root_trans_dist[(label0, label0)]
             # Read all the current values before overwriting any of them
             schema_probs = dict(
                 (label1, schema_dist.prob(label1)) \
                     for label1 in schema_samples)
             root_probs = [same_root_dist.prob(change) \
                             for change in range(12)]
             
             # What the self-transition has got from the observed counts
             old_self = schema_probs[label0] * root_probs[0]
             # All the other transitions share what contprob leaves over
             # If no self-transitions were seen this is just (1 - contprob)
             # The smoothing counts added above stop the divisors here 
             #  from being zero
             scale = (1.0 - contprob) / (1.0 - old_self)
             # Mass for staying on the schema, but with a change of root
             same_moved = scale * schema_probs[label0] * \
                             sum(root_probs[1:], 0.0)
             # Total mass for staying on the schema
             same_schema = contprob + same_moved
             
             for label1 in schema_samples:
                 if label1 == label0:
                     new_prob = same_schema
                 else:
                     # Discount non self transitions (final state included)
                     new_prob = scale * schema_probs[label1]
                 schema_dist.update(label1, logprob(new_prob))
             
             # Share the same-schema mass between the root changes, so that 
             #  multiplying the two distributions gives the self-transition 
             #  contprob
             for change in range(12):
                 if change == 0:
                     new_prob = contprob / same_schema
                 else:
                     new_prob = scale * schema_probs[label0] * \
                                 root_probs[change] / same_schema
                 same_root_dist.update(change, logprob(new_prob))
     
     # Recreate the dict prob dist so it's not mutable any more
     schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(schema_trans_dist)
     root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(root_trans_dist)
     
     ## Now for the initial distribution
     # Count up the observations
     initial_counts = FreqDist()
     for sequence in training_samples:
         # An empty sequence doesn't start on any label
         if sequence:
             initial_counts.inc(sequence[0][0])
     
     # Estimate distribution from this frequency distribution
     # No smoothing is applied to the initial counts
     initial_dist = prob_dist_to_dictionary_prob_dist(\
                 mle_estimator(initial_counts, None), samples=self.schemata)
     
     # Replace the model's transition distributions
     self.schema_transition_dist = schema_trans_dist
     self.root_transition_dist = root_trans_dist
     self.initial_state_dist = initial_dist
     # Invalidate the cache
     self.clear_cache()
Exemplo n.º 25
0
 def normal_forward_probabilities(self, sequence, array=False):
     """
     Compute the normalized matrix of forward probabilities using normal 
     (non-log) probabilities. This is quicker than working with logs, 
     since summing logs is time consuming.
     
     Each timestep is divided by its total probability over all states 
     as soon as it's been filled in. These totals are returned too, along 
     with the log probability of the whole sequence, which is the sum of 
     their logs.
     
     @param sequence: the observations, one per timestep
     @type array: bool
     @param array: if True, returns a numpy 2d array instead of a list of 
         dicts.
     @return: (matrix,normalizing vector,log prob)
     
     """
     labels = self.label_dom
     num_steps = len(sequence)
     forward = numpy.zeros((num_steps, len(labels)), numpy.float64)
     normalizers = numpy.zeros(num_steps, numpy.float64)
     
     def _normalize(step):
         # Divide the whole timestep by its total probability and 
         #  remember what it was divided by
         step_total = array_sum(forward[step,:])
         forward[step,:] /= step_total
         normalizers[step] = step_total
     
     # The first timestep has no previous state: use the transition 
     #  from None
     for j,state in enumerate(labels):
         forward[0,j] = self.transition_probability(state, None) * \
                         self.emission_probability(sequence[0], state)
     _normalize(0)
     
     # Every later timestep is built from the one before it
     for step in range(1, num_steps):
         for j,state in enumerate(labels):
             # Total probability of getting to this state from any of 
             #  the previous states
             reached = sum(
                 (forward[step-1, i] * \
                     self.transition_probability(state, previous) \
                         for i,previous in enumerate(labels)), 0.0)
             # Then the state has to emit the observation as well
             forward[step, j] = reached * \
                             self.emission_probability(sequence[step], state)
         _normalize(step)
     
     # The probability of the sequence is the product of the totals of 
     #  all the timesteps, so its log is the sum of their logs
     # This gets the same result as if we did:
     #  alpha = model.forward_log_probabilities(sequence, normalize=False, array=True)
     #  log_prob = sum_logs(alpha[T-1,:])
     log_prob = sum((logprob(value) for value in normalizers), 0.0)
     
     if array:
         return forward,normalizers,log_prob
     
     # Convert the array into a list of dicts, one per timestep, keyed 
     #  by label
     matrix = [
         dict((label, forward[step,i]) for (i,label) in enumerate(labels)) \
             for step in range(num_steps)]
     return matrix,normalizers,log_prob