예제 #1
0
파일: multi.py 프로젝트: ANB2/nltk-trainer
	def prob_classify(self, feat):
		"""
		Build a mutable probability distribution for ``feat``.

		The distribution is seeded from the root classifier's result over
		``self.labels()``; every sample probability reported by each label
		classifier is then written into it with ``update`` (as plain
		probabilities, not logs).
		"""
		root_dist = self.root.prob_classify(feat)
		# passing in self.labels() ensures it doesn't have any of label_classifiers.keys()
		combined = MutableProbDist(root_dist, self.labels(), store_logs=False)

		for label_classifier in self.label_classifiers.values():
			label_dist = label_classifier.prob_classify(feat)
			for label in label_dist.samples():
				combined.update(label, label_dist.prob(label), log=False)

		return combined
예제 #2
0
    def prob_classify(self, feat):
        """
        Return a mutable probability distribution for ``feat``.

        Starts from the root classifier's distribution restricted to
        ``self.labels()`` and then stores, via ``update``, the probability
        that each label classifier assigns to each of its own samples.
        """
        # passing in self.labels() ensures it doesn't have any of label_classifiers.keys()
        result = MutableProbDist(
            self.root.prob_classify(feat), self.labels(), store_logs=False)

        for sub_classifier in self.label_classifiers.values():
            sub_dist = sub_classifier.prob_classify(feat)
            for outcome in sub_dist.samples():
                outcome_prob = sub_dist.prob(outcome)
                result.update(outcome, outcome_prob, log=False)

        return result
예제 #3
0
def prob_dist_to_dictionary_prob_dist(dist, mutable=False, samples=None):
    """
    Takes a probability distribution estimated in any way (e.g. from
    a freq dist) and produces a corresponding dictionary prob dist
    that just stores the probability of every sample.

    Can be used to turn any kind of prob dist into a dictionary-based
    one, including a MutableProbDist.

    @type dist: probability distribution providing C{samples()} and C{prob()}
    @param dist: the distribution to copy probabilities from
    @type mutable: bool
    @param mutable: if True, the returned dist is a mutable prob dist
    @type samples: iterable
    @param samples: the samples to include in the result. Defaults to
        C{dist.samples()}. May be given explicitly, for example, to
        include samples not represented in the original dist.
    @return: a C{DictionaryProbDist} (normalized), or a C{MutableProbDist}
        built from it if C{mutable} is True

    """
    # We may want to give a different set of samples, for example, if there
    #  are samples not represented in the original dist
    if samples is None:
        samples = dist.samples()
    # Materialize the samples once. They are iterated here and then handed
    #  on to MutableProbDist, so a one-shot iterable would arrive there
    #  exhausted, and a non-indexable view (e.g. dict keys on Python 3)
    #  cannot be indexed by consumers that need positional access, as
    #  NLTK's MutableProbDist does.
    samples = list(samples)

    probs = {}
    for sample in samples:
        probs[sample] = dist.prob(sample)
    # We'd expect these to sum to one, but normalize just in case
    dictpd = DictionaryProbDist(probs, normalize=True)

    if mutable:
        # Convert to a mutable distribution
        dictpd = MutableProbDist(dictpd, samples)
    return dictpd
예제 #4
0
    def train_unsupervised(self,
                           unlabeled_sequences,
                           update_outputs=True,
                           **kwargs):
        """
        Fit an HMM to unlabelled observation sequences with the Baum-Welch
        algorithm, an EM variant that needs no state sequences for the
        symbols. The code is based on 'A Tutorial on Hidden Markov Models
        and Selected Applications in Speech Recognition', Lawrence Rabiner,
        IEEE, 1989.

        Progress (iteration number and total log probability) is printed
        after every iteration.

        :return: the trained model
        :rtype: HiddenMarkovModelTagger
        :param unlabeled_sequences: the training data, a set of
            sequences of observations
        :type unlabeled_sequences: list
        :param update_outputs: when false, the model's output distributions
            are neither wrapped as mutable nor re-estimated
        :type update_outputs: bool

        kwargs may include following parameters:

        :param model: a HiddenMarkovModelTagger instance used to begin
            the Baum-Welch algorithm
        :param max_iterations: the maximum number of EM iterations
        :param convergence_logprob: the maximum change in log probability to
            allow convergence
        """

        # Begin from the supplied model if there is one; otherwise build a
        # starting model out of RandomProbDist instances over this trainer's
        # states and symbols.
        model = kwargs.get('model')
        if not model:
            priors = RandomProbDist(self._states)
            transitions = DictionaryConditionalProbDist(
                {state: RandomProbDist(self._states)
                 for state in self._states})
            outputs = DictionaryConditionalProbDist(
                {state: RandomProbDist(self._symbols)
                 for state in self._states})
            model = HiddenMarkovModelTagger(self._symbols, self._states,
                                            transitions, outputs, priors)

        # From here on the trainer works with the model's own inventories.
        self._states = model._states
        self._symbols = model._symbols

        num_states = len(self._states)
        num_symbols = len(self._symbols)
        symbol_index = {sym: idx for idx, sym in enumerate(self._symbols)}

        # Wrap the distributions that get re-estimated so they can be
        # updated in place (the priors are left untouched).
        # model._priors = MutableProbDist(model._priors, self._states)

        model._transitions = DictionaryConditionalProbDist(
            {s: MutableProbDist(model._transitions[s], self._states)
             for s in self._states})

        if update_outputs:
            model._outputs = DictionaryConditionalProbDist(
                {s: MutableProbDist(model._outputs[s], self._symbols)
                 for s in self._states})

        model.reset_cache()

        max_iterations = kwargs.get('max_iterations', 1000)
        epsilon = kwargs.get('convergence_logprob', 1e-6)

        # EM loop: stops on convergence or after max_iterations passes
        last_logprob = None
        iteration = 0
        while iteration < max_iterations:
            # accumulators for this pass, created by _ninf_array
            A_numer = _ninf_array((num_states, num_states))
            B_numer = _ninf_array((num_states, num_symbols))
            A_denom = _ninf_array(num_states)
            B_denom = _ninf_array(num_states)

            logprob = 0
            for sequence in unlabeled_sequences:
                sequence = list(sequence)
                if not sequence:
                    # nothing to learn from an empty sequence
                    continue

                step = self._baum_welch_step(sequence, model, symbol_index)
                (lpk, seq_A_numer, seq_A_denom,
                 seq_B_numer, seq_B_denom) = step

                # fold this sequence's sums into the global A and B values
                for i in range(num_states):
                    A_numer[i] = np.logaddexp2(A_numer[i],
                                               seq_A_numer[i] - lpk)
                    B_numer[i] = np.logaddexp2(B_numer[i],
                                               seq_B_numer[i] - lpk)

                A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk)
                B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk)

                logprob += lpk

            # re-estimate the transition and output probabilities from the
            # accumulated values
            for i, si in enumerate(self._states):
                logprob_Ai = A_numer[i] - A_denom[i]
                logprob_Bi = B_numer[i] - B_denom[i]

                # We should normalize all probabilities (see p.391 Huang et al)
                # Let sum(P) be K.
                # We can divide each Pi by K to make sum(P) == 1.
                #   Pi' = Pi/K
                #   log2(Pi') = log2(Pi) - log2(K)
                logprob_Ai -= logsumexp2(logprob_Ai)
                logprob_Bi -= logsumexp2(logprob_Bi)

                # write the new transition probabilities out of state si
                for j, sj in enumerate(self._states):
                    model._transitions[si].update(sj, logprob_Ai[j])

                # and, if requested, the new output probabilities
                if update_outputs:
                    for k, ok in enumerate(self._symbols):
                        model._outputs[si].update(ok, logprob_Bi[k])

                # Rabiner says the priors don't need to be updated. I don't
                # believe him. FIXME

            # convergence is judged against the previous pass, so it can
            # only be detected from the second iteration onwards
            has_converged = (iteration > 0
                             and abs(logprob - last_logprob) < epsilon)

            print('iteration', iteration, 'logprob', logprob)
            iteration += 1
            last_logprob = logprob

            if has_converged:
                break

        return model
예제 #5
0
    def train_unsupervised(self, unlabeled_sequences, **kwargs):
        """
        Trains the HMM using the Baum-Welch algorithm to maximise the
        probability of the data sequence. This is a variant of the EM
        algorithm, and is unsupervised in that it doesn't need the state
        sequences for the symbols. The code is based on 'A Tutorial on Hidden
        Markov Models and Selected Applications in Speech Recognition',
        Lawrence Rabiner, IEEE, 1989.

        The iteration number and total log probability are printed after
        every iteration.

        :return: the trained model
        :rtype: HiddenMarkovModelTagger
        :param unlabeled_sequences: the training data, a set of
            sequences of observations
        :type unlabeled_sequences: list

        kwargs may include following parameters:

        :param model: a HiddenMarkovModelTagger instance used to begin
            the Baum-Welch algorithm
        :param max_iterations: the maximum number of EM iterations
        :param convergence_logprob: the maximum change in log probability to
            allow convergence
        """

        N = len(self._states)
        M = len(self._symbols)
        # maps each symbol to its column index in the B (output) arrays
        symbol_dict = dict((self._symbols[i], i) for i in range(M))

        # create a uniform HMM, which will be iteratively refined, unless
        # given an existing model
        model = kwargs.get('model')
        if not model:
            priors = UniformProbDist(self._states)
            transitions = DictionaryConditionalProbDist(
                            dict((state, UniformProbDist(self._states))
                                  for state in self._states))
            output = DictionaryConditionalProbDist(
                            dict((state, UniformProbDist(self._symbols))
                                  for state in self._states))
            model = HiddenMarkovModelTagger(self._symbols, self._states,
                            transitions, output, priors)

        # update model prob dists so that they can be modified
        model._priors = MutableProbDist(model._priors, self._states)
        model._transitions = DictionaryConditionalProbDist(
            dict((s, MutableProbDist(model._transitions[s], self._states))
                 for s in self._states))
        model._outputs = DictionaryConditionalProbDist(
            dict((s, MutableProbDist(model._outputs[s], self._symbols))
                 for s in self._states))

        # iterate until convergence
        converged = False
        last_logprob = None
        iteration = 0
        max_iterations = kwargs.get('max_iterations', 1000)
        epsilon = kwargs.get('convergence_logprob', 1e-6)
        while not converged and iteration < max_iterations:
            # global accumulators for this iteration, every entry set to _NINF
            A_numer = ones((N, N), float64) * _NINF
            B_numer = ones((N, M), float64) * _NINF
            A_denom = ones(N, float64) * _NINF
            B_denom = ones(N, float64) * _NINF

            logprob = 0
            for sequence in unlabeled_sequences:
                sequence = list(sequence)
                if not sequence:
                    continue

                # compute forward and backward probabilities
                alpha = model._forward_probability(sequence)
                beta = model._backward_probability(sequence)

                # find the log probability of the sequence
                T = len(sequence)
                lpk = _log_add(*alpha[T-1, :])
                logprob += lpk

                # now update A and B (transition and output probabilities)
                # using the alpha and beta values. Please refer to Rabiner's
                # paper for details, it's too hard to explain in comments
                local_A_numer = ones((N, N), float64) * _NINF
                local_B_numer = ones((N, M), float64) * _NINF
                local_A_denom = ones(N, float64) * _NINF
                local_B_denom = ones(N, float64) * _NINF

                # for each position, accumulate sums for A and B
                for t in range(T):
                    x = sequence[t][_TEXT] #not found? FIXME
                    if t < T - 1:
                        xnext = sequence[t+1][_TEXT] #not found? FIXME
                    xi = symbol_dict[x]
                    for i in range(N):
                        si = self._states[i]
                        if t < T - 1:
                            for j in range(N):
                                sj = self._states[j]
                                local_A_numer[i, j] =  \
                                    _log_add(local_A_numer[i, j],
                                        alpha[t, i] +
                                        model._transitions[si].logprob(sj) +
                                        model._outputs[sj].logprob(xnext) +
                                        beta[t+1, j])
                            local_A_denom[i] = _log_add(local_A_denom[i],
                                alpha[t, i] + beta[t, i])
                        else:
                            # Last position: local_A_denom[i] already holds
                            # the sum over every earlier position, so adding
                            # this final term yields the sum over all
                            # positions, which is the B denominator.
                            local_B_denom[i] = _log_add(local_A_denom[i],
                                alpha[t, i] + beta[t, i])

                        local_B_numer[i, xi] = _log_add(local_B_numer[i, xi],
                            alpha[t, i] + beta[t, i])

                # add these sums to the global A and B values
                for i in range(N):
                    for j in range(N):
                        A_numer[i, j] = _log_add(A_numer[i, j],
                                                local_A_numer[i, j] - lpk)
                    for k in range(M):
                        B_numer[i, k] = _log_add(B_numer[i, k],
                                                local_B_numer[i, k] - lpk)

                    A_denom[i] = _log_add(A_denom[i], local_A_denom[i] - lpk)
                    B_denom[i] = _log_add(B_denom[i], local_B_denom[i] - lpk)

            # use the calculated values to update the transition and output
            # probability values
            for i in range(N):
                si = self._states[i]
                for j in range(N):
                    sj = self._states[j]
                    model._transitions[si].update(sj, A_numer[i,j] -
                                                  A_denom[i])
                for k in range(M):
                    ok = self._symbols[k]
                    model._outputs[si].update(ok, B_numer[i,k] - B_denom[i])
                # Rabiner says the priors don't need to be updated. I don't
                # believe him. FIXME

            # test for convergence
            if iteration > 0 and abs(logprob - last_logprob) < epsilon:
                converged = True

            # A single pre-formatted string in parentheses is valid both as
            # a Python 2 print statement and as a Python 3 print() call, and
            # produces the same space-separated output as before.
            print('iteration %s logprob %s' % (iteration, logprob))
            iteration += 1
            last_logprob = logprob

        return model
예제 #6
0
    def train(self, emissions, max_iterations=None,
                    convergence_logprob=None, logger=None, processes=1,
                    save=True, save_intermediate=False):
        """
        Performs unsupervised training using Baum-Welch EM.
        
        This is an instance method, because it is performed on a model 
        that has already been initialized. You might, for example, 
        create such a model using C{initialize_chord_types}.
        
        This is based on the training procedure in NLTK for HMMs:
        C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
        
        Every non-empty sequence in C{emissions} contributes to each 
        iteration's update, whether the sequences are processed by a 
        process pool (C{processes > 1}) or one after another in the 
        current process.
        
        @type emissions: list of lists of emissions
        @param emissions: training data. Each element is a list of 
            emissions representing a sequence in the training data.
            Each emission is an emission like those used for 
            L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability}, 
            i.e. a list of note 
            observations
        @type max_iterations: int
        @param max_iterations: maximum number of iterations to allow 
            for EM (default 100). Overrides the corresponding 
            module option
        @type convergence_logprob: float
        @param convergence_logprob: maximum change in log probability 
            to consider convergence to have been reached (default 1e-3). 
            Overrides the corresponding module option
        @type logger: logging.Logger
        @param logger: a logger to send progress logging to
        @type processes: int
        @param processes: number processes to spawn. A pool of this 
            many processes will be used to compute distribution updates 
            for sequences in parallel during each iteration. It is capped 
            at the number of sequences; with one process (or fewer) no 
            pool is created and the sequences are processed sequentially.
        @type save: bool
        @param save: save the model at the end of training
        @type save_intermediate: bool
        @param save_intermediate: save the model after each iteration. Implies 
            C{save}
        
        """
        from . import raphsto_d
        if logger is None:
            from jazzparser.utils.loggers import create_dummy_logger
            logger = create_dummy_logger()

        if save_intermediate:
            save = True

        # No point in creating more processes than there are sequences
        if processes > len(emissions):
            processes = len(emissions)

        self.model.add_history("Beginning Baum-Welch unigram training on %s" %
                               get_host_info_string())
        self.model.add_history("Training on %d sequences (with %s chords)" % \
            (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))

        # Use kwargs if given, otherwise module options
        if max_iterations is None:
            max_iterations = self.options['max_iterations']
        if convergence_logprob is None:
            convergence_logprob = self.options['convergence_logprob']

        # Enumerate the states
        state_ids = dict((state,num) for (num,state) in \
                                    enumerate(self.model.label_dom))

        # Enumerate the beat values (they're probably consecutive ints, but
        #  let's not rely on it)
        beat_ids = dict((beat,num) for (num,beat) in \
                                    enumerate(self.model.beat_dom))
        num_beats = len(beat_ids)
        # Enumerate the d-values (d-function's domain)
        d_ids = dict((d,num) for (num,d) in \
                                    enumerate(self.model.emission_dist_dom))
        num_ds = len(d_ids)

        # Make a mutable distribution for the emission distribution we'll
        #  be updating
        emission_mdist = DictionaryConditionalProbDist(
            dict((s,
                  MutableProbDist(self.model.emission_dist[s],
                                  self.model.emission_dist_dom))
                 for s in self.model.emission_dist.conditions()))
        # Create dummy distributions to fill the places of the transition
        #  distribution components
        key_mdist = DictionaryConditionalProbDist({})
        chord_mdist = DictionaryConditionalProbDist({})
        chord_uni_mdist = MutableProbDist({}, [])

        # Construct a model using these mutable distributions so we can
        #  evaluate using them
        model = self.model_cls(key_mdist,
                               chord_mdist,
                               emission_mdist,
                               chord_uni_mdist,
                               chord_set=self.model.chord_set)

        iteration = 0
        last_logprob = None
        while iteration < max_iterations:
            logger.info("Beginning iteration %d" % iteration)
            current_logprob = 0.0

            # ems contains the new emission numerator probabilities
            # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
            #                  alpha(x_n).beta(x_n) /
            #                    Sum_{x'_n} (alpha(x'_n).beta(x'_n))
            ems = zeros((num_beats, num_ds), float64)
            # And these are the denominators
            ems_denom = zeros(num_beats, float64)

            def _training_callback(result):
                """
                Callback for the _sequence_updates processes that takes 
                the updates from a single sequence and adds them onto 
                the global update accumulators.
                
                The sequence's log probability (third element of the 
                result) is not accumulated here; the caller adds it to 
                the iteration total.
                
                """
                # _sequence_updates() returns all of this as a tuple
                (ems_local, ems_denom_local, seq_logprob) = result

                # Add these probabilities from this sequence to the
                #  global matrices
                # Emission numerator
                array_add(ems, ems_local, ems)
                # Denominators
                array_add(ems_denom, ems_denom_local, ems_denom)

            ## End of _training_callback

            # Only use a process pool if more than one process was requested
            #  (which, given the cap above, implies more than one sequence)
            if processes > 1:
                # Create a process pool to use for training
                logger.info("Creating a pool of %d processes" % processes)
                pool = Pool(processes=processes)

                async_results = []
                for seq_i, sequence in enumerate(emissions):
                    logger.info("Iteration %d, sequence %d" %
                                (iteration, seq_i))
                    if len(sequence) == 0:
                        continue

                    # Fire off a new call to the process pool for every sequence
                    async_results.append(
                        pool.apply_async(
                            _sequence_updates_uni,
                            (sequence, model, self.model.label_dom, state_ids,
                             beat_ids, d_ids, raphsto_d),
                            callback=_training_callback))
                pool.close()
                # Wait for all the workers to complete
                pool.join()

                # Call get() on every AsyncResult so that any exceptions in
                #  workers get raised
                for res in async_results:
                    # If there was an exception in _sequence_update, it
                    #  will get raised here
                    res_tuple = res.get()
                    # Add this sequence's logprob into the total for all sequences
                    current_logprob += res_tuple[2]
            else:
                # No pool: process every sequence in turn in this process.
                #  (Previously only emissions[0] was used here, so with the
                #  default of one process all other sequences were ignored
                #  and an empty training set raised an IndexError.)
                logger.info("Single process: not using a process pool")
                for sequence in emissions:
                    if len(sequence) == 0:
                        continue

                    updates = _sequence_updates_uni(sequence, model,
                                                    self.model.label_dom,
                                                    state_ids, beat_ids, d_ids,
                                                    raphsto_d)
                    _training_callback(updates)
                    # Add this sequence's logprob into the overall logprob
                    current_logprob += updates[2]

            # Update the model's probabilities from the accumulated values
            for beat in self.model.beat_dom:
                denom = ems_denom[beat_ids[beat]]
                for d in self.model.emission_dist_dom:
                    if denom == 0.0:
                        # Zero denominator: fall back to the same value
                        #  for every d, based on the size of the domain
                        prob = -logprob(len(d_ids))
                    else:
                        # ADD_SMALL is added to each numerator and, once per
                        #  d-value, to the denominator
                        prob = logprob(ems[beat_ids[beat]][d_ids[d]] +
                                       ADD_SMALL) - logprob(
                                           denom + len(d_ids) * ADD_SMALL)
                    model.emission_dist[beat].update(d, prob)

            # Clear the model's cache so we get the new probabilities
            model.clear_cache()

            logger.info("Training data log prob: %s" % current_logprob)
            if last_logprob is not None and current_logprob < last_logprob:
                logger.error("Log probability dropped by %s" % \
                                (last_logprob - current_logprob))
            if last_logprob is not None:
                logger.info("Log prob change: %s" % \
                                (current_logprob - last_logprob))
            # Check whether the log probability has converged
            if iteration > 0 and \
                    abs(current_logprob - last_logprob) < convergence_logprob:
                # Don't iterate any more
                logger.info("Distribution has converged: ceasing training")
                break

            iteration += 1
            last_logprob = current_logprob

            # Update the main model
            # Only save if we've been asked to save between iterations
            self.update_model(model, save=save_intermediate)

        self.model.add_history("Completed Baum-Welch unigram training")
        # Update the distribution's parameters with those we've trained
        self.update_model(model, save=save)
        return