Example #1
    def _prob_classify(self, input):

        # Make a featureset of the input after tokenizing it
        input_tokenized_featureset = self._tokenizeInputToFeatures(
            input).copy()

        # Ensure that all the feature names are valid and can be used
        for input_feature_name in list(input_tokenized_featureset.keys()):
            for label in self._labels:
                if (label, input_feature_name
                    ) in self._featureProbabilityDistribution:
                    break
            else:
                #print 'Ignoring unseen feature %s' % input_feature_name
                del input_tokenized_featureset[input_feature_name]

        # Start with a log probability of 0 to avoid skewing towards larger data sets
        logprob = {}
        for label in self._labels:
            #print "in here adding labels"
            logprob[label] = 0

        # Add in the log probability of features given labels.
        # Iterate through the labels assigned eg : location,time, noise
        for label in self._labels:

            # Iterate through the input feature set one by one eg "{turkey:true, bacon:true}"
            for (input_feature_name,
                 input_feature_val) in input_tokenized_featureset.items():

                # If the combination, i.e. (location, turkey), appears in the training set, add its log probability
                if (label, input_feature_name
                    ) in self._featureProbabilityDistribution:
                    # Assign its probability
                    feature_probs = self._featureProbabilityDistribution[
                        label, input_feature_name]
                    logprob[label] += feature_probs.logprob(input_feature_val)
                else:
                    # nb: This case will never come up if the classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        dictprobDist = DictionaryProbDist(logprob, normalize=True, log=True)
        return dictprobDist
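
The DictionaryProbDist returned here supports NLTK's standard ProbDistI interface (samples, prob, max). A minimal usage sketch, assuming a trained classifier instance exposing this method and a hypothetical raw-text input:

    # Hedged usage sketch; the classifier instance and input string are hypothetical.
    dist = classifier._prob_classify("turkey bacon sandwich")
    for label in dist.samples():
        print(label, dist.prob(label))   # normalized probabilities over all labels
    print(dist.max())                    # the most likely label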
Example #2
    def get_class_probs(self, token):
        feature_vector = token[self.property('FEATURE_VECTOR')]

        if len(feature_vector) * len(self._classes) != len(self._weights):
            raise ValueError('Bad feature vector length')

        prob_dict = {}
        for i, cls in enumerate(self._classes):
            # Find the offset into the weights vector.
            offset = i * len(feature_vector)

            # Multiply the weights of all active features for this class.
            prod = 1.0
            for (id, val) in feature_vector.assignments():
                prod *= (self._weights[id + offset]**val)
            prob_dict[cls] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, normalize=True)
Example #3
    def prob_classify(self, featureset):
        """
        Return a probability distribution of classifications

        :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
        """
        if self._model is None:
            raise Exception('This classifier is not yet trained')

        # do the classification
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print('prediction', prediction)

        # lump it into a boolean class, -1 or +1
        predicted_label = (prediction > 0) - (prediction < 0)  # sign of the score; cmp() is Python 2-only

        # sometimes the result is not within -1 ... +1; clip it so
        # that it is, and we get a sane-looking probability
        # distribution.  this will upset some results with non-linear
        # partitioning where instance-hyperplane distance can be many
        # orders of magnitude larger; I don't have a fix for that
        if prediction < -1.0:
            prediction = -1.0
        if prediction > 1.0:
            prediction = 1.0

        # if the prediction is negative, then we will maximise the
        # value of the -1 class; otherwise, that of the 1 class will
        # be greater.
        if predicted_label == 1:
            distribution = {
                str(self.resolve_prediction(1)): prediction,
                str(self.resolve_prediction(-1)): 1 - prediction
            }
        else:
            distribution = {
                str(self.resolve_prediction(1)): prediction + 1,
                str(self.resolve_prediction(-1)): -prediction
            }

        return DictionaryProbDist(distribution)
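
To make the score-to-distribution mapping concrete, here is a small worked example with made-up numbers (the 'pos'/'neg' keys stand in for whatever resolve_prediction returns):

    # Hypothetical worked example of the mapping above.
    prediction = -0.4                   # clipped SVM decision value
    predicted_label = -1                # sign of the prediction
    distribution = {
        'pos': prediction + 1,          # 0.6
        'neg': -prediction,             # 0.4 (the two values always sum to 1.0)
    }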
Example #4
def parse_sausage(fname):
    """gets the filename of a sausage and returns a list of probability distributions"""
    sausage = []
    with open(fname) as f:
        for line in f:
            if line.startswith('align'):
                # align a w1 p1 w2 p2 ...
                # split line and ignore first two tokens
                bits = line.split()[2:]
                dist = DictionaryProbDist(
                    {w: float(p)
                     for w, p in zip(bits[::2], bits[1::2])})
                sausage.append(dist)

    # remove sentence boundaries
    assert sausage[0].samples() == ['<s>']
    assert sausage[-1].samples() == ['</s>']
    sausage = sausage[1:-1]

    return sausage
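
For reference, a sausage file of the kind this function expects might contain lines like the following; the words, probabilities, and filename are made up:

    align 0 <s> 1.0
    align 1 the 0.7 a 0.3
    align 2 cat 0.9 hat 0.1
    align 3 </s> 1.0

Calling parse_sausage('example.sausage') on such a file would return two DictionaryProbDist objects (the <s> and </s> boundary slots are stripped), so the first distribution would give prob('the') == 0.7.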
Example #5
    def __init__(self, label_dist, emission_dist, label_dom, emission_dom, \
                        mutable=False):
        """
        @type label_dist: nltk prob dist
        @param label_dist: transition distribution
        @type emission_dist: nltk prob dist
        @param emission_dist: emission distribution
        @type label_dom: list
        @param label_dom: state domain
        @type emission_dom: list
        @param emission_dom: emission domain
        @type mutable: bool
        @param mutable: if true, the distributions stored will be mutable 
            dictionary distributions, so the model can be updated
        
        """
        self.order = 2

        self.label_dom = label_dom
        self.num_labels = len(label_dom)
        self.emission_dom = emission_dom
        self.num_emissions = len(emission_dom)

        self.label_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
                                label_dist, mutable=mutable)
        self.emission_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
                                emission_dist, mutable=mutable)
        # Marginalize the emission dist to get an unconditioned version
        observations = {}
        for label in emission_dist.conditions():
            for samp in emission_dist[label].samples():
                observations[samp] = observations.get(samp, 0.0) + \
                        emission_dist[label].prob(samp)
        self.observation_dist = DictionaryProbDist(observations)
        self.seen_labels = label_dom

        self.backoff_model = None

        # Initialize the various caches
        # These will be filled as we access probabilities
        self.clear_cache()
Example #6
    def prob_classify(self, feat):
        '''Return ProbDistI of averaged label probabilities.'''
        label_probs = collections.defaultdict(list)

        for classifier in self._classifiers:
            try:
                cprobs = classifier.prob_classify(feat)

                for label in cprobs.samples():
                    label_probs[label].append(cprobs.prob(label))
            except NotImplementedError:
                # if we can't do prob_classify (like for DecisionTree)
                # assume 100% probability from classify
                label_probs[classifier.classify(feat)].append(1)

        avg_probs = {}

        for label, probs in label_probs.items():
            avg_probs[label] = float(sum(probs)) / len(probs)

        return DictionaryProbDist(avg_probs)
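
A worked example of the averaging step, with made-up numbers:

    # Suppose three classifiers report P('pos') as 0.90, 0.60 and 0.75 for the same featureset.
    # label_probs['pos'] == [0.90, 0.60, 0.75]
    # avg_probs['pos'] == (0.90 + 0.60 + 0.75) / 3 == 0.75
    # The returned DictionaryProbDist then reports prob('pos') == 0.75.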
Example #7
    def prob_classify(self, featureset):
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                del featureset[fname]

        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    logprob[label] += sum_logs([])

        return DictionaryProbDist(logprob, normalize=True, log=True)
Example #8
    def prob_classify(self, featureset, priors=None):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                #print 'Ignoring unseen feature %s' % fname
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        if priors is None:
            print("no prior", end=" ")
            for label in self._labels:
                logprob[label] = self._label_probdist.logprob(label)
        else:
            for label, prob in priors.items():
                logprob[label] = math.log(prob, 2)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label,fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([]) # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
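
The optional priors argument replaces the learned label prior with caller-supplied probabilities. A minimal sketch of both call styles, assuming a trained classifier whose labels are 'pos' and 'neg' (the featureset and prior values are illustrative):

    # Hedged usage sketch.
    feats = {'contains(great)': True}
    dist_default = classifier.prob_classify(feats)   # uses the trained label prior
    dist_skewed = classifier.prob_classify(feats, priors={'pos': 0.9, 'neg': 0.1})
    print(dist_default.prob('pos'), dist_skewed.prob('pos'))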
Example #9
skwWords = [w for w in akwWords if not w in kwstopWords]
kwDict['NoStop'] = len(skwWords)

## MAYBE USE THIS?
# remove small words
# elected not to use this
finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 2)
scored = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:20]:
    print(bscore)

# need to stem, but really only want to stem "horse" and "horses"

# First list the top 50 words by frequency (normalized by the length of the document)
bbDist = FreqDist(sbbWords)
bbDist2 = DictionaryProbDist(bbDist, normalize=True)
bbDist2.prob('black')
bbDist2.prob('horse')
bbDist.plot(50)
# need to normalize the second number: divide the count by len(sbbWords)
bbItems = bbDist.most_common(50)
# Show the normalized probability
for item in bbItems:
    print(item)

# King of the Wind Frequency Distribution
kwDist = FreqDist(skwWords)
kwDist2 = DictionaryProbDist(kwDist, normalize=True)
kwDist2.prob('said')
kwDist2.prob('agba')
kwDist.plot(50)
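
The FreqDist-to-DictionaryProbDist step works because FreqDist is a dict of counts, and normalize=True turns those counts into relative frequencies. A small self-contained sketch of the same pattern:

    from nltk.probability import FreqDist, DictionaryProbDist

    words = ['black', 'horse', 'black', 'stallion']
    fd = FreqDist(words)
    pd = DictionaryProbDist(fd, normalize=True)
    print(pd.prob('black'))   # 0.5 (2 occurrences out of 4)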
Example #10
 def _make_probdist(self, y_proba):
     classes = self._encoder.classes_
     return DictionaryProbDist(dict((classes[i], p)
                                    for i, p in enumerate(y_proba)))
Example #11
 def pd(values, samples):
     d = dict(zip(samples, values))
     return DictionaryProbDist(d)
Example #12
print(nb_classifier.classify(posfeat))

print(accuracy(nb_classifier, test_feats))

probs = nb_classifier.prob_classify(test_feats[0][0])
print(probs.samples())

print(probs.max())

print(probs.prob('pos'))

print(probs.prob('neg'))

print(nb_classifier.most_informative_features(n=5))
print("############################################################################")
print(nb_classifier.show_most_informative_features(n=5))
print("############################################################################")

nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist)

print("Accuracy: " + str(accuracy(nb_classifier, test_feats)))
# Accuracy: 0.76

label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})
true_probdist = DictionaryProbDist({True: 1})
feature_probdist = {('pos', 'yes'): true_probdist, ('neg', 'no'): true_probdist}
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

print(classifier.classify({'yes': True}))
print(classifier.classify({'no': True}))
Example #13
 def parse_weka_distribution(self, s):
     probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
     probs = dict(zip(self._formatter.labels(), probs))
     return DictionaryProbDist(probs)
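
For context, Weka's prediction output lists the class probabilities as a comma-separated string with a '*' marking the predicted class; that format is an assumption here, not shown in the snippet itself. A hedged sketch of what the parsing step does:

    import re

    s = '0.41,*0.59'                                               # assumed Weka distribution column
    probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]  # [0.41, 0.59]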
Example #14
    def train(positive_featuresets,
              unlabeled_featuresets,
              positive_prob_prior=0.5,
              estimator=ELEProbDist):
        """
        :param positive_featuresets: A list of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: A list of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        # Count up how many times each feature value occurred in unlabeled examples.
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
        num_positive_examples = len(positive_featuresets)
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname].inc(None,
                                                 num_positive_examples - count)
            feature_values[fname].add(None)

        num_unlabeled_examples = len(unlabeled_featuresets)
        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname].inc(
                None, num_unlabeled_examples - count)
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist({
            True: positive_prob_prior,
            False: negative_prob_prior
        })

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            global_probdist = estimator(freqdist,
                                        bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                prob = (global_probdist.prob(fval)
                        - positive_prob_prior *
                        feature_probdist[True, fname].prob(fval)) \
                        / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True)

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
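
A minimal usage sketch of the trainer above, assuming train() is exposed as a static method of PositiveNaiveBayesClassifier (as in NLTK); the sentences and the toy feature extractor are illustrative:

    def word_features(sentence):
        # Toy feature extractor: mark each lowercased word as present.
        return {word: True for word in sentence.lower().split()}

    positive = [word_features(s) for s in [
        'Python is a great language',
        'I love writing Python code',
    ]]
    unlabeled = [word_features(s) for s in [
        'The sky is blue today',
        'Python is a great language',
        'I like my coffee black',
    ]]

    classifier = PositiveNaiveBayesClassifier.train(positive, unlabeled)
    print(classifier.classify(word_features('Python code is great')))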
Example #15
 def pd(values, samples):
     d = {}
     for value, item in zip(values, samples):
         d[item] = value
     return DictionaryProbDist(d)
Example #16
    def initialize_chord_types(cls,
                               probs,
                               model_name="default",
                               chord_set="scale+dom7"):
        """
        Creates a new model with the distributions initialized naively to 
        favour simple chord-types, as R&S do in the paper. They don't say 
        what values they use for C{probs}, except that they're high, medium 
        and low respectively.
        
        The transition distribution is initialized so that everything is 
        equiprobable.
        
        @type probs: 3-tuple of floats
        @param probs: probability mass to assign to (0.) chord notes, (1.) 
            scale notes and (2.) other notes. The three values should sum to
            1.0 (but will be normalized if they don't)
        
        """
        prob_sum = sum(probs)
        probs = [p / prob_sum for p in probs]

        # Create a probability distribution for the emission
        #  distribution
        dists = {}
        # Create the distribution for each possible r-value
        for r in range(4):
            probabilities = {}
            for d in [0, 1, 2]:
                probabilities[d] = probs[0] / 3.0
            probabilities[3] = probs[1]
            probabilities[4] = probs[2]
            dists[r] = DictionaryProbDist(probabilities)
        emission_dist = DictionaryConditionalProbDist(dists)

        # These distributions will make everything equiprobable
        key_transition_counts = ConditionalFreqDist()
        chord_transition_counts = ConditionalFreqDist()
        chord_counts = {}
        # Get all possible labels
        label_dom = cls.get_label_dom(chord_set=chord_set)

        for label0 in label_dom:
            for label1 in label_dom:
                key, pkey = states_to_key_transition(label1, label0)
                # Give one count to the key transition corresponding to this state transition
                key_transition_counts[pkey].inc(key)
                # And one to the chord transition corresponding to this state transition
                if label0[0] == label1[0] and label0[1] == label1[1]:
                    # tonic = tonic', mode = mode'
                    chord_transition_counts[label0[2]].inc(label1[2])
                else:
                    chord_counts.setdefault(label1[2], 0)
                    chord_counts[label1[2]] += 1

        # Estimate distributions from these frequency distributions
        key_dist = ConditionalProbDist(key_transition_counts, mle_estimator,
                                       None)
        chord_trans_dist = ConditionalProbDist(chord_transition_counts,
                                               mle_estimator, None)
        chord_dist = DictionaryProbDist(chord_counts)
        # Sample these to get dictionary prob dists
        key_dist = cond_prob_dist_to_dictionary_cond_prob_dist(key_dist)
        chord_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
            chord_trans_dist)
        chord_dist = prob_dist_to_dictionary_prob_dist(chord_dist)

        model = cls(key_dist, \
                      chord_trans_dist, \
                      emission_dist, \
                      chord_dist, \
                      model_name=model_name,
                      chord_set=chord_set)
        model.add_history(\
            "Initialized model '%s' to chord type probabilities, using "\
            "parameters: %s, %s, %s" % (model_name, probs[0], probs[1], probs[2]))
        return model
Example #17
    def setUp(self):
        """
        Prepare some training data.
        
        """
        self.TRAINING_DATA = [
            [0, 5, 5, 7, 6, 7, 8, 5, 2, 0, 3, 1, 2, 2, 2, 9, 9, 8, 0, 8, 9, 9, 1, 3, 2, 2, 1],
            [3, 3, 1, 2, 1, 1, 0, 1, 9, 7, 8, 7, 7, 9, 0],
            [7, 8, 6, 9, 8, 9, 9, 1, 3, 0, 1, 3, 0, 1, 1, 0, 5, 7, 5, 4, 5, 7, 7],
        ]
        self.TEST_DATA = [0, 1, 2, 3, 4, 3, 5, 6, 7, 8, 8, 9, 7, 7, 0, 0, 1]

        ems = list(range(10))
        states = ['H', 'M', 'L']
        # Construct some initial distributions
        # Emission
        hprobs = {
            0: 0.0,
            1: 0.0,
            2: 0.0,
            3: 0.0,
            4: 0.0,
            5: 0.0,
            6: 0.1,
            7: 0.3,
            8: 0.3,
            9: 0.3
        }
        mprobs = {
            0: 0.0,
            1: 0.0,
            2: 0.0,
            3: 0.1,
            4: 0.3,
            5: 0.3,
            6: 0.3,
            7: 0.0,
            8: 0.0,
            9: 0.0
        }
        lprobs = {
            0: 0.2,
            1: 0.2,
            2: 0.2,
            3: 0.2,
            4: 0.2,
            5: 0.0,
            6: 0.0,
            7: 0.0,
            8: 0.0,
            9: 0.0
        }
        conddist = {
            'H': DictionaryProbDist(hprobs),
            'M': DictionaryProbDist(mprobs),
            'L': DictionaryProbDist(lprobs),
        }
        emdist = DictionaryConditionalProbDist(conddist)
        # And transition
        conddist = {}
        for first in states + [None]:
            probs = dict([(second, 1.0 / 3) for second in states + [None]])
            dist = DictionaryProbDist(probs)
            conddist[(first, )] = dist
        transdist = DictionaryConditionalProbDist(conddist)

        # Initialize an ngram model with these distributions
        self.model = DictionaryHmmModel(transdist, emdist, states, ems)
Example #18
class NovelParagraph:
    def __init__(self, *args, **kwargs):
        if 'strategy' in kwargs:
            self.strategy = kwargs['strategy']
        else:
            self.strategy = 'best'

        self.events = []
        self.sentences = []
        self.source_probability = {}
        self.querysets = {}
        self.sources = []
        self.symmetrical_tokens = []
        for source, probability in args:
            self.source_probability[source] = probability
            self.querysets[source] = NGram.objects.filter(
                **reconcile_old_style_source(source)
            )
            self.sources.append(source)
            if self.querysets[source].count() == 0:
                raise InvalidSourceException("No NGrams with this source")
        self.source_probability = DictionaryProbDist(self.source_probability)

    def pick_queryset(self):
        return self.querysets[self.source_probability.generate()]

    def append_sentence(self):
        self.current_sentence = []
        starter = self.pick_queryset().filter(
            sentence_starter=True
        ).order_by('?').first()
        self.current_sentence.append((starter.token_one, starter.tag_one))
        self.current_sentence.append((starter.token_two, starter.tag_two))
        self.current_sentence.append((starter.token_three, starter.tag_three))
        while self.current_sentence[-1][0] not in TERMINAL_PUNCTUATION:
            new_word = self.new_word()
            self.current_sentence.append(new_word)
        self.sentences.append(self.current_sentence)

    def _get_others(self, original):
        sources = self.sources.copy()
        sources.remove(original)
        return [
            NGram.objects.filter(
                **reconcile_old_style_source(source)
            ) for source in sources
        ]

    def _account_for_symmetrical_tokens(self, token):
        if token in SYMMETRICAL_TOKENS:
            self.symmetrical_tokens.append(
                ( SYMMETRICAL_TOKENS[token], SYMMETRICAL_TOKENS[token] )
            )

    def new_word(self):
        queryset = self.pick_queryset()
        ordered_querysets = [queryset]

        if len(self.sources) > 1:
            if queryset.first().twitter_user:
                source = queryset.first().twitter_user.twitter_id + '@twitter'
            else:
                source = 'document:'+queryset.first().document.name
            ordered_querysets = ordered_querysets + self._get_others(source)

        for qs in ordered_querysets:
            new_word = self.new_word_from_queryset(qs)
            if new_word:
                self._account_for_symmetrical_tokens(new_word[0])
                if new_word[0] in TERMINAL_PUNCTUATION:
                    if len(self.symmetrical_tokens) > 0:
                        return self.symmetrical_tokens.pop()
                return new_word

        if len(self.symmetrical_tokens) > 0:
            return self.symmetrical_tokens.pop()

        return ('.', '.')

    def _best_matching_word(self, queryset):
        if self.strategy == 'grammar_only':
            return queryset.filter(
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
        else:
            nxt = queryset.filter(
                token_one__iexact=self.current_sentence[-2][0],
                token_two__iexact=self.current_sentence[-1][0],
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
            if not nxt:
                nxt = queryset.filter(
                    token_one__iexact=self.current_sentence[-2][0],
                    token_two__iexact=self.current_sentence[-1][0],
                ).order_by('?').first()
            return nxt

    def new_word_from_queryset(self, queryset):
        nxt = self._best_matching_word(queryset)
        if nxt:
            return (nxt.token_three, nxt.tag_three)
        else:
            return None

    @classmethod
    def _needs_space(self, token, previous_token, index):
        if index == 0:
            return False
        if previous_token in NO_TRAILING_SPACE_TOKENS:
            return False
        if token in NO_LEADING_SPACE_TOKENS:
            return False
        return True
    
    @classmethod
    def _join_and_postprocess_sentences(self, sentences):
        sentences = [''.join(sentence) for sentence in sentences]
        text = ' '.join(sentences)
        for pattern, replacement in REGEX_REPLACEMENTS:
            text = re.sub(pattern, replacement, text) 
        return text 
    
    def human_readable_sentences(self):
        final_output = []
        for sent in self.sentences:
            output = []
            for i, token in enumerate(sent):
                if NovelParagraph._needs_space(token[0], sent[i-1][0], i):
                    output.append(' ')
                output.append(token[0])
            final_output.append(output)
        return NovelParagraph._join_and_postprocess_sentences(final_output)
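
The DictionaryProbDist feature this class leans on is generate(), which draws a random sample in proportion to the stored probabilities; pick_queryset uses it to choose a source queryset by weight. A minimal sketch of that idea in isolation (source names and weights are made up):

    from nltk.probability import DictionaryProbDist

    source_probability = DictionaryProbDist({'austen@gutenberg': 0.7,
                                             'some_user@twitter': 0.3})
    # Roughly 70% of draws should come from the first source.
    print([source_probability.generate() for _ in range(10)])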
Example #19
    def set_chord_transition_probabilities(self, spec):
        """
        Sets the parameters of the chord transition distribution. This is used 
        in initialization. The parameters are extracted from a string: this is 
        so that it can be specified in a script option.
        
        The required format of the string is a comma-separated list of 
        parameters given as C0->C1-P, where C0 and C1 are chords (I, II, etc) 
        that are in the model's distribution and P is a float probability.
        Parameters not specified will have the remaining probability mass
        distributed evenly among them.
        
        """
        params = {}
        param_re = re.compile(r'(?P<chord0>.+)->(?P<chord1>.+)-(?P<prob>.+)')
        chord_ids = dict(
            (name, num) for (num, name) in constants.CHORD_NAMES.items())

        def _chord_id(name):
            # Get the id for the named chord
            if name not in chord_ids:
                raise RaphstoHmmParameterError, "unrecognised chord name '%s' "\
                    "in parameter spec: %s" % (name,spec)
            cid = chord_ids[name]
            if cid not in self.chord_transition_dom:
                raise RaphstoHmmParameterError, "chord %s is not used with this "\
                    "model (in parameter spec: %s)" % (name,spec)
            return cid

        for param_str in spec.split(","):
            # Pull out the bits of the parameter specification
            match = param_re.match(param_str.strip())
            if not match:
                raise RaphstoHmmParameterError, "could not parse parameter "\
                    "spec: %s (in: %s)" % (param_str, spec)
            parts = match.groupdict()
            chord0 = _chord_id(parts['chord0'])
            chord1 = _chord_id(parts['chord1'])
            try:
                prob = float(parts['prob'])
            except ValueError:
                raise RaphstoHmmParameterError, "not a valid probability: %s "\
                    "(in %s)" % (parts['prob'], spec)
            # Store the parameter value
            params.setdefault(chord0, {})[chord1] = prob

        # Set the values in the transition distribution
        dists = {}
        for chord0 in self.chord_transition_dom:
            dist_params = {}
            if chord0 not in params:
                # Not given in the spec: uniform distribution
                uniform_mass = 1.0 / len(self.chord_transition_dom)
                for chord1 in self.chord_transition_dom:
                    dist_params[chord1] = uniform_mass
            else:
                # Work out the prob mass to be distributed among unspecified parameters
                not_given = len(self.chord_transition_dom) - len(
                    params[chord0])
                if not_given > 0:
                    given_mass = sum(params[chord0].values(), 0.0)
                    uniform_mass = (1.0 - given_mass) / not_given
                else:
                    uniform_mass = 0.0
                # Calculate the whole distribution
                for chord1 in self.chord_transition_dom:
                    if chord1 in params[chord0]:
                        dist_params[chord1] = params[chord0][chord1]
                    else:
                        dist_params[chord1] = uniform_mass
            dists[chord0] = DictionaryProbDist(dist_params)
        # Use this distribution instead of what's already there
        self.chord_transition_dist = DictionaryConditionalProbDist(dists)

        self.add_history("Set chord transition distribution using "\
            "parameters: %s" % spec)
Example #20
                           sep="\t",
                           quoting=csv.QUOTE_NONE,
                           header=None)

hmm_emits_pd = hmm_emits_pd.set_index([0, 1])
hmm_trans_pd = hmm_trans_pd.set_index([0, 1])
hmm_trans_pd = hmm_trans_pd.apply(lambda x: np.exp(x))
hmm_emits_pd = hmm_emits_pd.apply(lambda x: np.exp(x))

tag_dict_tag = dict()
for tag in distinct_tags:
    tag_dict = dict(
        zip(hmm_trans_pd.ix[tag].index, hmm_trans_pd.ix[tag].values.ravel()))
    #missing_to_dict = list(set(distinct_tags).difference(tag_dict.keys()))
    #tag_dict.update(zip(missing_to_dict,np.zeros(len(missing_to_dict))))
    tag_dict_tag[tag] = DictionaryProbDist(tag_dict)

transition = DictionaryConditionalProbDist(tag_dict_tag)

tag_dict_word = dict()
for tag in distinct_tags:
    tag_dict = dict(
        zip(hmm_emits_pd.ix[tag].index, hmm_emits_pd.ix[tag].values.ravel()))
    #missing_to_dict = list(set(distinct_tags).difference(tag_dict_word.keys()))
    #tag_dict_word.update(zip(missing_to_dict,np.zeros(len(missing_to_dict))))
    tag_dict_word[tag] = DictionaryProbDist(tag_dict)

emission = DictionaryConditionalProbDist(tag_dict_word)


def get_value(df, index_1, index_2):
Example #21
 def _make_probdist(self, y_proba):
     return DictionaryProbDist(dict((self._index_label[i], p)
                                    for i, p in enumerate(y_proba)))
Example #22
    weights = memcache.get_multi(allkeys,
                                 namespace=cache_ver,
                                 key_prefix=cache_ver)
    for label in labels:
        feature_vector = []
        for i in groups[label]:
            if i in weights:  # ?? in case get_multi didn't return this key
                feature_vector.append((weights[i], 1))
        if alwayson and label in alwayson:
            feature_vector.append((alwayson[label], 1))
        total = 0.0
        for (weight, f_val) in feature_vector:
            total += weight * f_val
        prob_dict[label] = total
    # Normalize the dictionary to give a probability distribution
    return DictionaryProbDist(prob_dict, log=True, normalize=True)


def encode(featureset, label, alwayson):
    # Inherit docs.
    encoding = []
    # Convert input-features to joint-features:
    keys = [(fname + str(fval) + label) for fname, fval in featureset.items()]
    # Known feature name & value:
    weights = memcache.get_multi(keys,
                                 namespace=cache_ver,
                                 key_prefix=cache_ver)
    for i in weights:
        encoding.append((weights[i], 1))
    # Add always-on features:
    if alwayson and label in alwayson:
Example #23
 def initialize_chord_classes(cls, tetrad_prob, max_notes, grammar, \
         illegal_transitions=[], fixed_root_transitions={}, metric=False):
     """
     Creates a new model with the distributions initialized naively to 
     favour simple chord-types, in a similar way to what R&S do in the paper. 
     
     The transition distribution is initialized so that everything is 
     equiprobable.
     
     @type tetrad_prob: float
     @param tetrad_prob: prob of a note in the tetrad. This prob is 
         distributed over the notes of the tetrad. The remaining prob 
         mass is distributed over the remaining notes. You'll want this 
         to be >0.33, so that tetrad notes are more probable than others.
     @type max_notes: int
     @param max_notes: maximum number of notes that can be generated in 
         each emission. Usually best to set to something high, like 100 - 
         it's just to make the distribution finite.
     @type grammar: L{jazzparser.grammar.Grammar}
     @param grammar: grammar from which to take the chord class definitions
     @type metric: bool
     @param metric: if True, creates a model with a metrical component 
         (dependence on metrical position). Default False
     
     """
     # Only use chord classes that are used by some morph item in the lexicon
     classes = [ccls for ccls in grammar.chord_classes.values() if ccls.used]
     
     # Create a probability distribution for the emission distribution
     dists = {}
     
     # Create the distribution for each possible r-value if we're creating 
     #  a metrical model
     if metric:
         r_vals = range(4)
     else:
         r_vals = [0]
     # Separate emission distribution for each chord class
     for ccls in classes:
         for r in r_vals:
             probabilities = {}
             # We assign two different probabilities: in tetrad or out
             # Don't assume the tetrad has 4 notes!
             in_tetrad_prob = tetrad_prob / len(ccls.notes)
             out_tetrad_prob = (1.0 - tetrad_prob) / (12 - len(ccls.notes))
             # Give a probability to every pitch class
             for d in range(12):
                 if d in ccls.notes:
                     probabilities[d] = in_tetrad_prob
                 else:
                     probabilities[d] = out_tetrad_prob
             dists[(ccls.name,r)] = DictionaryProbDist(probabilities)
     emission_dist = DictionaryConditionalProbDist(dists)
     
     # Take the state labels from the lexical entries in the grammar
     # Include only tonic categories that were generated from lexical 
     #  expansion rules - i.e. only tonic repetition categories
     schemata = grammar.midi_families.keys()
     
     # Check that the transition constraint specifications refer to existing 
     #  schemata
     for labels in illegal_transitions:
         for label in labels:
             if label not in schemata:
                 raise ValueError, "%s, given in illegal transition "\
                     "specification, is not a valid schema in the grammar" \
                     % label
     for labels in fixed_root_transitions:
         for label in labels:
             if label not in schemata:
                 raise ValueError, "%s, given in fixed root transition "\
                     "specification, is not a valid schema in the grammar" \
                     % label
     
     # Build from the grammar a mapping from lexical schemata (POSs) to 
     #  chord classes
     chord_class_mapping = {}
     for morph in grammar.morphs:
         if morph.pos in schemata:
             chord_class_mapping.setdefault(morph.pos, []).append(str(morph.chord_class.name))
     # Make sure that every label appears in the mapping
     for label in schemata:
         if label not in chord_class_mapping:
             chord_class_mapping[label] = []
     
     # Initialize transition distribution so every transition is equiprobable
     schema_transition_counts = ConditionalFreqDist()
     root_transition_counts = ConditionalFreqDist()
     for label0 in schemata:
         for label1 in schemata:
             # Increment the count once for each chord class associated 
             #  with this schema: schemata with 2 chord classes get 2 
             #  counts
             for cclass in chord_class_mapping[label1]:
                 schema_transition_counts[label0].inc(label1)
                 for root_change in range(12):
                     # Give one count to the root transition corresponding to this state transition
                     root_transition_counts[(label0,label1)].inc(root_change)
         # Give a count to finishing in this state
         schema_transition_counts[label0].inc(None)
     # Estimate distribution from this frequency distribution
     schema_trans_dist = ConditionalProbDist(schema_transition_counts, mle_estimator, None)
     root_trans_dist = ConditionalProbDist(root_transition_counts, mle_estimator, None)
     # Sample this to get dictionary prob dists
     schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(schema_trans_dist)
     root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(root_trans_dist)
     
     # Do the same with the initial states (just schemata, not roots)
     initial_state_counts = FreqDist()
     for label in schemata:
         initial_state_counts.inc(label)
     initial_state_dist = mle_estimator(initial_state_counts, None)
     initial_state_dist = prob_dist_to_dictionary_prob_dist(initial_state_dist)
     
     # Also initialize the notes number distribution to uniform
     emission_number_counts = FreqDist()
     for i in range(max_notes):
         emission_number_counts.inc(i)
     emission_number_dist = mle_estimator(emission_number_counts, None)
     emission_number_dist = prob_dist_to_dictionary_prob_dist(emission_number_dist)
     
     # Create the model
     model = cls(schema_trans_dist, 
                   root_trans_dist, 
                   emission_dist, 
                   emission_number_dist, 
                   initial_state_dist, 
                   schemata, 
                   chord_class_mapping, 
                   classes, 
                   metric=metric, 
                   illegal_transitions=illegal_transitions,
                   fixed_root_transitions=fixed_root_transitions)
     model.add_history(\
         "Initialized model to chord type probabilities, using "\
         "tetrad probability %s. Metric: %s" % \
         (tetrad_prob, metric))
     
     return model