예제 #1
0
    def __init__(self):
        """Start with empty transition tables and empty head/tail counters."""
        # raw table: word 2-tuple -> {trailing word: frequency}
        self.__words = dict()

        # compiled table: word 2-tuple -> (frequency, [(word, frequency), ...])
        self.__words_compiled = dict()

        # cardinalities of phrase start words and phrase end words
        self.__heads, self.__tails = KeyCounter(), KeyCounter()

        # (word, cardinality) tuples
        self.__heads_compiled = list()
예제 #2
0
 def __init__(self):
     """Start with empty transition tables and empty head/tail counters."""
     # raw table: word 2-tuple -> {trailing word: frequency}
     self.__words = dict()

     # compiled table: word 2-tuple -> (frequency, [(word, frequency), ...])
     self.__words_compiled = dict()

     # cardinalities of phrase start words and phrase end words
     self.__heads, self.__tails = KeyCounter(), KeyCounter()

     # (word, cardinality) tuples
     self.__heads_compiled = list()
예제 #3
0
    def test_ingest_correct_length(self):
        """Speak repeatedly from an ingested corpus and tally length violations.

        Ingests ``_text`` into a fresh speaker, compiles it, then calls
        ``speak(min_length, max_length)`` ``run_count`` times.  A result
        shorter than ``min_length`` is tallied under "short", one longer than
        ``max_length`` under "long", and any exception raised inside the
        attempt is tallied under the exception object itself.
        """
        speaker = self.__new_speaker()
        speaker.ingest(_text)
        speaker.compile()

        run_count = 100
        min_length = 70
        max_length = 130
        errors = KeyCounter()
        # NOTE(review): nothing in the visible part of this test asserts on
        # `errors` -- confirm the tally is checked after the loop.
        for _ in range(run_count):
            try:
                result = speaker.speak(min_length, max_length)
                if len(result) < min_length:
                    errors.increment("short")
                elif len(result) > max_length:
                    errors.increment("long")
            # `except X as e` is accepted by Python 2.6+ and Python 3, whereas
            # the older `except X, e` spelling is a SyntaxError on Python 3
            except Exception as e:
                errors.increment(e)
예제 #4
0
    def test_ingest_correct_length(self):
        """Speak repeatedly from an ingested corpus and tally length violations.

        Ingests ``_text`` into a fresh speaker, compiles it, then calls
        ``speak(min_length, max_length)`` ``run_count`` times.  A result
        shorter than ``min_length`` is tallied under "short", one longer than
        ``max_length`` under "long", and any exception raised inside the
        attempt is tallied under the exception object itself.
        """
        speaker = self.__new_speaker()
        speaker.ingest(_text)
        speaker.compile()

        run_count = 100
        min_length = 70
        max_length = 130
        errors = KeyCounter()
        # NOTE(review): nothing in the visible part of this test asserts on
        # `errors` -- confirm the tally is checked after the loop.
        for _ in range(run_count):
            try:
                result = speaker.speak(min_length, max_length)
                if len(result) < min_length:
                    errors.increment("short")
                elif len(result) > max_length:
                    errors.increment("long")
            # `except X as e` is accepted by Python 2.6+ and Python 3, whereas
            # the older `except X, e` spelling is a SyntaxError on Python 3
            except Exception as e:
                errors.increment(e)
예제 #5
0
 def find_artifact_counts_newer(cls, datetime, **kw):
     """Tally the artifacts yielded by ArtifactInfo.find_newer by source name.

     ``datetime`` and any keyword arguments are forwarded unchanged to
     ``ArtifactInfo.find_newer``.  Returns the tally converted with
     ``KeyCounter.to_hash()``.
     """
     tally = KeyCounter()
     newer_artifacts = ArtifactInfo.find_newer(datetime, **kw)
     for artifact in newer_artifacts:
         # one increment per artifact, keyed on its source_name
         tally.increment(artifact.source_name)
     return tally.to_hash()
예제 #6
0
class Markov2Speaker(SelectingSpeaker):
    """ 2nd order Markov chain.  mostly congruent to Markov1Speaker. """

    def __init__(self):
        # keys are word 2-tuples, values are dicts mapping trailing words to frequency
        self.__words = {}

        # keys are word 2-tuples, values are tuples of (frequency, list of (word, frequency) tuples)
        self.__words_compiled = {}

        # maps phrase start and end words to cardinalities
        self.__heads = KeyCounter()
        self.__tails = KeyCounter()

        # list of word-cardinality tuples
        self.__heads_compiled = []

        # list of (word 2-tuple, frequency) tuples; rebuilt by compile()
        self.__word_weights = []

    def ingest(self, phrase):
        """Record the word transitions of every sentence found in ``phrase``.

        ``phrase`` is split with ``tokenize_sentences(phrase, 50,
        lowercase=True)``.  Each sentence is split on whitespace and
        terminated with ``Symbols.END``; its first two words are counted as a
        head, and every consecutive (w1, w2) -> w3 triple is counted in the
        transition table.
        """
        for sentence in tokenize_sentences(phrase, 50, lowercase=True):
            phrase_words = sentence.split()
            phrase_words.append(Symbols.END)

            # phrases under 3 are of no use to a 2nd-order chain.  Skip only
            # this sentence: returning here would silently drop every
            # remaining sentence of the phrase.
            if len(phrase_words) < 3:
                continue

            # grabs first 2 words of phrase
            self.__heads.increment((phrase_words[0], phrase_words[1]))

            # slide a 3-word window over the sentence (END included)
            for w1, w2, w3 in zip(phrase_words, phrase_words[1:], phrase_words[2:]):
                trailing_words = self.__words.setdefault((w1, w2), {})
                trailing_words[w3] = trailing_words.get(w3, 0) + 1

    def compile(self):
        """ converts word grid into more efficient structure """
        self.__words_compiled.clear()
        del self.__heads_compiled[:]

        for w_pair, counts in self.__words.iteritems():
            trailing_words = list(counts.iteritems())
            w_pair_count = sum(count for _, count in trailing_words)
            self.__words_compiled[w_pair] = (w_pair_count, trailing_words)

        # builds list of (word 2-tuple, frequency) tuples
        self.__word_weights = [(w_pair, p[0])
                               for w_pair, p in self.__words_compiled.iteritems()]

        self.__heads_compiled.extend(
            (pair, count) for pair, count in self.__heads.iteritems())

    def select(self, selected, min_length, max_length):
        """
        Picks the next piece of the phrase being built.

        params:
            selected - list of selected words.
            min_length - once calculate_length(selected) exceeds this, the
                phrase ends as soon as END is a possible next word.
            max_length - accepted for interface compatibility; not used here.
        return:
            when ``selected`` is empty, whatever
            rrandom.select_weighted_with_replacement yields for the compiled
            heads (presumably a head word pair -- confirm against rrandom);
            otherwise a 1-tuple holding the next word, or None when the
            phrase should end.
        raises:
            MissingDataException - nothing usable has been ingested.
        """
        if not self.__words:
            raise MissingDataException("no satisfactory content has been ingested")
        elif not self.__words_compiled:
            self.compile()

        # select a trailing word for the given pair
        def select_next(current_pair):
            pair_stats = self.__words_compiled.get(current_pair, None)
            next_word = None
            if pair_stats:
                trailing_words = pair_stats[1]
                # ends if we're past the min length and have END as a potential next word.
                # pair_stats is (frequency, [(word, frequency), ...]), so END has to be
                # looked up among the trailing words, not in the outer tuple.
                if (calculate_length(selected) > min_length) and \
                        any(word is Symbols.END for word, _ in trailing_words):
                    next_word = None
                else:
                    logging.debug("select_next %s %s", current_pair, pair_stats)

                    # NOTE: uniform pick among the distinct trailing words;
                    # the recorded frequencies are not used as weights here
                    next_word = random.choice(trailing_words)[0]

            if next_word is Symbols.END:
                next_word = None
            return (next_word,) if next_word else None

        if not selected:
            # select first pair via weighted random over the phrase heads
            result = rrandom.select_weighted_with_replacement(self.__heads_compiled)
        else:
            # select using last 2 words as params
            result = select_next((selected[-2], selected[-1]))

        return result

    def describe(self):
        """Return a text dump of the transition table.

        One line per word pair (in sorted order), followed by one indented
        "word: count" line per trailing word, highest count first.
        """
        parts = []
        for w_pair, suffix_count in sorted(self.__words.iteritems()):
            parts.append("%s %s\n" % w_pair)
            for w_tail, count in sorted(suffix_count.iteritems(), key=lambda p: p[1], reverse=True):
                parts.append("  %s: %d\n" % (w_tail, count))
        return "".join(parts)
예제 #7
0
 def find_artifact_counts_newer(cls, datetime, **kw):
     """Tally the artifacts yielded by ArtifactInfo.find_newer by source name.

     ``datetime`` and any keyword arguments are forwarded unchanged to
     ``ArtifactInfo.find_newer``.  Returns the tally converted with
     ``KeyCounter.to_hash()``.
     """
     tally = KeyCounter()
     newer_artifacts = ArtifactInfo.find_newer(datetime, **kw)
     for artifact in newer_artifacts:
         # one increment per artifact, keyed on its source_name
         tally.increment(artifact.source_name)
     return tally.to_hash()
예제 #8
0
class Markov2Speaker(SelectingSpeaker):
    """ 2nd order Markov chain.  mostly congruent to Markov1Speaker. """

    def __init__(self):
        # keys are word 2-tuples, values are dicts mapping trailing words to frequency
        self.__words = {}

        # keys are word 2-tuples, values are tuples of (frequency, list of (word, frequency) tuples)
        self.__words_compiled = {}

        # maps phrase start and end words to cardinalities
        self.__heads = KeyCounter()
        self.__tails = KeyCounter()

        # list of word-cardinality tuples
        self.__heads_compiled = []

        # list of (word 2-tuple, frequency) tuples; rebuilt by compile()
        self.__word_weights = []

    def ingest(self, phrase):
        """Record the word transitions of every sentence found in ``phrase``.

        ``phrase`` is split with ``tokenize_sentences(phrase, 50,
        lowercase=True)``.  Each sentence is split on whitespace and
        terminated with ``Symbols.END``; its first two words are counted as a
        head, and every consecutive (w1, w2) -> w3 triple is counted in the
        transition table.
        """
        for sentence in tokenize_sentences(phrase, 50, lowercase=True):
            phrase_words = sentence.split()
            phrase_words.append(Symbols.END)

            # phrases under 3 are of no use to a 2nd-order chain.  Skip only
            # this sentence: returning here would silently drop every
            # remaining sentence of the phrase.
            if len(phrase_words) < 3:
                continue

            # grabs first 2 words of phrase
            self.__heads.increment((phrase_words[0], phrase_words[1]))

            # slide a 3-word window over the sentence (END included)
            for w1, w2, w3 in zip(phrase_words, phrase_words[1:],
                                  phrase_words[2:]):
                trailing_words = self.__words.setdefault((w1, w2), {})
                trailing_words[w3] = trailing_words.get(w3, 0) + 1

    def compile(self):
        """ converts word grid into more efficient structure """
        self.__words_compiled.clear()
        del self.__heads_compiled[:]

        for w_pair, counts in self.__words.iteritems():
            trailing_words = list(counts.iteritems())
            w_pair_count = sum(count for _, count in trailing_words)
            self.__words_compiled[w_pair] = (w_pair_count, trailing_words)

        # builds list of (word 2-tuple, frequency) tuples
        self.__word_weights = [
            (w_pair, p[0])
            for w_pair, p in self.__words_compiled.iteritems()
        ]

        self.__heads_compiled.extend(
            (pair, count) for pair, count in self.__heads.iteritems())

    def select(self, selected, min_length, max_length):
        """
        Picks the next piece of the phrase being built.

        params:
            selected - list of selected words.
            min_length - once calculate_length(selected) exceeds this, the
                phrase ends as soon as END is a possible next word.
            max_length - accepted for interface compatibility; not used here.
        return:
            when ``selected`` is empty, whatever
            rrandom.select_weighted_with_replacement yields for the compiled
            heads (presumably a head word pair -- confirm against rrandom);
            otherwise a 1-tuple holding the next word, or None when the
            phrase should end.
        raises:
            MissingDataException - nothing usable has been ingested.
        """
        if not self.__words:
            raise MissingDataException(
                "no satisfactory content has been ingested")
        elif not self.__words_compiled:
            self.compile()

        # select a trailing word for the given pair
        def select_next(current_pair):
            pair_stats = self.__words_compiled.get(current_pair, None)
            next_word = None
            if pair_stats:
                trailing_words = pair_stats[1]
                # ends if we're past the min length and have END as a potential next word.
                # pair_stats is (frequency, [(word, frequency), ...]), so END has to be
                # looked up among the trailing words, not in the outer tuple.
                if (calculate_length(selected) > min_length) and \
                        any(word is Symbols.END for word, _ in trailing_words):
                    next_word = None
                else:
                    logging.debug("select_next %s %s", current_pair,
                                  pair_stats)

                    # NOTE: uniform pick among the distinct trailing words;
                    # the recorded frequencies are not used as weights here
                    next_word = random.choice(trailing_words)[0]

            if next_word is Symbols.END:
                next_word = None
            return (next_word, ) if next_word else None

        if not selected:
            # select first pair via weighted random over the phrase heads
            result = rrandom.select_weighted_with_replacement(
                self.__heads_compiled)
        else:
            # select using last 2 words as params
            result = select_next((selected[-2], selected[-1]))

        return result

    def describe(self):
        """Return a text dump of the transition table.

        One line per word pair (in sorted order), followed by one indented
        "word: count" line per trailing word, highest count first.
        """
        parts = []
        for w_pair, suffix_count in sorted(self.__words.iteritems()):
            parts.append("%s %s\n" % w_pair)
            for w_tail, count in sorted(suffix_count.iteritems(),
                                        key=lambda p: p[1],
                                        reverse=True):
                parts.append("  %s: %d\n" % (w_tail, count))
        return "".join(parts)