Пример #1
0
    def test_serialize(self):
        """Check that serialize() reports the enqueued items as a tuple, oldest first."""
        queue = Queue()
        for letter in ('A', 'B'):
            queue.enqueue(letter)

        serialized = queue.serialize()

        expected = ('A', 'B')
        assert expected == serialized
Пример #2
0
class Markov(dict):
    """Markov chain stored as a mapping of state -> ``Dictogram`` of next tokens.

    A state is whatever ``self.memory.serialize()`` returns after tokens have
    been enqueued (presumably a tuple of the last ``order`` tokens -- confirm
    against ``Queue``).  The extra ``'START'`` key holds a ``Dictogram`` of the
    states whose first token begins with an ASCII capital letter; sentences
    are started from a sample of it.
    """

    def __init__(self, word_list, order=2, sentences=1):
        """Create the model and, when *word_list* is given, build the chain.

        Args:
            word_list: Tokens to learn from.  It is concatenated with a slice
                of itself, so it must support ``+`` and slicing (e.g. a list).
                ``None`` leaves the mapping empty, without a ``'START'`` entry.
            order: Order of the chain; passed to ``Queue`` for the memory.
            sentences: Number of sentences ``generate_sentence`` produces.
        """
        super().__init__()
        self.order = order  # order of the markov chain
        self.sentences = sentences
        self.memory = Queue(order)  # for sampling from markov model

        if word_list is not None:
            self['START'] = Dictogram()
            self._create_chain(word_list)

    def _create_chain(self, word_list):
        """Generate the internal markov chain that will be used by the sentence generator.

        The memory queue is always cleared on exit, even when building fails
        part-way, so a later call never starts from stale tokens.
        """
        # Compile once instead of re-resolving the pattern for every token.
        starts_with_capital = re.compile('[A-Z]').match

        try:
            # The first `order` tokens are appended again so that the states
            # straddling the end of the list also receive a successor.
            for i, message in enumerate(word_list + word_list[:self.order]):
                if i < self.order:
                    # Fill the queue first so that no state shorter than
                    # `order` is ever recorded.
                    self.memory.enqueue(message)
                    continue

                current_state = self.memory.serialize()  # state before `message`
                self.memory.enqueue(message)  # advance to the next state

                # TODO: improve how start tokens are sampled
                if starts_with_capital(current_state[0]) is not None:
                    # TODO: Don't add start tokens
                    self['START'].add_count(current_state)

                if current_state in self:
                    # Known state: record one more occurrence of its successor.
                    self[current_state].add_count(message)
                else:
                    # New state: start a dictogram holding its first successor.
                    self[current_state] = Dictogram([message])
        finally:
            self.memory.clear()

    def _sample_sentence(self, is_end_token, failsafe_limit):
        """Walk the chain once from a sampled start state and return one sentence."""
        # sample() is indexed with [0]: its first item is the sampled key.
        state = self['START'].sample()[0]
        words = list(state)  # the sentence opens with the start state's tokens

        try:
            for item in state:
                self.memory.enqueue(item)

            # At least one token is always sampled; `failsafe_limit + 1`
            # matches the original "stop once the counter exceeds the limit".
            for _ in range(max(failsafe_limit, 0) + 1):
                word = self[state].sample()[0]
                self.memory.enqueue(word)
                words.append(word)
                if is_end_token(word) is not None:
                    # The end token already carries its punctuation.
                    return ' '.join(words)
                # The next state is whatever the memory holds now.
                state = self.memory.serialize()
        finally:
            # Always reset the memory, including when a lookup raises, so
            # the next sentence does not start from leftover tokens.
            self.memory.clear()

        # No end token was reached within the limit: close the sentence here.
        return ' '.join(words) + '.'

    def generate_sentence(self, *, failsafe_limit=20):
        """Generate ``self.sentences`` sentences from the internal markov chain.

        A sentence ends at the first sampled token containing ``.``, ``?`` or
        ``!``.  If none shows up, sampling stops once more than
        *failsafe_limit* tokens have been sampled and a ``.`` is appended.

        Args:
            failsafe_limit: Sampled-token count that must be exceeded before
                a sentence without an end token is cut off.

        Returns:
            The generated sentences joined by single spaces.

        Raises:
            KeyError: If the chain has no ``'START'`` entry (built with
                ``word_list=None``) or a reached state is not in the mapping.
        """
        # Raw string: the original non-raw '[\.\?\!]' relied on invalid
        # string escapes; inside a character class no escaping is needed.
        is_end_token = re.compile(r'[.?!]').search

        sentences = [
            self._sample_sentence(is_end_token, failsafe_limit)
            for _ in range(self.sentences)
        ]
        return ' '.join(sentences)