예제 #1
0
    def test_single_sentence(self):
        """A chain trained on one sentence must replay its words in order."""
        sentence = "i like curry chicken"
        words = sentence.split(" ")
        m = Markov()
        m.add_sentence(sentence)

        first_word = m.rand_first_word()
        self.assertEqual(first_word, "i",
                         "'i' should be the first word")

        next_word = first_word
        i = 0
        while next_word is not None:
            # BUG FIX: the failure message used to sit OUTSIDE the assert
            # call (`assertEqual(...)), \ "msg"` built a discarded tuple),
            # so it was never reported; it is now the msg argument.
            self.assertEqual(next_word, words[i],
                             "Wrong order in word output")
            next_word = m.rand_followup_for(next_word)

            i += 1
            # Guard against an endless follow-up chain.
            self.assertLessEqual(i, len(words),
                                 "Something gone wrong in the loop")

        complete_sentence = m.gen_sentence()
        # BUG FIX: the old message said "must be 'chicken'" while the
        # assertion checks the opposite; the message now matches the check.
        self.assertNotEqual(complete_sentence[-1],
                            "chicken",
                            "The last word must not be 'chicken'")
예제 #2
0
class TestMarkov(unittest.TestCase):
    """Tests for the Markov class in markov.py."""

    def setUp(self):
        print('*** setup ***')
        self.markov = Markov()

    def test_add_sentense(self):
        """Feeding a corpus must populate both the start list and the dict."""
        self.__add_sentense_bocchan()
        self.assertTrue(len(self.markov.starts) > 0)
        self.assertTrue(len(self.markov.dic) > 0)

    def test_generate(self):
        """generate() must return a non-empty reply for every sample input."""
        self.__add_sentense_bocchan()
        input_texts = [
            '初めまして、坊ちゃん', 'あら、ご病気ですか', 'あらあら、大変ですね', 'いたずらして病気になっちゃったんですか?',
            'そんな威張らなくてもいいでしょう', 'はあ、そんなもんですか', '遅刻しちゃだめですね', 'よく覚えてないんですか?',
            'ターナー?', 'どなたですか?'
        ]

        for input_text in input_texts:
            # IMPROVED: name the subtest so a failure reports which input
            # broke (plain subTest() gave no identifying context).
            with self.subTest(input_text=input_text):
                tokens = Morph.analyze(input_text)
                # Seed generation with the LAST keyword token found;
                # fall back to 'N/A' when the input has no keyword.
                keyword = 'N/A'
                for token in tokens:
                    if Morph.is_keyword(token):
                        keyword = token.surface
                generated = self.markov.generate(keyword)
                print('you > ' + input_text)
                print('generated > ' + generated)
                print('************')
                self.assertTrue(len(generated) > 0)

    # private method
    def __add_sentense_bocchan(self):
        """Load the 'Bocchan' sample corpus and feed each sentence to the chain."""
        sample_file = '../KOISURU_PROGRAM/sample/markov/bocchan.txt'

        # IMPROVED: context manager closes the handle even if decoding
        # fails part-way; ''.join avoids the quadratic `content += row` loop.
        with codecs.open(sample_file, 'r', 'shift_jis') as original_content:
            content = ''.join(row.rstrip() for row in original_content)

        # Split on Japanese/ASCII sentence terminators and wide spaces.
        texts = re.split(r'[。??!!  ]+', content)

        for text in texts:
            if text == '':
                continue
            tokens = Morph.analyze(text)
            self.markov.add_sentence(tokens)
            print('.', end='')
        print('')
예제 #3
0
    def test_little_variation(self):
        """Two overlapping sentences: the chain must branch at the second word."""
        m = Markov()
        for training_sentence in ("i like curry chicken",
                                  "i don't like curry chicken"):
            m.add_sentence(training_sentence)

        opener = m.rand_first_word()
        self.assertEqual(opener, "i",
                         "'i' should be the first word")

        follower = m.rand_followup_for(opener)
        self.assertIn(follower, ["like", "don't"],
                      "Second word should be 'like' or 'don't'")

        # NOTE(review): the failure message below contradicts the
        # assertNotEqual check — confirm which of the two is intended.
        complete_sentence = m.gen_sentence()
        self.assertNotEqual(complete_sentence[len(complete_sentence) - 1],
                            "chicken",
                            "The last word must be 'chicken'")
예제 #4
0
class Dictionary:
    """Conversation-data store: random replies, pattern rules, reply
    templates and a Markov chain, each persisted to its own file.
    """
    # NOTE(review): `__metaclass__` is Python-2 syntax and has NO effect on
    # Python 3 (there it would be `class Dictionary(metaclass=SingletonType)`)
    # — confirm the target interpreter before relying on singleton behaviour.
    __metaclass__ = SingletonType

    original_directory = '../KOISURU_PROGRAM/sample/markov/dics/'
    random_file = './random.txt'
    pattern_file = './pattern.txt'
    template_file = './template.txt'
    markov_files = ['./markov_dic.pkl', './markov_starts.pkl']

    def __init__(self):
        """Load every dictionary section from disk."""
        self.__load_random()
        self.__load_pattern()
        self.__load_template()
        self.__load_markov()

    def __load_file(self, original_file, use_file):
        """Seed use_file from the bundled original when it does not exist yet."""
        if not os.path.isfile(use_file):
            shutil.copyfile(original_file, use_file)

    def __load_random(self):
        """Load the random-reply list."""
        self.__load_file(self.original_directory + 'random.txt',
                         self.random_file)
        self.random = messages(self.random_file)

    def __load_pattern(self):
        """Load pattern rules: one PatternItem per tab-separated row."""
        self.__load_file(self.original_directory + 'pattern.txt',
                         self.pattern_file)
        self.pattern = []
        for row in messages(self.pattern_file):
            divided = row.split('\t')
            self.pattern.append(PatternItem(divided[0], divided[1]))

    def __load_template(self):
        """Load reply templates keyed by their %noun% count."""
        self.__load_file(self.original_directory + 'template.txt',
                         self.template_file)
        self.template = {}
        for row in messages(self.template_file):
            divided = row.split('\t')
            # BUG FIX: str.split never yields None, so the old
            # `divided[0] == None` guard was dead code and int('') raised
            # on blank/malformed rows; skip those rows explicitly instead.
            if len(divided) < 2 or not divided[0]:
                continue
            count = int(divided[0])
            if count not in self.template:
                self.template[count] = []
            self.template[count].append(divided[1])

    def __load_markov(self):
        """Load the persisted Markov chain."""
        self.markov = Markov()
        self.markov.load(self.markov_files[0], self.markov_files[1])

    def study(self, input_text, tokens):
        """Learn from one user utterance across every dictionary section."""
        self.study_random(input_text)
        self.study_pattern(input_text, tokens)
        self.study_template(tokens)
        self.study_markov(tokens)

    def study_random(self, input_text):
        """Remember the raw utterance as a future random reply (deduplicated)."""
        if input_text not in self.random:
            self.random.append(input_text)

    def find_pattern(self, word, input_text):
        """Return the matching PatternItem whose phrases do not already
        contain input_text, or None when no such item exists.
        """
        # NOTE(review): re.match(pattern, string) — passing `word` as the
        # pattern and `item.pattern` as the string looks swapped relative
        # to the usual pattern-dictionary lookup; confirm before changing.
        for item in self.pattern:
            if re.match(word, item.pattern) is None:
                continue
            for phrase in item.phrases:
                if phrase['phrase'] == input_text:
                    break
                # Reached the last phrase without finding input_text.
                if phrase == item.phrases[-1]:
                    return item
        return None

    def study_pattern(self, input_text, tokens):
        """Attach input_text as a reply phrase to every keyword it contains."""
        for token in tokens:
            if not Morph.is_keyword(token):
                continue
            word = token.surface
            duped = self.find_pattern(word, input_text)
            if duped is not None:
                duped.phrases.append({'need': 0, 'phrase': input_text})
            else:
                self.pattern.append(PatternItem(word, '0##' + input_text))

    def study_template(self, tokens):
        """Store the utterance with every keyword replaced by %noun%,
        keyed by the replacement count; skip noun-free or known templates.
        """
        template = ''
        count = 0
        for token in tokens:
            word = token.surface
            if Morph.is_keyword(token):
                word = '%noun%'
                count += 1
            template += word
        if count == 0:
            return
        if count not in self.template:
            self.template[count] = []
        if template not in self.template[count]:
            self.template[count].append(template)

    def study_markov(self, tokens):
        """Feed the token sequence to the Markov chain."""
        self.markov.add_sentence(tokens)

    def save(self):
        """Persist every dictionary section back to its file."""
        # random: one utterance per line.
        # IMPROVED: `with` closes each handle even if a write fails.
        with codecs.open(self.random_file, 'w', 'shift_jis') as content:
            for x in self.random:
                content.write(x + "\n")

        # pattern: modify##pattern<TAB>need##phrase|need##phrase...
        with codecs.open(self.pattern_file, 'w', 'shift_jis') as content:
            for x in self.pattern:
                phrase = '|'.join(
                    str(y['need']) + '##' + y['phrase'] for y in x.phrases)
                content.write(
                    str(x.modify) + '##' + x.pattern + '\t' + phrase + '\n')

        # template: count<TAB>template per line.
        with codecs.open(self.template_file, 'w', 'shift_jis') as content:
            for count in self.template:
                for template in self.template[count]:
                    content.write(str(count) + '\t' + template + "\n")

        # markov: delegated to the chain's own persistence.
        self.markov.save(self.markov_files[0], self.markov_files[1])
예제 #5
0
class Dictionary:
    """Conversation-data store.

    Depending on `target`, holds pattern rules ('pattern'/'what'), reply
    templates ('template') or a Markov chain ('markov'), loaded from and
    saved to files under brain/dicts/.
    """
    def __init__(self, target, nlp):
        """Load the dictionary for `target`, using `nlp` for keyword checks."""
        self.target = target
        # 'what' shares the pattern dictionary file.
        if target == 'what': target = 'pattern'
        self.f_path = 'brain/dicts/%s.ini'%target
        if target == 'template':
            self.touch_dicts(target)
            # IMPROVED: `with` closes the handle (the old bare open() leaked it).
            with open(self.f_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f]
            # {noun-count: 'template|template|...'}
            self._dict = {int(l.split('\t')[0]): '' for l in lines}
            for l in lines:
                count, template = l.split('\t')
                if len(self._dict[int(count)]) == 0:
                    self._dict[int(count)] = template
                else:
                    self._dict[int(count)] += '|'+template
        elif target == 'markov':
            # Chit-chat replies are delegated entirely to the Markov chain.
            self._mkv = Markov()
            self._mkv.load('brain/dicts/markov.dat')
        else:
            self.touch_dicts(target)
            with open(self.f_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f]
            # {keyword: 'phrase|phrase|...'}
            self._dict = {l.split('\t')[0]: l.split('\t')[1] for l in lines}
        self._nlp = nlp

    def touch_dicts(self, target):
        """Create an empty dictionary file when missing; otherwise do nothing."""
        if target == 'markov': return
        f_path = 'brain/dicts/%s.ini'%target
        if not os.path.exists(f_path):
            open(f_path, 'w').close()

    @staticmethod
    def pattern_to_line(key, phrase):
        """Serialize one pattern entry to a 'key<TAB>p1|p2|...' line."""
        return '{}\t{}'.format(key, '|'.join(phrase))

    @property
    def obj(self):
        # Only meaningful when target == 'markov'.
        return self._mkv

    @property
    def data(self):
        return self._dict

    def study(self, text, parts):
        """Dispatch the utterance to the learner matching self.target."""
        # BUG FIX: the old comparison against 'patten' (typo) meant a
        # 'pattern' dictionary never learned anything.
        if self.target == 'pattern' or self.target == 'what':
            self.study_pattern(text, parts)
        elif self.target == 'template':
            self.study_template(text, parts)
        elif self.target == 'markov':
            self.study_markov(text, parts)

    def study_markov(self, text, parts):
        """Feed the morpheme list `parts` to the Markov chain."""
        self._mkv.add_sentence(parts)

    def study_template(self, text, parts):
        """Learn a reply template: every keyword in `parts` becomes '%noun%'.

        Does nothing when the utterance contains no keyword, or when the
        resulting template is already known for its keyword count.
        """
        template = ''
        count = 0
        for word, part in parts:
            if self._nlp.is_keyword(part):
                word = '%noun%'
                count += 1
            template += word
        # BUG FIX: the docstring always promised to skip noun-free input,
        # but the guard was missing, so raw utterances were stored under
        # key 0 where they could never be filled in at reply time.
        if count == 0:
            return
        if count not in self._dict:
            self._dict[count] = template
        elif template not in self._dict[count]:
            self._dict[count] += '|'+template

    def study_pattern(self, text, parts):
        """Register the utterance `text` as a reply for each keyword in `parts`."""
        for word, part in parts:
            if not self._nlp.is_keyword(part):
                continue
            # IMPROVED: a plain membership test replaces the needless
            # next()-over-keys scan (same lookup, O(1) instead of O(n)).
            if word in self._dict:
                if text not in self._dict[word]:
                    self._dict[word] += '|'+text
            else:
                self._dict[word] = text

    def save(self):
        """Persist the in-memory dictionary back to its file."""
        if self.target == 'template':
            self.save_template()
        elif self.target == 'markov':
            self.save_markov()
        elif self.target == 'pattern' or self.target == 'what':
            with open(self.f_path, mode='w', encoding='utf-8') as f:
                for key in self._dict:
                    f.write('%s\t%s\n'%(key, self._dict[key]))

    def save_markov(self):
        """Delegate persistence to the Markov chain itself."""
        self._mkv.save('markov.dat')

    def save_template(self):
        """Write one 'count<TAB>template' line per stored template."""
        with open(self.f_path, mode='w', encoding='utf-8') as f:
            for count, templates in self._dict.items():
                for template in templates.split('|'):
                    f.write('{}\t{}\n'.format(count, template))