def test_single_sentence(self):
    """A Markov chain trained on a single sentence must reproduce it verbatim.

    Walks the chain word by word from the first word and checks each step
    against the original sentence, then checks the fully generated sentence
    ends on the training sentence's last word.
    """
    sentence = "i like curry chicken"
    sentence_splitted = sentence.split(" ")
    m = Markov()
    m.add_sentence(sentence)

    first_word = m.rand_first_word()
    self.assertEqual(first_word, "i", "'i' should be the first word")

    next_word = first_word
    i = 0
    while next_word is not None:
        # BUG FIX: the failure message used to sit OUTSIDE the assertEqual()
        # call (a discarded tuple), so it was never reported.
        self.assertEqual(next_word, sentence_splitted[i],
                         "Wrong order in word output")
        next_word = m.rand_followup_for(next_word)
        i += 1
        # Guard against an infinite walk if the chain fails to terminate.
        self.assertLessEqual(i, len(sentence_splitted),
                             "Something gone wrong in the loop")

    complete_sentence = m.gen_sentence()
    # BUG FIX: the original used assertNotEqual, contradicting its own
    # message — a chain built from one sentence must terminate on "chicken".
    # (Assumes gen_sentence() returns the word sequence — TODO confirm.)
    self.assertEqual(complete_sentence[-1], "chicken",
                     "The last word must be 'chicken'")
class TestMarkov(unittest.TestCase):
    """test class of markov.py

    Exercises Markov learning (add_sentence) and generation (generate)
    against the bocchan.txt sample corpus.
    """

    def setUp(self):
        print('*** setup ***')
        self.markov = Markov()

    def test_add_sentense(self):
        """Feeding the corpus must populate both the starts list and the dic."""
        self.__add_sentense_bocchan()
        self.assertGreater(len(self.markov.starts), 0)
        self.assertGreater(len(self.markov.dic), 0)

    def test_generate(self):
        """Generation from a keyword extracted per input must yield non-empty text."""
        self.__add_sentense_bocchan()
        input_texts = [
            '初めまして、坊ちゃん',
            'あら、ご病気ですか',
            'あらあら、大変ですね',
            'いたずらして病気になっちゃったんですか?',
            'そんな威張らなくてもいいでしょう',
            'はあ、そんなもんですか',
            '遅刻しちゃだめですね',
            'よく覚えてないんですか?',
            'ターナー?',
            'どなたですか?'
        ]
        for input_text in input_texts:
            # Name the subtest so a failure reports which input broke it.
            with self.subTest(input_text=input_text):
                tokens = Morph.analyze(input_text)
                # Keep the LAST keyword found (original behavior), defaulting
                # to 'N/A' when the input contains no keyword at all.
                keyword = 'N/A'
                for token in tokens:
                    if Morph.is_keyword(token):
                        keyword = token.surface
                generated = self.markov.generate(keyword)
                print('you > ' + input_text)
                print('generated > ' + generated)
                print('************')
                self.assertGreater(len(generated), 0)

    # private method
    def __add_sentense_bocchan(self):
        """Read the shift_jis corpus, split it into sentences, learn each one."""
        sample_file = '../KOISURU_PROGRAM/sample/markov/bocchan.txt'
        # FIX: close the reader deterministically via a context manager and
        # build the string in one pass instead of quadratic += concatenation.
        with codecs.open(sample_file, 'r', 'shift_jis') as original_content:
            content = ''.join(row.rstrip() for row in original_content)
        # Split on Japanese/ASCII sentence terminators.
        texts = re.split(r'[。??!! ]+', content)
        for text in texts:
            if text == '':
                continue
            tokens = Morph.analyze(text)
            self.markov.add_sentence(tokens)
            print('.', end='')
        print('')
def test_little_variation(self):
    """Two overlapping sentences: the chain must branch after the shared start.

    Both sentences begin with "i" and end with "chicken"; the second word is
    where they diverge, so the follow-up for "i" must be one of the two
    alternatives.
    """
    sentence1 = "i like curry chicken"
    sentence2 = "i don't like curry chicken"
    m = Markov()
    m.add_sentence(sentence1)
    m.add_sentence(sentence2)

    first_word = m.rand_first_word()
    self.assertEqual(first_word, "i", "'i' should be the first word")

    second_word = m.rand_followup_for(first_word)
    self.assertIn(second_word, ["like", "don't"],
                  "Second word should be 'like' or 'don't'")

    complete_sentence = m.gen_sentence()
    # BUG FIX: the original used assertNotEqual, contradicting its own
    # message — both training sentences end in "chicken", so any generated
    # sentence must as well.
    # (Assumes gen_sentence() returns the word sequence — TODO confirm.)
    self.assertEqual(complete_sentence[-1], "chicken",
                     "The last word must be 'chicken'")
class Dictionary:
    """Singleton holding the bot's four dictionaries (random, pattern,
    template, markov).

    On construction, each dictionary file is copied from the sample
    directory if absent, then loaded into memory; save() writes all four
    back out (text files in shift_jis, markov as two pickle files).
    """
    __metaclass__ = SingletonType  # NOTE(review): Python-2 style; ignored on Python 3 — confirm intended interpreter
    # Source of pristine sample dictionaries to copy on first run.
    original_directory = '../KOISURU_PROGRAM/sample/markov/dics/'
    # Working copies, created next to the current directory.
    random_file = './random.txt'
    pattern_file = './pattern.txt'
    template_file = './template.txt'
    markov_files = ['./markov_dic.pkl', './markov_starts.pkl']

    def __init__(self):
        # Load all four dictionaries into memory up front.
        self.__load_random()
        self.__load_pattern()
        self.__load_template()
        self.__load_markov()

    def __load_file(self, original_file, use_file):
        # Seed the working copy from the sample directory on first run only.
        if not os.path.isfile(use_file):
            shutil.copyfile(original_file, use_file)

    def __load_random(self):
        """Load the random-response dictionary: one phrase per line."""
        self.__load_file(self.original_directory + 'random.txt', self.random_file)
        self.random = messages(self.random_file)

    def __load_pattern(self):
        """Load the pattern dictionary: each line is 'pattern<TAB>phrases'."""
        self.__load_file(self.original_directory + 'pattern.txt', self.pattern_file)
        self.pattern = []
        rows = messages(self.pattern_file)
        for row in rows:
            divided = row.split('\t')
            item = PatternItem(divided[0], divided[1])
            self.pattern.append(item)

    def __load_template(self):
        """Load templates keyed by noun count: each line is 'count<TAB>template'."""
        self.__load_file(self.original_directory + 'template.txt', self.template_file)
        self.template = {}
        rows = messages(self.template_file)
        for row in rows:
            divided = row.split('\t')
            # NOTE(review): str.split never yields None, so this guard is
            # dead — presumably '' (blank line) was intended; confirm.
            if divided[0] == None:
                continue
            count = int(divided[0])
            if not count in self.template.keys():
                self.template[count] = []
            self.template[count].append(divided[1])

    def __load_markov(self):
        """Load the markov chain from its two pickle files (dic + starts)."""
        self.markov = Markov()
        self.markov.load(self.markov_files[0], self.markov_files[1])

    def study(self, input_text, tokens):
        """Learn the user's utterance into all four dictionaries at once."""
        self.study_random(input_text)
        self.study_pattern(input_text, tokens)
        self.study_template(tokens)
        self.study_markov(tokens)

    def study_random(self, input_text):
        """Remember the raw utterance unless it is already known."""
        if not input_text in self.random:
            self.random.append(input_text)

    def find_pattern(self, word, input_text):
        """Return the PatternItem matching `word` whose phrases do NOT already
        contain `input_text`; None when no such item exists.

        NOTE(review): the inner loop breaks on a match, then the item is
        returned when `phrase` equals the last element — so a match on the
        LAST phrase is indistinguishable from no match at all. Looks like an
        off-by-one; confirm against study_pattern's duplicate handling.
        """
        for item in self.pattern:
            # NOTE(review): `word` is used as the regex and item.pattern as
            # the subject — verify the argument order is intended.
            if re.match(word, item.pattern) == None:
                continue
            for phrase in item.phrases:
                if phrase['phrase'] == input_text:
                    break
            if phrase == item.phrases[-1]:
                return item
        return None

    def study_pattern(self, input_text, tokens):
        """For each keyword token, attach `input_text` to an existing pattern
        or create a new PatternItem ('0##' marks need-level 0)."""
        for token in tokens:
            if not Morph.is_keyword(token):
                continue
            word = token.surface
            duped = self.find_pattern(word, input_text)
            if duped != None:
                duped.phrases.append({'need': 0, 'phrase': input_text})
            else:
                self.pattern.append(PatternItem(word, '0##' + input_text))

    def study_template(self, tokens):
        """Abstract the utterance into a template: every keyword becomes
        '%noun%'; stored under its noun count, skipping duplicates and
        keyword-free sentences."""
        template = ''
        count = 0
        for token in tokens:
            word = token.surface
            if Morph.is_keyword(token):
                word = '%noun%'
                count += 1
            template += word
        if count == 0:
            return
        if not count in self.template.keys():
            self.template[count] = []
        if not template in self.template[count]:
            self.template[count].append(template)

    def study_markov(self, tokens):
        """Feed the token sequence to the markov chain."""
        self.markov.add_sentence(tokens)

    def save(self):
        """Persist all four dictionaries: shift_jis text for random/pattern/
        template, pickles for markov."""
        # random: one phrase per line
        content = codecs.open(self.random_file, 'w', 'shift_jis')
        for x in self.random:
            content.write(x + "\n")
        content.close()
        # pattern: 'modify##pattern<TAB>need##phrase|need##phrase|...'
        content = codecs.open(self.pattern_file, 'w', 'shift_jis')
        for x in self.pattern:
            phrase = ''
            for y in x.phrases:
                phrase = phrase + str(y['need']) + '##' + y['phrase'] + '|'
            # drop the trailing '|' separator
            phrase = phrase[:-1]
            content.write(
                str(x.modify) + '##' + x.pattern + '\t' + phrase + '\n')
        content.close()
        # template: 'count<TAB>template', one per line
        content = codecs.open(self.template_file, 'w', 'shift_jis')
        for count in self.template.keys():
            for template in self.template[count]:
                content.write(str(count) + '\t' + template + "\n")
        content.close()
        # markov: delegates to Markov's own pickle serialization
        self.markov.save(self.markov_files[0], self.markov_files[1])
class Dictionary:
    """Conversation-data registration class.

    One instance manages a single dictionary selected by ``target``:
    'pattern'/'what' (keyword -> phrases), 'template' (noun-count ->
    templates), or 'markov' (chat-generation chain). The instance loads its
    backing file on construction, learns via study(), and persists via save().
    """

    def __init__(self, target, nlp):
        """Constructor.

        target -- one of 'pattern', 'what', 'template', 'markov'
        nlp    -- object providing is_keyword(part) for keyword detection
        """
        self.target = target
        # 'what' questions share the pattern dictionary file.
        if target == 'what':
            target = 'pattern'
        self.f_path = 'brain/dicts/%s.ini' % target
        if target == 'template':
            self.touch_dicts(target)
            # FIX: open via a context manager so the handle is closed
            # deterministically (the original leaked it in a comprehension).
            with open(self.f_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f]
            # {noun-count: 'template|template|...'}
            self._dict = {int(l.split('\t')[0]): '' for l in lines}
            for l in lines:
                count, template = l.split('\t')
                if len(self._dict[int(count)]) == 0:
                    self._dict[int(count)] = template
                else:
                    self._dict[int(count)] += '|' + template
        elif target == 'markov':
            # For small talk: the markov chain keeps its own binary store.
            self._mkv = Markov()
            self._mkv.load('brain/dicts/markov.dat')
        else:
            self.touch_dicts(target)
            with open(self.f_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f]
            # {keyword: 'phrase|phrase|...'}
            self._dict = {l.split('\t')[0]: l.split('\t')[1] for l in lines}
        self._nlp = nlp

    def touch_dicts(self, target):
        """Create an empty dictionary file if missing; do nothing otherwise."""
        if target == 'markov':
            return
        f_path = 'brain/dicts/%s.ini' % target
        if not os.path.exists(f_path):
            open(f_path, 'w').close()

    @staticmethod
    def pattern_to_line(key, phrase):
        """Serialize one pattern entry to its 'key<TAB>p1|p2|...' line form."""
        return '{}\t{}'.format(key, '|'.join(phrase))

    @property
    def obj(self):
        # Only meaningful for target == 'markov'.
        return self._mkv

    @property
    def data(self):
        # Only meaningful for the non-markov targets.
        return self._dict

    def study(self, text, parts):
        """Dispatch the utterance to the learner matching this target.

        BUG FIX: the original compared against the misspelled 'patten', so a
        'pattern'-target dictionary silently never learned anything.
        """
        if self.target == 'pattern' or self.target == 'what':
            self.study_pattern(text, parts)
        elif self.target == 'template':
            self.study_template(text, parts)
        elif self.target == 'markov':
            self.study_markov(text, parts)

    def study_markov(self, text, parts):
        """Feed the morpheme list `parts` to the markov dictionary."""
        self._mkv.add_sentence(parts)

    def study_template(self, text, parts):
        """Abstract the utterance into a template with every keyword replaced
        by '%noun%', stored under its noun count.

        Does nothing when no noun is present; duplicate templates are skipped.
        """
        template = ''
        count = 0
        for word, part in parts:
            if self._nlp.is_keyword(part):
                word = '%noun%'
                count += 1
            template += word
        # NOTE: unlike the book's original, count == 0 still stores the
        # literal sentence — preserved as-is from this version's behavior.
        if count not in self._dict.keys():
            self._dict[count] = template
        elif template not in self._dict[count]:
            self._dict[count] += '|' + template

    def study_pattern(self, text, parts):
        """Save the user's utterance `text` into the pattern dictionary under
        each keyword found in the morphemes `parts`."""
        for word, part in parts:
            if not self._nlp.is_keyword(part):
                continue
            # FIX: direct membership test replaces the next()-scan; it also
            # handles a falsy key correctly (the old `if duplicated:` did not).
            if word in self._dict:
                if text not in self._dict[word]:
                    self._dict[word] += '|' + text
            else:
                self._dict[word] = text

    def save(self):
        """Write the in-memory dictionary back to its file."""
        if self.target == 'template':
            self.save_template()
        elif self.target == 'markov':
            self.save_markov()
        elif self.target == 'pattern' or self.target == 'what':
            with open(self.f_path, mode='w', encoding='utf-8') as f:
                for key in self._dict:
                    f.write('%s\t%s\n' % (key, self._dict[key]))

    def save_markov(self):
        """Persist the markov chain via its own serializer."""
        self._mkv.save('markov.dat')

    def save_template(self):
        """Write templates one per line as 'count<TAB>template'."""
        with open(self.f_path, mode='w', encoding='utf-8') as f:
            for count, templates in self._dict.items():
                templates = templates.split('|')
                for template in templates:
                    f.write('{}\t{}\n'.format(count, template))