def test_load_corpus(self):
    """
    Load everything listed under 'chatterbot.corpus' and check that
    at least one entry comes back.
    """
    every_file = corpus.list_corpus_files('chatterbot.corpus')
    loaded_entries = list(corpus.load_corpus(*every_file))

    # A non-zero length is truthy, so this fails only for an empty result.
    self.assertTrue(len(loaded_entries))
def test_load_new_corpus_file(self):
    """
    Test that a file path can be specified for a corpus.

    A small YAML corpus with two two-line conversations is written to
    the working directory, loaded through its file path, and removed
    again. The loaded data must contain exactly one entry whose first
    conversation has two statements.
    """
    file_path = './test_corpus.yml'

    # Continuation items are indented two spaces so that they line up
    # with the first item of each nested list.
    yml_data = u'\n'.join([
        'conversations:',
        '- - Hello',
        '  - Hi',
        '- - Hi',
        '  - Hello',
    ])

    try:
        # Create a file for testing
        with io.open(file_path, 'w') as test_corpus:
            test_corpus.write(yml_data)

        data_files = corpus.list_corpus_files(file_path)
        corpus_data = list(corpus.load_corpus(*data_files))
    finally:
        # Remove the test file even when writing or loading fails, so a
        # leftover file cannot affect tests that expect this path to be
        # absent.
        if os.path.exists(file_path):
            os.remove(file_path)

    self.assertEqual(len(corpus_data), 1)

    # Load the content from the corpus
    conversations, _categories, _file_path = corpus_data[0]

    self.assertEqual(len(conversations[0]), 2)
def test_load_corpus_greetings(self):
    """
    For each language in LANGUAGES, loading its greetings.yml by file
    path produces exactly one corpus entry.
    """
    for lang in LANGUAGES:
        greetings_path = os.path.join(
            corpus.DATA_DIRECTORY, lang, 'greetings.yml'
        )

        matched_files = corpus.list_corpus_files(greetings_path)
        loaded_entries = list(corpus.load_corpus(*matched_files))

        self.assertEqual(len(loaded_entries), 1)
def test_load_corpus_file_non_existent(self):
    """
    Test that loading a corpus file that does not exist raises IOError.
    """
    missing_path = './test_corpus.yml'

    # Precondition: nothing may exist at this path, otherwise the load
    # below would not exercise the missing-file case.
    self.assertFalse(os.path.exists(missing_path))

    with self.assertRaises(IOError):
        # The result is consumed with list() so the load actually runs
        # in case load_corpus produces its entries lazily.
        list(corpus.load_corpus(missing_path))
def test_load_corpus_language(self):
    """
    Test that each language's corpus can be located in three ways.

    For every language in LANGUAGES the corpus is listed by dotted
    path, by directory path with a trailing slash, and by plain
    directory path. Each form must load more than one corpus entry.
    """
    for language in LANGUAGES:
        language_directory = os.path.join(corpus.DATA_DIRECTORY, language)

        paths = [
            f'chatterbot.corpus.{language}',
            # Trailing-slash form of the directory for *this* language,
            # so the variant is exercised for every language.
            language_directory + '/',
            language_directory,
        ]

        for file_path in paths:
            data_files = corpus.list_corpus_files(file_path)
            corpus_data = corpus.load_corpus(*data_files)

            # The path is passed as the message so that a failure
            # identifies which language and form did not load.
            self.assertGreater(len(list(corpus_data)), 1, msg=file_path)
def test_load_corpus_categories(self):
    """
    Load the English greetings corpus by dotted path and check its
    conversations, categories and source file path.
    """
    # english - greetings
    greeting_files = corpus.list_corpus_files(
        'chatterbot.corpus.english.greetings'
    )
    loaded_entries = list(corpus.load_corpus(*greeting_files))

    self.assertEqual(len(loaded_entries), 1)

    # Every loaded entry must carry the 'greetings' category.
    for _entry_conversations, entry_categories, _entry_path in loaded_entries:
        self.assertIn('greetings', entry_categories)

    conversations, categories, file_path = loaded_entries[0]

    self.assertIn(['Hi', 'Hello'], conversations)
    self.assertEqual(['greetings'], categories)
    self.assertIn('chatterbot_corpus/data/english/greetings.yml', file_path)
def train(self, *corpus_paths):
    """
    Train the chat bot from one or more corpus paths.

    Each path is expanded with list_corpus_files, and every resulting
    file is loaded with load_corpus. For each conversation in a file, a
    Statement is built per line of text, linked to the previous line of
    the same conversation, tagged with the file's categories and passed
    through get_preprocessed_statement. The statements of one file are
    stored together with a single create_many call.

    :param corpus_paths: Paths accepted by list_corpus_files.
    """
    from app.chatterbot_api.chatterbot.corpus import load_corpus, list_corpus_files

    # Get the paths to each file the bot will be trained with
    training_files = [
        data_file
        for corpus_path in corpus_paths
        for data_file in list_corpus_files(corpus_path)
    ]

    tagger = self.chatbot.storage.tagger

    for corpus, categories, file_path in load_corpus(*training_files):
        pending_statements = []

        # Train the chat bot with each statement and response pair
        for position, conversation in enumerate(corpus, start=1):
            if self.show_training_progress:
                utils.print_progress_bar(
                    'Training ' + str(os.path.basename(file_path)),
                    position,
                    len(corpus)
                )

            # The first line of a conversation is not a response to
            # anything, so the "previous" values start out empty.
            prior_text = None
            prior_search_text = ''

            for text in conversation:
                search_text = tagger.get_text_index_string(text)

                statement = Statement(
                    text=text,
                    search_text=search_text,
                    in_response_to=prior_text,
                    search_in_response_to=prior_search_text,
                    conversation='training'
                )
                statement.add_tags(*categories)

                statement = self.get_preprocessed_statement(statement)

                # The next line responds to the preprocessed text, while
                # the search text is the one computed before preprocessing.
                prior_text = statement.text
                prior_search_text = search_text

                pending_statements.append(statement)

        # Nothing to store for a file without any statements.
        if not pending_statements:
            continue

        self.chatbot.storage.create_many(pending_statements)