def test_load_corpus_file(self):
    """
    Test that a file path can be specified for a corpus.
    """
    # Create a file for testing
    file_path = './test_corpus.yml'
    with io.open(file_path, 'w') as test_corpus:
        yml_data = u'\n'.join([
            'conversations:',
            '- - Hello',
            '  - Hi',
            '- - Hi',
            '  - Hello'
        ])
        test_corpus.write(yml_data)

    data_files = corpus.list_corpus_files(file_path)
    corpus_data = list(corpus.load_corpus(*data_files))

    # Remove the test file
    if os.path.exists(file_path):
        os.remove(file_path)

    self.assertEqual(len(corpus_data), 1)

    # Load the content from the corpus
    conversations, _categories, _file_path = corpus_data[0]

    self.assertEqual(len(conversations[0]), 2)
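A minimal sketch of consuming the (conversations, categories, file_path) tuples that load_corpus() yields, reusing the hypothetical file path from the test above:

from chatterbot import corpus

# './test_corpus.yml' is the hypothetical file created by the test above.
files = corpus.list_corpus_files('./test_corpus.yml')

for conversations, categories, file_path in corpus.load_corpus(*files):
    for conversation in conversations:
        print(categories, conversation)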
def test_load_corpus(self):
    """
    Test loading the entire corpus of languages.
    """
    corpus_files = corpus.list_corpus_files('chatterbot.corpus')
    corpus_data = corpus.load_corpus(*corpus_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_file_non_existent(self):
    """
    Test that loading a corpus from a non-existent file path raises an error.
    """
    file_path = './test_corpus.yml'
    self.assertFalse(os.path.exists(file_path))

    with self.assertRaises(IOError):
        list(corpus.load_corpus(file_path))
def test_load_corpus_english_categories(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.english.greetings')
    corpus_data = list(corpus.load_corpus(*data_files))

    self.assertEqual(len(corpus_data), 1)

    # Test that each conversation gets labeled with the correct category
    for _conversation, categories, _file_path in corpus_data:
        self.assertIn('greetings', categories)
def test_load_english_corpus_categories(self):
    files = corpus.list_corpus_files(
        'chatterbot_corpus/data/english/greetings.yml'
    )
    corpus_data = list(corpus.load_corpus(*files))

    self.assertEqual(len(corpus_data), 1)

    # Test that each conversation gets labeled with the correct category
    for _conversations, categories, _file_path in corpus_data:
        self.assertIn('greetings', categories)
def test_conversation_format(self):
    files = corpus.list_corpus_files('chatterbot_corpus')

    for dialog_corpus, _categories, _file_path in corpus.load_corpus(*files):
        for conversation in dialog_corpus:
            for text in conversation:
                if not isinstance(text, str):
                    self.fail('"{}" must be a string, not {}.'.format(
                        str(text), type(text)
                    ))
def test_load_corpus_english_greetings(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.english.greetings')
    corpus_data = list(corpus.load_corpus(*data_files))

    self.assertEqual(len(corpus_data), 1)

    conversations, categories, file_path = corpus_data[0]

    self.assertIn(['Hi', 'Hello'], conversations)
    self.assertEqual(['greetings'], categories)
    self.assertIn('chatterbot_corpus/data/english/greetings.yml', file_path)
def test_character_count(self):
    """
    Test that no line in the corpus exceeds the
    maximum number of characters.
    """
    files = corpus.list_corpus_files('chatterbot_corpus')

    for dialog_corpus, _categories, _file_path in corpus.load_corpus(*files):
        for conversation in dialog_corpus:
            for text in conversation:
                if len(text) > STATEMENT_TEXT_MAX_LENGTH:
                    self.fail('"{}" cannot be longer than {} characters'.format(
                        text, STATEMENT_TEXT_MAX_LENGTH
                    ))
def train(self, *corpus_paths):
    from chatterbot.corpus import load_corpus, list_corpus_files

    data_file_paths = []

    # Get the paths to each file the bot will be trained with
    for corpus_path in corpus_paths:
        data_file_paths.extend(list_corpus_files(corpus_path))

    for corpus, categories, file_path in load_corpus(*data_file_paths):
        statements_to_create = []

        # Train the chat bot with each statement and response pair
        for conversation_count, conversation in enumerate(corpus):
            if self.show_training_progress:
                utils.print_progress_bar(
                    'Training ' + str(os.path.basename(file_path)),
                    conversation_count + 1,
                    len(corpus)
                )

            previous_statement_text = None
            previous_statement_search_text = ''

            for text in conversation:
                statement_search_text = self.stemmer.stem(text)

                _statement = Statement(
                    text=text,
                    search_text=statement_search_text,
                    in_response_to=previous_statement_text,
                    search_in_response_to=previous_statement_search_text,
                    conversation='training'
                )
                _statement.add_tags(*categories)

                statement = self.get_preprocessed_statement(_statement)

                previous_statement_text = statement.text
                previous_statement_search_text = statement_search_text

                statements_to_create.append({
                    'text': statement.text,
                    'in_response_to': statement.in_response_to,
                    'conversation': statement.conversation,
                    'tags': statement.tags
                })

        self.chatbot.storage.create_many(statements_to_create)
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.
    """
    from pickle import dump, load

    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

    from chatterbot import languages
    from chatterbot.corpus import list_corpus_files, load_corpus

    pickle_path = 'sentence_tokenizer.pickle'

    try:
        input_file = open(pickle_path, 'rb')
        sentence_tokenizer = load(input_file)
        input_file.close()
    except FileNotFoundError:
        data_file_paths = []
        sentences = []

        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('chatterbot.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('chatterbot.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))

        data_file_paths.extend(corpus_files)

        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))

        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use
        output_file = open(pickle_path, 'wb')
        dump(sentence_tokenizer, output_file, -1)
        output_file.close()

    return sentence_tokenizer
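A minimal usage sketch for the function above, assuming the chatterbot languages module referenced in its body; the sample sentence is illustrative:

from chatterbot import languages

# Build (or load the pickled) tokenizer for English and split a sample string.
tokenizer = get_sentence_tokenizer(languages.ENG)
print(tokenizer.tokenize('Hello there. How are you today?'))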
def train(self, *corpus_paths):
    from chatterbot.corpus import load_corpus, list_corpus_files

    data_file_paths = []

    # Get the paths to each file the bot will be trained with
    for corpus_path in corpus_paths:
        data_file_paths.extend(list_corpus_files(corpus_path))

    for corpus, categories, file_path in load_corpus(*data_file_paths):
        statements_to_create = []

        # Train the chat bot with each statement and response pair
        for conversation_count, conversation in enumerate(corpus):
            if self.show_training_progress:
                utils.print_progress_bar(
                    'Training ' + str(os.path.basename(file_path)),
                    conversation_count + 1,
                    len(corpus)
                )

            previous_statement_text = None
            previous_statement_search_text = ''

            for text in conversation:
                statement_search_text = self.stemmer.get_bigram_pair_string(text)

                statement = Statement(
                    text=text,
                    search_text=statement_search_text,
                    in_response_to=previous_statement_text,
                    search_in_response_to=previous_statement_search_text,
                    conversation='training'
                )
                statement.add_tags(*categories)

                statement = self.get_preprocessed_statement(statement)

                previous_statement_text = statement.text
                previous_statement_search_text = statement_search_text

                statements_to_create.append(statement)

        self.chatbot.storage.create_many(statements_to_create)
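A minimal invocation sketch, assuming the train() method above belongs to a ChatterBotCorpusTrainer-style class from the chatterbot project:

from chatterbot import ChatBot
from chatterbot.trainers import ChatterBotCorpusTrainer

chatbot = ChatBot('Example Bot')
trainer = ChatterBotCorpusTrainer(chatbot)

# Dotted corpus paths are resolved to YAML files by list_corpus_files().
trainer.train('chatterbot.corpus.english.greetings')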
def test_load_corpus_spanish(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.spanish')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_telugu(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.telugu')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_italian(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/italian')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertTrue(len(corpus_data))
def test_load_corpus_chinese(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.chinese')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_portuguese(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/portuguese')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertTrue(len(corpus_data))
def test_load_corpus_traditional_chinese(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/tchinese')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertTrue(len(corpus_data))
def test_load_english_corpus(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/english/greetings.yml')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertEqual(len(corpus_data), 1)
    self.assertIn(['Hi', 'Hello'], corpus_data[0][0])
def test_load_corpus_english_trailing_slash(self):
    file_path = os.path.join(corpus.DATA_DIRECTORY, 'english') + '/'
    data_files = corpus.list_corpus_files(file_path)
    corpus_data = list(corpus.load_corpus(*data_files))

    self.assertGreater(len(corpus_data), 1)
def test_load_corpus_english_greetings(self):
    file_path = os.path.join(corpus.DATA_DIRECTORY, 'english', 'greetings.yml')
    data_files = corpus.list_corpus_files(file_path)
    corpus_data = corpus.load_corpus(*data_files)

    self.assertEqual(len(list(corpus_data)), 1)
def test_load_corpus_russian(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.russian')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_indonesia(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.indonesia')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def train(self, *corpus_paths):
    from chatterbot.corpus import load_corpus, list_corpus_files

    data_file_paths = []

    # Get the paths to each file the bot will be trained with
    for corpus_path in corpus_paths:
        data_file_paths.extend(list_corpus_files(corpus_path))

    for corpus, categories, file_path in load_corpus(*data_file_paths):
        statements_to_create = []

        # Train the chat bot with each statement and response pair
        for conversation_count, conversations in enumerate(corpus):
            if self.show_training_progress:
                utils.print_progress_bar(
                    'Training ' + str(os.path.basename(file_path)),
                    conversation_count + 1,
                    len(corpus)
                )

            previous_statements_texts = [None]
            previous_statements_search_texts = ['']

            for conversation in conversations:
                if isinstance(conversation, str):
                    conversation = [conversation]

                statements_texts = []
                statements_search_texts = []

                for previous_statement_text, previous_statement_search_text in zip(
                    previous_statements_texts, previous_statements_search_texts
                ):
                    for text in conversation:
                        statement_search_text = self.chatbot.storage.tagger.get_bigram_pair_string(text)

                        statement = Statement(
                            text=text,
                            search_text=statement_search_text,
                            in_response_to=previous_statement_text,
                            search_in_response_to=previous_statement_search_text,
                            conversation='training'
                        )
                        statement.add_tags(*categories)

                        statement = self.get_preprocessed_statement(statement)

                        statements_texts.append(statement.text)
                        statements_search_texts.append(statement_search_text)

                        statements_to_create.append(statement)

                previous_statements_texts = statements_texts
                previous_statements_search_texts = statements_search_texts

        if statements_to_create:
            self.chatbot.storage.create_many(statements_to_create)
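An illustrative sketch of the alternative-response handling in the trainer above: a turn may be a single string or a list of interchangeable texts, and every alternative is linked to every alternative of the previous turn. The conversation data is invented for illustration:

# Illustrative conversation; the second turn offers two alternative responses.
conversation = [
    'How are you?',
    ['Good.', 'Fine, thanks.'],
    'Glad to hear it.',
]

previous_texts = [None]
for turn in conversation:
    if isinstance(turn, str):
        turn = [turn]
    # Link every alternative to every alternative of the previous turn
    for previous in previous_texts:
        for text in turn:
            print(previous, '->', text)
    previous_texts = turn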
def test_load_corpus_marathi(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.marathi')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def train(self, *corpus_paths):
    from chatterbot.corpus import load_corpus, list_corpus_files

    data_file_paths = []

    # Get the paths to each file the bot will be trained with
    for corpus_path in corpus_paths:
        data_file_paths.extend(list_corpus_files(corpus_path))

    for corpus, categories, file_path in load_corpus(*data_file_paths):
        statements_to_create = []

        # Train the chat bot with each statement and response pair
        for conversation_count, conversation in enumerate(corpus):
            if self.show_training_progress:
                utils.print_progress_bar(
                    'Training ' + str(os.path.basename(file_path)),
                    conversation_count + 1,
                    len(corpus)
                )

            previous_statement_text = None
            previous_statement_search_text = ''

            for text in conversation:
                suggestion_tags = []

                if text.strip('.?!/;:\'\"') in constants.AFFIRMATIVES:
                    text = 'AFF'
                elif text.strip('.?!/;:\'\"') in constants.NEGATIVES:
                    text = 'NEG'
                elif text.startswith('^'):
                    suggestion, text = text.split(maxsplit=1)
                    suggestion = suggestion[1:]
                    if '/' not in suggestion:
                        suggestion_tags.append(suggestion)
                    else:
                        suggestion_tags.extend(suggestion.split('/'))

                statement_search_text = self.chatbot.storage.tagger.get_bigram_pair_string(text)

                statement = Statement(
                    text=text,
                    search_text=statement_search_text,
                    in_response_to=previous_statement_text,
                    search_in_response_to=previous_statement_search_text,
                    conversation='training'
                )

                # YesNoLogicAdapter handles responses to AFF/NEG via statement tags,
                # so statements do not need in_response_to set to AFF/NEG. In fact,
                # doing so was causing erroneous responses.
                if statement.in_response_to in ['AFF', 'NEG']:
                    statement.in_response_to = None
                    statement.search_in_response_to = None

                statement.add_tags(*categories)

                for suggestion in suggestion_tags:
                    statement.add_tags('SUGGESTION:' + suggestion)

                if previous_statement_text == 'AFF':
                    statements_to_create[-2].add_tags('AFF:' + statement.text)
                elif previous_statement_text == 'NEG':
                    statements_to_create[-2].add_tags('NEG:' + statement.text)

                statement = self.get_preprocessed_statement(statement)

                previous_statement_text = statement.text
                previous_statement_search_text = statement_search_text

                statements_to_create.append(statement)

        # Use update() because create_many() created duplicate statements
        # and AFF/NEG tag data was lost on some of them.
        for statement in statements_to_create:
            self.chatbot.storage.update(statement)
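A standalone sketch of the '^tag1/tag2 text' suggestion-prefix convention handled by the trainer above; the helper name and the convention itself are specific to this sketch, not a chatterbot API:

def parse_suggestions(text):
    """Split a leading '^tag1/tag2' prefix into tags and the remaining text."""
    suggestion_tags = []
    if text.startswith('^'):
        suggestion, text = text.split(maxsplit=1)
        suggestion_tags = suggestion[1:].split('/')
    return suggestion_tags, text

print(parse_suggestions('^food/drink What would you like?'))
# (['food', 'drink'], 'What would you like?')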
def test_load_corpus_portuguese(self):
    data_files = corpus.list_corpus_files('chatterbot.corpus.portuguese')
    corpus_data = corpus.load_corpus(*data_files)

    self.assertTrue(len(list(corpus_data)))
def test_load_corpus_english(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/english')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertTrue(len(corpus_data))
def test_load_corpus_marathi(self):
    files = corpus.list_corpus_files('chatterbot_corpus/data/marathi')
    corpus_data = list(corpus.load_corpus(*files))

    self.assertTrue(len(corpus_data))