def synset_distance(statement, other_statement):
    """
    Calculate the similarity of two statements.

    Every word of ``statement`` is paired with every word of
    ``other_statement``. For each pair whose words both have WordNet
    synsets, the highest ``path_similarity`` between any two of their
    synsets is taken, and these per-pair maxima are summed.

    :param statement: Object exposing the statement string as ``.text``.
    :param other_statement: Object exposing the statement string as ``.text``.
    :returns: The summed per-pair maximum similarity. The value is ``0``
        when no word pair has synsets on both sides. The total is not
        normalised by the number of words.
    """
    from chatterbot.utils.pos_tagger import POSTagger
    from chatterbot.utils.stop_words import StopWordsManager
    from chatterbot.utils.word_net import Wordnet
    import itertools

    wordnet = Wordnet()
    tagger = POSTagger()
    stopwords = StopWordsManager()

    def get_tokens(text, exclude_stop_words=True):
        """
        Lower-case a string and split it into word tokens.

        Skips common stop words such as ("is, the, a, ...") if
        'exclude_stop_words' is True. In that case the result is a set
        (duplicates collapse); otherwise it is whatever the tagger's
        ``tokenize`` returned.
        """
        lower = text.lower()
        tokens = tagger.tokenize(lower)

        # Remove any stop words from the string
        if exclude_stop_words:
            excluded_words = stopwords.words('english')
            tokens = set(tokens) - set(excluded_words)

        return tokens

    def lookup_synsets(tokens):
        """Return the non-empty synset lists for ``tokens``, in order."""
        found = (wordnet.synsets(token) for token in tokens)
        return [synsets for synsets in found if synsets]

    tokens1 = get_tokens(statement.text)
    tokens2 = get_tokens(other_statement.text)

    # Resolve synsets once per word instead of once per word *pair*.
    # Words without synsets contribute nothing, so they are dropped here.
    synset_groups1 = lookup_synsets(tokens1)
    synset_groups2 = lookup_synsets(tokens2)

    total_similarity = 0

    # Get the highest matching value for each possible combination of words
    for group1, group2 in itertools.product(synset_groups1, synset_groups2):
        max_similarity = 0

        # Get the highest similarity for each combination of synsets
        for first, second in itertools.product(group1, group2):
            similarity = first.path_similarity(second)

            # Falsy results (e.g. no path between the synsets) are ignored
            if similarity and (similarity > max_similarity):
                max_similarity = similarity

        # Add the most similar path value to the total
        total_similarity += max_similarity

    return total_similarity
def test_remove_stop_words(self):
    """
    Removing English stop words from a short token list should leave
    only the two meaningful words.
    """
    manager = StopWordsManager()
    sample_tokens = ['this', 'is', 'a', 'test', 'string']

    remaining = manager.remove_stopwords('english', sample_tokens)

    # 'this', 'is' and 'a' are expected to be filtered out
    self.assertEqual(len(remaining), 2)

    for expected_word in ('test', 'string'):
        self.assertIn(expected_word, list(remaining))
def __init__(self, **kwargs):
    """
    Set up the assistant's default state and its text-processing helpers.

    All keyword arguments are forwarded unchanged to the parent class.
    """
    super(DeveloperAssistant, self).__init__(**kwargs)

    # Default state; these are assigned before read_program_file() runs.
    # NOTE(review): that method may rely on them, so keep this order.
    self.program_data = dict(name="", path="")
    self.stage = self.data_dir = ""

    # Load stored program information
    self.data = self.read_program_file()

    # Language helpers
    self.stopwords = StopWordsManager()
    self.tagger = POSTagger()

    # Conversation history starts out empty
    self.conversation = list()
def get_tokens(self, text, language='english', exclude_stop_words=True):
    """
    Split a string into lower-cased word tokens.

    :param text: The string to tokenize.
    :param language: Language name handed to the stop word manager.
    :param exclude_stop_words: When True, common stop words
        ("is, the, a, ...") are removed from the tokens.
    :returns: The tokens from ``word_tokenize``, or the result of
        ``remove_stopwords`` on them when stop words are excluded.
    """
    from chatterbot.utils.stop_words import StopWordsManager
    from nltk import word_tokenize

    manager = StopWordsManager()

    lowered = text.lower()
    words = word_tokenize(lowered)

    # Nothing to filter: hand back the raw tokens
    if not exclude_stop_words:
        return words

    return manager.remove_stopwords(language, words)
def setUp(self):
    """
    Run the parent fixture, then create the stop word manager that is
    stored on the test case as ``stopwords_manager``.
    """
    super(StopWordsTestCase, self).setUp()

    # Imported here, after the parent setUp has run
    from chatterbot.utils.stop_words import StopWordsManager

    manager = StopWordsManager()
    self.stopwords_manager = manager
def __init__(self, **kwargs):
    """
    Initialise the adapter and create the helpers it stores.

    All keyword arguments are forwarded unchanged to the parent class.
    """
    super(ClosestMeaningAdapter, self).__init__(**kwargs)

    # Helpers are created in this fixed order: WordNet wrapper,
    # tagger, then the stop word manager.
    wordnet_helper = Wordnet()
    self.wordnet = wordnet_helper

    tagger_helper = POSTagger()
    self.tagger = tagger_helper

    stopword_helper = StopWordsManager()
    self.stopwords = stopword_helper
def test_stop_words(self):
    """
    The English stop word list should contain the word "too".
    """
    manager = StopWordsManager()
    english_words = manager.words("english")

    # Anything left after subtracting the stop words is missing from the list
    missing = set(["too"]) - set(english_words)

    self.assertEqual(missing, set())