def synset_distance(statement, other_statement):
    """
    Calculate the similarity of two statements.

    This is based on the total maximum synset similarity
    between each word in each sentence.

    This algorithm uses the `wordnet`_ functionality of `NLTK`_
    to determine the similarity of two statements based on the
    path similarity between each token of each statement.
    This is essentially an evaluation of the closeness of synonyms.

    :return: The percent of similarity between the closest synset distance.
    :rtype: float

    .. _wordnet: http://www.nltk.org/howto/wordnet.html
    .. _NLTK: http://www.nltk.org/
    """
    from nltk.corpus import wordnet
    from nltk import word_tokenize
    from chatterbot import utils
    import itertools

    def significant_tokens(text):
        # Tokenize the lowercased text and drop English stop words
        return utils.remove_stopwords(
            word_tokenize(text.lower()), language='english'
        )

    words1 = significant_tokens(statement.text)
    words2 = significant_tokens(other_statement.text)

    # The maximum possible similarity is an exact match.
    # Because path_similarity returns a value between 0 and 1,
    # max_possible_similarity is the number of words in the longer
    # of the two input statements.
    max_possible_similarity = max(
        len(statement.text.split()),
        len(other_statement.text.split())
    )

    best_score = 0.0

    # Find the single highest path similarity across every pairing
    # of word senses between the two statements
    for word1, word2 in itertools.product(words1, words2):
        senses1 = wordnet.synsets(word1)
        senses2 = wordnet.synsets(word2)

        if not (senses1 and senses2):
            continue

        for sense1, sense2 in itertools.product(senses1, senses2):
            score = sense1.path_similarity(sense2)

            if score and score > best_score:
                best_score = score

    if max_possible_similarity == 0:
        return 0

    return best_score / max_possible_similarity
def compare(self, statement, other_statement):
    """
    Compare the two input statements.

    :return: The percent of similarity between the closest synset distance.
    :rtype: float

    .. _wordnet: http://www.nltk.org/howto/wordnet.html
    .. _NLTK: http://www.nltk.org/
    """
    from nltk.corpus import wordnet
    from nltk import word_tokenize
    # Fix: ``utils`` was used below but never imported in this scope
    from chatterbot import utils
    import itertools

    tokens1 = word_tokenize(statement.text.lower())
    tokens2 = word_tokenize(other_statement.text.lower())

    # Remove all stop words from the list of word tokens
    tokens1 = utils.remove_stopwords(tokens1, language='english')
    tokens2 = utils.remove_stopwords(tokens2, language='english')

    shorter_length = min(
        len(statement.text.split()),
        len(other_statement.text.split())
    )
    longer_length = max(
        len(statement.text.split()),
        len(other_statement.text.split())
    )

    # Fix: guard BEFORE dividing — two empty statements previously
    # raised ZeroDivisionError here
    if longer_length == 0:
        return 0

    # The normalization factor is the ratio of the shorter statement's
    # word count to the longer's, so length mismatch lowers the score.
    # (Note: this intentionally differs from the plain word count used
    # by the module-level synset_distance function.)
    max_possible_similarity = shorter_length / longer_length

    max_similarity = 0.0

    # Get the highest matching value for each possible combination of words
    for combination in itertools.product(*[tokens1, tokens2]):

        synset1 = wordnet.synsets(combination[0])
        synset2 = wordnet.synsets(combination[1])

        if synset1 and synset2:

            # Get the highest similarity for each combination of synsets
            for synset in itertools.product(*[synset1, synset2]):
                similarity = synset[0].path_similarity(synset[1])

                if similarity and (similarity > max_similarity):
                    max_similarity = similarity

    # One of the statements may still be empty (shorter_length == 0)
    if max_possible_similarity == 0:
        return 0

    return max_similarity / max_possible_similarity
def test_remove_stop_words(self):
    """English stop words should be stripped, leaving only content words."""
    from chatterbot.utils import nltk_download_corpus

    nltk_download_corpus('stopwords')

    tokens = ['this', 'is', 'a', 'test', 'string']
    filtered = list(utils.remove_stopwords(tokens, 'english'))

    # Only the two non-stopword tokens should survive the filtering
    self.assertEqual(len(filtered), 2)
    self.assertIn('test', filtered)
    self.assertIn('string', filtered)
def compare(self, statement, other_statement):
    """
    Compare the two input statements.

    Blends a character-level sequence ratio with the Word Mover's
    Distance between the stop-word-filtered tokens, then applies
    heuristic bonuses or penalties depending on how closely the two
    scores agree.

    NOTE(review): relies on module-level globals ``model`` (a gensim
    word-embedding model exposing ``wmdistance``), ``clean_sent``,
    ``infinity``, ``counter`` and ``logging`` — confirm they are
    defined elsewhere in this module.

    :return: The percent of similarity between the text of the statements.
    :rtype: float
    """
    import sys

    from chatterbot import utils

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

    global counter

    # Use python-Levenshtein if available; it is a faster drop-in
    # replacement for difflib's SequenceMatcher
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    # Return 0 if either statement has a falsy text value
    if not statement or not other_statement:
        return 0

    # Get the lowercase version of both strings.
    # Fix: the Python 2 branch previously called ``statement.lower()``
    # on the statement object itself instead of ``statement.text``.
    if sys.version_info[0] < 3:
        statement_text = unicode(statement.text.lower())  # noqa: F821
        other_statement_text = unicode(other_statement.text.lower())  # noqa: F821
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    counter += 1

    # Character-level similarity as a decimal percent
    percent = int(round(100 * similarity.ratio())) / 100.0

    tokens1 = clean_sent(statement_text).lower().split()
    tokens2 = clean_sent(other_statement_text).lower().split()

    # Remove all stop words from the list of word tokens
    s1 = utils.remove_stopwords(tokens1, language='english')
    s2 = utils.remove_stopwords(tokens2, language='english')

    # Word Mover's Distance between the filtered token lists.
    # (The original computed this twice into an unused variable.)
    distance = model.wmdistance(s1, s2)

    # No shared vocabulary in the embedding model: fall back to the
    # character-level ratio alone
    if distance == infinity:
        return percent

    # Common adjustment term: rewards distances close to 0, and the
    # same magnitude is used by every branch below
    adjustment = 0.15 * abs(1 - distance)

    if percent > distance:
        if percent - distance < 0.25:
            # Decent match: both measures roughly agree
            return percent + 0.08 + adjustment
        # Close match: character similarity far exceeds the distance
        return percent + 1.0 + adjustment
    elif percent > 0.4:
        if distance - percent < 0.15:
            return percent + 0.06 + adjustment
        return (percent - 0.04) - adjustment

    # Fix: the original fell through here (percent <= 0.4 and
    # percent <= distance) and implicitly returned None; return the
    # unadjusted character ratio so callers always receive a float.
    return percent