def extract_features(q1, q2):
    """Build the combined feature list for a pair of questions.

    Both questions are passed through ``preprocess`` first, and every
    feature is computed on the preprocessed text.

    The returned list contains, in order:

    1. every value returned by ``get_token_features`` (the original notes
       describe these as cwc_min, cwc_max, csc_min, csc_max, ctc_min,
       ctc_max, last_word_eq, first_word_eq, abs_len_diff, mean_len --
       TODO confirm against that helper),
    2. ``fuzz.token_set_ratio``,
    3. ``fuzz.token_sort_ratio``,
    4. ``fuzz.QRatio`` (labelled "fuzz_ratio" in the original notes),
    5. ``fuzz.partial_ratio``,
    6. ``get_longest_substr_ratio``.
    """
    # NOTE(review): per the original notes, preprocess is expected to strip
    # HTML tags and punctuation, expand contractions, remove stopwords and
    # apply stemming -- verify against its definition.
    clean_q1 = preprocess(q1)
    clean_q2 = preprocess(q2)

    # Token-level features come first, followed by the fuzzy-matching scores
    # and the longest-common-substring ratio.
    features = list(get_token_features(clean_q1, clean_q2))
    features += [
        fuzz.token_set_ratio(clean_q1, clean_q2),
        fuzz.token_sort_ratio(clean_q1, clean_q2),
        fuzz.QRatio(clean_q1, clean_q2),
        fuzz.partial_ratio(clean_q1, clean_q2),
        get_longest_substr_ratio(clean_q1, clean_q2),
    ]
    return features
def testTokenSetRatio(self):
    """Check that token_set_ratio scores the s4/s5 fixture pair as 100."""
    score = fuzzywuzzy.token_set_ratio(self.s4, self.s5)
    self.assertEqual(score, 100)