Пример #1
0
def extract_features(q1, q2):
    """Build the flat feature list describing how similar two questions are.

    Both questions are first passed through ``preprocess``; every feature is
    computed on the preprocessed values, not on the raw arguments.

    The returned list is ordered as follows:

    1. Everything returned by ``get_token_features(q1, q2)``.  The original
       author's note lists these as cwc_min, cwc_max, csc_min, csc_max,
       ctc_min, ctc_max, last_word_eq, first_word_eq, abs_len_diff,
       mean_len -- NOTE(review): that helper is not visible here; confirm
       names and order against its definition.
    2. ``fuzz.token_set_ratio``
    3. ``fuzz.token_sort_ratio``
    4. ``fuzz.QRatio`` (labelled "fuzz_ratio" by the original author)
    5. ``fuzz.partial_ratio``
    6. ``get_longest_substr_ratio``

    Args:
        q1: First question, in whatever form ``preprocess`` accepts.
        q2: Second question, in whatever form ``preprocess`` accepts.

    Returns:
        A new list with the features in the order given above.
    """
    # Normalise both questions up front (per the original note: HTML tags,
    # punctuation, stemming, stopwords and contractions are handled there).
    q1, q2 = preprocess(q1), preprocess(q2)

    # Token-level features come first; copy them so the helper's own
    # return value is never mutated.
    features = list(get_token_features(q1, q2))

    # Fuzzy string-matching scores followed by the longest-common-substring
    # ratio.  Downstream consumers rely on this exact ordering.
    features += [
        fuzz.token_set_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2),
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        get_longest_substr_ratio(q1, q2),
    ]
    return features
Пример #2
0
 def testTokenSetRatio(self):
     """Check that token_set_ratio scores the s4/s5 fixtures at exactly 100."""
     # s4 and s5 are fixtures set up elsewhere on the test case.
     score = fuzzywuzzy.token_set_ratio(self.s4, self.s5)
     self.assertEqual(score, 100)