from fuzzywuzzy import fuzz


def _calculateRatio(i, j):
    # Order the two strings so the comparison is deterministic.
    a, b = sorted([i, j])
    # ratio = fuzz.token_sort_ratio(a, b)
    ratio = fuzz.WRatio(a, b)
    return ratio
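# Hedged usage sketch for _calculateRatio above; the example strings are
# invented for illustration. WRatio returns an integer score in [0, 100].
print(_calculateRatio("new york mets", "new york meats"))  # near-identical -> high score
print(_calculateRatio("new york mets", "chicago cubs"))    # unrelated -> low score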
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

s1 = "I love fuzzysforfuzzys"
s2 = "I am loving fuzzysforfuzzys"

print("FuzzyWuzzy Ratio:", fuzz.ratio(s1, s2))
print("FuzzyWuzzy PartialRatio:", fuzz.partial_ratio(s1, s2))
print("FuzzyWuzzy TokenSortRatio:", fuzz.token_sort_ratio(s1, s2))
print("FuzzyWuzzy TokenSetRatio:", fuzz.token_set_ratio(s1, s2))
print("FuzzyWuzzy WRatio:", fuzz.WRatio(s1, s2), '\n\n')

# Use the process module to score a query against a list of choices.
query = 'fuzzys for fuzzys'
choices = ['fuzzy for fuzzy', 'fuzzy fuzzy', 'g. for fuzzys']
print("List of ratios:")
print(process.extract(query, choices), '\n')
print("Best among the above list:", process.extractOne(query, choices))
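# Optional sketch (not in the original): process.extract also accepts a custom
# scorer and a result limit, and process.extractOne accepts a score_cutoff,
# which is handy when only confident matches should be returned.
print(process.extract(query, choices, scorer=fuzz.token_sort_ratio, limit=2))
print(process.extractOne(query, choices, scorer=fuzz.WRatio, score_cutoff=80))  # None if no choice reaches 80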
import pandas as pd
from fuzzywuzzy import fuzz

df_train = pd.read_csv('../../input/train.csv')
df_test = pd.read_csv('../../input/test.csv')
len_train = df_train.shape[0]

# Stack train and test question pairs so features are computed once.
df_feat = pd.DataFrame()
df_data = pd.concat([
    df_train[['question1', 'question2']],
    df_test[['question1', 'question2']]
], axis=0)

df_feat['fuzz_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_ratio'] = df_data.apply(lambda row: fuzz.partial_ratio(
    str(row['question1']), str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['question1']),
                                             str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['question1']),
                                              str(row['question2'])), axis=1)
df_feat['fuzz_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2'])),
    axis=1)
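# A small follow-up sketch, assuming the remaining fuzz columns are built the
# same way as above: len_train splits the stacked feature frame back into its
# train and test portions (positional indexing sidesteps the duplicated index
# that concat produces). The variable names below are illustrative.
X_train_feat = df_feat.iloc[:len_train]
X_test_feat = df_feat.iloc[len_train:]
print(X_train_feat.shape, X_test_feat.shape)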
# Collect the tokens of the first text, then keep only the scraped tokens
# that overlap with them.
for i in tokensyll:
    words.append(i)
words = set(words)
tokenscrape = word_tokenize(b)
for i in tokenscrape:
    if i in words:
        words1.append(str(i))
words1 = set(words1)
for i in words1:
    h += i + " "

a = normal1(a)
b = normal1(b)
words.clear()
words1.clear()

# Fuzzy similarity between the two normalised texts.
Token_W_Ratio = fuzz.WRatio(a, b)
Token_set_Ratio = fuzz.token_set_ratio(a, b)

# Invert and rescale the site and course rank scores onto a 0-100 range,
# then blend everything into one weighted score.
ratio = item["SiteScore"]
ratio1 = item["CourseScore"]
ratio = ((8 - int(ratio)) / 7) * 100
ratio1 = ((26 - int(ratio1)) / 25) * 100
finalratio123 = Token_W_Ratio * 0.3 + Token_set_Ratio * 0.4 + ratio * 0.1 + ratio1 * 0.2

# Accumulate per-item results for the final output.
finalratio.append(Token_set_Ratio)
finalwordsmatch.append(h)
finalname.append(item["Name"])
finaldetail.append(item["Detail"])
finalsite.append(item["Site"])
finallink.append(item["Link"])
finalcoursecode.append(item["Code"])
finalCoursename.append(item["Cname"])
finalcoursedetail.append(item["Cdetail"])
# print(t1)
# print(t2)
# print(t3)
# print(t4)
# a = fuzz.partial_ratio(t1, t2)
# b = fuzz.ratio(t1, t2)
# c = fuzz.token_set_ratio(t1, t2)
# d = fuzz.partial_token_set_ratio(t1, t2)
# e = fuzz.QRatio(t1, t2)
# f = fuzz.WRatio(t1, t2)
# g = fuzz.UWRatio(t1, t2)
# print(a, b, c, d, e, f, g)

# Rough English glosses of the Korean strings are given in the trailing comments.
s1 = filtering(str_list=["방카슈랑스 계약사항 중에서 변경하고 싶은 부분이 있습니다."], noun=False)  # "There is something I would like to change in my bancassurance contract details."
s2 = filtering(str_list=["스마트알림 메시지 데이터는 얼마동안 볼 수 있나요?"], noun=False)  # "How long can I view smart-notification message data?"
s3 = "후 스마트 경우 수신 해외 서비스 외국 알림 신청 메시지 출국"  # extracted keywords, roughly: "after smart case receive overseas service foreign notification apply message departure"

print(fuzz.token_set_ratio(s3, s2))
print(fuzz.QRatio(s3, s2))
print(fuzz.UWRatio(s3, s2))
print(fuzz.WRatio(s3, s2))

# testlist = [{
#     'category': 1, 'value': [1, 2]
# }, {
#     'category': 2, 'value': [1, 2, 3]
# }]
# print(testlist)
# print([value['value'].append(4) for value in testlist if value.get('category') == 2])
# print(testlist)
len_1 = len(question1)
len_2 = len(question2)
diff_len = len_1 - len_2

# Distinct characters (spaces removed) and word counts per question.
len_char_q1 = len(''.join(set(str(question1).replace(' ', ''))))
len_char_q2 = len(''.join(set(str(question2).replace(' ', ''))))
len_word_q1 = len(str(question1).split())
len_word_q2 = len(str(question2).split())

# Number of lowercased words shared by the two questions.
common_words = len(
    set(str(question1).lower().split()).intersection(
        set(str(question2).lower().split())))

# fuzzy
from fuzzywuzzy import fuzz
fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(
    str(question1), str(question2))
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
    str(question1), str(question2))
fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))

# wmd (Word Mover's Distance): load pretrained word2vec vectors
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

# sent2vec
import scipy
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))  # distinct characters in question 2 (spaces removed)
data['len_word_q1'] = data.question1.apply(
    lambda x: len(str(x).split()))  # word count of question 1
data['len_word_q2'] = data.question2.apply(
    lambda x: len(str(x).split()))  # word count of question 2
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))), axis=1)  # words shared by questions 1 and 2 (via set intersection)

# Extract question-pair features with fuzz
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
def fuzz_wratio(q1, q2):
    # Thin wrapper around fuzzywuzzy's weighted ratio (0-100).
    return fuzz.WRatio(q1, q2)
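# Hedged usage sketch for the wrapper above; the strings are invented examples.
# WRatio also tries token-sorted comparisons, so reordered words still score high.
print(fuzz_wratio("machine learning models", "models for machine learning"))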
def compare(self, left, right):
    # Scale WRatio's 0-100 score down to a 0.0-1.0 similarity.
    return fuzz.WRatio(left, right) / 100.0
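# Hedged usage sketch: ranking candidates with the normalised 0.0-1.0 score.
# The Comparator class below is only a stand-in for whatever class owns
# compare() in the original code; the candidate strings are invented.
from fuzzywuzzy import fuzz


class Comparator:
    def compare(self, left, right):
        return fuzz.WRatio(left, right) / 100.0


candidates = ["ACME Corporation", "Acme Inc.", "Apex Corp"]
best_first = sorted(candidates,
                    key=lambda c: Comparator().compare("ACME Corp", c),
                    reverse=True)
print(best_first)  # most similar candidate first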
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
data['word_match_share'] = data.apply(word_match_share, axis=1, raw=True)
data['tfidf_word_match_share'] = data.apply(tfidf_word_match_share, axis=1, raw=True)
data["question1"] = data["question1"].fillna("").apply(preprocess)
data["question2"] = data["question2"].fillna("").apply(preprocess)

# Basic length features.
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

# Fuzzy-matching features.
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Word Mover's Distance features on pretrained word2vec vectors.
model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
norm_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
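# The wmd / norm_wmd helpers called above are not shown; a minimal sketch,
# assuming they wrap gensim's KeyedVectors.wmdistance on lowercased whitespace
# tokens (the original may also strip stopwords).
def wmd(q1, q2):
    return model.wmdistance(str(q1).lower().split(), str(q2).lower().split())


def norm_wmd(q1, q2):
    # norm_model holds L2-normalised vectors (init_sims above), so this gives
    # a normalised Word Mover's Distance.
    return norm_model.wmdistance(str(q1).lower().split(), str(q2).lower().split())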
# Split user input (comma separated) into a list
productList = userInput.split(",")

# Tables for pretty output
matches = PrettyTable(["Ingredient"])
closeMatches = PrettyTable(
    ["Product Ingredient", "User Ingredient", "Similarity Ratio"])
closeMatches.reversesort = True

# Loop through input list
for item in productList:
    # Loop through user list
    for itemCheck in userList:
        # Check for direct matches to ingredients
        if fuzz.WRatio(item, itemCheck) == 100:
            matches.add_row([itemCheck])
            break
        # Check for similar matches to ingredients
        if fuzz.WRatio(item, itemCheck) >= 90:
            closeMatches.add_row(
                [item, itemCheck, fuzz.WRatio(item, itemCheck)])

# Sort similar matches by most similar to least similar
closeMatches.sortby = "Similarity Ratio"

# Print tables
print("\nDirect matches:")
print(matches)
print("\nClose matches:")
print(closeMatches)
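# A possible simplification sketch (not from the original): process.extractOne
# can replace the inner loop, returning the best (match, score) pair at or
# above a cutoff, or None when nothing qualifies.
from fuzzywuzzy import process

for item in productList:
    best = process.extractOne(item, userList, scorer=fuzz.WRatio, score_cutoff=90)
    if best is not None:
        print(item, "->", best)  # best is a (userIngredient, score) tuple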