예제 #1
0
 def _calculateRatio(i, j):
     """Score the similarity of ``i`` and ``j`` with ``fuzz.WRatio``.

     The two values are put in sorted order before scoring, so the scorer
     receives the pair in one canonical order.
     """
     first, second = sorted((i, j))
     # A commented-out alternative used fuzz.token_sort_ratio here instead.
     return fuzz.WRatio(first, second)
예제 #2
0
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
s1 = "I love fuzzysforfuzzys"
s2 = "I am loving fuzzysforfuzzys"

# Score the same pair with each fuzz scorer, printing the label then the score.
for label, scorer in (
        ("FuzzyWuzzy Ratio:", fuzz.ratio),
        ("FuzzyWuzzyPartialRatio: ", fuzz.partial_ratio),
        ("FuzzyWuzzyTokenSortRatio: ", fuzz.token_sort_ratio),
        ("FuzzyWuzzyTokenSetRatio: ", fuzz.token_set_ratio),
):
    print(label, scorer(s1, s2))
print("FuzzyWuzzyWRatio: ", fuzz.WRatio(s1, s2), '\n\n')

# The process helpers are called with one query and a list of candidates.
query = 'fuzzys for fuzzys'
choices = ['fuzzy for fuzzy', 'fuzzy fuzzy', 'g. for fuzzys']
print("List of ratios: ")
ranked = process.extract(query, choices)
print(ranked, '\n')
best = process.extractOne(query, choices)
print("Best among the above list: ", best)
# Load both CSV splits; len_train records the number of rows in the
# training file.
df_train = pd.read_csv('../../input/train.csv')
df_test = pd.read_csv('../../input/test.csv')
len_train = len(df_train)

# Stack the question columns of train and test row-wise into one frame.
question_cols = ['question1', 'question2']
df_data = pd.concat(
    [df_train[question_cols], df_test[question_cols]], axis=0)
df_feat = pd.DataFrame()

# Output column -> fuzz scorer, listed in the order the columns are created.
for column, scorer in (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
):
    # Both questions are coerced with str() before being scored.
    df_feat[column] = df_data.apply(
        lambda row: scorer(str(row['question1']), str(row['question2'])),
        axis=1)
df_feat['fuzz_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['question1']),
                                     str(row['question2'])),
    # NOTE(review): this is the interior of a function/loop whose header is
    # not visible here; `tokensyll`, `words`, `words1`, `h`, `a`, `b`, `item`
    # and the `final*` lists are all defined outside this fragment.

    # Copy every token from `tokensyll` into `words`, then deduplicate.
    for i in tokensyll:
        words.append(i)
    words = set(words)
    tokenscrape = word_tokenize(b)

    # Keep the tokens of `b` that also occur in `words`, deduplicated.
    for i in tokenscrape:
        if i in words:
            words1.append(str(i))
    words1 = set(words1)
    # Append each shared token to `h`, followed by a single space.
    for i in words1:
        h += i + " "
    a = normal1(a)
    b = normal1(b)
    # `words` / `words1` now refer to the sets built above, so these calls
    # empty those sets rather than the original list objects.
    words.clear()
    words1.clear()
    Token_W_Ratio = fuzz.WRatio(a, b)
    Token_set_Ratio = fuzz.token_set_ratio(a, b)
    ratio = item["SiteScore"]
    ratio1 = item["CourseScore"]
    # Linear rescale (assuming true division): SiteScore 1 -> 100, 8 -> 0;
    # CourseScore 1 -> 100, 26 -> 0.
    ratio = ((8 - int(ratio)) / 7) * 100
    ratio1 = ((26 - int(ratio1)) / 25) * 100
    # NOTE(review): the weighted blend below is not used in the visible
    # lines; `finalratio` receives Token_set_Ratio instead. Confirm which
    # value is intended.
    finalratio123 = Token_W_Ratio * 0.3 + Token_set_Ratio * 0.4 + ratio * 0.1 + ratio1 * 0.2
    finalratio.append(Token_set_Ratio)
    finalwordsmatch.append(h)
    # Copy the remaining fields of `item` into their parallel result lists.
    finalname.append(item["Name"])
    finaldetail.append(item["Detail"])
    finalsite.append(item["Site"])
    finallink.append(item["Link"])
    finalcoursecode.append(item["Code"])
    finalCoursename.append(item["Cname"])
    finalcoursedetail.append(item["Cdetail"])
예제 #5
0
# print(t1)
# print(t2)
# print(t3)
# print(t4)
# a = fuzz.partial_ratio(t1,t2)
# b = fuzz.ratio(t1,t2)
# c = fuzz.token_set_ratio(t1,t2)
# d = fuzz.partial_token_set_ratio(t1,t2)
# e = fuzz.QRatio(t1,t2)
# f = fuzz.WRatio(t1,t2)
# g = fuzz.UWRatio(t1,t2)
# print(a,b,c,d,e,f,g)
# Pass two sample sentences through the `filtering` helper with noun=False.
# s1 is assigned but not used by the visible lines below.
s1 = filtering(
    str_list=["방카슈랑스 계약사항 중에서 변경하고 싶은 부분이 있습니다."],
    noun=False,
)
s2 = filtering(
    str_list=["스마트알림 메시지 데이터는 얼마동안 볼 수 있나요?"],
    noun=False,
)
s3 = "후 스마트 경우 수신 해외 서비스 외국 알림 신청 메시지 출국"

# Print each scorer's result for s3 against s2, one per line.
for scorer in (fuzz.token_set_ratio, fuzz.QRatio, fuzz.UWRatio, fuzz.WRatio):
    print(scorer(s3, s2))
# testlist = [{
#     'category' : 1, 'value' : [1,2]
# },{
#     'category' : 2, 'value' : [1,2,3]
# }]
#
# print(testlist)
#
# print([value['value'].append(4) for value in testlist if value.get('category')==2])
#
# print(testlist)
# Features for a single (question1, question2) pair.
#
# Both inputs are coerced to text once and every feature below works on the
# coerced strings. Previously len_1 / len_2 called len() on the raw values
# while all other features used str(...), so a non-string value (for example
# a float NaN) raised TypeError on the first line.
q1_text = str(question1)
q2_text = str(question2)

# Character-length features.
len_1 = len(q1_text)
len_2 = len(q2_text)
diff_len = len_1 - len_2

# Number of distinct non-space characters in each question.
len_char_q1 = len(set(q1_text.replace(' ', '')))
len_char_q2 = len(set(q2_text.replace(' ', '')))

# Whitespace-separated word counts.
len_word_q1 = len(q1_text.split())
len_word_q2 = len(q2_text.split())

# Number of lower-cased words that occur in both questions.
common_words = len(
    set(q1_text.lower().split()) & set(q2_text.lower().split()))

#fuzzy
from fuzzywuzzy import fuzz

fuzz_qratio = fuzz.QRatio(q1_text, q2_text)
fuzz_WRatio = fuzz.WRatio(q1_text, q2_text)
fuzz_partial_ratio = fuzz.partial_ratio(q1_text, q2_text)
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(q1_text, q2_text)
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
    q1_text, q2_text)
fuzz_token_set_ratio = fuzz.token_set_ratio(q1_text, q2_text)
fuzz_token_sort_ratio = fuzz.token_sort_ratio(q1_text, q2_text)

#wmd
import gensim

# Load the word2vec vectors from the binary GoogleNews file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
#sen2vec
import scipy
# Number of distinct non-space characters in question 2.
data['len_char_q2'] = data.question2.apply(
    lambda text: len(set(str(text).replace(' ', ''))))
# Whitespace-separated word count of each question.
data['len_word_q1'] = data.question1.apply(lambda text: len(str(text).split()))
data['len_word_q2'] = data.question2.apply(lambda text: len(str(text).split()))
# Number of lower-cased words shared by both questions (set intersection).
data['common_words'] = data.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())),
    axis=1)

# Fuzz-based features for each question pair: one column per scorer, created
# in the order listed.
for column, scorer in (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
):
    data[column] = data.apply(
        lambda row: scorer(str(row['question1']), str(row['question2'])),
        axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
예제 #8
0
def fuzz_wratio(q1, q2):
    """Return the ``fuzz.WRatio`` score for the pair ``(q1, q2)``."""
    score = fuzz.WRatio(q1, q2)
    return score
예제 #9
0
 def compare(self, left, right):
     """Return ``fuzz.WRatio(left, right)`` divided by 100.0.

     ``self`` is not read by this method.
     """
     raw_score = fuzz.WRatio(left, right)
     return raw_score / 100.0
예제 #10
0
# Drop the identifier columns before building features.
data = data.drop(['id', 'qid1', 'qid2'], axis=1)

# Row-wise match-share features; these run on the raw row arrays before the
# question text is preprocessed below.
for column, feature_fn in (
        ('word_match_share', word_match_share),
        ('tfidf_word_match_share', tfidf_word_match_share),
):
    data[column] = data.apply(feature_fn, axis=1, raw=True)

# Replace missing questions with "" and run each through `preprocess`.
for question_col in ("question1", "question2"):
    data[question_col] = data[question_col].fillna("").apply(preprocess)

# Character-length features and their difference.
data['len_q1'] = data.question1.apply(lambda text: len(str(text)))
data['len_q2'] = data.question2.apply(lambda text: len(str(text)))
data['diff_len'] = data.len_q1 - data.len_q2
# Number of distinct non-space characters per question.
data['len_char_q1'] = data.question1.apply(
    lambda text: len(set(str(text).replace(' ', ''))))
data['len_char_q2'] = data.question2.apply(
    lambda text: len(set(str(text).replace(' ', ''))))
# Whitespace-separated word counts.
data['len_word_q1'] = data.question1.apply(lambda text: len(str(text).split()))
data['len_word_q2'] = data.question2.apply(lambda text: len(str(text).split()))
# Number of lower-cased words shared by both questions.
data['common_words'] = data.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())),
    axis=1)

# Fuzz-based features: one column per scorer, created in the order listed.
for column, scorer in (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
):
    data[column] = data.apply(
        lambda row: scorer(str(row['question1']), str(row['question2'])),
        axis=1)


# NOTE(review): the path has no slash after '..' ('..data/...'); confirm that
# this is the intended location of the vectors file.
word2vec_path = '..data/GoogleNews-vectors-negative300.bin.gz'
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path, binary=True)
data['wmd'] = data.apply(
    lambda row: wmd(row['question1'], row['question2']), axis=1)
# A second copy of the vectors is loaded and passed through
# init_sims(replace=True) before the norm_wmd feature is computed.
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path, binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(
    lambda row: norm_wmd(row['question1'], row['question2']), axis=1)

# One 300-wide zero row per data row, plus an error counter starting at 0.
question1_vectors = np.zeros((len(data), 300))
error_count = 0
예제 #11
0
# Split the comma-separated user input into a list of product ingredients.
productList = userInput.split(",")

# Tables for pretty output.
matches = PrettyTable(["Ingredient"])
closeMatches = PrettyTable(
    ["Product Ingredient", "User Ingredient", "Similarity Ratio"])
closeMatches.reversesort = True

# Compare every product ingredient against every entry in the user list.
for item in productList:
    for itemCheck in userList:
        # Score each pair once; the same value drives both checks and is
        # the value reported in the close-match table.
        similarity = fuzz.WRatio(item, itemCheck)
        # A score of 100 counts as a direct match: record it and stop
        # scanning the user list for this product ingredient.
        if similarity == 100:
            matches.add_row([itemCheck])
            break
        # Scores of 90 and above (below 100) are reported as close matches.
        if similarity >= 90:
            closeMatches.add_row([item, itemCheck, similarity])

# Sort similar matches by most similar to least similar.
closeMatches.sortby = "Similarity Ratio"

# Print tables.
print("\nDirect matches:")
print(matches)
print(