def fuzzy_feats(train_in, test_in, qcolumns=['question1', 'question2'], append=''):
    """Build fuzzywuzzy similarity features for two question columns.

    Works on copies: neither input frame is mutated.

    :param train_in: training DataFrame containing the question columns
    :param test_in: test DataFrame containing the question columns
    :param qcolumns: the two column names to compare (read-only default)
    :param append: suffix appended to every generated column name
    :return: (train, test) copies restricted to qcolumns plus the four
             new columns fuzz_r / fuzz_pr / fuzz_tsr / fuzz_tsor
    """
    from fuzzywuzzy import fuzz
    import pandas as pd

    # Column-name prefix -> scorer. Note the historical naming: 'fuzz_tsr'
    # uses partial_token_SET_ratio and 'fuzz_tsor' partial_token_SORT_ratio.
    scorers = [
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    ]

    def _featurize(df):
        # One helper replaces the previously duplicated train/test code.
        out = df.copy().loc[:, qcolumns]
        for prefix, scorer in scorers:
            out[prefix + append] = out.apply(
                lambda x, s=scorer: s(x[qcolumns[0]], x[qcolumns[1]]), axis=1)
        return out

    return (_featurize(train_in), _featurize(test_in))
def compute_features(train_df, test_df):
    """Add the seven fuzzywuzzy similarity columns to train_df and test_df
    in place and return a dict of per-feature train-set quality scores.

    :param train_df: training DataFrame (mutated; gains the Fields.* columns)
    :param test_df: test DataFrame (mutated the same way)
    :return: dict mapping 'quality_<feature>' -> compute_quality(...) result
    """
    # (quality key, output column, scorer) triples, in the original
    # computation order; the loop replaces seven copy-pasted stanzas.
    specs = [
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio,
         fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio,
         fuzz.token_sort_ratio),
    ]
    quality = {}
    for key, field, scorer in specs:
        # s=scorer binds the current scorer (avoids late-binding closures).
        train_df[field] = train_df.apply(
            lambda row, s=scorer: s(str(row[FieldsTrain.question1]),
                                    str(row[FieldsTrain.question2])),
            axis=1)
        test_df[field] = test_df.apply(
            lambda row, s=scorer: s(str(row[FieldsTest.question1]),
                                    str(row[FieldsTest.question2])),
            axis=1)
        # Quality is measured on the train split only, as before.
        quality[key] = compute_quality(train_df, field)
    return quality
def Feature_set2(corpus_raw):
    """Append the seven fuzzywuzzy question1/question2 similarity columns
    to corpus_raw in place (returns None, like the mutating original)."""
    feature_scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_partialratio', fuzz.partial_ratio),
        ('fuzz_partial_token_setratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sortratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_setratio', fuzz.token_set_ratio),
        ('fuzz_token_sortratio', fuzz.token_sort_ratio),
        ('fuzz_wratio', fuzz.WRatio),
    )
    for column, scorer in feature_scorers:
        # s=scorer pins the scorer per iteration (late-binding guard).
        corpus_raw[column] = corpus_raw.apply(
            lambda row, s=scorer: s(str(row['question1']),
                                    str(row['question2'])),
            axis=1)
def banks_count(banks, params):
    # Build an HTML-formatted listing of exchangers whose bank name
    # fuzzy-matches entries of the module-level `spisok` list; the globals
    # `links` and `phones` are indexed in step with `spisok`.
    # NOTE(review): depends on module-level spisok/links/phones/delete_copy —
    # confirm they are defined where this function is used.
    word = 0
    """Сортировка банков"""
    spisok_rate = []
    for j in range(len(spisok)):
        for i in range(len(banks)):
            if fuzz.partial_token_sort_ratio(
                    spisok[j], banks[i]["bank"]
            ) >= 90:  # collect exchangers whose names match well enough
                word = "<b>%s</b> / <b>%s</b> <a href='%s'>%s</a>\nТелефон: %s" % (
                    banks[i]["sell"], banks[i]["buy"], links[j], spisok[j],
                    phones[j])
                # avoid duplicate rows in the output listing
                if word not in spisok_rate:
                    spisok_rate.append(
                        "<b>%s</b> / <b>%s</b> <a href='%s'>%s</a>\nТелефон: %s"
                        % (banks[i]["sell"], banks[i]["buy"], links[j],
                           spisok[j], phones[j]))
                break
    # print("\n".join(spisok_rate))
    # list of available banks and their rates (duplicates removed)
    text = "\n".join(delete_copy(spisok_rate))
    if params == "text":
        # print(text)
        return text
    elif params == "distance":
        return banks
def step22_full(d_old, d_new):
    # Merge "feat." entries from d_new into d_old. Keys have the form
    # "<title>#bh#<artist>". When the normalized titles are equal and the
    # normalized artist strings fuzzy-match (> 80), the new DataFrame is
    # concatenated onto the old entry and the d_new slot is cleared (None).
    for i in d_new.items():
        a = i[0].split("#bh#")[1]  # artist part of the key
        t = i[0].split("#bh#")[0]  # title part of the key
        if "feat." in a:
            print("Working on ", i[0])
            #d_c = d2.copy()
            #key_old = i[0]
            #d_c.pop(key_old, None)
            for k in d_old.items():
                if norm(t) == norm(k[0].split("#bh#")
                                   [0]):  # check that the title matches
                    a_a = full_norm(a)
                    if fuzz.partial_token_sort_ratio(
                            full_norm(k[0].split("#bh#")[1]), a_a) > 80:
                        print("\nsame artists. matching.")
                        print(a)
                        print(k[0].split("#bh#")[1], "\n")
                        d_old[k[0]] = pd.concat([d_old[k[0]], i[1]])
                        # zero out the key in the new data
                        d_new[i[0]] = None
                    else:
                        print("\nartists are too different")
                        print(a)
                        print(k[0].split("#bh#")[1], "\n")
            print()
    return (d_old, d_new)
def generate_fuzz(infile, outfile):
    """Compute the seven fuzzywuzzy similarity features for every
    (sen1, sen2) pair in the tab-separated input file and write the
    feature frame to outfile as CSV."""
    start = datetime.now()
    print('generate fuzz feat,data path is', infile)
    df_data = pd.read_csv(infile,
                          sep='\t',
                          header=None,
                          names=['index', 'sen1', 'sen2', 'label'])
    df_feat = pd.DataFrame()
    # Output column -> scorer, in the original column order.
    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        df_feat[column] = df_data.apply(
            lambda row, s=scorer: s(str(row['sen1']), str(row['sen2'])),
            axis=1)
    df_feat.to_csv(outfile, index=False)
    end = datetime.now()
    print('times:', end - start)
def getNutritionValue(input):
    """Look up the nutrition row whose food name best fuzzy-matches *input*.

    Every whitespace-separated word of *input* must appear as a LIKE
    substring of Food_Name; among the matching rows, the one with the
    highest partial_token_sort_ratio against the full input wins.

    :param input: free-text food description (name shadows the builtin,
                  kept for interface compatibility)
    :return: a Nutrition object, or None when nothing matched / on error
    """
    inputArray = input.split(" ")
    db = MySQLdb.connect(host=dbConfig["host"],
                         user=dbConfig["user"],
                         passwd=dbConfig["passwd"],
                         db=dbConfig["db"])
    try:
        cur = db.cursor()
        # Parameterized LIKE clauses replace the previous string-concatenated
        # query, which was open to SQL injection through `input`.
        clauses = " AND ".join(["Food_Name LIKE %s"] * len(inputArray))
        query = ("SELECT Food_Name, Protein, Fat, Carbohydrate, Total_Sugar "
                 "from nutrition_fact WHERE " + clauses)
        cur.execute(query, tuple("%" + word + "%" for word in inputArray))
        try:
            allMatches = cur.fetchall()
            bestMatch = None
            bestRatio = 0
            for match in allMatches:
                ratio = fuzz.partial_token_sort_ratio(match[0], input)
                if ratio > bestRatio:
                    # Bug fix: this read `ratio = bestRatio`, which discarded
                    # every score so bestMatch ended up as the LAST row
                    # instead of the best-scoring one.
                    bestRatio = ratio
                    bestMatch = match
            # If no rows matched, bestMatch is None and the constructor call
            # raises, which the except below converts to None — preserving
            # the original best-effort contract.
            return Nutrition(bestMatch[0], bestMatch[1], bestMatch[2],
                             bestMatch[3], bestMatch[4])
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate.
            return None
    finally:
        db.close()  # the original leaked the connection
def main(df, suf):
    """Append length, distinct-character, word-count, word-overlap and
    fuzzywuzzy similarity features (column names suffixed with `suf`)
    to df in place and return it."""
    def _chars(value):
        # number of distinct non-space characters
        return len(''.join(set(str(value).replace(' ', ''))))

    def _words(value):
        return len(str(value).split())

    df['len_q1' + suf] = df['q1'].apply(lambda v: len(str(v)))
    df['len_q2' + suf] = df['q2'].apply(lambda v: len(str(v)))
    df['diff_len' + suf] = df['len_q1' + suf] - df['len_q2' + suf]
    df['len_char_q1' + suf] = df['q1'].apply(_chars)
    df['len_char_q2' + suf] = df['q2'].apply(_chars)
    df['len_word_q1' + suf] = df['q1'].apply(_words)
    df['len_word_q2' + suf] = df['q2'].apply(_words)
    # count of lower-cased tokens shared by both questions
    df['common_words' + suf] = df.apply(
        lambda x: len(set(str(x['q1']).lower().split())
                      & set(str(x['q2']).lower().split())),
        axis=1)
    fuzz_columns = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in fuzz_columns:
        df[column + suf] = df.apply(
            lambda x, s=scorer: s(str(x['q1']), str(x['q2'])), axis=1)
    return df
def add_fuzz_features(self):
    """Append the seven fuzzywuzzy similarity columns to self.df, comparing
    the configured q1/q2 columns row by row."""
    column_scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_wratio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for name, scorer in column_scorers:
        # s=scorer pins the current scorer inside the lambda.
        self.df[name] = self.df.apply(
            lambda x, s=scorer: s(str(x[self.q1_column]),
                                  str(x[self.q2_column])),
            axis=1)
def fw_partial_token_sort_ratio(question1, question2):
    """Return an (n, 1) float array of partial_token_sort_ratio scores,
    scaled to [0, 1], for the zipped question pairs."""
    scores = [[fuzz.partial_token_sort_ratio(str(first), str(second)) / 100]
              for first, second in zip(question1, question2)]
    print("Created fuzz partial_token_sort_ratio feature")
    return np.array(scores)
def generate_feature(i):
    # Worker for partitioned feature generation: picks up chunk i from the
    # module-level `names` dict, adds the seven fuzzywuzzy query/title
    # similarity columns plus `diff_ratios`, then shrinks dtypes via
    # reduce_mem_usage before returning the chunk.
    print(str(i) + ' processor started !')
    data = names['data_' + str(i)]
    with timer('fuzzywuzzy'):
        data['fuzz_qratio'] = data.apply(
            lambda row: fuzz.QRatio(str(row['query']), str(row['title'])),
            axis=1)
        data['fuzz_WRatio'] = data.apply(
            lambda row: fuzz.WRatio(str(row['query']), str(row['title'])),
            axis=1)
        data['fuzz_partial_ratio'] = data.apply(
            lambda row: fuzz.partial_ratio(str(row['query']),
                                           str(row['title'])),
            axis=1)
        data['fuzz_partial_token_set_ratio'] = data.apply(
            lambda row: fuzz.partial_token_set_ratio(str(row['query']),
                                                     str(row['title'])),
            axis=1)
        data['fuzz_partial_token_sort_ratio'] = data.apply(
            lambda row: fuzz.partial_token_sort_ratio(str(row['query']),
                                                      str(row['title'])),
            axis=1)
        data['fuzz_token_set_ratio'] = data.apply(
            lambda row: fuzz.token_set_ratio(str(row['query']),
                                             str(row['title'])),
            axis=1)
        data['fuzz_token_sort_ratio'] = data.apply(
            lambda row: fuzz.token_sort_ratio(str(row['query']),
                                              str(row['title'])),
            axis=1)
        data['diff_ratios'] = data.apply(diff_ratios, axis=1)
    data = reduce_mem_usage(data)
    return data
def step3_full(d_old, d_new):
    # Merge d_new entries into d_old when both title and artist match.
    # Keys have the form "<title>#bh#<artist>". The title must
    # token-sort-match exactly (== 100); the artist matches either directly
    # (> 80) or after transliteration to Cyrillic (> 80). Matched new
    # entries are concatenated onto the old DataFrame and the d_new slot
    # is cleared (None).
    for i in d_new.items():
        a = i[0].split("#bh#")[1]  # artist part of the key
        t = i[0].split("#bh#")[0]  # title part of the key
        #d_c = d22.copy()
        #key_old = i[0]
        #d_c.pop(key_old, None)
        for k in d_old.items():
            if fuzz.token_sort_ratio(k[0].split("#bh#")[0], t) == 100:
                print(t)
                print(k[0].split("#bh#")[0])
                # compare the artists
                if fuzz.token_sort_ratio(
                        full_norm(a), full_norm(k[0].split("#bh#")[1])) > 80:
                    d_old[k[0]] = pd.concat([d_old[k[0]], i[1]])
                    # zero out the key in the new data
                    d_new[i[0]] = None
                    print("\nartists are close! matching these together")
                    print(k[0].split("#bh#")[1])
                    print(a, "\n")
                elif fuzz.partial_token_sort_ratio(
                        translit(full_norm(a), "ru"),
                        translit(full_norm(k[0].split("#bh#")[1]),
                                 "ru")) > 80:
                    d_old[k[0]] = pd.concat([d_old[k[0]], i[1]])
                    # zero out the key in the new data
                    d_new[i[0]] = None
                    print("\nartists are close! matching these together")
                    print(k[0].split("#bh#")[1])
                    print(a, "\n")
                else:
                    print("\nartists are too different:")
                    print(i[0])
                    print(k[0].split("#bh#")[1])
                    print(a, "\n")
    return (d_old, d_new)
def find_tweet_target(tweet_text: str) -> str:
    """
    Run tweet text through the companies database and return a string of
    the (company, brand) pairs whose tags fuzzy-match the tweet (score > 90).
    """
    matched_companies = []
    matched_brands = []
    for company, brand_dict in companies_db.items():
        for brand, tag_list in brand_dict.items():
            for tag in tag_list:
                if fuzz.partial_token_sort_ratio(tag, tweet_text) > 90:
                    matched_companies.append(company)
                    matched_brands.append(brand)
    # Bug fix: str(zip(...)) rendered "<zip object at 0x...>" — a useless
    # repr. Materialize the pairs so the returned string shows the matches.
    return str(list(zip(matched_companies, matched_brands)))
def step22(D):
    # Single-dict variant of step22_full: for every "feat." entry, look for
    # another entry with the same normalized title whose normalized artist
    # fuzzy-matches (> 80); concatenate their DataFrames in the copy d22
    # and clear the merged-from key (None). Returns the merged copy.
    d2 = D
    d22 = d2.copy()
    for i in d2.items():
        a = i[0].split("#bh#")[1]  # artist part of the key
        t = i[0].split("#bh#")[0]  # title part of the key
        if "feat." in a:
            print("Working on ", i[0])
            d_c = d2.copy()
            key_old = i[0]
            # don't compare the entry against itself
            d_c.pop(key_old, None)
            for k in d_c.items():
                if norm(t) == norm(k[0].split("#bh#")
                                   [0]):  # check that the title matches
                    a_a = full_norm(a)
                    if fuzz.partial_token_sort_ratio(
                            full_norm(k[0].split("#bh#")[1]), a_a) > 80:
                        print("\nsame artists. matching.")
                        print(a)
                        print(k[0].split("#bh#")[1], "\n")
                        d22[k[0]] = pd.concat(
                            [d22[k[0]], pd.concat([k[1], i[1]])])
                        # remove the old key from the dict
                        #d22.pop(i[0], None)
                        d22[i[0]] = None
                    else:
                        print("\nartists are too different")
                        print(a)
                        print(k[0].split("#bh#")[1], "\n")
            print()
    return d22
def pairFeatures(self, sentenceA, sentenceB): features = list() ## len features all, chars, word features.append( np.log(len(sentenceA)+1) ) features.append( np.log(len(sentenceB)+1) ) features.append( np.log(abs(len(sentenceA) - len(sentenceB))+1) ) features.append( np.log(len(''.join(set(sentenceA.replace(' ', ''))))+1 )) features.append( np.log(len(''.join(set(sentenceB.replace(' ', ''))))+1 )) features.append( np.log(len(sentenceA.split())+1 )) features.append( np.log(len(sentenceB.split())+1 )) features.append(np.log(sf.longestCommonsubstring(sentenceA, sentenceB)+1)) features.append(np.log(sf.longestCommonSubseq(sentenceA, sentenceB)+1)) ## token features features.append( len(set(sentenceA.lower().split()).intersection(set(sentenceB.lower().split()))) ) features.append( np.log(fuzz.QRatio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.WRatio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.partial_ratio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.partial_token_set_ratio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.partial_token_sort_ratio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.token_set_ratio(sentenceA, sentenceB)+1) ) features.append( np.log(fuzz.token_sort_ratio(sentenceA, sentenceB)+1) ) ## word semantic features for f in self.qs.pairFeatures(sentenceA, sentenceB, stemming = 0): features.append(f) for f in self.qs.pairFeatures(sentenceA, sentenceB, stemming = 1): features.append(f) return features
def author_quality_match(x, y, scorer=fuzz.token_set_ratio):
    """Score how well a Web-of-Science author string `x` matches a full
    name `y`.

    Both strings are unidecoded, lower-cased and stripped of punctuation,
    then `y` is reduced to several abbreviated variants (initials, dropped
    middle names, etc.) and each variant is compared to `x`. Returns a dict
    holding every variant, every score, and the best score under 'max'.
    """
    chk = {}
    chk['simple_wos'] = unidecode.unidecode(x).lower().replace(
        '.', '').replace(',', '').replace('-', ' ')
    chk['full_name'] = unidecode.unidecode(y).lower().replace('.', '').replace(
        ',', '').replace('-', ' ')
    # abbreviate trailing name parts to initials
    sn = re.sub('^(\w+\s+\w+\s+\w)\w+(\s+\w)\w+$', r'\1\2', chk['full_name'])
    chk['short_name'] = re.sub('^(\w+\s+\w+\s+\w)\w+$', r'\1', sn)
    # drop middle name(s)
    sn = re.sub('^(\w+\s+)\w+\s+(\w+)\s+\w+$', r'\1\2', chk['full_name'])
    chk['simple_name'] = re.sub('^(\w+\s+)\w+\s+(\w+)$', r'\1\2', sn)
    chk['simple_second_name'] = re.sub('^(\w+\s+)\w+\s+\w+\s+(\w+)$', r'\1\2',
                                       chk['full_name'])
    chk['last_name'] = re.sub('^(\w+\s+)\w+\s+(\w+\s+\w+)$', r'\1\2',
                              chk['full_name'])
    chk['last_names'] = re.sub('^(\w+\s+\w+\s+\w+)\s+\w+$', r'\1',
                               chk['full_name'])
    chk['second_name'] = re.sub('^(\w+\s+\w+\s+)\w+\s+(\w+)$', r'\1\2',
                                chk['full_name'])
    chk['s1'] = fuzz.token_sort_ratio(chk['simple_wos'], chk['full_name'])
    chk['s1b'] = fuzz.partial_token_sort_ratio(chk['simple_wos'],
                                               chk['full_name'])
    chk['s2'] = scorer(chk['simple_wos'], chk['short_name'])
    chk['s3'] = fuzz.ratio(chk['simple_wos'], chk['simple_name'])
    # Bug fix: this score previously overwrote chk['s3'], so the
    # simple_name comparison never contributed to 'max'. Keep both.
    chk['s3b'] = fuzz.ratio(chk['simple_wos'], chk['simple_second_name'])
    chk['s4'] = fuzz.token_sort_ratio(chk['simple_wos'], chk['last_name'])
    chk['s5'] = fuzz.token_sort_ratio(chk['simple_wos'], chk['last_names'])
    chk['s6'] = fuzz.token_sort_ratio(chk['simple_wos'], chk['second_name'])
    chk['max'] = max(chk['s1'], chk['s1b'], chk['s2'], chk['s3'], chk['s3b'],
                     chk['s4'], chk['s5'], chk['s6'])
    return chk
def _create_fuzzy_matches(x, y):
    """
    Create fuzzy matches.

    :param str x: first string to compare
    :param str y: second string to compare
    :return: dict results: dictionary of fuzzy match results
    """
    scorers = {
        "simple_ratio": fuzz.ratio,
        "token_sort_ratio": fuzz.token_sort_ratio,
        "token_set_ratio": fuzz.token_set_ratio,
        "partial_ratio": fuzz.partial_ratio,
        "partial_token_sort_ratio": fuzz.partial_token_sort_ratio,
        "partial_token_set_ratio": fuzz.partial_token_set_ratio,
    }
    results = {name: score(x, y) for name, score in scorers.items()}
    # average of the six individual scores
    results["ratio_average"] = mean(results.values())
    return results
def closest_user(member_string, guild: discord.Guild):
    """Resolve member_string to a guild member.

    A raw nickname mention ('<@!id>') is looked up directly; anything else
    returns the member whose display name best fuzzy-matches the string.
    NOTE(review): plain mentions ('<@id>', no '!') fall through to the
    fuzzy path — confirm whether they should also be parsed.
    """
    if member_string.startswith('<@!') and member_string.endswith('>'):
        return guild.get_member(int(member_string[3:-1]))
    # max() replaces sorting the whole member list just to take its last
    # element (O(n) instead of O(n log n)). Scanning in reverse preserves
    # the original tie-break of sorted(...)[-1]: the last-listed member
    # among equally-scored candidates wins.
    return max(
        reversed(list(guild.members)),
        key=lambda member: fuzz.partial_token_sort_ratio(
            member_string.lower(), member.display_name.lower()))
def extract_features(df):
    """Add six fuzzywuzzy question-pair similarity columns to df in place
    and return it."""
    feature_map = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in feature_map:
        # s=scorer binds the scorer per iteration (late-binding guard).
        df[column] = df.apply(
            lambda x, s=scorer: s(str(x['question1']), str(x['question2'])),
            axis=1)
    return df
def fuzzy_check(postTitle: str = "", postBody: str = ""):
    # Fuzzy-match the post title/body against the 'vote-manipulation' word
    # list; if the first pass is below threshold, retry with the
    # token-sort variant, then report whether the final score breaks the rule.
    #
    # NOTE(review): both fuzz calls receive a *list* of two strings as the
    # first argument, but fuzzywuzzy scorers expect strings — this likely
    # relies on implicit str() coercion of the list. Confirm whether the
    # title and body were meant to be joined into one string instead.
    ratio = fuzz.partial_ratio(
        [utils.full_process(postTitle), utils.full_process(postBody)],
        listOfWords.words['vote-manipulation'])
    print(ratio)
    if (not is_rule_breaking(ratio)):
        ratio = fuzz.partial_token_sort_ratio(
            [postTitle, postBody], listOfWords.words['vote-manipulation'])
        print(ratio)
    return is_rule_breaking(ratio)
def search_name(book_arg: str, bible: Biblia) -> int:
    """
    Return the index of the first book whose name fuzzy-matches book_arg
    (partial_token_sort_ratio >= 80).

    Falls through and returns None when nothing matches, so callers must
    handle a miss despite the `int` annotation.
    """
    nomes = index_books(bible)
    for indice, nome in enumerate(nomes):
        if fuzz.partial_token_sort_ratio(book_arg, nome) >= 80:
            # enumerate avoids the redundant O(n) nomes.index() rescan the
            # original did; for duplicate names the earlier duplicate would
            # already have matched, so the returned index is unchanged.
            return indice
def func_txtFeat12(s1, s2, lang='eng'):
    """
    :param s1: sentence1
    :param s2: sentence2
    :param lang: language of the sentences
    :return: Partial token sort ratio
    """
    score = fuzz.partial_token_sort_ratio(s1, s2)
    return float(score)
def fuzzy_ratio(sentencea, sentenceb):
    """Return the six fuzzywuzzy similarity scores for the sentence pair.

    Tuple order (kept from the original): ratio, partial_ratio,
    token_set_ratio, token_sort_ratio, partial_token_set_ratio,
    partial_token_sort_ratio.
    """
    pair = (sentencea, sentenceb)
    return (fuzz.ratio(*pair),
            fuzz.partial_ratio(*pair),
            fuzz.token_set_ratio(*pair),
            fuzz.token_sort_ratio(*pair),
            fuzz.partial_token_set_ratio(*pair),
            fuzz.partial_token_sort_ratio(*pair))
def strDistance(name1, name2):
    """Fuzzy similarity between two names.

    :return: 0 when either name is None; otherwise a tuple of
             (integer mean of the four scores, the raw four-score tuple).
             Note the asymmetric return types — callers must handle both.
    """
    # PEP 8: identity check with `is None` instead of `== None`, which
    # could be fooled by objects overriding __eq__.
    if name1 is None or name2 is None:
        return 0
    allr = (fuzz.partial_ratio(name1, name2),
            fuzz.UWRatio(name1, name2),
            fuzz.partial_token_set_ratio(name1, name2),
            fuzz.partial_token_sort_ratio(name1, name2))
    return (sum(allr) // len(allr), allr)
def transform(self, X, y = None):
    """Append the seven fuzzywuzzy question-pair similarity columns to X
    in place and return it (`y` is accepted for pipeline compatibility)."""
    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        X[column] = X.apply(
            lambda x, s=scorer: s(str(x['question1']), str(x['question2'])),
            axis=1)
    return X
def calculate_fuzzy_abbv_score(source_list, target_list):
    """Element-wise partial_token_sort_ratio between two parallel lists.

    Returns the list of scores; on any failure (e.g. target_list shorter
    than source_list) the error is printed and None is returned.
    """
    try:
        # Indexing target_list keeps the original IndexError behavior when
        # the lists have mismatched lengths.
        return [
            fuzz.partial_token_sort_ratio(source, target_list[idx])
            for idx, source in enumerate(source_list)
        ]
    except Exception as e:
        print("Error occured in calculating Abbv Fuzzy Score ::", e)
def get_overlap(vclaim, tweet):
    """Scaled fuzzy overlap between a verified claim and a tweet.

    The partial_token_sort_ratio (0-100) is rescaled to the 0-80 range.
    """
    claim_text = getCombined(vclaim)
    tweet_text = getCombined(tweet)
    score = fuzz.partial_token_sort_ratio(tweet_text, claim_text)
    return (score / 100) * 80
def fuzzy(s1, s2):
    """All eight fuzzywuzzy scores for (s1, s2), each scaled to [0, 1],
    in the original fixed order."""
    scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio,
               fuzz.partial_token_sort_ratio, fuzz.token_set_ratio,
               fuzz.partial_token_set_ratio, fuzz.QRatio, fuzz.WRatio)
    return [scorer(s1, s2) / 100 for scorer in scorers]
def extract_features(list):
    """Append six fuzzywuzzy scores to every row (mutated in place),
    comparing the first 13 fields of each row against field 13.

    NOTE: the parameter shadows the builtin `list`; kept so the call
    interface is unchanged.
    """
    for row in list:
        # Indices 0-13 are never modified by the appends below, so the
        # slice and element can be taken once per row.
        head, tail = row[:13], row[13]
        row.append(fuzz.QRatio(head, tail))
        row.append(fuzz.partial_ratio(head, tail))
        row.append(fuzz.partial_token_set_ratio(head, tail))
        row.append(fuzz.partial_token_sort_ratio(head, tail))
        row.append(fuzz.token_set_ratio(head, tail))
        row.append(fuzz.token_sort_ratio(head, tail))
    return list
def get_priority(key, str):
    """Best fuzzywuzzy score between `key` and `str` across six scorers.

    (The second parameter shadows the builtin `str`; name kept for
    interface compatibility.)
    """
    return max(
        fuzz.token_set_ratio(key, str),
        fuzz.token_sort_ratio(key, str),
        fuzz.ratio(key, str),
        fuzz.partial_ratio(key, str),
        fuzz.partial_token_sort_ratio(key, str),
        fuzz.partial_token_set_ratio(key, str),
    )
def find_chair(data, chairman):
    # Pick the row in `data` (chair name -> row) whose key best
    # fuzzy-matches `chairman`. Python 2 code: dict.values()[0] /
    # iterkeys / iteritems.
    # Only the part of `chairman` before the first comma is compared.
    chairman = chairman.split(',', 1)[0].lower()
    if len(data) == 1:
        return data.values()[0]
    else:
        chairs = {}
        for chair in data.iterkeys():
            chairs[chair] = fuzz.partial_token_sort_ratio(chair, chairman)
        # max over reversed (score, name) pairs: highest score wins, ties
        # broken lexicographically by name; [1] extracts the name.
        max_chair = tuple(max(item[::-1] for item in chairs.iteritems()))[1]
        return data[max_chair]
def post_title_extract(self,sel,response):
    # Extract the post title from a scraped page: clean the <title> text of
    # blog-name boilerplate, then scan h1/h2/h3 (and their <a> children)
    # for the candidate that best fuzzy-matches either the cleaned head
    # title or the URL slug. Returns (title, winning_xpath).
    # (Python 2 code: OrderedDict.iteritems.)
    title = None
    title_score = 0
    slug_score = 0
    title_xpath = None
    blog=self.get_domain(response.url)
    # last non-empty URL path segment, de-hyphenated, '.html' stripped
    slug = response.url.split('/')[-1] or response.url.split('/')[-2]
    slug = slug.replace('-',' ').rstrip('.html')
    head_title = sel.xpath('//title/text()').extract()
    head_title = head_title[0] if head_title else ''
    if '|' in head_title:
        # drop whichever side of the '|' looks like the blog name
        pos=[head_title.split('|')[0],head_title.split('|')[-1]]
        word = pos[0] if fuzz.partial_ratio(pos[0],blog)>fuzz.partial_ratio(pos[-1],blog) else pos[-1]
        head_title_clean = head_title.replace(word,'').replace('|','')
    else:
        head_title_clean = head_title
    # strip the three most common boilerplate words taken from feed titles
    text_to_remove = sel.xpath('//link[@rel="alternate"]/@title').extract()
    if text_to_remove and head_title:
        words = (' '.join(text_to_remove)+head_title).split()
        if Counter(words).most_common(3):
            for wor in Counter(words).most_common(3):
                head_title_clean = head_title_clean.replace(wor[0],'')
    [h1,h1a,h2,h2a,h3,h3a]=["//h1","//h1/a","//h2","//h2/a","//h3","//h3/a"]
    head_xpaths = [h1a,h1,h2a,h2,h3a,h3]
    title_lists = [sel.xpath(head+'//text()').extract() for head in head_xpaths]
    title_dict = OrderedDict(zip(head_xpaths,title_lists))
    for title_xpaths,title_list in title_dict.iteritems():
        if title_list:
            for titles in title_list:
                #to prevent from one word getting higher score
                if titles.count(' ')>0 or head_title_clean.count(' ')<1:
                    title_ratio = fuzz.partial_token_sort_ratio(titles,head_title_clean)
                    if title_ratio>title_score:
                        title_score = title_ratio
                        title = titles
                        title_xpath = title_xpaths
                    if title_score==100 and title.count(' ')>0:
                        break
                #slug_ratio to be added in case
                slug_ratio = fuzz.partial_ratio(titles.lower(),slug)
                if slug_ratio>80:
                    slug_score = slug_ratio
                    title = titles
                    title_xpath = title_xpaths
                    if slug_score==100:
                        break
            if slug_score==100:
                break
            if title_score==100:
                break
    # no confident heading match: fall back to the cleaned head title
    if title_score<51 and slug_score<81:
        title = head_title_clean
    return title,title_xpath
def join(testimony_file, house_file, senate_file, result_file):
    # Join testimony rows with House/Senate committee data and write the
    # combined CSV. For each testimony row the matching committee is found
    # by substring first, then by fuzzy match; the matching chair's columns
    # are copied in via the chamber's header mapping.
    # (Python 2 code: itervalues / iteritems / iterkeys.)
    t_reader = csv.DictReader(testimony_file)
    h_reader = csv.DictReader(house_file)
    s_reader = csv.DictReader(senate_file)
    # Get result headers
    mappings = {
        'house': get_mapping('house', h_reader),
        'senate': get_mapping('senate', s_reader)}
    result_headers = t_reader.fieldnames[:]
    result_headers.extend(mappings['house'].itervalues())
    result_headers.extend(mappings['senate'].itervalues())
    r_writer = csv.DictWriter(result_file, fieldnames=result_headers)
    r_writer.writeheader()
    # Structure data for easier access:
    # chamber -> congress -> committee name -> chair name -> row
    row_mapping = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(dict)))
    for chamber, reader in (('house', h_reader), ('senate', s_reader)):
        for row in reader:
            name = get_chair(row['Name'])
            row_mapping[chamber][row['Congress']][
                get_name(row, 'Committee Name')][name] = row
    # Match the committee
    for row in t_reader:
        chamber = row['chamber'].lower()
        if chamber:
            committee_name = get_name(row, 'committee')
            committees_data = row_mapping[chamber][row['congress']]
            if committee_name and committees_data:
                # First try to match a substring in committee name
                for committee, data in committees_data.iteritems():
                    if committee in committee_name:
                        chair_data = find_chair(data, row['committee_chairman'])
                        if chair_data:
                            for key, value in chair_data.iteritems():
                                row[mappings[chamber][key]] = value
                        break
                else:
                    # Otherwise, try fuzzy match (for/else: no break above)
                    ratios = {}
                    for committee, data in committees_data.iteritems():
                        ratio = fuzz.partial_token_sort_ratio(committee_name, committee)
                        ratios[ratio] = data
                    max_ratio = max(ratios.iterkeys())
                    if max_ratio > THRESHOLD_MAX_RATIO:
                        data = ratios[max_ratio]
                        chair_data = find_chair(data, row['committee_chairman'])
                        for key, value in chair_data.iteritems():
                            row[mappings[chamber][key]] = value
        # every testimony row is written, matched or not
        r_writer.writerow(row)
def addScoreDictionary(self, second, first):
    """Build a dict of name-similarity scores between `first` and `second`.

    Note the reversed parameter order (second, first) — kept for interface
    compatibility; comparisons run as (first, second), as before.
    """
    return {
        'tokenset': self.fuzzyNameMatch(first, second),
        'fuzzratio': fuzz.ratio(first, second),
        'partialsort': fuzz.partial_token_sort_ratio(first, second),
    }
def testPartialTokenSortRatio(self):
    # partial_token_sort_ratio should hit 100 on the reordered-token
    # fixtures — with and without full_process preprocessing — and score
    # 50 on the deliberately dissimilar s10/s10a pair.
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s8, self.s8a, full_process=False), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=True), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=False), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 50)
def compare_output(baseline, current):
    """Similarity (0-100) between baseline and current output using the
    scorer selected by DEFAULT_ALGORITHM; exits the process on an
    unknown algorithm name."""
    algorithms = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort_ratio': fuzz.token_sort_ratio,
        'partial_token_sort_ratio': fuzz.partial_token_sort_ratio,
        'token_set_ratio': fuzz.token_set_ratio,
    }
    scorer = algorithms.get(DEFAULT_ALGORITHM)
    if scorer is None:
        print("Unknown similarity measure " + DEFAULT_ALGORITHM + ". Aborting")
        sys.exit(-1)
    return scorer(baseline, current)
def testPartialTokenSortRatio(self):
    # partial_token_sort_ratio should score 100 on both
    # reordered-token fixture pairs.
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a), 100)
    self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5), 100)
def partialTokenMatch(self, name1,name2):
    """Return fuzzywuzzy's partial token sort ratio for the two names."""
    return fuzz.partial_token_sort_ratio(name1, name2)
def MatchGooglePlus(self, facebookProfile, googlePlusProfile):
    """Decide whether a Facebook and a Google+ profile belong to the same person.

    Decision order:
      1. Any shared e-mail address  -> True.
      2. Any shared screen name     -> True.
      3. Otherwise, when all six attributes (full name, gender, birthday,
         current city, hometown, address) are present in BOTH profiles,
         a weighted fuzzy-similarity average >= 0.7 -> True.
    Anything else, including any unexpected exception, -> False.

    :param facebookProfile: raw Facebook profile (simplified via
        self.SimplifyFacebookProfile).
    :param googlePlusProfile: raw Google+ profile (simplified via
        self.SimplifyGooglePlusProfile).
    :returns: bool — True when the profiles are judged to match.
    """
    try:
        if facebookProfile is None or googlePlusProfile is None:
            return False
        simplifiedFacebookProfile = self.SimplifyFacebookProfile(facebookProfile)
        simplifiedGooglePlusProfile = self.SimplifyGooglePlusProfile(googlePlusProfile)

        # --- 1) E-mail match: a single shared address is decisive. ---
        facebookEmailsStr = simplifiedFacebookProfile.get("emails")
        facebookEmails = []
        if facebookEmailsStr is not None:
            facebookEmails = facebookEmailsStr.split(",")
        # Normalise each address: drop all spaces and angle brackets.
        tempFacebookEmails = []
        for eachFBEmail in facebookEmails:
            if eachFBEmail is not None:
                while eachFBEmail.find(" ") != -1:
                    eachFBEmail = eachFBEmail.replace(" ", "")
                eachFBEmail = eachFBEmail.replace("<", "")
                eachFBEmail = eachFBEmail.replace(">", "")
                tempFacebookEmails.append(eachFBEmail)
        facebookEmails = tempFacebookEmails
        # Fold in the single "email" field as well.
        if simplifiedFacebookProfile.get("email") is not None:
            if facebookEmails is not None and type(facebookEmails) is list:
                if simplifiedFacebookProfile.get("email") not in facebookEmails:
                    facebookEmails.append(simplifiedFacebookProfile.get("email"))
            else:
                facebookEmails = [simplifiedFacebookProfile.get("email")]
        # Presumably a list of addresses on the Google+ side — TODO confirm
        # against SimplifyGooglePlusProfile.
        googlePlusEmails = simplifiedGooglePlusProfile.get("email")
        emailsIntersection = set(facebookEmails) & set(googlePlusEmails)
        if emailsIntersection:
            return True

        # --- 2) Screen-name match: any shared (service, name) pair is decisive. ---
        facebookScreenNames = simplifiedFacebookProfile.get("screen_names")
        googlePlusScreenNames = simplifiedGooglePlusProfile.get("screen_names")
        if facebookScreenNames is not None and googlePlusScreenNames is not None:
            screennamesIntersection = set(facebookScreenNames.items()) & set(googlePlusScreenNames.items())
            if screennamesIntersection:
                return True

        # --- 3) Weighted fuzzy comparison of profile attributes. ---
        weights = {"name": 0.4, "birthday": 0.5, "living_info_current": 0.6,
                   "living_info_permanent": 0.7, "gender": 0.4, "address": 0.7}
        scoreThreshold = 0.7
        # Combine a [0,1] similarity with its weight into a single score.
        new_score = lambda a, b: (2 * a * b) / (1 + (a * b))
        scoresList = []

        def _present_in_both(key):
            # True when both simplified profiles carry a non-empty value for key.
            return (simplifiedFacebookProfile.get(key) is not None
                    and simplifiedGooglePlusProfile.get(key) is not None
                    and simplifiedFacebookProfile.get(key) != ""
                    and simplifiedGooglePlusProfile.get(key) != "")

        fullNameIsFoundInBothProfile = _present_in_both("full_name")
        genderIsFoundInBothProfile = _present_in_both("gender")
        birthdayIsFoundInBothProfile = _present_in_both("birthday")
        currentCityIsFoundInBothProfile = _present_in_both("current_city")
        # BUGFIX: the original checked .get("homwtown") (typo) on the Google+
        # side, so its non-empty test compared None != "" and always passed.
        hometownIsFoundInBothProfile = _present_in_both("hometown")
        addressIsFoundInBothProfile = _present_in_both("address")

        mobilePhonesFacebookStr = simplifiedFacebookProfile.get("mobile_phones")
        mobilePhonesFacebook = []
        if mobilePhonesFacebookStr is not None:
            mobilePhonesFacebook = mobilePhonesFacebookStr.split(",")
        mobilePhonesGooglePlus = simplifiedGooglePlusProfile.get("mobile_phones")
        # Computed but never used below — kept for parity with the original.
        mobileIsFoundInBothProfile = (mobilePhonesFacebookStr is not None
                                      and mobilePhonesFacebookStr != ""
                                      and mobilePhonesGooglePlus is not None
                                      and len(mobilePhonesGooglePlus) > 0)

        if (fullNameIsFoundInBothProfile and genderIsFoundInBothProfile
                and birthdayIsFoundInBothProfile and currentCityIsFoundInBothProfile
                and hometownIsFoundInBothProfile and addressIsFoundInBothProfile):
            # Score each attribute pair with partial_token_sort_ratio (0-100),
            # rescale to [0,1] and fold in the attribute weight.
            for key, weightKey in (("full_name", "name"),
                                   ("gender", "gender"),
                                   ("birthday", "birthday"),
                                   ("current_city", "living_info_current"),
                                   ("hometown", "living_info_permanent"),
                                   ("address", "address")):
                similarity = fuzz.partial_token_sort_ratio(
                    simplifiedFacebookProfile.get(key),
                    simplifiedGooglePlusProfile.get(key))
                scoresList.append(new_score(float(similarity) / 100, weights.get(weightKey)))

        if len(scoresList) > 0:
            # sum() replaces the Py2-only bare reduce(); same arithmetic.
            averageScore = float(sum(scoresList)) / len(scoresList)
            if averageScore >= scoreThreshold:
                print(averageScore)
                return True
            return False
        return False
    except Exception:
        # Original behaviour: any unexpected failure means "no match".
        return False
# Feature engineering for the Quora question-pair data: drop identifier
# columns, then derive length, character/word-overlap, fuzzy-similarity and
# word-mover-distance features for each question pair.
data = data.drop(['id', 'qid1', 'qid2'], axis=1)

# Raw character length of each question and their signed difference.
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2

# Number of *distinct* non-space characters per question.
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))

# Word counts and the number of lower-cased words shared by both questions.
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

# fuzzywuzzy similarity scores (0-100) under several matching strategies;
# str() guards against NaN entries in the question columns.
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Word2vec embeddings for word-mover distance. The model is loaded twice,
# presumably because init_sims(replace=True) normalises the vectors in place,
# so the raw model would otherwise be lost — TODO confirm.
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
norm_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'],
                                                 x['question2']), axis=1)

# Pre-allocate one 300-dim embedding row per question pair.
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0