Exemplo n.º 1
0
def fuzzy_feats(train_in,
                test_in,
                qcolumns=('question1', 'question2'),
                append=''):
    """Compute four fuzzywuzzy similarity features for a train/test pair.

    Both inputs are copied and reduced to ``qcolumns`` before the feature
    columns are added, so the caller's frames are not modified.

    Args:
        train_in: DataFrame holding the text columns named in ``qcolumns``.
        test_in: DataFrame with the same text columns.
        qcolumns: Column names to keep; the first two are the texts that
            get compared.
        append: Suffix added to every generated column name.

    Returns:
        ``(train, test)`` tuple of new DataFrames containing ``qcolumns``
        plus ``fuzz_r``, ``fuzz_pr``, ``fuzz_tsr`` and ``fuzz_tsor``
        (each with ``append`` as suffix).
    """
    from fuzzywuzzy import fuzz

    # ``.loc`` treats a tuple as a single (multi-level) key, so always
    # hand it a list regardless of what the caller passed.
    selected = list(qcolumns)
    first_col, second_col = selected[0], selected[1]

    # Column prefix -> scorer. Note that 'tsr' / 'tsor' use the *partial*
    # token set / token sort variants.
    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    def _with_features(frame_in):
        """Return a copy of ``frame_in`` with the four score columns added."""
        frame = frame_in.copy().loc[:, selected]
        for prefix, scorer in scorers:
            frame[prefix + append] = frame.apply(
                lambda row, scorer=scorer: scorer(row[first_col],
                                                  row[second_col]),
                axis=1)
        return frame

    return (_with_features(train_in), _with_features(test_in))
Exemplo n.º 2
0
def compute_features(train_df, test_df):
    """Add fuzzywuzzy score columns to both frames and rate each on train.

    For every scorer, the score of question1 vs. question2 is written into
    ``train_df`` and ``test_df`` (in place) under the column named by the
    matching ``Fields`` attribute, then ``compute_quality`` is evaluated on
    the train column.

    Returns:
        dict mapping ``quality_<feature>`` to the value returned by
        ``compute_quality`` for that feature.
    """
    # (result key, target column, scorer) in the order features are built.
    feature_table = (
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio,
         fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio,
         fuzz.token_sort_ratio),
    )

    quality = {}
    for quality_key, column, scorer in feature_table:
        train_df[column] = train_df.apply(
            lambda row: scorer(str(row[FieldsTrain.question1]),
                               str(row[FieldsTrain.question2])),
            axis=1)
        test_df[column] = test_df.apply(
            lambda row: scorer(str(row[FieldsTest.question1]),
                               str(row[FieldsTest.question2])),
            axis=1)
        # Quality is measured on the training frame only.
        quality[quality_key] = compute_quality(train_df, column)

    return quality
Exemplo n.º 3
0
def Feature_set2(corpus_raw):
    """Add seven fuzzywuzzy score columns to ``corpus_raw`` in place.

    Each column holds the score of ``str(question1)`` against
    ``str(question2)`` for one scorer. Nothing is returned.
    """
    column_scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_partialratio', fuzz.partial_ratio),
        ('fuzz_partial_token_setratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sortratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_setratio', fuzz.token_set_ratio),
        ('fuzz_token_sortratio', fuzz.token_sort_ratio),
        ('fuzz_wratio', fuzz.WRatio),
    )
    for column, scorer in column_scorers:
        corpus_raw[column] = corpus_raw.apply(
            lambda row: scorer(str(row['question1']), str(row['question2'])),
            axis=1)
Exemplo n.º 4
0
def banks_count(banks, params):
    """Match known exchangers against bank quotes and report the result.

    Every name in the module-level ``spisok`` list is fuzzy-compared with
    the ``"bank"`` field of each entry in ``banks``. For a score of 90 or
    more, an HTML line with the sell/buy rates, the link and the phone
    number (taken from the parallel ``links`` / ``phones`` lists) is built.

    Args:
        banks: Sequence of mappings with ``"bank"``, ``"sell"`` and
            ``"buy"`` keys.
        params: ``"text"`` to get the formatted listing, ``"distance"`` to
            get ``banks`` back unchanged.

    Returns:
        The newline-joined listing for ``"text"``, ``banks`` for
        ``"distance"``, otherwise ``None``.
    """
    spisok_rate = []
    for j, name in enumerate(spisok):
        for bank in banks:
            # Build the list of exchangers from fuzzy name matches.
            if fuzz.partial_token_sort_ratio(name, bank["bank"]) < 90:
                continue
            word = "<b>%s</b> / <b>%s</b> <a href='%s'>%s</a>\nТелефон: %s" % (
                bank["sell"], bank["buy"], links[j], name, phones[j])
            if word not in spisok_rate:
                spisok_rate.append(word)
                # First new line for this exchanger wins; move to the next.
                break
    text = "\n".join(delete_copy(spisok_rate))
    if params == "text":
        return text
    elif params == "distance":
        return banks
    return None
def step22_full(d_old, d_new):
    """Merge "feat." entries of ``d_new`` into matching ``d_old`` entries.

    Keys have the form ``<title>#bh#<artist>``. For each new key whose
    artist part contains ``"feat."``, every old key with the same
    normalised title is checked; when the fully normalised artists score
    above 80, the new value is concatenated onto the old one and the new
    key's value is set to ``None``.

    Returns:
        ``(d_old, d_new)`` - the same (mutated) dictionaries.
    """
    for new_key, new_value in d_new.items():
        new_parts = new_key.split("#bh#")
        a = new_parts[1]
        t = new_parts[0]
        if "feat." not in a:
            continue

        print("Working on ", new_key)
        for old_key in d_old:
            old_parts = old_key.split("#bh#")
            # Check that the title matches.
            if norm(t) == norm(old_parts[0]):
                a_a = full_norm(a)
                old_artist = old_parts[1]
                score = fuzz.partial_token_sort_ratio(full_norm(old_artist),
                                                      a_a)
                if score > 80:
                    print("\nsame artists. matching.")
                    print(a)
                    print(old_artist, "\n")
                    d_old[old_key] = pd.concat([d_old[old_key], new_value])
                    # Reset the key in the new data.
                    d_new[new_key] = None
                else:
                    print("\nartists are too different")
                    print(a)
                    print(old_artist, "\n")

        print()
    return (d_old, d_new)
def generate_fuzz(infile, outfile):
    """Read sentence pairs from ``infile`` and write fuzz features as CSV.

    ``infile`` is read as a header-less, tab-separated file with columns
    ``index, sen1, sen2, label``. One column per fuzzywuzzy scorer is
    computed from ``str(sen1)`` vs. ``str(sen2)`` and written to
    ``outfile`` without the index. The elapsed time is printed.
    """
    start = datetime.now()
    print('generate fuzz feat,data path is', infile)
    column_names = ['index', 'sen1', 'sen2', 'label']
    df_data = pd.read_csv(infile, sep='\t', header=None, names=column_names)

    # Output column -> scorer, in output column order.
    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    df_feat = pd.DataFrame()
    for column, scorer in scorers:
        df_feat[column] = df_data.apply(
            lambda row: scorer(str(row['sen1']), str(row['sen2'])), axis=1)

    df_feat.to_csv(outfile, index=False)
    end = datetime.now()
    print('times:', end - start)
Exemplo n.º 7
0
def getNutritionValue(input):
    """Look up the nutrition record whose name best matches ``input``.

    The input is split on single spaces and rows of ``nutrition_fact``
    whose ``Food_Name`` contains every word are fetched. Among those, the
    row with the highest ``partial_token_sort_ratio`` against the full
    input (and a score above 0) is returned.

    Args:
        input: Free-text food description.

    Returns:
        A ``Nutrition`` built from (Food_Name, Protein, Fat, Carbohydrate,
        Total_Sugar) of the best row, or ``None`` when nothing matches.
    """
    inputArray = input.split(" ")

    # One LIKE clause per word. The values are bound by the driver instead
    # of being concatenated into the SQL text, which closes the injection
    # hole the string-built query had.
    where_clause = " AND ".join(["Food_Name LIKE %s"] * len(inputArray))
    query = ("SELECT Food_Name, Protein, Fat, Carbohydrate, Total_Sugar "
             "from nutrition_fact WHERE " + where_clause)
    patterns = ["%" + word + "%" for word in inputArray]

    db = MySQLdb.connect(host=dbConfig["host"],
                         user=dbConfig["user"],
                         passwd=dbConfig["passwd"],
                         db=dbConfig["db"])
    try:
        cur = db.cursor()
        try:
            cur.execute(query, patterns)
            allMatches = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query fails.
        db.close()

    bestMatch = None
    bestRatio = 0
    for match in allMatches:
        ratio = fuzz.partial_token_sort_ratio(match[0], input)
        if ratio > bestRatio:
            # Remember the new high score (the assignment used to be
            # reversed, so the best score was never tracked).
            bestRatio = ratio
            bestMatch = match

    if bestMatch is None:
        return None

    return Nutrition(bestMatch[0], bestMatch[1], bestMatch[2], bestMatch[3],
                     bestMatch[4])
Exemplo n.º 8
0
def main(df, suf):
    """Add length, word-overlap and fuzzywuzzy features for ``q1``/``q2``.

    All new columns are written into ``df`` in place, each name carrying
    ``suf`` as suffix, and ``df`` is returned.
    """
    def _distinct_chars(value):
        # Number of distinct non-space characters.
        return len(set(str(value).replace(' ', '')))

    def _shared_words(row):
        # Number of distinct lower-cased words present in both questions.
        left = set(str(row['q1']).lower().split())
        right = set(str(row['q2']).lower().split())
        return len(left.intersection(right))

    questions = ('q1', 'q2')

    for q in questions:
        df['len_' + q + suf] = df[q].apply(lambda value: len(str(value)))
    df['diff_len' + suf] = df['len_q1' + suf] - df['len_q2' + suf]
    for q in questions:
        df['len_char_' + q + suf] = df[q].apply(_distinct_chars)
    for q in questions:
        df['len_word_' + q + suf] = df[q].apply(
            lambda value: len(str(value).split()))
    df['common_words' + suf] = df.apply(_shared_words, axis=1)

    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        df[column + suf] = df.apply(
            lambda row: scorer(str(row['q1']), str(row['q2'])), axis=1)

    return df
Exemplo n.º 9
0
 def add_fuzz_features(self):
     """Add one fuzzywuzzy score column per scorer to ``self.df``.

     Scores compare ``str`` values of the columns named by
     ``self.q1_column`` and ``self.q2_column``. Nothing is returned.
     """
     scorers = (
         ('fuzz_qratio', fuzz.QRatio),
         ('fuzz_wratio', fuzz.WRatio),
         ('fuzz_partial_ratio', fuzz.partial_ratio),
         ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
         ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
         ('fuzz_token_set_ratio', fuzz.token_set_ratio),
         ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
     )
     for column, scorer in scorers:
         self.df[column] = self.df.apply(
             lambda row: scorer(str(row[self.q1_column]),
                                str(row[self.q2_column])),
             axis=1)
Exemplo n.º 10
0
def fw_partial_token_sort_ratio(question1, question2):
    """Score paired questions with ``partial_token_sort_ratio`` / 100.

    Returns a NumPy array built from one single-element row per pair.
    """
    rows = [[fuzz.partial_token_sort_ratio(str(left), str(right)) / 100]
            for left, right in zip(question1, question2)]
    print("Created fuzz partial_token_sort_ratio feature")
    return np.array(rows)
Exemplo n.º 11
0
def generate_feature(i):
    """Compute fuzzywuzzy features for the data chunk numbered ``i``.

    The frame is looked up as ``names['data_<i>']``. Score columns for
    ``query`` vs. ``title`` and a ``diff_ratios`` column are added, then
    the frame is passed through ``reduce_mem_usage`` and returned.
    """
    print(str(i) + ' processor started !')
    data = names['data_' + str(i)]

    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )

    with timer('fuzzywuzzy'):
        for column, scorer in scorers:
            data[column] = data.apply(
                lambda row: scorer(str(row['query']), str(row['title'])),
                axis=1)
        # diff_ratios runs after the score columns have been added.
        data['diff_ratios'] = data.apply(diff_ratios, axis=1)
        data = reduce_mem_usage(data)

    return data
Exemplo n.º 12
0
def fw_partial_token_sort_ratio(question1, question2):
    """Return ``partial_token_sort_ratio`` / 100 for each question pair.

    The result is a NumPy array with one single-element row per pair.
    """
    def _scaled_score(first, second):
        return fuzz.partial_token_sort_ratio(str(first), str(second)) / 100

    scores = [[_scaled_score(first, second)]
              for first, second in zip(question1, question2)]
    print("Created fuzz partial_token_sort_ratio feature")
    return np.array(scores)
Exemplo n.º 13
0
def step3_full(d_old, d_new):
    """Merge ``d_new`` entries into ``d_old`` entries with the same title.

    Keys have the form ``<title>#bh#<artist>``. When an old and a new
    title score exactly 100 on ``token_sort_ratio``, the artists are
    compared: first fully normalised, then additionally transliterated
    with ``translit(..., "ru")``. A score above 80 on either comparison
    concatenates the new value onto the old one and sets the new key's
    value to ``None``.

    Returns:
        ``(d_old, d_new)`` - the same (mutated) dictionaries.
    """
    for new_key, new_value in d_new.items():
        new_parts = new_key.split("#bh#")
        a = new_parts[1]
        t = new_parts[0]
        for old_key in d_old:
            old_title = old_key.split("#bh#")[0]
            if fuzz.token_sort_ratio(old_title, t) != 100:
                continue

            print(t)
            print(old_title)
            old_artist = old_key.split("#bh#")[1]
            # Compare the artists; the transliterated comparison is only
            # evaluated when the plain one does not pass (short-circuit).
            artists_match = (
                fuzz.token_sort_ratio(full_norm(a),
                                      full_norm(old_artist)) > 80
                or fuzz.partial_token_sort_ratio(
                    translit(full_norm(a), "ru"),
                    translit(full_norm(old_artist), "ru")) > 80)
            if artists_match:
                d_old[old_key] = pd.concat([d_old[old_key], new_value])
                # Reset the key in the new data.
                d_new[new_key] = None
                print("\nartists are close! matching these together")
                print(old_artist)
                print(a, "\n")
            else:
                print("\nartists are too different:")
                print(new_key)
                print(old_artist)
                print(a, "\n")
    return (d_old, d_new)
Exemplo n.º 14
0
def find_tweet_target(tweet_text: str) -> str:
    """Return the companies and brands whose tags match ``tweet_text``.

    Every tag in the module-level ``companies_db`` (company -> brand ->
    list of tags) is compared with the tweet using
    ``partial_token_sort_ratio``; a score above 90 records one
    ``(company, brand)`` pair per matching tag.

    Returns:
        The ``str()`` of the list of ``(company, brand)`` tuples, e.g.
        ``"[('Acme', 'Rocket')]"``, or ``"[]"`` when nothing matches.
    """
    matches = []
    for company, brand_dict in companies_db.items():
        for brand, tag_list in brand_dict.items():
            for tag in tag_list:
                if fuzz.partial_token_sort_ratio(tag, tweet_text) > 90:
                    matches.append((company, brand))

    # str(zip(...)) only yields "<zip object at 0x...>" on Python 3, so the
    # pairs are materialised as a list before being stringified.
    return str(matches)
Exemplo n.º 15
0
def step22(D):
    """Fold "feat." entries of ``D`` into same-title entries of a copy.

    Keys have the form ``<title>#bh#<artist>``. For each key whose artist
    part contains ``"feat."``, every *other* key with the same normalised
    title is checked; when the fully normalised artists score above 80,
    the values are concatenated under the other key and the "feat." key's
    value is set to ``None``. ``D`` itself is not modified.

    Returns:
        The updated shallow copy of ``D``.
    """
    d2 = D
    d22 = d2.copy()

    for feat_key, feat_value in d2.items():
        feat_parts = feat_key.split("#bh#")
        a = feat_parts[1]
        t = feat_parts[0]
        if "feat." not in a:
            continue

        print("Working on ", feat_key)
        # Every entry except the one currently being processed.
        others = {key: value for key, value in d2.items() if key != feat_key}
        for other_key, other_value in others.items():
            other_parts = other_key.split("#bh#")
            # Check that the title matches.
            if norm(t) == norm(other_parts[0]):
                a_a = full_norm(a)
                other_artist = other_parts[1]
                score = fuzz.partial_token_sort_ratio(
                    full_norm(other_artist), a_a)
                if score > 80:
                    print("\nsame artists. matching.")
                    print(a)
                    print(other_artist, "\n")
                    # NOTE(review): other_value is the original value of
                    # other_key, so it looks like it gets concatenated a
                    # second time on top of d22[other_key] - confirm this
                    # duplication is intended.
                    d22[other_key] = pd.concat(
                        [d22[other_key],
                         pd.concat([other_value, feat_value])])
                    # The old key is kept in the dictionary but emptied.
                    d22[feat_key] = None
                else:
                    print("\nartists are too different")
                    print(a)
                    print(other_artist, "\n")
        print()
    return d22
Exemplo n.º 16
0
    def pairFeatures(self, sentenceA, sentenceB):
        """Build the feature list for a pair of sentences.

        The list holds, in order: log-scaled length features, log-scaled
        longest common substring / subsequence values from ``sf``, the raw
        count of shared lower-cased words, log-scaled fuzzywuzzy scores,
        and the features from ``self.qs.pairFeatures`` with ``stemming``
        0 and then 1.
        """
        def log1(value):
            # log(value + 1), used for every log-scaled feature.
            return np.log(value + 1)

        def distinct_chars(sentence):
            return len(''.join(set(sentence.replace(' ', ''))))

        ## len features all, chars, word
        features = [
            log1(len(sentenceA)),
            log1(len(sentenceB)),
            log1(abs(len(sentenceA) - len(sentenceB))),
            log1(distinct_chars(sentenceA)),
            log1(distinct_chars(sentenceB)),
            log1(len(sentenceA.split())),
            log1(len(sentenceB.split())),
            log1(sf.longestCommonsubstring(sentenceA, sentenceB)),
            log1(sf.longestCommonSubseq(sentenceA, sentenceB)),
        ]

        ## token features
        words_a = set(sentenceA.lower().split())
        words_b = set(sentenceB.lower().split())
        # The shared-word count is the only feature that is not log-scaled.
        features.append(len(words_a.intersection(words_b)))
        fuzz_scorers = (
            fuzz.QRatio,
            fuzz.WRatio,
            fuzz.partial_ratio,
            fuzz.partial_token_set_ratio,
            fuzz.partial_token_sort_ratio,
            fuzz.token_set_ratio,
            fuzz.token_sort_ratio,
        )
        for scorer in fuzz_scorers:
            features.append(log1(scorer(sentenceA, sentenceB)))

        ## word semantic features
        features.extend(self.qs.pairFeatures(sentenceA, sentenceB, stemming = 0))
        features.extend(self.qs.pairFeatures(sentenceA, sentenceB, stemming = 1))

        return features
Exemplo n.º 17
0
def author_quality_match(x, y, scorer=fuzz.token_set_ratio):
    """Score how well author string ``x`` matches the full name ``y``.

    Both strings are ASCII-folded, lower-cased, stripped of ``.`` and
    ``,`` and have ``-`` replaced by a space. Several reduced variants of
    the full name are derived with regular expressions (they only change
    names of three or four words) and each is compared with ``x``.

    Args:
        x: Author name to check (stored as ``simple_wos``).
        y: Reference full name (stored as ``full_name``).
        scorer: Function used for the ``short_name`` comparison (``s2``).

    Returns:
        dict with the normalised strings, the name variants, the
        individual scores ``s1``, ``s1b``, ``s2``, ``s3``, ``s3b``,
        ``s4``, ``s5``, ``s6`` and their maximum under ``max``.
    """
    def _simplify(name):
        """ASCII-fold, lower-case and strip punctuation from ``name``."""
        return unidecode.unidecode(name).lower().replace('.', '').replace(
            ',', '').replace('-', ' ')

    chk = {}
    chk['simple_wos'] = _simplify(x)
    chk['full_name'] = _simplify(y)
    full_name = chk['full_name']

    # Raw strings keep the patterns identical while avoiding invalid
    # escape sequences such as "\w" in ordinary string literals.
    # w1 w2 w3 w4 -> "w1 w2 <w3 initial> <w4 initial>"; w1 w2 w3 -> "w1 w2 <w3 initial>"
    sn = re.sub(r'^(\w+\s+\w+\s+\w)\w+(\s+\w)\w+$', r'\1\2', full_name)
    chk['short_name'] = re.sub(r'^(\w+\s+\w+\s+\w)\w+$', r'\1', sn)
    # first and third word of a three- or four-word name
    sn = re.sub(r'^(\w+\s+)\w+\s+(\w+)\s+\w+$', r'\1\2', full_name)
    chk['simple_name'] = re.sub(r'^(\w+\s+)\w+\s+(\w+)$', r'\1\2', sn)
    # first and fourth word of a four-word name
    chk['simple_second_name'] = re.sub(r'^(\w+\s+)\w+\s+\w+\s+(\w+)$',
                                       r'\1\2', full_name)
    # four-word name without its second word
    chk['last_name'] = re.sub(r'^(\w+\s+)\w+\s+(\w+\s+\w+)$', r'\1\2',
                              full_name)
    # four-word name without its fourth word
    chk['last_names'] = re.sub(r'^(\w+\s+\w+\s+\w+)\s+\w+$', r'\1',
                               full_name)
    # four-word name without its third word
    chk['second_name'] = re.sub(r'^(\w+\s+\w+\s+)\w+\s+(\w+)$', r'\1\2',
                                full_name)

    simple_wos = chk['simple_wos']
    chk['s1'] = fuzz.token_sort_ratio(simple_wos, full_name)
    chk['s1b'] = fuzz.partial_token_sort_ratio(simple_wos, full_name)
    chk['s2'] = scorer(simple_wos, chk['short_name'])
    chk['s3'] = fuzz.ratio(simple_wos, chk['simple_name'])
    # This score used to be stored under 's3' as well, which silently
    # discarded the simple_name score; it now has its own key.
    chk['s3b'] = fuzz.ratio(simple_wos, chk['simple_second_name'])
    chk['s4'] = fuzz.token_sort_ratio(simple_wos, chk['last_name'])
    chk['s5'] = fuzz.token_sort_ratio(simple_wos, chk['last_names'])
    chk['s6'] = fuzz.token_sort_ratio(simple_wos, chk['second_name'])

    chk['max'] = max(chk['s1'], chk['s1b'], chk['s2'], chk['s3'],
                     chk['s3b'], chk['s4'], chk['s5'], chk['s6'])

    return chk
Exemplo n.º 18
0
def _create_fuzzy_matches(x, y):
    """
    Score two strings with six fuzzywuzzy ratios and their average.

    :param str x: first string to compare
    :param str y: second string to compare
    :return: dict results: one entry per ratio plus ``ratio_average``
    """
    labelled_scorers = (
        ("simple_ratio", fuzz.ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("token_set_ratio", fuzz.token_set_ratio),
        ("partial_ratio", fuzz.partial_ratio),
        ("partial_token_sort_ratio", fuzz.partial_token_sort_ratio),
        ("partial_token_set_ratio", fuzz.partial_token_set_ratio),
    )
    scored = [(label, scorer(x, y)) for label, scorer in labelled_scorers]

    results = dict(scored)
    # The average covers the six individual ratios only.
    results["ratio_average"] = mean([score for _, score in scored])

    return results
Exemplo n.º 19
0
def closest_user(member_string, guild: discord.Guild):
    """Resolve ``member_string`` to a member of ``guild``.

    A user mention (``<@id>`` or the nickname form ``<@!id>``) is looked
    up by id with ``guild.get_member``. Anything else is fuzzy-matched
    against every member's display name (case-insensitively) and the
    highest-scoring member is returned.

    Returns:
        The matching member, or ``None`` when a mentioned id is unknown
        to ``guild.get_member`` or the guild has no members.
    """
    if member_string.startswith('<@') and member_string.endswith('>'):
        raw_id = member_string[2:-1]
        if raw_id.startswith('!'):
            raw_id = raw_id[1:]
        # Only purely numeric ids are user mentions; anything else (for
        # example "<@&id>") falls through to the fuzzy match below.
        if raw_id.isdigit():
            return guild.get_member(int(raw_id))

    wanted = member_string.lower()
    # sorted() is stable, so among equal scores the member that comes last
    # in guild.members is chosen.
    ranked = sorted(guild.members,
                    key=lambda member: fuzz.partial_token_sort_ratio(
                        wanted, member.display_name.lower()))
    return ranked[-1] if ranked else None
Exemplo n.º 20
0
def extract_features(df):
    """Add six fuzzywuzzy score columns for question1 vs. question2.

    The columns are written into ``df`` in place and ``df`` is returned.
    """
    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        df[column] = df.apply(
            lambda row: scorer(str(row['question1']), str(row['question2'])),
            axis=1)
    return df
Exemplo n.º 21
0
def fuzzy_check(postTitle: str = "", postBody:str = ""):
    """Check a post's title and body against the vote-manipulation words.

    A ``partial_ratio`` on the processed title/body is tried first; if
    that is not rule breaking, ``partial_token_sort_ratio`` on the raw
    title/body is tried. Each computed ratio is printed.

    Returns:
        The result of ``is_rule_breaking`` for the last ratio computed.
    """
    # NOTE(review): both scorers receive a two-element list rather than a
    # string - confirm fuzzywuzzy handles that the way it is intended.
    processed_parts = [utils.full_process(postTitle),
                       utils.full_process(postBody)]
    ratio = fuzz.partial_ratio(processed_parts,
                               listOfWords.words['vote-manipulation'])
    print(ratio)
    if not is_rule_breaking(ratio):
        raw_parts = [postTitle, postBody]
        ratio = fuzz.partial_token_sort_ratio(
            raw_parts, listOfWords.words['vote-manipulation'])
        print(ratio)

    return is_rule_breaking(ratio)
Exemplo n.º 22
0
def search_name(book_arg: str, bible: Biblia) -> int:
    """
    Find a book whose name is close to the requested string and return
    that book's index.

    The names come from ``index_books(bible)``; the first one scoring at
    least 80 on ``partial_token_sort_ratio`` wins. ``None`` is returned
    when no name reaches the threshold.
    """
    nomes = index_books(bible)
    matching_indexes = (
        nomes.index(nome) for nome in nomes
        if fuzz.partial_token_sort_ratio(book_arg, nome) >= 80)
    return next(matching_indexes, None)
Exemplo n.º 23
0
def func_txtFeat12(s1, s2, lang='eng'):
    """
    Return the partial token sort ratio of two sentences as a float.

    :param s1: sentence1
    :param s2: sentence2
    :param lang: language of the sentences (not used by this function)
    :return: Partial token sort ratio
    """
    score = fuzz.partial_token_sort_ratio(s1, s2)
    return float(score)
Exemplo n.º 24
0
 def fuzzy_ratio(sentencea, sentenceb):
     """Return six fuzzywuzzy scores for the two sentences as a tuple.

     Order: ratio, partial_ratio, token_set_ratio, token_sort_ratio,
     partial_token_set_ratio, partial_token_sort_ratio.
     """
     # Listed in the order of the returned tuple.
     scorers = (
         fuzz.ratio,
         fuzz.partial_ratio,
         fuzz.token_set_ratio,
         fuzz.token_sort_ratio,
         fuzz.partial_token_set_ratio,
         fuzz.partial_token_sort_ratio,
     )
     return tuple(scorer(sentencea, sentenceb) for scorer in scorers)
Exemplo n.º 25
0
def strDistance(name1, name2):
    """Combine four fuzzywuzzy scores for two names.

    Args:
        name1: First name, or ``None``.
        name2: Second name, or ``None``.

    Returns:
        ``0`` when either name is ``None``; otherwise a tuple
        ``(average, scores)`` where ``scores`` holds partial_ratio,
        UWRatio, partial_token_set_ratio and partial_token_sort_ratio and
        ``average`` is their floor-divided mean.
    """
    # Identity check: "== None" would go through a custom __eq__.
    if name1 is None or name2 is None:
        return 0

    allr = (fuzz.partial_ratio(name1, name2), fuzz.UWRatio(name1, name2),
            fuzz.partial_token_set_ratio(name1, name2),
            fuzz.partial_token_sort_ratio(name1, name2))
    return (sum(allr) // len(allr), allr)
Exemplo n.º 26
0
 def transform(self, X, y = None):
     """Add seven fuzzywuzzy score columns to ``X`` in place and return it.

     Scores compare ``str(question1)`` with ``str(question2)``; ``y`` is
     not used.
     """
     scorers = (
         ('fuzz_qratio', fuzz.QRatio),
         ('fuzz_WRatio', fuzz.WRatio),
         ('fuzz_partial_ratio', fuzz.partial_ratio),
         ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
         ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
         ('fuzz_token_set_ratio', fuzz.token_set_ratio),
         ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
     )
     for column, scorer in scorers:
         X[column] = X.apply(
             lambda row: scorer(str(row['question1']), str(row['question2'])),
             axis=1)
     return X
Exemplo n.º 27
0
def calculate_fuzzy_abbv_score(source_list, target_list):
    """Score each source item against the target item at the same index.

    Returns the list of ``partial_token_sort_ratio`` scores. On any
    exception the error is printed and ``None`` is returned.
    """
    try:
        scores = []
        # Index-based on purpose: lookups go through ``[]`` on both inputs
        # and a shorter target list ends up in the error branch.
        for position in range(0, len(source_list)):
            scores.append(
                fuzz.partial_token_sort_ratio(source_list[position],
                                              target_list[position]))
        return scores
    except Exception as e:
        print("Error occured in calculating Abbv Fuzzy Score ::", e)
def get_overlap(vclaim, tweet):
    """Return the claim/tweet overlap scaled to the range 0-80.

    Both inputs go through ``getCombined``; their
    ``partial_token_sort_ratio`` is divided by 100 and multiplied by 80.
    """
    claim_text = getCombined(vclaim)
    tweet_text = getCombined(tweet)
    fraction = fuzz.partial_token_sort_ratio(tweet_text, claim_text) / 100
    return fraction * 80
Exemplo n.º 29
0
def fuzzy(s1, s2):
    """Return eight fuzzywuzzy scores for ``s1`` vs. ``s2``, each / 100.

    Order: ratio, partial_ratio, token_sort_ratio,
    partial_token_sort_ratio, token_set_ratio, partial_token_set_ratio,
    QRatio, WRatio.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.partial_token_set_ratio,
        fuzz.QRatio,
        fuzz.WRatio,
    )
    return [scorer(s1, s2) / 100 for scorer in scorers]
Exemplo n.º 30
0
def extract_features(list):
    for x in list:
        x.append(fuzz.QRatio(x[:13], x[13]))
        x.append(fuzz.partial_ratio(x[:13], x[13]))
        x.append(fuzz.partial_token_set_ratio(x[:13], x[13]))
        x.append(fuzz.partial_token_sort_ratio(x[:13], x[13]))
        x.append(fuzz.token_set_ratio(x[:13], x[13]))
        x.append(fuzz.token_sort_ratio(x[:13], x[13]))
    return list
Exemplo n.º 31
0
def get_priority(key, str):
    """Return the highest of six fuzzywuzzy scores for ``key`` vs. ``str``."""
    scorers = (
        fuzz.token_set_ratio,
        fuzz.token_sort_ratio,
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.partial_token_set_ratio,
    )
    return max(scorer(key, str) for scorer in scorers)
def find_chair(data, chairman):
    """Return the entry of ``data`` whose key best matches ``chairman``.

    Args:
        data: mapping of chair name -> record.
        chairman: chairman string; only the part before the first comma is
            used, lower-cased.

    Returns:
        The only value when ``data`` holds exactly one entry (no fuzzy
        comparison is made).  Otherwise the value whose key scores highest
        with ``fuzz.partial_token_sort_ratio`` against the chairman; ties
        are resolved in favour of the greatest key.

    Raises:
        ValueError: if ``data`` is empty.
    """
    chairman = chairman.split(',', 1)[0].lower()
    if len(data) == 1:
        # dict.values() is a non-indexable view on Python 3, so take the
        # first element through an iterator instead of ``values()[0]``.
        return next(iter(data.values()))
    # Rank by (score, chair) so equal scores fall back to comparing names,
    # exactly like taking max() over reversed (chair, score) items.
    best_chair = max(
        data,
        key=lambda chair: (fuzz.partial_token_sort_ratio(chair, chairman), chair))
    return data[best_chair]
Exemplo n.º 33
0
	def post_title_extract(self,sel,response):
		"""Pick the most likely post title on a page and the XPath it came from.

		The ``<title>`` text is cleaned first: when it contains ``|`` the
		first or last segment (whichever is closer to ``self.get_domain(url)``
		by ``fuzz.partial_ratio``) is removed; otherwise the three most common
		words across the ``<link rel="alternate">`` titles plus the title
		itself are removed.  Every text node under h1/h2/h3 (anchor variants
		first) is then compared with the cleaned title via
		``fuzz.partial_token_sort_ratio`` and with the URL slug via
		``fuzz.partial_ratio``.

		Args:
			sel: selector exposing ``xpath(query).extract()``.
			response: object whose ``url`` attribute is the page URL.

		Returns:
			``(title, title_xpath)``.  When the best title score is below 51
			and the slug score is below 81 the cleaned ``<title>`` text is
			returned as ``title``; ``title_xpath`` is None if no heading text
			was ever selected.
		"""
		title = None
		title_score = 0
		slug_score = 0
		title_xpath = None
		blog=self.get_domain(response.url)
		slug = response.url.split('/')[-1] or response.url.split('/')[-2]
		slug = slug.replace('-',' ')
		# Remove a literal ".html" suffix.  str.rstrip('.html') would strip any
		# trailing run of the characters ".", "h", "t", "m", "l", turning
		# "post.html" into "pos".
		if slug.endswith('.html'):
			slug = slug[:-len('.html')]

		head_title = sel.xpath('//title/text()').extract()
		head_title = head_title[0] if head_title else ''
		if '|' in head_title:
			segments = head_title.split('|')
			pos=[segments[0],segments[-1]]
			# Drop whichever end segment looks more like the blog name.
			word = pos[0] if fuzz.partial_ratio(pos[0],blog)>fuzz.partial_ratio(pos[-1],blog) else pos[-1]
			head_title_clean = head_title.replace(word,'').replace('|','')
		else:
			head_title_clean = head_title
			text_to_remove = sel.xpath('//link[@rel="alternate"]/@title').extract()
			if text_to_remove and head_title:
				words = (' '.join(text_to_remove)+head_title).split()
				for wor in Counter(words).most_common(3):
					head_title_clean = head_title_clean.replace(wor[0],'')

		[h1,h1a,h2,h2a,h3,h3a]=["//h1","//h1/a","//h2","//h2/a","//h3","//h3/a"]
		head_xpaths = [h1a,h1,h2a,h2,h3a,h3]
		title_lists = [sel.xpath(head+'//text()').extract() for head in head_xpaths]
		# zip keeps the priority order of head_xpaths and works on Python 2 and 3.
		for title_xpaths,title_list in zip(head_xpaths,title_lists):
			if title_list:
				for titles in title_list:
					#to prevent from one word getting higher score
					if titles.count(' ')>0 or head_title_clean.count(' ')<1:
						title_ratio = fuzz.partial_token_sort_ratio(titles,head_title_clean)
						if title_ratio>title_score:
							title_score = title_ratio
							title = titles
							title_xpath = title_xpaths
							if title_score==100 and title.count(' ')>0:
								break
						#slug_ratio to be added in case
						slug_ratio = fuzz.partial_ratio(titles.lower(),slug)
						if slug_ratio>80:
							slug_score = slug_ratio
							title = titles
							title_xpath = title_xpaths
							if slug_score==100:
								break
				# A perfect score ends the search across the remaining heading levels.
				if slug_score==100:
					break
				if title_score==100:
					break
		if title_score<51 and slug_score<81:
			title = head_title_clean
		return title,title_xpath
def join(testimony_file, house_file, senate_file, result_file):
    """Write the testimony rows enriched with matching committee-chair data.

    The house and senate CSVs are indexed by chamber, ``Congress``, committee
    name (via ``get_name(row, 'Committee Name')``) and chair (via
    ``get_chair(row['Name'])``).  Each testimony row is matched to a committee
    of its ``chamber``/``congress``: first by substring containment of the
    committee name, otherwise by the best ``fuzz.partial_token_sort_ratio``
    above ``THRESHOLD_MAX_RATIO``.  The matching chair record (chosen by
    ``find_chair`` from ``committee_chairman``) is copied into the row under
    the column names given by ``get_mapping``.

    Args:
        testimony_file: open CSV with at least the columns ``chamber``,
            ``congress``, ``committee`` and ``committee_chairman``.
        house_file: open CSV of house committee data.
        senate_file: open CSV of senate committee data.
        result_file: writable file receiving the joined CSV (header included).
    """
    t_reader = csv.DictReader(testimony_file)
    h_reader = csv.DictReader(house_file)
    s_reader = csv.DictReader(senate_file)

    # Get result headers
    mappings = {
        'house': get_mapping('house', h_reader),
        'senate': get_mapping('senate', s_reader)}
    result_headers = list(t_reader.fieldnames)
    # .values()/.items() behave the same on Python 2 and also work on Python 3,
    # unlike the iter* dict methods.
    result_headers.extend(mappings['house'].values())
    result_headers.extend(mappings['senate'].values())
    r_writer = csv.DictWriter(result_file, fieldnames=result_headers)
    r_writer.writeheader()

    # Structure data for easier access:
    # chamber -> congress -> committee name -> chair name -> row
    row_mapping = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(dict)))
    for chamber, reader in (('house', h_reader), ('senate', s_reader)):
        for row in reader:
            name = get_chair(row['Name'])
            row_mapping[chamber][row['Congress']][
                get_name(row, 'Committee Name')][name] = row

    # Match the committee
    for row in t_reader:
        chamber = row['chamber'].lower()
        if chamber:
            committee_name = get_name(row, 'committee')
            committees_data = row_mapping[chamber][row['congress']]
            if committee_name and committees_data:
                # First try to match a substring in committee name
                for committee, data in committees_data.items():
                    if committee in committee_name:
                        chair_data = find_chair(data, row['committee_chairman'])
                        if chair_data:
                            for key, value in chair_data.items():
                                row[mappings[chamber][key]] = value
                            break
                else:
                    # for/else: reached only when no substring match filled
                    # the row. Otherwise, try fuzzy match
                    ratios = {}
                    for committee, data in committees_data.items():
                        ratio = fuzz.partial_token_sort_ratio(committee_name, committee)
                        # Equal ratios overwrite: the last committee seen wins.
                        ratios[ratio] = data
                    max_ratio = max(ratios)
                    if max_ratio > THRESHOLD_MAX_RATIO:
                        data = ratios[max_ratio]
                        chair_data = find_chair(data, row['committee_chairman'])
                        # Same guard as the substring branch above.
                        if chair_data:
                            for key, value in chair_data.items():
                                row[mappings[chamber][key]] = value

        r_writer.writerow(row)
	def addScoreDictionary(self, second, first):
		"""Return three similarity scores for a pair of names.

		Every scorer receives ``first`` before ``second``.  The result maps
		'tokenset' to ``self.fuzzyNameMatch``, 'fuzzratio' to ``fuzz.ratio``
		and 'partialsort' to ``fuzz.partial_token_sort_ratio``.
		"""
		return {
			'tokenset': self.fuzzyNameMatch(first, second),
			'fuzzratio': fuzz.ratio(first, second),
			'partialsort': fuzz.partial_token_sort_ratio(first, second),
		}
Exemplo n.º 36
0
 def testPartialTokenSortRatio(self):
     """Check partial_token_sort_ratio on the fixture pairs, with and without full_process."""
     ratio = fuzz.partial_token_sort_ratio
     # (positional arguments, keyword arguments, expected score)
     cases = (
         ((self.s1, self.s1a), {}, 100),
         ((self.s4, self.s5), {}, 100),
         ((self.s8, self.s8a), {'full_process': False}, 100),
         ((self.s9, self.s9a), {'full_process': True}, 100),
         ((self.s9, self.s9a), {'full_process': False}, 100),
         ((self.s10, self.s10a), {'full_process': False}, 50),
     )
     for args, kwargs, expected in cases:
         self.assertEqual(ratio(*args, **kwargs), expected)
Exemplo n.º 37
0
def compare_output(baseline, current):
    """Score ``baseline`` against ``current`` with the scorer named by DEFAULT_ALGORITHM.

    Supported names are 'ratio', 'partial_ratio', 'token_sort_ratio',
    'partial_token_sort_ratio' and 'token_set_ratio'; each is the ``fuzz``
    function of the same name.  Any other value prints a message and exits
    the process with status -1.
    """
    supported = (
        'ratio',
        'partial_ratio',
        'token_sort_ratio',
        'partial_token_sort_ratio',
        'token_set_ratio',
    )
    if DEFAULT_ALGORITHM not in supported:
        print("Unknown similarity measure " + DEFAULT_ALGORITHM + ". Aborting")
        sys.exit(-1)
    scorer = getattr(fuzz, DEFAULT_ALGORITHM)
    return scorer(baseline, current)
Exemplo n.º 38
0
 def testPartialTokenSortRatio(self):
     """Both fixture pairs must score 100 with partial_token_sort_ratio."""
     for left, right in ((self.s1, self.s1a), (self.s4, self.s5)):
         self.assertEqual(fuzz.partial_token_sort_ratio(left, right), 100)
	def partialTokenMatch(self, name1,name2):
		"""Return ``fuzz.partial_token_sort_ratio`` for the two names."""
		return fuzz.partial_token_sort_ratio(name1,name2)
Exemplo n.º 40
0
 def MatchGooglePlus(self,facebookProfile,googlePlusProfile):
     """Decide whether a Facebook and a Google+ profile describe the same person.

     Both profiles are first reduced with ``SimplifyFacebookProfile`` and
     ``SimplifyGooglePlusProfile``.  The decision is then made in three steps:

     1. any e-mail address present on both sides -> match;
     2. any ``screen_names`` item present on both sides -> match;
     3. otherwise, only when full name, gender, birthday, current city,
        hometown and address are all present and non-empty on both sides,
        each pair is scored with ``fuzz.partial_token_sort_ratio``, combined
        with a per-field weight, and the profiles match when the mean of
        the six combined scores is at least 0.7.

     Returns:
         True when the profiles are considered the same person, otherwise
         False.  False is also returned when either profile is None and
         when any exception is raised along the way.
     """
     # (profile field, weight) in scoring order.
     weightedFields = (
         ("full_name", 0.4),
         ("gender", 0.4),
         ("birthday", 0.5),
         ("current_city", 0.6),
         ("hometown", 0.7),
         ("address", 0.7),
     )
     scoreThreshold = 0.7

     def combine(a, b):
         # Blend a similarity in [0, 1] with the field weight.
         return (2*a*b)/(1+(a*b))

     try:
         if facebookProfile is None or googlePlusProfile is None:
             return False
         simplifiedFacebookProfile = self.SimplifyFacebookProfile(facebookProfile)
         simplifiedGooglePlusProfile = self.SimplifyGooglePlusProfile(googlePlusProfile)

         # Step 1: e-mail addresses.  The Facebook "emails" value is a
         # comma-separated string; spaces and angle brackets are stripped
         # from each entry.
         facebookEmails = []
         facebookEmailsStr = simplifiedFacebookProfile.get("emails")
         if facebookEmailsStr is not None:
             for eachFBEmail in facebookEmailsStr.split(","):
                 facebookEmails.append(
                     eachFBEmail.replace(" ","").replace("<","").replace(">",""))
         primaryEmail = simplifiedFacebookProfile.get("email")
         if primaryEmail is not None and primaryEmail not in facebookEmails:
             facebookEmails.append(primaryEmail)
         # A missing Google+ e-mail list counts as "no e-mails" instead of
         # raising inside set() and aborting the whole comparison.
         googlePlusEmails = simplifiedGooglePlusProfile.get("email") or []
         if set(facebookEmails) & set(googlePlusEmails):
             return True

         # Step 2: screen names (dicts compared item by item).
         facebookScreenNames = simplifiedFacebookProfile.get("screen_names")
         googlePlusScreenNames = simplifiedGooglePlusProfile.get("screen_names")
         if facebookScreenNames is not None and googlePlusScreenNames is not None:
             if set(facebookScreenNames.items()) & set(googlePlusScreenNames.items()):
                 return True

         # Step 3: weighted fuzzy comparison of the descriptive fields.
         def presentInBoth(field):
             facebookValue = simplifiedFacebookProfile.get(field)
             googlePlusValue = simplifiedGooglePlusProfile.get(field)
             return (facebookValue is not None and googlePlusValue is not None
                     and facebookValue != "" and googlePlusValue != "")

         # Every field, hometown included, must be filled in on both sides
         # (the hometown check used to look up the misspelled key "homwtown").
         if not all(presentInBoth(field) for field, _ in weightedFields):
             return False

         scoresList = []
         for field, weight in weightedFields:
             similarityScore = fuzz.partial_token_sort_ratio(
                 simplifiedFacebookProfile.get(field),
                 simplifiedGooglePlusProfile.get(field))
             scoresList.append(combine(float(similarityScore)/100, weight))

         averageScore = sum(scoresList)/len(scoresList)
         if averageScore >= scoreThreshold:
             print(averageScore)
             return True
         return False
     except Exception:
         # Best effort: any failure while comparing counts as "no match".
         return False
# Feature-engineering script for question pairs held in the DataFrame `data`
# (created before this point).  Identifier columns are dropped first.
data = data.drop(['id', 'qid1', 'qid2'], axis=1)


# Basic length features: string length, length difference, number of distinct
# non-space characters, and whitespace-separated word count.
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
# Number of distinct lower-cased words the two questions share.
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# One column per fuzz scorer, computed row by row on the stringified questions.
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)


# Load the pretrained word2vec vectors.  `wmd` is defined elsewhere and
# presumably reads the global `model` -- TODO confirm before reordering.
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)


# Second, separate copy of the same vectors.  init_sims(replace=True) modifies
# it in place, so `model` above stays untouched.  `norm_wmd` is defined
# elsewhere and presumably reads `norm_model` -- TODO confirm.
norm_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

# One 300-wide row per row of `data`, filled in later in the script.
# NOTE(review): 300 presumably matches the embedding dimension -- confirm.
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0