示例#1
0
    def test_iterable_input(self):
        """Function should return correct values when called with valid iterables as input"""
        # Exercise both list and tuple inputs with the same pairs.
        for seq in (list, tuple):
            self.assertEqual(
                fuzzycomp.lcs_length(seq("XMJYAUZ"), seq("MZJAWXU")), 4
            )
            self.assertEqual(
                fuzzycomp.lcs_length(seq("foo"), seq("bar")), 0
            )
示例#2
0
    def test_iterable_input(self):
        """Function should return correct values when called with valid iterables as input"""
        # Same pairs checked once as lists, once as tuples.
        cases = [("XMJYAUZ", "MZJAWXU", 4), ("foo", "bar", 0)]
        for left, right, expected in cases:
            self.assertEqual(
                fuzzycomp.lcs_length(list(left), list(right)), expected)
        for left, right, expected in cases:
            self.assertEqual(
                fuzzycomp.lcs_length(tuple(left), tuple(right)), expected)
示例#3
0
 def test_valid_input(self):
     """Algorithm should return correct values under valid input"""
     for s1, s2, expected in (("XMJYAUZ", "MZJAWXU", 4), ("foo", "bar", 0)):
         self.assertEqual(fuzzycomp.lcs_length(s1, s2), expected)
示例#4
0
 def test_valid_input(self):
     """Algorithm should return correct values under valid input"""
     # Overlapping strings share a subsequence of length 4; disjoint ones share none.
     result_overlap = fuzzycomp.lcs_length("XMJYAUZ", "MZJAWXU")
     result_disjoint = fuzzycomp.lcs_length("foo", "bar")
     self.assertEqual(result_overlap, 4)
     self.assertEqual(result_disjoint, 0)
def calculate_features(dataset, string_type):
    """Append one string-similarity feature column per metric to *dataset*.

    For every row, the pair of strings at positional columns ``index`` and
    ``index + 1`` is compared with a battery of character-, token- and
    semantics-based similarity measures; each measure's values are stored in
    a new column named ``'<Metric>_<string_type>'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows holding the string pairs to be compared.
    string_type : str
        One of ``'Entity'``, ``'Parent'`` or ``'Path'``; selects which
        positional column pair to compare and suffixes the new columns.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame, mutated in place, with the new feature columns.

    Raises
    ------
    ValueError
        If *string_type* is not one of the supported values.  (The original
        code left ``index`` unbound in that case and failed later with a
        confusing NameError.)
    """
    # Map string_type to the positional column of the first string of the pair.
    try:
        index = {'Entity': 2, 'Parent': 4, 'Path': 6}[string_type]
    except KeyError:
        raise ValueError(
            "string_type must be 'Entity', 'Parent' or 'Path', got %r"
            % (string_type,))

    # One accumulator list per feature column, filled row by row.
    ngrams1 = []
    ngrams2 = []
    ngrams3 = []
    ngrams4 = []
    dices = []
    jaccards = []
    jaros = []
    lcs = []
    mes = []
    sws = []
    afs = []
    bds = []
    coses = []
    prs = []
    sfs = []
    edxs = []
    gjs = []
    jws = []
    lws = []
    ptss = []
    rats = []
    sounds = []
    tfidfs = []
    tss = []
    tvs = []
    ovs = []
    nws = []
    wordnet_sims = []
    w2vec_sims = []

    # total= gives tqdm a usable progress bar (iterrows has no __len__).
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):

        string1 = row[index]
        string2 = row[index + 1]

        # Character-level measures on the raw strings.
        ngrams1.append(ngram.NGram.compare(string1, string2, N=1))
        ngrams2.append(ngram.NGram.compare(string1, string2, N=2))
        ngrams3.append(ngram.NGram.compare(string1, string2, N=3))
        ngrams4.append(ngram.NGram.compare(string1, string2, N=4))
        lws.append(lev.get_sim_score(string1, string2))
        jaros.append(jaro.get_sim_score(string1, string2))
        # Normalised LCS: 2*|LCS| / (|s1| + |s2|), in [0, 1].
        lcs.append(2 * fuzzycomp.lcs_length(string1, string2) /
                   (len(string1) + len(string2)))
        nws.append(nw.get_raw_score(string1, string2))
        sws.append(sw.get_raw_score(string1, string2))
        afs.append(af.get_raw_score(string1, string2))
        bds.append(bd.get_sim_score(string1, string2))
        prs.append(pr.get_sim_score(string1, string2))
        edxs.append(edx.get_sim_score(string1, string2))
        ptss.append(pts.get_sim_score(string1, string2))
        rats.append(rat.get_sim_score(string1, string2))
        sounds.append(sound.get_sim_score(string1, string2))
        tss.append(ts.get_sim_score(string1, string2))
        jws.append(jw.get_sim_score(string1, string2))

        # Token-level measures operate on word lists, not raw strings.
        row_set1 = get_words(string1)
        row_set2 = get_words(string2)

        mes.append(me.get_raw_score(row_set1, row_set2))
        coses.append(cos.get_sim_score(row_set1, row_set2))
        sfs.append(sf.get_raw_score(row_set1, row_set2))
        gjs.append(gj.get_sim_score(row_set1, row_set2))
        tfidfs.append(tfidf.get_sim_score(row_set1, row_set2))
        tvs.append(tv_ind.get_sim_score(row_set1, row_set2))
        ovs.append(over_coef.get_sim_score(row_set1, row_set2))
        dices.append(dice.get_sim_score(row_set1, row_set2))
        jaccards.append(jac.get_sim_score(row_set1, row_set2))

        # Semantic similarity via WordNet: compare every synset of every word
        # in one string with every synset of every word in the other.
        allsyns1 = set(ss for word in row_set1 for ss in wordnet.synsets(word))
        allsyns2 = set(ss for word in row_set2 for ss in wordnet.synsets(word))

        # BUG FIX: the original appended best[0] — the *first* element of a
        # product over two sets, whose iteration order is arbitrary — rather
        # than the maximum the variable name implies.  wup_similarity may
        # return None for incomparable synsets, so those are filtered out.
        sims = [
            s
            for s in (wordnet.wup_similarity(s1, s2)
                      for s1, s2 in product(allsyns1, allsyns2))
            if s is not None
        ]
        wordnet_sims.append(max(sims) if sims else 0)

        w2vec_sims.append(get_word2vec_sim(row_set1, row_set2))

    # Materialise each accumulator as a suffixed feature column.
    dataset['Ngram1' + '_' + string_type] = ngrams1
    dataset['Ngram2' + '_' + string_type] = ngrams2
    dataset['Ngram3' + '_' + string_type] = ngrams3
    dataset['Ngram4' + '_' + string_type] = ngrams4
    dataset['Dice' + '_' + string_type] = dices
    dataset['Jaccard' + '_' + string_type] = jaccards
    dataset['Jaro' + '_' + string_type] = jaros
    dataset['Longest_com_sub' + '_' + string_type] = lcs
    dataset['Monge-Elkan' + '_' + string_type] = mes
    dataset['SmithWaterman' + '_' + string_type] = sws
    dataset['AffineGap' + '_' + string_type] = afs
    dataset['BagDistance' + '_' + string_type] = bds
    dataset['Cosine_similarity' + '_' + string_type] = coses
    dataset['PartialRatio' + '_' + string_type] = prs
    dataset['Soft_TFIDF' + '_' + string_type] = sfs
    dataset['Editex' + '_' + string_type] = edxs
    dataset['GeneralizedJaccard' + '_' + string_type] = gjs
    dataset['JaroWinkler' + '_' + string_type] = jws
    dataset['Levenshtein' + '_' + string_type] = lws
    dataset['PartialTokenSort' + '_' + string_type] = ptss
    dataset['Ratio' + '_' + string_type] = rats
    dataset['Soundex' + '_' + string_type] = sounds
    dataset['TFIDF' + '_' + string_type] = tfidfs
    dataset['TokenSort' + '_' + string_type] = tss
    dataset['TverskyIndex' + '_' + string_type] = tvs
    dataset['OverlapCoef' + '_' + string_type] = ovs
    dataset['Needleman-Wunsch' + '_' + string_type] = nws
    dataset['Wordnet_sim' + '_' + string_type] = wordnet_sims
    dataset['Word2vec_sim' + '_' + string_type] = w2vec_sims

    return dataset