def test_iterable_input(self):
    """lcs_length must handle lists and tuples of characters, not only strings."""
    left = ["X", "M", "J", "Y", "A", "U", "Z"]
    right = ["M", "Z", "J", "A", "W", "X", "U"]
    # Lists: one overlapping pair, one fully disjoint pair.
    self.assertEqual(fuzzycomp.lcs_length(left, right), 4)
    self.assertEqual(fuzzycomp.lcs_length(["f", "o", "o"], ["b", "a", "r"]), 0)
    # Tuples: same inputs, same expected lengths.
    self.assertEqual(fuzzycomp.lcs_length(tuple(left), tuple(right)), 4)
    self.assertEqual(fuzzycomp.lcs_length(("f", "o", "o"), ("b", "a", "r")), 0)
def test_iterable_input(self):
    """Function should return correct values when called with valid iterables as input."""
    cases = [
        (["X", "M", "J", "Y", "A", "U", "Z"], ["M", "Z", "J", "A", "W", "X", "U"], 4),
        (["f", "o", "o"], ["b", "a", "r"], 0),
        (("X", "M", "J", "Y", "A", "U", "Z"), ("M", "Z", "J", "A", "W", "X", "U"), 4),
        (("f", "o", "o"), ("b", "a", "r"), 0),
    ]
    for first, second, expected in cases:
        self.assertEqual(fuzzycomp.lcs_length(first, second), expected)
def test_valid_input(self):
    """Algorithm should return the correct LCS length for plain strings."""
    for s1, s2, expected in (("XMJYAUZ", "MZJAWXU", 4), ("foo", "bar", 0)):
        self.assertEqual(fuzzycomp.lcs_length(s1, s2), expected)
def _normalized_lcs(string1, string2):
    """Longest-common-subsequence length of the two strings, normalized to [0, 1].

    Returns 0.0 when both strings are empty (the original expression would
    divide by zero in that case).
    """
    total = len(string1) + len(string2)
    if total == 0:
        return 0.0
    return 2 * fuzzycomp.lcs_length(string1, string2) / total


def _wordnet_similarity(words1, words2):
    """Best Wu-Palmer similarity over all synset pairs of two word collections.

    Returns 0 when no synsets exist for either side or when every pair is
    incomparable (wordnet.wup_similarity returns None for such pairs).

    NOTE(review): the original code appended ``best[0]`` — an *arbitrary*
    element of a list built from an unordered set product, which could also
    be None — rather than the maximum similarity its variable name implies.
    """
    synsets1 = {ss for word in words1 for ss in wordnet.synsets(word)}
    synsets2 = {ss for word in words2 for ss in wordnet.synsets(word)}
    # Stream the cross product instead of materializing it; drop undefined scores.
    scores = [
        score
        for score in (
            wordnet.wup_similarity(s1, s2) for s1, s2 in product(synsets1, synsets2)
        )
        if score is not None
    ]
    return max(scores) if scores else 0


def calculate_features(dataset, string_type):
    """Append one similarity-feature column per metric to *dataset*, in place.

    For every row, the string pair at positional columns (index, index + 1)
    is compared with ~29 character- and token-level similarity metrics; each
    metric produces a column named ``'<Metric>_<string_type>'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows holding the string pairs to compare.
    string_type : str
        'Entity' (columns 2/3), 'Parent' (4/5) or 'Path' (6/7). Also used
        as the suffix of every generated column name.

    Returns
    -------
    pandas.DataFrame
        The same dataset object, with the feature columns added.

    Raises
    ------
    ValueError
        If *string_type* is not one of the three supported values. (The
        original code fell through and crashed later with a NameError on
        the undefined ``index``.)
    """
    type_to_index = {'Entity': 2, 'Parent': 4, 'Path': 6}
    if string_type not in type_to_index:
        raise ValueError(
            "string_type must be 'Entity', 'Parent' or 'Path', got %r" % (string_type,)
        )
    index = type_to_index[string_type]

    # Metrics applied to the raw strings: (output column prefix, scorer).
    string_metrics = [
        ('Ngram1', lambda a, b: ngram.NGram.compare(a, b, N=1)),
        ('Ngram2', lambda a, b: ngram.NGram.compare(a, b, N=2)),
        ('Ngram3', lambda a, b: ngram.NGram.compare(a, b, N=3)),
        ('Ngram4', lambda a, b: ngram.NGram.compare(a, b, N=4)),
        ('Jaro', jaro.get_sim_score),
        ('Longest_com_sub', _normalized_lcs),
        ('SmithWaterman', sw.get_raw_score),
        ('AffineGap', af.get_raw_score),
        ('BagDistance', bd.get_sim_score),
        ('PartialRatio', pr.get_sim_score),
        ('Editex', edx.get_sim_score),
        ('JaroWinkler', jw.get_sim_score),
        ('Levenshtein', lev.get_sim_score),
        ('PartialTokenSort', pts.get_sim_score),
        ('Ratio', rat.get_sim_score),
        ('Soundex', sound.get_sim_score),
        ('TokenSort', ts.get_sim_score),
        ('Needleman-Wunsch', nw.get_raw_score),
    ]
    # Metrics applied to the tokenized word collections from get_words().
    token_metrics = [
        ('Dice', dice.get_sim_score),
        ('Jaccard', jac.get_sim_score),
        ('Monge-Elkan', me.get_raw_score),
        ('Cosine_similarity', cos.get_sim_score),
        ('Soft_TFIDF', sf.get_raw_score),
        ('GeneralizedJaccard', gj.get_sim_score),
        ('TFIDF', tfidf.get_sim_score),
        ('TverskyIndex', tv_ind.get_sim_score),
        ('OverlapCoef', over_coef.get_sim_score),
        ('Wordnet_sim', _wordnet_similarity),
        ('Word2vec_sim', get_word2vec_sim),
    ]

    scores = {name: [] for name, _ in string_metrics}
    scores.update({name: [] for name, _ in token_metrics})

    for _, row in tqdm(dataset.iterrows()):
        string1 = row[index]
        string2 = row[index + 1]
        for name, metric in string_metrics:
            scores[name].append(metric(string1, string2))
        words1 = get_words(string1)
        words2 = get_words(string2)
        for name, metric in token_metrics:
            scores[name].append(metric(words1, words2))

    # Assign in the exact column order the original code produced, so the
    # output dataframe layout is unchanged for downstream consumers.
    column_order = [
        'Ngram1', 'Ngram2', 'Ngram3', 'Ngram4', 'Dice', 'Jaccard', 'Jaro',
        'Longest_com_sub', 'Monge-Elkan', 'SmithWaterman', 'AffineGap',
        'BagDistance', 'Cosine_similarity', 'PartialRatio', 'Soft_TFIDF',
        'Editex', 'GeneralizedJaccard', 'JaroWinkler', 'Levenshtein',
        'PartialTokenSort', 'Ratio', 'Soundex', 'TFIDF', 'TokenSort',
        'TverskyIndex', 'OverlapCoef', 'Needleman-Wunsch', 'Wordnet_sim',
        'Word2vec_sim',
    ]
    for name in column_order:
        dataset[name + '_' + string_type] = scores[name]
    return dataset