def small_fLoc_test3(): s1 = ", ?Van Daal ," tnt = NER() ee = tnt(s1) print ee for e in ee: print e[0], e[1], e[2], " : ", s1[e[1] : e[2] + 1] # print "Entity: ", s1[e[1]: e[2] + 1] m = DistanceMatrix(fLevenshteinDistance, s1, tnt._processed_text) p = fMinPath()(m) print m.toString(p)
def test_lev6(self):
    """``fLevDistanceDiag(3)`` must give equal matrices in every container.

    For each text pair in ``TEXTS`` the same fill is written into four
    matrix container types (all created with ``def_value=111``), and two
    more matrices come from the ``matrix`` helper of ``fLevDistanceDiag``
    and of its ``c_lev_distance`` counterpart.  Neighbouring results are
    then compared pairwise.
    """
    default = 111
    # Factories are called lazily so that each container is created right
    # before it is filled.
    containers = (
        lambda: DistanceMatrix(def_value=default),
        lambda: UprightDistanceMatrix(def_value=default),
        lambda: HybridSubsetDistanceMatrix(def_value=default),
        lambda: UprightSubsetDistanceMatrix(7, def_value=default),
    )

    for pair in TEXTS:
        s1, s2 = pair[0], pair[1]

        results = []
        for make_matrix in containers:
            matrix = make_matrix()
            fLevDistanceDiag(3).fill_matrix(s1, s2, matrix)
            results.append(matrix)

        results.append(fLevDistanceDiag(3).matrix(s1, s2, def_value=default))
        results.append(
            c_lev_distance.fLevDistanceDiag(3).matrix(s1, s2, def_value=default))

        # Compare each result with its successor: (1,2), (2,3), ... (5,6).
        for left, right in zip(results, results[1:]):
            self.assertEqual(left, right)
def test_lev5(self):
    """Minimal paths must agree across matrix containers and rebuild ``s2``.

    For each text pair in ``TEXTS`` and each container type, the matrix is
    filled with ``fClassicalLevDistance``; the path from ``fMinPath`` must
    equal the one from ``fLevPath``, and ``operations_to_text`` applied to
    that path must give back ``s2``.  Finally the paths obtained from the
    different containers are compared pairwise.
    """
    # Each factory receives the first text because one container is sized
    # by its length.
    factories = (
        lambda text: DistanceMatrix(),
        lambda text: UprightDistanceMatrix(),
        lambda text: HybridSubsetDistanceMatrix(),
        lambda text: UprightSubsetDistanceMatrix(len(text)),
    )

    for pair in TEXTS:
        s1, s2 = pair[0], pair[1]

        paths = []
        for make_matrix in factories:
            matrix = make_matrix(s1)
            fClassicalLevDistance().fill_matrix(s1, s2, matrix)

            path = fMinPath()(matrix)
            self.assertEqual(path, fLevPath()(matrix))
            self.assertEqual(s2, operations_to_text(path, s1, s2))
            paths.append(path)

        for left, right in zip(paths, paths[1:]):
            self.assertEqual(left, right)
def test_ldistance(self):
    """Check Levenshtein distances, the filled matrix, the path and matching.

    Covers, for both ``fClassicalLevDistance`` and ``fLevDistance``:
    the distance of identical strings (0) and of "Texmt"/"sdText" (3),
    and the cell values written by ``fill_matrix``.  It then checks the
    cell sequence produced by ``fMinPath`` and the boolean result of
    ``fDistanceMatch(fLevDistance)`` for a close and a distant pair.
    """
    # A fresh callable is created for every call, as the checks should not
    # share an instance.
    for distance_cls in (fClassicalLevDistance, fLevDistance):
        self.assertEqual(distance_cls()("Text", "Text"), 0)
        self.assertEqual(distance_cls()("Texmt", "sdText"), 3)

    expected = [
        [1, 2, 2, 3, 4, 5],
        [2, 2, 3, 2, 3, 4],
        [3, 3, 3, 3, 2, 3],
        [4, 4, 4, 4, 3, 3],
        [5, 5, 5, 5, 4, 3],
    ]

    def assert_cells(matrix):
        # Compare every expected cell with what the matrix reports.
        for row, expected_row in enumerate(expected):
            for col, value in enumerate(expected_row):
                self.assertEqual(value, matrix.get(row, col))

    m = DistanceMatrix()
    fClassicalLevDistance().fill_matrix("Texmt", "sdText", m)
    assert_cells(m)

    m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
    assert_cells(m)

    expected_path = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
    actual_path = list(fMinPath()(m))
    # Without the length check a path that stops early would pass unnoticed,
    # because only the steps that were actually produced get compared.
    self.assertEqual(len(actual_path), len(expected_path))
    for step, expected_step in zip(actual_path, expected_path):
        self.assertEqual(step, expected_step)

    self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", "Text 2"), True)
    self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", "Tessxt 2"), False)
def test_distance(self): text, pattern = "Test acd abcd", "abc" f = fBitapDistance(fRjInsert) m = f.fill_matrix(text, pattern, DistanceMatrix()) print "Distance matrix:\n", m.toString(text, pattern) f = fBitapDistance(fRjInsert) print "Distance:", f("texts", "tddext") b = BitapDistanceMatrixSet(fRjInsert, "test string", "test pattern") print "Distance matrixes set:\n", b
def test_ldistance(self):
    """Check Levenshtein distances, the filled matrix, the path and matching.

    For ``fClassicalLevDistance`` and ``fLevDistance`` alike, identical
    strings must be at distance 0 and "Texmt"/"sdText" at distance 3, and
    ``fill_matrix`` must produce the expected cell table.  The cell
    sequence from ``fMinPath`` and the results of
    ``fDistanceMatch(fLevDistance)`` are verified afterwards.
    """
    # Each check builds its own callable instead of reusing one instance.
    for distance_cls in (fClassicalLevDistance, fLevDistance):
        self.assertEqual(distance_cls()("Text", "Text"), 0)
        self.assertEqual(distance_cls()("Texmt", "sdText"), 3)

    expected = [
        [1, 2, 2, 3, 4, 5],
        [2, 2, 3, 2, 3, 4],
        [3, 3, 3, 3, 2, 3],
        [4, 4, 4, 4, 3, 3],
        [5, 5, 5, 5, 4, 3],
    ]

    def assert_cells(matrix):
        # Walk the expected table and compare it cell by cell.
        for row, expected_row in enumerate(expected):
            for col, value in enumerate(expected_row):
                self.assertEqual(value, matrix.get(row, col))

    m = DistanceMatrix()
    fClassicalLevDistance().fill_matrix("Texmt", "sdText", m)
    assert_cells(m)

    m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
    assert_cells(m)

    expected_path = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
    actual_path = list(fMinPath()(m))
    # Pin the number of steps too: comparing only the produced steps would
    # let a truncated path pass silently.
    self.assertEqual(len(actual_path), len(expected_path))
    for step, expected_step in zip(actual_path, expected_path):
        self.assertEqual(step, expected_step)

    self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", "Text 2"), True)
    self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", "Tessxt 2"), False)
def test_lev4(self):
    """``fClassicalLevDistance`` must fill every container type identically.

    For each text pair in ``TEXTS`` the fill is written into the four
    matrix containers and neighbouring results are compared for equality.
    """
    # Each factory receives the first text because one container is sized
    # by its length.
    factories = (
        lambda text: DistanceMatrix(),
        lambda text: UprightDistanceMatrix(),
        lambda text: HybridSubsetDistanceMatrix(),
        lambda text: UprightSubsetDistanceMatrix(len(text)),
    )

    for pair in TEXTS:
        s1, s2 = pair[0], pair[1]

        filled = []
        for make_matrix in factories:
            matrix = make_matrix(s1)
            fClassicalLevDistance().fill_matrix(s1, s2, matrix)
            filled.append(matrix)

        for left, right in zip(filled, filled[1:]):
            self.assertEqual(left, right)
def test_lev3(self):
    """Cross-check the Levenshtein implementations against each other.

    For each text pair in ``TEXTS`` the matrices produced by
    ``fClassicalLevDistance``, ``fLevDistance``, ``fLevDistance2`` and
    ``fLevDistanceDiag(len(s1) - 1)`` are compared with each other and
    with the ``c_lev_distance`` versions, using both ``fill_matrix`` and
    ``matrix``.  Finally the ``fLevDistanceDiag(2)`` matrices of the two
    implementations are compared with each other.
    """
    for pair in TEXTS:
        s1, s2 = pair[0], pair[1]
        diag = len(s1) - 1

        # Exercise the plain callable form too; its result is not compared.
        fClassicalLevDistance()(s1, s2)

        classical = DistanceMatrix()
        fClassicalLevDistance().fill_matrix(s1, s2, classical)

        lev = DistanceMatrix()
        fLevDistance().fill_matrix(s1, s2, lev)

        lev2 = DistanceMatrix()
        fLevDistance2().fill_matrix(s1, s2, lev2)

        diag_filled = DistanceMatrix(111)
        fLevDistanceDiag(diag).fill_matrix(s1, s2, diag_filled)
        diag_built = fLevDistanceDiag(diag).matrix(s1, s2)

        # C-code
        c_built = c_lev_distance.fLevDistance().matrix(s1, s2)
        c_filled = DistanceMatrix()
        c_lev_distance.fLevDistance().fill_matrix(s1, s2, c_filled)

        c_diag_built = c_lev_distance.fLevDistanceDiag(diag).matrix(s1, s2)
        c_diag_filled = DistanceMatrix(def_value=111)
        c_lev_distance.fLevDistanceDiag(diag).fill_matrix(s1, s2, c_diag_filled)

        self.assertEqual(classical, lev)
        self.assertEqual(lev, lev2)
        self.assertEqual(lev2, diag_filled)
        self.assertEqual(diag_filled, diag_built)
        self.assertEqual(diag_built, c_built)
        self.assertEqual(c_built, c_diag_built)
        self.assertEqual(c_built, c_filled)
        self.assertEqual(c_diag_built, c_diag_filled)

        # test C and Python diagonal
        # These two matrices used to be built and then ignored: the final
        # assertion repeated the c_built/c_diag_built check instead.
        narrow_c = c_lev_distance.fLevDistanceDiag(2).matrix(s1, s2)
        narrow_py = fLevDistanceDiag(2).matrix(s1, s2)
        self.assertEqual(narrow_c, narrow_py)
def test_hamming(self):
    """Check ``fHammingDistance`` argument validation, distances and matrix.

    Argument pairs of different length or containing ``None`` must raise
    ``AssertionError``.  Equal-length pairs must give the expected
    distance, and ``fill_matrix`` for "abcdefg"/"cbdrtfg" must produce the
    expected cell table.
    """
    rejected = (
        ("Pattern 1", "Pattern"),
        ("Pattern", "Pattern 2"),
        (None, "Pattern 2"),
        (None, None),
        ("pattern", None),
    )
    for left, right in rejected:
        # The callable is built inside the lambda so that construction is
        # covered by assertRaises as well.
        self.assertRaises(
            AssertionError,
            lambda a=left, b=right: fHammingDistance()(a, b))

    self.assertEqual(fHammingDistance()("Pattern 1", "Pattern 2"), 1)
    self.assertEqual(fHammingDistance()("Pattern", "Pattern"), 0)
    self.assertEqual(fHammingDistance()("", ""), 0)

    expected = [
        [1, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0],
        [0, 0, 2, 0, 0, 0, 0],
        [0, 0, 0, 3, 0, 0, 0],
        [0, 0, 0, 0, 4, 0, 0],
        [0, 0, 0, 0, 0, 4, 0],
        [0, 0, 0, 0, 0, 0, 4],
    ]
    filled = fHammingDistance().fill_matrix("abcdefg", "cbdrtfg", DistanceMatrix())
    for row, expected_row in enumerate(expected):
        for col, value in enumerate(expected_row):
            self.assertEqual(value, filled.get(row, col))
t = time.time() #prepare_data() #step1_fetch_text(os.path.join(data_dir, "xml")) #step2_run_tnt(os.path.join(data_dir, "txt")) #step3_calc_lev_distance() #step4_calc_minpath() #step5_calc_minpath_dt() #step6_cleantnt_txt() #step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000) from gravity.tae.match.lev_distance import fLevDistanceDiag #s1 = "Song about Alice dream" #s2 = "Song al?ce dream" s1 = " abc d dfg rer klm" s2 = "abc dfg klm" m = DistanceMatrix(111) #fLevDistanceDiag(1).fill_matrix(s2, s1, m) fLevDistance().fill_matrix(s2, s1, m) print m.toString(s2, s1, fMinPath()(m)) print "Elapsed time: ", (time.time() - t)
step1_fetch_text(os.path.join(data_dir, "xml")) step2_run_tnt(os.path.join(data_dir, "txt")) step6_cleantnt_txt() t = time.time() #prepare_data() #step1_fetch_text(os.path.join(data_dir, "xml")) #step2_run_tnt(os.path.join(data_dir, "txt")) #step3_calc_lev_distance() #step4_calc_minpath() #step5_calc_minpath_dt() #step6_cleantnt_txt() #step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000) from gravity.tae.match.lev_distance import fLevDistanceDiag #s1 = "Song about Alice dream" #s2 = "Song al?ce dream" s1 = " abc d dfg rer klm" s2 = "abc dfg klm" m = DistanceMatrix(111) #fLevDistanceDiag(1).fill_matrix(s2, s1, m) fLevDistance().fill_matrix(s2, s1, m) print m.toString(s2, s1, fMinPath()(m)) print "Elapsed time: ", (time.time() - t)
# small_fLoc_test2() # small_fLoc_test3() # general_test() # lev_test() # levdiag_test0() # levdiag_test2() # levdiag_test1() # levdiag_test3() # levdiag_test4() # levdiag_test5() t1, t2 = "asdsdsa", "ssasasas" m = DistanceMatrix(fLevenshteinDistance2, t1, t2) print m.toString(t1, t2) t1, t2 = "asdsdsa", "ssasasas" m = DistanceMatrix(fLevenshteinDistance, t1, t2) print m.toString(t1, t2) t1, t2 = "asdsdsa", "ssasasas" m = DistanceMatrix(fClassicalLevenshteinDistance, t1, t2) print m.toString(t1, t2) t1, t2 = "asdsdsa", "ssasasas" m = DistanceMatrix(fDiagDistance(6), t1, t2, 111) print m.toString(t1, t2)