def test_lev5(self):
    """Check that all matrix containers lead to the same edit path.

    For every text pair in TEXTS the classical Levenshtein matrix is filled
    into four different matrix containers.  For each container the path
    produced by fMinPath must equal the one produced by fLevPath, and
    replaying that path with operations_to_text must give back the second
    text.  Finally the paths obtained from neighbouring containers are
    compared with each other.
    """
    for pair in TEXTS:
        source, target = pair[0], pair[1]

        # Zero-argument factories, listed in the order they are exercised.
        # The last container is constructed with the length of the first text.
        factories = (
            DistanceMatrix,
            UprightDistanceMatrix,
            HybridSubsetDistanceMatrix,
            lambda: UprightSubsetDistanceMatrix(len(source)),
        )

        paths = []
        for make_matrix in factories:
            matrix = make_matrix()
            fClassicalLevDistance().fill_matrix(source, target, matrix)
            path = fMinPath()(matrix)
            self.assertEqual(path, fLevPath()(matrix))
            self.assertEqual(target, operations_to_text(path, source, target))
            paths.append(path)

        # Pairwise comparison of neighbours: 1-2, 2-3, 3-4.
        for earlier, later in zip(paths, paths[1:]):
            self.assertEqual(earlier, later)
def step5_calc_minpath_dt(p_txt_folder='tnt', o_txt_folder='txt', size_limit=-1):
    """Print how far the minimal edit path strays from the matrix diagonal.

    For every text pair yielded by ``gen_string_pairs`` a distance matrix is
    filled, the minimal path is extracted and the largest positive and
    negative offsets of that path from the scaled diagonal are printed as one
    table row.  After all pairs the overall extremes are printed.

    Args:
        p_txt_folder: folder with the processed texts.
        o_txt_folder: folder with the original texts.
        size_limit: when positive, a pair is skipped if either text is
            longer than this value.

    Returns:
        None.  All results are written to standard output.
    """
    upper_devs = []  # per pair: largest offset that is >= 0
    lower_devs = []  # per pair: smallest offset that is <= 0
    # NOTE(review): collected but never reported.  It also takes the basename
    # of s[0], which is the text passed to fill_matrix, while the table row
    # below uses s[3] for the name -- confirm which one is intended.
    skipped = []

    # Every print emits a single pre-joined string so the output matches the
    # original space-separated multi-item print statements.
    print(":: Calculate minimal path deviation .")
    print(" :: processed text folder: %s" % (p_txt_folder,))
    print(" :: original text folder: %s" % (o_txt_folder,))
    print("-" * 64)
    print(" Deviation | Text1 size | Text2 size | Text diff | Distance | ")
    print("-" * 64)

    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        if size_limit > 0 and (len(s[1]) > size_limit or len(s[0]) > size_limit):
            skipped.append(os.path.basename(s[0]))
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        p = fMinPath()(m)
        # Alternative path builder kept for reference:
        # p = fLevPath(insPriority = 1)(m)

        # Walk the columns from last to first and consume the path entries
        # (row, col) belonging to each column.
        # Assumes the path is ordered by descending column -- TODO confirm.
        i = 0
        mdt1 = mdt2 = 0
        for col in range(m.cols - 1, -1, -1):
            # Row at which the scaled diagonal crosses this column.
            arow = (col * m.rows) // m.cols
            while i < len(p) and p[i][1] == col:
                dt = arow - p[i][0]
                # Both extremes start at 0, so plain max/min keeps
                # mdt1 >= 0 and mdt2 <= 0.
                mdt1 = max(mdt1, dt)
                mdt2 = min(mdt2, dt)
                i += 1

        upper_devs.append(mdt1)
        lower_devs.append(mdt2)

        cells = [
            " [%4d ..%3d] " % (mdt2, mdt1),
            "| %6d " % len(s[0]),
            "| %6d " % len(s[1]),
            " | %3d " % (len(s[0]) - len(s[1])),
            " | %3d " % m.get(m.rows - 1, m.cols - 1),
            "| %s " % os.path.basename(s[3]),
        ]
        print(" ".join(cells))

    print("=" * 52)
    if upper_devs:
        summary = [
            " Maximal deviation = [",
            str(min(lower_devs)),
            " .. ",
            str(max(upper_devs)),
            "]",
        ]
        print(" ".join(summary))
    else:
        # Nothing was processed (no pairs, or all skipped): report it
        # instead of failing with IndexError on an empty list.
        print(" Maximal deviation = [ n/a ] (no text pairs were processed)")
    print("=" * 52)
def step5_calc_minpath_dt(p_txt_folder = 'tnt', o_txt_folder = 'txt', size_limit = -1):
    """Print how far the minimal edit path strays from the matrix diagonal.

    For every text pair yielded by ``gen_string_pairs`` a distance matrix is
    filled, the minimal path is extracted and the largest positive and
    negative offsets of that path from the scaled diagonal are printed as one
    table row.  After all pairs the overall extremes are printed.

    Args:
        p_txt_folder: folder with the processed texts.
        o_txt_folder: folder with the original texts.
        size_limit: when positive, a pair is skipped if either text is
            longer than this value.

    Returns:
        None.  All results are written to standard output.
    """
    upper_devs = []  # per pair: largest offset that is >= 0
    lower_devs = []  # per pair: smallest offset that is <= 0
    # NOTE(review): collected but never reported.  It also takes the basename
    # of s[0], which is the text passed to fill_matrix, while the table row
    # below uses s[3] for the name -- confirm which one is intended.
    skipped = []

    # Every print emits a single pre-joined string so the output matches the
    # original space-separated multi-item print statements.
    print(":: Calculate minimal path deviation .")
    print(" :: processed text folder: %s" % (p_txt_folder,))
    print(" :: original text folder: %s" % (o_txt_folder,))
    print("-"*64)
    print(" Deviation | Text1 size | Text2 size | Text diff | Distance | ")
    print("-"*64)

    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        if size_limit > 0 and (len(s[1]) > size_limit or len(s[0]) > size_limit):
            skipped.append(os.path.basename(s[0]))
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        p = fMinPath()(m)
        # Alternative path builder kept for reference:
        # p = fLevPath(insPriority = 1)(m)

        # Walk the columns from last to first and consume the path entries
        # (row, col) belonging to each column.
        # Assumes the path is ordered by descending column -- TODO confirm.
        i = 0
        mdt1 = mdt2 = 0
        for col in range(m.cols - 1, -1, -1):
            # Row at which the scaled diagonal crosses this column.
            arow = (col * m.rows) // m.cols
            while i < len(p) and p[i][1] == col:
                dt = arow - p[i][0]
                # Both extremes start at 0, so plain max/min keeps
                # mdt1 >= 0 and mdt2 <= 0.
                mdt1 = max(mdt1, dt)
                mdt2 = min(mdt2, dt)
                i += 1

        upper_devs.append(mdt1)
        lower_devs.append(mdt2)

        cells = [
            " [%4d ..%3d] " % (mdt2, mdt1),
            "| %6d " % len(s[0]),
            "| %6d " % len(s[1]),
            " | %3d " % (len(s[0]) - len(s[1])),
            " | %3d " % m.get(m.rows-1, m.cols-1),
            "| %s " % os.path.basename(s[3]),
        ]
        print(" ".join(cells))

    print("=" * 52)
    if upper_devs:
        summary = [
            " Maximal deviation = [",
            str(min(lower_devs)),
            " .. ",
            str(max(upper_devs)),
            "]",
        ]
        print(" ".join(summary))
    else:
        # Nothing was processed (no pairs, or all skipped): report it
        # instead of failing with IndexError on an empty list.
        print(" Maximal deviation = [ n/a ] (no text pairs were processed)")
    print("=" * 52)
def test_ldistance(self):
    """Exercise the Levenshtein distance functors, matrices and min path.

    Checks, in this order: the scalar distance returned by both distance
    functors, the individual cells of the matrix filled by each functor, the
    steps of the minimal path through the second matrix, and the boolean
    result of fDistanceMatch for two sample pairs.
    """
    # Scalar distances; a fresh functor instance is created for every call.
    for distance_cls in (fClassicalLevDistance, fLevDistance):
        self.assertEqual(distance_cls()("Text", "Text"), 0)
        self.assertEqual(distance_cls()("Texmt", "sdText"), 3)

    expected_cells = [
        [1, 2, 2, 3, 4, 5],
        [2, 2, 3, 2, 3, 4],
        [3, 3, 3, 3, 2, 3],
        [4, 4, 4, 4, 3, 3],
        [5, 5, 5, 5, 4, 3],
    ]

    def check_cells(matrix):
        # Compare cell by cell, row-major, against the expected table.
        for r, expected_row in enumerate(expected_cells):
            for c, expected in enumerate(expected_row):
                self.assertEqual(expected, matrix.get(r, c))

    classical = DistanceMatrix()
    fClassicalLevDistance().fill_matrix("Texmt", "sdText", classical)
    check_cells(classical)

    # Here the value returned by fill_matrix is what gets inspected.
    m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
    check_cells(m)

    expected_path = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
    for idx, step in enumerate(fMinPath()(m)):
        self.assertEqual(step, expected_path[idx])

    # A new matcher is built from the functor class for each sample.
    for other, should_match in (("Text 2", True), ("Tessxt 2", False)):
        self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", other), should_match)
def test_ldistance(self):
    """Exercise the Levenshtein distance functors, matrices and min path.

    Checks, in this order: the scalar distance returned by both distance
    functors, the individual cells of the matrix filled by each functor, the
    steps of the minimal path through the second matrix, and the boolean
    result of fDistanceMatch for two sample pairs.
    """
    # Scalar distances; a fresh functor instance is created for every call.
    for functor_cls in (fClassicalLevDistance, fLevDistance):
        self.assertEqual(functor_cls()("Text", "Text"), 0)
        self.assertEqual(functor_cls()("Texmt", "sdText"), 3)

    expected_cells = [
        [1, 2, 2, 3, 4, 5],
        [2, 2, 3, 2, 3, 4],
        [3, 3, 3, 3, 2, 3],
        [4, 4, 4, 4, 3, 3],
        [5, 5, 5, 5, 4, 3],
    ]

    def assert_cells(matrix):
        # Compare cell by cell, row-major, against the expected table.
        for row_no, expected_row in enumerate(expected_cells):
            for col_no, expected in enumerate(expected_row):
                self.assertEqual(expected, matrix.get(row_no, col_no))

    filled_in_place = DistanceMatrix()
    fClassicalLevDistance().fill_matrix("Texmt", "sdText", filled_in_place)
    assert_cells(filled_in_place)

    # Here the value returned by fill_matrix is what gets inspected.
    m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
    assert_cells(m)

    expected_path = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
    for position, step in enumerate(fMinPath()(m)):
        self.assertEqual(step, expected_path[position])

    # A new matcher is built from the functor class for each sample.
    samples = (("Text 2", True), ("Tessxt 2", False))
    for candidate, verdict in samples:
        self.assertEqual(fDistanceMatch(fLevDistance)("Text 1", candidate), verdict)
def step4_calc_minpath(p_txt_folder="tnt", o_txt_folder="txt"):
    """Print the minimal path through the distance matrix of each text pair.

    Args:
        p_txt_folder: first folder passed to ``gen_string_pairs``
            (defaults to the previously hard-coded "tnt").
        o_txt_folder: second folder passed to ``gen_string_pairs``
            (defaults to the previously hard-coded "txt").

    Returns:
        None.  One path is printed per text pair.
    """
    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        # Only the first two items of each yielded entry are compared.
        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        # Single parenthesised expression: prints the same text as the
        # original print statement.
        print(fMinPath()(m))
# Start of the timed section; the elapsed time is printed at the end.
t = time.time()

# Pipeline steps -- uncomment the ones that should run:
# prepare_data()
# step1_fetch_text(os.path.join(data_dir, "xml"))
# step2_run_tnt(os.path.join(data_dir, "txt"))
# step3_calc_lev_distance()
# step4_calc_minpath()
# step5_calc_minpath_dt()
# step6_cleantnt_txt()
# step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000)

# Only needed by the commented-out diagonal variant further down.
from gravity.tae.match.lev_distance import fLevDistanceDiag

# Alternative sample pair:
# s1 = "Song about Alice dream"
# s2 = "Song al?ce dream"
s1 = " abc d dfg rer klm"
s2 = "abc dfg klm"

m = DistanceMatrix(111)
# fLevDistanceDiag(1).fill_matrix(s2, s1, m)
fLevDistance().fill_matrix(s2, s1, m)

# Render the matrix together with the minimal path through it.
print(m.toString(s2, s1, fMinPath()(m)))

# Joined with a single space, as the two-item print statement would do.
print(" ".join(["Elapsed time: ", str(time.time() - t)]))
# Data preparation steps, run in this order.
step1_fetch_text(os.path.join(data_dir, "xml"))
step2_run_tnt(os.path.join(data_dir, "txt"))
step6_cleantnt_txt()

# Start of the timed section; the elapsed time is printed at the end.
t = time.time()

# Pipeline steps -- uncomment the ones that should run:
# prepare_data()
# step1_fetch_text(os.path.join(data_dir, "xml"))
# step2_run_tnt(os.path.join(data_dir, "txt"))
# step3_calc_lev_distance()
# step4_calc_minpath()
# step5_calc_minpath_dt()
# step6_cleantnt_txt()
# step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000)

# Only needed by the commented-out diagonal variant further down.
from gravity.tae.match.lev_distance import fLevDistanceDiag

# Alternative sample pair:
# s1 = "Song about Alice dream"
# s2 = "Song al?ce dream"
s1 = " abc d dfg rer klm"
s2 = "abc dfg klm"

m = DistanceMatrix(111)
# fLevDistanceDiag(1).fill_matrix(s2, s1, m)
fLevDistance().fill_matrix(s2, s1, m)

# Render the matrix together with the minimal path through it.
print(m.toString(s2, s1, fMinPath()(m)))

# Joined with a single space, as the two-item print statement would do.
print(" ".join(["Elapsed time: ", str(time.time() - t)]))