def test_levenshtein(self):
    # Comparing a string with itself needs no edits at all.
    self.assertEqual(metrics.levenshtein("gallahad", "gallahad"), 0)
    # One insertion, one deletion and one replacement: distance 3.
    self.assertEqual(metrics.levenshtein("gallahad", "_g_llaha"), 3)
def test_levenshtein(self):
    # Each entry pairs a string to compare against "gallahad" with the
    # edit distance that is expected for it:
    #   - the identical string            -> 0
    #   - 1 insert, 1 delete, 1 replace   -> 3
    expectations = (
        ("gallahad", 0),
        ("_g_llaha", 3),
    )
    for other, expected_distance in expectations:
        self.assertEqual(metrics.levenshtein("gallahad", other), expected_distance)
    # Progress marker naming the function that has just been exercised.
    print("pattern.metrics.levenshtein()")
def duplicates(options, parser):
    """Print the near-duplicate segment pairs found in one corpus.

    Every unordered pair of segments attached to the corpus named by
    ``options.corpus`` is compared on its stemmed text:

    * pairs with identical stemmed text are counted but not printed;
    * pairs whose lengths differ by more than a tenth of the longer text
      are skipped;
    * the remaining pairs are printed (and counted) when
      ``1 - levenshtein(a, b) / longest_length`` is at least 0.75.

    A final line reports how many pairs were counted.

    Args:
        options: object exposing a ``corpus`` attribute (the corpus name).
        parser: passed through unchanged to ``error()`` when the corpus
            cannot be found.

    Returns:
        Whatever ``error(...)`` returns when the corpus is missing,
        otherwise ``None``.
    """
    from itertools import combinations

    # Get the corpus, else bail out.  Only the "no such row" case is reported
    # as "not found"; any other failure propagates instead of being
    # mislabelled.  NOTE(review): assumes Corpus is a Django model (it is
    # queried through ``Corpus.objects``) and therefore has ``DoesNotExist``.
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)

    # Read every segment's texts exactly once.  Indexing the queryset inside
    # the nested loop would go back to the database for every single access.
    document_segments = Document_Segment.objects.filter(document__corpus=corpus)
    texts = []
    for document_segment in document_segments:
        segment = document_segment.segment
        texts.append((segment.stemmed, segment.content))

    found = 0
    # combinations() yields the pairs in the same i < j order as a nested
    # index loop would.
    for (stemmed_a, content_a), (stemmed_b, content_b) in combinations(texts, 2):
        if stemmed_a == stemmed_b:
            # Exact matches count as duplicates but are not printed.
            found += 1
            continue

        longest = max(len(stemmed_a), len(stemmed_b))
        # Cheap pre-filter: too different in length to be worth comparing.
        if abs(len(stemmed_a) - len(stemmed_b)) > longest / 10.0:
            continue

        # Normalised similarity; ``longest`` is never 0 here because two
        # empty strings are equal and were handled above.
        ratio = 1 - levenshtein(stemmed_a, stemmed_b) / float(longest)
        if ratio < .75:
            continue

        # Single-argument print calls produce the same text on Python 2 and 3.
        print("")
        print(ratio)
        print("  %s %s" % (stemmed_a, stemmed_b))
        print("  %s %s" % (content_a, content_b))
        print("")
        found += 1

    print("found %s duplicates" % found)