def test_trigram(self):
    """Check trigram parsing and similarity for two near-identical sentences.

    Both sentences are parsed by the same ``NGramSpace(3)``; the expected
    parse results have no id in common, so overlap and Jaccard are expected
    to be 0 in both argument orders.
    """
    space = NGramSpace(3)
    first = space.parse("This is a sentence")
    second = space.parse("This is another sentence")

    # Expected parse results: two ids per sentence, none shared.
    self.assertEqual([1, 2], first)
    self.assertEqual([3, 4], second)

    # Both measures are checked in both argument orders.
    both_orders = ((first, second), (second, first))
    for left, right in both_orders:
        self.assertEqual(0, overlap(left, right))
    for left, right in both_orders:
        self.assertEqual(0, jaccard(left, right))
def test_bigram(self):
    """Check bigram parsing and similarity for two near-identical sentences.

    Both sentences are parsed by the same ``NGramSpace(2)``; the expected
    parse results share exactly one id (``1``), giving an overlap of 1 and a
    Jaccard value of 1/5 in both argument orders.
    """
    space = NGramSpace(2)
    first = space.parse("This is a sentence")
    second = space.parse("This is another sentence")

    # Expected parse results: three ids each, with id 1 common to both.
    self.assertEqual([1, 2, 3], first)
    self.assertEqual([1, 4, 5], second)

    # One shared id out of five distinct ids overall.
    expected_jaccard = 1.0 / 5.0
    both_orders = ((first, second), (second, first))
    for left, right in both_orders:
        self.assertEqual(1, overlap(left, right))
    for left, right in both_orders:
        self.assertEqual(expected_jaccard, jaccard(left, right))
def test_trigram(self):
    """Verify that the two sample sentences have no trigram in common.

    The same ``NGramSpace(3)`` parses both sentences. The expected results
    are disjoint, so ``overlap`` and ``jaccard`` must both be 0 whichever
    sentence is passed first.
    """
    trigrams = NGramSpace(3)
    plain = trigrams.parse('This is a sentence')
    other = trigrams.parse('This is another sentence')

    expected_plain, expected_other = [1, 2], [3, 4]
    self.assertEqual(expected_plain, plain)
    self.assertEqual(expected_other, other)

    # Exercise each measure with the arguments in both orders.
    for measure in (overlap, jaccard):
        self.assertEqual(0, measure(plain, other))
        self.assertEqual(0, measure(other, plain))
def test_bigram(self):
    """Verify that the two sample sentences share exactly one bigram.

    The same ``NGramSpace(2)`` parses both sentences. The expected results
    have a single id in common, so ``overlap`` must be 1 and ``jaccard``
    must be 1/5 whichever sentence is passed first.
    """
    bigrams = NGramSpace(2)
    plain = bigrams.parse('This is a sentence')
    other = bigrams.parse('This is another sentence')

    expected_plain, expected_other = [1, 2, 3], [1, 4, 5]
    self.assertEqual(expected_plain, plain)
    self.assertEqual(expected_other, other)

    # Each measure is paired with its expected value and checked in both
    # argument orders.
    expectations = ((1, overlap), (1.0 / 5.0, jaccard))
    for expected, measure in expectations:
        self.assertEqual(expected, measure(plain, other))
        self.assertEqual(expected, measure(other, plain))
def __init__(self, docs):
    """Compute all pairwise Jaccard distances and merge zero-distance pairs.

    Args:
        docs: Sequence of documents in whatever form ``jaccard`` accepts
            (presumably parsed n-gram id lists -- confirm against callers).

    Side effects:
        Sets ``num_docs``, ``assignments`` (initially ``[0, 1, ..., n-1]``)
        and ``distance`` (a ``SymmetricMatrix`` filled with
        ``1.0 - jaccard(docs[i], docs[j])`` for every ``j <= i``), prints a
        progress line every 1,000,000 distances, and calls ``self.merge`` for
        every pair at distance 0 whose assignments still differ.
    """
    self.num_docs = len(docs)
    # Materialise a real list: on Python 3 ``range`` is an immutable lazy
    # object, whereas the original Python 2 code got a plain list here.
    self.assignments = list(range(self.num_docs))
    self.distance = SymmetricMatrix(self.num_docs)

    # The loop below visits every (i, j) with j <= i, diagonal included,
    # i.e. n * (n + 1) / 2 entries; use floor division so the total stays
    # an int on Python 3 as well.
    total = self.num_docs * (self.num_docs + 1) // 2
    count = 0
    for i in range(self.num_docs):
        for j in range(i + 1):
            self.distance[i, j] = 1.0 - jaccard(docs[i], docs[j])
            count += 1
            if count % 1000000 == 0:
                # Parenthesised single argument: valid on Python 2 and 3.
                print("Computed %d distances out of %d..." % (count, total))

    # Merge pairs at distance 0. The assignment check is re-evaluated on
    # every iteration because earlier merges may already have joined the two.
    for i in range(self.num_docs):
        for j in range(i):
            if (self.distance[i, j] == 0
                    and self.assignments[i] != self.assignments[j]):
                self.merge(i, j)
def __init__(self, docs):
    """Initialise the clustering state from a collection of documents.

    Fills ``self.distance`` with ``1.0 - jaccard(docs[i], docs[j])`` for each
    pair with ``j <= i``, starts every document under its own assignment
    index, and then calls ``self.merge(i, j)`` for each pair whose distance
    is exactly 0 and whose assignments still differ.

    Args:
        docs: Indexable collection of documents; each element is passed
            unchanged to ``jaccard``.

    Progress is reported on standard output once per 1,000,000 computed
    distances.
    """
    doc_count = len(docs)
    self.num_docs = doc_count
    # ``list(...)`` keeps this a mutable list on Python 3, matching what
    # ``range`` returned on Python 2.
    self.assignments = list(range(doc_count))
    self.distance = SymmetricMatrix(doc_count)

    # Lower triangle plus diagonal: n * (n + 1) / 2 entries. Floor division
    # keeps the value integral regardless of Python version.
    expected_total = doc_count * (doc_count + 1) // 2
    computed = 0
    for row in range(doc_count):
        for col in range(row + 1):
            self.distance[row, col] = 1.0 - jaccard(docs[row], docs[col])
            computed += 1
            if computed % 1000000 == 0:
                # Single parenthesised expression, so this prints the same
                # text under both the Python 2 statement and the Python 3
                # function.
                print("Computed %d distances out of %d..."
                      % (computed, expected_total))

    # Strictly-lower triangle only: a document is never merged with itself.
    for row in range(doc_count):
        for col in range(row):
            if self.distance[row, col] != 0:
                continue
            # Skip pairs that already carry the same assignment.
            if self.assignments[row] != self.assignments[col]:
                self.merge(row, col)