def test_pairs(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
    self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.closest_pair([6, 7]))

    self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
    self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.farthest_pair([6, 7]))
def test_nonseeded_clustering(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)

    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 1)

    self.assertTrue(c.min_link() in [(4, 3), (5, 3)])
    c.merge(3, 4)
    c.merge(3, 5)

    self.assertEqual((7, 6), c.min_link())
def test_distance(self):
    raw_docs = ["a b c", "b c d", "d e f"]
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)

    self.assertEqual(0, c.distance[0, 0])
    self.assertEqual(0.5, c.distance[1, 0])
    self.assertEqual(0, c.distance[1, 1])
    self.assertEqual(1.0, c.distance[2, 0])
    self.assertEqual(0.8, c.distance[2, 1])
    self.assertEqual(0, c.distance[2, 2])
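# Note on the expected distances above: the values are consistent with the
# Jaccard distance 1 - |A & B| / |A | B| over unigrams. "a b c" and "b c d"
# share {b, c} out of {a, b, c, d}, giving 1 - 2/4 = 0.5; "a b c" and "d e f"
# share nothing, giving 1.0; "b c d" and "d e f" share {d} out of
# {b, c, d, e, f}, giving 1 - 1/5 = 0.8.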
def test_nearest_neighbors(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    c.pp_distance(range(0, len(test_docs)))

    self.assertEqual([1], c.closest_neighbors([0], 1))
    self.assertEqual([1, 2], c.closest_neighbors([0], 2))
    self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
    self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))

    self.assertEqual([5], c.closest_neighbors([3, 4], 1))
    self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))
def test_trigram(self):
    trigrams = NGramSpace(3)

    x = trigrams.parse("This is a sentence")
    y = trigrams.parse("This is another sentence")

    self.assertEqual([1, 2], x)
    self.assertEqual([3, 4], y)

    self.assertEqual(0, overlap(x, y))
    self.assertEqual(0, overlap(y, x))
    self.assertEqual(0, jaccard(x, y))
    self.assertEqual(0, jaccard(y, x))
def test_bigram(self):
    bigrams = NGramSpace(2)

    x = bigrams.parse("This is a sentence")
    y = bigrams.parse("This is another sentence")

    self.assertEqual([1, 2, 3], x)
    self.assertEqual([1, 4, 5], y)

    self.assertEqual(1, overlap(x, y))
    self.assertEqual(1, overlap(y, x))
    self.assertEqual(1.0 / 5.0, jaccard(x, y))
    self.assertEqual(1.0 / 5.0, jaccard(y, x))
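# Note on the expectations above: each four-word sentence yields three bigrams,
# and only the opening bigram (id 1, "This is") appears in both, so overlap is
# 1 and the Jaccard similarity is 1 shared bigram out of 5 distinct ones, i.e.
# 1/5. With trigrams no three-word window is shared, so both overlap and
# Jaccard are 0.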
def test_clustering(self):
    raw_docs = ["a b c", "b c d", "d e f"]
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)
    self.assertEqual([1, 1, 2], c.assignments)

    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 0)
    self.assertEqual([2, 2, 2], c.assignments)
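# Note on the assignments above: after merge(1, 0), documents 0 and 1 share one
# cluster label while document 2 keeps its own; after merge(2, 0), all three
# share a single label. The labels appear to track the highest document index
# in each cluster.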
def setup(source, pdf_path):
    ngrams = NGramSpace(4)

    print "parsing documents at %s..." % source
    docs = [extract_row(row, pdf_path, ngrams) for row in csv.DictReader(open(source, 'r'))]

    print "clustering %d documents..." % len(docs)
    clustering = Clustering([doc.parsed for doc in docs])

    return (clustering, docs)
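# A minimal sketch of how setup() might be invoked; the CSV path and PDF
# directory below are illustrative placeholders, not paths from the project.
if __name__ == '__main__':
    clustering, docs = setup('documents.csv', 'pdfs/')
    print "clustered %d documents" % len(docs)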