예제 #1
0
    def test_pairs(self):
        """Check closest_pair and farthest_pair on three index subsets of test_docs."""
        space = NGramSpace(1)
        clustering = Clustering([space.parse(text) for text in test_docs])

        # Each entry is (expected pair, document indices), in checking order.
        closest_cases = (
            ((1, 0), [0, 1, 2]),
            ((5, 3), [3, 4, 5]),
            ((7, 6), [6, 7]),
        )
        for expected, indices in closest_cases:
            self.assertEqual(expected, clustering.closest_pair(indices))

        farthest_cases = (
            ((2, 0), [0, 1, 2]),
            ((5, 4), [3, 4, 5]),
            ((7, 6), [6, 7]),
        )
        for expected, indices in farthest_cases:
            self.assertEqual(expected, clustering.farthest_pair(indices))
예제 #2
0
    def test_pairs(self):
        """Run closest_pair, then farthest_pair, over the same three index groups."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clustering = Clustering(parsed)

        # (method name, expected pair, indices); the method is looked up on
        # the clustering only when its row is reached.
        checks = [
            ('closest_pair', (1, 0), [0, 1, 2]),
            ('closest_pair', (5, 3), [3, 4, 5]),
            ('closest_pair', (7, 6), [6, 7]),
            ('farthest_pair', (2, 0), [0, 1, 2]),
            ('farthest_pair', (5, 4), [3, 4, 5]),
            ('farthest_pair', (7, 6), [6, 7]),
        ]
        for method_name, expected, indices in checks:
            found = getattr(clustering, method_name)(indices)
            self.assertEqual(expected, found)
예제 #3
0
    def test_nonseeded_clustering(self):
        """Step through min_link/merge on test_docs and check each reported link."""
        space = NGramSpace(1)
        clustering = Clustering([space.parse(text) for text in test_docs])

        # The first two reported links are merged exactly as reported.
        for link in ((1, 0), (2, 1)):
            self.assertEqual(link, clustering.min_link())
            clustering.merge(*link)

        # Either (4, 3) or (5, 3) is accepted as the next link.
        next_link = clustering.min_link()
        self.assertTrue(next_link in [(4, 3), (5, 3)])
        for other in (4, 5):
            clustering.merge(3, other)

        self.assertEqual((7, 6), clustering.min_link())
예제 #4
0
    def test_distance(self):
        """Check entries of ``Clustering.distance`` for three short documents.

        The zero entries at [0, 0], [1, 1] and [2, 2] are compared exactly.
        The fractional entries are compared with ``assertAlmostEqual`` so the
        test does not hinge on bit-exact floating-point results.
        """
        raw_docs = ["a b c", "b c d", "d e f"]
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual(0, c.distance[0, 0])
        self.assertAlmostEqual(0.5, c.distance[1, 0])
        self.assertEqual(0, c.distance[1, 1])
        self.assertAlmostEqual(1.0, c.distance[2, 0])
        self.assertAlmostEqual(0.8, c.distance[2, 1])
        self.assertEqual(0, c.distance[2, 2])
예제 #5
0
    def test_nonseeded_clustering(self):
        """Alternate min_link checks and merge calls over the parsed test_docs."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clustering = Clustering(parsed)

        first_link = clustering.min_link()
        self.assertEqual((1, 0), first_link)
        clustering.merge(1, 0)

        second_link = clustering.min_link()
        self.assertEqual((2, 1), second_link)
        clustering.merge(2, 1)

        # Two different links are acceptable at this step.
        third_link = clustering.min_link()
        acceptable = [(4, 3), (5, 3)]
        self.assertTrue(third_link in acceptable)
        clustering.merge(3, 4)
        clustering.merge(3, 5)

        last_link = clustering.min_link()
        self.assertEqual((7, 6), last_link)
예제 #6
0
    def test_distance(self):
        """Check entries of ``Clustering.distance`` for three short documents.

        The zero entries at [0, 0], [1, 1] and [2, 2] are compared exactly.
        The fractional entries are compared with ``assertAlmostEqual`` so the
        test does not hinge on bit-exact floating-point results.
        """
        raw_docs = ['a b c', 'b c d', 'd e f']
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual(0, c.distance[0, 0])
        self.assertAlmostEqual(0.5, c.distance[1, 0])
        self.assertEqual(0, c.distance[1, 1])
        self.assertAlmostEqual(1.0, c.distance[2, 0])
        self.assertAlmostEqual(0.8, c.distance[2, 1])
        self.assertEqual(0, c.distance[2, 2])
예제 #7
0
    def test_nearest_neighbors(self):
        """Check closest_neighbors for increasing neighbor counts."""
        space = NGramSpace(1)
        clustering = Clustering([space.parse(text) for text in test_docs])

        # pp_distance is called over every document index before the checks.
        clustering.pp_distance(range(len(test_docs)))

        # Asking for one more neighbor of [0] extends the previous answer.
        ranked_for_zero = [1, 2, 3, 5]
        for count in (1, 2, 3, 4):
            self.assertEqual(
                ranked_for_zero[:count],
                clustering.closest_neighbors([0], count),
            )

        ranked_for_pair = [5, 1]
        for count in (1, 2):
            self.assertEqual(
                ranked_for_pair[:count],
                clustering.closest_neighbors([3, 4], count),
            )
예제 #8
0
    def test_trigram(self):
        """Parse two sentences with NGramSpace(3); check ids, overlap and jaccard."""
        trigrams = NGramSpace(3)

        first = trigrams.parse("This is a sentence")
        second = trigrams.parse("This is another sentence")

        self.assertEqual([1, 2], first)
        self.assertEqual([3, 4], second)

        # Each measure is checked in both argument orders.
        both_orders = ((first, second), (second, first))
        for left, right in both_orders:
            self.assertEqual(0, overlap(left, right))
        for left, right in both_orders:
            self.assertEqual(0, jaccard(left, right))
예제 #9
0
    def test_bigram(self):
        """Parse two sentences with NGramSpace(2); check ids, overlap and jaccard.

        The fractional jaccard value is compared with ``assertAlmostEqual``
        rather than exact float equality, so the test does not depend on how
        the implementation rounds the division.
        """
        bigrams = NGramSpace(2)

        x = bigrams.parse("This is a sentence")
        y = bigrams.parse("This is another sentence")

        self.assertEqual([1, 2, 3], x)
        self.assertEqual([1, 4, 5], y)

        # Both measures are checked in both argument orders.
        self.assertEqual(1, overlap(x, y))
        self.assertEqual(1, overlap(y, x))

        self.assertAlmostEqual(1.0 / 5.0, jaccard(x, y))
        self.assertAlmostEqual(1.0 / 5.0, jaccard(y, x))
예제 #10
0
    def test_nearest_neighbors(self):
        """Table-driven check of closest_neighbors after a pp_distance call."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clustering = Clustering(parsed)

        all_indices = range(len(test_docs))
        clustering.pp_distance(all_indices)

        # (expected neighbors, query indices, number of neighbors requested)
        cases = (
            ([1], [0], 1),
            ([1, 2], [0], 2),
            ([1, 2, 3], [0], 3),
            ([1, 2, 3, 5], [0], 4),
            ([5], [3, 4], 1),
            ([5, 1], [3, 4], 2),
        )
        for expected, query, count in cases:
            self.assertEqual(expected, clustering.closest_neighbors(query, count))
예제 #11
0
    def test_trigram(self):
        """Two sentences parsed with NGramSpace(3) get separate ids and score 0."""
        space = NGramSpace(3)

        sentence_a = space.parse('This is a sentence')
        sentence_b = space.parse('This is another sentence')

        expected_ids = (([1, 2], sentence_a), ([3, 4], sentence_b))
        for expected, parsed in expected_ids:
            self.assertEqual(expected, parsed)

        # overlap is checked in both argument orders, then jaccard likewise.
        self.assertEqual(0, overlap(sentence_a, sentence_b))
        self.assertEqual(0, overlap(sentence_b, sentence_a))
        self.assertEqual(0, jaccard(sentence_a, sentence_b))
        self.assertEqual(0, jaccard(sentence_b, sentence_a))
예제 #12
0
    def test_bigram(self):
        """Parse two sentences with NGramSpace(2); check ids, overlap and jaccard.

        The fractional jaccard value is compared with ``assertAlmostEqual``
        rather than exact float equality, so the test does not depend on how
        the implementation rounds the division.
        """
        bigrams = NGramSpace(2)

        x = bigrams.parse('This is a sentence')
        y = bigrams.parse('This is another sentence')

        self.assertEqual([1, 2, 3], x)
        self.assertEqual([1, 4, 5], y)

        # Both measures are checked in both argument orders.
        self.assertEqual(1, overlap(x, y))
        self.assertEqual(1, overlap(y, x))

        self.assertAlmostEqual(1.0 / 5.0, jaccard(x, y))
        self.assertAlmostEqual(1.0 / 5.0, jaccard(y, x))
예제 #13
0
    def test_clustering(self):
        """Merge three short documents in two steps, checking assignments each time."""
        raw_docs = ["a b c", "b c d", "d e f"]
        space = NGramSpace(1)
        clustering = Clustering([space.parse(text) for text in raw_docs])

        # (expected min_link, arguments passed to merge, assignments afterwards)
        steps = (
            ((1, 0), (1, 0), [1, 1, 2]),
            ((2, 1), (2, 0), [2, 2, 2]),
        )
        for expected_link, merge_args, expected_assignments in steps:
            self.assertEqual(expected_link, clustering.min_link())
            clustering.merge(*merge_args)
            self.assertEqual(expected_assignments, clustering.assignments)
예제 #14
0
    def test_clustering(self):
        """Check min_link and assignments before and after two merge calls."""
        raw_docs = ['a b c', 'b c d', 'd e f']
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in raw_docs]
        clustering = Clustering(parsed)

        # First step: the reported link is (1, 0) and is merged as such.
        first_link = clustering.min_link()
        self.assertEqual((1, 0), first_link)
        clustering.merge(1, 0)
        self.assertEqual([1, 1, 2], clustering.assignments)

        # Second step: the reported link is (2, 1); merge is called with (2, 0).
        second_link = clustering.min_link()
        self.assertEqual((2, 1), second_link)
        clustering.merge(2, 0)
        self.assertEqual([2, 2, 2], clustering.assignments)
예제 #15
0
def setup(source, pdf_path):
    """Parse the CSV file at ``source`` into documents and cluster them.

    Each CSV row is passed to ``extract_row`` together with ``pdf_path`` and
    one shared ``NGramSpace(4)``. The ``parsed`` attribute of every resulting
    document is then handed to ``Clustering``.

    Args:
        source: Path of the CSV file to read.
        pdf_path: Forwarded unchanged to ``extract_row`` for every row.

    Returns:
        A ``(clustering, docs)`` tuple.
    """
    ngrams = NGramSpace(4)
    # A single-argument parenthesised print gives the same output on
    # Python 2 and also parses on Python 3.
    print("parsing documents at %s..." % source)
    # Read all rows inside a context manager so the file handle is closed
    # promptly instead of being leaked.
    with open(source, 'r') as handle:
        docs = [
            extract_row(row, pdf_path, ngrams)
            for row in csv.DictReader(handle)
        ]
    print("clustering %d documents..." % len(docs))
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)