Пример #1
0
 def test_information_gain(self):
     """Assert the information gain weight of the "wind" feature.

     The data follows the example from
     http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf

     Seven documents carry a single "wind" feature. The three wind=0
     documents are all type=True; of the four wind=1 documents one is
     type=True and three are type=False. The gain reported by
     Corpus.information_gain("wind") is expected to be 0.52, compared
     to two decimal places.
     """
     # (wind, type) pairs, kept in the order of the original example.
     samples = [
         (1, False),
         (0, True),
         (0, True),
         (0, True),
         (1, True),
         (1, False),
         (1, False),
     ]
     v = vector.Corpus([
         vector.Document({"wind": wind}, type=label)
         for wind, label in samples
     ])
     self.assertAlmostEqual(v.information_gain("wind"), 0.52, places=2)
     # Progress label; the module name was previously misspelled "patten".
     # The parenthesized single-argument form prints the same text with the
     # Python 2 print statement and the Python 3 print function.
     print("pattern.vector.Corpus.information_gain()")
Пример #2
0
def corpus(top=None):
    """ Returns a Corpus of e-mail messages read from corpora/spam-apache.csv.
        Each row is unpacked as (score, message); the document type is
        int(score) > 0, so type=True => HAM and type=False => SPAM.
        Documents are mostly of a technical nature (developer forum posts).
        Every message is wrapped in a Document with stemmer="porter";
        the given top is passed through to each Document unchanged.
    """
    path = os.path.join(PATH, "corpora", "spam-apache.csv")
    return vector.Corpus([
        vector.Document(text, stemmer="porter", top=top, type=int(label) > 0)
        for label, text in Datasheet.load(path)
    ])
Пример #3
0
 def test_lsa_concepts(self):
     """Assert the LSA concept space of a four-document cats/dogs corpus.

     The corpus is reduced to two dimensions. The test then checks the
     concept vectors, the per-document concept weights and
     LSA.transform() for a document that is not in the corpus.
     """
     # The import only probes for numpy; when it is missing the test
     # returns without asserting anything.
     try:
         import numpy
     except ImportError:
         return
     documents = (
         vector.Document("cats purr"),
         vector.Document("cats meow"),
         vector.Document("dogs howl"),
         vector.Document("dogs bark"),
     )
     corpus = vector.Corpus(documents)
     corpus.reduce(2)
     cat_words = ("purr", "meow")
     dog_words = ("howl", "bark")
     # Intuitively, two concepts are expected:
     # one grouping cats + purr + meow, one grouping dogs + howl + bark.
     # Both indices start at 0 and are overwritten when a concept matches.
     cat_index = dog_index = 0
     for index, concept in enumerate(corpus.lsa.concepts):
         self.assertTrue(isinstance(concept, dict))
         if concept["cats"] > 0.5:
             for word in cat_words:
                 self.assertTrue(concept[word] > 0.5)
             for word in dog_words:
                 self.assertTrue(concept[word] == 0.0)
             cat_index = index
         if concept["dogs"] > 0.5:
             for word in dog_words:
                 self.assertTrue(concept[word] > 0.5)
             for word in cat_words:
                 self.assertTrue(concept[word] == 0.0)
             dog_index = index
     # A "cat" document should score high on the "cat" concept only,
     # and a "dog" document should score high on the "dog" concept only.
     cat_vector = corpus.lsa[corpus.documents[0].id]
     dog_vector = corpus.lsa[corpus.documents[2].id]
     self.assertTrue(cat_vector[cat_index] > 0.7)
     self.assertTrue(cat_vector[dog_index] == 0.0)
     self.assertTrue(dog_vector[cat_index] == 0.0)
     self.assertTrue(dog_vector[dog_index] > 0.7)
     # LSA.transform() for an unknown document mixing both topics.
     unseen = corpus.lsa.transform(vector.Document("cats dogs"))
     for position in (0, 1):
         self.assertAlmostEqual(unseen[position], 0.34, places=2)
     for label in ("pattern.vector.LSA.concepts",
                   "pattern.vector.LSA.transform()"):
         print(label)