예제 #1
0
 def test_information_gain(self):
     # Assert information gain (and gain ratio) against hand-checked examples.
     # Example from
     # http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
     # "wind"=1 occurs in three False documents and one True document;
     # "wind"=0 occurs in three True documents.
     m = vector.Model([
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 1}, type=True),
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 1}, type=False)], weight=None
     )
     self.assertAlmostEqual(m.information_gain("wind"), 0.52, places=2)
     # Example from http://rutcor.rutgers.edu/~amai/aimath02/PAPERS/14.pdf
     # A single feature "3" with ten different-valued observations.
     m = vector.Model([
         vector.Document({"3": 1}, type=True),
         vector.Document({"3": 5}, type=True),
         vector.Document({"3": 1}, type=False),
         vector.Document({"3": 7}, type=True),
         vector.Document({"3": 2}, type=False),
         vector.Document({"3": 2}, type=True),
         vector.Document({"3": 6}, type=False),
         vector.Document({"3": 4}, type=True),
         vector.Document({"3": 0}, type=False),
         vector.Document({"3": 9}, type=True)], weight=None
     )
     # NOTE(review): ig()/gr() are presumably short aliases of
     # information_gain()/gain_ratio(), as the labels below suggest - confirm.
     self.assertAlmostEqual(m.ig("3"), 0.571, places=3)
     self.assertAlmostEqual(m.gr("3"), 0.195, places=3)
     # Report which API members this test covers.
     print("pattern.vector.Model.information_gain()")
     print("pattern.vector.Model.gain_ratio()")
예제 #2
0
 def setUp(self):
     # Shared fixture: two "cat" and two "dog" documents whose type labels
     # contain non-ASCII characters.
     cat_type, dog_type = u"cåt", u"døg"
     corpus = (
         vector.Document("cats purr", name="cat1", type=cat_type),
         vector.Document("cats meow", name="cat2", type=cat_type),
         vector.Document("dogs howl", name="dog1", type=dog_type),
         vector.Document("dogs bark", name="dog2", type=dog_type),
     )
     self.model = vector.Model(documents=corpus)
예제 #3
0
 def test_condensed_nearest_neighbor(self):
     # CNN data reduction must return fewer items than the model holds.
     docs = (
         vector.Document("woof", type="dog"),
         vector.Document("meow", type="cat"),  # redundant
         vector.Document("meow meow", type="cat"),
     )
     m = vector.Model(docs)
     reduced_size = len(m.cnn())
     self.assertTrue(reduced_size < len(m))
     print("pattern.vector.Model.condensed_nearest_neighbor()")
예제 #4
0
 def test_tfidf(self):
     # Assert tf-idf for documents not in a model.
     rows = [[0.0, 0.4, 0.6], [0.6, 0.4, 0.0]]
     # Turn every row into an {index: value} feature dict.
     features = [dict(enumerate(row)) for row in rows]
     documents = [vector.Document(f) for f in features]
     m = vector.Model(documents, weight=vector.TFIDF)
     # The stand-alone tf_idf() must agree with the model's own weighting.
     weighted = [vector.sparse(w) for w in vector.tf_idf(features)]
     first, second = weighted[0], weighted[1]
     self.assertEqual(sorted(m[0].vector.items()), sorted(first.items()))
     self.assertAlmostEqual(first[2], 0.42, places=2)
     self.assertAlmostEqual(second[0], 0.42, places=2)
     print("pattern.vector.tf_idf()")
예제 #5
0
def model(top=None):
    """ Builds a Model from the e-mail messages in corpora/spam-apache.csv.
        Each row is read as a (score, message) pair; the document type is
        True (HAM) when int(score) > 0 and False (SPAM) otherwise.
        Documents are mostly of a technical nature (developer forum posts).
        The top argument is passed through to every Document.
    """
    path = os.path.join(PATH, "corpora", "spam-apache.csv")
    return vector.Model([
        vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        for score, message in Datasheet.load(path)
    ])
예제 #6
0
 def test_classifier(self):
     # Assert that the model classifier is correctly saved and loaded.
     p = "test.model.tmp"
     v = vector.Model([vector.Document("chirp", type="bird")])
     v.train(vector.SVM)
     try:
         v.save(p)
         v = vector.Model.load(p)
         self.assertTrue(isinstance(v.classifier, vector.SVM))
     finally:
         # Always remove the temporary file, even when saving, loading or
         # the assertion fails, so a failed run leaves nothing behind.
         if os.path.exists(p):
             os.unlink(p)
     print("pattern.vector.Model.classifier")
     print("pattern.vector.Model.train()")
예제 #7
0
 def test_information_gain(self):
     # Assert information gain weights.
     # Example from http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
     # "wind"=1 occurs in three False documents and one True document;
     # "wind"=0 occurs in three True documents.
     v = vector.Model([
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 1}, type=True),
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 1}, type=False)
     ], weight=vector.TF)
     self.assertAlmostEqual(v.information_gain("wind"), 0.52, places=2)
     # print() with a single parenthesized string parses on both Python 2
     # and Python 3; the bare print statement is a SyntaxError on Python 3.
     print("pattern.vector.Model.information_gain()")
예제 #8
0
 def test_lsa_concepts(self):
     # Do nothing when numpy cannot be imported
     # (presumably LSA depends on it - confirm).
     try:
         import numpy
     except ImportError:
         return
     # Assert LSA concept space.
     texts = ("cats purr", "cats meow", "dogs howl", "dogs bark")
     m = vector.Model(tuple(vector.Document(s) for s in texts))
     m.reduce(2)
     # Intuitively, we'd expect two concepts:
     # 1) with cats + purr + meow grouped together,
     # 2) with dogs + howl + bark grouped together.
     cat_words, dog_words = ("purr", "meow"), ("howl", "bark")
     cat_idx = dog_idx = 0
     for idx, concept in enumerate(m.lsa.concepts):
         self.assertTrue(isinstance(concept, dict))
         if concept["cats"] > 0.5:
             # The "cat" concept: cat words weigh in, dog words do not.
             for word in cat_words:
                 self.assertTrue(concept[word] > 0.5)
             for word in dog_words:
                 self.assertTrue(concept[word] == 0.0)
             cat_idx = idx
         if concept["dogs"] > 0.5:
             # The "dog" concept: dog words weigh in, cat words do not.
             for word in dog_words:
                 self.assertTrue(concept[word] > 0.5)
             for word in cat_words:
                 self.assertTrue(concept[word] == 0.0)
             dog_idx = idx
     # A "cat" document should score high on the "cat" concept only,
     # and a "dog" document should score high on the "dog" concept only.
     cat_vec = m.lsa[m.documents[0].id]
     dog_vec = m.lsa[m.documents[2].id]
     self.assertTrue(cat_vec.get(cat_idx, 0) > 0.7)
     self.assertTrue(cat_vec.get(dog_idx, 0) == 0.0)
     self.assertTrue(dog_vec.get(cat_idx, 0) == 0.0)
     self.assertTrue(dog_vec.get(dog_idx, 0) > 0.7)
     # Assert LSA.transform() for unknown documents.
     unseen = m.lsa.transform(vector.Document("cats dogs"))
     self.assertAlmostEqual(unseen[0], 0.34, places=2)
     self.assertAlmostEqual(unseen[1], 0.34, places=2)
     print("pattern.vector.LSA.concepts")
     print("pattern.vector.LSA.transform()")
예제 #9
0
 def test_feature_selection(self):
     # Assert information gain feature selection.
     cat_doc = vector.Document("the cat sat on the mat", type="cat", stopwords=True)
     dog_doc = vector.Document("the dog howled at the moon", type="dog", stopwords=True)
     m = vector.Model((cat_doc, dog_doc))
     selected = m.feature_selection(top=3, method=vector.IG, threshold=0.0)
     self.assertEqual(selected, ["at", "cat", "dog"])
     # Assert Model.filter(): only the selected features remain as terms.
     filtered = m.filter(selected)
     for term in ("at", "cat", "dog"):
         self.assertTrue(term in filtered.terms)
     for term in ("the", "mat"):
         self.assertTrue(term not in filtered.terms)
     print("pattern.vector.Model.feature_selection()")
     print("pattern.vector.Model.filter()")
def lsa_apply(df):
    """Build a TF-IDF weighted model from df['abstract'] and reduce it.

    One pv.Document is created per entry of df['abstract']. Returns
    whatever Model.reduce(2) returns (presumably the LSA object - confirm).
    Progress messages are printed before each step.
    """
    print("Building model")
    documents = [pv.Document(text) for text in df['abstract']]
    tfidf_model = pv.Model(documents, weight=pv.TFIDF)
    print("Returning reduction")
    return tfidf_model.reduce(2)
def get_lsa(texts):
    """Wrap each item of texts in a pv.Document, build a TF-IDF weighted
    pv.Model from them and return the result of reducing it with
    Model.reduce(2).
    """
    tfidf_model = pv.Model(
        [pv.Document(text) for text in texts],
        weight=pv.TFIDF,
    )
    return tfidf_model.reduce(2)
예제 #12
0
def lsa_apply(df):
    """Build a TF-IDF weighted model from df['abstract'] and reduce it.

    One pv.Document is created per entry of df['abstract']. Returns
    whatever Model.reduce(2) returns (presumably the LSA object - confirm).
    """
    documents = [pv.Document(text) for text in df['abstract']]
    tfidf_model = pv.Model(documents, weight=pv.TFIDF)
    return tfidf_model.reduce(2)
예제 #13
0
def create_models(group):
    """Return a TF-IDF weighted pv.Model with one pv.Document per item in
    group; every document is created with threshold=1.
    """
    return pv.Model(
        [pv.Document(entry, threshold=1) for entry in group],
        weight=pv.TFIDF,
    )