def test_information_gain(self):
    # Assert information gain weights.
    # Example from http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
    m = vector.Model([
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 1}, type=True),
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 1}, type=False)], weight=None)
    self.assertAlmostEqual(m.information_gain("wind"), 0.52, places=2)
    # Example from http://rutcor.rutgers.edu/~amai/aimath02/PAPERS/14.pdf
    m = vector.Model([
        vector.Document({"3": 1}, type=True),
        vector.Document({"3": 5}, type=True),
        vector.Document({"3": 1}, type=False),
        vector.Document({"3": 7}, type=True),
        vector.Document({"3": 2}, type=False),
        vector.Document({"3": 2}, type=True),
        vector.Document({"3": 6}, type=False),
        vector.Document({"3": 4}, type=True),
        vector.Document({"3": 0}, type=False),
        vector.Document({"3": 9}, type=True)], weight=None)
    self.assertAlmostEqual(m.ig("3"), 0.571, places=3)
    self.assertAlmostEqual(m.gr("3"), 0.195, places=3)
    print("pattern.vector.Model.information_gain()")
    print("pattern.vector.Model.gain_ratio()")
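# A standalone check of the 0.52 value asserted above, using the standard
# entropy-based definition of information gain. This is a hypothetical sketch,
# not part of pattern.vector; the entropy() helper below is our own.
from math import log

def entropy(p):
    # Shannon entropy (base 2) of a list of probabilities.
    return -sum(x * log(x, 2) for x in p if x > 0)

# Class distribution over all 7 documents: 4 x True, 3 x False.
H = entropy([4 / 7.0, 3 / 7.0])            # ~0.985
# wind=1 occurs in 4 documents: 1 x True, 3 x False.
H1 = entropy([1 / 4.0, 3 / 4.0])           # ~0.811
# wind=0 occurs in 3 documents: 3 x True, 0 x False.
H0 = entropy([3 / 3.0])                    # 0.0
IG = H - (4 / 7.0 * H1 + 3 / 7.0 * H0)     # ~0.52
print(round(IG, 2))                        # 0.52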
def setUp(self):
    # Test model.
    self.model = vector.Model(documents=(
        vector.Document("cats purr", name="cat1", type=u"cåt"),
        vector.Document("cats meow", name="cat2", type=u"cåt"),
        vector.Document("dogs howl", name="dog1", type=u"døg"),
        vector.Document("dogs bark", name="dog2", type=u"døg")))
def test_condensed_nearest_neighbor(self):
    # Assert CNN for data reduction.
    v = vector.Model((
        vector.Document("woof", type="dog"),
        vector.Document("meow", type="cat"),  # redundant
        vector.Document("meow meow", type="cat")))
    self.assertTrue(len(v.cnn()) < len(v))
    print("pattern.vector.Model.condensed_nearest_neighbor()")
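# The test above relies on pattern's Model.cnn(); the sketch below is a generic,
# hypothetical implementation of Hart's condensed nearest neighbor algorithm
# (the same idea, but not pattern's code), shown on plain (value, label) pairs.
def condensed_nearest_neighbor(points, distance):
    # Keep only the points needed to classify the rest correctly with 1-NN:
    # start with one stored point, add every point the current store
    # misclassifies, and repeat until the store stops changing.
    store = [points[0]]
    changed = True
    while changed:
        changed = False
        for x, label in points:
            nearest = min(store, key=lambda p: distance(x, p[0]))
            if nearest[1] != label:
                store.append((x, label))
                changed = True
    return store

# For example, with 1-D points the redundant 9.5 is never added to the store:
# condensed_nearest_neighbor([(1.0, "dog"), (9.0, "cat"), (9.5, "cat")],
#                            distance=lambda a, b: abs(a - b))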
def test_tfidf(self):
    # Assert tf-idf for documents not in a model.
    v = [[0.0, 0.4, 0.6],
         [0.6, 0.4, 0.0]]
    v = [dict(enumerate(v)) for v in v]
    m = vector.Model([vector.Document(x) for x in v], weight=vector.TFIDF)
    v = [vector.sparse(v) for v in vector.tf_idf(v)]
    self.assertEqual(sorted(m[0].vector.items()), sorted(v[0].items()))
    self.assertAlmostEqual(v[0][2], 0.42, places=2)
    self.assertAlmostEqual(v[1][0], 0.42, places=2)
    print("pattern.vector.tf_idf()")
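# The 0.42 values asserted above can be reproduced by hand with the textbook
# tf * idf weighting, assuming a natural-log idf = ln(N / df). This is a
# standalone sketch; doc_freq() and tf_idf_by_hand() below are our own helpers.
from math import log

docs = [{0: 0.0, 1: 0.4, 2: 0.6},
        {0: 0.6, 1: 0.4, 2: 0.0}]
N = len(docs)

def doc_freq(term):
    # Number of documents in which the term has a nonzero weight.
    return sum(1 for d in docs if d.get(term, 0) > 0)

def tf_idf_by_hand(d, term):
    return d[term] * log(float(N) / doc_freq(term))

print(round(tf_idf_by_hand(docs[0], 2), 2))  # 0.42 = 0.6 * ln(2 / 1)
print(round(tf_idf_by_hand(docs[1], 0), 2))  # 0.42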
def model(top=None):
    """ Returns a Model of e-mail messages.
        Document type=True => HAM, False => SPAM.
        Documents are mostly of a technical nature (developer forum posts).
    """
    documents = []
    for score, message in Datasheet.load(os.path.join(PATH, "corpora", "spam-apache.csv")):
        document = vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        documents.append(document)
    return vector.Model(documents)
def test_classifier(self):
    # Assert that the model classifier is correctly saved and loaded.
    p = "test.model.tmp"
    v = vector.Model([vector.Document("chirp", type="bird")])
    v.train(vector.SVM)
    v.save(p)
    v = vector.Model.load(p)
    self.assertTrue(isinstance(v.classifier, vector.SVM))
    os.unlink(p)
    print("pattern.vector.Model.classifier")
    print("pattern.vector.Model.train()")
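# A hypothetical follow-up to the test above: a trained (or restored) classifier
# can label new documents with pattern's Classifier.classify(). The expected
# output is an assumption based on the single "bird" training example.
m = vector.Model([vector.Document("chirp", type="bird")])
m.train(vector.SVM)
print(m.classifier.classify(vector.Document("chirp")))  # expected: "bird"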
def test_information_gain(self):
    # Assert information gain weights.
    # Example from http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
    v = vector.Model([
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 1}, type=True),
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 1}, type=False)], weight=vector.TF)
    self.assertAlmostEqual(v.information_gain("wind"), 0.52, places=2)
    print("pattern.vector.Model.information_gain()")
def test_lsa_concepts(self):
    try:
        import numpy
    except ImportError:
        return
    # Assert LSA concept space.
    model = vector.Model((
        vector.Document("cats purr"),
        vector.Document("cats meow"),
        vector.Document("dogs howl"),
        vector.Document("dogs bark")))
    model.reduce(2)
    # Intuitively, we'd expect two concepts:
    # 1) with cats + purr + meow grouped together,
    # 2) with dogs + howl + bark grouped together.
    i1, i2 = 0, 0
    for i, concept in enumerate(model.lsa.concepts):
        self.assertTrue(isinstance(concept, dict))
        if concept["cats"] > 0.5:
            self.assertTrue(concept["purr"] > 0.5)
            self.assertTrue(concept["meow"] > 0.5)
            self.assertTrue(concept["howl"] == 0.0)
            self.assertTrue(concept["bark"] == 0.0)
            i1 = i
        if concept["dogs"] > 0.5:
            self.assertTrue(concept["howl"] > 0.5)
            self.assertTrue(concept["bark"] > 0.5)
            self.assertTrue(concept["purr"] == 0.0)
            self.assertTrue(concept["meow"] == 0.0)
            i2 = i
    # We'd expect the "cat" documents to score high on the "cat" concept vector.
    # We'd expect the "dog" documents to score high on the "dog" concept vector.
    v1 = model.lsa[model.documents[0].id]
    v2 = model.lsa[model.documents[2].id]
    self.assertTrue(v1.get(i1, 0) > 0.7)
    self.assertTrue(v1.get(i2, 0) == 0.0)
    self.assertTrue(v2.get(i1, 0) == 0.0)
    self.assertTrue(v2.get(i2, 0) > 0.7)
    # Assert LSA.transform() for unknown documents.
    v = model.lsa.transform(vector.Document("cats dogs"))
    self.assertAlmostEqual(v[0], 0.34, places=2)
    self.assertAlmostEqual(v[1], 0.34, places=2)
    print("pattern.vector.LSA.concepts")
    print("pattern.vector.LSA.transform()")
def test_feature_selection(self):
    # Assert information gain feature selection.
    m = vector.Model((
        vector.Document("the cat sat on the mat", type="cat", stopwords=True),
        vector.Document("the dog howled at the moon", type="dog", stopwords=True)))
    v = m.feature_selection(top=3, method=vector.IG, threshold=0.0)
    self.assertEqual(v, ["at", "cat", "dog"])
    # Assert Model.filter().
    v = m.filter(v)
    self.assertTrue("at" in v.terms)
    self.assertTrue("cat" in v.terms)
    self.assertTrue("dog" in v.terms)
    self.assertTrue("the" not in v.terms)
    self.assertTrue("mat" not in v.terms)
    print("pattern.vector.Model.feature_selection()")
    print("pattern.vector.Model.filter()")
def lsa_apply(df):
    print("Building model")
    m = pv.Model([pv.Document(a) for a in df['abstract']], weight=pv.TFIDF)
    print("Returning reduction")
    return m.reduce(2)
def get_lsa(texts):
    docs = [pv.Document(a) for a in texts]
    model = pv.Model(docs, weight=pv.TFIDF)
    lsa = model.reduce(2)
    return lsa
def lsa_apply(df):
    m = pv.Model([pv.Document(a) for a in df['abstract']], weight=pv.TFIDF)
    return m.reduce(2)
def create_models(group):
    docs = [pv.Document(item, threshold=1) for item in group]
    return pv.Model(docs, weight=pv.TFIDF)
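# A hypothetical usage sketch for the helpers above. The pandas import and the
# sample abstracts are assumptions, not part of the original code; pv is assumed
# to be pattern.vector, as in the helpers themselves.
import pandas as pd

df = pd.DataFrame({"abstract": [
    "neural networks for image classification",
    "convolutional networks and image recognition",
    "bayesian inference for time series models",
]})
lsa = lsa_apply(df)                    # TF-IDF model of the abstracts, reduced with LSA
model = create_models(df["abstract"])  # plain TF-IDF model, no reduction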