def __call__(self, attr, data, aprioriDist=None, weightID=None):
    """Score ``attr`` as its information gain divided by an entropy term.

    The divisor is the value ``Entropy`` (from ``orngContingency``) returns
    for all cells of the attribute-by-class contingency, flattened into a
    single numpy array.

    Arguments:
        attr: attribute to score.  When ``attr in data.domain`` holds it is
            replaced by ``data.domain[attr]`` (e.g. a name is resolved to
            its variable).
        data: examples used for both the contingency and the gain.
        aprioriDist: not used by this method.
        weightID: not used by this method.

    Returns:
        ``float(infoGain) / entropy`` when the entropy is greater than
        zero, otherwise ``0``.
    """
    import numpy
    from orngContingency import Entropy

    # A name (or any other key the domain knows) becomes the variable itself.
    if attr in data.domain:
        attr = data.domain[attr]

    contingency = orange.ContingencyAttrClass(attr, data)

    # Flatten every per-value class distribution into one list of cells.
    cells = []
    for row in contingency.values():
        cells.extend(row)
    cellEntropy = Entropy(numpy.array(cells))

    gain = orange.MeasureAttribute_info(attr, data)

    # Guard against dividing by a zero (or otherwise non-positive) entropy.
    if not cellEntropy > 0:
        return 0
    return float(gain) / cellEntropy
def __call__(self, gen, weightID=0):
    """Build a lookup-table classifier over one selected attribute.

    Every attribute of ``gen.domain`` is scored with ``self.measure`` and
    the scores are fed to ``orngMisc.BestOnTheFly``; the attribute at the
    winning index becomes the classifier's only input.

    Arguments:
        gen: examples to learn from.
        weightID: passed to ``self.measure`` and to the contingency
            constructor.

    Returns:
        An ``orange.ClassifierByLookupTable`` whose ``lookupTable`` holds the
        modus of each per-value class distribution and whose
        ``distributions`` hold those distributions, normalized.  The last
        slot (index ``-1``) is filled from the contingency's
        ``innerDistribution``.
    """
    attributes = gen.domain.attributes

    chooser = orngMisc.BestOnTheFly()
    for attr in attributes:
        chooser.candidate(self.measure(attr, gen, None, weightID))
    bestAttr = attributes[chooser.winnerIndex()]

    classifier = orange.ClassifierByLookupTable(gen.domain.classVar, bestAttr)
    contingency = orange.ContingencyAttrClass(bestAttr, gen, weightID)

    # One table entry per value of the chosen attribute.
    for index in range(len(contingency)):
        valueDist = contingency[index]
        classifier.lookupTable[index] = valueDist.modus()
        classifier.distributions[index] = valueDist

    # The final slot is filled from the overall (inner) distribution.
    overall = contingency.innerDistribution
    classifier.lookupTable[-1] = overall.modus()
    classifier.distributions[-1] = overall

    for dist in classifier.distributions:
        dist.normalize()
    return classifier
def test_attrClass_disc(self):
    """Check ContingencyAttrClass on the first attribute of "zoo".

    Covers the inner/outer distributions, indexing by position and by
    value, the keys/values/items views, a pickle round trip,
    normalization with ``p_class`` and ``add_var_class``.
    """
    table = orange.ExampleTable("zoo")
    classDist = orange.get_class_distribution(table)
    attrDist = orange.Distribution(0, table)
    cont = orange.ContingencyAttrClass(0, table)
    attrValues = table.domain[0].values

    # Marginal distributions and basic indexing.
    self.assertEqual(cont.inner_distribution, classDist)
    self.assertEqual(cont.outer_distribution, attrDist)
    self.assertNotEqual(cont[0], cont[1])
    self.assertEqual(id(cont[0]), id(cont[attrValues[0]]))
    self.assertEqual(len(cont), len(attrDist))

    # Iteration starts with the distribution stored at index 0.
    for firstDist in cont:
        self.assertEqual(firstDist, cont[0])
        break

    # keys / values / items views.
    self.assertEqual(cont.keys(), attrValues)
    for pos in (0, 1):
        self.assertEqual(cont.values()[pos], cont[pos])
    firstKey, firstItemDist = cont.items()[0]
    self.assertEqual(firstKey, attrValues[0])
    self.assertEqual(firstItemDist, cont[0])

    # Pickle round trip keeps variables and distributions.
    restored = pickle.loads(pickle.dumps(cont))
    for field in ("innerDistribution", "innerVariable",
                  "outerDistribution", "outerVariable"):
        self.assertEqual(getattr(cont, field), getattr(restored, field))
    for pos in (0, 1):
        self.assertEqual(cont[pos], restored[pos])

    # After normalize(), p_class(0) sums to 1; `restored` is not normalized,
    # so its entry is divided by .abs for the comparison.
    cont.normalize()
    self.assertAlmostEqual(sum(cont.p_class(0)), 1.0)
    self.assertEqual(cont.p_class(0)[0], cont.p_class(0, 0))
    self.assertEqual(cont.p_class(0)[0],
                     restored.p_class(0)[0] / restored.p_class(0).abs)

    # add_var_class adds the given weight to a single cell.
    before = cont[0][0]
    cont.add_var_class(0, 0, 0.5)
    self.assertEqual(before + 0.5, cont[0][0])
    self.assertEqual(cont[0][0], cont[0, 0])
    self.assertEqual(
        cont[attrValues[0], table.domain.classVar.values[0]], cont[0, 0])

    with self.assertRaises(IndexError):
        cont["?"]
def setData(self, attr, data):
    """Store a new attribute/data pair and redraw the plot for it.

    Always calls ``clearAll()``, stores ``attr`` and ``data`` and empties
    ``curCutPoints``.  If either argument is false (``None``, empty data)
    the method only resets ``snapDecimals`` to 1 and ``probDist`` to
    ``None`` and returns.

    Otherwise:
      * with a class variable, ``contingency`` is built and ``condProb`` is
        set to a loess conditional-probability estimator (``None`` if its
        construction fails); ``probDist`` is ``None``;
      * without a class variable, ``condProb`` and ``contingency`` are
        ``None`` and ``probDist`` is the attribute's distribution.

    ``minVal``/``maxVal`` come from the observed keys (0 and 1 when there
    are none), ``snapDecimals`` is derived from their range, and the rug,
    probability curve and cut lines are redrawn.

    Arguments:
        attr: attribute to display.
        data: examples to take the values from.
    """
    self.clearAll()
    self.attr, self.data = attr, data
    self.curCutPoints = []

    if not data or not attr:
        self.snapDecimals = 1
        self.probDist = None
        return

    if data.domain.classVar:
        self.contingency = orange.ContingencyAttrClass(attr, data)
        try:
            self.condProb = orange.ConditionalProbabilityEstimatorConstructor_loess(
                self.contingency, nPoints=50)
        except Exception:
            # Best effort: without an estimator there is simply no curve.
            # Only ordinary errors are absorbed; KeyboardInterrupt and
            # SystemExit are no longer swallowed as they were by the
            # previous bare ``except:``.
            self.condProb = None
        self.probDist = None
        attrValues = self.contingency.keys()
    else:
        self.condProb = self.contingency = None
        self.probDist = orange.Distribution(attr, data)
        attrValues = self.probDist.keys()

    if attrValues:
        self.minVal, self.maxVal = min(attrValues), max(attrValues)
    else:
        self.minVal, self.maxVal = 0, 1

    # Derive the snapping precision from the order of magnitude of the
    # value range; a (near-)zero range falls back to one decimal.
    mdist = self.maxVal - self.minVal
    if mdist > 1e-30:
        self.snapDecimals = -int(math.ceil(math.log(mdist, 10)) - 2)
    else:
        self.snapDecimals = 1

    self.baseCurveX = None

    self.plotRug(True)
    self.plotProbCurve(True)
    self.plotCutLines(True)
    self.updateLayout()
    self.replot()
def __call__(self, attr, data, aprioriDist=None, weightID=None):
    """Return the per-example difference between a prior and a posterior term.

    The prior term is computed with ``logMultipleCombs`` from the class
    distribution of ``data``; the posterior terms are computed the same way
    from each per-value class distribution of the attribute's contingency.
    The posterior terms are subtracted from the prior and the result is
    divided by the number of examples (at least 1).

    Arguments:
        attr: attribute to score.
        data: examples.
        aprioriDist: not used by this method.
        weightID: not used by this method.
    """
    contingency = orange.ContingencyAttrClass(attr, data)
    classCounts = orange.Distribution(data.domain.classVar, data).values()
    nCls = len(classCounts)
    nEx = len(data)

    priorMDL = (logMultipleCombs(nEx, classCounts)
                + logMultipleCombs(nEx + nCls - 1, [nEx, nCls - 1]))

    valueDists = [contingency[key] for key in contingency.keys()]

    # First group of posterior terms: one per attribute value, from its
    # class counts.
    countTerms = [logMultipleCombs(sum(dist), dist.values())
                  for dist in valueDists]
    # Second group: one per attribute value, from its total and nCls.
    modelTerms = []
    for dist in valueDists:
        total = sum(dist)
        modelTerms.append(
            logMultipleCombs(total + nCls - 1, [total, nCls - 1]))

    # Subtract sequentially, first group before second, so the
    # floating-point result matches term-by-term accumulation.
    score = priorMDL
    for term in countTerms:
        score -= term
    for term in modelTerms:
        score -= term

    return score / max(1, nEx)
def test_attrClass_cont(self):
    """Check ContingencyAttrClass on the first attribute of "iris".

    Covers the inner/outer distributions, the key set, total counts, a
    pickle round trip, normalization with ``p_class`` and
    ``add_var_class``.
    """
    table = orange.ExampleTable("iris")
    classDist = orange.get_class_distribution(table)
    attrDist = orange.Distribution(0, table)
    cont = orange.ContingencyAttrClass(0, table)
    fv = cont.keys()[0]

    self.assertEqual(cont.inner_distribution, classDist)
    self.assertEqual(cont.outer_distribution, attrDist)
    self.assertEqual(len(cont), len(attrDist))

    # The keys are exactly the distinct values seen in the data ...
    seen = set()
    for example in table:
        seen.add(example[0])
    self.assertEqual(seen, set(cont.keys()))
    # ... and the cells add up to the number of examples.
    self.assertEqual(len(table), sum(sum(dist) for dist in cont.values()))

    # Pickle round trip keeps variables and distributions.
    restored = pickle.loads(pickle.dumps(cont))
    for field in ("innerDistribution", "innerVariable",
                  "outerDistribution", "outerVariable"):
        self.assertEqual(getattr(cont, field), getattr(restored, field))
    self.assertEqual(cont[fv], restored[fv])

    # After normalize(), p_class(fv) sums to 1; `restored` is not
    # normalized, so its entry is divided by .abs for the comparison.
    cont.normalize()
    self.assertAlmostEqual(sum(cont.p_class(fv)), 1.0)
    self.assertEqual(cont.p_class(fv)[0], cont.p_class(fv, 0))
    self.assertEqual(cont.p_class(fv)[0],
                     restored.p_class(fv)[0] / restored.p_class(fv).abs)

    # NOTE(review): this part indexes by 0 rather than by fv, unlike the
    # assertions around it -- confirm that is intended for a continuous
    # outer attribute.
    before = cont[0][0]
    cont.add_var_class(0, 0, 0.5)
    self.assertEqual(before + 0.5, cont[0][0])
    self.assertEqual(cont[fv][0], cont[fv, 0])

    with self.assertRaises(IndexError):
        cont["?"]
# Description: Shows the limitations of contingencies with continuous outer attributes
# Category: statistics
# Classes: Contingency
# Uses: iris
# Referenced: contingency.htm

import orange

data = orange.ExampleTable("iris")
cont = orange.ContingencyAttrClass(0, data)

# Every print below is a single parenthesised argument: it produces the same
# output as the Python 2 print statement it replaces ("%s %s" reproduces the
# space that a comma-separated print inserts between items) and also parses
# under Python 3.
print("Contingency items:")
for val, dist in cont.items()[:5]:
    print("%s %s" % (val, dist))
print("")

print("%s %s" % ("Contingency keys: ", cont.keys()[:3]))
print("%s %s" % ("Contingency values: ", cont.values()[:3]))
print("%s %s" % ("Contingency items: ", cont.items()[:3]))
print("")

# Look up a value halfway between the first two keys.  Per the description
# above this demonstrates the limitation; presumably the lookup raises and
# the handler prints the error.
try:
    midkey = (cont.keys()[0] + cont.keys()[1]) / 2.0
    # The format needs one specifier per tuple element.  The previous
    # "cont[%5.3f] =" had a single specifier for two values, so a successful
    # lookup would have been reported as a formatting error instead of
    # being printed.  The lookup is still evaluated before anything is
    # written, so the output of the failing case is unchanged.
    print("cont[%5.3f] = %s" % (midkey, cont[midkey]))
except Exception as v:
    print("%s %s" % ("Error: ", v))
# Description: Demonstrates the use of ContingencyAttrClass
# Category: statistics
# Classes: Contingency, ContingencyAttrClass
# Uses: monk1
# Referenced: contingency.htm

import orange

data = orange.ExampleTable("monk1")
cont = orange.ContingencyAttrClass("e", data)

# Every print below is a single parenthesised argument: it produces the same
# output as the equivalent Python 2 print statement ("%s %s" reproduces the
# space a comma-separated print inserts between items).

# The variables, under both pairs of names the contingency offers.
print("%s %s" % ("Inner variable: ", cont.innerVariable.name))
print("%s %s" % ("Outer variable: ", cont.outerVariable.name))
print("")
print("%s %s" % ("Class variable: ", cont.classVar.name))
print("%s %s" % ("Attribute: ", cont.variable.name))
print("")

# Class distribution for each value of the attribute.
print("Distributions:")
for val in cont.variable:
    print(" p(.|%s) = %s" % (val.native(), cont.p_class(val)))
print("")

# Probability of one particular class (the value with index 1) for each
# value of the attribute.
firstclass = orange.Value(cont.classVar, 1)
firstnative = firstclass.native()
print("Probabilities of class '%s'" % firstnative)
for val in cont.variable:
    print(" p(%s|%s) = %5.3f" % (firstnative, val.native(), cont.p_class(val, firstclass)))
print("")

# Construct a contingency from the two variables instead of from data.
cont = orange.ContingencyAttrClass(data.domain["e"], data.domain.classVar)
# Category: feature induction
# Classes: FeatureByMinComplexity, FeatureByIM, FeatureByKramer, FeatureByCartesianProduct
# Uses: monks-2.tab

import orange
import orngCI

data = orange.ExampleTable("../datasets/monks-2")

# Every print below is a single parenthesised argument: it produces the same
# output as the equivalent Python 2 print statement ("%s %s" reproduces the
# space a comma-separated print inserts between items).

# Feature constructed from attributes "a" and "b" by minimal complexity,
# followed by its contingency with the class.
ab, quality = orngCI.FeatureByMinComplexity(data, ["a", "b"])
print("Quality: %.3f" % quality)
print("%s %s" % ("Values", ab.values))
data2 = orngCI.addAnAttribute(ab, data)
c = orange.ContingencyAttrClass(ab, data)
for i in c:
    print(i)

# The same report for the feature constructed by FeatureByIM.
ab, quality = orngCI.FeatureByIM(data, ["a", "b"])
print("Quality: %.3f" % quality)
print("%s %s" % ("Values", ab.values))
data2 = orngCI.addAnAttribute(ab, data)
c = orange.ContingencyAttrClass(ab, data)
for i in c:
    print(i)

# FeatureByKramer: only the quality is reported here.
ab, quality = orngCI.FeatureByKramer(data, ["a", "b"])
print("Quality: %.3f" % quality)
# Description: Shows how to assess the quality of attributes
# Category: attribute quality
# Classes: MeasureAttribute, MeasureAttribute_info,
# Uses: lenses
# Referenced: MeasureAttribute.htm

import orange
import random

data = orange.ExampleTable("lenses")
meas = orange.MeasureAttribute_info()

# Every print below is a single parenthesised argument: it produces the same
# output as the equivalent Python 2 print statement.

# The measure called with an attribute and the data ...
astigm = data.domain["astigmatic"]
print("Information gain of 'astigmatic': %6.4f" % meas(astigm, data))

# ... with a contingency and the class distribution ...
classdistr = orange.Distribution(data.domain.classVar, data)
cont = orange.ContingencyAttrClass("tear_rate", data)
print("Information gain of 'tear_rate': %6.4f" % meas(cont, classdistr))

# ... and with an attribute index and a domain contingency.
dcont = orange.DomainContingency(data)
print("Information gain of the first attribute: %6.4f" % meas(0, dcont))
print("")

print("*** A set of more exhaustive tests for different way of passing arguments to MeasureAttribute ***")

names = [a.name for a in data.domain.attributes]
attrs = len(names)

# Header row: an empty 30-character cell, then one 15-character column per
# attribute name.  The whole formatted string is the single print argument.
print(("%30s" + "%15s" * attrs) % (("", ) + tuple(names)))

# Row format: a 30-character label followed by one %15.4f column per attribute.
fstr = "%30s" + "%15.4f" * attrs