def getPvalue(lim, table):
    """Turn the quantity ``lim`` into a chi-square based probability.

    The test statistic is ``2.0 * lim * table.total * _log2e`` (``_log2e``
    is a module-level constant defined elsewhere in this file) and it is
    evaluated with ``table.dof`` degrees of freedom.

    Arguments:
        lim   -- numeric quantity to be tested.
        table -- object exposing ``total`` and ``dof`` attributes.

    Returns ``1.0 - statc.chisqprob(statistic, table.dof)``.
    NOTE(review): presumably ``statc.chisqprob`` is the upper-tail
    probability of the chi-square distribution -- confirm against statc.
    """
    import statc

    statistic = 2.0 * lim * table.total * _log2e
    if statistic <= 0.0:
        # A non-positive statistic is never handed to chisqprob; a tiny
        # positive stand-in is used in its place.
        statistic = 1.e-10
    return 1.0 - statc.chisqprob(statistic, table.dof)
def __init__(self, t, save_data=1, interactions_too=1, dependencies_too=0,
             prepare=1, pvalues=0, simple_too=0, iterative_scaling=0,
             weighting=None):
    """Compute attribute/class contingency statistics for the data ``t``.

    Arguments:
        t                 -- data table exposing ``domain.attributes`` and
                             ``domain.classVar``.
        save_data         -- when true, keep the (possibly prepared) table
                             as ``self.discData``.
        interactions_too  -- when true, build a 3-way table (attribute j,
                             attribute i, class) for every pair j < i.
        dependencies_too  -- when true, build a 2-way table for every
                             attribute pair j < i.
        prepare           -- when true, ``t`` is first passed through
                             ``self._prepare``.
        pvalues           -- when true, also store
                             ``orngContingency.getPvalue`` results in
                             ``self.plist`` / ``self.plut``.
        simple_too        -- when true, store the sum of the diagonal of
                             each 2-way table's ``pm`` in ``self.simple``.
        iterative_scaling -- selects ``c.IPF()`` instead of ``c.KSA()[0]``
                             as the quantity tested for 3-way p-values.
        weighting         -- when not None, forwarded as ``wid=`` to the
                             ``orngContingency`` table constructors.

    Dictionary keys are index tuples; the index ``-1`` denotes the class
    variable.  ``self.list``, ``self.abslist``, ``self.plist`` and
    ``self.attlist`` are sorted in ascending order before returning.
    """
    if prepare:
        t = self._prepare(t)
    if save_data:
        self.discData = t  # save the discretized data

    ### PREPARE INDIVIDUAL ATTRIBUTES ###

    attributes = t.domain.attributes
    class_var = t.domain.classVar
    NA = len(attributes)

    # Pass ``wid`` only when a weight was requested, so that the
    # unweighted calls stay exactly as they would be without the keyword.
    if weighting is not None:
        wid_kw = {'wid': weighting}
    else:
        wid_kw = {}

    self.names = []
    self.labelname = ""
    if class_var:
        self.labelname = class_var.name
    self.gains = []
    self.freqs = []
    self.way2 = {}
    self.way3 = {}
    self.ig = []
    self.list = []
    self.abslist = []
    self.plist = []
    self.plut = {}
    self.ents = {}
    self.corr = {}
    self.chi2 = {}
    self.simple = {}

    for i in range(NA):
        attr_i = attributes[i]

        # 2-way table: attribute i against the class.
        atc = orngContingency.get2Int(t, attr_i, class_var, **wid_kw)
        gai = atc.InteractionInformation()
        self.gains.append(gai)
        self.corr[(i, -1)] = gai
        self.ents[(i,)] = orngContingency.Entropy(atc.a)
        self.way2[(i, -1)] = atc
        self.ents[(i, -1)] = orngContingency.Entropy(atc.m)
        N = sum(atc.a)
        # NOTE(review): looks like the chi-square probability of the
        # attribute paired with itself -- confirm the intended formula.
        self.chi2[(i, i)] = statc.chisqprob(
            N * (numpy.sum(numpy.outer(atc.pa, atc.pa)) - 2 + len(atc.pa)),
            (len(atc.pa) - 1) ** 2)
        if simple_too:
            simp = 0.0
            for k in range(min(len(atc.a), len(atc.b))):
                try:
                    simp += atc.pm[k, k]
                except Exception:
                    # Best effort: a diagonal cell that cannot be read or
                    # added is skipped.  Interpreter-level exits such as
                    # KeyboardInterrupt are no longer swallowed.
                    pass
            self.simple[(i, -1)] = simp

        # Store a formatted copy of the attribute name.
        self.names.append('%s' % attr_i.name)

        if pvalues:
            pv = orngContingency.getPvalue(gai, atc)
            self.plist.append((pv, (gai, i, -1)))
            self.plut[(i, -1)] = pv

        line = []
        for j in range(i):
            attr_j = attributes[j]

            if dependencies_too:
                # 2-way table: attribute j against attribute i.
                c = orngContingency.get2Int(t, attr_j, attr_i, **wid_kw)
                self.way2[(j, i)] = c
                gai = c.InteractionInformation()
                self.ents[(j, i)] = orngContingency.Entropy(c.m)
                self.corr[(j, i)] = gai
                self.chi2[(j, i)] = c.ChiSquareP()
                if simple_too:
                    simp = 0.0
                    for k in range(min(len(c.a), len(c.b))):
                        try:
                            qq = c.pm[k, k]
                        except Exception:
                            # Unreadable diagonal cell counts as zero.
                            qq = 0
                        simp += qq
                    self.simple[(j, i)] = simp
                if pvalues:
                    pv = orngContingency.getPvalue(gai, c)
                    self.plist.append((pv, (gai, j, i)))
                    self.plut[(j, i)] = pv

            if interactions_too:
                # 3-way table: attribute j, attribute i and the class.
                c = orngContingency.get3Int(t, attr_j, attr_i, class_var,
                                            **wid_kw)
                self.way3[(j, i, -1)] = c
                igv = c.InteractionInformation()
                line.append(igv)
                self.list.append((igv, (igv, j, i)))
                self.abslist.append((abs(igv), (igv, j, i)))
                if pvalues:
                    if iterative_scaling:
                        div = c.IPF()
                    else:
                        div = c.KSA()[0]
                    pv = orngContingency.getPvalue(div, c)
                    self.plist.append((pv, (igv, j, i, -1)))
                    self.plut[(j, i, -1)] = pv
        self.ig.append(line)

    # NOTE: ``atc`` is the table left over from the last loop iteration, so
    # this line raises NameError when the domain has no attributes.
    # Presumably ``atc.b`` is the class marginal -- verify in orngContingency.
    self.entropy = orngContingency.Entropy(atc.b)
    self.ents[(-1,)] = self.entropy

    self.list.sort()
    self.abslist.sort()
    self.plist.sort()

    # (gain, attribute index) pairs, ascending by gain.
    self.attlist = sorted((self.gains[i], i) for i in range(NA))
    self.NA = NA
def __init__(self, t, save_data=1, interactions_too=1, dependencies_too=0,
             prepare=1, pvalues=0, simple_too=0, iterative_scaling=0,
             weighting=None):
    """Compute attribute/class contingency statistics for the data ``t``.

    Arguments:
        t                 -- data table exposing ``domain.attributes`` and
                             ``domain.classVar``.
        save_data         -- when true, keep the (possibly prepared) table
                             as ``self.discData``.
        interactions_too  -- when true, build a 3-way table (attribute j,
                             attribute i, class) for every pair j < i.
        dependencies_too  -- when true, build a 2-way table for every
                             attribute pair j < i.
        prepare           -- when true, ``t`` is first passed through
                             ``self._prepare``.
        pvalues           -- when true, also store
                             ``orngContingency.getPvalue`` results in
                             ``self.plist`` / ``self.plut``.
        simple_too        -- when true, store the sum of the diagonal of
                             each 2-way table's ``pm`` in ``self.simple``.
        iterative_scaling -- selects ``c.IPF()`` instead of ``c.KSA()[0]``
                             as the quantity tested for 3-way p-values.
        weighting         -- when not None, forwarded as ``wid=`` to the
                             ``orngContingency`` table constructors.

    Dictionary keys are index tuples; the index ``-1`` denotes the class
    variable.  ``self.list``, ``self.abslist``, ``self.plist`` and
    ``self.attlist`` are sorted in ascending order before returning.
    """
    if prepare:
        t = self._prepare(t)
    if save_data:
        self.discData = t  # save the discretized data

    ### PREPARE INDIVIDUAL ATTRIBUTES ###

    attributes = t.domain.attributes
    class_var = t.domain.classVar
    NA = len(attributes)

    # Pass ``wid`` only when a weight was requested, so that the
    # unweighted calls stay exactly as they would be without the keyword.
    if weighting is not None:
        wid_kw = {'wid': weighting}
    else:
        wid_kw = {}

    self.names = []
    self.labelname = ""
    if class_var:
        self.labelname = class_var.name
    self.gains = []
    self.freqs = []
    self.way2 = {}
    self.way3 = {}
    self.ig = []
    self.list = []
    self.abslist = []
    self.plist = []
    self.plut = {}
    self.ents = {}
    self.corr = {}
    self.chi2 = {}
    self.simple = {}

    for i in range(NA):
        attr_i = attributes[i]

        # 2-way table: attribute i against the class.
        atc = orngContingency.get2Int(t, attr_i, class_var, **wid_kw)
        gai = atc.InteractionInformation()
        self.gains.append(gai)
        self.corr[(i, -1)] = gai
        self.ents[(i,)] = orngContingency.Entropy(atc.a)
        self.way2[(i, -1)] = atc
        self.ents[(i, -1)] = orngContingency.Entropy(atc.m)
        N = sum(atc.a)
        # NOTE(review): looks like the chi-square probability of the
        # attribute paired with itself -- confirm the intended formula.
        self.chi2[(i, i)] = statc.chisqprob(
            N * (numpy.sum(numpy.outer(atc.pa, atc.pa)) - 2 + len(atc.pa)),
            (len(atc.pa) - 1) ** 2)
        if simple_too:
            simp = 0.0
            for k in range(min(len(atc.a), len(atc.b))):
                try:
                    simp += atc.pm[k, k]
                except Exception:
                    # Best effort: a diagonal cell that cannot be read or
                    # added is skipped.  Interpreter-level exits such as
                    # KeyboardInterrupt are no longer swallowed.
                    pass
            self.simple[(i, -1)] = simp

        # Store a formatted copy of the attribute name.
        self.names.append('%s' % attr_i.name)

        if pvalues:
            pv = orngContingency.getPvalue(gai, atc)
            self.plist.append((pv, (gai, i, -1)))
            self.plut[(i, -1)] = pv

        line = []
        for j in range(i):
            attr_j = attributes[j]

            if dependencies_too:
                # 2-way table: attribute j against attribute i.
                c = orngContingency.get2Int(t, attr_j, attr_i, **wid_kw)
                self.way2[(j, i)] = c
                gai = c.InteractionInformation()
                self.ents[(j, i)] = orngContingency.Entropy(c.m)
                self.corr[(j, i)] = gai
                self.chi2[(j, i)] = c.ChiSquareP()
                if simple_too:
                    simp = 0.0
                    for k in range(min(len(c.a), len(c.b))):
                        try:
                            qq = c.pm[k, k]
                        except Exception:
                            # Unreadable diagonal cell counts as zero.
                            qq = 0
                        simp += qq
                    self.simple[(j, i)] = simp
                if pvalues:
                    pv = orngContingency.getPvalue(gai, c)
                    self.plist.append((pv, (gai, j, i)))
                    self.plut[(j, i)] = pv

            if interactions_too:
                # 3-way table: attribute j, attribute i and the class.
                c = orngContingency.get3Int(t, attr_j, attr_i, class_var,
                                            **wid_kw)
                self.way3[(j, i, -1)] = c
                igv = c.InteractionInformation()
                line.append(igv)
                self.list.append((igv, (igv, j, i)))
                self.abslist.append((abs(igv), (igv, j, i)))
                if pvalues:
                    if iterative_scaling:
                        div = c.IPF()
                    else:
                        div = c.KSA()[0]
                    pv = orngContingency.getPvalue(div, c)
                    self.plist.append((pv, (igv, j, i, -1)))
                    self.plut[(j, i, -1)] = pv
        self.ig.append(line)

    # NOTE: ``atc`` is the table left over from the last loop iteration, so
    # this line raises NameError when the domain has no attributes.
    # Presumably ``atc.b`` is the class marginal -- verify in orngContingency.
    self.entropy = orngContingency.Entropy(atc.b)
    self.ents[(-1,)] = self.entropy

    self.list.sort()
    self.abslist.sort()
    self.plist.sort()

    # (gain, attribute index) pairs, ascending by gain.
    self.attlist = sorted((self.gains[i], i) for i in range(NA))
    self.NA = NA
def getPvalueDOF(lim, table, dof):
    """Turn ``lim`` into a chi-square based probability for a given ``dof``.

    The test statistic is ``2.0 * lim * table.total * _log2e`` (``_log2e``
    is a module-level constant defined elsewhere in this file); the degrees
    of freedom come from the ``dof`` argument, not from ``table``.

    Arguments:
        lim   -- numeric quantity to be tested.
        table -- object exposing a ``total`` attribute.
        dof   -- degrees of freedom handed to ``statc.chisqprob``.

    Returns ``1.0 - statc.chisqprob(statistic, dof)``.
    NOTE(review): a non-positive statistic is passed to ``chisqprob``
    unchanged here -- confirm that statc accepts such input.
    """
    import statc

    statistic = 2.0 * lim * table.total * _log2e
    tail = statc.chisqprob(statistic, dof)
    return 1.0 - tail
def ChiSquareP(self):
    """Return ``statc.chisqprob`` of this table's chi-square statistic.

    Expected cell values are the outer product of ``self.pa`` and
    ``self.pb`` scaled by ``self.total``; they are compared with the cells
    of ``self.m``.  The degrees of freedom are
    ``(len(self.pa) - 1) * (len(self.pb) - 1)``.
    """
    expected = numpy.outer(self.pa, self.pb) * self.total
    squared_dev = (expected - self.m) ** 2
    # Clip the denominator from below so that cells with a zero expected
    # value do not cause a division by zero.
    safe_expected = expected.clip(min=0.000001)
    statistic = numpy.sum(squared_dev / safe_expected)
    dof = (len(self.pa) - 1) * (len(self.pb) - 1)
    return statc.chisqprob(statistic, dof)