def weighted_entropies(buckets, splitter=None):
    """Segment every bucket's pairs and store a weighted entropy on it.

    For each bucket the pairs are sorted and passed to ``splitter``.  Every
    returned segment ``d`` is expected to carry its member pairs in ``d[1]``.
    Two attributes are then written on the bucket:

    * ``dinds`` -- dict mapping segment index to the list of positions (in
      the *unsorted* ``bucket.pairs``) of the pairs that fell in that segment.
    * ``wsum`` -- with ``n_i`` the size of segment ``i`` and ``N`` the number
      of pairs, ``e_i = -(n_i / N) * log2(n_i / N)`` and
      ``wsum = sum(e_i / n_i) / sum(1 / n_i)``.

    Args:
        buckets: dict of column name -> bucket object exposing ``pairs``.
            Entries whose value is ``None`` are skipped.
        splitter: optional callable taking the sorted pairs and returning the
            segments.  Defaults to ``ediv.ediv``.

    Returns:
        The same ``buckets`` dict (buckets are updated in place).
    """
    import math

    if splitter is None:
        # Project-local module; only needed when no splitter is supplied.
        import ediv
        splitter = ediv.ediv

    for bucket in buckets.values():
        if bucket is None:
            # Placeholder for a column that currently has no data.
            continue

        ordered = sorted(bucket.pairs)

        sizes = []
        positions = []
        unclaimed = []
        for segment in splitter(ordered):
            # Work on a copy so the splitter's own lists are not consumed.
            items = list(segment[1])
            sizes.append(float(len(items)))
            positions.append([])
            unclaimed.append(items)

        # Map each original row position to the segment holding its pair.
        for row, pair in enumerate(bucket.pairs):
            for seg, items in enumerate(unclaimed):
                if pair in items:
                    positions[seg].append(row)
                    items.remove(pair)
                    # One occurrence per row: without stopping here a
                    # duplicated pair would be claimed in several segments
                    # and its twin row would be left unassigned.
                    break
        bucket.dinds = dict(enumerate(positions))

        # Weighted sum of entropies; every segment is weighted by 1 / size.
        count = float(len(ordered))
        weighted = 0.0
        weights = 0.0
        for size in sizes:
            if size == 0:
                # An empty segment has no entropy and an undefined weight.
                continue
            share = size / count
            entropy = -share * math.log(share, 2)
            weighted += entropy / size
            weights += 1 / size
        # No usable segment at all: fall back to 0.0 instead of dividing by 0.
        bucket.wsum = weighted / weights if weights else 0.0
    return buckets
def weighted_entropies(buckets, splitter=None):
    """Segment every bucket's pairs and store a weighted entropy on it.

    For each bucket the pairs are sorted and passed to ``splitter``.  Every
    returned segment ``d`` is expected to carry its member pairs in ``d[1]``.
    Two attributes are then written on the bucket:

    * ``dinds`` -- dict mapping segment index to the list of positions (in
      the *unsorted* ``bucket.pairs``) of the pairs that fell in that segment.
    * ``wsum`` -- with ``n_i`` the size of segment ``i`` and ``N`` the number
      of pairs, ``e_i = -(n_i / N) * log2(n_i / N)`` and
      ``wsum = sum(e_i / n_i) / sum(1 / n_i)``.

    Args:
        buckets: dict of column name -> bucket object exposing ``pairs``.
            Entries whose value is ``None`` are skipped.
        splitter: optional callable taking the sorted pairs and returning the
            segments.  Defaults to ``ediv.ediv``.

    Returns:
        The same ``buckets`` dict (buckets are updated in place).
    """
    import math

    if splitter is None:
        # Project-local module; only needed when no splitter is supplied.
        import ediv
        splitter = ediv.ediv

    for bucket in buckets.values():
        if bucket is None:
            # Placeholder for a column that currently has no data.
            continue

        ordered = sorted(bucket.pairs)

        sizes = []
        positions = []
        unclaimed = []
        for segment in splitter(ordered):
            # Work on a copy so the splitter's own lists are not consumed.
            items = list(segment[1])
            sizes.append(float(len(items)))
            positions.append([])
            unclaimed.append(items)

        # Map each original row position to the segment holding its pair.
        for row, pair in enumerate(bucket.pairs):
            for seg, items in enumerate(unclaimed):
                if pair in items:
                    positions[seg].append(row)
                    items.remove(pair)
                    # One occurrence per row: without stopping here a
                    # duplicated pair would be claimed in several segments
                    # and its twin row would be left unassigned.
                    break
        bucket.dinds = dict(enumerate(positions))

        # Weighted sum of entropies; every segment is weighted by 1 / size.
        count = float(len(ordered))
        weighted = 0.0
        weights = 0.0
        for size in sizes:
            if size == 0:
                # An empty segment has no entropy and an undefined weight.
                continue
            share = size / count
            entropy = -share * math.log(share, 2)
            weighted += entropy / size
            weights += 1 / size
        # No usable segment at all: fall back to 0.0 instead of dividing by 0.
        bucket.wsum = weighted / weights if weights else 0.0
    return buckets
def discretizer(z, buckets):
    """Replace bucketed column values of table ``z`` by their bin index.

    For every bucket whose name, minus its first character, is a column of
    ``z``, each row index listed in ``bucket.dinds[bin]`` has its cell in that
    column overwritten with ``bin``.  Before the overwrite the original cell
    value widens ``bucket.lo[bin]`` / ``bucket.hi[bin]``, which track the
    smallest and largest raw value seen for the bin.

    Relies on the module-level names ``args``, ``sys``, ``colname``, ``data``
    and ``indexOf``.  NOTE(review): ``indexOf(z, name)`` presumably returns
    the position of column ``name`` in table ``z`` -- confirm against its
    definition.

    Args:
        z: name of the table to discretize (key into ``data``/``colname``).
        buckets: dict of bucket objects exposing ``name``, ``dinds``, ``lo``
            and ``hi``.  ``None`` entries are skipped.
    """
    if args['v'] > -1:
        sys.stderr.write("\n#Discretiziing data.\n")

    for bucket in buckets.values():
        if bucket is None:
            # Placeholder left for a column that currently has no data.
            continue
        column = bucket.name[1:]
        if column not in colname[z]:
            continue

        cind = indexOf(z, column)
        rows = data[z]
        lows, highs = bucket.lo, bucket.hi
        for bin_id, members in bucket.dinds.items():
            for row in members:
                # Read the raw value once, before it is replaced below.
                cell = rows[row][cind]
                if bin_id not in lows or cell < lows[bin_id]:
                    lows[bin_id] = cell
                if bin_id not in highs or cell > highs[bin_id]:
                    highs[bin_id] = cell
                rows[row][cind] = bin_id
def discretizer(z, buckets):
    """Replace bucketed column values of table ``z`` by their bin index.

    For every bucket whose name, minus its first character, is a column of
    ``z``, each row index listed in ``bucket.dinds[bin]`` has its cell in that
    column overwritten with ``bin``.  Before the overwrite the original cell
    value widens ``bucket.lo[bin]`` / ``bucket.hi[bin]``, which track the
    smallest and largest raw value seen for the bin.

    Relies on the module-level names ``args``, ``sys``, ``colname``, ``data``
    and ``indexOf``.  NOTE(review): ``indexOf(z, name)`` presumably returns
    the position of column ``name`` in table ``z`` -- confirm against its
    definition.

    Args:
        z: name of the table to discretize (key into ``data``/``colname``).
        buckets: dict of bucket objects exposing ``name``, ``dinds``, ``lo``
            and ``hi``.  ``None`` entries are skipped.
    """
    if args['v'] > -1:
        sys.stderr.write("\n#Discretiziing data.\n")

    for bucket in buckets.values():
        if bucket is None:
            # Placeholder left for a column that currently has no data.
            continue
        column = bucket.name[1:]
        if column not in colname[z]:
            continue

        cind = indexOf(z, column)
        rows = data[z]
        lows, highs = bucket.lo, bucket.hi
        for bin_id, members in bucket.dinds.items():
            for row in members:
                # Read the raw value once, before it is replaced below.
                cell = rows[row][cind]
                if bin_id not in lows or cell < lows[bin_id]:
                    lows[bin_id] = cell
                if bin_id not in highs or cell > highs[bin_id]:
                    highs[bin_id] = cell
                rows[row][cind] = bin_id
def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    """Build a column-pruned (and optionally discretized) copy of table ``z``.

    Steps, in order:

    1. Every entry of ``globfile.buckets`` is reset to ``None``; a fresh
       bucket is then created for each independent column (other than
       ``'C_id'``) of the tables ``zlst[1:]`` and filled with
       ``(value, str(C_id))`` pairs from those tables.  Each of those tables
       is removed through ``reader.removeTable`` once it has been read.
    2. ``weighted_entropies`` scores the buckets; they are ranked by ascending
       ``wsum`` and the first ``int(count * patt)`` column names are kept.
    3. A table named ``'shortenedz'`` is created through ``reader`` holding
       the kept columns plus ``dep[z]``, and the rows of ``z`` are copied in.
    4. When ``discretize`` is true the kept column names lose their first
       character and ``discretizer`` is run on the new table.
    5. Every table in ``zlst`` is removed through ``reader.removeTable``.

    Relies on the module-level names ``reader``, ``weighted_entropies`` and
    ``discretizer`` and on the project module ``globfile``.

    Args:
        z: name of the table to shorten.
        zlst: list of table names; all but the first feed the buckets.
        colname: dict of table name -> list of column names.
        data: dict of table name -> list of rows.
        dep: dict of table name -> list of dependent column names.
        indep: dict of table name -> list of independent column names.
        patt: fraction of the ranked columns to keep.
        discretize: whether to discretize the kept columns.

    Returns:
        The name of the new table, ``'shortenedz'``.
    """

    class Bucket:
        # Per-column container for the (value, id) pairs and split results.
        def __init__(self, name):
            self.pairs = []   # unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {}   # split index -> row positions
            self.lo = {}
            self.hi = {}

        def addpairs(self, pairs):
            self.pairs.append(pairs)

        def addwsum(self, wsum):
            self.wsum = wsum

        def __repr__(self):
            return "n: %s: l: %s e: %s\n" % (
                self.name, len(self.pairs), self.wsum)

    from globfile import buckets

    # Invalidate buckets left over from an earlier call; the keys are kept.
    for key in list(buckets):
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id':
                continue
            if buckets.get(c) is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            bucket = buckets[c]
            for r in data[Z]:
                bucket.addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)

    # Columns absent from the current tables stay None; they carry no data,
    # so they are left out of scoring, ranking and discretization.
    live = dict((name, b) for name, b in buckets.items() if b is not None)
    live = weighted_entropies(live)

    # Lowest weighted entropy first.  list.sort on values()[:] was Python 2
    # only; sorted() is equally stable and works on both major versions.
    ranked = sorted((b for b in buckets.values() if b is not None),
                    key=lambda b: b.wsum)
    keep = max(0, int(len(ranked) * patt))
    chosen = set(b.name for b in ranked[:keep])

    zshort = 'shortenedz'
    outcols = [name for name in colname[z] if name in chosen]
    print("%s #infogained" % (outcols,))

    # Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print("%s #discretized" % (outcols,))

    reader.makeTable(outcols + dep[z], zshort)

    # Decide once per column whether it is copied, instead of once per cell.
    if discretize:
        wanted = [c[1:] in outcols or c in dep[z] for c in colname[z]]
    else:
        wanted = [c in outcols or c in dep[z] for c in colname[z]]
    for r in data[z]:
        reader.addRow([r[i] for i, take in enumerate(wanted) if take], zshort)

    if discretize:
        discretizer(zshort, live)
    for Z in zlst:
        reader.removeTable(Z)
    return zshort
def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    """Build a column-pruned (and optionally discretized) copy of table ``z``.

    Steps, in order:

    1. Every entry of ``globfile.buckets`` is reset to ``None``; a fresh
       bucket is then created for each independent column (other than
       ``'C_id'``) of the tables ``zlst[1:]`` and filled with
       ``(value, str(C_id))`` pairs from those tables.  Each of those tables
       is removed through ``reader.removeTable`` once it has been read.
    2. ``weighted_entropies`` scores the buckets; they are ranked by ascending
       ``wsum`` and the first ``int(count * patt)`` column names are kept.
    3. A table named ``'shortenedz'`` is created through ``reader`` holding
       the kept columns plus ``dep[z]``, and the rows of ``z`` are copied in.
    4. When ``discretize`` is true the kept column names lose their first
       character and ``discretizer`` is run on the new table.
    5. Every table in ``zlst`` is removed through ``reader.removeTable``.

    Relies on the module-level names ``reader``, ``weighted_entropies`` and
    ``discretizer`` and on the project module ``globfile``.

    Args:
        z: name of the table to shorten.
        zlst: list of table names; all but the first feed the buckets.
        colname: dict of table name -> list of column names.
        data: dict of table name -> list of rows.
        dep: dict of table name -> list of dependent column names.
        indep: dict of table name -> list of independent column names.
        patt: fraction of the ranked columns to keep.
        discretize: whether to discretize the kept columns.

    Returns:
        The name of the new table, ``'shortenedz'``.
    """

    class Bucket:
        # Per-column container for the (value, id) pairs and split results.
        def __init__(self, name):
            self.pairs = []   # unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {}   # split index -> row positions
            self.lo = {}
            self.hi = {}

        def addpairs(self, pairs):
            self.pairs.append(pairs)

        def addwsum(self, wsum):
            self.wsum = wsum

        def __repr__(self):
            return "n: %s: l: %s e: %s\n" % (
                self.name, len(self.pairs), self.wsum)

    from globfile import buckets

    # Invalidate buckets left over from an earlier call; the keys are kept.
    for key in list(buckets):
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id':
                continue
            if buckets.get(c) is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            bucket = buckets[c]
            for r in data[Z]:
                bucket.addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)

    # Columns absent from the current tables stay None; they carry no data,
    # so they are left out of scoring, ranking and discretization.
    live = dict((name, b) for name, b in buckets.items() if b is not None)
    live = weighted_entropies(live)

    # Lowest weighted entropy first.  list.sort on values()[:] was Python 2
    # only; sorted() is equally stable and works on both major versions.
    ranked = sorted((b for b in buckets.values() if b is not None),
                    key=lambda b: b.wsum)
    keep = max(0, int(len(ranked) * patt))
    chosen = set(b.name for b in ranked[:keep])

    zshort = 'shortenedz'
    outcols = [name for name in colname[z] if name in chosen]
    print("%s #infogained" % (outcols,))

    # Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print("%s #discretized" % (outcols,))

    reader.makeTable(outcols + dep[z], zshort)

    # Decide once per column whether it is copied, instead of once per cell.
    if discretize:
        wanted = [c[1:] in outcols or c in dep[z] for c in colname[z]]
    else:
        wanted = [c in outcols or c in dep[z] for c in colname[z]]
    for r in data[z]:
        reader.addRow([r[i] for i, take in enumerate(wanted) if take], zshort)

    if discretize:
        discretizer(zshort, live)
    for Z in zlst:
        reader.removeTable(Z)
    return zshort