Exemplo n.º 1
0
def weighted_entropies(buckets):
    """Split every bucket's pairs with ``ediv`` and score the bucket.

    For each bucket the sorted ``pairs`` are passed to ``ediv.ediv``; element
    ``[1]`` of every returned item is used as the list of pairs belonging to
    one segment.  Two attributes are then written on the bucket:

    * ``dinds`` -- dict mapping segment number to the positions (in the
      original, unsorted ``pairs`` list) of the pairs placed in that segment.
    * ``wsum`` -- the per-segment terms ``-p * log2(p)`` (``p`` = segment
      size / number of pairs), averaged with weight ``1 / segment size``.

    Entries whose value is ``None`` are skipped.  Empty segments add nothing
    to ``wsum``, and ``wsum`` is left untouched when no segment holds a pair.

    Returns the same ``buckets`` mapping that was passed in.
    """
    import ediv
    import math

    for b in buckets.values():
        if b is None:
            continue
        ps = sorted(b.pairs)
        n = len(ps)
        # Work on copies so the lists owned by ediv's result are not mutated.
        segments = [list(d[1]) for d in ediv.ediv(ps)]
        # Sizes are captured before pairs are consumed from the segments below.
        sizes = [float(len(members)) for members in segments]

        dinds = dict((ind, []) for ind in range(len(segments)))
        for pos, pair in enumerate(b.pairs):
            for ind, members in enumerate(segments):
                if pair in members:
                    # Consume one occurrence and stop: each row belongs to
                    # exactly one segment, even when equal pairs straddle a
                    # segment boundary.
                    members.remove(pair)
                    dinds[ind].append(pos)
                    break
        b.dinds = dinds

        # Weighted average of the entropy terms, weight = 1 / segment size.
        wsum = 0.0
        total = 0.0
        for size in sizes:
            if size == 0:
                # An empty segment has no entropy term and an undefined
                # weight (1 / 0); skip it instead of dividing by zero.
                continue
            p = size / n
            wsum += -p * math.log(p, 2) / size
            total += 1 / size
        if total:
            b.wsum = wsum / total
    return buckets
Exemplo n.º 2
0
def weighted_entropies(buckets):
    """Compute segment membership and a weighted entropy score per bucket.

    Each bucket's ``pairs`` are sorted and handed to ``ediv.ediv``.  Item
    ``[1]`` of every element of the result is treated as the list of pairs in
    one segment.  The function then stores on the bucket:

    * ``dinds``: ``{segment number: [positions in the unsorted pairs list]}``
    * ``wsum``: sum of ``-p * log2(p) / size`` over the non-empty segments
      divided by the sum of ``1 / size`` (``p`` = size / number of pairs).

    ``None`` values in ``buckets`` are ignored.  A bucket whose segments are
    all empty keeps its previous ``wsum``.

    Returns ``buckets`` itself.
    """
    import ediv
    import math

    for c, b in buckets.items():
        if b is None:
            continue
        ps = sorted(b.pairs)
        pair_count = len(ps)

        remaining = []   # per segment: pairs not yet matched to a row
        lengths = []     # per segment: original size, as float
        dinds = {}
        for ind, d in enumerate(ediv.ediv(ps)):
            # Copy so that consuming pairs below leaves ediv's lists intact.
            remaining.append(list(d[1]))
            lengths.append(len(d[1]) * 1.0)
            dinds[ind] = []

        for row, pair in enumerate(b.pairs):
            for ind, pool in enumerate(remaining):
                try:
                    pool.remove(pair)
                except ValueError:
                    continue
                # First segment still holding this pair wins; stopping here
                # keeps a row from being filed under two segments when
                # duplicate pairs fall on both sides of a split.
                dinds[ind].append(row)
                break
        b.dinds = dinds

        # Weighted sum of entropies; empty segments are skipped because their
        # weight 1 / 0 is undefined.
        weights = [1 / l for l in lengths if l != 0]
        terms = [
            -(l / pair_count) * math.log(l / pair_count, 2) / l
            for l in lengths if l != 0
        ]
        total = sum(weights)
        if total:
            b.wsum = sum(terms) / total
    return buckets
Exemplo n.º 3
0
def discretizer(z, buckets):
    """Replace bucketed cells of table ``z`` with their segment number.

    For every bucket whose name, minus its first character, is a column of
    ``colname[z]``, each row listed in the bucket's ``dinds`` has that
    column's cell overwritten with the segment key.  Before overwriting, the
    smallest and largest cell value seen per segment are recorded in the
    bucket's ``lo`` and ``hi`` dicts.

    Reads the module-level ``args``, ``colname``, ``data`` and ``indexOf``;
    writes a notice to stderr when ``args['v'] > -1``.
    """
    if args['v'] > -1:
        sys.stderr.write("\n#Discretiziing data.\n")
    for bucket in buckets.values():
        column = bucket.name[1:]
        if column not in colname[z]:
            continue
        cind = indexOf(z, column)
        for segment, row_ids in bucket.dinds.items():
            for row_id in row_ids:
                row = data[z][row_id]
                cell = row[cind]
                # Track the running minimum / maximum of this segment.
                if segment not in bucket.lo or cell < bucket.lo[segment]:
                    bucket.lo[segment] = cell
                if segment not in bucket.hi or cell > bucket.hi[segment]:
                    bucket.hi[segment] = cell
                # The raw value is replaced by the segment it fell into.
                row[cind] = segment
Exemplo n.º 4
0
def discretizer(z, buckets):
    """Discretize table ``z`` in place using the segments stored on ``buckets``.

    A bucket applies when its name without the leading character appears in
    ``colname[z]``.  For each ``(segment key, row positions)`` entry of the
    bucket's ``dinds``, the matching cells are replaced by the segment key,
    and the bucket's ``lo`` / ``hi`` dicts are updated with the minimum /
    maximum original cell value seen for that key.

    Uses the module-level ``args``, ``colname``, ``data`` and ``indexOf``.
    Emits a notice on stderr when ``args['v']`` is greater than -1.
    """
    if args['v'] > -1:
        sys.stderr.write("\n#Discretiziing data.\n")

    applicable = [b for b in buckets.values() if b.name[1:] in colname[z]]
    for b in applicable:
        col = indexOf(z, b.name[1:])
        for key, positions in b.dinds.items():
            for pos in positions:
                record = data[z][pos]
                original = record[col]

                # Lower bound: first value seen, or anything smaller.
                if key in b.lo:
                    if original < b.lo[key]:
                        b.lo[key] = original
                else:
                    b.lo[key] = original

                # Upper bound: first value seen, or anything larger.
                if key in b.hi:
                    if original > b.hi[key]:
                        b.hi[key] = original
                else:
                    b.hi[key] = original

                record[col] = key
Exemplo n.º 5
0
def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    """Prune the columns of table ``z`` by weighted entropy and copy the rest.

    Steps, in order:

    1. Every entry of the shared ``globfile.buckets`` registry is reset to
       ``None``.
    2. For each table in ``zlst[1:]`` and each of its ``indep`` columns other
       than ``'C_id'``, ``(cell value, str(C_id value))`` pairs are collected
       into a per-column ``Bucket``; the table is then removed via
       ``reader.removeTable``.
    3. ``weighted_entropies`` scores the buckets and the
       ``int(len(buckets) * patt)`` buckets with the lowest ``wsum`` are kept
       (clamped to the number of buckets available).
    4. A table named ``'shortenedz'`` is created holding the kept columns
       plus ``dep[z]``, filled from ``data[z]``.
    5. When ``discretize`` is true, ``discretizer`` is run on the new table.
    6. Every table in ``zlst`` is removed.

    When ``discretize`` is true the kept column names lose their first
    character (presumably a type-marker prefix -- confirm against the
    reader's naming scheme).

    Registry entries still ``None`` after step 2 (columns not present in the
    current tables) are ignored rather than passed on, because the
    downstream code dereferences every bucket.

    Returns the name of the new table, ``'shortenedz'``.
    """
    class Bucket:
        # Per-column accumulator of row pairs plus the split results.
        def __init__(self, name):
            self.pairs = []   # unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {}   # segment number -> row positions
            self.lo = {}
            self.hi = {}

        def addpairs(self, pairs):
            self.pairs.append(pairs)

        def addwsum(self, wsum):
            self.wsum = wsum

        def __repr__(self):
            s = 'n: ' + str(self.name) + ":"
            s += ' l: ' + str(len(self.pairs))
            s += ' e: ' + str(self.wsum) + '\n'
            return s

    from globfile import buckets

    # Reset the shared registry in place; keys are kept, values cleared.
    for key in list(buckets):
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id':
                continue
            if buckets.get(c) is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            bucket = buckets[c]
            for r in data[Z]:
                bucket.addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)

    # Only buckets that were actually filled are scored and discretized.
    live = dict((k, b) for k, b in buckets.items() if b is not None)
    live = weighted_entropies(live)
    ranked = sorted(live.values(), key=lambda b: b.wsum)
    # Clamp so patt > 1 or patt < 0 cannot index outside the ranking.
    keep = max(0, min(len(ranked), int(len(ranked) * patt)))
    chosen = set(b.name for b in ranked[:keep])

    zshort = 'shortenedz'
    # Preserve the column order of the source table.
    outcols = [c for c in colname[z] if c in chosen]
    print("%s %s" % (outcols, "#infogained"))
    # Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print("%s %s" % (outcols, "#discretized"))

    reader.makeTable(outcols + dep[z], zshort)

    # Decide once which source columns are copied, then project every row.
    wanted = set(outcols)
    wanted_dep = set(dep[z])
    kept_idx = [
        i for i, c in enumerate(colname[z])
        if (c[1:] if discretize else c) in wanted or c in wanted_dep
    ]
    for r in data[z]:
        reader.addRow([r[i] for i in kept_idx], zshort)

    if discretize:
        discretizer(zshort, live)
    # NOTE(review): tables in zlst[1:] were already removed above; this
    # second removal is kept from the original -- confirm removeTable
    # tolerates it.
    for Z in zlst:
        reader.removeTable(Z)
    return zshort
Exemplo n.º 6
0
def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    """Select the lowest-entropy columns of ``z`` and build a reduced table.

    The function

    * clears the shared ``globfile.buckets`` registry (values set to
      ``None``),
    * fills one ``Bucket`` per ``indep`` column (``'C_id'`` excluded) of the
      tables ``zlst[1:]`` with ``(cell value, str(C_id value))`` pairs and
      removes each of those tables through ``reader.removeTable``,
    * scores the buckets with ``weighted_entropies`` and keeps the first
      ``int(len(buckets) * patt)`` of them in ascending ``wsum`` order
      (never more than exist, never fewer than zero),
    * creates the table ``'shortenedz'`` from the kept columns plus
      ``dep[z]`` and copies the rows of ``data[z]`` into it,
    * runs ``discretizer`` on that table when ``discretize`` is true,
    * removes every table listed in ``zlst``.

    With ``discretize`` true, the kept column names are stored without their
    first character (assumed to be a marker prefix -- TODO confirm).

    Registry keys that no current table refills stay ``None``; they are
    filtered out before scoring because the scoring, ranking and
    discretizing code all access attributes on every bucket.

    Returns ``'shortenedz'``.
    """
    class Bucket:
        # Holds the (value, class id) pairs of one column and its split data.
        def __init__(self, name):
            self.pairs = []  # unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {}  # sorted split indexes
            self.lo = {}
            self.hi = {}

        def addpairs(self, pairs):
            self.pairs.append(pairs)

        def addwsum(self, wsum):
            self.wsum = wsum

        def __repr__(self):
            s = 'n: ' + str(self.name) + ":"
            s += ' l: ' + str(len(self.pairs))
            s += ' e: ' + str(self.wsum) + '\n'
            return s

    from globfile import buckets

    for key in list(buckets.keys()):
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id':
                continue
            # Covers both a missing key and a key that was reset to None.
            if buckets.get(c) is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            for r in data[Z]:
                buckets[c].addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)

    # Drop stale (None) registry entries before any bucket is dereferenced.
    filled = {}
    for key, bucket in buckets.items():
        if bucket is not None:
            filled[key] = bucket
    filled = weighted_entropies(filled)

    vals = list(filled.values())
    vals.sort(key=lambda x: x.wsum)
    # Bound the cut-off so an out-of-range patt cannot raise IndexError.
    limit = int(len(vals) * patt)
    limit = min(max(limit, 0), len(vals))
    selected = set(v.name for v in vals[:limit])

    zshort = 'shortenedz'
    outcols = [name for name in colname[z] if name in selected]
    print("%s %s" % (outcols, "#infogained"))
    # Convert outcols to discrete attributes
    if discretize:
        outcols = [name[1:] for name in outcols]
        print("%s %s" % (outcols, "#discretized"))

    table_cols = outcols + dep[z]
    reader.makeTable(table_cols, zshort)

    # The set of copied columns is the same for every row: compute it once.
    out_names = set(outcols)
    dep_names = set(dep[z])
    positions = []
    for i, c in enumerate(colname[z]):
        if discretize:
            take = c[1:] in out_names or c in dep_names
        else:
            take = c in out_names or c in dep_names
        if take:
            positions.append(i)
    for r in data[z]:
        reader.addRow([r[i] for i in positions], zshort)

    if discretize:
        discretizer(zshort, filled)
    # NOTE(review): zlst[1:] tables were already removed in the collection
    # loop; the repeated call is retained from the original -- verify that
    # reader.removeTable accepts an already-removed table.
    for Z in zlst:
        reader.removeTable(Z)
    return zshort