예제 #1
0
파일: tiles.py 프로젝트: dngajjar/Courses
def tilesv2(table, numdim, outfile):
    """Project ``table``, split it into tiles and collect the resulting tables.

    A ``Tile`` settings object is configured, an empty "centre" table with the
    projected table's column names is stored under key ``0`` of the result
    dictionary, and ``tiles0`` / ``tiles4`` are then run on the projected
    table.  The projected table itself is stored under the key ``'project'``.

    ``numdim`` is accepted but not used by this function.

    Returns a tuple ``(centable, n)`` where ``n`` is the length of the first
    data column of ``centable[0]``.
    """
    projected = project.project(table)

    settings = Tile()
    settings.tiny = 4  # the minimum instance num to assign a leaf
    settings.pre = ''
    settings.m = len(table.data[0])  # number of instances in the input table
    settings.big = 2 * math.sqrt(settings.m)
    settings.watch = 1
    settings.centers = 'centroids'

    # Dictionary holding every split table, including the centre table (key 0).
    centable = {}
    centre = tablestr.Table()
    reader.makeTable(projected.name, centre)
    centable[0] = centre

    tiles0(projected, settings)
    # The prefix is read after tiles0 has run, exactly as before.
    prefix = settings.pre
    tiles4(1, settings.m, 1, settings.m, projected, settings, centable, 1,
           prefix, outfile)
    centable['project'] = projected

    # Disabled step kept for reference: min-max rescaling of the $_XX / $_YY
    # columns of centable[0].
    # x = centable[0].name.index('$_XX')
    # y = centable[0].name.index('$_YY')
    # maxvx = float(max(centable[0].data[x])); minvx = float(min(centable[0].data[x]))
    # maxvy = float(max(centable[0].data[y])); minvy = float(min(centable[0].data[y]))
    # for i in range(len(centable[0].data[0])):
    #     centable[0].data[x][i] = str('%4.8f'%((float(centable[0].data[x][i])  - minvx)/(maxvx - minvx)))
    #     centable[0].data[y][i] = str('%4.8f'%((float(centable[0].data[y][i])  - minvy)/(maxvy - minvy)))
    return centable, len(centable[0].data[0])
예제 #2
0
파일: tshortener.py 프로젝트: nave91/miner
def distance_pruner(zlst):
    #Prunes cluster tree i.e. zlst with distance between their centroids
    

    if args['v'] > -1:
        sys.stderr.write("\n#Pruning data based on eucledian distance between clusters.\n")
        print zlst," #Old zlst before distprune"
    
    import dist
    z0 = zlst[0]
    pairs = []
    for _i,i in enumerate(data[z0]):
        for _j,j in enumerate(data[z0]):
            if i != j:
                if dist.dist(i,j,z0,indep,nump) < 0.3:
                    if [_i,_j] not in pairs and [_j,_i] not in pairs:
                        pairs.append(['__'+str(_i+1),'__'+str(_j+1)])
                    
    def repaired(pairs):
        for i in pairs:
            for j in pairs:
                if i != j :
                    if i[0] in j or i[1] in j:
                        pairs = [list(set(i+j))]+\
                        [k for k in pairs if k not in [i,j]]
                        return repaired(pairs)
        return pairs
    
    repairs = repaired(pairs)
    ps = []
    for i in repairs:
        ps+=i
    for i in zlst[1:]:
        if i not in ps:
            repairs.append([i])
    temp_row = {}
    
    for ind,p in enumerate(repairs):
        temp_row[ind] = []
        for i in p:
            temp_row[ind] += data[i]
    col = colname[z0]
    for Z in zlst:
        reader.removeTable(Z)
    zlst = [None]
    for ind,value in enumerate(temp_row.values()):
        Z = '__'+str(ind+1)
        reader.makeTable(col,Z)
        for r in value:
            reader.addRow(r[:len(r)-1]+[ind],Z)
        zlst.append(Z)
    xy_lib.buildzero(zlst,'',args['e'])

    if args['v'] > -1:
        print zlst," #New zlst after distprune"
    return zlst
예제 #3
0
def distance_pruner(zlst):
    #Prunes cluster tree i.e. zlst with distance between their centroids

    if args['v'] > -1:
        sys.stderr.write(
            "\n#Pruning data based on eucledian distance between clusters.\n")
        print zlst, " #Old zlst before distprune"

    import dist
    z0 = zlst[0]
    pairs = []
    for _i, i in enumerate(data[z0]):
        for _j, j in enumerate(data[z0]):
            if i != j:
                if dist.dist(i, j, z0, indep, nump) < 0.3:
                    if [_i, _j] not in pairs and [_j, _i] not in pairs:
                        pairs.append(['__' + str(_i + 1), '__' + str(_j + 1)])

    def repaired(pairs):
        for i in pairs:
            for j in pairs:
                if i != j:
                    if i[0] in j or i[1] in j:
                        pairs = [list(set(i+j))]+\
                        [k for k in pairs if k not in [i,j]]
                        return repaired(pairs)
        return pairs

    repairs = repaired(pairs)
    ps = []
    for i in repairs:
        ps += i
    for i in zlst[1:]:
        if i not in ps:
            repairs.append([i])
    temp_row = {}

    for ind, p in enumerate(repairs):
        temp_row[ind] = []
        for i in p:
            temp_row[ind] += data[i]
    col = colname[z0]
    for Z in zlst:
        reader.removeTable(Z)
    zlst = [None]
    for ind, value in enumerate(temp_row.values()):
        Z = '__' + str(ind + 1)
        reader.makeTable(col, Z)
        for r in value:
            reader.addRow(r[:len(r) - 1] + [ind], Z)
        zlst.append(Z)
    xy_lib.buildzero(zlst, '', args['e'])

    if args['v'] > -1:
        print zlst, " #New zlst after distprune"
    return zlst
예제 #4
0
def attrtable(table, attrlst):
    """Return a new table holding only the selected columns plus the class.

    A column is selected when its name, or its name without the first
    character, appears in ``attrlst``.  The column at ``table.klass[0]`` is
    always appended last.  Rows are copied in their original order.
    """
    keep = [pos for pos, col in enumerate(table.name)
            if col[1:] in attrlst or col in attrlst]
    keep.append(table.klass[0])

    ntable = tablestr.Table()
    reader.makeTable([table.name[pos] for pos in keep], ntable)
    for s in range(len(table.data[0])):
        # table.data is stored column-wise: data[column][row].
        reader.addRow([table.data[pos][s] for pos in keep], ntable)
    return ntable
예제 #5
0
def discrete(table, bins):
    """Build a discretised copy of ``table`` and return both tables.

    ``labels.gbreaks`` fills a dictionary of break points; the entry for
    ``bins`` is passed, together with ``labels.globalf``, to ``discrete1``,
    which populates the new table.

    Returns ``{0: table, 'd': discretised_table}``.
    """
    # Previously used alternative pair: labels.ewdbreaks / labels.ewdlablef
    fill_breaks = labels.gbreaks
    label_fn = labels.globalf

    cuts = {}
    fill_breaks(cuts)

    discrete_names = labels.discreteNames(table.name, table.num)
    ntable = tablestr.Table()
    reader.makeTable(discrete_names, ntable)
    discrete1(table, ntable, bins, cuts[bins], label_fn)

    return {0: table, 'd': ntable}
예제 #6
0
파일: xval.py 프로젝트: gokycat/LexisNexis
def xval(start,stop,rows,table,f):
	rmax=len(rows)
	r=0
	train = reader.makeTable(table.header)
	test = reader.makeTable(table.header)
	while(r<rmax):
		d=rows[r]
		r+=1
		if ((r>= start) & (r <= stop)):
			reader.addRow(d, train)
		else:
			reader.addRow(d, test)
	print test
	return f.zeror(train.klass.expected, test.klass)
예제 #7
0
def discrete(table, t, bins):
    tables = {}
    #breaks = labels.ewdbreaks; label = labels.ewdlablef
    breaks = labels.gbreaks; label = labels.globalf 
    b = {}  
    breaks(b)
    newNames = labels.discreteNames(table.name, table.num)
    ntable = tablestr.Table()
    reader.makeTable(newNames, ntable)
    discrete1(table, ntable, bins, b[bins], label)
    print 'b[1]=', b[3][0]
    print 'b[2]=', b[3][1]
    tables[0] = table
    t1 = 'D_'+ str(t)
    tables[t1] = ntable    
    return tables
예제 #8
0
파일: tiles.py 프로젝트: dngajjar/Courses
def makeNewTable(has, c1, table, tile, centable):
    """Copy the rows listed in ``has`` into a new table and record its centre.

    Every copied row gets ``str(c1 * 100)`` written into its ``'$_ZZ'``
    column.  The first item returned by ``tablestr.centroid`` for the new
    table gets the same marker and is appended to ``centable[0]``; the new
    table itself is stored in ``centable`` under ``(c1 * 100) / 100``.

    ``tile`` is accepted but not used here.  Nothing is returned; ``centable``
    is updated in place.
    """
    c1 = c1 * 100
    marker = str(c1)
    z = table.name.index('$_ZZ')

    newtable = tablestr.Table()
    reader.makeTable(table.name, newtable)
    for d in has:
        # table.data is column-wise, so pick cell d out of every column.
        picked = [column[d] for column in table.data]
        picked[z] = marker
        reader.addRow(picked, newtable)

    centers = tablestr.centroid(newtable)  # centers[0] is mu or mode
    centre_row = centers[0]
    centre_row[z] = marker
    reader.addRow(centre_row, centable[0])
    centable[c1/100] = newtable
            
    
예제 #9
0
def widen(table, x, y):
    """Return a copy of ``table`` extended with four extra columns.

    The added columns are ``'$_XX'`` (taken from ``x``), ``'$_YY'`` (taken
    from ``y``), ``'$_Hell'`` (a copy of the column at ``table.klass[0]``) and
    ``'$_ZZ'`` (the string ``'0'`` for every row).
    """
    adds = table.name[:]
    adds.extend(['$_XX', '$_YY', '$_Hell', '$_ZZ'])

    ntable = tablestr.Table()
    reader.makeTable(adds, ntable)
    for s in range(len(table.data[0])):
        # table.data is column-wise: rebuild row s, then append the extras.
        row = [column[s] for column in table.data]
        row.extend([x[s], y[s], table.data[table.klass[0]][s], str(0)])
        reader.addRow(row, ntable)
    return ntable
예제 #10
0
def mutate(conds, wcluster, appender):
    """Create a table with the rows of ``wcluster`` matching any condition.

    Each condition is indexed as ``(column_name, le_flag, threshold)``: when
    ``le_flag`` is true a row matches if its cell is ``<= threshold``,
    otherwise if its cell is ``> threshold``.  A row that matches several
    conditions (or equals a row already kept) is stored only once.

    The new table is registered under ``wcluster + appender`` with the same
    column names, and that name is returned.
    """
    kept = []
    for cond in conds:
        col_pos = colname[wcluster].index(cond[0])
        for row in data[wcluster]:
            if cond[1]:
                hit = row[col_pos] <= cond[2]
            else:
                hit = row[col_pos] > cond[2]
            if hit and row not in kept:
                kept.append(row)

    wced = wcluster + appender
    reader.makeTable(colname[wcluster], wced)
    for row in kept:
        reader.addRow(row, wced)
    return wced
예제 #11
0
파일: diff.py 프로젝트: nave91/miner
def mutate(conds,wcluster,appender):
    """Create a table with the rows of ``wcluster`` matching any condition.

    Each condition is indexed as ``(column_name, le_flag, threshold)``: when
    ``le_flag`` is true a row matches if its cell is ``<= threshold``,
    otherwise if its cell is ``> threshold``.  A row that matches several
    conditions (or equals a row already kept) is stored only once.

    The new table is registered under ``wcluster + appender`` with the same
    column names, and that name is returned.
    """
    selected = []
    for cond in conds:
        position = colname[wcluster].index(cond[0])
        for record in data[wcluster]:
            matched = (record[position] <= cond[2] if cond[1]
                       else record[position] > cond[2])
            if not matched:
                continue
            if record not in selected:
                selected.append(record)

    wced = wcluster+appender
    reader.makeTable(colname[wcluster],wced)
    for record in selected:
        reader.addRow(record,wced)
    return wced
예제 #12
0
파일: uxval.py 프로젝트: dngajjar/Courses
def xval(start, stop, rows, tables):
    """Split the rows of ``tables`` into a training and a testing table.

    ``rows`` is a sequence of row indexes into the column-wise
    ``tables.data``.  Entries whose position ``r`` in ``rows`` satisfies
    ``start <= r < stop`` go to the test table, all others to the train
    table.  Both tables are passed through ``reader.klasses`` before being
    returned.

    Returns ``{'train': train_table, 'test': test_table}``.
    """
    testT = tablestr.Table()
    trainT = tablestr.Table()
    reader.makeTable(tables.name, testT)
    reader.makeTable(tables.name, trainT)

    for position, d in enumerate(rows):
        record = [tables.data[j][d] for j in range(len(tables.order))]
        # Positions inside [start, stop) belong to the testing data set.
        target = testT if start <= position < stop else trainT
        reader.addRow(record, target)

    testT = reader.klasses(testT)
    trainT = reader.klasses(trainT)
    return {'train': trainT, 'test': testT}
예제 #13
0
파일: tshortener.py 프로젝트: nave91/miner
def tshortener(z,zlst,colname,data,dep,indep,patt=1.0,discretize=True):
    #The infogain techniques of pruning columns and discretization
    class Bucket:
        #class for each column with splitted pairs of data
        def __init__(self,name):
            self.pairs = [] #unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {} #sorted split indexs
            self.lo = {}
            self.hi = {}
            
        def addpairs(self,pairs):
            self.pairs.append(pairs)
        def addwsum(self,wsum):
            self.wsum = wsum
        def __repr__(self):
            s = 'n: '+str(self.name)+":"
            s += ' l: '+str(len(self.pairs))
            s += ' e: '+str(self.wsum)+'\n'
            return s

    from globfile import buckets
    outcols = []

    for key,value in buckets.items():
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id': continue
            if c not in buckets.keys():
                buckets[c] =  Bucket(c)
            elif buckets[c] is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            for r in data[Z]:
                buckets[c].addpairs((r[ind],str(r[cind])))
        reader.removeTable(Z)
    buckets = weighted_entropies(buckets)
    vals = buckets.values()[:]
    vals.sort(key=lambda x: x.wsum,reverse=False)
    for i in range(0,int(len(vals)*patt)):
        outcols.append(vals[i].name)
    zshort = 'shortenedz'
    outcols = [i for i in colname[z] if i in outcols]
    print outcols,"#infogained"
    #Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print outcols,"#discretized"

    reader.makeTable(outcols+dep[z],zshort)
    for r in data[z]:
        temp = []
        for i,c in enumerate(colname[z]):
            if discretize:
                if c[1:] in outcols or c in dep[z]:
                    temp.append(r[i])
            else:
                if c in outcols+dep[z]:
                    temp.append(r[i])
        reader.addRow(temp,zshort)
    if discretize: discretizer(zshort,buckets)
    for Z in zlst:
        reader.removeTable(Z)
    #discretizer(zshort,buckets)
    return zshort
예제 #14
0
def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    #The infogain techniques of pruning columns and discretization
    class Bucket:
        #class for each column with splitted pairs of data
        def __init__(self, name):
            self.pairs = []  #unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {}  #sorted split indexs
            self.lo = {}
            self.hi = {}

        def addpairs(self, pairs):
            self.pairs.append(pairs)

        def addwsum(self, wsum):
            self.wsum = wsum

        def __repr__(self):
            s = 'n: ' + str(self.name) + ":"
            s += ' l: ' + str(len(self.pairs))
            s += ' e: ' + str(self.wsum) + '\n'
            return s

    from globfile import buckets
    outcols = []

    for key, value in buckets.items():
        buckets[key] = None

    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id': continue
            if c not in buckets.keys():
                buckets[c] = Bucket(c)
            elif buckets[c] is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            for r in data[Z]:
                buckets[c].addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)
    buckets = weighted_entropies(buckets)
    vals = buckets.values()[:]
    vals.sort(key=lambda x: x.wsum, reverse=False)
    for i in range(0, int(len(vals) * patt)):
        outcols.append(vals[i].name)
    zshort = 'shortenedz'
    outcols = [i for i in colname[z] if i in outcols]
    print outcols, "#infogained"
    #Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print outcols, "#discretized"

    reader.makeTable(outcols + dep[z], zshort)
    for r in data[z]:
        temp = []
        for i, c in enumerate(colname[z]):
            if discretize:
                if c[1:] in outcols or c in dep[z]:
                    temp.append(r[i])
            else:
                if c in outcols + dep[z]:
                    temp.append(r[i])
        reader.addRow(temp, zshort)
    if discretize: discretizer(zshort, buckets)
    for Z in zlst:
        reader.removeTable(Z)
    #discretizer(zshort,buckets)
    return zshort