def transform(self):
    """Write a min-max normalized copy of the input file.

    Every feature column except column 0 (the constant term, per the
    original comment) is rescaled as ``(value - min) / (max - min)`` using
    the extremes returned by ``self.getMaxMin()``.  The user id, item id
    and label of each row are written through unchanged.

    The output path is ``self.filename`` with a trailing ``.bin`` replaced
    by ``.norm.bin``; the open writer is kept on ``self.writer``.
    """
    highs, lows = self.getMaxMin()
    self.reader.open()
    try:
        dim = self.reader.XDim - 1  # skip the constant term
        total = self.reader.LineCount

        # A column whose max equals its min has zero range; divide by 1 so
        # it normalizes to 0 instead of raising ZeroDivisionError.
        spans = []
        for i in range(dim):
            span = highs[i] - lows[i]
            spans.append(span if span != 0 else 1)

        # str.rstrip('.bin') strips any trailing '.', 'b', 'i', 'n'
        # characters (e.g. 'train.bin' -> 'tra'), so remove the suffix
        # explicitly instead.
        base = self.filename
        if base.endswith('.bin'):
            base = base[:-len('.bin')]

        self.writer = BinWriter(base + '.norm.bin')
        self.writer.open(total, self.reader.Dim)
        try:
            for k in range(total):
                (x, userid, itemid, label) = self.reader.readline()
                for i in range(dim):
                    x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                self.writer.writeline(x, userid, itemid, label)
                if k % 10000 == 0:
                    # Progress report every 10000 rows.
                    print('%d/%d' % (k, total))
        finally:
            self.writer.close()
    finally:
        # The reader was reopened above, so release it here as well.
        self.reader.close()
def transform(self):
    """Normalize the input file into a sibling ``.norm.bin`` file.

    Uses the per-column extremes from ``self.getMaxMin()`` to rescale each
    feature column other than column 0 (the constant term, per the original
    comment) to ``(value - min) / (max - min)``.  Each row's user id, item
    id and label are copied to the output as read.

    The writer is stored on ``self.writer``; its path is ``self.filename``
    with a trailing ``.bin`` swapped for ``.norm.bin``.
    """
    highs, lows = self.getMaxMin()
    self.reader.open()
    try:
        dim = self.reader.XDim - 1  # skip the constant term
        total = self.reader.LineCount

        # Guard against constant columns: a zero range would raise
        # ZeroDivisionError, so such columns are divided by 1 (giving 0).
        spans = []
        for i in range(dim):
            span = highs[i] - lows[i]
            spans.append(span if span != 0 else 1)

        # rstrip('.bin') removes a *set of characters*, not the suffix, and
        # would turn 'cabin.bin' into 'ca'; cut the suffix explicitly.
        base = self.filename
        if base.endswith('.bin'):
            base = base[:-len('.bin')]

        self.writer = BinWriter(base + '.norm.bin')
        self.writer.open(total, self.reader.Dim)
        try:
            for k in range(total):
                (x, userid, itemid, label) = self.reader.readline()
                for i in range(dim):
                    x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                self.writer.writeline(x, userid, itemid, label)
                if k % 10000 == 0:
                    # Progress report every 10000 rows.
                    print('%d/%d' % (k, total))
        finally:
            self.writer.close()
    finally:
        # Close the reader that was reopened for this pass.
        self.reader.close()
class Normailzer(object):
    """Min-max normalizer for feature rows read through a ``BinReader``.

    Column 0 of every row is treated as the constant term (per the original
    comments) and is never rescaled; the remaining ``XDim - 1`` columns are.
    """

    def __init__(self, filename):
        """Remember *filename* and create the reader for it."""
        self.filename = filename
        self.reader = BinReader(filename)

    def getMaxMin(self):
        """Scan the input once and return the per-column extremes.

        Returns:
            A ``(max, min)`` tuple of lists with one entry per feature
            column (row columns ``1 .. XDim - 1``).  Both lists are all
            zeros when the file has no rows.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            highs = [0] * dim
            lows = [0] * dim
            for k in range(total):
                (x, userid, itemid, label) = self.reader.readline()
                if k == 0:
                    # Seed from the first row: seeding with 0 would report a
                    # wrong min for all-positive columns and a wrong max for
                    # all-negative ones.
                    highs = [x[i + 1] for i in range(dim)]
                    lows = list(highs)
                else:
                    for i in range(dim):
                        value = x[i + 1]
                        if value > highs[i]:
                            highs[i] = value
                        if value < lows[i]:
                            lows[i] = value
                if k % 10000 == 0:
                    # Progress report every 10000 rows.
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (highs, lows)

    # Call this method to perform the normalization.
    def transform(self):
        """Write a min-max normalized copy of the input file.

        Each feature column is rescaled as ``(value - min) / (max - min)``;
        user id, item id and label are written through unchanged.  The
        output path is ``self.filename`` with a trailing ``.bin`` replaced
        by ``.norm.bin``, and the writer is kept on ``self.writer``.
        """
        highs, lows = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount

            # A constant column has zero range; divide by 1 so it maps to 0
            # instead of raising ZeroDivisionError.
            spans = []
            for i in range(dim):
                span = highs[i] - lows[i]
                spans.append(span if span != 0 else 1)

            # rstrip('.bin') strips trailing characters from the set
            # {'.', 'b', 'i', 'n'} ('train.bin' -> 'tra'); remove the suffix
            # explicitly instead.
            base = self.filename
            if base.endswith('.bin'):
                base = base[:-len('.bin')]

            self.writer = BinWriter(base + '.norm.bin')
            self.writer.open(total, self.reader.Dim)
            try:
                for k in range(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in range(dim):
                        x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                    self.writer.writeline(x, userid, itemid, label)
                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            # The reader was reopened for this pass; release it too.
            self.reader.close()
class Normailzer(object):
    """Min-max normalizer for feature rows read through a ``BinReader``.

    Column 0 of every row is treated as the constant term (per the original
    comments) and is never rescaled; the remaining ``XDim - 1`` columns are.
    """

    def __init__(self, filename):
        """Remember *filename* and create the reader for it."""
        self.filename = filename
        self.reader = BinReader(filename)

    def getMaxMin(self):
        """Scan the input once and return the per-column extremes.

        Returns:
            A ``(max, min)`` tuple of lists with one entry per feature
            column (row columns ``1 .. XDim - 1``).  Both lists are all
            zeros when the file has no rows.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            highs = [0] * dim
            lows = [0] * dim
            for k in range(total):
                (x, userid, itemid, label) = self.reader.readline()
                if k == 0:
                    # Seed from the first row: seeding with 0 would report a
                    # wrong min for all-positive columns and a wrong max for
                    # all-negative ones.
                    highs = [x[i + 1] for i in range(dim)]
                    lows = list(highs)
                else:
                    for i in range(dim):
                        value = x[i + 1]
                        if value > highs[i]:
                            highs[i] = value
                        if value < lows[i]:
                            lows[i] = value
                if k % 10000 == 0:
                    # Progress report every 10000 rows.
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (highs, lows)

    # Call this method to perform the normalization.
    def transform(self):
        """Write a min-max normalized copy of the input file.

        Each feature column is rescaled as ``(value - min) / (max - min)``;
        user id, item id and label are written through unchanged.  The
        output path is ``self.filename`` with a trailing ``.bin`` replaced
        by ``.norm.bin``, and the writer is kept on ``self.writer``.
        """
        highs, lows = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount

            # A constant column has zero range; divide by 1 so it maps to 0
            # instead of raising ZeroDivisionError.
            spans = []
            for i in range(dim):
                span = highs[i] - lows[i]
                spans.append(span if span != 0 else 1)

            # rstrip('.bin') strips trailing characters from the set
            # {'.', 'b', 'i', 'n'} ('train.bin' -> 'tra'); remove the suffix
            # explicitly instead.
            base = self.filename
            if base.endswith('.bin'):
                base = base[:-len('.bin')]

            self.writer = BinWriter(base + '.norm.bin')
            self.writer.open(total, self.reader.Dim)
            try:
                for k in range(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in range(dim):
                        x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                    self.writer.writeline(x, userid, itemid, label)
                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            # The reader was reopened for this pass; release it too.
            self.reader.close()
sys.exit(1) # When destructing, we need to make the directory if it doesn't exist if args.destruct: if not os.path.exists(args.destruct): os.makedirs(args.destruct) fh = FCH_Root() with BinReader(args.path) as br: fh.fromBinary(br) if not args.quiet: fh.printInfo() fh.destruct(args.destruct, overwrite=args.overwrite) elif args.construct: fh = FCH_Root() fh.construct(args.construct) with BinWriter(args.path, overwrite=args.overwrite) as wr: fh.toBinary(wr) # Sanity read it again! with BinReader(args.path) as br: fh.fromBinary(br) if not args.quiet: fh.printInfo() else: # Default is read the file and print info fh = FCH_Root() with BinReader(args.path) as br: fh.fromBinary(br) if not args.quiet: fh.printInfo() # vim:ts=4:sw=4:et
from BinReader import BinReader from BinWriter import BinWriter TOPN = 198000 reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin') reader.open() writer = BinWriter(reader._filename.rstrip('.bin') + '.top.bin') writer.open(TOPN,reader.Dim) with open('an.csv') as f: items = set(f.readlines()) posi = 0 for i in range(reader.LineCount): (x,userid,itemid,label) = reader.readline() if i < 800000: continue if '%d,%d\n' % (userid,itemid) in items: label = 1 posi+=1 else: label = 0 writer.writeline(x,userid,itemid,label) print ur'正例个数:',posi writer.close() reader.close()