def test(): theta = readTheta() count = 0 reader = BinReader(FILENAME) reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x, userid, itemid, label) = reader.readline() x[0] = 1 y = np.dot(x[:37], theta) result[i] = (userid, itemid, y) if i % 10000 == 0: print '%d/%d' % (i, reader.LineCount) result.sort(key=lambda x: x[2], reverse=True) result = result[:6500] print ur'样本总数:', count print ur'正在输出...' with open('result.csv', 'w') as f: for item in result: f.write('%d,%d\n' % (item[0], item[1])) print ur'测试结束,输出个数:', 6500
def test(): theta = readTheta() count = 0 reader = BinReader(FILENAME) reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x,userid,itemid,label) = reader.readline() x[0] = 1 y = np.dot(x[:37],theta) result[i] = (userid,itemid,y) if i % 10000 == 0: print '%d/%d' % (i,reader.LineCount) result.sort(key=lambda x:x[2],reverse=True) result = result[:6500] print ur'样本总数:',count print ur'正在输出...' with open('result.csv','w') as f: for item in result: f.write('%d,%d\n' % (item[0],item[1])) print ur'测试结束,输出个数:',6500
def test(theta): print '----------------------' print ur'正在测试' (raw_data,posiCount) = BinReader.readData(r'c:\data\homework\1218t20_3.bin') x,real_y,online,result = getXY(np.array(raw_data)) x = mapFeature(x) values = sigmoid(np.dot(x ,theta)) #第3行是真实的结果,第4行是预测的概率 result[:,3] = values result = list(result) result.sort(key=lambda x:x[3],reverse=True) print ur'共获得结果%d' % sum(values > 0.5) result = result[:1300] right = sum([item[2] for item in result]) precision,recall,f1 = print_analyse(right,1300,posiCount) print ur'测试完毕'
def test(theta): print '----------------------' print ur'正在测试' (raw_data, posiCount) = BinReader.readData(r'c:\data\homework\1218t20_3.bin') x, real_y, online, result = getXY(np.array(raw_data)) x = mapFeature(x) values = sigmoid(np.dot(x, theta)) #第3行是真实的结果,第4行是预测的概率 result[:, 3] = values result = list(result) result.sort(key=lambda x: x[3], reverse=True) print ur'共获得结果%d' % sum(values > 0.5) result = result[:1300] right = sum([item[2] for item in result]) precision, recall, f1 = print_analyse(right, 1300, posiCount) print ur'测试完毕'
class Normailzer(object):
    """Min-max scale the feature columns of a BinReader file.

    Column 0 of every feature vector is left untouched (the original
    comments call it the constant term); columns 1..XDim-1 are rescaled and
    the records are written to a sibling ``<name>.norm.bin`` file.
    """

    def __init__(self, filename):
        """Remember *filename* and wrap it in a ``BinReader``."""
        self.filename = filename
        self.reader = BinReader(filename)

    @staticmethod
    def _output_path(filename):
        """Return *filename* with a trailing ``.bin`` replaced by ``.norm.bin``.

        ``str.rstrip('.bin')`` removes any trailing run of the characters
        ``.``, ``b``, ``i`` and ``n`` rather than the literal suffix, so
        ``main.bin`` used to become ``ma.norm.bin``.
        """
        suffix = '.bin'
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
        return filename + '.norm.bin'

    def getMaxMin(self):
        """Scan the whole file and return ``(max, min)`` lists per scaled column.

        Both running extremes start at 0, so the returned range of every
        column always includes 0.
        NOTE(review): confirm that starting from 0 (rather than from the
        first record) is intended for strictly positive/negative columns.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            upper = [0] * dim
            lower = [0] * dim
            for k in xrange(total):
                (x, userid, itemid, label) = self.reader.readline()
                for i in xrange(dim):
                    value = x[i + 1]
                    if value > upper[i]:
                        upper[i] = value
                    if value < lower[i]:
                        lower[i] = value
                if k % 10000 == 0:
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (upper, lower)

    # Call this method to perform the normalisation.
    def transform(self):
        """Write a normalised copy of the input file next to it."""
        (upper, lower) = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            spans = [upper[i] - lower[i] for i in xrange(dim)]
            self.writer = BinWriter(self._output_path(self.filename))
            self.writer.open(total, self.reader.Dim)
            try:
                for k in xrange(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in xrange(dim):
                        # A zero span means the column never left 0, so the
                        # value is kept as is instead of dividing by zero.
                        if spans[i]:
                            x[i + 1] = (x[i + 1] - lower[i]) / spans[i]
                    self.writer.writeline(x, userid, itemid, label)
                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            # The reader was previously left open after transform().
            self.reader.close()
class Normailzer(object):
    """Min-max scale the feature columns of a BinReader file.

    Column 0 of every feature vector is left untouched (the original
    comments call it the constant term); columns 1..XDim-1 are rescaled and
    the records are written to a sibling ``<name>.norm.bin`` file.
    """

    def __init__(self, filename):
        """Remember *filename* and wrap it in a ``BinReader``."""
        self.filename = filename
        self.reader = BinReader(filename)

    @staticmethod
    def _output_path(filename):
        """Return *filename* with a trailing ``.bin`` replaced by ``.norm.bin``.

        ``str.rstrip('.bin')`` removes any trailing run of the characters
        ``.``, ``b``, ``i`` and ``n`` rather than the literal suffix, so
        ``main.bin`` used to become ``ma.norm.bin``.
        """
        suffix = '.bin'
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
        return filename + '.norm.bin'

    def getMaxMin(self):
        """Scan the whole file and return ``(max, min)`` lists per scaled column.

        Both running extremes start at 0, so the returned range of every
        column always includes 0.
        NOTE(review): confirm that starting from 0 (rather than from the
        first record) is intended for strictly positive/negative columns.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            upper = [0] * dim
            lower = [0] * dim
            for k in xrange(total):
                (x, userid, itemid, label) = self.reader.readline()
                for i in xrange(dim):
                    value = x[i + 1]
                    if value > upper[i]:
                        upper[i] = value
                    if value < lower[i]:
                        lower[i] = value
                if k % 10000 == 0:
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (upper, lower)

    # Call this method to perform the normalisation.
    def transform(self):
        """Write a normalised copy of the input file next to it."""
        (upper, lower) = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term
            total = self.reader.LineCount
            spans = [upper[i] - lower[i] for i in xrange(dim)]
            self.writer = BinWriter(self._output_path(self.filename))
            self.writer.open(total, self.reader.Dim)
            try:
                for k in xrange(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in xrange(dim):
                        # A zero span means the column never left 0, so the
                        # value is kept as is instead of dividing by zero.
                        if spans[i]:
                            x[i + 1] = (x[i + 1] - lower[i]) / spans[i]
                    self.writer.writeline(x, userid, itemid, label)
                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            # The reader was previously left open after transform().
            self.reader.close()
def load(self):
    """Read every record of ``self._filename`` into ``self.data``.

    Sets ``self.LineCount`` from the reader, forces element ``[0][0]`` of
    each record to 1, and counts in ``self.PosiCount`` the records whose
    element ``[3]`` equals 1.
    """
    source = BinReader(self._filename)
    source.open()
    self.LineCount = source.LineCount
    self.data = [0] * source.LineCount
    self.PosiCount = 0
    for idx in xrange(source.LineCount):
        record = source.readline()
        self.data[idx] = record
        record[0][0] = 1
        if record[3] == 1:
            self.PosiCount += 1
    source.close()
def __init__(self, filename):
    """Store *filename* and wrap it in a ``BinReader`` kept as ``self.reader``."""
    self.filename = filename
    self.reader = BinReader(self.filename)
from sklearn.ensemble import GradientBoostingClassifier from BinReader import BinReader import numpy as np from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor (data,label,items) = BinReader.readData(ur'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin') X_train = np.array(data) label = [item[0] for item in label] y_train = np.array(label) est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1,max_depth=3, random_state=0, loss='ls',verbose=1).fit(X_train, y_train) print 'testing...' reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin') reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x,userid,itemid,label) = reader.readline() x[0] = 1 y = est.predict([x])[0] result[i] = (userid,itemid,y) if i % 10000 == 0: print '%d/%d' % (i,reader.LineCount) result.sort(key=lambda x:x[2],reverse=True) result = result[:7000] print ur'正在输出...' with open('result.csv','w') as f: for item in result:
action='store_true', help="Don't print file info") args = argsp.parse_args() if args.construct and args.destruct: print("--construct and --destruct are mutually exclusive!") argsp.print_help() sys.exit(1) # When destructing, we need to make the directory if it doesn't exist if args.destruct: if not os.path.exists(args.destruct): os.makedirs(args.destruct) fh = FCH_Root() with BinReader(args.path) as br: fh.fromBinary(br) if not args.quiet: fh.printInfo() fh.destruct(args.destruct, overwrite=args.overwrite) elif args.construct: fh = FCH_Root() fh.construct(args.construct) with BinWriter(args.path, overwrite=args.overwrite) as wr: fh.toBinary(wr) # Sanity read it again! with BinReader(args.path) as br: fh.fromBinary(br) if not args.quiet: fh.printInfo() else:
def __init__(self, filename):
    """Store *filename* and wrap it in a ``BinReader`` kept as ``self.reader``."""
    self.filename = filename
    self.reader = BinReader(self.filename)
# encoding = utf-8 from sklearn.ensemble import GradientBoostingClassifier from BinReader import BinReader import numpy as np from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor path_train = ur'data\temp_train.bin' path_test = ur'data\temp_test.bin' (data, label, items) = BinReader.readData(path_train) X_train = np.array(data) label = [item[0] for item in label] y_train = np.array(label) est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0, loss='ls', verbose=1).fit(X_train, y_train) print 'testing...' reader = BinReader(path_test) reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x, userid, itemid, label) = reader.readline() x[0] = 1 y = est.predict([x])[0] result[i] = (userid, itemid, y) if i % 10000 == 0: print '%d/%d' % (i, reader.LineCount) result.sort(key=lambda x: x[2], reverse=True)
from BinReader import BinReader from BinWriter import BinWriter TOPN = 198000 reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin') reader.open() writer = BinWriter(reader._filename.rstrip('.bin') + '.top.bin') writer.open(TOPN,reader.Dim) with open('an.csv') as f: items = set(f.readlines()) posi = 0 for i in range(reader.LineCount): (x,userid,itemid,label) = reader.readline() if i < 800000: continue if '%d,%d\n' % (userid,itemid) in items: label = 1 posi+=1 else: label = 0 writer.writeline(x,userid,itemid,label) print ur'正例个数:',posi writer.close() reader.close()
from sklearn.ensemble import GradientBoostingClassifier from BinReader import BinReader import numpy as np from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor (data,label,items) = BinReader.readData(ur'C:\data\medium\norm\train1217.bin') X_train = np.array(data) label = [item[0] for item in label] y_train = np.array(label) est = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1,max_depth=5, random_state=0, loss='ls',verbose=1).fit(X_train, y_train) print 'testing...' pass reader = BinReader(ur'C:\data\test1218.bin') reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x,userid,itemid,label) = reader.readline() x[0] = 1 y = est.predict([x])[0] result[i] = (userid,itemid,y) if i % 10000 == 0: print '%d/%d' % (i,reader.LineCount) result.sort(key=lambda x:x[2],reverse=True) result = result[:400]