Пример #1
0
def test():
    """Score every row of FILENAME with the stored theta and export the best pairs.

    Each row read from the BinReader is scored as the dot product of its first
    37 components with theta.  Rows are ranked by that score (highest first)
    and the user id / item id of the top 6500 rows are written to
    ``result.csv`` as ``userid,itemid`` lines.

    Progress and summary messages are printed to stdout.  The summary now
    reports the real number of rows read and the real number of rows written
    (previously an always-zero counter and a hard-coded 6500 were printed).
    """
    top_n = 6500      # how many of the highest-scoring rows are exported
    feature_dim = 37  # only the first 37 components of x take part in the score

    theta = readTheta()

    reader = BinReader(FILENAME)
    reader.open()
    try:
        total = reader.LineCount
        result = [0] * total
        for i in xrange(total):
            (x, userid, itemid, label) = reader.readline()
            x[0] = 1  # first component is forced to 1 (presumably the constant term)
            y = np.dot(x[:feature_dim], theta)
            result[i] = (userid, itemid, y)
            if i % 10000 == 0:
                print('%d/%d' % (i, total))
    finally:
        # Release the reader even if a row fails to parse.
        reader.close()

    result.sort(key=lambda row: row[2], reverse=True)
    result = result[:top_n]

    print(u'样本总数: %d' % total)
    print(u'正在输出...')
    with open('result.csv', 'w') as f:
        for item in result:
            f.write('%d,%d\n' % (item[0], item[1]))

    # Fewer than top_n rows may exist, so report what was actually written.
    print(u'测试结束,输出个数: %d' % len(result))
Пример #2
0
def test():
    """Rank all rows of FILENAME by their linear score and write the top pairs.

    For every row returned by the BinReader the score is
    ``np.dot(x[:37], theta)`` with theta loaded through ``readTheta()``.
    The 6500 highest-scoring rows are written to ``result.csv``, one
    ``userid,itemid`` pair per line.

    The printed summary reports the number of rows actually read and the
    number of rows actually written; the original printed a counter that was
    never updated (always 0) and a hard-coded 6500.
    """
    top_n = 6500      # size of the exported ranking
    feature_dim = 37  # number of leading components of x used for scoring

    theta = readTheta()

    reader = BinReader(FILENAME)
    reader.open()
    try:
        total = reader.LineCount
        result = [0] * total
        for i in xrange(total):
            (x, userid, itemid, label) = reader.readline()
            x[0] = 1  # overwrite the first component with 1 (presumably the bias term)
            y = np.dot(x[:feature_dim], theta)
            result[i] = (userid, itemid, y)
            if i % 10000 == 0:
                print('%d/%d' % (i, total))
    finally:
        # The reader was previously left open; close it on success and on error.
        reader.close()

    result.sort(key=lambda row: row[2], reverse=True)
    result = result[:top_n]

    print(u'样本总数: %d' % total)
    print(u'正在输出...')
    with open('result.csv', 'w') as f:
        for item in result:
            f.write('%d,%d\n' % (item[0], item[1]))

    # len(result) can be smaller than top_n when the input is short.
    print(u'测试结束,输出个数: %d' % len(result))
Пример #3
0
def test(theta):
    """Evaluate theta on the fixed test file and print metrics for the top 1300 rows.

    The test data is read with ``BinReader.readData``, split by ``getXY``,
    expanded by ``mapFeature`` and scored with ``sigmoid(x . theta)``.  Rows
    are ranked by predicted probability; the values in column 2 of the 1300
    best rows are summed and handed to ``print_analyse`` together with the
    positive count returned by the reader.
    """
    print('----------------------')
    print(u'正在测试')

    (raw_data, posiCount) = BinReader.readData(r'c:\data\homework\1218t20_3.bin')

    x, real_y, online, result = getXY(np.array(raw_data))

    values = sigmoid(np.dot(mapFeature(x), theta))

    # Column index 2 holds the true outcome; column index 3 receives the
    # predicted probability.
    result[:, 3] = values

    ranked = sorted(result, key=lambda row: row[3], reverse=True)
    print(u'共获得结果%d' % sum(values > 0.5))

    best = ranked[:1300]

    right = sum(row[2] for row in best)

    precision, recall, f1 = print_analyse(right, 1300, posiCount)

    print(u'测试完毕')
Пример #4
0
def test(theta):
    """Score the fixed test file with theta and report how the top 1300 rows perform.

    Steps: load the file through ``BinReader.readData``, split it with
    ``getXY``, expand the features with ``mapFeature``, compute
    ``sigmoid(x . theta)``, rank rows by that probability, then pass the sum
    of column 2 over the 1300 best rows to ``print_analyse`` along with the
    positive count reported by the reader.
    """
    print('----------------------')
    print(u'正在测试')

    loaded = BinReader.readData(r'c:\data\homework\1218t20_3.bin')
    (raw_data, posiCount) = loaded

    x, real_y, online, result = getXY(np.array(raw_data))

    mapped = mapFeature(x)
    values = sigmoid(np.dot(mapped, theta))

    # Column index 2 is the true outcome, column index 3 the predicted probability.
    result[:, 3] = values

    by_probability = sorted(result, key=lambda row: row[3], reverse=True)
    print(u'共获得结果%d' % sum(values > 0.5))

    head = by_probability[:1300]

    right = sum(entry[2] for entry in head)

    precision, recall, f1 = print_analyse(right, 1300, posiCount)

    print(u'测试完毕')
Пример #5
0
class Normailzer(object):
    """Min-max normalise the feature columns of a BinReader file.

    Component 0 of every row is the constant term and is left untouched;
    components 1..XDim-1 are rescaled to ``(value - min) / (max - min)`` and
    the rows are written to a sibling file whose name ends in ``.norm.bin``.
    """

    def __init__(self, filename):
        """Store the source path and create (but do not open) its reader."""
        self.filename = filename
        self.reader = BinReader(filename)

    @staticmethod
    def _normalized_path(filename):
        """Return *filename* with a trailing '.bin' replaced by '.norm.bin'.

        ``str.rstrip('.bin')`` must not be used for this: it strips any run of
        the characters '.', 'b', 'i', 'n' (turning 'train.bin' into 'tra').
        """
        suffix = '.bin'
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
        return filename + '.norm.bin'

    def getMaxMin(self):
        """Scan the file once and return ``(max_list, min_list)`` per feature.

        Both lists have ``XDim - 1`` entries (the constant term is skipped).
        The bounds are seeded from the first row, so columns that are entirely
        positive or entirely negative get their true extremes instead of 0.
        For an empty file both lists contain zeros.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term at index 0
            total = self.reader.LineCount
            highs = [0] * dim
            lows = [0] * dim
            for k in xrange(total):
                (x, userid, itemid, label) = self.reader.readline()
                if k == 0:
                    # Seed both bounds from real data rather than from 0.
                    highs = [x[i + 1] for i in xrange(dim)]
                    lows = list(highs)
                else:
                    for i in xrange(dim):
                        value = x[i + 1]
                        if value > highs[i]:
                            highs[i] = value
                        if value < lows[i]:
                            lows[i] = value
                if k % 10000 == 0:
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (highs, lows)

    # Call this method to perform the normalisation.
    def transform(self):
        """Write a min-max normalised copy of the file next to the original.

        A column whose max equals its min carries no information and would
        divide by zero; such a column is written as 0.0.  The reader and the
        writer are both closed when the method returns or raises.
        """
        (highs, lows) = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # skip the constant term at index 0
            total = self.reader.LineCount
            # float() keeps the division below from flooring on integer data.
            spans = [float(highs[i] - lows[i]) for i in xrange(dim)]

            self.writer = BinWriter(self._normalized_path(self.filename))
            self.writer.open(total, self.reader.Dim)
            try:
                for k in xrange(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in xrange(dim):
                        if spans[i]:
                            x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                        else:
                            x[i + 1] = 0.0
                    self.writer.writeline(x, userid, itemid, label)

                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            self.reader.close()
Пример #6
0
class Normailzer(object):
    """Rescale the feature columns of a BinReader file to the range [0, 1].

    Index 0 of each row is the constant term and is copied unchanged.  Every
    other component is mapped to ``(value - min) / (max - min)`` using the
    per-column extremes found in a first pass, and the result is written to
    ``<name>.norm.bin``.
    """

    def __init__(self, filename):
        """Remember the source path and build its reader without opening it."""
        self.filename = filename
        self.reader = BinReader(filename)

    @staticmethod
    def _normalized_path(filename):
        """Derive the output path: drop one trailing '.bin', append '.norm.bin'.

        The original used ``rstrip('.bin')``, which removes trailing
        *characters* from the set {'.', 'b', 'i', 'n'} and therefore mangles
        names such as 'train.bin' (-> 'tra').
        """
        suffix = '.bin'
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
        return filename + '.norm.bin'

    def getMaxMin(self):
        """Return ``(max_list, min_list)`` over all rows, one entry per feature.

        The constant term is excluded, so each list has ``XDim - 1`` entries.
        Bounds start from the first row's values; starting from 0 (as before)
        reported a wrong minimum for all-positive columns and a wrong maximum
        for all-negative ones.  An empty file yields two lists of zeros.
        """
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # constant term at index 0 is excluded
            total = self.reader.LineCount
            highs = [0] * dim
            lows = [0] * dim
            for k in xrange(total):
                (x, userid, itemid, label) = self.reader.readline()
                if k == 0:
                    # First row defines the initial bounds.
                    highs = [x[i + 1] for i in xrange(dim)]
                    lows = list(highs)
                else:
                    for i in xrange(dim):
                        value = x[i + 1]
                        if value > highs[i]:
                            highs[i] = value
                        if value < lows[i]:
                            lows[i] = value
                if k % 10000 == 0:
                    print('%d/%d' % (k, total))
        finally:
            self.reader.close()
        return (highs, lows)

    # Call this method to perform the normalisation.
    def transform(self):
        """Produce the normalised copy of the source file.

        Columns with zero range (max == min) are written as 0.0 instead of
        raising a division-by-zero error.  Reader and writer are closed on
        every exit path.
        """
        (highs, lows) = self.getMaxMin()
        self.reader.open()
        try:
            dim = self.reader.XDim - 1  # constant term at index 0 is excluded
            total = self.reader.LineCount
            # Convert to float so integer-valued features are not floor-divided.
            spans = [float(highs[i] - lows[i]) for i in xrange(dim)]

            self.writer = BinWriter(self._normalized_path(self.filename))
            self.writer.open(total, self.reader.Dim)
            try:
                for k in xrange(total):
                    (x, userid, itemid, label) = self.reader.readline()
                    for i in xrange(dim):
                        if spans[i]:
                            x[i + 1] = (x[i + 1] - lows[i]) / spans[i]
                        else:
                            x[i + 1] = 0.0
                    self.writer.writeline(x, userid, itemid, label)

                    if k % 10000 == 0:
                        print('%d/%d' % (k, total))
            finally:
                self.writer.close()
        finally:
            self.reader.close()
Пример #7
0
 def load(self):
     """Read every row of ``self._filename`` into ``self.data``.

     Sets ``self.LineCount`` to the reader's row count, stores each row
     returned by ``readline()`` in ``self.data``, overwrites the first
     component of each row's feature vector with 1, and counts rows whose
     fourth field equals 1 in ``self.PosiCount``.

     The reader is closed even when reading a row raises.
     """
     reader = BinReader(self._filename)
     reader.open()
     try:
         total = reader.LineCount
         self.LineCount = total
         self.data = [0] * total
         self.PosiCount = 0
         for i in xrange(total):
             row = reader.readline()
             self.data[i] = row
             row[0][0] = 1  # row[0] is the feature vector; force its first entry to 1
             if row[3] == 1:  # row[3] is the label field
                 self.PosiCount += 1
     finally:
         reader.close()
Пример #8
0
 def __init__(self,filename):
     """Store the data file path and create a BinReader for it.

     ``open()`` is not called here; the reader is only constructed.
     """
     self.filename = filename
     self.reader = BinReader(filename)
Пример #9
0
from sklearn.ensemble import GradientBoostingClassifier
from BinReader import BinReader
import numpy as np
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

# Load the training file; readData is unpacked into three values here.
(data,label,items) = BinReader.readData(ur'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin') 

X_train = np.array(data)
# Only the first element of each label entry is used as the regression target.
label = [item[0] for item in label]
y_train = np.array(label)
# Fit the gradient-boosted regressor on the whole training set.
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1,max_depth=3, random_state=0, loss='ls',verbose=1).fit(X_train, y_train)
print 'testing...'

# Score the test file one row at a time and collect (userid, itemid, score).
reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin')
reader.open()
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds the module-level name `label`; y_train was already built from it.
    (x,userid,itemid,label) = reader.readline()
    x[0] = 1  # overwrite the first component with 1 before predicting
    # The single row is wrapped in a list and element 0 of the prediction is taken.
    y = est.predict([x])[0]
    result[i] = (userid,itemid,y)
    # Progress output every 10000 rows.
    if i % 10000 == 0:
        print '%d/%d' % (i,reader.LineCount)
    
# Rank by score, highest first, and keep the best 7000 entries.
result.sort(key=lambda x:x[2],reverse=True)
result = result[:7000]


print ur'正在输出...'
with open('result.csv','w') as f:
    for item in result:
Пример #10
0
                   action='store_true',
                   help="Don't print file info")

# Parse the command line using the parser built above.
args = argsp.parse_args()

# --construct and --destruct cannot be combined: print the message and the
# usage text, then exit with status 1.
if args.construct and args.destruct:
    print("--construct and --destruct are mutually exclusive!")
    argsp.print_help()
    sys.exit(1)

# When destructing, we need to make the directory if it doesn't exist
if args.destruct:
    if not os.path.exists(args.destruct):
        os.makedirs(args.destruct)
    fh = FCH_Root()
    with BinReader(args.path) as br:
        fh.fromBinary(br)
    if not args.quiet:
        fh.printInfo()
    fh.destruct(args.destruct, overwrite=args.overwrite)
elif args.construct:
    fh = FCH_Root()
    fh.construct(args.construct)
    with BinWriter(args.path, overwrite=args.overwrite) as wr:
        fh.toBinary(wr)
    # Sanity read it again!
    with BinReader(args.path) as br:
        fh.fromBinary(br)
    if not args.quiet:
        fh.printInfo()
else:
Пример #11
0
 def __init__(self, filename):
     """Store the data file path and create a BinReader for it.

     ``open()`` is not called here; the reader is only constructed.
     """
     self.filename = filename
     self.reader = BinReader(filename)
# encoding = utf-8

from sklearn.ensemble import GradientBoostingClassifier
from BinReader import BinReader
import numpy as np
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

# Input files: training set and the set to be scored.
path_train = ur'data\temp_train.bin'
path_test = ur'data\temp_test.bin'

# Load the training file; readData is unpacked into three values here.
(data, label, items) = BinReader.readData(path_train)

X_train = np.array(data)
# Only the first element of each label entry is used as the regression target.
label = [item[0] for item in label]
y_train = np.array(label)
# Fit the gradient-boosted regressor on the whole training set.
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0, loss='ls',
                                verbose=1).fit(X_train, y_train)
print 'testing...'

# Score the test file one row at a time and collect (userid, itemid, score).
reader = BinReader(path_test)
reader.open()
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds the module-level name `label`; y_train was already built from it.
    (x, userid, itemid, label) = reader.readline()
    x[0] = 1  # overwrite the first component with 1 before predicting
    # The single row is wrapped in a list and element 0 of the prediction is taken.
    y = est.predict([x])[0]
    result[i] = (userid, itemid, y)
    # Progress output every 10000 rows.
    if i % 10000 == 0:
        print '%d/%d' % (i, reader.LineCount)

# Rank by score, highest first.
result.sort(key=lambda x: x[2], reverse=True)
Пример #13
0
from BinReader import BinReader
from BinWriter import BinWriter

# Row count passed to writer.open() for the output file.
# NOTE(review): the loop below writes max(LineCount - SKIP_ROWS, 0) rows;
# confirm that this equals TOPN for the input in use.
TOPN = 198000
# Number of leading rows of the source file that are read but not copied.
SKIP_ROWS = 800000
BIN_SUFFIX = '.bin'

# Backslashes are escaped so the value equals the former ur'...' literal.
reader = BinReader(u'F:\\AliRecommendHomeworkData\\1212新版\\test18.expand.norm.bin')
reader.open()

# Remove exactly one trailing '.bin'.  str.rstrip('.bin') strips a character
# set, not a suffix, and only produced the right name here by coincidence.
source_name = reader._filename
if source_name.endswith(BIN_SUFFIX):
    source_name = source_name[:-len(BIN_SUFFIX)]
writer = BinWriter(source_name + '.top.bin')
writer.open(TOPN, reader.Dim)

# Known positive "userid,itemid" pairs.  Lines are stripped so that a final
# line without a newline (or stray whitespace) still matches.
with open('an.csv') as f:
    items = set(line.strip() for line in f)

posi = 0
try:
    for i in range(reader.LineCount):
        (x, userid, itemid, label) = reader.readline()

        if i < SKIP_ROWS:
            continue
        # Relabel the row: 1 when the pair is listed in an.csv, otherwise 0.
        if '%d,%d' % (userid, itemid) in items:
            label = 1
            posi += 1
        else:
            label = 0
        writer.writeline(x, userid, itemid, label)
finally:
    # Close both files even if a row fails to read or write.
    writer.close()
    reader.close()

print(u'正例个数: %d' % posi)
Пример #14
0
from sklearn.ensemble import GradientBoostingClassifier
from BinReader import BinReader
import numpy as np
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor


# Load the training file; readData is unpacked into three values here.
(data,label,items) = BinReader.readData(ur'C:\data\medium\norm\train1217.bin') 

X_train = np.array(data)
# Only the first element of each label entry is used as the regression target.
label = [item[0] for item in label]
y_train = np.array(label)
# Fit the gradient-boosted regressor on the whole training set.
est = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1,max_depth=5, random_state=0, loss='ls',verbose=1).fit(X_train, y_train)
print 'testing...'


# No-op statement; it has no effect on the script.
pass

# Score the test file one row at a time and collect (userid, itemid, score).
reader = BinReader(ur'C:\data\test1218.bin')
reader.open()
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds the module-level name `label`; y_train was already built from it.
    (x,userid,itemid,label) = reader.readline()
    x[0] = 1  # overwrite the first component with 1 before predicting
    # The single row is wrapped in a list and element 0 of the prediction is taken.
    y = est.predict([x])[0]
    result[i] = (userid,itemid,y)
    # Progress output every 10000 rows.
    if i % 10000 == 0:
        print '%d/%d' % (i,reader.LineCount)
    
# Rank by score, highest first, and keep the best 400 entries.
result.sort(key=lambda x:x[2],reverse=True)
result = result[:400]