Exemplo n.º 1
0
def test():
    theta = readTheta()

    count = 0
    reader = BinReader(FILENAME)
    reader.open()
    result = [0] * reader.LineCount
    for i in xrange(reader.LineCount):
        (x,userid,itemid,label) = reader.readline()
        x[0] = 1
        y = np.dot(x[:37],theta)
        result[i] = (userid,itemid,y)
        if i % 10000 == 0:
            print '%d/%d' % (i,reader.LineCount)
    
    result.sort(key=lambda x:x[2],reverse=True)
    result = result[:6500]

    print ur'样本总数:',count
    print ur'正在输出...'
    with open('result.csv','w') as f:
        for item in result:
            f.write('%d,%d\n' % (item[0],item[1]))

    print ur'测试结束,输出个数:',6500
Exemplo n.º 2
0
def test():
    theta = readTheta()

    count = 0
    reader = BinReader(FILENAME)
    reader.open()
    result = [0] * reader.LineCount
    for i in xrange(reader.LineCount):
        (x, userid, itemid, label) = reader.readline()
        x[0] = 1
        y = np.dot(x[:37], theta)
        result[i] = (userid, itemid, y)
        if i % 10000 == 0:
            print '%d/%d' % (i, reader.LineCount)

    result.sort(key=lambda x: x[2], reverse=True)
    result = result[:6500]

    print ur'样本总数:', count
    print ur'正在输出...'
    with open('result.csv', 'w') as f:
        for item in result:
            f.write('%d,%d\n' % (item[0], item[1]))

    print ur'测试结束,输出个数:', 6500
Exemplo n.º 3
0
 def load(self):
     """Read every record of self._filename into memory.

     Populates self.LineCount (number of records), self.data (one entry per
     record, exactly as returned by BinReader.readline) and self.PosiCount
     (number of records whose element at index 3 equals 1).
     """
     source = BinReader(self._filename)
     source.open()
     total = source.LineCount
     self.LineCount = total
     self.data = [0] * total
     self.PosiCount = 0
     for idx in xrange(total):
         record = source.readline()
         self.data[idx] = record
         # Overwrite the first feature of the record's feature vector with 1.
         record[0][0] = 1
         if record[3] == 1:
             self.PosiCount += 1
     source.close()
Exemplo n.º 4
0
class Normailzer(object):
    """description of class"""
    def __init__(self,filename):
        self.filename = filename
        self.reader = BinReader(filename)

    def getMaxMin(self):
        self.reader.open()
        dim = self.reader.XDim - 1  #去除常数项
        max = [0] * dim
        min = [0] * dim
        for k in xrange(self.reader.LineCount):
            (x,userid,itemid,label) = self.reader.readline()
            for i in xrange(dim):
                if x[i + 1] > max[i]:
                    max[i] = x[i + 1]
                if x[i + 1] < min[i]:
                    min[i] = x[i + 1]
            if k % 10000 == 0:
                print '%d/%d' % (k,self.reader.LineCount)
        self.reader.close()
        return (max,min)
    
    ##调用该函数,实现归一化
    def transform(self):
        (max,min) = self.getMaxMin()
        self.reader.open()
        dim = self.reader.XDim - 1  #去除常数项
        dis = [0] * dim
        for i in xrange(dim):
            dis[i] = max[i] - min[i]

        self.writer = BinWriter(self.filename.rstrip('.bin') + '.norm.bin')
        self.writer.open(self.reader.LineCount,self.reader.Dim)

        for k in xrange(self.reader.LineCount):
            (x,userid,itemid,label) = self.reader.readline()
            for i in xrange(dim):
                x[i + 1] = (x[i + 1] - min[i]) / dis[i]
            self.writer.writeline(x,userid,itemid,label)

            if k % 10000 == 0:
                print '%d/%d' % (k,self.reader.LineCount)

        self.writer.close()
Exemplo n.º 5
0
class Normailzer(object):
    """description of class"""
    def __init__(self, filename):
        self.filename = filename
        self.reader = BinReader(filename)

    def getMaxMin(self):
        self.reader.open()
        dim = self.reader.XDim - 1  #去除常数项
        max = [0] * dim
        min = [0] * dim
        for k in xrange(self.reader.LineCount):
            (x, userid, itemid, label) = self.reader.readline()
            for i in xrange(dim):
                if x[i + 1] > max[i]:
                    max[i] = x[i + 1]
                if x[i + 1] < min[i]:
                    min[i] = x[i + 1]
            if k % 10000 == 0:
                print '%d/%d' % (k, self.reader.LineCount)
        self.reader.close()
        return (max, min)

    ##调用该函数,实现归一化
    def transform(self):
        (max, min) = self.getMaxMin()
        self.reader.open()
        dim = self.reader.XDim - 1  #去除常数项
        dis = [0] * dim
        for i in xrange(dim):
            dis[i] = max[i] - min[i]

        self.writer = BinWriter(self.filename.rstrip('.bin') + '.norm.bin')
        self.writer.open(self.reader.LineCount, self.reader.Dim)

        for k in xrange(self.reader.LineCount):
            (x, userid, itemid, label) = self.reader.readline()
            for i in xrange(dim):
                x[i + 1] = (x[i + 1] - min[i]) / dis[i]
            self.writer.writeline(x, userid, itemid, label)

            if k % 10000 == 0:
                print '%d/%d' % (k, self.reader.LineCount)

        self.writer.close()
Exemplo n.º 6
0
from sklearn.ensemble import GradientBoostingClassifier
from BinReader import BinReader
import numpy as np
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

# Load the whole training set; readData returns a (data, label, items) triple.
(data,label,items) = BinReader.readData(ur'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin') 

X_train = np.array(data)
# Each label entry is a sequence; only its first element is used as the target.
label = [item[0] for item in label]
y_train = np.array(label)
# Fit a gradient-boosted regressor (150 trees, depth 3, least-squares loss).
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1,max_depth=3, random_state=0, loss='ls',verbose=1).fit(X_train, y_train)
print 'testing...'

# Score the test file record by record.
reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin')
reader.open()
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds the module-level name `label` to the record's label.
    (x,userid,itemid,label) = reader.readline()
    # Presumably index 0 is the constant (bias) term -- TODO confirm.
    x[0] = 1
    # predict() takes a batch; wrap the single row and unwrap the single score.
    y = est.predict([x])[0]
    result[i] = (userid,itemid,y)
    # Progress report every 10000 records.
    if i % 10000 == 0:
        print '%d/%d' % (i,reader.LineCount)
    
# Rank by predicted score, highest first, and keep the top 7000 rows.
result.sort(key=lambda x:x[2],reverse=True)
result = result[:7000]


print ur'正在输出...'
with open('result.csv','w') as f:
    for item in result:
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

# Input files: training set and the test set to be scored.
path_train = ur'data\temp_train.bin'
path_test = ur'data\temp_test.bin'

# Load the whole training set; readData returns a (data, label, items) triple.
(data, label, items) = BinReader.readData(path_train)

X_train = np.array(data)
# Each label entry is a sequence; only its first element is used as the target.
label = [item[0] for item in label]
y_train = np.array(label)
# Fit a gradient-boosted regressor (150 trees, depth 3, least-squares loss).
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0, loss='ls',
                                verbose=1).fit(X_train, y_train)
print 'testing...'

# Score the test file record by record.
reader = BinReader(path_test)
reader.open()
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds the module-level name `label` to the record's label.
    (x, userid, itemid, label) = reader.readline()
    # Presumably index 0 is the constant (bias) term -- TODO confirm.
    x[0] = 1
    # predict() takes a batch; wrap the single row and unwrap the single score.
    y = est.predict([x])[0]
    result[i] = (userid, itemid, y)
    # Progress report every 10000 records.
    if i % 10000 == 0:
        print '%d/%d' % (i, reader.LineCount)

# Rank by predicted score, highest first, and keep the top 7000 rows.
result.sort(key=lambda x: x[2], reverse=True)
result = result[:7000]

print "input"
with open('result.csv', 'w') as f:
    for item in result: