Example #1
def train(sc):

    feat_weight = {}

    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    cur_iter = 0
    while cur_iter < ITER_MAX:

        print("iteration %d" % cur_iter)

        # Ship the current weights to the executors once per iteration.
        broadcast_feat = sc.broadcast(feat_weight)

        # Counts how many instances survive the sampling step.
        accum = sc.accumulator(0)

        # Per-feature partial gradients, summed across the cluster.
        grad = train_ins.flatMap(
            lambda ins: calc_gradient(ins, broadcast_feat, accum, SAMPLING_RATE)
        ).reduceByKey(lambda a, b: a + b).collect()

        # In-place gradient step with L2 regularization (THETA).
        update_weight(grad, feat_weight, accum.value, learning_rate, THETA)

        eval_res = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label)
        ).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, feat_weight,eval_res)

        print("selected %d samples: auc: %f, mae: %f" % (accum.value, auc, mae))

        cur_iter += 1
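The examples on this page call a calc_gradient helper that is never shown. A minimal sketch of what a compatible implementation for Example #1 could look like, assuming logistic loss and binary features exposed via an ins.features attribute (that attribute, and the whole body, are assumptions, not the original code):

import random

def calc_gradient(ins, broadcast_feat, accum, sampling_rate):
    # Hypothetical reconstruction: keep the instance with probability
    # sampling_rate, count it in the accumulator, and emit one
    # (feature, partial gradient) pair per active feature.
    if random.random() > sampling_rate:
        return []
    accum.add(1)
    err = ins.label - ins.predict(broadcast_feat.value)
    return [(feat, err) for feat in ins.features]

The return value is a list of key-value pairs, which matches the flatMap(...).reduceByKey(lambda a, b: a + b) pipeline above.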
Example #2
def train(sc):

    feat_weight = {}

    learning_rate = 1
    ITER_MAX = 1000
    THETA = 4
    K = 8  # number of latent factors per feature

    SAMPLING_RATE = 0.1

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")
    #[train_ins,train_ins_count] = utils.load_ins(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train.test/train.test")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    #[eval_ins,eval_ins_count] = utils.load_ins(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval.test/eval.test")

    #feat_dict = utils.load_feat(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_feat/*")
    [feat_dict, feat_freq] = utils.load_feat_2(
        sc, "hdfs://hqz-ubuntu-master:9000/data/feat_count/*", 10000)

    # Each weight is [linear term, list of K latent factors].
    for f in feat_dict:
        feat_weight[f] = [0.0, []]

        if False:  # disabled: feat_freq[f] >= 0
            for i in range(0, K):
                feat_weight[f][1].append(random.uniform(0, 0.001))
                #feat_weight[f][1].append(0)

    cur_iter = 0
    while cur_iter < ITER_MAX:
        print("=============================================================================")
        print("iteration %d" % cur_iter)

        print("broadcasting feat_weight")
        broadcast_feat = sc.broadcast(feat_weight)

        accum = sc.accumulator(0)
        print("calculating gradient")
        grad = train_ins.flatMap(
            lambda ins: calc_gradient(ins, broadcast_feat, accum, SAMPLING_RATE)
        ).reduceByKey(add_gradient).collect()

        print("updating feat_weight")
        feat_weight = update_weight(grad, feat_weight, train_ins,
                                    accum.value, learning_rate, THETA)

        # Dump the weights after every iteration for inspection.
        fp = open("weights_%d" % cur_iter, "w")
        for f in feat_weight:
            fp.write("%d\t%f\t%s\n" % (f, feat_weight[f][0],
                     "\t".join([str(i) for i in feat_weight[f][1]])))
        fp.close()

        print("evaluating...")
        eval_res = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label)
        ).sortByKey().collect()

        print("getting eval res")
        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, feat_weight,eval_res)

        print("selected %d samples: auc: %f, mae: %f" % (accum.value, auc, mae))

        cur_iter += 1
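Example #2 reduces gradients with an add_gradient combiner that is also not shown here. A plausible sketch, assuming each value mirrors the [linear, [latent_0, ..., latent_K-1]] layout used for feat_weight above (the function body is an assumption):

def add_gradient(a, b):
    # Hypothetical combiner: element-wise sum of two partial gradients,
    # each shaped [linear_grad, [latent_grad_0, ..., latent_grad_K-1]].
    return [a[0] + b[0], [x + y for x, y in zip(a[1], b[1])]]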
Example #3
def update_weight(grad, feat_weight, train_ins, ins_num, learning_rate, theta):

    auc_array = []

    multiply = 1.0

    # Per-feature update direction: gradient step with L2 shrinkage (theta).
    g = {}

    for p in grad:

        feat_sign = p[0]
        feat_grad = p[1]

        k = len(feat_grad[1])

        g[feat_sign] = [0, []]

        g[feat_sign][0] = (learning_rate * 1.0 / ins_num *
                           (feat_grad[0] - theta * feat_weight[feat_sign][0]))

        for i in range(0, k):
            g[feat_sign][1].append(learning_rate * 1.0 / ins_num *
                                   (feat_grad[1][i] - theta * feat_weight[feat_sign][1][i]))

    feat_weight_last = {}

    # Widening line search: try step multipliers 1, 2, 3, ... and keep the
    # last candidate that still improves the training loss.
    for i in range(1, 100):

        feat_weight_tmp = {}

        for feat_sign in g:

            k = len(g[feat_sign][1])

            feat_weight_tmp[feat_sign] = [0, []]

            if feat_sign not in feat_weight:
                feat_weight_tmp[feat_sign][0] = -g[feat_sign][0] * multiply
                for ii in range(0, k):
                    feat_weight_tmp[feat_sign][1].append(-g[feat_sign][1][ii] * multiply)
            else:
                feat_weight_tmp[feat_sign][0] = feat_weight[feat_sign][0] - g[feat_sign][0] * multiply
                for ii in range(0, k):
                    feat_weight_tmp[feat_sign][1].append(
                        feat_weight[feat_sign][1][ii] - g[feat_sign][1][ii] * multiply)

        eval_res = train_ins.map(
            lambda ins: (ins.predict(feat_weight_tmp), ins.label)
        ).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        print("searching step: multiply %.0f: train auc: %f, train_loss: %f, train_mae: %f"
              % (multiply, auc, loss, mae))

        # Despite the name, the search criterion is the negated loss, not AUC.
        auc = -loss
        if len(auc_array) > 0 and auc <= auc_array[-1]:
            break

        auc_array.append(auc)
        feat_weight_last = feat_weight_tmp.copy()

        multiply = (i + 1) * 1.0

    return feat_weight_last
Example #4
    def lossFunc_loss(self, x, broadcast_feat):

        # Distributed evaluation: (prediction, label) pairs sorted by score.
        eval_res = self.train_ins.map(lambda ins: eval_ins_map(
            ins, broadcast_feat)).sortByKey().collect()

        [auc, mae, ins_loss] = utils.get_eval_stat(eval_res)

        # Total loss = data loss plus L2 penalty on the weight vector x.
        loss = ins_loss + self.l2_weight / 2 * ((x.T * x)[0, 0])

        return loss
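eval_ins_map is another helper these snippets rely on but do not define. Judging from Example #1, which builds the same eval_res pairs inline, it likely maps an instance to a (prediction, label) tuple; a sketch under that assumption:

def eval_ins_map(ins, broadcast_feat):
    # Hypothetical reconstruction: score one instance against the
    # broadcast weights and pair the prediction with the true label.
    return (ins.predict(broadcast_feat.value), ins.label)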
Example #5
def train(sc):

    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    cur_iter = 0

    # Scale the step size and the L2 penalty to the expected number of
    # sampled instances, since updates here are applied per instance.
    single_sample_learning_rate = learning_rate / (train_ins_count *
                                                   SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)

    print("single sample learning rate: %f" % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)

    # The weights live in a custom accumulator so that executors can push
    # their updates back to the driver.
    feat_weight = sc.accumulator({}, WeightAccumulatorParam())

    broadcast_feat = sc.broadcast(feat_weight.value)

    while cur_iter < ITER_MAX:

        print("iteration %d" % cur_iter)

        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)

        # Re-broadcast the freshly accumulated weights for evaluation.
        broadcast_feat = sc.broadcast(feat_weight.value)

        eval_res = eval_ins.map(lambda ins: utils.evalulate_map(
            ins, broadcast_feat)).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, broadcast_feat.value,eval_res)

        print("selected %d samples: auc: %f, mae: %f" %
              (selected_sample, auc, mae))

        cur_iter += 1
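Example #5 stores the weights in a custom accumulator, WeightAccumulatorParam, which is not defined on this page. PySpark supports non-numeric accumulators via subclasses of pyspark.AccumulatorParam; a minimal sketch that merges {feature: weight} dicts (the merge rule is an assumption):

from pyspark import AccumulatorParam

class WeightAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        # Start each partial accumulator from an empty weight dict.
        return {}

    def addInPlace(self, v1, v2):
        # Hypothetical merge: sum weight deltas per feature sign.
        for feat, w in v2.items():
            v1[feat] = v1.get(feat, 0.0) + w
        return v1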
Example #6
    def eval_func_for_test_set(self, x):
        weight_dict = {}

        # Translate the weight matrix into a sparse Python dict,
        # keeping only the non-zero entries.
        for feat in self.feat_dict:
            idx = self.feat_dict[feat]
            if x[idx, 0] != 0:
                weight_dict[feat] = x[idx, 0]

        # Broadcast the feature weights and evaluate in a distributed fashion.
        broadcast_feat = self.sc.broadcast(weight_dict)

        eval_res = self.eval_ins.map(lambda ins: eval_ins_map(
            ins, broadcast_feat)).sortByKey().collect()

        [auc, mae, ins_loss] = utils.get_eval_stat(eval_res)

        return [mae, auc]