Пример #1
0
 def load_ins_feat(self, train_file, eval_file, feat_file):
     """Load train/eval instance RDDs and the feature dictionary onto self.

     Populates train_ins/train_ins_count, eval_ins/eval_ins_count,
     feat_dict, and a zero-initialized feat_weight list (one entry
     per feature in feat_dict).
     """
     # utils.load_ins returns an (RDD, count) pair; unpack directly.
     self.train_ins, self.train_ins_count = utils.load_ins(self.sc, train_file)
     self.eval_ins, self.eval_ins_count = utils.load_ins(self.sc, eval_file)

     self.feat_dict = utils.load_feat(self.sc, feat_file)
     self.feat_weight = [0.0 for _ in self.feat_dict]
Пример #2
0
def train(sc):
    """Run the iterative gradient-descent training loop.

    Each iteration broadcasts the current weights, aggregates per-feature
    gradients across the training RDD, applies the update, and prints
    evaluation statistics over the eval set.

    Args:
        sc: an active SparkContext.

    Side effects: prints per-iteration progress; mutates feat_weight
    in place via update_weight.
    """
    feat_weight = {}

    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    cur_iter = 0
    while cur_iter < ITER_MAX:

        print("iteration %d" % cur_iter)

        # Ship the current weights to the executors for this pass.
        broadcast_feat = sc.broadcast(feat_weight)

        # Counts how many samples the gradient pass actually used.
        accum = sc.accumulator(0)

        grad = train_ins.flatMap(
            lambda ins: calc_gradient(ins, broadcast_feat, accum, SAMPLING_RATE)
        ).reduceByKey(lambda a, b: a + b).collect()

        update_weight(grad, feat_weight, accum.value, learning_rate, THETA)

        eval_res = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label)
        ).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, feat_weight,eval_res)

        print("selected %d samples: auc :%f, mae: %f" % (accum.value, auc, mae))

        # Release this iteration's broadcast so stale weight copies don't
        # pile up on the executors over up to ITER_MAX iterations.
        broadcast_feat.unpersist()

        cur_iter += 1
Пример #3
0
def train(sc):
    """Train per-feature weights (linear term plus K latent factors).

    Each iteration broadcasts the weights, aggregates gradients with
    add_gradient, applies update_weight, checkpoints the weights to a
    local "weights_<iter>" file, and prints eval stats.

    Args:
        sc: an active SparkContext.

    Side effects: writes weights_<iter> files in the working directory;
    prints progress to stdout.
    """
    feat_weight = {}

    learning_rate = 1
    ITER_MAX = 1000
    THETA = 4
    K = 8

    SAMPLING_RATE = 0.1

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    # NOTE(review): 10000 looks like a frequency cutoff for load_feat_2 —
    # confirm against utils.
    [feat_dict, feat_freq] = utils.load_feat_2(
        sc, "hdfs://hqz-ubuntu-master:9000/data/feat_count/*", 10000)

    for f in feat_dict:
        # [linear weight, list of latent factors]; random factor
        # initialization is currently disabled by the `if False` guard.
        feat_weight[f] = [0.0, []]

        if False:  # feat_freq[f] >= 0:
            for i in range(0, K):
                feat_weight[f][1].append(random.uniform(0, 0.001))

    cur_iter = 0
    while cur_iter < ITER_MAX:
        # print() calls used throughout so the function is internally
        # consistent and Python 3 compatible (it previously mixed
        # Python-2-only bare print statements with print() calls).
        print("=============================================================================")
        print("iteration %d" % cur_iter)

        print("broadcasting feat_weight")
        broadcast_feat = sc.broadcast(feat_weight)

        accum = sc.accumulator(0)
        print("calculating gradient")
        grad = train_ins.flatMap(lambda ins: calc_gradient(
            ins, broadcast_feat, accum, SAMPLING_RATE)).reduceByKey(
                add_gradient).collect()

        print("updating feat_weight")
        feat_weight = update_weight(grad, feat_weight, train_ins, accum.value,
                                    learning_rate, THETA)

        # Checkpoint this iteration's weights; `with` guarantees the file
        # handle is closed even if a write fails mid-loop.
        with open("weights_%d" % cur_iter, "w") as fp:
            for f in feat_weight:
                fp.write("%d\t%f\t%s\n" % (
                    f, feat_weight[f][0],
                    "\t".join([str(i) for i in feat_weight[f][1]])))

        print("evaluating...")
        eval_res = eval_ins.map(lambda ins: (ins.predict(feat_weight),
                                             ins.label)).sortByKey().collect()

        print("getting eval res")
        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, feat_weight,eval_res)

        print("selected %d samples: auc :%f, mae: %f" % (accum.value, auc, mae))

        cur_iter += 1
Пример #4
0
    def load_ins_feat(self, train_file, eval_file, feat_file):
        """Load instance RDDs and the feature dictionary onto self.

        train_file/eval_file/feat_file are paths handed to the utils
        loaders. feat_weight starts out as one 0.0 per feature in
        feat_dict.
        """
        # utils.load_ins returns an (RDD, count) pair; unpack directly.
        self.train_ins, self.train_ins_count = utils.load_ins(self.sc, train_file)
        self.eval_ins, self.eval_ins_count = utils.load_ins(self.sc, eval_file)

        self.feat_dict = utils.load_feat(self.sc, feat_file)
        self.feat_weight = [0.0 for _ in self.feat_dict]
Пример #5
0
def train(sc):
    """Train with weights stored in a WeightAccumulatorParam accumulator.

    Each iteration runs calc_gradient over the training RDD (which
    receives the weight accumulator and the current broadcast), then
    re-broadcasts the accumulated weights to evaluate the eval set.

    Args:
        sc: an active SparkContext.

    Side effects: prints per-iteration progress and eval statistics.
    """
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    cur_iter = 0

    # Scale the global step size / theta down to a per-sampled-instance
    # value based on the expected number of sampled training instances.
    single_sample_learning_rate = learning_rate / (train_ins_count *
                                                   SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)

    # print() calls, not Python-2 bare print statements, so the function
    # is consistent with its other print() calls and Python 3 compatible.
    print("single sample learning rate: %f " % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)

    feat_weight = sc.accumulator({}, WeightAccumulatorParam())

    broadcast_feat = sc.broadcast(feat_weight.value)

    while cur_iter < ITER_MAX:

        print("iteration %d" % cur_iter)

        # The per-instance values from calc_gradient are summed and
        # printed below as the selected-sample count.
        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)

        # Re-broadcast the freshly accumulated weights before evaluating.
        broadcast_feat = sc.broadcast(feat_weight.value)

        eval_res = eval_ins.map(lambda ins: utils.evalulate_map(
            ins, broadcast_feat)).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, broadcast_feat.value,eval_res)

        print("selected %d samples: auc :%f, mae: %f" %
              (selected_sample, auc, mae))

        cur_iter += 1
Пример #6
0
def train(sc):
    """Gradient-descent training loop over broadcast weights.

    Per iteration: broadcast the current weights, aggregate per-feature
    gradients from the training RDD, apply the update, then score the
    eval set and print AUC/MAE.
    """
    feat_weight = {}

    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    for cur_iter in range(ITER_MAX):

        print("iteration %d" % cur_iter)

        weights_bc = sc.broadcast(feat_weight)
        sample_counter = sc.accumulator(0)

        def _grad_map(ins):
            # Per-instance gradient contributions as (feature, value) pairs.
            return calc_gradient(ins, weights_bc, sample_counter,
                                 SAMPLING_RATE)

        gradients = (train_ins
                     .flatMap(_grad_map)
                     .reduceByKey(lambda x, y: x + y)
                     .collect())

        update_weight(gradients, feat_weight, sample_counter.value,
                      learning_rate, THETA)

        scored = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label))
        eval_res = scored.sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, feat_weight,eval_res)

        print("selected %d samples: auc :%f, mae: %f" %
              (sample_counter.value, auc, mae))
Пример #7
0
def train(sc):
    """Train using an accumulator-held weight dict, re-broadcast each pass.

    calc_gradient receives both the weight accumulator and the current
    broadcast; after each training pass the accumulated weights are
    re-broadcast and used to evaluate the eval set.

    Args:
        sc: an active SparkContext.

    Side effects: prints per-iteration progress and eval statistics.
    """
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01

    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")

    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")

    cur_iter = 0

    # Scale the global hyperparameters down to a per-sampled-instance
    # step using the expected number of sampled training instances.
    single_sample_learning_rate = learning_rate / (train_ins_count *
                                                   SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)

    # print() calls replace the Python-2-only bare print statements so the
    # function matches its other print() calls and runs under Python 3.
    print("single sample learning rate: %f " % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)

    feat_weight = sc.accumulator({}, WeightAccumulatorParam())

    broadcast_feat = sc.broadcast(feat_weight.value)

    while cur_iter < ITER_MAX:

        print("iteration %d" % cur_iter)

        # The per-instance values from calc_gradient are summed and
        # printed below as the selected-sample count.
        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)

        # Re-broadcast the freshly accumulated weights before evaluating.
        broadcast_feat = sc.broadcast(feat_weight.value)

        eval_res = eval_ins.map(
            lambda ins: utils.evalulate_map(ins, broadcast_feat)
        ).sortByKey().collect()

        [auc, mae, loss] = utils.get_eval_stat(eval_res)

        #utils.output(cur_iter, None, broadcast_feat.value,eval_res)

        print("selected %d samples: auc :%f, mae: %f" %
              (selected_sample, auc, mae))

        cur_iter += 1