import random

import utils


def train(sc):
    feat_weight = {}
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    cur_iter = 0
    while cur_iter < ITER_MAX:
        print("iteration %d" % cur_iter)
        broadcast_feat = sc.broadcast(feat_weight)
        accum = sc.accumulator(0)  # counts the instances kept by sampling
        grad = train_ins.flatMap(lambda ins: calc_gradient(
            ins, broadcast_feat, accum, SAMPLING_RATE)).reduceByKey(
            lambda a, b: a + b).collect()
        update_weight(grad, feat_weight, accum.value, learning_rate, THETA)
        eval_res = eval_ins.map(lambda ins: (ins.predict(feat_weight),
                                             ins.label)).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, feat_weight, eval_res)
        print("selected %d samples: auc: %f, mae: %f" % (accum.value, auc, mae))
        cur_iter += 1

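# calc_gradient and the five-argument update_weight are referenced above
# but not defined in this snippet. Below is a minimal sketch of the
# scalar-weight calc_gradient, assuming a logistic model and an instance
# type with `label` and `feats` fields; the field names and the loss are
# assumptions, not the repo's actual implementation.
import math


def calc_gradient(ins, broadcast_feat, accum, sampling_rate):
    if random.random() > sampling_rate:
        return []                  # instance not selected in this pass
    accum += 1                     # count the sampled instances
    weight = broadcast_feat.value
    score = sum(weight.get(f, 0.0) for f in ins.feats)
    score = max(min(score, 35.0), -35.0)   # guard math.exp overflow
    pred = 1.0 / (1.0 + math.exp(-score))
    # one (feature_sign, partial gradient) pair per binary feature;
    # the caller's reduceByKey(lambda a, b: a + b) sums them
    return [(f, pred - ins.label) for f in ins.feats]
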
def train(sc):
    feat_weight = {}
    learning_rate = 1
    ITER_MAX = 1000
    THETA = 4
    K = 8
    SAMPLING_RATE = 0.1
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")
    #[train_ins, train_ins_count] = utils.load_ins(sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train.test/train.test")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    #[eval_ins, eval_ins_count] = utils.load_ins(sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval.test/eval.test")
    #feat_dict = utils.load_feat(sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_feat/*")
    [feat_dict, feat_freq] = utils.load_feat_2(
        sc, "hdfs://hqz-ubuntu-master:9000/data/feat_count/*", 10000)
    for f in feat_dict:
        # weight layout: [bias term, list of K latent-factor weights]
        feat_weight[f] = [0.0, []]
        if False:  # feat_freq[f] >= 0: latent factors currently disabled
            for i in range(0, K):
                feat_weight[f][1].append(random.uniform(0, 0.001))
                #feat_weight[f][1].append(0)
    cur_iter = 0
    while cur_iter < ITER_MAX:
        print("=============================================================================")
        print("iteration %d" % cur_iter)
        print("broadcasting feat_weight")
        broadcast_feat = sc.broadcast(feat_weight)
        accum = sc.accumulator(0)
        print("calculating gradient")
        grad = train_ins.flatMap(lambda ins: calc_gradient(
            ins, broadcast_feat, accum, SAMPLING_RATE)).reduceByKey(
            add_gradient).collect()
        print("updating feat_weight")
        feat_weight = update_weight(grad, feat_weight, train_ins,
                                    accum.value, learning_rate, THETA)
        #print("returned weight:")
        # dump the current weights so a run can be inspected or resumed
        fp = open("weights_%d" % cur_iter, "w")
        for f in feat_weight:
            fp.write("%d\t%f\t%s\n" % (f, feat_weight[f][0],
                     "\t".join([str(i) for i in feat_weight[f][1]])))
        fp.close()
        print("evaluating...")
        eval_res = eval_ins.map(lambda ins: (ins.predict(feat_weight),
                                             ins.label)).sortByKey().collect()
        print("getting eval res")
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, feat_weight, eval_res)
        print("selected %d samples: auc: %f, mae: %f" % (accum.value, auc, mae))
        cur_iter += 1

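# add_gradient, the reduceByKey combiner used above, is not defined in
# this snippet. A minimal sketch, assuming the [bias_grad, [K latent
# gradients]] layout that update_weight below consumes:
def add_gradient(a, b):
    merged = [a[0] + b[0], []]
    for x, y in zip(a[1], b[1]):
        merged[1].append(x + y)
    return merged
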
def update_weight(grad, feat_weight, train_ins, ins_num, learning_rate, theta):
    auc_array = []
    multiply = 1.0
    g = {}
    # scale each accumulated gradient by learning_rate / ins_num and
    # fold in the L2 term theta
    for p in grad:
        feat_sign = p[0]
        feat_grad = p[1]
        k = len(feat_grad[1])
        g[feat_sign] = [0, []]
        g[feat_sign][0] = (learning_rate * 1.0 / ins_num *
                           (feat_grad[0] - theta * feat_weight[feat_sign][0]))
        for i in range(0, k):
            g[feat_sign][1].append(
                learning_rate * 1.0 / ins_num *
                (feat_grad[1][i] - theta * feat_weight[feat_sign][1][i]))
    feat_weight_last = {}
    # forward step search: try multipliers 1, 2, 3, ... and stop as soon
    # as the training loss stops improving
    for i in range(1, 100):
        feat_weight_tmp = {}
        for feat_sign in g:
            k = len(g[feat_sign][1])
            feat_weight_tmp[feat_sign] = [0, []]
            if feat_sign not in feat_weight:
                feat_weight_tmp[feat_sign][0] = -g[feat_sign][0] * multiply
                for ii in range(0, k):
                    feat_weight_tmp[feat_sign][1].append(
                        -g[feat_sign][1][ii] * multiply)
            else:
                feat_weight_tmp[feat_sign][0] = (feat_weight[feat_sign][0] -
                                                 g[feat_sign][0] * multiply)
                for ii in range(0, k):
                    feat_weight_tmp[feat_sign][1].append(
                        feat_weight[feat_sign][1][ii] -
                        g[feat_sign][1][ii] * multiply)
        eval_res = train_ins.map(lambda ins: (ins.predict(feat_weight_tmp),
                                              ins.label)).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        print("searching step: multiply %.1f: train auc: %f, train_loss: %f, train_mae: %f"
              % (multiply, auc, loss, mae))
        auc = -loss  # compare on (negated) loss rather than AUC
        if len(auc_array) > 0 and auc <= auc_array[-1]:
            break
        auc_array.append(auc)
        feat_weight_last = feat_weight_tmp.copy()
        multiply = (i + 1) * 1.0
    return feat_weight_last

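# The step search above evaluates, per feature f and multiplier
# m = 1, 2, 3, ...:
#     w_f(m) = w_f - m * (learning_rate / ins_num) * (grad_f - theta * w_f)
# and returns the weights from the last m that still improved the
# training loss.
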
def lossFunc_loss(self, x, broadcast_feat):
    eval_res = self.train_ins.map(lambda ins: eval_ins_map(
        ins, broadcast_feat)).sortByKey().collect()
    [auc, mae, ins_loss] = utils.get_eval_stat(eval_res)
    # instance loss plus the L2 penalty (l2_weight / 2) * ||x||^2
    loss = ins_loss + self.l2_weight / 2 * ((x.T * x)[0, 0])
    return loss

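# eval_ins_map is referenced above but not defined in this snippet. A
# plausible one-line sketch, assuming it mirrors the (prediction, label)
# pairs built inline in the train() variants:
def eval_ins_map(ins, broadcast_feat):
    return (ins.predict(broadcast_feat.value), ins.label)
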
def train(sc):
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    cur_iter = 0
    # fold the sampling rate into the per-sample learning rate and
    # regularization strength
    single_sample_learning_rate = learning_rate / (train_ins_count * SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)
    print("single sample learning rate: %f" % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)
    feat_weight = sc.accumulator({}, WeightAccumulatorParam())
    broadcast_feat = sc.broadcast(feat_weight.value)
    while cur_iter < ITER_MAX:
        print("iteration %d" % cur_iter)
        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)
        # re-broadcast the weights accumulated during this pass
        broadcast_feat = sc.broadcast(feat_weight.value)
        eval_res = eval_ins.map(lambda ins: utils.evalulate_map(
            ins, broadcast_feat)).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, broadcast_feat.value, eval_res)
        print("selected %d samples: auc: %f, mae: %f" % (selected_sample, auc, mae))
        cur_iter += 1

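# WeightAccumulatorParam is not defined in this snippet. A minimal
# dict-merging sketch, assuming scalar weights keyed by feature sign:
from pyspark.accumulators import AccumulatorParam


class WeightAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return {}

    def addInPlace(self, v1, v2):
        # fold worker-side weight deltas into the driver-side dict
        for feat, delta in v2.items():
            v1[feat] = v1.get(feat, 0.0) + delta
        return v1
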
def eval_func_for_test_set(self, x):
    weight_dict = {}
    # translate the feature weight matrix into a python dictionary,
    # skipping zero weights to keep the broadcast small
    for feat in self.feat_dict:
        idx = self.feat_dict[feat]
        if x[idx, 0] != 0:
            weight_dict[feat] = x[idx, 0]
    # broadcast the feature weights and evaluate the test set in a
    # distributed fashion
    broadcast_feat = self.sc.broadcast(weight_dict)
    eval_res = self.eval_ins.map(lambda ins: eval_ins_map(
        ins, broadcast_feat)).sortByKey().collect()
    [auc, mae, ins_loss] = utils.get_eval_stat(eval_res)
    return [mae, auc]

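# Hypothetical usage of eval_func_for_test_set: x is a column matrix
# indexed by self.feat_dict, as the x[idx, 0] lookups above assume. The
# `trainer` object and the numpy matrix construction are assumptions.
import numpy as np

x0 = np.asmatrix(np.zeros((len(trainer.feat_dict), 1)))
[mae, auc] = trainer.eval_func_for_test_set(x0)
print("test mae: %f, auc: %f" % (mae, auc))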