def verify(self, corpus_file, xdim):
    """Run prediction over a slice of corpus_file and log the mean loss."""
    begin = time.time()
    batch_x = []
    batch_y = []
    n = 0
    corpus_generator = CorpusGenerator(corpus_file, 10, 0, xdim)
    for X, Y in corpus_generator:
        n += 1
        batch_x.append(X)
        batch_y.append([Y])
    x = np.array(batch_x)
    y = np.array(batch_y)
    y_hat = self.predict(x)
    loss = self.decisionFunc.loss(y, y_hat) / x.shape[0]
    logger.info("predicted {} samples, mean loss {}, elapsed {} seconds".format(
        n, loss, time.time() - begin))
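# self.predict and self.decisionFunc.loss above are defined elsewhere in this
# repo. The minimal sketch below only illustrates what they are assumed to
# compute for logistic regression (a sigmoid score and a summed log loss that
# verify() divides by the batch size); the names `_sigmoid_predict` and
# `_log_loss` are hypothetical and are not used by the original classes.
def _sigmoid_predict(w, x):
    # x: (batch, dim) sample matrix, w: (dim,) weight vector -> (batch,) scores
    return 1.0 / (1.0 + np.exp(-x.dot(w)))


def _log_loss(y, y_hat, eps=1e-15):
    # y and y_hat are arrays of the same shape; clip to avoid log(0)
    y_hat = np.clip(y_hat, eps, 1.0 - eps)
    return float(np.sum(-y * np.log(y_hat) - (1.0 - y) * np.log(1.0 - y_hat)))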
def PreditByLR(lr, corpusFile, splitNum, splitIndex, alpha, beta, l1, l2):
    """Pull all parameters, recover w, and log the mean loss over a corpus split."""
    begin = time.time()
    key_range = Range()
    key_range.Begin = 0
    key_range.End = lr.psClient.parameterTotal
    corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                      lr.psClient.parameterTotal)
    loss = 0.0
    population = 0
    w = []
    z = []
    n = []
    for value in lr.psClient.GetAllParameter():
        if len(value.Values) == 1:
            w.append(value.Values[0])
        else:
            z.append(value.Values[0])
            n.append(value.Values[1])
    if len(z) > 0:
        logger.debug("model is trained by ftrl")
        z = np.array(z)
        n = np.array(n)
        # recover w from z and n via the FTRL closed-form solution
        w = np.array([
            0 if np.abs(z[i]) <= l1 else (np.sign(z[i]) * l1 - z[i]) /
            (l2 + (beta + np.sqrt(n[i])) / alpha) for i in xrange(len(z))
        ])
    else:
        logger.debug("model is trained by gd")
        w = np.array(w)
    for X, Y in corpusGenerator:
        x = np.array(X).reshape(1, len(X))
        y_hat = lr.fn(w, x)
        loss += lr.loss(np.array(Y), y_hat)
        population += 1
    logger.info("predicted {} samples, mean loss {}, elapsed {} seconds".format(
        population, loss / population, time.time() - begin))
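# The per-coordinate list comprehension above can also be written in a fully
# vectorized form. A minimal sketch, assuming z and n are 1-D numpy arrays and
# alpha, beta, l1, l2 are the same FTRL hyper-parameters; the helper name
# `_ftrl_weights` is hypothetical and is not used elsewhere in this module.
def _ftrl_weights(z, n, alpha, beta, l1, l2):
    # w_i = 0 if |z_i| <= l1, else (sign(z_i) * l1 - z_i) / (l2 + (beta + sqrt(n_i)) / alpha)
    shrunk = (np.sign(z) * l1 - z) / (l2 + (beta + np.sqrt(n)) / alpha)
    return np.where(np.abs(z) <= l1, 0.0, shrunk)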
def train(self, corpus_file, verbos=False, epochs=100, batch=64):
    begin = time.time()
    total = 0
    for itr in xrange(epochs):
        logger.info("Epoch={:d}".format(itr))
        # prefer mini-batches so numpy's vectorized computation is fully exploited
        mini_batch_x = []
        mini_batch_y = []
        n = 0
        corpus_generator = CorpusGenerator(corpus_file, 1, 0, self.dim)
        for X, Y in corpus_generator:
            n += 1
            mini_batch_x.append(X)
            mini_batch_y.append([Y])
            if len(mini_batch_x) >= batch:
                self.update(np.array(mini_batch_x), np.array(mini_batch_y))
                if verbos:
                    Y_HAT = self.predict(np.array(mini_batch_x))
                    train_loss = self.decisionFunc.loss(
                        np.array(mini_batch_y), Y_HAT) / len(mini_batch_x)
                    logger.info("{:d}/{:d} train loss: {:f}".format(
                        n, total, train_loss))
                mini_batch_x = []
                mini_batch_y = []
        if len(mini_batch_x) > 0:
            # flush the tail batch that did not fill up to `batch` samples
            self.update(np.array(mini_batch_x), np.array(mini_batch_y))
        if total == 0:
            total = n
        if verbos and len(mini_batch_x) > 0:
            Y_HAT = self.predict(np.array(mini_batch_x))
            train_loss = self.decisionFunc.loss(np.array(mini_batch_y),
                                                Y_HAT) / len(mini_batch_x)
            logger.info("{:d}/{:d} train loss: {:f}".format(
                n, total, train_loss))
    logger.info(
        "train lr with ftrl finished, elapsed {} seconds".format(time.time() -
                                                                 begin))
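# self.update above is implemented elsewhere in this repo. Assuming it performs
# the standard FTRL-Proximal accumulator update per mini-batch, the sketch
# below mirrors the z/n/sigma formulas used in TrainLrWithFTRL further down;
# `_ftrl_update` and its argument conventions are illustrative assumptions.
def _ftrl_update(z, n, w, g, alpha):
    # g: per-coordinate gradient of the mini-batch loss, same shape as w, z, n
    sigma = (np.sqrt(n + g * g) - np.sqrt(n)) / alpha
    z += g - sigma * w
    n += g * g
    return z, n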
def TrainLrWithGD(corpusFile, splitNum, splitIndex, epoch, batch, eta, manager,
                  port, ParameterTotal, KeyRange, synRound):
    begin = time.time()
    lr = LR(manager, port, ParameterTotal, KeyRange, random.random, 1)
    iter = 1
    WaitedPullMsgId = 0
    use_time = []
    for ep in xrange(epoch):
        logger.info("epoch={}".format(ep))
        corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                          ParameterTotal)
        xBatch = []
        yBatch = []
        for X, Y in corpusGenerator:
            xBatch.append(X)
            yBatch.append(Y)
            if len(xBatch) >= batch:
                if WaitedPullMsgId > 0:
                    # wait for the earlier Pull request to complete
                    if not lr.psClient.WaitPull(WaitedPullMsgId, 1):
                        logger.error("wait pull timeout")
                msgId = lr.psClient.Pull()
                if iter % synRound == 0:
                    WaitedPullMsgId = msgId
                iter += 1
                t1 = time.time()
                x = np.array(xBatch)
                w = []
                for value in lr.psClient.GetAllParameter():
                    if len(value.Values) >= 1:
                        w.append(value.Values[0])
                    else:
                        logger.error(
                            "parameters of one key less than 1: {}".format(
                                len(value.Values)))
                w = np.array(w)
                y_hat = lr.fn(w, x)
                y = np.array(yBatch).reshape(len(yBatch), 1)
                # only compute the gradient for the key range this worker owns
                g = lr.grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
                # core gradient-descent step, applied only to our own key range
                w[KeyRange.Begin:KeyRange.End] -= eta * g
                Values = []
                for i in xrange(KeyRange.Begin, KeyRange.End):
                    value = Value()
                    value.Values.append(w[i])
                    Values.append(value)
                t2 = time.time()
                use_time.append(t2 - t1)
                lr.psClient.UpdateLocalRangedParameter(Values)
                lr.psClient.Push()
                xBatch = []
                yBatch = []
        logger.debug("updated parameters {} times, mean time {} seconds".format(
            len(use_time), np.mean(np.array(use_time))))
        if len(xBatch) > 0:
            if WaitedPullMsgId > 0:
                # wait for the earlier Pull request to complete
                if not lr.psClient.WaitPull(WaitedPullMsgId, 1):
                    logger.error("wait pull timeout")
            msgId = lr.psClient.Pull()
            if iter % synRound == 0:
                WaitedPullMsgId = msgId
            iter += 1
            x = np.array(xBatch)
            w = []
            for value in lr.psClient.GetAllParameter():
                if len(value.Values) >= 1:
                    w.append(value.Values[0])
                else:
                    logger.error(
                        "parameters of one key less than 1: {}".format(
                            len(value.Values)))
            w = np.array(w)
            y_hat = lr.fn(w, x)
            y = np.array(yBatch).reshape(len(yBatch), 1)
            # only compute the gradient for the key range this worker owns
            g = lr.grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
            # core gradient-descent step, applied only to our own key range
            w[KeyRange.Begin:KeyRange.End] -= eta * g
            Values = []
            for i in xrange(KeyRange.Begin, KeyRange.End):
                value = Value()
                value.Values.append(w[i])
                Values.append(value)
            lr.psClient.UpdateLocalRangedParameter(Values)
            lr.psClient.Push()
    logger.info(
        "train lr with gd finished, elapsed {} seconds".format(time.time() -
                                                               begin))
    return lr
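# lr.grad above comes from the LR class defined elsewhere in this repo. The
# sketch below shows what it is assumed to compute for logistic regression
# restricted to the feature columns this worker owns; `_lr_grad` is a
# hypothetical name and the mean-over-batch convention is an assumption.
def _lr_grad(y, y_hat, x_slice):
    # y: (batch, 1) labels, y_hat: (batch,) predictions, x_slice: (batch, owned_dim)
    return ((y_hat.reshape(-1, 1) - y) * x_slice).mean(axis=0)  # -> (owned_dim,)

# With such a gradient, the update step above reads:
#   g = _lr_grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
#   w[KeyRange.Begin:KeyRange.End] -= eta * g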
def TrainLrWithFTRL(corpusFile, splitNum, splitIndex, epoch, batch, manager,
                    port, ParameterTotal, KeyRange, alpha, beta, l1, l2,
                    synRound):
    begin = time.time()
    # z and n in FTRL are initialized to 0; note that n must never be negative
    lr = LR(manager, port, ParameterTotal, KeyRange, init_zero, 2)
    iter = 1
    WaitedPullMsgId = 0
    use_time = []
    for ep in xrange(epoch):
        logger.info("epoch={}".format(ep))
        corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                          ParameterTotal)
        xBatch = []
        yBatch = []
        for X, Y in corpusGenerator:
            xBatch.append(X)
            yBatch.append(Y)
            if len(xBatch) >= batch:
                if WaitedPullMsgId > 0:
                    # wait for the earlier Pull request to complete
                    if not lr.psClient.WaitPull(WaitedPullMsgId, 1):
                        logger.error("wait pull timeout")
                msgId = lr.psClient.Pull()
                if iter % synRound == 0:
                    WaitedPullMsgId = msgId
                iter += 1
                t1 = time.time()
                x = np.array(xBatch)
                z = []
                n = []
                for v in lr.psClient.GetAllParameter():
                    if len(v.Values) >= 2:
                        z.append(v.Values[0])
                        n.append(v.Values[1])
                    else:
                        logger.error(
                            "parameters of one key less than 2: {}".format(
                                len(v.Values)))
                z = np.array(z)
                n = np.array(n)
                # FTRL closed form: recover w from z and n
                w = np.array([
                    0 if np.abs(z[i]) <= l1 else (np.sign(z[i]) * l1 - z[i]) /
                    (l2 + (beta + np.sqrt(n[i])) / alpha)
                    for i in xrange(len(z))
                ])
                y_hat = lr.fn(w, x)
                y = np.array(yBatch).reshape(len(yBatch), 1)
                # only compute the gradient for the key range this worker owns
                g = lr.grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
                sigma = (np.sqrt(n[KeyRange.Begin:KeyRange.End] + g * g) -
                         np.sqrt(n[KeyRange.Begin:KeyRange.End])) / alpha
                # update only the accumulators of our own key range
                z[KeyRange.Begin:KeyRange.End] += (
                    g - sigma * w[KeyRange.Begin:KeyRange.End])
                n[KeyRange.Begin:KeyRange.End] += g * g
                Values = []
                for i in xrange(KeyRange.Begin, KeyRange.End):
                    value = Value()
                    value.Values.extend([z[i], n[i]])
                    Values.append(value)
                t2 = time.time()
                use_time.append(t2 - t1)
                lr.psClient.UpdateLocalRangedParameter(Values)
                lr.psClient.Push()
                xBatch = []
                yBatch = []
        logger.debug("updated parameters {} times, mean time {} seconds".format(
            len(use_time), np.mean(np.array(use_time))))
        if len(xBatch) > 0:
            if WaitedPullMsgId > 0:
                # wait for the earlier Pull request to complete
                if not lr.psClient.WaitPull(WaitedPullMsgId, 1):
                    logger.error("wait pull timeout")
            msgId = lr.psClient.Pull()
            if iter % synRound == 0:
                WaitedPullMsgId = msgId
            iter += 1
            x = np.array(xBatch)
            z = []
            n = []
            for v in lr.psClient.GetAllParameter():
                if len(v.Values) >= 2:
                    z.append(v.Values[0])
                    n.append(v.Values[1])
                else:
                    logger.error(
                        "parameters of one key less than 2: {}".format(
                            len(v.Values)))
            z = np.array(z)
            n = np.array(n)
            # FTRL closed form: recover w from z and n
            w = np.array([
                0 if np.abs(z[i]) <= l1 else (np.sign(z[i]) * l1 - z[i]) /
                (l2 + (beta + np.sqrt(n[i])) / alpha) for i in xrange(len(z))
            ])
            y_hat = lr.fn(w, x)
            y = np.array(yBatch).reshape(len(yBatch), 1)
            # only compute the gradient for the key range this worker owns
            g = lr.grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
            sigma = (np.sqrt(n[KeyRange.Begin:KeyRange.End] + g * g) -
                     np.sqrt(n[KeyRange.Begin:KeyRange.End])) / alpha
            # update only the accumulators of our own key range
            z[KeyRange.Begin:KeyRange.End] += (
                g - sigma * w[KeyRange.Begin:KeyRange.End])
            n[KeyRange.Begin:KeyRange.End] += g * g
            Values = []
            for i in xrange(KeyRange.Begin, KeyRange.End):
                value = Value()
                value.Values.extend([z[i], n[i]])
                Values.append(value)
            lr.psClient.UpdateLocalRangedParameter(Values)
            lr.psClient.Push()
    logger.info(
        "train lr with ftrl finished, elapsed {} seconds".format(time.time() -
                                                                 begin))
    return lr
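# A minimal end-to-end sketch of how a single worker might drive the two
# functions above. Every concrete value below (manager address, port, file
# paths, dimension, hyper-parameters, synRound) is a placeholder, and
# `_demo_worker` itself is hypothetical; the real entry point of this repo may
# wire these arguments up differently.
def _demo_worker():
    parameter_total = 100              # total number of feature weights
    key_range = Range()                # the slice of keys this worker owns
    key_range.Begin = 0
    key_range.End = parameter_total
    alpha, beta, l1, l2 = 0.1, 1.0, 1.0, 1.0
    lr = TrainLrWithFTRL("train.txt", 1, 0, 10, 64, "127.0.0.1", 8000,
                         parameter_total, key_range, alpha, beta, l1, l2, 10)
    PreditByLR(lr, "test.txt", 1, 0, alpha, beta, l1, l2)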