Exemplo n.º 1
0
 def verify(self, corpus_file, xdim):
     """Log the mean loss of the model over ``corpus_file``.

     Every ``(X, Y)`` pair yielded by
     ``CorpusGenerator(corpus_file, 10, 0, xdim)`` is gathered into a single
     batch, predicted with one ``self.predict`` call and scored with
     ``self.decisionFunc.loss``. The loss divided by the number of samples
     is logged together with the elapsed wall-clock time.

     Args:
         corpus_file: Forwarded to ``CorpusGenerator`` as its first argument.
         xdim: Forwarded to ``CorpusGenerator`` as its fourth argument
             (presumably the feature dimension -- confirm against
             CorpusGenerator).

     Returns:
         None. The result is only written to the log.
     """
     begin = time.time()
     batch_x = []
     batch_y = []
     # NOTE(review): the literals 10 and 0 look like "split count" and
     # "split index" -- confirm against CorpusGenerator's signature.
     for X, Y in CorpusGenerator(corpus_file, 10, 0, xdim):
         batch_x.append(X)
         batch_y.append([Y])  # each label becomes a one-element row
     if not batch_x:
         # Nothing to score: predicting on an empty array and dividing by a
         # sample count of zero would fail or yield a meaningless value.
         logger.error(
             "no samples read from {}, skip verify".format(corpus_file))
         return
     x = np.array(batch_x)
     y = np.array(batch_y)
     y_hat = self.predict(x)
     loss = self.decisionFunc.loss(y, y_hat) / x.shape[0]
     logger.info("predit {} samples, mean loss {}, use {} seconds".format(
         len(batch_x), loss,
         time.time() - begin))
Exemplo n.º 2
0
def PreditByLR(lr, corpusFile, splitNum, splitIndex, alpha, beta, l1, l2):
    """Score a trained model on one corpus split and log the mean loss.

    The weight vector is rebuilt from ``lr.psClient.GetAllParameter()``:
    a parameter holding exactly one value is taken as a plain weight
    (gradient-descent model); any other parameter is read as an FTRL
    ``(z, n)`` pair. As soon as at least one ``(z, n)`` pair is present the
    model is treated as FTRL-trained and the weights are derived from
    ``z`` and ``n`` only.

    Args:
        lr: Object providing ``psClient.GetAllParameter()``, ``fn(w, x)``
            and ``loss(y, y_hat)``.
        corpusFile: Forwarded to ``CorpusGenerator``.
        splitNum: Forwarded to ``CorpusGenerator``.
        splitIndex: Forwarded to ``CorpusGenerator``.
        alpha: FTRL hyper-parameter used when deriving weights from (z, n).
        beta: FTRL hyper-parameter used when deriving weights from (z, n).
        l1: FTRL L1 threshold; weights with ``|z| <= l1`` are zero.
        l2: FTRL hyper-parameter used when deriving weights from (z, n).

    Returns:
        None. The result is only written to the log.
    """
    begin = time.time()
    corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                      lr.psClient.parameterTotal)
    w = []
    z = []
    n = []
    for value in lr.psClient.GetAllParameter():
        if len(value.Values) == 1:
            w.append(value.Values[0])
        else:
            z.append(value.Values[0])
            n.append(value.Values[1])
    if z:
        logger.debug("model is trained by ftrl")
        z = np.array(z)
        n = np.array(n)
        # Derive w from n and z. Weights default to 0 and the closed-form
        # expression is evaluated only where |z| exceeds l1, so entries that
        # are clipped to zero never take part in the division.
        w = np.zeros(len(z))
        active = ~(np.abs(z) <= l1)
        w[active] = (np.sign(z[active]) * l1 - z[active]) / (
            l2 + (beta + np.sqrt(n[active])) / alpha)
    else:
        logger.debug("model is trained by gd")
        w = np.array(w)

    loss = 0.0
    population = 0
    for X, Y in corpusGenerator:
        x = np.array(X).reshape(1, len(X))  # single sample as a 1-row matrix
        y_hat = lr.fn(w, x)
        loss += lr.loss(np.array(Y), y_hat)
        population += 1

    if population == 0:
        # Avoid dividing the accumulated loss by a sample count of zero.
        logger.error(
            "no samples read from {}, skip predict".format(corpusFile))
        return

    logger.info("predit {} samples, mean loss {}, use {} seconds".format(
        population, loss / population,
        time.time() - begin))
Exemplo n.º 3
0
    def train(self, corpus_file, verbos=False, epochs=100, batch=64):
        """Fit the model with mini-batch updates over ``corpus_file``.

        For each epoch the samples yielded by
        ``CorpusGenerator(corpus_file, 1, 0, self.dim)`` are grouped into
        mini-batches of ``batch`` samples and each one is passed to
        ``self.update``. A final, shorter mini-batch is processed only when
        it actually contains samples.

        Args:
            corpus_file: Forwarded to ``CorpusGenerator``.
            verbos: When true, log the mean training loss of every
                mini-batch right after it has been applied.
            epochs: Number of passes over the corpus.
            batch: Number of samples per mini-batch.

        Returns:
            None.
        """

        def fit_batch(rows, labels, seen, total):
            # Convert once; the same arrays feed the update and the report.
            x = np.array(rows)
            y = np.array(labels)
            self.update(x, y)
            if verbos:
                y_hat = self.predict(x)
                train_loss = self.decisionFunc.loss(y, y_hat) / len(rows)
                logger.info("{:d}/{:d} train loss: {:f}\n".format(
                    seen, total, train_loss))

        begin = time.time()
        # Samples per epoch; stays 0 (unknown) until the first epoch has
        # been read completely, so first-epoch progress is logged as "n/0".
        total = 0
        for itr in range(epochs):
            logger.info("Epoch={:d}".format(itr))
            # Prefer mini-batches so numpy can work on whole arrays at once.
            mini_batch_x = []
            mini_batch_y = []
            n = 0
            for X, Y in CorpusGenerator(corpus_file, 1, 0, self.dim):
                n += 1
                mini_batch_x.append(X)
                mini_batch_y.append([Y])
                if len(mini_batch_x) >= batch:
                    fit_batch(mini_batch_x, mini_batch_y, n, total)
                    mini_batch_x = []
                    mini_batch_y = []
            if total == 0:
                total = n
            # Leftover samples that did not fill a whole mini-batch. Skipped
            # when empty so that update() never receives an empty array and
            # the verbose report never divides by zero.
            if mini_batch_x:
                fit_batch(mini_batch_x, mini_batch_y, n, total)

        logger.info(
            "train lr with ftrl finished, use {} seconds".format(time.time() -
                                                                 begin))
Exemplo n.º 4
0
def _gd_sync_pull(ps_client, step, synRound, waited_msg_id):
    """Start a Pull, first waiting on the pull remembered at the last sync.

    Returns the message id the next call should wait on: the id of the pull
    just issued when ``step`` is a multiple of ``synRound``, otherwise
    ``waited_msg_id`` unchanged.
    """
    if waited_msg_id > 0:
        # Wait for the earlier Pull command to complete (the second argument
        # is presumably a timeout -- confirm against the client API).
        if not ps_client.WaitPull(waited_msg_id, 1):
            logger.error("wait pull timeout")
    msg_id = ps_client.Pull()
    if step % synRound == 0:
        return msg_id
    return waited_msg_id


def _gd_apply_batch(lr, xBatch, yBatch, eta, KeyRange):
    """Apply one gradient-descent step for a batch and push the result.

    Returns the seconds spent on the local computation, i.e. excluding the
    final ``UpdateLocalRangedParameter`` and ``Push`` calls.
    """
    started = time.time()
    x = np.array(xBatch)
    w = []
    for value in lr.psClient.GetAllParameter():
        if len(value.Values) >= 1:
            w.append(value.Values[0])
        else:
            logger.error(
                "parameters of one key less than 1: {}".format(
                    len(value.Values)))
    w = np.array(w)
    y_hat = lr.fn(w, x)
    y = np.array(yBatch).reshape(len(yBatch), 1)
    # Only the gradient of this worker's own key range is needed.
    g = lr.grad(y, y_hat, x[:, KeyRange.Begin:KeyRange.End])
    # Core gradient-descent rule; only the owned key range is updated.
    w[KeyRange.Begin:KeyRange.End] -= eta * g
    Values = []
    for i in range(KeyRange.Begin, KeyRange.End):
        value = Value()
        value.Values.append(w[i])
        Values.append(value)
    elapsed = time.time() - started
    lr.psClient.UpdateLocalRangedParameter(Values)
    lr.psClient.Push()
    return elapsed


def TrainLrWithGD(corpusFile, splitNum, splitIndex, epoch, batch, eta, manager,
                  port, ParameterTotal, KeyRange, synRound):
    """Train an LR model with mini-batch gradient descent via the PS client.

    For every epoch the corpus split is read in batches of ``batch``
    samples. Each batch triggers a parameter Pull, one gradient-descent
    step restricted to ``KeyRange`` and a Push of the updated range. Samples
    left over at the end of an epoch are applied as one final, smaller batch.

    Args:
        corpusFile: Forwarded to ``CorpusGenerator``.
        splitNum: Forwarded to ``CorpusGenerator``.
        splitIndex: Forwarded to ``CorpusGenerator``.
        epoch: Number of passes over the corpus split.
        batch: Number of samples per batch.
        eta: Step size multiplied with the gradient.
        manager: Forwarded to ``LR``.
        port: Forwarded to ``LR``.
        ParameterTotal: Forwarded to ``LR`` and ``CorpusGenerator``.
        KeyRange: Object whose ``Begin``/``End`` delimit the keys this
            worker updates.
        synRound: Every ``synRound``-th pull is remembered and waited on
            before later pulls are issued.

    Returns:
        The ``LR`` instance created for training.
    """
    begin = time.time()
    lr = LR(manager, port, ParameterTotal, KeyRange, random.random, 1)
    step = 1
    waited_msg_id = 0
    use_time = []
    for ep in range(epoch):
        logger.info("epoch={}".format(ep))
        corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                          ParameterTotal)
        xBatch = []
        yBatch = []
        for X, Y in corpusGenerator:
            xBatch.append(X)
            yBatch.append(Y)
            if len(xBatch) >= batch:
                waited_msg_id = _gd_sync_pull(lr.psClient, step, synRound,
                                              waited_msg_id)
                step += 1
                use_time.append(
                    _gd_apply_batch(lr, xBatch, yBatch, eta, KeyRange))
                xBatch = []
                yBatch = []
        # np.mean of an empty list warns; report nan directly in that case.
        mean_time = np.mean(np.array(use_time)) if use_time else float("nan")
        logger.debug("update paramter {} times, mean use time {}".format(
            len(use_time), mean_time))
        if len(xBatch) > 0:
            # Leftover samples of this epoch; their timing is not recorded.
            waited_msg_id = _gd_sync_pull(lr.psClient, step, synRound,
                                          waited_msg_id)
            step += 1
            _gd_apply_batch(lr, xBatch, yBatch, eta, KeyRange)

    logger.info(
        "train lr with gd finished, use {} seconds".format(time.time() -
                                                           begin))
    return lr
Exemplo n.º 5
0
def _ftrl_sync_pull(ps_client, step, synRound, waited_msg_id):
    """Start a Pull, first waiting on the pull remembered at the last sync.

    Returns the message id the next call should wait on: the id of the pull
    just issued when ``step`` is a multiple of ``synRound``, otherwise
    ``waited_msg_id`` unchanged.
    """
    if waited_msg_id > 0:
        # Wait for the earlier Pull command to complete (the second argument
        # is presumably a timeout -- confirm against the client API).
        if not ps_client.WaitPull(waited_msg_id, 1):
            logger.error("wait pull timeout")
    msg_id = ps_client.Pull()
    if step % synRound == 0:
        return msg_id
    return waited_msg_id


def _ftrl_apply_batch(lr, xBatch, yBatch, KeyRange, alpha, beta, l1, l2):
    """Apply one FTRL step for a batch and push the updated (z, n) range.

    Returns the seconds spent on the local computation, i.e. excluding the
    final ``UpdateLocalRangedParameter`` and ``Push`` calls.
    """
    started = time.time()
    x = np.array(xBatch)
    z = []
    n = []
    for v in lr.psClient.GetAllParameter():
        if len(v.Values) >= 2:
            z.append(v.Values[0])
            n.append(v.Values[1])
        else:
            logger.error(
                "parameters of one key less than 2: {}".format(
                    len(v.Values)))
    z = np.array(z)
    n = np.array(n)
    # Core FTRL formula. Weights default to 0 and the closed-form expression
    # is evaluated only where |z| exceeds l1, so entries clipped to zero
    # never take part in the division.
    w = np.zeros(len(z))
    active = ~(np.abs(z) <= l1)
    w[active] = (np.sign(z[active]) * l1 - z[active]) / (
        l2 + (beta + np.sqrt(n[active])) / alpha)
    y_hat = lr.fn(w, x)
    y = np.array(yBatch).reshape(len(yBatch), 1)
    first = KeyRange.Begin
    last = KeyRange.End
    # Only the gradient of this worker's own key range is needed.
    g = lr.grad(y, y_hat, x[:, first:last])
    sigma = (np.sqrt(n[first:last] + g * g) - np.sqrt(n[first:last])) / alpha
    # Only the owned key range of z and n is updated.
    z[first:last] += g - sigma * w[first:last]
    n[first:last] += g * g
    Values = []
    for i in range(first, last):
        value = Value()
        value.Values.extend([z[i], n[i]])
        Values.append(value)
    elapsed = time.time() - started
    lr.psClient.UpdateLocalRangedParameter(Values)
    lr.psClient.Push()
    return elapsed


def TrainLrWithFTRL(corpusFile, splitNum, splitIndex, epoch, batch, manager,
                    port, ParameterTotal, KeyRange, alpha, beta, l1, l2,
                    synRound):
    """Train an LR model with FTRL via the parameter-server client.

    For every epoch the corpus split is read in batches of ``batch``
    samples. Each batch triggers a parameter Pull, one FTRL update of the
    ``(z, n)`` pairs inside ``KeyRange`` and a Push of the updated range.
    Samples left over at the end of an epoch are applied as one final,
    smaller batch.

    Args:
        corpusFile: Forwarded to ``CorpusGenerator``.
        splitNum: Forwarded to ``CorpusGenerator``.
        splitIndex: Forwarded to ``CorpusGenerator``.
        epoch: Number of passes over the corpus split.
        batch: Number of samples per batch.
        manager: Forwarded to ``LR``.
        port: Forwarded to ``LR``.
        ParameterTotal: Forwarded to ``LR`` and ``CorpusGenerator``.
        KeyRange: Object whose ``Begin``/``End`` delimit the keys this
            worker updates.
        alpha: FTRL hyper-parameter (divides the per-coordinate terms).
        beta: FTRL hyper-parameter added to ``sqrt(n)``.
        l1: FTRL L1 threshold; weights with ``|z| <= l1`` are zero.
        l2: FTRL hyper-parameter added to the weight denominator.
        synRound: Every ``synRound``-th pull is remembered and waited on
            before later pulls are issued.

    Returns:
        The ``LR`` instance created for training.
    """
    begin = time.time()
    # z and n of FTRL are initialised to 0; n must never be negative.
    lr = LR(manager, port, ParameterTotal, KeyRange, init_zero, 2)
    step = 1
    waited_msg_id = 0
    use_time = []
    for ep in range(epoch):
        logger.info("epoch={}".format(ep))
        corpusGenerator = CorpusGenerator(corpusFile, splitNum, splitIndex,
                                          ParameterTotal)
        xBatch = []
        yBatch = []
        for X, Y in corpusGenerator:
            xBatch.append(X)
            yBatch.append(Y)
            if len(xBatch) >= batch:
                waited_msg_id = _ftrl_sync_pull(lr.psClient, step, synRound,
                                                waited_msg_id)
                step += 1
                use_time.append(
                    _ftrl_apply_batch(lr, xBatch, yBatch, KeyRange, alpha,
                                      beta, l1, l2))
                xBatch = []
                yBatch = []
        # np.mean of an empty list warns; report nan directly in that case.
        mean_time = np.mean(np.array(use_time)) if use_time else float("nan")
        logger.debug("update paramter {} times, mean use time {}".format(
            len(use_time), mean_time))
        if len(xBatch) > 0:
            # Leftover samples of this epoch; their timing is not recorded.
            waited_msg_id = _ftrl_sync_pull(lr.psClient, step, synRound,
                                            waited_msg_id)
            step += 1
            _ftrl_apply_batch(lr, xBatch, yBatch, KeyRange, alpha, beta, l1,
                              l2)

    logger.info(
        "train lr with ftrl finished, use {} seconds".format(time.time() -
                                                             begin))
    return lr