Example No. 1
def debug_log_sgd_losses(sgd_type, losses, epoch, n=20):
    if False:
        # logging disabled -- flip this branch to True for production runs
        return
    elif True:
        # minimal info: mean of the last (at most n) epoch losses
        logger.debug("[%s] epochs: %d; avg last %d losses:%f" %
                     (sgd_type, epoch, n,
                      np.mean(losses[(epoch - min(n, epoch)):epoch])))
    else:
        # maximum info: also dump the first and last (at most n) epoch losses
        logger.debug("[%s] epochs: %d; avg last %d losses:%f\n%s\n%s" %
                     (sgd_type, epoch, n,
                      np.mean(losses[(epoch - min(n, epoch)):epoch]),
                      str(list(losses[0:min(n, epoch)])),
                      str(list(losses[(epoch - min(n, epoch)):epoch]))))
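The helper above relies on module-level np and logger names that are not shown in the excerpt. A minimal, assumed setup that makes the debug output visible might look like this (the logger configuration is illustrative, not part of the original module):

import logging
import numpy as np

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# With DEBUG enabled, the "minimal info" branch prints the mean of the
# last n (at most `epoch`) recorded epoch losses.
debug_log_sgd_losses("sgd", np.linspace(1.0, 0.1, 50), epoch=50, n=20)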
Example No. 2
def sgd(w0,
        x,
        y,
        f,
        grad,
        learning_rate=0.01,
        batch_size=100,
        max_epochs=1000,
        eps=1e-6,
        shuffle=False,
        rng=None):
    n = x.shape[0]
    n_batches = get_num_batches(n, batch_size)
    w = np.copy(w0)
    epoch_losses = np.zeros(max_epochs, dtype=float)
    epoch = 0
    w_best = np.copy(w0)
    loss_best = np.inf
    if n <= batch_size:
        shuffle = False  # no need to shuffle since all instances will be used up in one batch
    if shuffle:
        shuffled_idxs = np.arange(n)
        if rng is None:
            np.random.shuffle(shuffled_idxs)
        else:
            rng.shuffle(shuffled_idxs)
    else:
        shuffled_idxs = None
    while epoch < max_epochs:
        losses = np.zeros(n_batches, dtype=float)
        for i in range(n_batches):
            xi, yi = get_sgd_batch(x,
                                   y,
                                   i,
                                   batch_size,
                                   shuffled_idxs=shuffled_idxs)
            if xi.shape[0] == 0:
                raise ValueError("Batch size of 0")
            g = grad(w, xi, yi)
            w -= learning_rate * g
            losses[i] = f(w, xi, yi)
            if False:
                # disabled diagnostic: flag nan/inf gradients per batch
                g_norm = g.dot(g)
                if np.isnan(g_norm) or np.isinf(g_norm):
                    logger.debug("|grad|=%f, i=%d/%d, epoch:%d" %
                                 (g.dot(g), i + 1, n_batches, epoch))
                    logger.debug("|w0|=%f" % w0.dot(w0))
                    raise ArithmeticError("grad is nan/inf in sgd")
        loss = np.mean(losses)
        if np.isnan(loss):
            logger.debug("loss is nan")
            logger.debug("|w|=%f" % w.dot(w))
            raise ArithmeticError("loss is nan in sgd")
        epoch_losses[epoch] = loss
        if loss < loss_best:
            # pocket algorithm
            np.copyto(w_best, w)
            loss_best = loss
        epoch += 1
        if loss < eps:
            break
    debug_log_sgd_losses("sgd", epoch_losses, epoch, n=20)
    # logger.debug("epochs: %d" % epoch)
    # logger.debug("net losses:")
    # logger.debug("epoch losses:\n%s" % str(epoch_losses[0:epoch]))
    # logger.debug("best loss: %f" % loss_best)
    return w_best
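A minimal sketch of calling sgd for least-squares linear regression. The loss and gradient closures and the synthetic data below are illustrative (not part of the original module), and the call assumes the helpers get_num_batches and get_sgd_batch referenced by sgd are importable from the same module:

import numpy as np

def squared_loss(w, x, y):
    # mean squared error over the batch
    r = x.dot(w) - y
    return 0.5 * np.mean(r * r)

def squared_loss_grad(w, x, y):
    # gradient of the mean squared error w.r.t. w
    r = x.dot(w) - y
    return x.T.dot(r) / x.shape[0]

rng = np.random.RandomState(42)
x = rng.randn(500, 3)
w_true = np.array([1.0, -2.0, 0.5])
y = x.dot(w_true) + 0.01 * rng.randn(500)

w_fit = sgd(np.zeros(3, dtype=float), x, y, squared_loss, squared_loss_grad,
            learning_rate=0.1, batch_size=100, max_epochs=500,
            shuffle=True, rng=rng)
# w_fit should end up close to w_true for this well-conditioned problem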
Example No. 3
def sgdAdam(w0,
            x,
            y,
            f,
            grad,
            learning_rate=0.01,
            batch_size=100,
            max_epochs=1000,
            delta=1e-8,
            ro1=0.9,
            ro2=0.999,
            eps=1e-6,
            shuffle=False,
            rng=None):
    n = x.shape[0]
    n_batches = get_num_batches(n, batch_size)
    w = np.copy(w0)
    s = np.zeros(len(w0), dtype=w0.dtype)  # first moment variable
    s_hat = np.zeros(len(w0),
                     dtype=w0.dtype)  # first moment corrected for bias
    r = np.zeros(len(w0), dtype=w0.dtype)  # second moment variable
    r_hat = np.zeros(len(w0),
                     dtype=w0.dtype)  # second moment corrected for bias
    t = 0  # time step
    epoch_losses = np.zeros(max_epochs, dtype=float)
    epoch = 0
    w_best = np.copy(w0)
    loss_best = np.inf
    if n <= batch_size:
        # no need to shuffle since all instances will be used up in one batch
        shuffle = False
    if shuffle:
        shuffled_idxs = np.arange(n)
        if rng is None:
            np.random.shuffle(shuffled_idxs)
        else:
            rng.shuffle(shuffled_idxs)
    else:
        shuffled_idxs = None
    prev_loss = np.inf
    while epoch < max_epochs:
        losses = np.zeros(n_batches, dtype=float)
        for i in range(n_batches):
            xi, yi = get_sgd_batch(x,
                                   y,
                                   i,
                                   batch_size,
                                   shuffled_idxs=shuffled_idxs)
            g = grad(w, xi, yi)
            t += 1
            s[:] = ro1 * s + (1 - ro1) * g
            r[:] = ro2 * r + (1 - ro2) * np.multiply(g, g)
            # correct bias in first moment
            s_hat[:] = (1. / (1 - ro1**t)) * s
            # correct bias in second moment
            r_hat[:] = (1. / (1 - ro2**t)) * r
            dw_scale = (learning_rate / (np.sqrt(delta + r_hat)))
            dw = np.multiply(dw_scale, s_hat)
            w[:] = w - dw
            losses[i] = f(w, xi, yi)
        loss = np.mean(losses)
        if np.isnan(loss):
            logger.debug("loss is nan")
            logger.debug("|w|=%f" % w.dot(w))
            raise ArithmeticError("loss is nan in sgdAdam")
        epoch_losses[epoch] = loss
        if loss < loss_best:
            # pocket algorithm
            np.copyto(w_best, w)
            loss_best = loss
        epoch += 1
        if (loss < eps or np.abs(loss - prev_loss) < eps
                or avg_loss_check(epoch_losses, epoch, n=20, eps=eps)):
            break
        prev_loss = loss
    debug_log_sgd_losses("sgdAdam", epoch_losses, epoch, n=20)
    # logger.debug("epochs: %d" % epoch)
    # logger.debug("net losses:")
    # logger.debug("epoch losses:\n%s" % str(epoch_losses[0:epoch]))
    # logger.debug("best loss: %f" % loss_best)
    return w_best
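sgdAdam is called the same way as sgd, with the additional decay rates ro1/ro2 and the stabilizer delta. The bias-correction step is the part most easily misread; the small standalone check below (not from the original code) shows that with a constant gradient the corrected first moment s_hat equals the gradient at every step, while the raw moment s is still warming up toward it:

import numpy as np

g = np.array([1.0, -2.0])     # pretend the gradient is constant
ro1 = 0.9
s = np.zeros_like(g)
for t in range(1, 4):
    s = ro1 * s + (1 - ro1) * g      # raw first moment, biased toward 0
    s_hat = s / (1 - ro1 ** t)       # bias-corrected first moment
    print(t, s, s_hat)               # s_hat stays at [1.0, -2.0] every step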
Example No. 4
def sgdRMSPropNestorov(w0,
                       x,
                       y,
                       f,
                       grad,
                       learning_rate=0.01,
                       batch_size=100,
                       max_epochs=1000,
                       alpha=0.9,
                       delta=1e-6,
                       ro=0.9,
                       eps=1e-6,
                       shuffle=False,
                       rng=None):
    n = x.shape[0]
    n_batches = get_num_batches(n, batch_size)
    w = np.copy(w0)
    v = np.zeros(len(w0), dtype=w0.dtype)  # velocity
    r = np.zeros(len(w0), dtype=w0.dtype)  # gradient accumulation variable
    epoch_losses = np.zeros(max_epochs, dtype=float)
    epoch = 0
    w_best = np.copy(w0)
    loss_best = np.inf
    if n <= batch_size:
        # no need to shuffle since all instances will be used up in one batch
        shuffle = False
    if shuffle:
        shuffled_idxs = np.arange(n)
        if rng is None:
            np.random.shuffle(shuffled_idxs)
        else:
            rng.shuffle(shuffled_idxs)
    else:
        shuffled_idxs = None
    prev_loss = np.inf
    while epoch < max_epochs:
        losses = np.zeros(n_batches, dtype=float)
        for i in range(n_batches):
            xi, yi = get_sgd_batch(x,
                                   y,
                                   i,
                                   batch_size,
                                   shuffled_idxs=shuffled_idxs)
            tw = w + alpha * v
            g = grad(tw, xi, yi)
            r[:] = ro * r + (1 - ro) * np.multiply(g, g)
            dw_scale = (learning_rate / (np.sqrt(delta + r)))
            v = alpha * v - np.multiply(dw_scale, g)
            w[:] = w + v
            losses[i] = f(w, xi, yi)
        loss = np.mean(losses)
        if np.isnan(loss):
            logger.debug("loss is nan")
            logger.debug("|w|=%f" % w.dot(w))
            raise ArithmeticError("loss is nan in sgdRMSPropNestorov")
        epoch_losses[epoch] = loss
        if loss < loss_best:
            # pocket algorithm
            np.copyto(w_best, w)
            loss_best = loss
        epoch += 1
        if (loss < eps or np.abs(loss - prev_loss) < eps
                or avg_loss_check(epoch_losses, epoch, n=20, eps=eps)):
            break
        prev_loss = loss
    debug_log_sgd_losses("sgdRMSPropNestorov", epoch_losses, epoch, n=20)
    # logger.debug("epochs: %d" % epoch)
    # logger.debug("net losses:")
    # logger.debug("epoch losses:\n%s" % str(epoch_losses[0:epoch]))
    # logger.debug("best loss: %f" % loss_best)
    return w_best
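The RMSProp-with-Nesterov variant differs from plain sgd in two ways: the gradient is evaluated at the look-ahead point w + alpha * v, and the step is rescaled by the accumulated squared gradients r. A usage sketch on the same illustrative regression problem as in the sketch after Example No. 2, again assuming the module's batching and avg_loss_check helpers are importable:

import numpy as np

# Reuses squared_loss, squared_loss_grad, and the synthetic (x, y) data
# defined in the sketch after Example No. 2.
rng = np.random.RandomState(0)
w_fit = sgdRMSPropNestorov(np.zeros(3, dtype=float), x, y,
                           squared_loss, squared_loss_grad,
                           learning_rate=0.01, batch_size=100,
                           max_epochs=500, alpha=0.9, ro=0.9,
                           shuffle=True, rng=rng)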