Example #1
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
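  """Tied-weight MNIST autoencoder trained with the module's lbfgs routine (TF eager); returns the final loss."""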
  global final_loss, W_flat
  tf.set_random_seed(seed)
  np.random.seed(seed)
  
  images = tf.constant(u.get_mnist_images(batch_size).T)
  images = images[:batch_size]
  if cuda:
    images = images.gpu()
  data = images

  if cuda:
    device='/gpu:0'
  else:
    device=''

  device_ctx = tf.device(device)
  device_ctx.__enter__()

  visible_size = 28*28
  hidden_size = 196
  initial_val = tf.zeros([visible_size*hidden_size])
  if W_flat is None:
    W_flat = tfe.Variable(initial_val, name='W_flat')
  W_flat.assign(initial_val)
  
  
  def loss_fn(w_flat):
    w = tf.reshape(w_flat, [visible_size, hidden_size])
    x = tf.matmul(data, w)
    x = tf.sigmoid(x)
    x = tf.matmul(x, w, transpose_b=True)
    x = tf.sigmoid(x)
    return tf.reduce_mean(tf.square(x-data))

  value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)
  def opfunc(x):  # returns (value, gradient)
    value, grads = value_and_gradients_fn(x)
    return value, grads[0]
      
  # initialize weights
  W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

  state = Struct()
  config = Struct()
  config.maxIter = iters
  config.nCorrection = history
  config.verbose = True
  x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)

  if verbose:
    u.summarize_time()
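  # NOTE: 'times' is assumed to be a module-level list of per-step timings,
  # populated elsewhere; Example #14 builds the same list locally via u.last_time().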

  s = ','.join(["%f" % (n,) for n in times[2:]])
  print('{', s, '}')
  
  return final_loss
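A minimal driver sketch for this benchmark (hypothetical; u, lbfgs, and Struct come from the surrounding module, and the globals below are assumptions implied by the function body):

W_flat = None      # benchmark() creates the tfe.Variable lazily on first call
final_loss = None  # set during training elsewhere in the module
times = []         # per-step timings; assumed to be populated alongside u.record_time()

loss = benchmark(batch_size=10000, iters=10, cuda=False, verbose=True)
print('final loss: %f' % loss)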
Example #2
def main():
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    images = torch.Tensor(u.get_mnist_images().T)
    images = images[:args.batch_size]
    if args.cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size,
                                     args.hidden_size,
                                     bias=False)
            self.decoder = nn.Linear(args.hidden_size,
                                     args.visible_size,
                                     bias=False)

        def forward(self, input):
            x = input.view(-1, args.visible_size)
            x = self.encoder(x)
            x = torch.sigmoid(x)
            x = self.decoder(x)
            x = torch.sigmoid(x)
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    params1, params2 = list(model.parameters())
    params1.data = torch.Tensor(
        u.ng_init(args.visible_size, args.hidden_size).T)
    params2.data = torch.Tensor(
        u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        model.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    for step in range(args.iters):
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        loss0 = loss.data[0]
        loss.backward()
        optimizer.step()

        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()

    u.summarize_time()
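The args namespace comes from elsewhere (a common_gd-style options module); a hypothetical argparse stub covering just the fields this example reads, with illustrative defaults, might look like:

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--batch_size', type=int, default=10000)
parser.add_argument('--visible_size', type=int, default=28 * 28)
parser.add_argument('--hidden_size', type=int, default=196)
parser.add_argument('--lr', type=float, default=0.2)
parser.add_argument('--iters', type=int, default=10)
parser.add_argument('--no_cuda', action='store_true')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()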
Example #3
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)

    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images

    if cuda:
        device = '/gpu:0'
    else:
        device = ''

    device_ctx = tf.device(device)
    device_ctx.__enter__()

    visible_size = 28 * 28
    hidden_size = 196
    initial_val = tf.zeros([visible_size * hidden_size])
    if W_flat is None:
        W_flat = tfe.Variable(initial_val, name='W_flat')
    W_flat.assign(initial_val)

    def loss_fn(w_flat):
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        x = tf.matmul(data, w)
        x = tf.sigmoid(x)
        x = tf.matmul(x, w, transpose_b=True)
        x = tf.sigmoid(x)
        return tf.reduce_mean(tf.square(x - data))

    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

    def opfunc(x):  # returns (value, gradient)
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]

    # initialize weights
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)

    if verbose:
        u.summarize_time()

    return final_loss
Example #4
def main():
  tf.set_random_seed(args.seed)
  np.random.seed(args.seed)
  
  images = tf.constant(u.get_mnist_images().T)
  images = images[:args.batch_size]
  if args.cuda:
    images = images.as_gpu_tensor()
  data = images

  if args.cuda:
    device='/gpu:0'
  else:
    device=''

  with tf.device(device):
    encoder = tf.layers.Dense(units=args.hidden_size, use_bias=False,
                              activation=tf.sigmoid)
    decoder = tf.layers.Dense(units=args.visible_size, use_bias=False,
                              activation=tf.sigmoid)
    def loss_fn(inputs):
      predictions = decoder(encoder(inputs))
      return tf.reduce_mean(tf.square(predictions-inputs))
    value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

    # initialize weights
    loss_fn(data)
    params1 = encoder.weights[0]
    params2 = decoder.weights[0]
    params1.assign(u.ng_init(args.visible_size, args.hidden_size))
    params2.assign(u.ng_init(args.hidden_size, args.visible_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
    for step in range(args.iters):
      value, grads_and_vars = value_and_gradients_fn(data)
      optimizer.apply_gradients(grads_and_vars)

      print("Step %3d loss %6.5f"%(step, value.numpy()))
      u.record_time()

    u.summarize_time()
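tfe.implicit_value_and_gradients belongs to the long-removed tf.contrib.eager; on TF 2.x the same training loop can be sketched with tf.GradientTape, assuming encoder and decoder are rebuilt as tf.keras.layers.Dense and loss_fn is defined as above (an equivalent under those assumptions, not the original code):

import tensorflow as tf

optimizer = tf.keras.optimizers.SGD(learning_rate=args.lr)
for step in range(args.iters):
  with tf.GradientTape() as tape:
    loss = loss_fn(data)                 # forward pass through both layers
  variables = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(grads, variables))
  print("Step %3d loss %6.5f" % (step, loss.numpy()))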
Example #5
def main():
    global fs, X, n, f, dsize, lambda_

    np.random.seed(0)
    tf.set_random_seed(0)

    train_images = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2
    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)

    losses = []
    for step in range(10):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %d loss %.2f" % (step, loss0))
        losses.append(loss0)

        Wf -= lr * kfac_grad
        if step >= 4:
            assert loss < 17.6
        u.record_time()

    u.summarize_time()
    assert losses[-1] < 0.8
    assert losses[-1] > 0.78
    assert 20e-3 < min(u.global_time_list) < 120e-3
Example #6
            args.advance_batch = 1
            args.extra_kfac_batch_advance = 1
            args.batch_size = 10000
            args.dataset = 'mnist'

        rundir = u.setup_experiment_run_directory(args.run)
        with open(rundir + '/args.txt', 'w') as f:
            f.write(json.dumps(vars(args), indent=4, separators=(',', ':')))
            f.write('\n')

    if args.dataset == 'cifar':
        # load data globally once
        from keras.datasets import cifar10
        (X_train, y_train), (X_test, y_test) = cifar10.load_data()
        X_train = X_train.astype(np.float32)
        X_train = X_train.reshape((X_train.shape[0], -1))
        X_test = X_test.astype(np.float32)
        X_test = X_test.reshape((X_test.shape[0], -1))
        X_train /= 255
        X_test /= 255

        # todo: rename to better names
        train_images = X_train.T  # batch first
        test_images = X_test.T
    elif args.dataset == 'mnist':
        train_images = u.get_mnist_images('train')
        test_images = u.get_mnist_images('test')
        train_images = train_images[:, :args.dataset_size]  # batch first

    main()
Example #7
def train(optimizer='sgd',
          nonlin=torch.sigmoid,
          kfac=True,
          iters=10,
          lr=0.2,
          newton_matrix='stochastic',
          eval_every_n_steps=1,
          print_interval=200):
    """Train on first 10k MNIST examples, evaluate on second 10k."""

    u.reset_time()
    dsize = 10000

    # model options
    dtype = np.float32
    torch_dtype = 'torch.FloatTensor'

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch_dtype = 'torch.cuda.FloatTensor'

    INVERSE_METHOD = 'numpy'  # numpy, gpu

    As = []
    Bs = []
    As_inv = []
    Bs_inv = []
    mode = 'capture'  # 'capture', 'kfac', 'standard'

    class KfacAddmm(Function):
        @staticmethod
        def _get_output(ctx, arg, inplace=False):
            if inplace:
                ctx.mark_dirty(arg)
                return arg
            else:
                return arg.new().resize_as_(arg)

        @staticmethod
        def forward(ctx,
                    add_matrix,
                    matrix1,
                    matrix2,
                    beta=1,
                    alpha=1,
                    inplace=False):
            ctx.save_for_backward(matrix1, matrix2)
            output = KfacAddmm._get_output(ctx, add_matrix, inplace=inplace)
            return torch.addmm(beta,
                               add_matrix,
                               alpha,
                               matrix1,
                               matrix2,
                               out=output)

        @staticmethod
        def backward(ctx, grad_output):
            matrix1, matrix2 = ctx.saved_variables
            grad_matrix1 = grad_matrix2 = None

            if mode == 'capture':
                Bs.insert(0, grad_output.data)
                As.insert(0, matrix2.data)
            elif mode == 'kfac':
                B = grad_output.data
                A = matrix2.data
                kfac_A = As_inv.pop() @ A
                kfac_B = Bs_inv.pop() @ B
                grad_matrix1 = Variable(torch.mm(kfac_B, kfac_A.t()))
            elif mode == 'standard':
                grad_matrix1 = torch.mm(grad_output, matrix2.t())
            else:
                assert False, 'unknown mode ' + mode

            if ctx.needs_input_grad[2]:
                grad_matrix2 = torch.mm(matrix1.t(), grad_output)

            return None, grad_matrix1, grad_matrix2, None, None, None

    def kfac_matmul(mat1, mat2):
        output = Variable(mat1.data.new(mat1.data.size(0), mat2.data.size(1)))
        return KfacAddmm.apply(output, mat1, mat2, 0, 1, True)

    torch.manual_seed(1)
    np.random.seed(1)
    if use_cuda:
        torch.cuda.manual_seed(1)

    # feature sizes at each layer
    fs = [dsize, 28 * 28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28 * 28]
    n = len(fs) - 2  # number of matmuls

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)

    model = Net()

    if use_cuda:
        model.cuda()

    images = u.get_mnist_images()
    train_data0 = images[:, :dsize].astype(dtype)
    train_data = Variable(torch.from_numpy(train_data0))
    test_data0 = images[:, dsize:2 * dsize].astype(dtype)
    test_data = Variable(torch.from_numpy(test_data0))
    if use_cuda:
        train_data = train_data.cuda()
        test_data = test_data.cuda()

    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer ' + optimizer

    noise = torch.Tensor(*train_data.data.shape).type(torch_dtype)
    assert fs[-1] <= dsize
    padding = dsize - fs[-1]
    zero_mat = torch.zeros((fs[-1], padding))
    frozen = torch.cat([torch.eye(fs[-1]), zero_mat], 1).type(torch_dtype)

    covA_inv_saved = [None] * n
    losses = []
    vlosses = []

    for step in range(iters):
        mode = 'standard'
        output = model(train_data)

        if kfac:
            mode = 'capture'
            optimizer.zero_grad()
            del As[:], Bs[:], As_inv[:], Bs_inv[:]

            if newton_matrix == 'stochastic':
                noise.normal_()
                err_add = noise
            elif newton_matrix == 'exact':
                err_add = frozen
            else:
                assert False, 'unknown method for newton matrix ' + newton_matrix

            output_hat = Variable(output.data + err_add)
            err_hat = output_hat - output

            loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
            loss_hat.backward(retain_graph=True)

            # compute inverses
            for i in range(n):
                # first layer activations don't change, only compute once
                if i == 0 and covA_inv_saved[i] is not None:
                    covA_inv = covA_inv_saved[i]
                else:
                    covA_inv = regularized_inverse(As[i] @ As[i].t() / dsize)
                    covA_inv_saved[i] = covA_inv
                As_inv.append(covA_inv)

                covB = (Bs[i] @ Bs[i].t()) * dsize
                # alternative formula: slower but numerically better result
                # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize

                covB_inv = regularized_inverse(covB)
                Bs_inv.append(covB_inv)
            mode = 'kfac'

        else:
            mode = 'standard'

        if step % eval_every_n_steps == 0:
            old_mode = mode
            mode = 'standard'
            test_output = model(test_data)
            test_err = test_data - test_output
            test_loss = torch.sum(test_err * test_err) / 2 / dsize
            vloss0 = test_loss.data.cpu().numpy()[0]
            vlosses.append(vloss0)
            mode = old_mode

        optimizer.zero_grad()
        err = output - train_data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if step % print_interval == 0:
            print("Step %3d loss %10.9f" % (step, loss0))

        u.record_time()

    return losses, vlosses
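A usage sketch for this trainer (hypothetical driver code):

losses, vlosses = train(optimizer='sgd', kfac=True, iters=10, lr=0.2,
                        newton_matrix='stochastic')
print('final train loss %f, test loss %f' % (losses[-1], vlosses[-1]))

Example #8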
import numpy as np
import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

tfe.enable_eager_execution()

import common_gd

args = common_gd.args
args.cuda = not args.no_cuda and tfe.num_gpus() > 0

# for line profiling
try:
    profile  # throws an exception when profile isn't defined
except NameError:
    profile = lambda x: x  # if it's not defined simply ignore the decorator.

train_images = u.get_mnist_images()
dsize = 10000
fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
lambda_ = 3e-3


def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]


n = len(fs) - 2

dtype = np.float32
tf_dtype = tf.float32

identity_cache = {}
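Example #9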
def train(optimizer='sgd', kfac=True, iters=10, verbose=True):
  global mode
  
  torch.manual_seed(1)
  np.random.seed(1)
  if args.cuda:
    torch.cuda.manual_seed(1)

  # feature sizes at each layer
  fs = [dsize, 28*28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28*28]
  n = len(fs) - 2   # number of matmuls

  class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      for i in range(1, n+1):
        W0 = u.ng_init(fs[i+1], fs[i])
        setattr(self, 'W'+str(i), nn.Parameter(torch.from_numpy(W0)))

    def forward(self, input):
      x = input.view(fs[1], -1)
      for i in range(1, n+1):
        W = getattr(self, 'W'+str(i))
        x = nonlin(kfac_matmul(W, x))
      return x.view_as(input)

  model = Net()

  if args.cuda:
    model.cuda()

  data0 = u.get_mnist_images()
  data0 = data0[:, :dsize].astype(dtype)
  data = Variable(torch.from_numpy(data0))
  if args.cuda:
    data = data.cuda()

  model.train()
  if optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=lr)
  elif optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=lr)
  else:
    assert False, 'unknown optimizer '+optimizer
    
  noise = torch.Tensor(*data.data.shape).type(torch_dtype)
  covA_inv_saved = [None]*n
  losses = []
  
  for step in range(iters):
    mode = 'standard'
    output = model(data)
    
    mode = 'capture'
    optimizer.zero_grad()
    del As[:], Bs[:], As_inv[:], Bs_inv[:]
    noise.normal_()

    output_hat = Variable(output.data+noise)
    err_hat = output_hat - output
    loss_hat = torch.sum(err_hat*err_hat)/2/dsize
    loss_hat.backward(retain_graph=True)
    
    # compute inverses
    for i in range(n):
      # first layer activations don't change, only compute once
      if i == 0 and covA_inv_saved[i] is not None:
        covA_inv = covA_inv_saved[i]
      else:
        covA_inv = regularized_inverse(As[i] @ As[i].t()/dsize)
        covA_inv_saved[i] = covA_inv
      As_inv.append(covA_inv)

      covB = (Bs[i]@Bs[i].t())*dsize
      # alternative formula: slower but numerically better result
      # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize
      
      covB_inv = regularized_inverse(covB)
      Bs_inv.append(covB_inv)

    if kfac:
      mode = 'kfac'
    else:
      mode = 'standard'
    optimizer.zero_grad()
    err = output - data
    loss = torch.sum(err*err)/2/dsize
    loss.backward()
    optimizer.step()
    
    loss0 = loss.data.cpu().numpy()[0]
    losses.append(loss0)
    if verbose:
      print("Step %3d loss %10.9f"%(step, loss0))
    u.record_time()

  return losses  
Example #10
def main():
    #  global forward, backward, DO_PRINT
    global mode, covA_inv, covB_inv

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [args.batch_size, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2

    # todo, move to more elegant backprop
    matmul = kfac_matmul

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            x = input.view(784, -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()

    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    losses = []

    covA = [None] * n
    covA_inv = [None] * n
    covB_inv = [None] * n

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)

    # TODO:
    # only do 2 passes like in eager mode
    # integrate with optimizer/same results
    # scale to deep autoencoder
    for step in range(10):
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize

        loss.backward(retain_graph=True)
        backward.reverse()

        loss0 = loss.data[0]

        A = forward[:]
        B = backward[:]
        assert len(B) == n

        del forward[:]
        del backward[:]

        noise.normal_()
        synthetic_data = Variable(output.data + noise)

        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        loss2.backward()
        B2 = backward[::-1]
        assert len(B2) == n

        # mode = 'kfac'

        # compute whitened gradient
        pre_dW = []
        for i in range(n):
            # only compute first activation once
            if i > 0:
                covA[i] = A[i] @ t(A[i]) / dsize
                covA_inv[i] = regularized_inverse(covA[i])
            else:
                if covA[i] is None:
                    covA[i] = A[i] @ t(A[i]) / dsize
                    covA_inv[i] = regularized_inverse(covA[i])

            covB2 = B2[i] @ t(B2[i]) / dsize
            covB = B[i] @ t(B[i]) / dsize  # todo: remove

            covB_inv[i] = regularized_inverse(covB2.data)

            whitened_A = covA_inv[i] @ A[i]
            whitened_B = covB_inv[i] @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)

        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]

        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    loss0 = loss.data.cpu().numpy()  #[0]
    target = 2.360062122

    if 'Apple' in sys.version:
        target = 2.360126972
        target = 2.335654736  # after changing to torch.randn
    if args.cuda:
        target = 2.337174654
        target = 2.337215662  # switching to numpy inverse

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
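The helpers t and regularized_inverse are defined elsewhere in the module; minimal versions consistent with how they are called here, reusing the lambda_ = 3e-3 Tikhonov damping the module sets up, might be (a sketch under those assumptions):

import torch

def t(mat):
    # 2-D transpose shorthand
    return mat.t()

def regularized_inverse(cov, lambda_=3e-3):
    # damped inverse (cov + lambda*I)^-1 keeps the K-FAC factors well conditioned
    eye = torch.eye(cov.shape[0]).type_as(cov)
    return torch.inverse(cov + lambda_ * eye)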
Example #11
def main():
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32

    train_images = u.get_mnist_images()

    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            assert u.is_numeric(val), "Unknown type"
            holder = tf.placeholder(dtype,
                                    shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")

    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i],
                                     "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],
                                      "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)

    loss = reconstruction

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()

    from tensorflow.core.protobuf import rewriter_config_pb2

    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)

    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()

    for step in range(40):
        update_covariances()
        update_svds()

        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()

        losses.append(loss0)
        step_lengths.append(lr0)

        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def main():
    global mode

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # number of layers
    n = len(fs) - 2

    matmul = kfac_matmul

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # W1 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
            # W2 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
            # self.W1 = nn.Parameter(torch.from_numpy(W1))
            # self.W2 = nn.Parameter(torch.from_numpy(W2))
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()

    if args.cuda:
        model.cuda()

    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    covA_inv_saved = [None] * n

    for step in range(10):
        mode = 'standard'
        output = model(data)

        mode = 'capture'
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        del forward_inv[:]
        del backward_inv[:]
        noise.normal_()
        output_hat = Variable(output.data + noise)
        output = model(data)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
        loss_hat.backward(retain_graph=True)

        backward.reverse()
        forward.reverse()
        assert len(backward) == n
        assert len(forward) == n
        A = forward[:]
        B = backward[:]

        # compute inverses
        for i in range(n):
            # first layer doesn't change so only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(A[i] @ t(A[i]) / dsize)
                covA_inv_saved[i] = covA_inv
            forward_inv.append(covA_inv)

            covB_inv = regularized_inverse(B[i] @ t(B[i]) / dsize)
            backward_inv.append(covB_inv)

        mode = 'kfac'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    if args.cuda:
        target = 2.337120533
    else:
        target = 2.335612774

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
Example #13
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
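    """Tied-weight MNIST autoencoder trained with torch.optim.LBFGS; returns the final loss."""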
    global step, final_loss

    step = 0
    final_loss = None

    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    visible_size = 28 * 28
    hidden_size = 196

    images = torch.Tensor(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, input):
            x = input.view(-1, visible_size)
            x = torch.sigmoid(torch.mm(x, self.encoder))
            x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        model.cuda()

    model.train()
    optimizer = optim.LBFGS(model.parameters(),
                            max_iter=iters,
                            history_size=100,
                            lr=1.0)

    def closure():
        global step, final_loss
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        if verbose:
            loss0 = loss.data[0]
            print("Step %3d loss %6.5f msec %6.3f" %
                  (step, loss0, u.last_time()))
        step += 1
        if step == iters:
            final_loss = loss.data[0]
        loss.backward()
        u.record_time()
        return loss

    optimizer.step(closure)

    output = model(data)
    loss = F.mse_loss(output, data)
    loss0 = loss.data[0]

    if verbose:
        u.summarize_time()

    return final_loss
Example #14
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
  global step, final_loss
  
  step = 0
  final_loss = None

  torch.manual_seed(seed)
  np.random.seed(seed)
  if cuda:
    torch.cuda.manual_seed(seed)

  visible_size = 28*28
  hidden_size = 196
  
  images = torch.Tensor(u.get_mnist_images(batch_size).T)
  images = images[:batch_size]
  if cuda:
    images = images.cuda()
  data = Variable(images)

  class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

    def forward(self, input):
      x = input.view(-1, visible_size)
      x = torch.sigmoid(torch.mm(x, self.encoder))
      x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
      return x.view_as(input)

  # initialize model and weights
  model = Net()
  model.encoder.data = torch.Tensor(u.ng_init(visible_size,
                                              hidden_size))
  if cuda:
    model.cuda()
  
  model.train()
  optimizer = optim.LBFGS(model.parameters(), max_iter=iters, history_size=history, lr=1.0)

  times = []
  def closure():
    global step, final_loss
    optimizer.zero_grad()
    output = model(data)
    loss = F.mse_loss(output, data)
    if verbose:
      loss0 = loss.data[0]
      times.append(u.last_time())
      print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
    step+=1
    if step == iters:
      final_loss = loss.data[0]
    loss.backward()
    u.record_time()
    return loss
  
  optimizer.step(closure)

  output = model(data)
  loss = F.mse_loss(output, data)
  loss0 = loss.data[0]

  if verbose:
    u.summarize_time()

    #  print(times)
  s = ','.join(["%f" % (n,) for n in times[2:]])
  print('{', s, '}')
  
  return final_loss