def main():
    """Train a two-layer sigmoid autoencoder on MNIST with plain SGD.

    Hyperparameters come from the module-level ``args`` namespace (seed,
    cuda, batch_size, visible_size, hidden_size, lr, iters); per-step loss
    and wall-time are reported through the ``u`` helper module.
    """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # u.get_mnist_images() apparently returns (features, examples); the
    # transpose makes rows be examples -- TODO confirm against helper.
    images = torch.Tensor(u.get_mnist_images().T)
    images = images[:args.batch_size]
    if args.cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        """Untied sigmoid encoder/decoder pair (no biases)."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size, args.hidden_size,
                                     bias=False)
            self.decoder = nn.Linear(args.hidden_size, args.visible_size,
                                     bias=False)

        def forward(self, input):
            x = input.view(-1, args.visible_size)
            # torch.sigmoid replaces the deprecated F.sigmoid.
            x = torch.sigmoid(self.encoder(x))
            x = torch.sigmoid(self.decoder(x))
            return x.view_as(input)

    # Initialize model weights; nn.Linear stores (out, in), hence the .T on
    # the (in, out)-shaped init matrices.
    model = Net()
    params1, params2 = list(model.parameters())
    params1.data = torch.Tensor(
        u.ng_init(args.visible_size, args.hidden_size).T)
    params2.data = torch.Tensor(
        u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        model.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    for step in range(args.iters):
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        # .item() replaces the removed pre-0.4 idiom loss.data[0], which
        # raises an IndexError on 0-dim tensors in modern PyTorch.
        loss0 = loss.item()
        loss.backward()
        optimizer.step()
        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()
    u.summarize_time()
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Benchmark L-BFGS on a tied-weight MNIST autoencoder (TF eager).

    Reuses the module-global ``W_flat`` variable across calls so repeated
    benchmarks don't re-create it; returns the module-global ``final_loss``,
    which is presumably set inside ``lbfgs`` -- verify.
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)
    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images
    if cuda:
        device='/gpu:0'
    else:
        device=''
    # NOTE(review): the device context is entered but never exited; it stays
    # active for the remainder of the process.
    device_ctx = tf.device(device)
    device_ctx.__enter__()
    visible_size = 28*28
    hidden_size = 196
    initial_val = tf.zeros([visible_size*hidden_size])
    # Create the flat weight vector once, then reset it to zeros on every
    # call so each benchmark starts from identical state.
    if W_flat is None:
        W_flat = tfe.Variable(initial_val, name='W_flat')
    W_flat.assign(initial_val)

    def loss_fn(w_flat):
        # Tied weights: the decoder matrix is the transpose of the encoder.
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        x = tf.matmul(data, w)
        x = tf.sigmoid(x)
        x = tf.matmul(x, w, transpose_b=True)
        x = tf.sigmoid(x)
        return tf.reduce_mean(tf.square(x-data))

    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

    def opfunc(x):  # returns (value, gradient)
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]

    # initialize weights
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())
    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.nCorrection = history  # L-BFGS history length
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)
    if verbose:
        u.summarize_time()
    # NOTE(review): ``times`` is not defined in this function; presumably a
    # module-level list filled by the timing helpers -- verify, otherwise
    # this line raises NameError. First two entries skipped as warm-up.
    s = ','.join(["%f"%(n,) for n in times[2:]])
    print('{', s,'}')
    return final_loss
def __init__(self):
    """Register one weight parameter per layer, initialized via u.ng_init."""
    super(Net, self).__init__()
    # Layer i gets a parameter named 'Wi' of shape (fs[i+1], fs[i]).
    for layer in range(1, n + 1):
        init = u.ng_init(fs[layer + 1], fs[layer])
        setattr(self, 'W%d' % layer, nn.Parameter(torch.from_numpy(init)))
def main():
    """Train a sigmoid autoencoder on MNIST with eager-mode gradient descent."""
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    mnist = tf.constant(u.get_mnist_images().T)
    data = mnist[:args.batch_size]
    if args.cuda:
        data = data.as_gpu_tensor()

    device = '/gpu:0' if args.cuda else ''
    with tf.device(device):
        encoder = tf.layers.Dense(units=args.hidden_size, use_bias=False,
                                  activation=tf.sigmoid)
        decoder = tf.layers.Dense(units=args.visible_size, use_bias=False,
                                  activation=tf.sigmoid)

        def loss_fn(inputs):
            # reconstruction MSE of decode(encode(x)) against x
            return tf.reduce_mean(tf.square(decoder(encoder(inputs)) - inputs))

        value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

        # One forward pass forces both layers to build their weight
        # variables, which are then overwritten with the ng initialization.
        loss_fn(data)
        encoder.weights[0].assign(u.ng_init(args.visible_size, args.hidden_size))
        decoder.weights[0].assign(u.ng_init(args.hidden_size, args.visible_size))

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
        for step in range(args.iters):
            value, grads_and_vars = value_and_gradients_fn(data)
            optimizer.apply_gradients(grads_and_vars)
            print("Step %3d loss %6.5f" % (step, value.numpy()))
            u.record_time()
        u.summarize_time()
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Benchmark L-BFGS (default history) on a tied-weight MNIST autoencoder.

    Reuses the module-global ``W_flat`` across calls; returns the
    module-global ``final_loss``, presumably set inside ``lbfgs`` -- verify.
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)
    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images
    if cuda:
        device = '/gpu:0'
    else:
        device = ''
    # NOTE(review): device context entered but never exited; it stays active
    # for the remainder of the process.
    device_ctx = tf.device(device)
    device_ctx.__enter__()
    visible_size = 28 * 28
    hidden_size = 196
    initial_val = tf.zeros([visible_size * hidden_size])
    # Create the flat weight vector once, then zero it on every call so each
    # benchmark starts from identical state.
    if W_flat is None:
        W_flat = tfe.Variable(initial_val, name='W_flat')
    W_flat.assign(initial_val)

    def loss_fn(w_flat):
        # Tied weights: the decoder matrix is the transpose of the encoder.
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        x = tf.matmul(data, w)
        x = tf.sigmoid(x)
        x = tf.matmul(x, w, transpose_b=True)
        x = tf.sigmoid(x)
        return tf.reduce_mean(tf.square(x - data))

    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

    def opfunc(x):  # returns (value, gradient)
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]

    # initialize weights
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())
    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)
    if verbose:
        u.summarize_time()
    return final_loss
def main():
    """Short KFAC regression test: ten preconditioned steps on MNIST, with
    hard bounds on the final loss and on per-step wall time."""
    global fs, X, n, f, dsize, lambda_
    np.random.seed(0)
    tf.set_random_seed(0)

    mnist = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3

    def f(i):
        # W[i] has shape f[i] x f[i-1]
        return fs[i + 1]

    n = len(fs) - 2
    X = tf.constant(mnist[:, :dsize].astype(dtype))

    # Flatten the two initial weight matrices into one parameter vector.
    w_first = u.ng_init(fs[2], fs[3])
    w_second = u.ng_init(fs[3], fs[2])
    Wf = tf.constant(u.flatten([w_first.flatten(), w_second.flatten()]))
    assert Wf.dtype == tf.float32

    lr = tf.constant(0.2)
    losses = []
    for step in range(10):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        current = loss.numpy()
        print("Step %d loss %.2f" % (step, current))
        losses.append(current)
        Wf -= lr * kfac_grad
        if step >= 4:
            # past warm-up the loss must already have dropped
            assert loss < 17.6
        u.record_time()
    u.summarize_time()

    # regression bounds on the final loss and per-step timing
    assert 0.78 < losses[-1] < 0.8
    assert 20e-3 < min(u.global_time_list) < 120e-3
def main():
    """Run 40 KFAC-preconditioned gradient steps and sanity-check the loss
    curve and step timing.

    NOTE(review): ``train_images``, ``dsize``, ``fs``, ``dtype`` and
    ``loss_and_grad`` are not defined in this function; presumably they are
    module-level globals initialized elsewhere -- verify before running.
    """
    global fs, X, n, f, dsize, lambda_
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    if args.cuda:
        device = '/gpu:0'
    else:
        device = '/cpu:0'
    # NOTE(review): device context entered but never exited; it stays active
    # for the remainder of the process.
    device_context = tf.device(device)
    device_context.__enter__()
    X = tf.constant(train_images[:, :dsize].astype(dtype))
    # Initial weights for the two layers, flattened into one vector.
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0, W1_0])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)
    losses = []
    for step in range(40):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        losses.append(loss0)
        # update with the preconditioned gradient; plain ``grad`` is unused
        Wf -= lr * kfac_grad
        if step >= 4:
            # past warm-up the loss must already have dropped
            assert loss < 17.6
        u.record_time()
    u.summarize_time()
    # regression bounds on final loss and per-step wall time
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 30ms on 1080"
def __init__(self):
    """Create per-layer weight parameters W1..Wn via u.ng_init."""
    super(Net, self).__init__()
    for idx in range(1, n + 1):
        # parameter 'Widx' has shape (fs[idx+1], fs[idx])
        weights = u.ng_init(fs[idx + 1], fs[idx])
        setattr(self, 'W%d' % idx, nn.Parameter(torch.from_numpy(weights)))
def main():
    """Graph-mode KFAC training of a sparse-autoencoder-style MNIST net.

    Builds a 2-layer sigmoid autoencoder graph, whitens activations and
    synthetic backprops via SVD-based covariance inverses (natural
    gradient / KFAC), runs 40 preconditioned steps in an InteractiveSession,
    and asserts regression bounds on the final loss and step timing.
    """
    np.random.seed(0)
    tf.set_random_seed(0)
    dtype = np.float32
    train_images = u.get_mnist_images()
    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]
    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3  # covariance damping
    # NOTE(review): rho/beta are the UFLDL sparsity hyperparameters but the
    # sparsity penalty is not part of the loss below -- apparently vestigial.
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)  # fs[0], i.e. unchanged (10000)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        # TF-tensor initial values become variables whose .initializer
        # re-evaluates the tensor; numpy values are fed via a placeholder.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype, shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        # keep an assign op + placeholder around for later manual setting
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)  # W[0] is the data matrix, not a weight

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        # derivative of sigmoid expressed in terms of its output y
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)
    # A[0] is guarded by a failing Print op: nothing should ever evaluate it.
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    # mean hidden activation per unit; unused by the loss below (sparsity
    # penalty apparently removed) -- NOTE(review)
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW
    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # damped covariance of activations, held in a variable so it only
        # refreshes when its initializer is re-run
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        # KFAC update: whiten activations and backprops with the inverse
        # covariances, then form the outer product
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    loss = reconstruction
    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")
    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))
        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        # re-running the initializers re-evaluates the live covariance ops
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # layer-1 activation SVD intentionally not refreshed here (input
        # whitening is done once before the loop, below)
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()
    # Disable graph rewrites so the manually-staged variables are not folded
    # or pruned away.
    from tensorflow.core.protobuf import rewriter_config_pb2
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    sess = tf.InteractiveSession(config=config)
    # order matters: weights and data first, then sampled labels and the
    # covariance/SVD state that depend on them
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()
    step_lengths = []  # keep track of learning rates
    losses = []
    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()
    for step in range(40):
        update_covariances()
        update_svds()
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)
        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()
        losses.append(loss0)
        step_lengths.append(lr0)
        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    # regression bounds on final loss and per-step wall time
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Time PyTorch L-BFGS on a tied-weight MNIST autoencoder.

    Returns the loss observed on the ``iters``-th closure evaluation (held
    in a module global so the closure can update it), or None if L-BFGS
    stopped earlier.
    """
    global step, final_loss
    step = 0
    final_loss = None

    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    visible_size = 28 * 28
    hidden_size = 196

    mnist = torch.Tensor(u.get_mnist_images(batch_size).T)[:batch_size]
    if cuda:
        mnist = mnist.cuda()
    inputs = Variable(mnist)

    class Net(nn.Module):
        """Autoencoder whose decoder is the transpose of the encoder."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, batch):
            flat = batch.view(-1, visible_size)
            hidden = torch.sigmoid(torch.mm(flat, self.encoder))
            recon = torch.sigmoid(
                torch.mm(hidden, torch.transpose(self.encoder, 0, 1)))
            return recon.view_as(batch)

    # build the model and overwrite the random init with ng initialization
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        model.cuda()
    model.train()

    optimizer = optim.LBFGS(model.parameters(), max_iter=iters,
                            history_size=100, lr=1.0)

    def closure():
        # re-evaluated repeatedly by L-BFGS; progress tracked via globals
        global step, final_loss
        optimizer.zero_grad()
        loss = F.mse_loss(model(inputs), inputs)
        if verbose:
            loss0 = loss.data[0]
            print("Step %3d loss %6.5f msec %6.3f"
                  % (step, loss0, u.last_time()))
        step += 1
        if step == iters:
            final_loss = loss.data[0]
        loss.backward()
        u.record_time()
        return loss

    optimizer.step(closure)
    output = model(inputs)
    loss = F.mse_loss(output, inputs)
    loss0 = loss.data[0]
    if verbose:
        u.summarize_time()
    return final_loss
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Time PyTorch L-BFGS (configurable history) on a tied-weight MNIST
    autoencoder, printing per-call wall times as a brace-wrapped CSV.

    Returns the loss recorded at closure call ``iters`` (module-global
    ``final_loss``), or None if L-BFGS converged earlier.
    """
    global step, final_loss
    step = 0
    final_loss = None
    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)
    visible_size = 28*28
    hidden_size = 196
    images = torch.Tensor(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        # Single weight matrix shared between encode and (transposed) decode.
        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, input):
            x = input.view(-1, visible_size)
            x = torch.sigmoid(torch.mm(x, self.encoder))
            # decode with the transpose of the same (tied) weight matrix
            x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        model.cuda()

    model.train()
    optimizer = optim.LBFGS(model.parameters(), max_iter=iters,
                            history_size=history, lr=1.0)
    times = []  # per-closure wall times collected when verbose

    def closure():
        # re-evaluated repeatedly by L-BFGS; progress tracked via globals
        global step, final_loss
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        if verbose:
            loss0 = loss.data[0]  # NOTE(review): pre-0.4 PyTorch idiom
            times.append(u.last_time())
            print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
        step+=1
        if step == iters:
            final_loss = loss.data[0]
        loss.backward()
        u.record_time()
        return loss

    optimizer.step(closure)
    output = model(data)
    loss = F.mse_loss(output, data)
    loss0 = loss.data[0]
    if verbose:
        u.summarize_time()
    # print(times)
    # first two entries skipped as warm-up; empty unless verbose was set
    s = ','.join(["%f"%(n,) for n in times[2:]])
    print('{', s,'}')
    return final_loss