def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2 * n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so we add 1
        prev_h = inputs[i_layer * 2 + 1]
        prev_c = inputs[i_layer * 2 + 2]
        if i_layer == 0:
            x = inputs[0]
            size_x = size_input
        else:
            # outputs holds (c, h) pairs per layer; take the previous layer's h
            x = outputs[(i_layer - 1) * 2 + 1]
            size_x = size_mem
        # one affine map from the input and one from the hidden state give the
        # pre-activations of all four LSTM gates at once
        input_sums = nn.Affine(size_x, 4 * size_mem)(x) \
            + nn.Affine(size_mem, 4 * size_mem)(prev_h)
        sigmoid_chunk = cgt.sigmoid(input_sums[:, 0:3 * size_mem])
        in_gate = sigmoid_chunk[:, 0:size_mem]
        forget_gate = sigmoid_chunk[:, size_mem:2 * size_mem]
        out_gate = sigmoid_chunk[:, 2 * size_mem:3 * size_mem]
        in_transform = cgt.tanh(input_sums[:, 3 * size_mem:4 * size_mem])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
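# A hedged NumPy sketch (not library code) of the gate layout used above: a
# single (batch, 4*size_mem) pre-activation block is sliced into the input,
# forget, and output gates (sigmoid) plus the cell candidate (tanh).
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_gates(input_sums, c_prev, size_mem):
    sig = sigmoid(input_sums[:, :3 * size_mem])
    in_gate = sig[:, :size_mem]
    forget_gate = sig[:, size_mem:2 * size_mem]
    out_gate = sig[:, 2 * size_mem:3 * size_mem]
    in_transform = np.tanh(input_sums[:, 3 * size_mem:])
    next_c = forget_gate * c_prev + in_gate * in_transform
    next_h = out_gate * np.tanh(next_c)
    return next_c, next_h

rng = np.random.RandomState(0)
next_c, next_h = lstm_gates(rng.randn(2, 16), rng.randn(2, 4), 4)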
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output,
                            size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so we add 1
        prev_h = inputs[i_layer + 1]
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]
        # learned reflection directions, normalized to unit length below
        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True
        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")
        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)
        prev_h_scaled = nn.scale_mag(prev_h)
        h_in_added = prev_h_scaled + add_in_relu
        # apply 2*k_h Householder reflections in sequence
        inters_h = [h_in_added]
        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
def build_fc_return_loss(X, y):
    """ Build a fully connected network and return the loss """
    np.random.seed(0)
    h1 = nn.rectify(
        nn.Affine(28 * 28, 256, weight_init=nn.IIDGaussian(std=.1))(X))
    h2 = nn.rectify(
        nn.Affine(256, 256, weight_init=nn.IIDGaussian(std=.1))(h1))
    logprobs = nn.logsoftmax(
        nn.Affine(256, 10, weight_init=nn.IIDGaussian(std=.1))(h2))
    neglogliks = -logprobs[cgt.arange(X.shape[0]), y]
    loss = neglogliks.mean()
    return loss
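# A minimal NumPy sketch (an assumption, not part of the library) of the loss
# computed above: log-softmax over class scores, then the mean negative
# log-likelihood picked out by fancy indexing, exactly as
# logprobs[cgt.arange(N), y].mean() does symbolically.
import numpy as np

def nll_from_scores(scores, y):
    # scores: (N, K) class activations; y: (N,) integer labels
    shifted = scores - scores.max(axis=1, keepdims=True)  # for numerical stability
    logprobs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -logprobs[np.arange(scores.shape[0]), y].mean()

scores = np.random.randn(5, 10)
y = np.array([0, 3, 9, 1, 2])
print(nll_from_scores(scores, y))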
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out={}):
    assert len(num_units) == len(num_stos)
    net_in = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    dbg_out['NET~in'] = net_in
    curr_layer = 1
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        assert curr_num_units >= curr_num_sto >= 0
        prev_out = combo_layer(
            prev_out, prev_num_units, curr_num_units,
            (curr_num_sto,),
            s_funcs=s_func_ip,
            o_funcs=(lambda x: cgt.bernoulli(cgt.sigmoid(x)), cgt.nn.rectify),
            name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out))(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out
def lstm_network_t(size_in, size_out, num_units, num_mems, dbg_out={}):
    def s_func_lstm(_in, _s_in, _s_out, name=''):
        c_prev = cgt.matrix(fixed_shape=(None, _s_out))
        h_prev = cgt.matrix(fixed_shape=(None, _s_out))
        c_cur, h_cur = lstm_block(h_prev, c_prev, _in, _s_in, _s_out, name)
        net_c_prev.append(c_prev)
        net_h_prev.append(h_prev)
        net_c_curr.append(c_cur)
        net_h_curr.append(h_cur)
        return h_cur
    assert len(num_units) == len(num_mems)
    net_c_prev, net_h_prev, net_c_curr, net_h_curr = [], [], [], []
    net_in = cgt.matrix(fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    curr_layer = 1
    for curr_num_units, curr_num_mem in zip(num_units, num_mems):
        assert curr_num_units >= curr_num_mem >= 0
        prev_out = combo_layer(prev_out, prev_num_units, curr_num_units,
                               (curr_num_mem,),
                               s_funcs=(s_func_lstm, s_func_ip),
                               o_funcs=(None, cgt.sigmoid),
                               name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out, name="Out")(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out, net_c_prev, net_h_prev, net_c_curr, net_h_curr
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out=[]):
    assert len(num_units) == len(num_stos)
    X = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, X
    dbg_out.append(X)
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        _layer_dbg_out = []
        prev_out = hybrid_layer(prev_out, prev_num_units, curr_num_units,
                                curr_num_sto, dbg_out=_layer_dbg_out)
        prev_num_units = curr_num_units
        dbg_out.extend(_layer_dbg_out)
        dbg_out.append(prev_out)
    # TODO_TZ bigger problem! param cannot deterministically influence cost
    #         otherwise the surrogate cost is not complete log likelihood
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out))(prev_out)
    dbg_out.append(net_out)
    # assert prev_num_units == size_out
    # net_out = prev_out
    return X, net_out
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3, 3), stride=(2, 2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3, 3), stride=(2, 2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1 * d2 * d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = np.load("/Users/joschu/Data/cifar-10-batches-py/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err",
                       "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time() - tstart
            if start > batchsize * 5:
                break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
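# The rmsprop_updates helper used above is not shown in this snippet. As a
# rough, hedged sketch of the standard RMSProp rule it presumably implements,
# in plain NumPy (the function name and the decay/epsilon constants here are
# assumptions, not the helper's actual signature): keep a running average of
# squared gradients and divide the step by its square root.
import numpy as np

def rmsprop_step(param, grad, cache, stepsize=1e-3, decay=0.9, eps=1e-8):
    # running average of squared gradients, then a rescaled gradient step
    cache = decay * cache + (1 - decay) * grad ** 2
    param = param - stepsize * grad / (np.sqrt(cache) + eps)
    return param, cache

w = np.zeros(3)
c = np.zeros(3)
g = np.array([0.5, -1.0, 2.0])
w, c = rmsprop_step(w, g, c)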
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so we add 1
        prev_h = inputs[i_layer + 1]
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2u")(x)
            + nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2r")(x)
            + nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1 + p2)
        next_h = (1.0 - update_gate) * prev_h + update_gate * hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
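# A small NumPy sketch (not part of the module above) of the single GRU update
# it builds symbolically: the update gate u interpolates between the old hidden
# state and the candidate state, so next_h is a convex combination of the two.
# The weight names (Wu, Uu, ...) are assumptions for illustration.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step(x, h, Wu, Uu, Wr, Ur, W, U):
    u = sigmoid(x.dot(Wu) + h.dot(Uu))            # update gate
    r = sigmoid(x.dot(Wr) + h.dot(Ur))            # reset gate
    h_tilde = np.tanh(x.dot(W) + (r * h).dot(U))  # candidate hidden state
    return (1.0 - u) * h + u * h_tilde

rng = np.random.RandomState(0)
x, h = rng.randn(2, 3), rng.randn(2, 4)
Wu, Wr, W = (rng.randn(3, 4) for _ in range(3))
Uu, Ur, U = (rng.randn(4, 4) for _ in range(3))
print(gru_step(x, h, Wu, Uu, Wr, Ur, W, U))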
def lstm_block(h_prev, c_prev, x_curr, size_x, size_c, name=''):
    """
    Construct an LSTM block with the specified number of cells

    :param h_prev: hidden activations at the previous time step
    :param c_prev: memory state at the previous time step
    :param x_curr: inputs from the previous layer at the current time step
    :param size_x: size of the inputs
    :param size_c: size of both c and h
    :return: c and h at the current time step
    """
    input_sums = nn.Affine(size_x, 4 * size_c, name=name + '*x')(x_curr) + \
                 nn.Affine(size_c, 4 * size_c, name=name + '*h')(h_prev)
    c_new = cgt.tanh(input_sums[:, 3 * size_c:])
    sigmoid_chunk = cgt.sigmoid(input_sums[:, :3 * size_c])
    in_gate = sigmoid_chunk[:, :size_c]
    forget_gate = sigmoid_chunk[:, size_c:2 * size_c]
    out_gate = sigmoid_chunk[:, 2 * size_c:3 * size_c]
    c_curr = forget_gate * c_prev + in_gate * c_new
    h_curr = out_gate * cgt.tanh(c_curr)
    return c_curr, h_curr
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")

    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(
        nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(
        nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)
    self.pc = ParamCollection(params)
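# A short NumPy check (illustrative only) of the KL term built above for
# categorical policies: sum_a p_old(a) * log(p_old(a) / p_new(a)), averaged
# over the batch. It is zero when the distributions match and positive
# otherwise.
import numpy as np

p_old = np.array([[0.7, 0.2, 0.1],
                  [0.3, 0.3, 0.4]])
p_new = np.array([[0.6, 0.3, 0.1],
                  [0.3, 0.3, 0.4]])
kl_per_row = (p_old * np.log(p_old / p_new)).sum(axis=1)
print(kl_per_row.mean())                                    # > 0: first row differs
print((p_old * np.log(p_old / p_old)).sum(axis=1).mean())   # exactly 0.0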
def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: observation matrix with shape (batchsize, ncols)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic scalar representing the reward target
    y = cgt.scalar("y", dtype='float64')
    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden,
                  weight_init=nn.IIDGaussian(std=.1),
                  bias_init=nn.Constant(1))(X))
    # one final fully connected layer with a linear output for the reward
    output = nn.Affine(num_hidden, 1,
                       weight_init=nn.IIDGaussian(std=.1),
                       bias_init=nn.Constant(1))(hid1)
    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=[]):
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(
        nn.Affine(size_in, size_out,
                  name="InnerProd(%d->%d)" % (size_in, size_out))(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
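# A NumPy sketch (an assumption, not the cgt implementation) of the same hybrid
# layer: sigmoid activations everywhere, with the first size_random units
# sampled as Bernoulli variables and the remaining units kept deterministic.
import numpy as np

def hybrid_layer_np(X, W, b, size_random, rng):
    out = 1.0 / (1.0 + np.exp(-(X.dot(W) + b)))  # sigmoid activations
    if size_random == 0:
        return out
    # Bernoulli draw with success probability given by the activations
    sampled = (rng.uniform(size=out[:, :size_random].shape)
               < out[:, :size_random]).astype(out.dtype)
    return np.concatenate([sampled, out[:, size_random:]], axis=1)

rng = np.random.RandomState(0)
X, W, b = rng.randn(2, 3), rng.randn(3, 5), np.zeros(5)
print(hybrid_layer_np(X, W, b, 2, rng))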
def build_convnet_return_loss(X, y):
    np.random.seed(0)
    conv1 = nn.rectify(
        nn.SpatialConvolution(1, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=.1))(X))
    pool1 = nn.max_pool_2d(conv1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=.1))(pool1))
    pool2 = nn.max_pool_2d(conv2, kernelshape=(3, 3), stride=(2, 2))
    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1 * d2 * d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    logprobs = nn.logsoftmax(nn.Affine(nfeats, 10)(flatlayer))
    loss = -logprobs[cgt.arange(X.shape[0]), y].mean()
    return loss
import cgt
from cgt import nn, utils
import numpy as np, numpy.random as nr
from numpy.linalg import norm
from param_collection import ParamCollection

k_in = 1
size_x = 3
size_mem = 4
size_batch = 4

x = cgt.matrix(fixed_shape=(size_batch, size_x))
prev_h = cgt.matrix(fixed_shape=(size_batch, size_mem))

# predict 2*k_in direction vectors per example and normalize each to unit length
r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")

prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]
for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    # subtract the component of the state along r_cur (a full Householder
    # reflection would subtract twice this quantity)
    ref_cur = cgt.batched_matmul(
        r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)
h = inters[-1]
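# A quick NumPy check (illustrative only) of the update above: with a unit
# vector r, h - r (r^T h) removes the component of h along r, so the norm can
# only shrink, while the Householder variant h - 2 r (r^T h) reflects h and
# preserves its norm exactly.
import numpy as np

rng = np.random.RandomState(0)
h = rng.randn(4)
r = rng.randn(4)
r /= np.linalg.norm(r)

proj = r * r.dot(h)
print(np.linalg.norm(h - proj) <= np.linalg.norm(h))                  # True
print(np.allclose(np.linalg.norm(h - 2 * proj), np.linalg.norm(h)))   # True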
def s_func_ip(X, size_in, size_out, name):
    return nn.Affine(size_in, size_out, name=name)(X)
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim

    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)),
                                           name="std_1a")
    std_1a = cgt.exp(logstd_1a)

    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(
        nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(
        nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim,
                        weight_init=nn.IIDGaussian(std=0.01))(h2)

    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)
    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]

    # log-density of the actions under the new and old diagonal Gaussians
    logp_n = ((-.5) * cgt.square(
        (a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square(
        (a_na - oldmean_na) / oldstd_na).sum(axis=1)
                 ) - cgt.log(oldstd_na).sum(axis=1)
    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()
    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)

    params = nn.get_parameters(surr)
    # closed-form KL divergence between the old and new diagonal Gaussians
    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na)
          + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na)
          - .5).sum(axis=1).mean()

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                         [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self.pc = ParamCollection(params)
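# A NumPy spot-check (illustrative, not part of the class) of the
# diagonal-Gaussian KL formula used above,
#   KL(p_old || p_new) = sum_i [ log(s2/s1) + (s1^2 + (m1-m2)^2)/(2 s2^2) - 1/2 ],
# compared against a Monte Carlo estimate from samples of p_old.
import numpy as np

rng = np.random.RandomState(0)
m1, s1 = np.array([0.0, 1.0]), np.array([1.0, 0.5])   # old mean/std
m2, s2 = np.array([0.3, 0.8]), np.array([1.2, 0.7])   # new mean/std

closed = (np.log(s2 / s1)
          + (s1 ** 2 + (m1 - m2) ** 2) / (2 * s2 ** 2) - 0.5).sum()

x = m1 + s1 * rng.randn(200000, 2)
logp1 = (-0.5 * ((x - m1) / s1) ** 2 - np.log(s1)).sum(axis=1)
logp2 = (-0.5 * ((x - m2) / s2) ** 2 - np.log(s2)).sum(axis=1)
mc = (logp1 - logp2).mean()
print(closed, mc)  # the two estimates should agree to ~2 decimal places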