def init_params(net, weight_path=None, is_train=False):
    '''Init parameters randomly or from checkpoint file.

    Args:
        net, a constructed neural net
        weight_path, checkpoint file path
        is_train, if false, then a checkpoint file must be presented
    '''
    assert is_train is True or weight_path is not None, \
        'must provide a checkpoint file for serving'
    if weight_path is None:
        for pname, pval in zip(net.param_names(), net.param_values()):
            if 'conv' in pname and len(pval.shape) > 1:
                initializer.gaussian(pval, 0, pval.shape[1])
            elif 'dense' in pname:
                if len(pval.shape) > 1:
                    initializer.gaussian(pval, 0, pval.shape[0])
                else:
                    pval.set_value(0)
            # init params from batch norm layer
            elif 'mean' in pname or 'beta' in pname:
                pval.set_value(0)
            elif 'var' in pname:
                pval.set_value(1)
            elif 'gamma' in pname:
                initializer.uniform(pval, 0, 1)
    else:
        net.load(weight_path, use_pickle=True)
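# Import sketch: the snippets in this section assume the SINGA v1.x Python
# modules are available roughly as below (an assumption about the surrounding
# example scripts, not part of the original functions):
import pickle

import numpy as np

from singa import device, initializer, layer, loss, metric, optimizer, tensor, utils
from singa import net as ffnet
from singa.proto import model_pb2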
def create_net(use_cpu=False):
    if use_cpu:
        layer.engine = 'singacpp'
    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
    ConvBnReLU(net, 'conv1_1', 64, (3, 32, 32))
    net.add(layer.Dropout('drop1', 0.3))
    ConvBnReLU(net, 'conv1_2', 64)
    net.add(layer.MaxPooling2D('pool1', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv2_1', 128)
    net.add(layer.Dropout('drop2_1', 0.4))
    ConvBnReLU(net, 'conv2_2', 128)
    net.add(layer.MaxPooling2D('pool2', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv3_1', 256)
    net.add(layer.Dropout('drop3_1', 0.4))
    ConvBnReLU(net, 'conv3_2', 256)
    net.add(layer.Dropout('drop3_2', 0.4))
    ConvBnReLU(net, 'conv3_3', 256)
    net.add(layer.MaxPooling2D('pool3', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv4_1', 512)
    net.add(layer.Dropout('drop4_1', 0.4))
    ConvBnReLU(net, 'conv4_2', 512)
    net.add(layer.Dropout('drop4_2', 0.4))
    ConvBnReLU(net, 'conv4_3', 512)
    net.add(layer.MaxPooling2D('pool4', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv5_1', 512)
    net.add(layer.Dropout('drop5_1', 0.4))
    ConvBnReLU(net, 'conv5_2', 512)
    net.add(layer.Dropout('drop5_2', 0.4))
    ConvBnReLU(net, 'conv5_3', 512)
    net.add(layer.MaxPooling2D('pool5', 2, 2, border_mode='valid'))
    net.add(layer.Flatten('flat'))
    net.add(layer.Dropout('drop_flat', 0.5))
    net.add(layer.Dense('ip1', 512))
    net.add(layer.BatchNormalization('batchnorm_ip1'))
    net.add(layer.Activation('relu_ip1'))
    net.add(layer.Dropout('drop_ip2', 0.5))
    net.add(layer.Dense('ip2', 10))
    print('Start initialization............')
    for (p, name) in zip(net.param_values(), net.param_names()):
        print(name, p.shape)
        if 'mean' in name or 'beta' in name:
            p.set_value(0.0)
        elif 'var' in name:
            p.set_value(1.0)
        elif 'gamma' in name:
            initializer.uniform(p, 0, 1)
        elif len(p.shape) > 1:
            if 'conv' in name:
                initializer.gaussian(p, 0, 3 * 3 * p.shape[0])
            else:
                p.gaussian(0, 0.02)
        else:
            p.set_value(0)
        print(name, p.l1())
    return net
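# The VGG-style builders above and below call a ConvBnReLU helper that is not
# shown in this section. A sketch of it, following the SINGA cifar10 example
# (3x3 convolution + batch norm + ReLU); the layer-name suffixes are assumptions:
def ConvBnReLU(net, name, nb_filters, sample_shape=None):
    net.add(layer.Conv2D(name + '_1', nb_filters, 3, 1, pad=1,
                         input_sample_shape=sample_shape))
    net.add(layer.BatchNormalization(name + '_2'))
    net.add(layer.Activation(name + '_3'))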
def create_net(input_shape, use_cpu=False):
    if use_cpu:
        layer.engine = 'singacpp'
    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
    net.add(layer.Conv2D('conv1', nb_kernels=32, kernel=7, stride=3, pad=1,
                         input_sample_shape=input_shape))
    net.add(layer.Activation('relu1'))
    net.add(layer.MaxPooling2D('pool1', 2, 2, border_mode='valid'))
    net.add(layer.Conv2D('conv2', nb_kernels=64, kernel=5, stride=3))
    net.add(layer.Activation('relu2'))
    net.add(layer.MaxPooling2D('pool2', 2, 2, border_mode='valid'))
    net.add(layer.Conv2D('conv3', nb_kernels=128, kernel=3, stride=1, pad=2))
    net.add(layer.Activation('relu3'))
    net.add(layer.MaxPooling2D('pool3', 2, 2, border_mode='valid'))
    net.add(layer.Conv2D('conv4', nb_kernels=256, kernel=3, stride=1))
    net.add(layer.Activation('relu4'))
    net.add(layer.MaxPooling2D('pool4', 2, 2, border_mode='valid'))
    net.add(layer.Flatten('flat'))
    net.add(layer.Dense('ip5', 256))
    net.add(layer.Activation('relu5'))
    net.add(layer.Dense('ip6', 16))
    net.add(layer.Activation('relu6'))
    net.add(layer.Dense('ip7', 2))
    print('Parameter initialization............')
    for (p, name) in zip(net.param_values(), net.param_names()):
        print(name, p.shape)
        if 'mean' in name or 'beta' in name:
            p.set_value(0.0)
        elif 'var' in name:
            p.set_value(1.0)
        elif 'gamma' in name:
            initializer.uniform(p, 0, 1)
        elif len(p.shape) > 1:
            if 'conv' in name:
                initializer.gaussian(p, 0, p.size())
            else:
                p.gaussian(0, 0.02)
        else:
            p.set_value(0)
        print(name, p.l1())
    return net
def create_net(input_shape, use_cpu=False):
    if use_cpu:
        layer.engine = 'singacpp'
    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
    ConvBnReLUPool(net, 'conv1', 32, input_shape)
    ConvBnReLUPool(net, 'conv2', 64)
    ConvBnReLUPool(net, 'conv3', 128)
    ConvBnReLUPool(net, 'conv4', 128)
    ConvBnReLUPool(net, 'conv5', 256)
    ConvBnReLUPool(net, 'conv6', 256)
    ConvBnReLUPool(net, 'conv7', 512)
    ConvBnReLUPool(net, 'conv8', 512)
    net.add(layer.Flatten('flat'))
    net.add(layer.Dense('ip1', 256))
    net.add(layer.BatchNormalization('bn1'))
    net.add(layer.Activation('relu1'))
    net.add(layer.Dropout('dropout1', 0.2))
    net.add(layer.Dense('ip2', 16))
    net.add(layer.BatchNormalization('bn2'))
    net.add(layer.Activation('relu2'))
    net.add(layer.Dropout('dropout2', 0.2))
    net.add(layer.Dense('ip3', 2))
    print('Parameter initialization............')
    for (p, name) in zip(net.param_values(), net.param_names()):
        print(name, p.shape)
        if 'mean' in name or 'beta' in name:
            p.set_value(0.0)
        elif 'var' in name:
            p.set_value(1.0)
        elif 'gamma' in name:
            initializer.uniform(p, 0, 1)
        elif len(p.shape) > 1:
            if 'conv' in name:
                initializer.gaussian(p, 0, p.size())
            else:
                p.gaussian(0, 0.02)
        else:
            p.set_value(0)
        print(name, p.l1())
    return net
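# ConvBnReLUPool is likewise not defined in this section. A hypothetical
# sketch consistent with how it is called above (conv + batch norm + ReLU
# followed by 2x2 max pooling); the names and kernel settings are assumptions:
def ConvBnReLUPool(net, name, nb_filters, sample_shape=None):
    net.add(layer.Conv2D(name + '_conv', nb_filters, 3, 1, pad=1,
                         input_sample_shape=sample_shape))
    net.add(layer.BatchNormalization(name + '_bn'))
    net.add(layer.Activation(name + '_relu'))
    net.add(layer.MaxPooling2D(name + '_pool', 2, 2, border_mode='valid'))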
def create_net(use_cpu=False):
    if use_cpu:
        layer.engine = 'singacpp'
    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
    net.add(layer.Conv2D("conv1", 16, 3, 1, pad=1,
                         input_sample_shape=(3, 32, 32)))
    net.add(layer.BatchNormalization("bn1"))
    net.add(layer.Activation("relu1"))
    Block(net, "2a", 16, 1)
    Block(net, "2b", 16, 1)
    Block(net, "2c", 16, 1)
    Block(net, "3a", 32, 2)
    Block(net, "3b", 32, 1)
    Block(net, "3c", 32, 1)
    Block(net, "4a", 64, 2)
    Block(net, "4b", 64, 1)
    Block(net, "4c", 64, 1)
    net.add(layer.AvgPooling2D("pool4", 8, 8, border_mode='valid'))
    net.add(layer.Flatten('flat'))
    net.add(layer.Dense('ip5', 10))
    print('Start initialization............')
    for (p, name) in zip(net.param_values(), net.param_names()):
        # print(name, p.shape)
        if 'mean' in name or 'beta' in name:
            p.set_value(0.0)
        elif 'var' in name:
            p.set_value(1.0)
        elif 'gamma' in name:
            initializer.uniform(p, 0, 1)
        elif len(p.shape) > 1:
            if 'conv' in name:
                # initializer.gaussian(p, 0, math.sqrt(2.0/p.shape[1]))
                initializer.gaussian(p, 0, 9.0 * p.shape[0])
            else:
                initializer.uniform(p, p.shape[0], p.shape[1])
        else:
            p.set_value(0)
        # print(name, p.l1())
    return net
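# Block builds a residual unit; its definition is not included here. A sketch
# modelled on the SINGA cifar10 resnet example: two 3x3 conv-bn stages on the
# main branch plus a shortcut, with a 1x1 projection branch when stride > 1.
# The layer names and exact wiring are assumptions:
def Block(net, name, nb_filters, stride):
    split = net.add(layer.Split(name + "-split", 2))
    if stride > 1:
        net.add(layer.Conv2D(name + "-br2-conv", nb_filters, 1, stride, pad=0),
                split)
        br2 = net.add(layer.BatchNormalization(name + "-br2-bn"))
    net.add(layer.Conv2D(name + "-br1-conv1", nb_filters, 3, stride, pad=1),
            split)
    net.add(layer.BatchNormalization(name + "-br1-bn1"))
    net.add(layer.Activation(name + "-br1-relu"))
    net.add(layer.Conv2D(name + "-br1-conv2", nb_filters, 3, 1, pad=1))
    br1 = net.add(layer.BatchNormalization(name + "-br1-bn2"))
    if stride > 1:
        net.add(layer.Merge(name + "-merge"), [br1, br2])
    else:
        net.add(layer.Merge(name + "-merge"), [br1, split])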
def init_params(net, weight_path=None):
    if weight_path is None:
        for pname, pval in zip(net.param_names(), net.param_values()):
            print(pname, pval.shape)
            if 'conv' in pname and len(pval.shape) > 1:
                initializer.gaussian(pval, 0, pval.shape[1])
            elif 'dense' in pname:
                if len(pval.shape) > 1:
                    initializer.gaussian(pval, 0, pval.shape[0])
                else:
                    pval.set_value(0)
            # init params from batch norm layer
            elif 'mean' in pname or 'beta' in pname:
                pval.set_value(0)
            elif 'var' in pname:
                pval.set_value(1)
            elif 'gamma' in pname:
                initializer.uniform(pval, 0, 1)
    else:
        net.load(weight_path, use_pickle='pickle' in weight_path)
def train(data_file, use_gpu, num_epoch=10, batch_size=100):
    print('Start initialization............')
    lr = 0.1   # Learning rate
    weight_decay = 0.0002
    hdim = 1000
    vdim = 784

    tweight = tensor.Tensor((vdim, hdim))
    tweight.gaussian(0.0, 0.1)
    tvbias = tensor.from_numpy(np.zeros(vdim, dtype=np.float32))
    thbias = tensor.from_numpy(np.zeros(hdim, dtype=np.float32))
    opt = optimizer.SGD(momentum=0.5, weight_decay=weight_decay)

    print('Loading data ..................')
    train_x, valid_x = load_train_data(data_file)

    if use_gpu:
        dev = device.create_cuda_gpu()
    else:
        dev = device.get_default_device()

    for t in [tweight, tvbias, thbias]:
        t.to_device(dev)

    num_train_batch = train_x.shape[0] // batch_size
    print("num_train_batch = %d " % (num_train_batch))
    for epoch in range(num_epoch):
        trainerrorsum = 0.0
        print('Epoch %d' % epoch)
        for b in range(num_train_batch):
            # positive phase
            tdata = tensor.from_numpy(
                train_x[(b * batch_size):((b + 1) * batch_size), :])
            tdata.to_device(dev)
            tposhidprob = tensor.mult(tdata, tweight)
            tposhidprob = tposhidprob + thbias
            tposhidprob = tensor.sigmoid(tposhidprob)
            tposhidrandom = tensor.Tensor(tposhidprob.shape, dev)
            tposhidrandom.uniform(0.0, 1.0)
            tposhidsample = tensor.gt(tposhidprob, tposhidrandom)

            # negative phase
            tnegdata = tensor.mult(tposhidsample, tweight.T())
            tnegdata = tnegdata + tvbias
            tnegdata = tensor.sigmoid(tnegdata)

            tneghidprob = tensor.mult(tnegdata, tweight)
            tneghidprob = tneghidprob + thbias
            tneghidprob = tensor.sigmoid(tneghidprob)
            error = tensor.sum(tensor.square((tdata - tnegdata)))
            trainerrorsum = error + trainerrorsum

            tgweight = tensor.mult(tnegdata.T(), tneghidprob) \
                - tensor.mult(tdata.T(), tposhidprob)
            tgvbias = tensor.sum(tnegdata, 0) - tensor.sum(tdata, 0)
            tghbias = tensor.sum(tneghidprob, 0) - tensor.sum(tposhidprob, 0)

            opt.apply_with_lr(epoch, lr / batch_size, tgweight, tweight, 'w')
            opt.apply_with_lr(epoch, lr / batch_size, tgvbias, tvbias, 'vb')
            opt.apply_with_lr(epoch, lr / batch_size, tghbias, thbias, 'hb')

        print('training erroraverage = %f' %
              (tensor.to_numpy(trainerrorsum) / train_x.shape[0]))

        tvaliddata = tensor.from_numpy(valid_x)
        tvaliddata.to_device(dev)
        tvalidposhidprob = tensor.mult(tvaliddata, tweight)
        tvalidposhidprob = tvalidposhidprob + thbias
        tvalidposhidprob = tensor.sigmoid(tvalidposhidprob)
        tvalidposhidrandom = tensor.Tensor(tvalidposhidprob.shape, dev)
        initializer.uniform(tvalidposhidrandom, 0.0, 1.0)
        tvalidposhidsample = tensor.gt(tvalidposhidprob, tvalidposhidrandom)
        tvalidnegdata = tensor.mult(tvalidposhidsample, tweight.T())
        tvalidnegdata = tvalidnegdata + tvbias
        tvalidnegdata = tensor.sigmoid(tvalidnegdata)
        validerrorsum = tensor.sum(tensor.square((tvaliddata - tvalidnegdata)))
        print('valid erroraverage = %f' %
              (tensor.to_numpy(validerrorsum) / valid_x.shape[0]))
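# load_train_data is referenced above but not shown in this section. A sketch
# assuming the data file follows the standard mnist.pkl layout of
# (train, valid, test) tuples of (images, labels):
def load_train_data(file_path):
    with open(file_path, 'rb') as f:
        # encoding='latin1' is needed when the pickle was written by Python 2
        train_set, valid_set, _ = pickle.load(f, encoding='latin1')
    train_x = train_set[0].astype(np.float32)
    valid_x = valid_set[0].astype(np.float32)
    return train_x, valid_x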
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model'):
    # SGD with L2 gradient normalization
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(data.vocab_size, ))
    rnn.to_device(cuda)
    print('created rnn')
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
    print('rnn weight l1 = %f' % (rnn_w.l1()))

    dense = layer.Dense('dense', data.vocab_size,
                        input_sample_shape=(hidden_size, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print('dense w ', dense_w.shape)
    print('dense b ', dense_b.shape)
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    print('dense weight l1 = %f' % (dense_w.l1()))
    dense_b.set_value(0)
    print('dense b l1 = %f' % (dense_b.l1()))

    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())

            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print(output.l1(), act.l1())
            utils.update_progress(
                b * 1.0 / data.num_train_batch,
                'training loss = %f' % (batch_loss / seq_length))
            train_loss += batch_loss

            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w,
                              'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b,
                              'dense_b')
        print('\nEpoch %d, train loss is %f' %
              (epoch, train_loss / data.num_train_batch / seq_length))

        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval, output,
                                             label).l1()
        print('Epoch %d, evaluation loss is %f' %
              (epoch, eval_loss / data.num_test_batch / seq_length))

        if (epoch + 1) % 30 == 0:
            # checkpoint the file model
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print('saving model to %s' % model_path)
                d = {}
                for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],
                                   [rnn_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout
                pickle.dump(d, fd)
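# get_lr is referenced above but not defined in this section. A hypothetical
# schedule that decays a base learning rate every few epochs; the base rate
# and decay period are assumptions, not the original example's values:
def get_lr(epoch):
    return 0.001 * float(3 ** -(epoch // 7))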
def create_net(input_shape, use_cpu=False):
    if use_cpu:
        layer.engine = 'singacpp'
    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
    ConvBnReLU(net, 'conv1_1', 64, input_shape)
    # net.add(layer.Dropout('drop1', 0.3))
    net.add(layer.MaxPooling2D('pool0', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv1_2', 128)
    net.add(layer.MaxPooling2D('pool1', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv2_1', 128)
    net.add(layer.Dropout('drop2_1', 0.4))
    ConvBnReLU(net, 'conv2_2', 128)
    net.add(layer.MaxPooling2D('pool2', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv3_1', 256)
    net.add(layer.Dropout('drop3_1', 0.4))
    ConvBnReLU(net, 'conv3_2', 256)
    net.add(layer.Dropout('drop3_2', 0.4))
    ConvBnReLU(net, 'conv3_3', 256)
    net.add(layer.MaxPooling2D('pool3', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv4_1', 256)
    net.add(layer.Dropout('drop4_1', 0.4))
    ConvBnReLU(net, 'conv4_2', 256)
    net.add(layer.Dropout('drop4_2', 0.4))
    ConvBnReLU(net, 'conv4_3', 256)
    net.add(layer.MaxPooling2D('pool4', 2, 2, border_mode='valid'))
    ConvBnReLU(net, 'conv5_1', 512)
    net.add(layer.Dropout('drop5_1', 0.4))
    ConvBnReLU(net, 'conv5_2', 512)
    net.add(layer.Dropout('drop5_2', 0.4))
    ConvBnReLU(net, 'conv5_3', 512)
    net.add(layer.MaxPooling2D('pool5', 2, 2, border_mode='valid'))
    # ConvBnReLU(net, 'conv6_1', 512)
    # net.add(layer.Dropout('drop6_1', 0.4))
    # ConvBnReLU(net, 'conv6_2', 512)
    # net.add(layer.Dropout('drop6_2', 0.4))
    # ConvBnReLU(net, 'conv6_3', 512)
    # net.add(layer.MaxPooling2D('pool6', 2, 2, border_mode='valid'))
    # ConvBnReLU(net, 'conv7_1', 512)
    # net.add(layer.Dropout('drop7_1', 0.4))
    # ConvBnReLU(net, 'conv7_2', 512)
    # net.add(layer.Dropout('drop7_2', 0.4))
    # ConvBnReLU(net, 'conv7_3', 512)
    # net.add(layer.MaxPooling2D('pool7', 2, 2, border_mode='valid'))
    net.add(layer.Flatten('flat'))
    net.add(layer.Dense('ip1', 256))
    net.add(layer.BatchNormalization('bn1'))
    net.add(layer.Activation('relu1'))
    net.add(layer.Dropout('dropout1', 0.2))
    net.add(layer.Dense('ip2', 16))
    net.add(layer.BatchNormalization('bn2'))
    net.add(layer.Activation('relu2'))
    net.add(layer.Dropout('dropout2', 0.2))
    net.add(layer.Dense('ip3', 2))
    print('Parameter initialization............')
    for (p, name) in zip(net.param_values(), net.param_names()):
        print(name, p.shape)
        if 'mean' in name or 'beta' in name:
            p.set_value(0.0)
        elif 'var' in name:
            p.set_value(1.0)
        elif 'gamma' in name:
            initializer.uniform(p, 0, 1)
        elif len(p.shape) > 1:
            if 'conv' in name:
                initializer.gaussian(p, 0, p.size())
            else:
                p.gaussian(0, 0.02)
        else:
            p.set_value(0)
        print(name, p.l1())
    return net
        hidden_size=64, num_stacks=5, dropout=0.5,
        input_sample_shape=(vocab_size, ))
    encoder.to_device(cuda)
    decoder.to_device(cuda)
    encoder_w = encoder.param_values()[0]
    encoder_w.uniform(-0.08, 0.08)
    decoder_w = decoder.param_values()[0]
    decoder_w.uniform(-0.08, 0.08)

    dense = layer.Dense('dense', vocab_size, input_sample_shape=(64, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    dense_b.set_value(0)

    # g_encoder_w = tensor.Tensor(encoder_w.shape, cuda)
    # g_encoder_w.set_value(0.0)
    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    lossfun = loss.SoftmaxCrossEntropy()
    batch_size = 50
    maxlength = 22
    num_train_batch = 5000
    num_epoch = 5
    metadata, idx_q, idx_a = load_data()
    trainlosslist = np.zeros(num_epoch)
    for epoch in range(num_epoch):