def __init__(self, hidden_size, word_emb_size):
    """Create the per-type weight matrix for the OntoNotes training types.

    Args:
        hidden_size: Hidden size; it contributes ``2 * hidden_size``
            columns to the weight matrix.
        word_emb_size: Word-embedding size; it contributes the remaining
            columns.
    """
    super(NZSigmoidLoss, self).__init__()
    # One weight row per type returned by the OntoNotes helper.
    num_types = len(utils.get_ontoNotes_train_types())
    feature_dim = hidden_size * 2 + word_emb_size
    self.weight = nn.Parameter(torch.zeros(num_types, feature_dim))
    utils.init_weight(self.weight)
def __init__(self, hidden_size, word_emb_size):
    """Create the per-type weight matrix for the configured dataset.

    Args:
        hidden_size: Hidden size; it contributes ``2 * hidden_size``
            columns to the weight matrix.
        word_emb_size: Word-embedding size; it contributes the remaining
            columns.
    """
    super(NZSigmoidLoss, self).__init__()
    # One weight row per type of the dataset named in fg_config['data'].
    num_types = len(get_type_lst(fg_config['data']))
    feature_dim = hidden_size * 2 + word_emb_size
    self.weight = nn.Parameter(torch.zeros(num_types, feature_dim))
    utils.init_weight(self.weight)
def __init__(self, V, D, K, activation):
    """Create the TensorFlow variables of the model.

    Args:
        V: Number of rows of the word-embedding matrix.
        D: Embedding / hidden dimensionality.
        K: Number of columns of the output layer.
        activation: Callable stored as ``self.f``.
    """
    self.D = D
    self.f = activation

    def to_variable(array):
        # Every parameter is stored as a float32 TensorFlow variable.
        return tf.Variable(array.astype(np.float32))

    # Word embedding.
    self.We = to_variable(init_weight(V, D))
    # Linear terms.
    self.W1 = to_variable(init_weight(D, D))
    self.W2 = to_variable(init_weight(D, D))
    # Hidden bias.
    self.bh = to_variable(np.zeros(D))
    # Output layer.
    self.Wo = to_variable(init_weight(D, K))
    self.bo = to_variable(np.zeros(K))

    # Only the weight matrices are listed here; the biases are left out.
    self.params = [self.We, self.W1, self.W2, self.Wo]
def __init__(self, block, num_classes=10):
    """Build the left/right-branch pre-activation ResNet.

    Args:
        block: Block type forwarded to ``self.make_layer``.
        num_classes: Number of output classes. It sizes both the ``fc``
            weight and the final 1-D batch norm.
    """
    super(PreActResNet_MR, self).__init__()
    self.register_parameter('conv0', init_weight(64, 3, 3, 3))

    self.layer1_left = self.make_layer(block, 64, 64)
    self.layer1_right = self.make_layer(block, 64, 64)

    self.layer20_left = self.make_layer(block, 64, 128)
    self.layer20_right = self.make_layer(block, 64, 128)
    self.layer21_left = self.make_layer(block, 128, 128)
    self.layer21_right = self.make_layer(block, 128, 128)

    self.layer30_left = self.make_layer(block, 128, 256)
    self.layer30_right = self.make_layer(block, 128, 256)
    self.layer31_left = self.make_layer(block, 256, 256)
    self.layer31_right = self.make_layer(block, 256, 256)

    self.layer40_left = self.make_layer(block, 256, 512)
    self.layer40_right = self.make_layer(block, 256, 512)
    self.layer41_left = self.make_layer(block, 512, 512)
    self.layer41_right = self.make_layer(block, 512, 512)

    self.bn4 = nn.BatchNorm2d(512)
    self.register_parameter('fc', init_weight(num_classes, 512))
    # The batch norm must match the width of the fc weight; it used to be
    # hard-coded to 10, which only agrees with the default num_classes.
    self.bn5 = nn.BatchNorm1d(num_classes)
    self.ls = nn.LogSoftmax(dim=1)
def __init__(self, input_dim, output_dim, activation=None, learning_rate=0.2):
    """Create a layer with its activator, weight matrix and bias.

    Args:
        input_dim: Number of neurons in the previous layer.
        output_dim: Number of neurons in this layer.
        activation: ``None`` (identity), ``"sigmoid"``, ``"tanh"``,
            ``"relu"`` or ``"softmax"``.
        learning_rate: Stored on the instance as ``self.learning_rate``.

    Raises:
        Exception: If ``activation`` is not one of the supported names.
    """
    self.input_dim = input_dim  # number of neurons in the previous layer
    self.output_dim = output_dim  # number of neurons in this layer
    self.learning_rate = learning_rate

    # Names are compared with ``==``: ``is`` tests object identity and only
    # matched by accident for interned string literals.
    if activation is None:
        self.activator = IdentityActivator()
    elif activation == "sigmoid":
        self.activator = SigmoidActivator()
    elif activation == "tanh":
        self.activator = TanhActivator()
    elif activation == "relu":
        self.activator = ReluActivator()
    elif activation == "softmax":
        self.activator = SoftmaxActivator()
    else:
        raise Exception('Non-supported activation function')

    # Initialize the weight matrix and the bias term.
    self.W = init_weight(self.output_dim, self.input_dim)
    self.b = init_weight(self.output_dim, 1)
def train(args, local_rank, distributed):
    """Build an EfficientNet, set up optimisation and run ``do_train``.

    Args:
        args: Options object; reads ``arch``, ``lr``, ``momentum``,
            ``weight_decay``, ``epochs`` and ``float16``.
        local_rank: Device index used for DistributedDataParallel.
        distributed: When truthy, wrap the model in
            DistributedDataParallel.

    Returns:
        The (possibly wrapped) model after ``do_train`` has run.
    """
    model = EfficientNet.from_name(args.arch)
    init_weight(model)

    device = torch.device("cuda")
    model.to(device)

    optimizer = torch.optim.SGD(
        model.parameters(),
        args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )
    # Multiply the learning rate by 0.9 at epochs 3, 6, 9, ...
    milestones = list(range(3, args.epochs, 3))
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones, gamma=0.9)

    # 'O1' when float16 is requested, 'O0' otherwise.
    amp_opt_level = 'O1' if args.float16 else 'O0'
    model, optimizer = amp.initialize(
        model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # To update the BN parameters, keep the option below
            # commented out.
            # broadcast_buffers=False,
        )

    do_train(args, model, optimizer, scheduler, device)
    return model
def rand_init(self):
    """Initialise the layer weights and reset the ``fioux`` bias."""
    # Weights of all three layers go through the shared initialiser.
    for layer in (self.fioux, self.iouh, self.fh):
        utils.init_weight(layer.weight)

    # Zero the whole bias, then set the first mem_dim entries (the
    # forget-gate slice, per the original comment) to 1.
    bias = self.fioux.bias.data
    bias.zero_()
    bias[:self.mem_dim] = 1
def __init__(self, hidden_size, word_emb_size):
    """Create the attention weight and the softmax layer.

    Args:
        hidden_size: Hidden size; the weight has ``2 * hidden_size`` rows.
        word_emb_size: Word-embedding size; the weight's column count.
    """
    super(CtxAtt, self).__init__()
    self.hidden_size = hidden_size
    self.word_emb_size = word_emb_size

    # The tensor starts uninitialised and is filled by utils.init_weight.
    raw_weight = torch.FloatTensor(hidden_size * 2, word_emb_size)
    self.att_weight = nn.Parameter(raw_weight)
    utils.init_weight(self.att_weight)

    self.softmax = nn.Softmax()
def rand_init(self):
    """Initialise the layer weights and reset the ``grzx`` bias."""
    # Weights of all three layers go through the shared initialiser.
    for layer in (self.grzx, self.rzh, self.gh):
        utils.init_weight(layer.weight)

    # Zero the whole bias, then set everything from index mem_dim onwards
    # to 1 (the z and r gate biases, per the original comment).
    bias = self.grzx.bias.data
    bias.zero_()
    bias[self.mem_dim:] = 1
def __init__(self, hidden_size, word_emb_size):
    """Create the two weight matrices of the loss module.

    Args:
        hidden_size: Hidden size; contributes ``2 * hidden_size`` to the
            feature width.
        word_emb_size: Word-embedding size; contributes
            ``2 * word_emb_size`` to the feature width.
    """
    super(SigmoidLoss, self).__init__()
    feature_dim = hidden_size * 2 + word_emb_size * 2

    # weight0 is square over the feature width; weight1 has one column.
    self.weight0 = nn.Parameter(torch.zeros(feature_dim, feature_dim))
    self.weight1 = nn.Parameter(torch.zeros(feature_dim, 1))

    for parameter in (self.weight0, self.weight1):
        utils.init_weight(parameter)
def rand_init(self):
    """Initialise the four layer weights and reset two of the biases."""
    for layer in (self.ioux, self.iouh, self.fx, self.fh):
        utils.init_weight(layer.weight)

    # ioux bias is zeroed; every entry of the fx bias (the forget-gate
    # bias, per the original comment) is set to 1.
    self.ioux.bias.data.zero_()
    self.fx.bias.data[:] = 1
def __init__(self, emb_size):
    """Create the attention weight and, if configured, the gate weight.

    Args:
        emb_size: Embedding size; the attention weight has
            ``3 * emb_size`` rows and the gate weight is square with side
            ``4 * emb_size``.
    """
    super(AttentionFlowLayer, self).__init__()
    self.att_w = nn.Parameter(torch.FloatTensor(3 * emb_size, 1))
    utils.init_weight(self.att_w)
    self.softmax = nn.Softmax()

    # The gate weight only exists when config['gate'] is truthy.
    if not config['gate']:
        return

    gate_dim = 4 * emb_size
    scale = config['weight_scale']
    self.gate_weight = nn.Parameter(torch.FloatTensor(gate_dim, gate_dim))
    # NOTE(review): torch.nn.init.uniform looks like the deprecated
    # spelling of uniform_ -- confirm against the installed PyTorch.
    torch.nn.init.uniform(self.gate_weight, -scale, scale)
def __init__(self):
    """Create the three weight matrices of the controller."""
    # Width of every controller layer, including the controller output.
    self.size = 128

    # Per the original note: one read head yields a read vector of size 10
    # and the external input has 5 bits, so the first layer takes 15 inputs.
    self.fc_1 = init_weight(15, self.size)
    self.fc_2 = init_weight(self.size, self.size)
    # This layer produces the controller output.
    self.fc_3 = init_weight(self.size, self.size)
def __init__(self, controller_size, memory_slots, slot_size, batch_size):
    """Store the head dimensions and create its projection weights.

    Args:
        controller_size: Row count of every weight matrix.
        memory_slots: Stored as ``self.memory_slots``.
        slot_size: Column count of the ``controller->key`` weight.
        batch_size: Stored as ``self.batch_size``.
    """
    self.controller_size = controller_size
    self.memory_slots = memory_slots
    self.slot_size = slot_size
    self.batch_size = batch_size

    # (name, output width) of each projection. The shift weight has three
    # columns because, for now, only one shift backwards or forwards is
    # allowed.
    projections = (
        ("controller->key", self.slot_size),
        ("controller->shift", 3),
        ("controller->sharpen", 1),
        ("controller->strengthen", 1),
        ("controller->interpolation", 1),
    )
    self.weights = {
        name: init_weight(self.controller_size, width)
        for name, width in projections
    }
def __init__(self, emb_file):
    """Load an embedding table from a space-separated text file.

    The first line supplies the vocabulary size and the embedding size.
    Every later non-blank line supplies one vector: its first field is
    skipped and the remaining fields are parsed as floats.

    Args:
        emb_file: Path of the UTF-8 embedding file.
    """
    with codecs.open(emb_file, mode='rb', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f):
            text = raw_line.strip()
            if not text:
                continue
            fields = text.split(' ')

            if line_no == 0:
                # Header: 8 extra rows are reserved beyond the stated
                # vocabulary size.
                self.voc_size = int(fields[0]) + 8
                self.emb_size = int(fields[1])
                self.embedding_tensor = torch.zeros(
                    self.voc_size, self.emb_size)
                utils.init_weight(self.embedding_tensor)
                continue

            # The vector read from file line i is written to row i + 2.
            for col, value in enumerate(fields[1:]):
                self.embedding_tensor[line_no + 2, col] = float(value)
def rand_init(self, init_embedding=False):
    """Randomly initialise the sub-modules of the model.

    Args:
        init_embedding: Whether to randomly initialise the word embedding.
    """
    if init_embedding:
        utils.init_embedding(self.word_embeds.weight)
    # Position embeddings are only initialised when the model uses them.
    if self.position:
        utils.init_embedding(self.position_embeds.weight)

    utils.init_lstm(self.lstm)
    utils.init_linear(self.att2out)
    utils.init_weight(self.relation_embeds)
    self.attention.rand_init()

    # Reset the att_weight attribute.
    self.att_weight = None
def __init__(self, M1, M2, an_id):
    """Create the shared weight and bias of one hidden layer.

    Args:
        M1: Row count of the weight matrix.
        M2: Column count of the weight matrix and length of the bias.
        an_id: Identifier used in the names of the shared variables.
    """
    self.id = an_id
    self.M1 = M1
    self.M2 = M2

    weight_name = "W_%s" % self.id
    bias_name = "b_%s" % self.id
    self.W = theano.shared(init_weight(M1, M2), weight_name)
    self.b = theano.shared(np.zeros(M2), bias_name)
    self.params = [self.W, self.b]
def __init__(self, Mi, Mo, activation):
    """Create the shared parameters of the gated recurrent unit.

    Args:
        Mi: Row count of the input-to-hidden matrices (``Wx*``).
        Mo: Column count of every matrix, and the size of the
            hidden-to-hidden matrices (``Wh*``), biases and ``h0``.
        activation: Callable stored as ``self.f``.
    """
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # Candidate hidden state.
    Wxh = init_weight(Mi, Mo)
    Whh = init_weight(Mo, Mo)
    bh = np.zeros(Mo)
    h0 = np.zeros(Mo)
    # r gate.
    Wxr = init_weight(Mi, Mo)
    Whr = init_weight(Mo, Mo)
    br = np.zeros(Mo)
    # z gate. Whz is hidden-to-hidden like Whh and Whr, so it must be
    # (Mo, Mo); it used to be created as (Mi, Mo).
    Wxz = init_weight(Mi, Mo)
    Whz = init_weight(Mo, Mo)
    bz = np.zeros(Mo)

    self.Wxh = theano.shared(Wxh)
    self.Whh = theano.shared(Whh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wxr = theano.shared(Wxr)
    self.Whr = theano.shared(Whr)
    # Each gate bias is built from its own array (previously both were
    # shared from bh, leaving br and bz unused).
    self.br = theano.shared(br)
    self.Wxz = theano.shared(Wxz)
    self.Whz = theano.shared(Whz)
    self.bz = theano.shared(bz)
    self.params = [
        self.Wxh, self.Whh, self.bh, self.h0,
        self.Wxr, self.Whr, self.br,
        self.Wxz, self.Whz, self.bz,
    ]
def __init__(self, controller, output_size, memory_slots=32, slot_size=10,
             read_heads=1, batch_size=10):
    """
    NTM.__init__(controller, output_size, memory_slots, slot_size,
                 read_heads, batch_size) -> None

    Initializes a Neural Turing Machine.

    @param controller: another object that must provide a ``size``
        attribute and a ``get_weights`` method (and, per the original
        note, a ``forward`` method). It processes the read vectors and
        the external input, so the controller can be defined however the
        user wants.
    @param output_size: column count of the output weight matrix
    @param memory_slots: the value M, the number of memory slots available
    @param slot_size: the value N, how much data each slot can store
    @param read_heads: the number of read heads
    @param batch_size: batch size
    """
    self.controller = controller
    self.output_size = output_size
    # This is the OUTPUT size of the controller.
    self.controller_size = controller.size
    self.memory_slots = memory_slots
    self.slot_size = slot_size
    self.batch_size = batch_size

    # Read heads and the write head are built from the same dimensions.
    head_kwargs = dict(
        controller_size=self.controller_size,
        memory_slots=self.memory_slots,
        slot_size=self.slot_size,
        batch_size=self.batch_size,
    )
    self.read_heads = [readHead(**head_kwargs) for _ in range(read_heads)]
    self.write_head = writeHead(**head_kwargs)

    self.output_weight = init_weight(self.controller_size, self.output_size)

    # Collect every weight: output, read heads, write head, controller.
    self.weights = [self.output_weight]
    for head in self.read_heads:
        self.weights += head.get_weights()
    self.weights += self.write_head.get_weights()
    self.weights += self.controller.get_weights()
def __init__(self, hidden_size, word_emb_size, dropout_p=fg_config['dropout']):
    """Create the parameters and rank weights of the WARP loss.

    Args:
        hidden_size: Hidden size; contributes ``2 * hidden_size`` to the
            input feature width.
        word_emb_size: Word-embedding size; added to the input width and
            used as the output width.
        dropout_p: Dropout probability (defaults to
            ``fg_config['dropout']``).

    Raises:
        ValueError: If ``fg_config['data']`` is not ``'onto'``, ``'wiki'``
            or ``'bbn'``.
    """
    super(WARPLoss, self).__init__()

    dataset = fg_config['data']
    if dataset == 'onto':
        require_type_lst = utils.get_ontoNotes_train_types()
    elif dataset == 'wiki':
        require_type_lst = utils.get_wiki_types()
    elif dataset == 'bbn':
        require_type_lst = utils.get_bbn_types()
    else:
        # Previously this fell through to len(None) and raised an opaque
        # TypeError.
        raise ValueError("Unsupported fg_config['data']: %r" % (dataset,))
    num_labels = len(require_type_lst)

    feature_dim = hidden_size * 2 + word_emb_size
    self.weight = nn.Parameter(torch.zeros(feature_dim, word_emb_size))
    utils.init_weight(self.weight)

    # rank_weights[k] = 1/1 + 1/2 + ... + 1/(k + 1). The increment used to
    # be written `1.0 / i + 1`, which parses as (1.0 / i) + 1.
    self.rank_weights = [1.0 / 1]
    for i in range(1, num_labels):
        self.rank_weights.append(self.rank_weights[i - 1] + 1.0 / (i + 1))

    self.trans = nn.Linear(feature_dim, word_emb_size)
    utils.init_linear(self.trans)
    self.activate = nn.ReLU()
    self.dropout = nn.Dropout(dropout_p)
def __init__(self, block, num_classes=10):
    """Build the pre-activation ResNet with 1x1 shortcut parameters.

    Args:
        block: Block type forwarded to ``self.make_layer``.
        num_classes: Number of output classes. It sizes both the ``fc``
            weight and the final 1-D batch norm.
    """
    super(PreActResNet_MR, self).__init__()
    self.register_parameter('conv0', init_weight(64, 3, 3, 3))

    self.layer1 = self.make_layer(block, 64, 64)

    # Each widening stage gets a 1x1 shortcut weight of matching shape.
    self.layer20 = self.make_layer(block, 64, 128)
    self.register_parameter('shortcut1', init_weight(128, 64, 1, 1))
    self.layer21 = self.make_layer(block, 128, 128)

    self.layer30 = self.make_layer(block, 128, 256)
    self.register_parameter('shortcut2', init_weight(256, 128, 1, 1))
    self.layer31 = self.make_layer(block, 256, 256)

    self.layer40 = self.make_layer(block, 256, 512)
    self.register_parameter('shortcut3', init_weight(512, 256, 1, 1))
    self.layer41 = self.make_layer(block, 512, 512)

    self.bn4 = nn.BatchNorm2d(512)
    self.register_parameter('fc', init_weight(num_classes, 512))
    # The batch norm must match the width of the fc weight; it used to be
    # hard-coded to 10, which only agrees with the default num_classes.
    self.bn5 = nn.BatchNorm1d(num_classes)
    self.ls = nn.LogSoftmax(dim=1)
def __init__(self, hidden_size, word_emb_size):
    """Create the attention parameters selected by ``fg_config['att']``.

    Args:
        hidden_size: Hidden size; the first weight has ``2 * hidden_size``
            rows.
        word_emb_size: Word-embedding size; column count of ``att_weight``.
    """
    super(NZCtxAtt, self).__init__()
    self.hidden_size = hidden_size
    self.word_emb_size = word_emb_size

    att_mode = fg_config['att']
    if att_mode in ('label_att', 'no'):
        # Both modes create the same single attention weight.
        self.att_weight = nn.Parameter(
            torch.FloatTensor(hidden_size * 2, word_emb_size))
        utils.init_weight(self.att_weight)
    elif att_mode == 'orig_att':
        # Two weights chained through a dimension of size fg_config['Da'].
        self.We = nn.Parameter(
            torch.FloatTensor(hidden_size * 2, fg_config['Da']))
        utils.init_weight(self.We)
        self.Wa = nn.Parameter(torch.FloatTensor(fg_config['Da'], 1))
        utils.init_weight(self.Wa)

    self.softmax = nn.Softmax()
def __init__(self, units, input_dim):
    """Create the gate weights, biases and their gradient buffers.

    Args:
        units: Row count of each weight matrix and length of each bias.
        input_dim: Input size; each weight matrix has
            ``input_dim + units`` columns.
    """
    self.units = units
    self.input_dim = input_dim
    weight_shape = (units, input_dim + units)

    # Weight matrices of the g, i, f and o gates.
    self.wg = init_weight(*weight_shape)
    self.wi = init_weight(*weight_shape)
    self.wf = init_weight(*weight_shape)
    self.wo = init_weight(*weight_shape)
    # Matching bias vectors.
    self.bg = init_weight(units)
    self.bi = init_weight(units)
    self.bf = init_weight(units)
    self.bo = init_weight(units)

    # Derivative of the loss function w.r.t. every parameter, zeroed.
    self.wg_diff = np.zeros(weight_shape)
    self.wi_diff = np.zeros(weight_shape)
    self.wf_diff = np.zeros(weight_shape)
    self.wo_diff = np.zeros(weight_shape)
    self.bg_diff = np.zeros(units)
    self.bi_diff = np.zeros(units)
    self.bf_diff = np.zeros(units)
    self.bo_diff = np.zeros(units)
def main():
    """Train the paired image-translation generator and its discriminator.

    Reads the options from ``get_opt()``, builds ``netG_A`` and ``netD_B``,
    then for every epoch alternates one generator step (GAN loss + weighted
    L1 loss) and one discriminator step per mini-batch. After each epoch it
    steps the schedulers and writes the model weights, a resume checkpoint
    and the mean epoch losses under ``model/``.
    """
    import os  # local import: only needed to create the output directory

    # Get training options
    opt = get_opt()
    device = torch.device("cuda") if opt.cuda else torch.device("cpu")

    # Define the networks
    # netG_A: used to transfer an image from domain A to domain B
    netG_A = networks.Generator(opt.input_nc, opt.output_nc, opt.ngf,
                                opt.n_res, opt.dropout)
    if opt.u_net:
        netG_A = networks.U_net(opt.input_nc, opt.output_nc, opt.ngf)
    # netD_B: scores a B image concatenated with its A input along dim 1
    netD_B = networks.Discriminator(opt.input_nc + opt.output_nc, opt.ndf)

    # Initialize the networks
    if opt.cuda:
        netG_A.cuda()
        netD_B.cuda()
    utils.init_weight(netG_A)
    utils.init_weight(netD_B)

    if opt.pretrained:
        # map_location lets a checkpoint saved on GPU load on a CPU-only run.
        netG_A.load_state_dict(
            torch.load('pretrained/netG_A.pth', map_location=device))
        netD_B.load_state_dict(
            torch.load('pretrained/netD_B.pth', map_location=device))

    # Define the loss functions
    criterion_GAN = utils.GANLoss()
    if opt.cuda:
        criterion_GAN.cuda()
    criterion_l1 = torch.nn.L1Loss()

    # Define the optimizers
    optimizer_G = torch.optim.Adam(netG_A.parameters(), lr=opt.lr,
                                   betas=(opt.beta1, 0.999))
    optimizer_D_B = torch.optim.Adam(netD_B.parameters(), lr=opt.lr,
                                     betas=(opt.beta1, 0.999))

    # Create learning rate schedulers
    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        optimizer_G,
        lr_lambda=utils.Lambda_rule(opt.epoch, opt.n_epochs,
                                    opt.n_epochs_decay).step)
    lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_B,
        lr_lambda=utils.Lambda_rule(opt.epoch, opt.n_epochs,
                                    opt.n_epochs_decay).step)

    # Define the transform, and load the data
    transform = transforms.Compose([
        transforms.Resize((opt.sizeh, opt.sizew)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    dataloader = DataLoader(
        PairedImage(opt.rootdir, transform=transform, mode='train'),
        batch_size=opt.batch_size, shuffle=True, num_workers=opt.n_cpu)

    # numpy arrays to store the mean loss of each epoch
    total_epochs = opt.n_epochs + opt.n_epochs_decay
    loss_G_array = np.zeros(total_epochs)
    loss_D_B_array = np.zeros(total_epochs)

    # Create the output directory up front so the first save cannot fail
    # after a whole epoch of training.
    os.makedirs('model', exist_ok=True)

    # Training
    for epoch in range(opt.epoch, total_epochs):
        start = time.strftime("%H:%M:%S")
        print("current epoch :", epoch, " start time :", start)
        # Empty lists to store the loss of each mini-batch
        loss_G_list = []
        loss_D_B_list = []

        for i, batch in enumerate(dataloader):
            # Progress report; i >= 1 here, so both lists are non-empty.
            if i % 20 == 1:
                print("current step: ", i)
                current = time.strftime("%H:%M:%S")
                print("current time :", current)
                print("last loss G_A:", loss_G_list[-1],
                      "last loss D_B:", loss_D_B_list[-1])
            real_A = batch['A'].to(device)
            real_B = batch['B'].to(device)

            # ---- Train the generator ----
            utils.set_requires_grad([netG_A], True)
            optimizer_G.zero_grad()
            # Compute the fake image
            fake_B = netG_A(real_A)
            # The discriminator requires no gradients while optimizing
            # the generator
            utils.set_requires_grad([netD_B], False)
            # GAN loss
            prediction_fake_B = netD_B(torch.cat((fake_B, real_A), dim=1))
            loss_gan = criterion_GAN(prediction_fake_B, True)
            # L1 loss, weighted by opt.l1_loss
            loss_l1 = criterion_l1(real_B, fake_B) * opt.l1_loss
            # Total generator loss
            loss_G = loss_gan + loss_l1
            loss_G_list.append(loss_G.item())
            loss_G.backward()
            optimizer_G.step()

            # ---- Train the discriminator D_B ----
            utils.set_requires_grad([netG_A], False)
            utils.set_requires_grad([netD_B], True)
            optimizer_D_B.zero_grad()
            # Real images
            pred_real = netD_B(torch.cat((real_B, real_A), dim=1))
            loss_D_real = criterion_GAN(pred_real, True)
            # Fake images, regenerated after the generator step above
            fake_B = netG_A(real_A)
            pred_fake = netD_B(torch.cat((fake_B, real_A), dim=1))
            loss_D_fake = criterion_GAN(pred_fake, False)
            # Total discriminator loss
            loss_D_B = (loss_D_real + loss_D_fake) * 0.5
            loss_D_B_list.append(loss_D_B.item())
            loss_D_B.backward()
            optimizer_D_B.step()

        # Update the learning rate
        lr_scheduler_G.step()
        lr_scheduler_D_B.step()

        # Save model checkpoints
        torch.save(netG_A.state_dict(), 'model/netG_A_pix.pth')
        torch.save(netD_B.state_dict(), 'model/netD_B_pix.pth')
        # Save other checkpoint information
        checkpoint = {
            'epoch': epoch,
            'optimizer_G': optimizer_G.state_dict(),
            'optimizer_D_B': optimizer_D_B.state_dict(),
            'lr_scheduler_G': lr_scheduler_G.state_dict(),
            'lr_scheduler_D_B': lr_scheduler_D_B.state_dict(),
        }
        torch.save(checkpoint, 'model/checkpoint.pth')

        # Update the numpy arrays that record the loss
        loss_G_array[epoch] = sum(loss_G_list) / len(loss_G_list)
        loss_D_B_array[epoch] = sum(loss_D_B_list) / len(loss_D_B_list)
        np.savetxt('model/loss_G.txt', loss_G_array)
        np.savetxt('model/loss_D_B.txt', loss_D_B_array)

        end = time.strftime("%H:%M:%S")
        print("current epoch :", epoch, " end time :", end)
        print("G loss :", loss_G_array[epoch],
              "D_B loss :", loss_D_B_array[epoch])
def fit(self, fpath=None, data=None, n_features=None, n_epoch=1, r=1, c=1,
        **kargs):
    """Train the weight vector with per-sample hinge-loss updates.

    Parameters
    -------------
    fpath: str. file path of the training set
    data: DataFrame. training set, needed only if fpath is not given;
        read through its 'X' and 'y' columns
    n_features: int. # features; it is always re-derived from the data
    r: float. initial learning rate, must be <= 1; epoch t uses r / t
    c: float. tradeoff between regularizer and loss
    n_epoch: int. max epoch

    Returns
    --------
    self: object
    """
    assert (r <= 1), "Learning rate must be no more than one!"

    if data is None:
        # if data is not given, read it from file
        n_features, n_samples, data_train = get_data(fpath)
    else:
        # else use the data directly and infer its dimensions
        data_train = data
        n_samples = data_train.shape[0]
        x_temp = data_train.iloc[0]['X']
        if isinstance(x_temp, list):
            n_features = len(x_temp)
        else:
            n_features = x_temp.shape[0]

    # 1. initialize weight, r
    w = init_weight(n_features)
    r_0 = r

    # 2. for epoch = 1...T
    for epoch in range(1, n_epoch + 1):
        # shuffle the training set and renumber its rows 0..n-1
        shuffled = shuffle_samples(data_train, epoch)
        shuffled.index = range(shuffled.shape[0])
        self.data_train = shuffled

        # for each example, update weight with a diminishing rate
        r = r_0 / epoch
        for t in range(n_samples):
            y = shuffled.loc[t, 'y']
            x = shuffled.loc[t, 'X']
            if isinstance(x, list):
                x = np.array(x)
            assert (w.shape == x.shape), "dim(w) != dim(x)"
            loss = y * np.dot(w, x)
            if loss <= 1:
                w = (1 - r) * w + r * c * y * x
            else:
                w = (1 - r) * w

        # print objective (uses the value computed for the last sample)
        jw = 0.5 * np.dot(w, w) + c * max(0, 1 - loss)
        print("Epoch = %s J(w) = %1.4f" % (epoch, jw))

    # 3. store w and return the fitted object, as documented
    self.weight = w
    return self
def fit(self, fpath=None, data=None, n_features=None, n_epoch=1, r=1,
        sigma=1, **kargs):
    """Train the weight vector with per-sample logistic-loss updates.

    Parameters
    -------------
    fpath: str. file path of the training set
    data: DataFrame. training set, needed only if fpath is not given;
        column 0 is read as the label y and column 1 as the features X
    n_features: int. # features; it is always re-derived from the data
    r: float. learning rate
    sigma: float. tradeoff; the regularizer is scaled by 1 / sigma**2
    n_epoch: int. epoch

    Returns
    --------
    self: object
    """
    # 0. get training set
    if data is None:
        n_features, n_samples, data_train = get_data(fpath)
    else:
        data_train = data
        n_samples = data_train.shape[0]
        x_temp = data_train.iloc[0]['X']
        if isinstance(x_temp, list):
            n_features = len(x_temp)
        else:
            n_features = x_temp.shape[0]

    # 1. initialize weight
    w = init_weight(n_features)

    # 2. for each epoch:
    for epoch in range(1, n_epoch + 1):
        jw = []
        # (1) shuffle training set
        data_train = shuffle_samples(data_train, epoch)
        # (2) update weight
        for i in range(n_samples):
            y = data_train.iloc[i, 0]
            x = data_train.iloc[i, 1]
            # List-valued features are accepted when inferring n_features
            # above, so convert them here; a plain list has no ``.shape``.
            if isinstance(x, list):
                x = np.array(x)
            # Check the shapes before the vectors are combined.
            assert (w.shape == x.shape), "dim(w) != dim(x)"
            grad = (2 / sigma**2) * w - (y * x) / (
                1 + np.exp(y * np.dot(w, x)))
            assert (grad.shape == w.shape), "dim(w) != dim(gradient)"
            w = w - r * grad
            jw.append((1 / sigma**2) * np.dot(w, w) +
                      np.log(1 + np.exp(-y * np.dot(w, x))))
        # print the mean objective over the epoch
        jw = np.mean(jw)
        print("Epoch = %s J(w) = %1.4f" % (epoch, jw))

    # 3. store w and return the fitted object
    self.weight = w
    return self
def fit(self, X, Y, learning_rate=10e-3, mu=0.99, reg=10e-12, eps=10e-10,
        epochs=400, batch_sz=20, print_period=1, show_fig=False):
    """Build the network and train it with mini-batch momentum updates.

    Args:
        X: 2-D input array (its shape is unpacked as ``N, D``).
        Y: Label array; cast to int32.
        learning_rate: Step size of the updates.
        mu: Momentum coefficient.
        reg: Weight of the sum-of-squares penalty over all parameters.
        eps: Unused.
        epochs: Number of passes over the data.
        batch_sz: Mini-batch size.
        print_period: Record and print the cost every this many batches.
        show_fig: When truthy, plot the recorded costs at the end.
    """
    Y = Y.astype(np.int32)
    N, D = X.shape
    K = len(set(Y))

    # Chain the hidden layers: each one takes the previous size as input.
    self.hidden_layers = []
    M1 = D
    for layer_id, M2 in enumerate(self.hidden_layer_sizes):
        self.hidden_layers.append(HiddenLayer(M1, M2, layer_id))
        M1 = M2

    # Final (logistic regression) layer.
    W = init_weight(M1, K)
    b = np.zeros(K)
    self.W = theano.shared(W, "W_logreg")
    self.b = theano.shared(b, "b_logreg")

    self.params = [self.W, self.b]
    for layer in self.hidden_layers:
        self.params += layer.params

    # One zero-initialised momentum buffer per parameter.
    dparams = [
        theano.shared(np.zeros(p.get_value().shape)) for p in self.params
    ]

    thX = T.matrix('X')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.predict(thX)
    grads = T.grad(cost, self.params)

    param_updates = [
        (p, p + mu * dp - learning_rate * g)
        for p, dp, g in zip(self.params, dparams, grads)
    ]
    momentum_updates = [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]
    train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=param_updates + momentum_updates,
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            start = j * batch_sz
            Xbatch = X[start:(start + batch_sz)]
            Ybatch = Y[start:(start + batch_sz)]

            c, p = train_op(Xbatch, Ybatch)

            if j % print_period != 0:
                continue
            costs.append(c)
            e = np.mean(Ybatch != p)
            print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                  "error_rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, epochs=500,
        show_fig=False, activation=T.tanh):
    """Train the recurrent network one sample at a time with momentum.

    The last 10 shuffled samples are held out for validation.

    Args:
        X: Sequence of input samples.
        Y: Sequence of labels, one per sample.
        learning_rate: Initial step size; multiplied by 0.9999 after
            every epoch.
        mu: Momentum coefficient.
        reg: Unused.
        epochs: Number of passes over the training samples.
        show_fig: When truthy, plot the per-epoch cost at the end.
        activation: Activation forwarded to ``self.set``.
    """
    M = self.M
    V = self.V
    K = len(set(Y))

    # Hold out the last Nvalid shuffled samples for validation.
    X, Y = shuffle(X, Y)
    Nvalid = 10
    Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:]
    X, Y = X[:-Nvalid], Y[:-Nvalid]
    N = len(X)

    Wx = init_weight(V, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, K)
    bo = np.zeros(K)

    # self.set is expected to create self.params (and self.predict_op,
    # used below) -- presumably; verify against its definition.
    thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo,
                                          activation)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    # Was `self.param`, a typo for the list used everywhere else.
    grad = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]
    lr = T.scalar('learning_rate')

    # The zip order must match the (p, dp, g) unpacking. It used to be
    # zip(self.params, grad, dparams), which swapped the gradient and the
    # momentum buffer in the parameter update.
    updates = [(p, p + mu * dp - lr * g)
               for p, dp, g in zip(self.params, dparams, grad)
               ] + [(dp, mu * dp - lr * g) for dp, g in zip(dparams, grad)]

    self.train_op = theano.function(
        inputs=[thX, thY, lr],
        outputs=[cost, prediction],
        updates=updates,
        allow_input_downcast=True,
    )

    costs = []
    for i in range(epochs):
        cost = 0
        Ncorrect = 0
        X, Y = shuffle(X, Y)
        for j in range(N):
            c, p = self.train_op(X[j], Y[j], learning_rate)
            cost += c
            if p == Y[j]:
                Ncorrect += 1
        costs.append(cost)
        learning_rate *= 0.9999

        # Validation accuracy on the held-out samples.
        NVcorrect = 0
        for j in range(Nvalid):
            predicted = self.predict_op(Xvalid[j])
            if predicted == Yvalid[j]:
                NVcorrect += 1

        print('epoch: %d, cost: %f ,accuracy: %f' % (i, cost, Ncorrect / N))
        print('Validation accuracy: ', NVcorrect / Nvalid)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh,
        epochs=500, show_fig=False):
    """Train the recurrent model to predict the next item of each sequence.

    Every sequence is fed with index 0 prepended and must predict the same
    sequence with index 1 appended.

    Args:
        X: List of sequences, each a list of integer indices.
        learning_rate: Step size of the updates.
        mu: Momentum coefficient.
        reg: Unused.
        activation: Hidden activation, stored as ``self.f``.
        epochs: Number of passes over the sequences.
        show_fig: When truthy, plot the per-epoch cost at the end.
    """
    N = len(X)
    D = self.D
    M = self.M
    V = self.V
    self.f = activation

    # Initial values, created in this order.
    We = init_weight(V, D)
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, V)
    bo = np.zeros(V)

    self.We = theano.shared(We)
    self.Wx = theano.shared(Wx)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [
        self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
    ]

    thX = T.ivector('X')
    Ei = self.We[thX]  # TxD
    thY = T.ivector('Y')

    def recurrence(x_t, h_t1):
        # Returns h(t), y(t).
        h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
        y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
        return h_t, y_t

    [h, y], _ = theano.scan(
        fn=recurrence,
        outputs_info=[self.h0, None],
        sequences=Ei,
        n_steps=Ei.shape[0],
    )

    # Drop the middle axis of the scan output.
    py_x = y[:, 0, :]
    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]

    param_updates = [
        (p, p + mu * dp - learning_rate * g)
        for p, dp, g in zip(self.params, dparams, grads)
    ]
    momentum_updates = [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]

    self.predict_op = theano.function(
        inputs=[thX],
        outputs=prediction,
    )
    self.train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=param_updates + momentum_updates,
    )

    costs = []
    # Each sentence yields len(sentence) + 1 predictions.
    n_total = sum((len(sentence) + 1) for sentence in X)
    for i in range(epochs):
        X = shuffle(X)
        cost = 0
        n_correct = 0
        for j in range(N):
            input_sequence = [0] + X[j]
            output_sequence = X[j] + [1]

            c, p = self.train_op(input_sequence, output_sequence)
            cost += c
            for predicted, target in zip(p, output_sequence):
                if predicted == target:
                    n_correct += 1
        print('i:', i, 'cost:', cost, 'correct rate:',
              (float(n_correct) / n_total))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    """Build the TensorFlow graph and train on the given trees.

    Args:
        trees: Training trees; each is unpacked as
            ``(words, left_children, right_children, labels)``.
        test_trees: Trees in the same format, scored after every epoch.
        reg: Weight of the L2 penalty over ``self.weights``.
        epochs: Number of passes over ``trees``.
        train_inner_nodes: When truthy, the cost covers every node whose
            label is >= 0; otherwise only the last node of each tree.
    """
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    # Third-order tensors for the quadratic terms.
    W11 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = tf.Variable(We.astype(np.float32))
    self.W11 = tf.Variable(W11.astype(np.float32))
    self.W22 = tf.Variable(W22.astype(np.float32))
    self.W12 = tf.Variable(W12.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))
    # Only the weight tensors are regularised; the biases are left out.
    self.weights = [
        self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo
    ]

    words = tf.placeholder(tf.int32, shape=(None, ), name='words')
    left_children = tf.placeholder(tf.int32, shape=(None, ),
                                   name='left_children')
    right_children = tf.placeholder(tf.int32, shape=(None, ),
                                    name='right_children')
    labels = tf.placeholder(tf.int32, shape=(None, ), name='labels')

    # save for later
    self.words = words
    self.left = left_children
    self.right = right_children
    self.labels = labels

    def dot1(a, B):
        return tf.tensordot(a, B, axes=[[0], [1]])

    def dot2(B, a):
        return tf.tensordot(B, a, axes=[[1], [0]])

    def recursive_net_transform(hiddens, n):
        # Combine the hidden states of the two children of node n.
        h_left = hiddens.read(left_children[n])
        h_right = hiddens.read(right_children[n])
        return self.f(
            dot1(h_left, dot2(self.W11, h_left)) +
            dot1(h_right, dot2(self.W22, h_right)) +
            dot1(h_left, dot2(self.W12, h_right)) +
            dot1(h_left, self.W1) +
            dot1(h_right, self.W2) +
            self.bh)

    def recurrence(hiddens, n):
        w = words[n]
        # any non-word will have index -1
        h_n = tf.cond(w >= 0,
                      lambda: tf.nn.embedding_lookup(self.We, w),
                      lambda: recursive_net_transform(hiddens, n))
        hiddens = hiddens.write(n, h_n)
        n = tf.add(n, 1)
        return hiddens, n

    def condition(hiddens, n):
        # loop should continue while n < len(words)
        return tf.less(n, tf.shape(words)[0])

    hiddens = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                             clear_after_read=False, infer_shape=False)
    hiddens, _ = tf.while_loop(condition, recurrence,
                               [hiddens, tf.constant(0)],
                               parallel_iterations=1)
    h = hiddens.stack()
    logits = tf.matmul(h, self.Wo) + self.bo

    prediction_op = tf.argmax(logits, axis=1)
    self.prediction_op = prediction_op

    rcost = reg * sum(tf.nn.l2_loss(p) for p in self.weights)
    if train_inner_nodes:
        # filter out -1s
        labeled_indices = tf.where(labels >= 0)
        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tf.gather(logits, labeled_indices),
                labels=tf.gather(labels, labeled_indices),
            )) + rcost
    else:
        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits[-1],
                labels=labels[-1],
            )) + rcost

    train_op = tf.train.AdagradOptimizer(
        learning_rate=8e-3).minimize(cost_op)

    # NOTE: If you're using GPU, InteractiveSession breaks
    # AdagradOptimizer and some other optimizers;
    # tf.Session() is used for that reason.
    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    costs = []
    sequence_indexes = range(N)
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p, _ = self.session.run(
                (cost_op, prediction_op, train_op),
                feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                })
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      " Why don't you try decreasing the learning rate?")
                # Dump the sum of each weight tensor for diagnosis. This
                # used to call Theano's get_value() on TF variables via
                # self.params, which failed before exit() was reached.
                for weight in self.weights:
                    print(self.session.run(weight).sum())
                exit()
            cost += c
            # Accuracy is measured on the last node of each tree.
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                    % (it, N, float(n_correct) / n_total, cost))
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            p = self.session.run(prediction_op,
                                 feed_dict={
                                     words: words_,
                                     left_children: left,
                                     right_children: right,
                                     labels: lab
                                 })
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print("i:", i, "cost:", cost, "train acc:",
              float(n_correct) / n_total, "test acc:",
              float(n_test_correct) / n_test_total, "time for epoch:",
              (datetime.now() - t0))
        costs.append(cost)

    plt.plot(costs)
    plt.show()