def __init__(self, inputs, outputs, cost, scopes, **option):
    """
    :param inputs: list of input variables
    :param outputs: list of output variables
    :param cost: scalar cost expression to minimize
    :param scopes: variable scopes whose trainable parameters are updated
    :param option: optimizer options (algorithm, constraint, momentum, ...)
    """
    if "variables" not in option or not option["variables"]:
        # not fine-tuning
        params = [
            param for scope in scopes
            for param in ops.trainable_variables(scope)
        ]
        # regularization_loss = ops.get_regularization_loss(scopes)
        # if regularization_loss:
        #     cost += regularization_loss
        # if option["l2_scale"]:
        #     get_l2 = ops.l2_regularizer(option["l2_scale"])
        #     cost += reduce(T.add, [get_l2(param) for param in params])
    else:
        # fine-tuning
        pass
        # _logger.debug("loading specified params")
        # params = option["variables"]

    grads = theano.grad(cost, params)
    gradsref = grads
    # shared buffers that hold the gradients between optimize() and update()
    vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

    # fill in default options
    if "algorithm" not in option:
        option["algorithm"] = "sgd"
    if "variant" not in option:
        option["variant"] = None
    if "constraint" not in option:
        option["constraint"] = None
    if "momentum" not in option:
        option["momentum"] = False
    if "norm" not in option:
        option["norm"] = True
    if "nesterov" not in option:
        option["nesterov"] = False
    if "initialize" not in option:
        option["initialize"] = False
    if "nanguard" not in option:
        option["nanguard"] = False

    algorithm = option["algorithm"]
    variant = option["variant"]
    variant = [variant] if variant is not None else []

    if option["norm"]:
        # report the global gradient norm as an extra output
        normval = constraint.global_norm(grads)
        outputs = outputs[:]
        outputs.append(normval)

    if option["constraint"]:
        method, value = option["constraint"]
        if method == "value":
            grads = constraint.clip_by_value(grads, value[0], value[1])
        if method == "norm":
            grads = constraint.clip_by_global_norm(grads, value)

    if option["nanguard"]:
        # if the gradient norm is NaN/Inf, fall back to a small
        # weight-decay step (0.1 * p) instead of the unusable gradient
        gnorm = constraint.global_norm(gradsref)
        isnan = theano.tensor.isnan(gnorm)
        isinf = theano.tensor.isinf(gnorm)
        notfinite = theano.tensor.or_(isnan, isinf)
        newgrads = []
        for p, g in zip(params, grads):
            newgrads.append(theano.tensor.switch(notfinite, 0.1 * p, g))
        grads = newgrads

    if option["nesterov"]:
        option["momentum"] = False

    gup = []
    scan_updates = ops.get_updates()

    # append update rules collected from scan
    if isinstance(scan_updates, OrderedDict):
        for key, value in scan_updates.iteritems():
            gup.append((key, value))
    else:
        gup.extend(scan_updates)

    for v, g in zip(vec, grads):
        gup.append((v, g))

    if algorithm == "sgd":
        alpha = theano.tensor.scalar()
        hparams = [alpha]
        defaults = [("alpha", 1.0)]
        svar, pup = updates.sgd_updates(params, vec, *hparams)
    elif algorithm == "adagrad":
        alpha = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, epsilon]
        defaults = [("alpha", 1.0), ("epsilon", 1e-6)]
        svar, pup = updates.adagrad_updates(params, vec, *hparams)
    elif algorithm == "rmsprop":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon]
        defaults = [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)]
        rmsparam = hparams + variant
        svar, pup = updates.rmsprop_updates(params, vec, *rmsparam)
    elif algorithm == "rmsprop_momentum":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        momentum = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon, momentum]
        defaults = [("alpha", 1e-4), ("rho", 0.95), ("epsilon", 1e-4),
                    ("momentum", 0.9)]
        svar, pup = updates.rmsprop_momentum_updates(params, vec, *hparams)
    elif algorithm == "adadelta":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon]
        defaults = [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)]
        svar, pup = updates.adadelta_updates(params, vec, *hparams)
    elif algorithm == "adam":
        alpha = theano.tensor.scalar()
        beta1 = theano.tensor.scalar()
        beta2 = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, beta1, beta2, epsilon]
        defaults = [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999),
                    ("epsilon", 1e-8)]
        svar, pup = updates.adam_updates(params, vec, *hparams)
    else:
        raise ValueError("Error: " + algorithm + " is not supported")

    # restore variables used by the optimizer
    if option["initialize"]:
        values = option["initialize"]
        for v1, v2 in zip(svar, values):
            v1.set_value(v2)

    if option["momentum"]:
        momentum = theano.tensor.scalar()
        hparams.append(momentum)
        defaults.append(("momentum", 0.9))
        pup = updates.apply_momentum(pup, params, momentum)

    if option["nesterov"]:
        momentum = theano.tensor.scalar()
        hparams.append(momentum)
        defaults.append(("momentum", 0.9))
        pup = updates.apply_momentum(pup, params, momentum)

    optimize = theano.function(inputs, outputs, updates=gup,
                               on_unused_input='warn')
    update = theano.function(hparams, [], updates=pup,
                             on_unused_input='warn')

    def wrapper(**kwargs):
        # look up each hyperparameter by name, falling back to its default
        values = []
        for name, default in defaults:
            if name not in kwargs:
                kwargs[name] = default
            values.append(kwargs[name])
        return update(*values)

    self.optimize = optimize
    self.update = wrapper
    self.option = option
    self.algorithm = algorithm
    self.parameter = svar
def fit(self, x, valid=None, epochs=10, seq_length=25, sampling_temp=0.7,
        sample_freq=10, checkpoint_freq=10, checkpoints_dir='models',
        unk_char='*'):
    # NOTE: checkpoints are generated only when a validation set is provided
    # build the character vocabulary
    vocab = set(x)
    if self.vocab is None or vocab != self.vocab:
        self.vocab = vocab
        # special placeholder for out-of-vocabulary characters
        self.vocab.add(unk_char)
        self.vocab_size = len(self.vocab)
        self.ch_to_ix = {ch: i for i, ch in enumerate(self.vocab)}
        self.ix_to_ch = {v: k for k, v in self.ch_to_ix.iteritems()}
        print 'Vocab size:', self.vocab_size

    if self.train_fn is None:
        print 'Compiling the training functions'
        X = T.imatrix()
        self.params = self.init()
        y_hat, cost = self.model(X, self.dropout_p_hidden)
        pgrads = T.grad(cost, wrt=self.params)
        # gradient clipping to avoid exploding gradients
        if self.grad_clip > 0.:
            gnorm = T.sqrt(T.sum([T.sum(g ** 2) for g in pgrads]))
            # to clip gradients we use the following heuristic:
            # new_g = g * grad_clip / total_grad_norm
            pgrads = [T.switch(gnorm > self.grad_clip,
                               g * self.grad_clip / gnorm, g)
                      for g in pgrads]
        updates = adagrad(cost, self.params, grads=pgrads,
                          learning_rate=self.learning_rate)
        if self.momentum > 0.:
            # pass the coefficient by keyword so it is not mistaken for
            # the params argument of apply_momentum
            updates = apply_momentum(updates, momentum=self.momentum)
        self.train_fn = theano.function(inputs=[X], outputs=cost,
                                        updates=updates)
        self.cost_fn = theano.function(inputs=[X], outputs=cost)

    # convert strings to integer vectors
    x_ix = np.asarray([self.ch_to_ix[ch] for ch in x], dtype=np.int32)
    if valid is not None:
        # map out-of-vocabulary characters to the unk placeholder
        valid_ix = np.asarray([self.ch_to_ix.get(ch, self.ch_to_ix[unk_char])
                               for ch in valid], dtype=np.int32)
        if not os.path.exists(checkpoints_dir):
            os.makedirs(checkpoints_dir)

    # Let's check that the initial cost matches the expected one
    # print 'Expected initial cost:', np.log(len(vocab))
    # print 'Actual initial cost:', self.cost_fn(x_ix[:, None])

    # split the training sequence into equal blocks of length seq_length
    x_ix = self._split_sequences(x_ix, seq_length, padding_char=' ')
    # randomly shuffle the training sequences
    x_ix = x_ix[self.numpy_rng.permutation(x_ix.shape[0])]

    # then start training
    num_train_batches = -(-x_ix.shape[0] // self.batch_size)  # ceil division
    print 'Training started'
    train_cost_history = []
    if valid is not None:
        valid_cost_history = []
    for e in range(epochs):
        avg_cost = 0
        for bidx in range(num_train_batches):
            batch_x = x_ix[bidx * self.batch_size:(bidx + 1) * self.batch_size]
            # the network expects time as the first dimension
            batch_cost = self.train_fn(batch_x.transpose([1, 0]))
            train_cost_history.append(float(batch_cost))
            if np.isnan(batch_cost):
                print 'NaN cost detected. Abort'
                return
            avg_cost += batch_cost
        avg_cost /= num_train_batches
        if valid is not None:
            valid_cost = float(self.cost_fn(valid_ix[:, None]))
            valid_cost_history.append(valid_cost)
            print 'Epoch: {} Train Loss: {:.4f} Valid Loss: {:.4f}'.format(
                e + 1, avg_cost, valid_cost)
            if checkpoint_freq > 0 and (e + 1) % checkpoint_freq == 0:
                # pickle the current state of training
                chk_path = os.path.join(
                    checkpoints_dir,
                    'charrnn_vanilla_{}_epoch{}_t{:.4f}_v{:.4f}.pkl'.format(
                        len(self.rnn_layers), e, avg_cost, valid_cost))
                state = {
                    'epoch': e,
                    'train_cost_history': train_cost_history,
                    'valid_cost_history': valid_cost_history,
                    'train_cost': avg_cost,
                    'valid_cost': valid_cost,
                    'params': self.export_params(),
                    'vocab': self.vocab,
                    'rnn_layers': self.rnn_layers,
                    'batch_size': self.batch_size,
                    'learning_rate': self.learning_rate,
                    'dropout_p_hidden': self.dropout_p_hidden,
                    'momentum': self.momentum,
                    'grad_clip': self.grad_clip,
                }
                pkl.dump(state, open(chk_path, 'wb'), pkl.HIGHEST_PROTOCOL)
                print 'Written checkpoint:', chk_path
        else:
            print 'Epoch: {} Train Loss: {:.4f}'.format(e + 1, avg_cost)
        if (e + 1) % sample_freq == 0:
            print '\nSampled string:\n{}\n'.format(self.sample(seed_string=''))
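# `_split_sequences` is not shown in this section; the helper below is a
# hypothetical sketch of what it might do (pad the flat index vector to a
# multiple of seq_length, then reshape into one row per block). It takes the
# padding character's index rather than the character itself, so it is an
# assumption, not the original implementation.
def _split_sequences_sketch(x_ix, seq_length, padding_ix):
    import numpy as np
    n = x_ix.shape[0]
    n_blocks = -(-n // seq_length)  # ceil division, as in fit() above
    padded = np.full(n_blocks * seq_length, padding_ix, dtype=x_ix.dtype)
    padded[:n] = x_ix  # the tail keeps the padding index
    return padded.reshape(n_blocks, seq_length)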
def __init__(self, model, **option):
    # number of hyperparameters and their defaults for each algorithm
    information = {}
    information["sgd"] = (1, [1.0])
    information["adagrad"] = (2, [1.0, 1e-6])
    information["rmsprop"] = (3, [1e-2, 0.99, 1e-8])
    # torch default: 1.0, 0.9, 1e-6
    information["adadelta"] = (3, [1.0, 0.95, 1e-6])
    information["adam"] = (4, [0.001, 0.9, 0.999, 1e-8])
    information["rmsprop_momentum"] = (4, [1e-4, 0.95, 0.9, 1e-4])

    cost = model.cost
    params = model.parameter
    inputs = model.inputs
    outputs = model.outputs
    scan_updates = model.updates

    grads = theano.grad(cost, params)
    gradsref = grads
    # shared buffers that hold the gradients between optimize() and update()
    vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

    # fill in default options
    if "algorithm" not in option:
        option["algorithm"] = "sgd"
    if "variant" not in option:
        option["variant"] = None
    if "constraint" not in option:
        option["constraint"] = None
    if "momentum" not in option:
        option["momentum"] = False
    if "norm" not in option:
        option["norm"] = True
    if "nesterov" not in option:
        option["nesterov"] = False
    if "initialize" not in option:
        option["initialize"] = False
    if "nanguard" not in option:
        option["nanguard"] = True

    algorithm = option["algorithm"]
    variant = option["variant"]
    variant = [variant] if variant is not None else []

    if option["norm"]:
        # report the global gradient norm right after the first output
        normval = constraint.global_norm(grads)
        outputs = outputs[:]
        outputs.insert(1, normval)

    if option["constraint"]:
        method, value = option["constraint"]
        if method == "value":
            grads = constraint.clip_by_value(grads, value[0], value[1])
        if method == "norm":
            grads = constraint.clip_by_global_norm(grads, value)

    if option["nanguard"]:
        # if the gradient norm is NaN/Inf, fall back to a small
        # weight-decay step (0.1 * p) instead of the unusable gradient
        gnorm = constraint.global_norm(gradsref)
        isnan = theano.tensor.isnan(gnorm)
        isinf = theano.tensor.isinf(gnorm)
        notfinite = theano.tensor.or_(isnan, isinf)
        newgrads = []
        for p, g in zip(params, grads):
            newgrads.append(theano.tensor.switch(notfinite, 0.1 * p, g))
        grads = newgrads

    if option["nesterov"]:
        option["momentum"] = False

    gup = []

    # append update rules collected from scan
    if isinstance(scan_updates, OrderedDict):
        for key, value in scan_updates.iteritems():
            gup.append((key, value))
    else:
        gup.extend(scan_updates)

    for v, g in zip(vec, grads):
        gup.append((v, g))

    if algorithm == "sgd":
        alpha = theano.tensor.scalar()
        hparams = [alpha]
        defaults = [("alpha", 1.0)]
        svar, pup = updates.sgd_updates(params, vec, *hparams)
    elif algorithm == "adagrad":
        alpha = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, epsilon]
        defaults = [("alpha", 1.0), ("epsilon", 1e-6)]
        svar, pup = updates.adagrad_updates(params, vec, *hparams)
    elif algorithm == "rmsprop":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon]
        defaults = [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)]
        rmsparam = hparams + variant
        svar, pup = updates.rmsprop_updates(params, vec, *rmsparam)
    elif algorithm == "rmsprop_momentum":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        momentum = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon, momentum]
        defaults = [("alpha", 1e-4), ("rho", 0.95), ("epsilon", 1e-4),
                    ("momentum", 0.9)]
        svar, pup = updates.rmsprop_momentum_updates(params, vec, *hparams)
    elif algorithm == "adadelta":
        alpha = theano.tensor.scalar()
        rho = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, rho, epsilon]
        defaults = [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)]
        svar, pup = updates.adadelta_updates(params, vec, *hparams)
    elif algorithm == "adam":
        alpha = theano.tensor.scalar()
        beta1 = theano.tensor.scalar()
        beta2 = theano.tensor.scalar()
        epsilon = theano.tensor.scalar()
        hparams = [alpha, beta1, beta2, epsilon]
        defaults = [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999),
                    ("epsilon", 1e-8)]
        svar, pup = updates.adam_updates(params, vec, *hparams)
    else:
        raise ValueError("Error: " + algorithm + " is not supported")

    # restore variables used by the optimizer
    if option["initialize"]:
        values = option["initialize"]
        for v1, v2 in zip(svar, values):
            v1.set_value(v2)

    if option["momentum"]:
        momentum = theano.tensor.scalar()
        hparams.append(momentum)
        defaults.append(("momentum", 0.9))
        pup = updates.apply_momentum(pup, params, momentum)

    if option["nesterov"]:
        momentum = theano.tensor.scalar()
        hparams.append(momentum)
        defaults.append(("momentum", 0.9))
        pup = updates.apply_momentum(pup, params, momentum)

    optimize = theano.function(inputs, outputs, updates=gup)
    update = theano.function(hparams, [], updates=pup)

    def wrapper(**kwargs):
        # look up each hyperparameter by name, falling back to its default
        values = []
        for name, default in defaults:
            if name not in kwargs:
                kwargs[name] = default
            values.append(kwargs[name])
        return update(*values)

    self.optimize = optimize
    self.update = wrapper
    self.option = option
    self.algorithm = algorithm
    self.information = information
    self.parameter = svar
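# Sketch of saving and restoring optimizer state with the "initialize"
# option (an assumed workflow based on the code above; `Trainer` and `model`
# are hypothetical names). `trainer.parameter` holds the optimizer's slot
# variables (svar), so their values can be captured and fed back on restart:
#
#   saved = [v.get_value() for v in trainer.parameter]
#   # ... serialize `saved`, restart, rebuild `model` ...
#   trainer = Trainer(model, algorithm="adam", initialize=saved)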