Example #1
    def __init__(self, inputs, outputs, cost, scopes, **option):
        """

        :param model:
        :param option:
        """
        if "variables" not in option or not option["variables"]:
            # not fine-tuning: collect all trainable variables under the given scopes

            params = [
                param for scope in scopes
                for param in ops.trainable_variables(scope)
            ]
            # regularization_loss = ops.get_regularization_loss(scopes)
            # if regularization_loss:
            #     cost += regularization_loss
            # if option["l2_scale"]:
            #     get_l2 = ops.l2_regularizer(option["l2_scale"])
            #     cost += reduce(T.add, [get_l2(param) for param in params])

        else:
            # fine-tuning: only update the explicitly specified variables
            params = option["variables"]

        grads = theano.grad(cost, params)

        # keep a reference to the unclipped gradients for the NaN guard below
        gradsref = grads

        # shared buffers that hold the gradients computed by the optimize function
        vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

        if "algorithm" not in option:
            option["algorithm"] = "sgd"

        if "variant" not in option:
            option["variant"] = None

        if "constraint" not in option:
            option["constraint"] = None

        if "momentum" not in option:
            option["momentum"] = False

        if "norm" not in option:
            option["norm"] = True

        if "nesterov" not in option:
            option["nesterov"] = False

        if "initialize" not in option:
            option["initialize"] = False

        if "nanguard" not in option:
            option["nanguard"] = False

        algorithm = option["algorithm"]
        variant = option["variant"]
        variant = [variant] if variant is not None else []

        if option["norm"]:
            normval = constraint.global_norm(grads)
            outputs = outputs[:]
            outputs.append(normval)

        if option["constraint"]:
            method, value = option["constraint"]
            if method == "value":
                grads = constraint.clip_by_value(grads, value[0], value[1])
            if method == "norm":
                grads = constraint.clip_by_global_norm(grads, value)

        if option["nanguard"]:
            gnorm = constraint.global_norm(gradsref)
            isnan = theano.tensor.isnan(gnorm)
            isinf = theano.tensor.isinf(gnorm)
            notfinite = theano.tensor.or_(isnan, isinf)
            newgrads = []
            for p, g in zip(params, grads):
                newgrads.append(theano.tensor.switch(notfinite, 0.1 * p, g))
            grads = newgrads

        if option["nesterov"]:
            option["momentum"] = False

        gup = []
        scan_updates = ops.get_updates()

        # append update rules
        if isinstance(scan_updates, OrderedDict):
            for key, value in scan_updates.iteritems():
                gup.append((key, value))
        else:
            gup.extend(scan_updates)

        for v, g in zip(vec, grads):
            gup.append((v, g))

        if algorithm == "sgd":
            alpha = theano.tensor.scalar()
            hparams = [alpha]
            defaults = [("alpha", 1.0)]
            svar, pup = updates.sgd_updates(params, vec, *hparams)
        elif algorithm == "adagrad":
            alpha = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, epsilon]
            defaults = [("alpha", 1.0), ("epsilon", 1e-6)]
            svar, pup = updates.adagrad_updates(params, vec, *hparams)
        elif algorithm == "rmsprop":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)]
            rmsparam = hparams + variant
            svar, pup = updates.rmsprop_updates(params, vec, *rmsparam)
        elif algorithm == "rmsprop_momentum":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            momentum = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon, momentum]
            defaults = [("alpha", 1e-4), ("rho", 0.95), ("epsilon", 1e-4)]
            defaults.append(("moment", 0.9))
            svar, pup = updates.rmsprop_momentum_updates(params, vec, *hparams)
        elif algorithm == "adadelta":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)]
            svar, pup = updates.adadelta_updates(params, vec, *hparams)
        elif algorithm == "adam":
            alpha = theano.tensor.scalar()
            beta1 = theano.tensor.scalar()
            beta2 = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, beta1, beta2, epsilon]
            defaults = [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999)]
            defaults.append(("epsilon", 1e-8))
            svar, pup = updates.adam_updates(params, vec, *hparams)
        else:
            raise "Error: " + algorithm + " is not supported"

        # restore variables used by optimizer
        if option["initialize"]:
            values = option["initialize"]
            for v1, v2 in zip(svar, values):
                v1.set_value(v2)

        if option["momentum"]:
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        if option["nesterov"]:
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        optimize = theano.function(inputs,
                                   outputs,
                                   updates=gup,
                                   on_unused_input='warn')
        update = theano.function(hparams, [],
                                 updates=pup,
                                 on_unused_input='warn')

        def wrapper(**option):
            values = []
            for item in defaults:
                name = item[0]
                val = item[1]
                if name not in option:
                    option[name] = val
                values.append(option[name])
            return update(*values)

        self.optimize = optimize
        self.update = wrapper
        self.option = option
        self.algorithm = algorithm
        self.parameter = svar
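
A minimal usage sketch for Example #1, assuming the snippet above is the __init__ of an optimizer class, here called Optimizer; the class name, the scope names, and the batch iterator below are hypothetical and only illustrate how the compiled optimize and update functions are meant to be called.

# Hypothetical usage: Optimizer, the scope names and train_batches are not
# part of the original snippet.
trainer = Optimizer(inputs, [cost], cost, ["encoder", "decoder"],
                    algorithm="adam", constraint=("norm", 5.0))

for batch in train_batches:
    # compute the cost, store the (clipped) gradients in the shared buffers,
    # and return the outputs plus the global gradient norm (norm=True by default)
    cost_value, grad_norm = trainer.optimize(*batch)
    # apply one Adam step; unspecified hyperparameters fall back to their defaults
    trainer.update(alpha=5e-4)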
Example #2
	def fit(self, x, valid=None, epochs=10, seq_length=25, sampling_temp=0.7, sample_freq=10, checkpoint_freq=10, checkpoints_dir='models', unk_char='*'):
		# NOTE: checkpoints are generated only when a validation set is provided
		# build the character vocabulary
		vocab = set(x)
		if self.vocab is None or vocab != self.vocab:
			self.vocab = vocab
			self.vocab.add(unk_char)		# special placeholder for out-of-vocabulary characters
			self.vocab_size = len(vocab)
			self.ch_to_ix = {ch: i for i, ch in enumerate(vocab)}
			self.ix_to_ch = {v: k for k, v in self.ch_to_ix.iteritems()}
			print 'Vocab size:', self.vocab_size

		if self.train_fn is None:
			print 'Compiling the training functions'
			X = T.imatrix()
			self.params = self.init()
			y_hat, cost = self.model(X, self.dropout_p_hidden)
			pgrads = T.grad(cost, wrt=self.params)
			# gradient clipping to avoid exploding gradients
			if self.grad_clip > 0.:
				gnorm = T.sum([T.sum(g ** 2) for g in pgrads])	# sum of squared gradient norms
				# clipping heuristic: if the squared norm exceeds grad_clip,
				# rescale each gradient as new_g = g * grad_clip / gnorm
				pgrads = [T.switch(gnorm > self.grad_clip, g * self.grad_clip / gnorm, g) for g in pgrads]

			updates = adagrad(cost, self.params, grads=pgrads, learning_rate=self.learning_rate)
			if self.momentum > 0.:
				updates = apply_momentum(updates, self.momentum)
			self.train_fn = theano.function(inputs=[X], outputs=cost, updates=updates)
			self.cost_fn = theano.function(inputs=[X], outputs=cost)

		# convert strings to integer vectors
		x_ix = np.asarray([self.ch_to_ix[ch] for ch in x], dtype=np.int32)
		if valid is not None:
			valid_ix = np.asarray([self.ch_to_ix.get(ch, self.ch_to_ix[unk_char]) for ch in valid], dtype=np.int32)
			if not os.path.exists(checkpoints_dir):
				os.makedirs(checkpoints_dir)

		# Let's check that the initial cost matches the expected one
		# print 'Expected initial cost:', np.log(len(vocab))
		# print 'Actual initial cost:', self.cost_fn(x_ix[:,None])
		 
		# split the training sequence into equal blocks of length seq_length
		x_ix = self._split_sequences(x_ix, seq_length, padding_char=' ')
		# randomly shuffle the training sequences
		x_ix = x_ix[self.numpy_rng.permutation(x_ix.shape[0])]

		# then start training
		num_train_batches = -(-x_ix.shape[0] // self.batch_size)	# ceiling division
		print 'Training started'
		train_cost_history = []
		if valid is not None:
			valid_cost_history = []
		for e in range(epochs):
			avg_cost = 0
			for bidx in range(num_train_batches):
				batch_x = x_ix[bidx * self.batch_size: (bidx + 1) * self.batch_size]
				batch_cost = self.train_fn(batch_x.transpose([1, 0]))
				train_cost_history.append(float(batch_cost))
				if np.isnan(batch_cost):
					print 'NaN cost detected. Abort'
					return
				avg_cost += batch_cost
			avg_cost /= num_train_batches
			if valid is not None:
				valid_cost = float(self.cost_fn(valid_ix[:, None]))
				valid_cost_history.append(valid_cost)
				print 'Epoch: {} Train Loss: {:.4f} Valid Loss: {:.4f}'.format(e, avg_cost, valid_cost)
				if checkpoint_freq > 0 and (e + 1) % checkpoint_freq == 0:
					# pickle to save the current state of training
					chk_path = os.path.join(checkpoints_dir, 'charrnn_vanilla_{}_epoch{}_t{:.4f}_v{:.4f}.pkl'.format(len(self.rnn_layers), e, avg_cost, valid_cost))
					state = {
						'epoch': e,
						'train_cost_history': train_cost_history,
						'valid_cost_history': valid_cost_history,
						'train_cost': 	avg_cost,
						'valid_cost': 	valid_cost,
						'params': 		self.export_params(),
						'vocab':		self.vocab,
						'rnn_layers':	self.rnn_layers,
						'batch_size':	self.batch_size,
						'learning_rate':	self.learning_rate,
						'dropout_p_hidden':	self.dropout_p_hidden,
						'momentum':		self.momentum,
						'grad_clip':	self.grad_clip,
					}
					pkl.dump(state, open(chk_path, 'wb'), pkl.HIGHEST_PROTOCOL)
					print 'Written checkpoint:', chk_path
			else:
				print 'Epoch: {} Train Loss: {:.4f}'.format(e + 1, avg_cost)
			if (e + 1) % sample_freq == 0:
				print '\nSampled string:\n{}\n'.format(self.sample(seed_string=''))
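
A minimal usage sketch for Example #2, assuming the snippet above is the fit() method of a character-level RNN class, here called CharRNN; the class name and its constructor arguments are hypothetical and only show how fit() is typically called.

# Hypothetical usage: CharRNN and its constructor arguments are assumptions.
with open('input.txt') as f:
    text = f.read()

train_text, valid_text = text[:-10000], text[-10000:]   # hold out the tail for validation
model = CharRNN(rnn_layers=[512, 512], batch_size=32, learning_rate=0.01)
model.fit(train_text, valid=valid_text, epochs=50, seq_length=50,
          sample_freq=10, checkpoint_freq=10, checkpoints_dir='models')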
Example #3
    def __init__(self, model, **option):
        # number of hyperparameters and their default values for each algorithm
        information = {}

        information["sgd"] = (1, [1.0])
        information["adagrad"] = (2, [1.0, 1e-6])
        information["rmsprop"] = (3, [1e-2, 0.99, 1e-8])
        # torch default: 1.0, 0.9, 1e-6
        information["adadelta"] = (3, [1.0, 0.95, 1e-6])
        information["adam"] = (4, [0.001, 0.9, 0.999, 1e-8])
        information["rmsprop_momentum"] = (4, [1e-4, 0.95, 0.9, 1e-4])

        cost = model.cost
        params = model.parameter
        inputs = model.inputs
        outputs = model.outputs
        scan_updates = model.updates

        grads = theano.grad(cost, params)
        gradsref = grads

        vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

        if "algorithm" not in option:
            option["algorithm"] = "sgd"

        if "variant" not in option:
            option["variant"] = None

        if "constraint" not in option:
            option["constraint"] = None

        if "momentum" not in option:
            option["momentum"] = False

        if "norm" not in option:
            option["norm"] = True

        if "nesterov" not in option:
            option["nesterov"] = False

        if "initialize" not in option:
            option["initialize"] = False

        if "nanguard" not in option:
            option["nanguard"] = True

        algorithm = option["algorithm"]
        variant = option["variant"]
        variant = [variant] if variant is not None else []

        if option["norm"]:
            normval = constraint.global_norm(grads)
            outputs = outputs[:]
            outputs.insert(1, normval)

        if option["constraint"]:
            method, value = option["constraint"]
            if method == "value":
                grads = constraint.clip_by_value(grads, value[0], value[1])
            if method == "norm":
                grads = constraint.clip_by_global_norm(grads, value)

        if option["nanguard"]:
            gnorm = constraint.global_norm(gradsref)
            isnan = theano.tensor.isnan(gnorm)
            isinf = theano.tensor.isinf(gnorm)
            notfinite = theano.tensor.or_(isnan, isinf)
            newgrads = []
            for p, g in zip(params, grads):
                newgrads.append(theano.tensor.switch(notfinite, 0.1 * p, g))
            grads = newgrads

        if option["nesterov"]:
            option["momentum"] = False

        gup = []

        # append update rules
        if isinstance(scan_updates, OrderedDict):
            for key, value in scan_updates.iteritems():
                gup.append((key, value))
        else:
            gup.extend(scan_updates)

        for v, g in zip(vec, grads):
            gup.append((v, g))

        if algorithm == "sgd":
            alpha = theano.tensor.scalar()
            hparams = [alpha]
            defaults = [("alpha", 1.0)]
            svar, pup = updates.sgd_updates(params, vec, *hparams)
        elif algorithm == "adagrad":
            alpha = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, epsilon]
            defaults = [("alpha", 1.0), ("epsilon", 1e-6)]
            svar, pup = updates.adagrad_updates(params, vec, *hparams)
        elif algorithm == "rmsprop":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)]
            rmsparam = hparams + variant
            svar, pup = updates.rmsprop_updates(params, vec, *rmsparam)
        elif algorithm == "rmsprop_momentum":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            momentum = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon, momentum]
            defaults = [("alpha", 1e-4), ("rho", 0.95), ("epsilon", 1e-4)]
            defaults.append(("moment", 0.9))
            svar, pup = updates.rmsprop_momentum_updates(params, vec, *hparams)
        elif algorithm == "adadelta":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)]
            svar, pup = updates.adadelta_updates(params, vec, *hparams)
        elif algorithm == "adam":
            alpha = theano.tensor.scalar()
            beta1 = theano.tensor.scalar()
            beta2 = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, beta1, beta2, epsilon]
            defaults = [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999)]
            defaults.append(("epsilon", 1e-8))
            svar, pup = updates.adam_updates(params, vec, *hparams)
        else:
            raise "Error: " + algorithm + " is not supported"

        if option["initialize"]:
            values = option["initialize"]
            for v1, v2 in zip(svar, values):
                v1.set_value(v2)

        if option["momentum"]:
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        if option["nesterov"]:
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        optimize = theano.function(inputs, outputs, updates=gup)
        update = theano.function(hparams, [], updates=pup)

        def wrapper(**option):
            values = []
            for item in defaults:
                name = item[0]
                val = item[1]
                if name not in option:
                    option[name] = val
                values.append(option[name])
            return update(*values)

        self.optimize = optimize
        self.update = wrapper
        self.option = option
        self.algorithm = algorithm
        self.information = information
        self.parameter = svar
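
A minimal usage sketch for Example #3, assuming the snippet above is the __init__ of an optimizer class, here called Optimizer, that wraps a model object exposing cost, parameter, inputs, outputs and updates; the class name, the model object and the batch iterator are hypothetical.

# Hypothetical usage: Optimizer, model and train_batches are assumptions.
trainer = Optimizer(model, algorithm="rmsprop", constraint=("norm", 1.0))

for batch in train_batches:
    # run the forward/backward pass; the gradient norm is inserted at index 1
    # of the returned outputs because norm defaults to True
    values = trainer.optimize(*batch)
    # apply one RMSProp step; rho and epsilon fall back to their defaults
    trainer.update(alpha=1e-3)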